1 /* ============================================================================
2 * Douglas Thrift's Search Engine License
3 *
4 * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * 3. The end-user documentation included with the redistribution, if any, must
16 * include the following acknowledgment:
17 *
18 * "This product includes software developed by Douglas Thrift
19 * (http://computers.douglasthrift.net/searchengine/)."
20 *
21 * Alternately, this acknowledgment may appear in the software itself, if
22 * and wherever such third-party acknowledgments normally appear.
23 *
24 * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25 * be used to endorse or promote products derived from this software without
26 * specific prior written permission. For written permission, please visit
27 * http://www.douglasthrift.net/contact.cgi for contact information.
28 *
29 * 5. Products derived from this software may not be called "Douglas Thrift's
30 * Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31 * name, without prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 * ============================================================================
44 */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // $Id: Indexer.cpp 372 2008-08-23 11:00:12Z douglas $
50
#include "Indexer.hpp"

#include <sstream>
52
#ifndef _WIN32
#include <unistd.h>
#else // _WIN32
// Windows has no unlink(); provide one on top of DeleteFileA that follows the
// POSIX return convention (0 on success, -1 on failure). DeleteFile* returns
// nonzero on success, so its result must not be handed back unchanged. The
// ANSI entry point is named explicitly because `filename` is a narrow string
// even when UNICODE is defined.
inline int unlink(const char* filename)
{
	return DeleteFileA(filename) ? 0 : -1;
}
#endif // _WIN32
58
index(string & begin)59 void Indexer::index(string& begin)
60 {
61 size_t separator(indexFile.rfind(slash));
62 string dtd(separator != string::npos ? indexFile.substr(0, separator) +
63 slash + "index.dtd" : "index.dtd");
64 ifstream fin(dtd.c_str());
65
66 if (!fin.is_open())
67 {
68 ofstream fout(dtd.c_str());
69
70 fout << "<!ELEMENT index (page*)>\n"
71 << "<!ELEMENT page (address, port?, tls?, path, size, title?, "
72 << "description?, text, heading*)>\n"
73 << "<!ELEMENT address (#PCDATA)>\n"
74 << "<!ELEMENT port (#PCDATA)>\n"
75 << "<!ELEMENT tls (#PCDATA)>\n"
76 << "<!ELEMENT path (#PCDATA)>\n"
77 << "<!ELEMENT size (#PCDATA)>\n"
78 << "<!ELEMENT title (#PCDATA)>\n"
79 << "<!ELEMENT description (#PCDATA)>\n"
80 << "<!ELEMENT text (#PCDATA)>\n"
81 << "<!ELEMENT heading (#PCDATA)>\n";
82
83 fout.close();
84 }
85
86 fin.close();
87
88 string lock(indexFile + ".lock");
89 ofstream fout(lock.c_str());
90
91 fout.close();
92 fout.open(indexFile.c_str());
93
94 fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
95 << "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
96 << "<index>\n";
97
98 URL first(begin);
99
100 index(first, fout);
101
102 while (!links.empty())
103 {
104 URL next(links.front());
105 string referer(referers.front());
106
107 links.pop();
108 referers.pop();
109
110 if (debug) cerr << "next = " << next << "\n";
111
112 index(next, fout, referer);
113 }
114
115 fout << "</index>\n";
116
117 fout.close();
118
119 unlink(lock.c_str());
120 }
121
index(URL & url,ofstream & fout,const string & referer)122 void Indexer::index(URL& url, ofstream& fout, const string& referer)
123 {
124 if (domains.find(url.getAddress()) != domains.end() &&
125 pages.find(url.getURL()) == pages.end())
126 {
127 if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" +
128 url.getPort() : string(""))) == checked.end())
129 {
130 robots(url);
131 }
132
133 if (!restricted(url))
134 {
135 if (http.handle(url, referer, true))
136 {
137 if (http.contentType().find("text/plain") == 0 ||
138 http.contentType().find("text/html") == 0)
139 {
140 http.clear();
141
142 if (!http.handle(url, referer)) exit(1);
143
144 cout << "Indexing " << url << " ... " << flush;
145
146 if (processor.process(http, url))
147 {
148 Page page(processor.getPage());
149
150 fout << page << "\n";
151
152 cout << "done.\n";
153 }
154 else
155 {
156 cout << "canceled.\n";
157 }
158
159 pages.insert(url.getURL());
160
161 Set pageLinks(processor.getLinks());
162
163 processor.reset();
164
165 for (SetIterator link(pageLinks.begin()); link !=
166 pageLinks.end(); link++)
167 {
168 if (pages.find(*link) == pages.end())
169 {
170 links.push(*link);
171 referers.push(url.getURL());
172 }
173 }
174 }
175 else
176 {
177 // unhandled content
178 }
179 }
180 else if (!http.redirect().empty())
181 {
182 if (pages.find(http.redirect()) == pages.end())
183 {
184 links.push(http.redirect());
185 referers.push(url.getURL());
186 }
187 }
188
189 http.clear();
190 }
191 }
192 }
193
restricted(URL & url)194 bool Indexer::restricted(URL& url)
195 {
196 bool answer(false);
197
198 for (SetIterator itor(restrictions.begin()); itor != restrictions.end();
199 itor++)
200 {
201 URL checker(*itor);
202
203 if (url.getAddress() == checker.getAddress() && url.getPort() ==
204 checker.getPort())
205 {
206 if (url.getPath().find(checker.getPath()) == 0)
207 {
208 answer = true;
209
210 break;
211 }
212 }
213 }
214
215 return answer;
216 }
217
robots(URL & url)218 void Indexer::robots(URL& url)
219 {
220 URL robots(url);
221
222 robots.setPath("/robots.txt");
223
224 if (http.handle(robots))
225 {
226 cout << "Checking " << robots << " ... " << flush;
227
228 string line;
229 bool record(false), hasVersion(false), hasName(false), hasAll(false);
230 Robot state(none);
231 Set restrictionsVersion, restrictionsName, restrictionsAll;
232
233 while (http.good())
234 {
235 http.getline(line);
236
237 size_t comment(line.find('#'));
238
239 if (comment != string::npos) line.erase(comment);
240
241 if (line.empty() && comment == string::npos) record = false;
242 if (line.empty()) continue;
243
244 size_t colon(line.find(':'));
245 string field(line.substr(0, colon));
246 string value(line.substr(colon + 1));
247
248 normalize(value);
249
250 if (field == "User-agent" && value == agent(true))
251 {
252 state = version;
253 record = true;
254 hasVersion = true;
255 }
256 else if (field == "User-agent" && value == agent(false))
257 {
258 state = name;
259 record = true;
260 hasName = true;
261 }
262 else if (field == "User-agent" && value == "*")
263 {
264 state = all;
265 record = true;
266 hasAll = true;
267 }
268 else if (field == "Disallow" && record && value.empty())
269 {
270 // no restrictions
271 }
272 else if (field == "Disallow" && record)
273 {
274 URL restriction(robots);
275
276 restriction.setPath(value);
277
278 switch (state)
279 {
280 case version:
281 restrictionsVersion.insert(restriction.getURL());
282 break;
283 case name:
284 restrictionsName.insert(restriction.getURL());
285 break;
286 case all:
287 restrictionsAll.insert(restriction.getURL());
288 break;
289 default:
290 break;
291 }
292 }
293 }
294
295 if (hasVersion)
296 {
297 state = version;
298 }
299 else if (hasName)
300 {
301 state = name;
302 }
303 else if (hasAll)
304 {
305 state = all;
306 }
307 else
308 {
309 state = none;
310 }
311
312 SetIterator itor;
313
314 switch (state)
315 {
316 case version:
317 for (itor = restrictionsVersion.begin(); itor !=
318 restrictionsVersion.end(); itor++)
319 {
320 restrictions.insert(*itor);
321 }
322 break;
323 case name:
324 for (itor = restrictionsName.begin(); itor !=
325 restrictionsName.end(); itor++)
326 {
327 restrictions.insert(*itor);
328 }
329 break;
330 case all:
331 for (itor = restrictionsAll.begin(); itor != restrictionsAll.end();
332 itor++)
333 {
334 restrictions.insert(*itor);
335 }
336 break;
337 default:
338 break;
339 }
340
341 cout << "done.\n";
342 }
343
344 http.clear();
345
346 checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
347 url.getPort() : "");
348 }
349