1 /* ============================================================================
2  * Douglas Thrift's Search Engine License
3  *
4  * Copyright (C) 2002-2004, 2008, Douglas Thrift. All Rights Reserved.
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright notice,
9  *    this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  *    this list of conditions and the following disclaimer in the documentation
13  *    and/or other materials provided with the distribution.
14  *
15  * 3. The end-user documentation included with the redistribution, if any, must
16  *    include the following acknowledgment:
17  *
18  *       "This product includes software developed by Douglas Thrift
19  *       (http://computers.douglasthrift.net/searchengine/)."
20  *
21  *    Alternately, this acknowledgment may appear in the software itself, if
22  *    and wherever such third-party acknowledgments normally appear.
23  *
24  * 4. The names "Douglas Thrift" and "Douglas Thrift's Search Engine" must not
25  *    be used to endorse or promote products derived from this software without
26  *    specific prior written permission.  For written permission, please visit
27  *    http://www.douglasthrift.net/contact.cgi for contact information.
28  *
29  * 5. Products derived from this software may not be called "Douglas Thrift's
30  *    Search Engine", nor may "Douglas Thrift's Search Engine" appear in their
31  *    name, without prior written permission.
32  *
33  * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
34  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
35  * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
36  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
37  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
38  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39  * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
40  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
42  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43  * ============================================================================
44  */
45 // Douglas Thrift's Search Engine Indexer
46 //
47 // Douglas Thrift
48 //
49 // $Id: Indexer.cpp 372 2008-08-23 11:00:12Z douglas $
50 
#include "Indexer.hpp"

#include <sstream>
52 
53 #ifndef _WIN32
54 #include <unistd.h>
55 #else // _WIN32
unlink(const char * filename)56 inline int unlink(const char* filename) { return DeleteFile(filename); }
57 #endif // _WIN32
58 
index(string & begin)59 void Indexer::index(string& begin)
60 {
61 	size_t separator(indexFile.rfind(slash));
62 	string dtd(separator != string::npos ? indexFile.substr(0, separator) +
63 		slash + "index.dtd" : "index.dtd");
64 	ifstream fin(dtd.c_str());
65 
66 	if (!fin.is_open())
67 	{
68 		ofstream fout(dtd.c_str());
69 
70 		fout << "<!ELEMENT index (page*)>\n"
71 			<< "<!ELEMENT page (address, port?, tls?, path, size, title?, "
72 			<< "description?, text, heading*)>\n"
73 			<< "<!ELEMENT address (#PCDATA)>\n"
74 			<< "<!ELEMENT port (#PCDATA)>\n"
75 			<< "<!ELEMENT tls (#PCDATA)>\n"
76 			<< "<!ELEMENT path (#PCDATA)>\n"
77 			<< "<!ELEMENT size (#PCDATA)>\n"
78 			<< "<!ELEMENT title (#PCDATA)>\n"
79 			<< "<!ELEMENT description (#PCDATA)>\n"
80 			<< "<!ELEMENT text (#PCDATA)>\n"
81 			<< "<!ELEMENT heading (#PCDATA)>\n";
82 
83 		fout.close();
84 	}
85 
86 	fin.close();
87 
88 	string lock(indexFile + ".lock");
89 	ofstream fout(lock.c_str());
90 
91 	fout.close();
92 	fout.open(indexFile.c_str());
93 
94 	fout << "<?xml version=\"1.0\" encoding=\"ISO-8859-1\" standalone=\"no\"?>"
95 		<< "\n<!DOCTYPE index SYSTEM \"index.dtd\">\n"
96 		<< "<index>\n";
97 
98 	URL first(begin);
99 
100 	index(first, fout);
101 
102 	while (!links.empty())
103 	{
104 		URL next(links.front());
105 		string referer(referers.front());
106 
107 		links.pop();
108 		referers.pop();
109 
110 		if (debug) cerr << "next = " << next << "\n";
111 
112 		index(next, fout, referer);
113 	}
114 
115 	fout << "</index>\n";
116 
117 	fout.close();
118 
119 	unlink(lock.c_str());
120 }
121 
index(URL & url,ofstream & fout,const string & referer)122 void Indexer::index(URL& url, ofstream& fout, const string& referer)
123 {
124 	if (domains.find(url.getAddress()) != domains.end() &&
125 		pages.find(url.getURL()) == pages.end())
126 	{
127 		if (checked.find(url.getAddress() + (url.getPort() != 80 ? ":" +
128 			url.getPort() : string(""))) == checked.end())
129 		{
130 			robots(url);
131 		}
132 
133 		if (!restricted(url))
134 		{
135 			if (http.handle(url, referer, true))
136 			{
137 				if (http.contentType().find("text/plain") == 0 ||
138 					http.contentType().find("text/html") == 0)
139 				{
140 					http.clear();
141 
142 					if (!http.handle(url, referer)) exit(1);
143 
144 					cout << "Indexing " << url << " ... " << flush;
145 
146 					if (processor.process(http, url))
147 					{
148 						Page page(processor.getPage());
149 
150 						fout << page << "\n";
151 
152 						cout << "done.\n";
153 					}
154 					else
155 					{
156 						cout << "canceled.\n";
157 					}
158 
159 					pages.insert(url.getURL());
160 
161 					Set pageLinks(processor.getLinks());
162 
163 					processor.reset();
164 
165 					for (SetIterator link(pageLinks.begin()); link !=
166 						pageLinks.end(); link++)
167 					{
168 						if (pages.find(*link) == pages.end())
169 						{
170 							links.push(*link);
171 							referers.push(url.getURL());
172 						}
173 					}
174 				}
175 				else
176 				{
177 					// unhandled content
178 				}
179 			}
180 			else if (!http.redirect().empty())
181 			{
182 				if (pages.find(http.redirect()) == pages.end())
183 				{
184 					links.push(http.redirect());
185 					referers.push(url.getURL());
186 				}
187 			}
188 
189 			http.clear();
190 		}
191 	}
192 }
193 
restricted(URL & url)194 bool Indexer::restricted(URL& url)
195 {
196 	bool answer(false);
197 
198 	for (SetIterator itor(restrictions.begin()); itor != restrictions.end();
199 		itor++)
200 	{
201 		URL checker(*itor);
202 
203 		if (url.getAddress() == checker.getAddress() && url.getPort() ==
204 			checker.getPort())
205 		{
206 			if (url.getPath().find(checker.getPath()) == 0)
207 			{
208 				answer = true;
209 
210 				break;
211 			}
212 		}
213 	}
214 
215 	return answer;
216 }
217 
robots(URL & url)218 void Indexer::robots(URL& url)
219 {
220 	URL robots(url);
221 
222 	robots.setPath("/robots.txt");
223 
224 	if (http.handle(robots))
225 	{
226 		cout << "Checking " << robots << " ... " << flush;
227 
228 		string line;
229 		bool record(false), hasVersion(false), hasName(false), hasAll(false);
230 		Robot state(none);
231 		Set restrictionsVersion, restrictionsName, restrictionsAll;
232 
233 		while (http.good())
234 		{
235 			http.getline(line);
236 
237 			size_t comment(line.find('#'));
238 
239 			if (comment != string::npos) line.erase(comment);
240 
241 			if (line.empty() && comment == string::npos) record = false;
242 			if (line.empty()) continue;
243 
244 			size_t colon(line.find(':'));
245 			string field(line.substr(0, colon));
246 			string value(line.substr(colon + 1));
247 
248 			normalize(value);
249 
250 			if (field == "User-agent" && value == agent(true))
251 			{
252 				state = version;
253 				record = true;
254 				hasVersion = true;
255 			}
256 			else if (field == "User-agent" && value == agent(false))
257 			{
258 				state = name;
259 				record = true;
260 				hasName = true;
261 			}
262 			else if (field == "User-agent" && value == "*")
263 			{
264 				state = all;
265 				record = true;
266 				hasAll = true;
267 			}
268 			else if (field == "Disallow" && record && value.empty())
269 			{
270 				// no restrictions
271 			}
272 			else if (field == "Disallow" && record)
273 			{
274 				URL restriction(robots);
275 
276 				restriction.setPath(value);
277 
278 				switch (state)
279 				{
280 				case version:
281 					restrictionsVersion.insert(restriction.getURL());
282 					break;
283 				case name:
284 					restrictionsName.insert(restriction.getURL());
285 					break;
286 				case all:
287 					restrictionsAll.insert(restriction.getURL());
288 					break;
289 				default:
290 					break;
291 				}
292 			}
293 		}
294 
295 		if (hasVersion)
296 		{
297 			state = version;
298 		}
299 		else if (hasName)
300 		{
301 			state = name;
302 		}
303 		else if (hasAll)
304 		{
305 			state = all;
306 		}
307 		else
308 		{
309 			state = none;
310 		}
311 
312 		SetIterator itor;
313 
314 		switch (state)
315 		{
316 		case version:
317 			for (itor = restrictionsVersion.begin(); itor !=
318 				restrictionsVersion.end(); itor++)
319 			{
320 				restrictions.insert(*itor);
321 			}
322 			break;
323 		case name:
324 			for (itor = restrictionsName.begin(); itor !=
325 				restrictionsName.end(); itor++)
326 			{
327 				restrictions.insert(*itor);
328 			}
329 			break;
330 		case all:
331 			for (itor = restrictionsAll.begin(); itor != restrictionsAll.end();
332 				itor++)
333 			{
334 				restrictions.insert(*itor);
335 			}
336 			break;
337 		default:
338 			break;
339 		}
340 
341 		cout << "done.\n";
342 	}
343 
344 	http.clear();
345 
346 	checked.insert(url.getAddress() += url.getPort() != 80 ? ":" +
347 		url.getPort() : "");
348 }
349