1 /*
2  *  Copyright 2005-2021 Fabrice Colin
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, write to the Free Software
16  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  */
18 
19 #include <sys/types.h>
20 #include <dirent.h>
21 #include <sys/stat.h>
22 #include <unistd.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <string.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <errno.h>
30 #ifdef __OpenBSD__
31 #include <sys/param.h>
32 #include <sys/sysctl.h>
33 #endif
34 #include <exception>
35 #include <iostream>
36 #include <fstream>
37 #include <algorithm>
38 #include <glibmm/miscutils.h>
39 #include <glibmm/convert.h>
40 #include <glibmm/exception.h>
41 
42 #include "config.h"
43 #include "NLS.h"
44 #include "Languages.h"
45 #include "MIMEScanner.h"
46 #include "TimeConverter.h"
47 #include "Timer.h"
48 #include "Url.h"
49 #include "HtmlFilter.h"
50 #include "FilterUtils.h"
51 #include "DownloaderFactory.h"
52 #include "FilterWrapper.h"
53 #include "ModuleFactory.h"
54 #include "WebEngine.h"
55 #include "WorkerThreads.h"
56 
57 using namespace std;
58 using namespace Glib;
59 
QueueManager(const string & defaultIndexLocation,unsigned int maxThreadsTime,bool scanLocalFiles)60 QueueManager::QueueManager(const string &defaultIndexLocation,
61 	unsigned int maxThreadsTime, bool scanLocalFiles) :
62 	ThreadsManager(defaultIndexLocation, maxThreadsTime),
63 	m_scanLocalFiles(scanLocalFiles),
64 	m_stopIndexing(false),
65 	m_actionQueue(PinotSettings::getInstance().getHistoryDatabaseName(), get_application_name())
66 {
67 }
68 
~QueueManager()69 QueueManager::~QueueManager()
70 {
71 }
72 
index_document(const DocumentInfo & docInfo)73 ustring QueueManager::index_document(const DocumentInfo &docInfo)
74 {
75 	string location(docInfo.getLocation());
76 
77 #ifdef DEBUG
78 	clog << "ThreadsManager::index_document: called with " << location << endl;
79 #endif
80 	if (m_stopIndexing == true)
81 	{
82 #ifdef DEBUG
83 		clog << "ThreadsManager::index_document: stopped indexing" << endl;
84 #endif
85 		return _("Indexing was stopped");
86 	}
87 
88 	if (location.empty() == true)
89 	{
90 		// Nothing to do
91 		return "";
92 	}
93 
94 	// If the document is a mail message, we can't index it again
95 	Url urlObj(location);
96 	if (urlObj.getProtocol() == "mailbox")
97 	{
98 		return _("Can't index mail here");
99 	}
100 
101 	// Is the document being indexed/updated ?
102 	if (write_lock_lists() == true)
103 	{
104 		bool beingProcessed = true;
105 
106 		if (m_beingIndexed.find(location) == m_beingIndexed.end())
107 		{
108 			m_beingIndexed.insert(location);
109 			beingProcessed = false;
110 		}
111 
112 		unlock_lists();
113 
114 		if (beingProcessed == true)
115 		{
116 			// FIXME: we may have to set labels on this document
117 			// FIXME: fix this for RTL languages
118 			ustring status(location);
119 			status += " ";
120 			status += _("is already being indexed");
121 			return status;
122 		}
123 	}
124 
125 	// Is the document blacklisted ?
126 	if (PinotSettings::getInstance().isBlackListed(location) == true)
127 	{
128 		// FIXME: fix this for RTL languages
129 		ustring status(location);
130 		status += " ";
131 		status += _("is blacklisted");
132 		return status;
133 	}
134 
135 	if ((m_scanLocalFiles == true) &&
136 		(urlObj.isLocal() == true))
137 	{
138 #ifdef DEBUG
139 		clog << "ThreadsManager::index_document: scanning " <<
140 			urlObj.getLocation() + "/" + urlObj.getFile() << endl;
141 #endif
142 		// This handles both directories and files
143 		start_thread(new DirectoryScannerThread(docInfo,
144 			m_defaultIndexLocation, 0, true, true));
145 	}
146 	else
147 	{
148 		start_thread(new IndexingThread(docInfo, m_defaultIndexLocation));
149 	}
150 
151 	return "";
152 }
153 
clear_queues(void)154 void QueueManager::clear_queues(void)
155 {
156 	if (write_lock_lists() == true)
157 	{
158 		m_beingIndexed.clear();
159 
160 		unlock_lists();
161 
162 		m_actionQueue.expireItems(time(NULL));
163 	}
164 }
165 
queue_index(const DocumentInfo & docInfo)166 ustring QueueManager::queue_index(const DocumentInfo &docInfo)
167 {
168 	bool addToQueue = false;
169 
170 	if (get_threads_count() >= m_maxIndexThreads)
171 	{
172 #ifdef DEBUG
173 		clog << "QueueManager::queue_index: too many threads" << endl;
174 #endif
175 		addToQueue = true;
176 	}
177 #ifdef HAVE_GETLOADAVG
178 	// Get the load averaged over the last minute
179 	else
180 	{
181 		double averageLoad[3];
182 
183 		if (getloadavg(averageLoad, 3) != -1)
184 		{
185 			// FIXME: is LOADAVG_1MIN Solaris specific ?
186 			if (averageLoad[0] >= (double)m_numCPUs * 4)
187 			{
188 				// Don't add to the load, queue this
189 				addToQueue = true;
190 			}
191 		}
192 	}
193 #endif
194 
195 	if (addToQueue == true)
196 	{
197 		m_actionQueue.pushItem(ActionQueue::INDEX, docInfo);
198 
199 		return "";
200 	}
201 
202 	return index_document(docInfo);
203 }
204 
pop_queue(const string & urlWasIndexed)205 bool QueueManager::pop_queue(const string &urlWasIndexed)
206 {
207 	bool getItem = true;
208 	bool emptyQueue = false;
209 
210 #ifdef DEBUG
211 	clog << "QueueManager::pop_queue: called with " << urlWasIndexed << endl;
212 #endif
213 	if (get_threads_count() >= m_maxIndexThreads)
214 	{
215 #ifdef DEBUG
216 		clog << "QueueManager::pop_queue: too many threads" << endl;
217 #endif
218 		getItem = false;
219 	}
220 
221 	if (write_lock_lists() == true)
222 	{
223 		// Update the in-progress list
224 		if (urlWasIndexed.empty() == false)
225 		{
226 			set<string>::iterator urlIter = m_beingIndexed.find(urlWasIndexed);
227 			if (urlIter != m_beingIndexed.end())
228 			{
229 				m_beingIndexed.erase(urlIter);
230 			}
231 		}
232 
233 		unlock_lists();
234 
235 		// Get an item ?
236 		if (getItem == true)
237 		{
238 			ActionQueue::ActionType type;
239 			DocumentInfo docInfo;
240 			string previousLocation;
241 
242 			// Assume the queue is empty
243 			emptyQueue = true;
244 
245 			while (m_actionQueue.popItem(type, docInfo) == true)
246 			{
247 				ustring status;
248 
249 				if (type != ActionQueue::INDEX)
250 				{
251 					continue;
252 				}
253 
254 				// The queue isn't actually empty
255 				emptyQueue = false;
256 
257 				if (docInfo.getLocation() == previousLocation)
258 				{
259 					// Something dodgy is going on, we got the same item twice !
260 					// FIXME: fix this for RTL languages
261 					status = previousLocation;
262 					status += " ";
263 					status += _("is already being indexed");
264 				}
265 				else
266 				{
267 					status = index_document(docInfo);
268 				}
269 
270 				if (status.empty() == true)
271 				{
272 					break;
273 				}
274 
275 				previousLocation = docInfo.getLocation();
276 			}
277 		}
278 	}
279 
280 	return emptyQueue;
281 }
282 
ListerThread(const PinotSettings::IndexProperties & indexProps,unsigned int startDoc)283 ListerThread::ListerThread(const PinotSettings::IndexProperties &indexProps,
284 	unsigned int startDoc) :
285 	WorkerThread(),
286 	m_indexProps(indexProps),
287 	m_startDoc(startDoc),
288 	m_documentsCount(0)
289 {
290 }
291 
~ListerThread()292 ListerThread::~ListerThread()
293 {
294 }
295 
getType(void) const296 string ListerThread::getType(void) const
297 {
298 	return "ListerThread";
299 }
300 
getIndexProperties(void) const301 PinotSettings::IndexProperties ListerThread::getIndexProperties(void) const
302 {
303 	return m_indexProps;
304 }
305 
getStartDoc(void) const306 unsigned int ListerThread::getStartDoc(void) const
307 {
308 	return m_startDoc;
309 }
310 
getDocuments(void) const311 const vector<DocumentInfo> &ListerThread::getDocuments(void) const
312 {
313 	return m_documentsList;
314 }
315 
getDocumentsCount(void) const316 unsigned int ListerThread::getDocumentsCount(void) const
317 {
318 	return m_documentsCount;
319 }
320 
QueryingThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,unsigned int startDoc,bool listingIndex)321 QueryingThread::QueryingThread(const PinotSettings::IndexProperties &indexProps,
322 	const QueryProperties &queryProps, unsigned int startDoc, bool listingIndex) :
323 	ListerThread(indexProps, startDoc),
324 	m_engineName(PinotSettings::getInstance().m_defaultBackend),
325 	m_engineDisplayableName(indexProps.m_name),
326 	m_engineOption(indexProps.m_location),
327 	m_queryProps(queryProps),
328 	m_listingIndex(listingIndex),
329 	m_correctedSpelling(false),
330 	m_isLive(true)
331 {
332 #ifdef DEBUG
333 	clog << "QueryingThread: engine " << m_engineName << ", " << m_engineOption
334 		<< ", mode " << m_listingIndex << endl;
335 #endif
336 }
337 
QueryingThread(const string & engineName,const string & engineDisplayableName,const string & engineOption,const QueryProperties & queryProps,unsigned int startDoc)338 QueryingThread::QueryingThread(const string &engineName, const string &engineDisplayableName,
339 	const string &engineOption, const QueryProperties &queryProps,
340 	unsigned int startDoc) :
341 	ListerThread(PinotSettings::IndexProperties(engineDisplayableName, engineOption, 0, false), startDoc),
342 	m_engineName(engineName),
343 	m_engineDisplayableName(engineDisplayableName),
344 	m_engineOption(engineOption),
345 	m_queryProps(queryProps),
346 	m_listingIndex(false),
347 	m_correctedSpelling(false),
348 	m_isLive(true)
349 {
350 #ifdef DEBUG
351 	clog << "QueryingThread: engine " << m_engineName << ", " << m_engineOption
352 		<< ", mode 0" << endl;
353 #endif
354 }
355 
~QueryingThread()356 QueryingThread::~QueryingThread()
357 {
358 }
359 
getType(void) const360 string QueryingThread::getType(void) const
361 {
362 	if (m_listingIndex == true)
363 	{
364 		return ListerThread::getType();
365 	}
366 
367 	return "QueryingThread";
368 }
369 
isLive(void) const370 bool QueryingThread::isLive(void) const
371 {
372 	return m_isLive;
373 }
374 
getEngineName(void) const375 string QueryingThread::getEngineName(void) const
376 {
377 	return m_engineDisplayableName;
378 }
379 
getQuery(bool & wasCorrected) const380 QueryProperties QueryingThread::getQuery(bool &wasCorrected) const
381 {
382 	wasCorrected = m_correctedSpelling;
383 	return m_queryProps;
384 }
385 
getCharset(void) const386 string QueryingThread::getCharset(void) const
387 {
388 	return m_resultsCharset;
389 }
390 
findPlugin(void)391 bool QueryingThread::findPlugin(void)
392 {
393 	string pluginName;
394 
395 	if ((m_engineName.empty() == true) &&
396 		(m_engineOption.empty() == false))
397 	{
398 		pluginName = m_engineOption;
399 	}
400 	else if ((m_engineName.empty() == false) &&
401 		(m_engineOption.empty() == true))
402 	{
403 		pluginName = m_engineName;
404 	}
405 
406 	if (pluginName.empty() == false)
407 	{
408 		set<ModuleProperties> engines;
409 		PinotSettings::getInstance().getSearchEngines(engines, "");
410 #ifdef DEBUG
411 		clog << "QueryingThread::findPlugin: looking for a plugin named " << pluginName << endl;
412 #endif
413 
414 		// Is there a plugin with such a name ?
415 		ModuleProperties modProps("sherlock", pluginName, "", "");
416 		set<ModuleProperties>::const_iterator engineIter = engines.find(modProps);
417 		if (engineIter == engines.end())
418 		{
419 			// Try again
420 			modProps.m_name = "opensearch";
421 			engineIter = engines.find(modProps);
422 		}
423 
424 		if (engineIter != engines.end())
425 		{
426 			// Yes, there is !
427 			m_engineName = engineIter->m_name;
428 			m_engineDisplayableName = engineIter->m_longName;
429 			m_engineOption = engineIter->m_option;
430 #ifdef DEBUG
431 			clog << "QueryingThread::findPlugin: found " << m_engineName << ", " << m_engineDisplayableName << ", " << m_engineOption << endl;
432 #endif
433 
434 			return true;
435 		}
436 	}
437 
438 	return false;
439 }
440 
EngineQueryThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,unsigned int startDoc,bool listingIndex)441 EngineQueryThread::EngineQueryThread(const PinotSettings::IndexProperties &indexProps,
442 	const QueryProperties &queryProps, unsigned int startDoc, bool listingIndex) :
443 	QueryingThread(indexProps, queryProps, startDoc, listingIndex)
444 {
445 }
446 
EngineQueryThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,const set<string> & limitToDocsSet,unsigned int startDoc)447 EngineQueryThread::EngineQueryThread(const PinotSettings::IndexProperties &indexProps,
448 	const QueryProperties &queryProps, const set<string> &limitToDocsSet,
449 	unsigned int startDoc) :
450 	QueryingThread(indexProps, queryProps, startDoc, false)
451 {
452 	copy(limitToDocsSet.begin(), limitToDocsSet.end(),
453 		inserter(m_limitToDocsSet, m_limitToDocsSet.begin()));
454 }
455 
EngineQueryThread(const string & engineName,const string & engineDisplayableName,const string & engineOption,const QueryProperties & queryProps,unsigned int startDoc)456 EngineQueryThread::EngineQueryThread(const string &engineName, const string &engineDisplayableName,
457 	const string &engineOption, const QueryProperties &queryProps, unsigned int startDoc) :
458 	QueryingThread(engineName, engineDisplayableName, engineOption, queryProps, startDoc)
459 {
460 }
461 
~EngineQueryThread()462 EngineQueryThread::~EngineQueryThread()
463 {
464 }
465 
processResults(const vector<DocumentInfo> & resultsList)466 void EngineQueryThread::processResults(const vector<DocumentInfo> &resultsList)
467 {
468 	PinotSettings &settings = PinotSettings::getInstance();
469 	IndexInterface *pDocsIndex = NULL;
470 	IndexInterface *pDaemonIndex = NULL;
471 	unsigned int indexId = 0;
472 	bool isIndexQuery = false;
473 
474 	// Are we querying an index ?
475 	if (ModuleFactory::isSupported(m_engineName, true) == true)
476 	{
477 		// Internal index ?
478 		if ((m_engineOption == settings.m_docsIndexLocation) ||
479 			(m_engineOption == settings.m_daemonIndexLocation))
480 		{
481 			indexId = settings.getIndexPropertiesByLocation(m_engineOption).m_id;
482 			isIndexQuery = true;
483 		}
484 	}
485 
486 	// Will we have to query internal indices ?
487 	if (isIndexQuery == false)
488 	{
489 		pDocsIndex = settings.getIndex(settings.m_docsIndexLocation);
490 		pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation);
491 	}
492 
493 	// Copy the results list
494 	for (vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
495 		resultIter != resultsList.end(); ++resultIter)
496 	{
497 		DocumentInfo currentDoc(*resultIter);
498 		string title(_("No title"));
499 		string location(currentDoc.getLocation(true));
500 		string language(currentDoc.getLanguage());
501 		unsigned int docId = 0;
502 
503 		// The title may contain formatting
504 		if (currentDoc.getTitle().empty() == false)
505 		{
506 			title = FilterUtils::stripMarkup(currentDoc.getTitle());
507 		}
508 		currentDoc.setTitle(title);
509 #ifdef DEBUG
510 		clog << "EngineQueryThread::processResults: title is " << title << endl;
511 #endif
512 
513 		// Use the query's language if the result's is unknown
514 		if (language.empty() == true)
515 		{
516 			language = m_queryProps.getStemmingLanguage();
517 		}
518 		currentDoc.setLanguage(language);
519 
520 		if (isIndexQuery == true)
521 		{
522 			unsigned int tmpId = 0;
523 
524 			// The index engine should have set this
525 			docId = currentDoc.getIsIndexed(tmpId);
526 		}
527 
528 		// Is this in one of the indexes ?
529 		if ((pDocsIndex != NULL) &&
530 			(pDocsIndex->isGood() == true))
531 		{
532 			docId = pDocsIndex->hasDocument(location);
533 			if (docId > 0)
534 			{
535 				indexId = settings.getIndexPropertiesByName(_("My Web Pages")).m_id;
536 			}
537 		}
538 		if ((pDaemonIndex != NULL) &&
539 			(pDaemonIndex->isGood() == true) &&
540 			(docId == 0))
541 		{
542 			docId = pDaemonIndex->hasDocument(location);
543 			if (docId > 0)
544 			{
545 				indexId = settings.getIndexPropertiesByName(_("My Documents")).m_id;
546 			}
547 		}
548 
549 		if (docId > 0)
550 		{
551 			currentDoc.setIsIndexed(indexId, docId);
552 #ifdef DEBUG
553 			clog << "EngineQueryThread::processResults: found in index " << indexId << endl;
554 #endif
555 		}
556 #ifdef DEBUG
557 		else clog << "EngineQueryThread::processResults: not found in any index" << endl;
558 #endif
559 
560 		m_documentsList.push_back(currentDoc);
561 	}
562 
563 	if (pDocsIndex != NULL)
564 	{
565 		delete pDocsIndex;
566 	}
567 	if (pDaemonIndex != NULL)
568 	{
569 		delete pDaemonIndex;
570 	}
571 }
572 
processResults(const vector<DocumentInfo> & resultsList,unsigned int indexId)573 void EngineQueryThread::processResults(const vector<DocumentInfo> &resultsList,
574 	unsigned int indexId)
575 {
576 	unsigned int zeroId = 0;
577 
578 	// Copy the results list
579 	for (vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
580 		resultIter != resultsList.end(); ++resultIter)
581 	{
582 		DocumentInfo currentDoc(*resultIter);
583 
584 		// The engine has no notion of index IDs
585 		unsigned int docId = currentDoc.getIsIndexed(zeroId);
586 		currentDoc.setIsIndexed(indexId, docId);
587 
588 		m_documentsList.push_back(currentDoc);
589 	}
590 }
591 
doWork(void)592 void EngineQueryThread::doWork(void)
593 {
594 	PinotSettings &settings = PinotSettings::getInstance();
595 
596 	// Get the SearchEngine
597 	SearchEngineInterface *pEngine = ModuleFactory::getSearchEngine(m_engineName, m_engineOption);
598 	if (pEngine == NULL)
599 	{
600 		// Try again
601 		if (findPlugin() == true)
602 		{
603 			pEngine = ModuleFactory::getSearchEngine(m_engineName, m_engineOption);
604 		}
605 
606 		if (pEngine == NULL)
607 		{
608 			m_errorNum = UNKNOWN_ENGINE;
609 			m_errorParam = m_engineDisplayableName;
610 			return;
611 		}
612 	}
613 
614 	// Set up the proxy
615 	WebEngine *pWebEngine = dynamic_cast<WebEngine *>(pEngine);
616 	if (pWebEngine != NULL)
617 	{
618 		DownloaderInterface *pDownloader = pWebEngine->getDownloader();
619 		if ((pDownloader != NULL) &&
620 			(settings.m_proxyEnabled == true) &&
621 			(settings.m_proxyAddress.empty() == false))
622 		{
623 			char portStr[64];
624 
625 			pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
626 			snprintf(portStr, 64, "%u", settings.m_proxyPort);
627 			pDownloader->setSetting("proxyport", portStr);
628 			pDownloader->setSetting("proxytype", settings.m_proxyType);
629 		}
630 
631 		pWebEngine->setEditableValues(settings.m_editablePluginValues);
632 	}
633 
634 	if (m_listingIndex == false)
635 	{
636 		pEngine->setLimitSet(m_limitToDocsSet);
637 	}
638 
639 	// Run the query
640 	pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND);
641 	if (pEngine->runQuery(m_queryProps, m_startDoc) == false)
642 	{
643 		m_errorNum = QUERY_FAILED;
644 		m_errorParam = m_engineDisplayableName;
645 	}
646 	else
647 	{
648 		const vector<DocumentInfo> &resultsList = pEngine->getResults();
649 
650 		m_documentsList.clear();
651 		m_documentsList.reserve(resultsList.size());
652 		m_documentsCount = pEngine->getResultsCountEstimate();
653 #ifdef DEBUG
654 		clog << "EngineQueryThread::doWork: " << resultsList.size() << " off " << m_documentsCount
655 			<< " results to process, starting at position " << m_startDoc << endl;
656 #endif
657 
658 		m_resultsCharset = pEngine->getResultsCharset();
659 		if (m_listingIndex == false)
660 		{
661 			processResults(resultsList);
662 		}
663 		else
664 		{
665 			processResults(resultsList,
666 				PinotSettings::getInstance().getIndexPropertiesByName(m_engineDisplayableName).m_id);
667 		}
668 
669 		// Don't spellcheck if the query was modified in any way
670 		if (m_queryProps.getModified() == false)
671 		{
672 			string correctedFreeQuery(pEngine->getSpellingCorrection());
673 
674 			// Any spelling correction ?
675 			if (correctedFreeQuery.empty() == false)
676 			{
677 				m_correctedSpelling = true;
678 				m_queryProps.setFreeQuery(correctedFreeQuery);
679 			}
680 		}
681 	}
682 
683 	delete pEngine;
684 }
685 
DownloadingThread(const DocumentInfo & docInfo)686 DownloadingThread::DownloadingThread(const DocumentInfo &docInfo) :
687 	WorkerThread(),
688 	m_docInfo(docInfo),
689 	m_pDoc(NULL),
690 	m_pDownloader(NULL)
691 {
692 }
693 
DownloadingThread()694 DownloadingThread::DownloadingThread() :
695 	WorkerThread(),
696 	m_docInfo("", "", "", ""),
697 	m_pDoc(NULL),
698 	m_pDownloader(NULL)
699 {
700 }
701 
~DownloadingThread()702 DownloadingThread::~DownloadingThread()
703 {
704 	if (m_pDoc != NULL)
705 	{
706 		delete m_pDoc;
707 	}
708 	if (m_pDownloader != NULL)
709 	{
710 		delete m_pDownloader;
711 	}
712 }
713 
getType(void) const714 string DownloadingThread::getType(void) const
715 {
716 	return "DownloadingThread";
717 }
718 
getURL(void) const719 string DownloadingThread::getURL(void) const
720 {
721 	return m_docInfo.getLocation();
722 }
723 
getDocument(void) const724 const Document *DownloadingThread::getDocument(void) const
725 {
726 	return m_pDoc;
727 }
728 
doWork(void)729 void DownloadingThread::doWork(void)
730 {
731 	Url thisUrl(m_docInfo.getLocation());
732 	bool getDownloader = true;
733 
734 	if (m_pDoc != NULL)
735 	{
736 		delete m_pDoc;
737 		m_pDoc = NULL;
738 	}
739 
740 	// Get a Downloader
741 	if (m_pDownloader != NULL)
742 	{
743 		// Same protocol as what we now need ?
744 		if (m_protocol == thisUrl.getProtocol())
745 		{
746 			getDownloader = false;
747 		}
748 		else
749 		{
750 			delete m_pDownloader;
751 			m_pDownloader = NULL;
752 			m_protocol.clear();
753 		}
754 	}
755 	if (getDownloader == true)
756 	{
757 		m_protocol = thisUrl.getProtocol();
758 		m_pDownloader = DownloaderFactory::getDownloader(m_protocol);
759 	}
760 
761 	if (m_pDownloader == NULL)
762 	{
763 		m_errorNum = UNSUPPORTED_PROTOCOL;
764 		m_errorParam = thisUrl.getProtocol();
765 	}
766 	else if (m_done == false)
767 	{
768 		Timer collectTimer;
769 		PinotSettings &settings = PinotSettings::getInstance();
770 
771 		// Set up the proxy
772 		if ((getDownloader == true) &&
773 			(settings.m_proxyEnabled == true) &&
774 			(settings.m_proxyAddress.empty() == false))
775 		{
776 			char portStr[64];
777 
778 			m_pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
779 			snprintf(portStr, 64, "%u", settings.m_proxyPort);
780 			m_pDownloader->setSetting("proxyport", portStr);
781 			m_pDownloader->setSetting("proxytype", settings.m_proxyType);
782 		}
783 
784 		collectTimer.start();
785 
786 		m_pDoc = m_pDownloader->retrieveUrl(m_docInfo);
787 
788 		clog << "Retrieved " << m_docInfo.getLocation() << " in " << collectTimer.stop() << " ms" << endl;
789 	}
790 
791 	if (m_pDoc == NULL)
792 	{
793 		m_errorNum = DOWNLOAD_FAILED;
794 		m_errorParam = m_docInfo.getLocation();
795 	}
796 }
797 
IndexingThread(const DocumentInfo & docInfo,const string & indexLocation,bool allowAllMIMETypes)798 IndexingThread::IndexingThread(const DocumentInfo &docInfo, const string &indexLocation,
799 	bool allowAllMIMETypes) :
800 	DownloadingThread(docInfo),
801 	m_pIndex(NULL),
802 	m_indexLocation(indexLocation),
803 	m_allowAllMIMETypes(allowAllMIMETypes),
804 	m_update(false),
805 	m_docId(0)
806 {
807 }
808 
~IndexingThread()809 IndexingThread::~IndexingThread()
810 {
811 	if (m_pIndex != NULL)
812 	{
813 		delete m_pIndex;
814 	}
815 }
816 
getType(void) const817 string IndexingThread::getType(void) const
818 {
819 	return "IndexingThread";
820 }
821 
getDocumentInfo(void) const822 const DocumentInfo &IndexingThread::getDocumentInfo(void) const
823 {
824 	return m_docInfo;
825 }
826 
getDocumentID(void) const827 unsigned int IndexingThread::getDocumentID(void) const
828 {
829 	return m_docId;
830 }
831 
isNewDocument(void) const832 bool IndexingThread::isNewDocument(void) const
833 {
834 	// If the thread is set to perform an update, the document isn't new
835 	if (m_update == true)
836 	{
837 		return false;
838 	}
839 	return true;
840 }
841 
doWork(void)842 void IndexingThread::doWork(void)
843 {
844 	Url thisUrl(m_docInfo.getLocation());
845 	bool reliableType = false, doDownload = true;
846 
847 	// First things first, get the index
848 	if (m_pIndex == NULL)
849 	{
850 		m_pIndex = PinotSettings::getInstance().getIndex(m_indexLocation);
851 	}
852 	if ((m_pIndex == NULL) ||
853 		(m_pIndex->isGood() == false))
854 	{
855 		m_errorNum = INDEX_ERROR;
856 		m_errorParam = m_indexLocation;
857 		return;
858 	}
859 
860 	// Is it an update ?
861 	m_docId = m_pIndex->hasDocument(m_docInfo.getLocation(true));
862 	if (m_docId > 0)
863 	{
864 		// Ignore robots directives on updates
865 		m_update = true;
866 	}
867 
868 	if (m_docInfo.getType().empty() == true)
869 	{
870 		m_docInfo.setType(MIMEScanner::scanUrl(thisUrl));
871 	}
872 	else if (thisUrl.isLocal() == true)
873 	{
874 		// There's a good chance the supplied type is accurate
875 		// if the document is a local file
876 		reliableType = true;
877 	}
878 
879 	if (m_docInfo.getIsDirectory() == true)
880 	{
881 		doDownload = false;
882 #ifdef DEBUG
883 		clog << "IndexingThread::doWork: skipping download of directory " << m_docInfo.getLocation() << endl;
884 #endif
885 	}
886 	else if (FilterUtils::isSupportedType(m_docInfo.getType()) == false)
887 	{
888 		// Skip unsupported types ?
889 		if (m_allowAllMIMETypes == false)
890 		{
891 			m_errorNum = UNSUPPORTED_TYPE;
892 			m_errorParam = m_docInfo.getType();
893 
894 			return;
895 		}
896 
897 		if (reliableType == true)
898 		{
899 			doDownload = false;
900 #ifdef DEBUG
901 			clog << "IndexingThread::doWork: skipping download of unsupported type " << m_docInfo.getLocation() << endl;
902 #endif
903 		}
904 	}
905 	else
906 	{
907 		Dijon::Filter *pFilter = FilterUtils::getFilter(m_docInfo.getType());
908 
909 		if (pFilter != NULL)
910 		{
911 			// We may be able to feed the document directly to the filter
912 			if (((pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME) == true) &&
913 				(thisUrl.getProtocol() == "file")) ||
914 				((pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_URI) == true) &&
915 				(thisUrl.isLocal() == false)))
916 			{
917 				doDownload = false;
918 #ifdef DEBUG
919 				clog << "IndexingThread::doWork: let filter download " << m_docInfo.getLocation() << endl;
920 #endif
921 			}
922 
923 			delete pFilter;
924 		}
925 	}
926 
927 	// We may not have to download the document
928 	if (doDownload == true)
929 	{
930 		DownloadingThread::doWork();
931 	}
932 	else
933 	{
934 		if (m_pDoc != NULL)
935 		{
936 			delete m_pDoc;
937 			m_pDoc = NULL;
938 		}
939 		m_pDoc = new Document(m_docInfo);
940 
941 		m_pDoc->setTimestamp(m_docInfo.getTimestamp());
942 		m_pDoc->setSize(m_docInfo.getSize());
943 	}
944 
945 	if (m_pDoc != NULL)
946 	{
947 		Timer indexTimer;
948 		string docType(m_pDoc->getType());
949 		bool success = false;
950 
951 		indexTimer.start();
952 
953 		// The type may have been obtained when downloading
954 		if (docType.empty() == false)
955 		{
956 			// Use the document's type
957 			m_docInfo.setType(docType);
958 		}
959 		else
960 		{
961 			// Use the type we were supplied with
962 			m_pDoc->setType(m_docInfo.getType());
963 		}
964 
965 		if (m_docInfo.getTitle().empty() == false)
966 		{
967 			// Use the title we were supplied with
968 			m_pDoc->setTitle(m_docInfo.getTitle());
969 		}
970 		else
971 		{
972 			// Use the document's
973 			m_docInfo.setTitle(m_pDoc->getTitle());
974 		}
975 #ifdef DEBUG
976 		clog << "IndexingThread::doWork: title is " << m_pDoc->getTitle() << endl;
977 #endif
978 
979 		// Check again as the downloader may have altered the MIME type
980 		if (FilterUtils::isSupportedType(m_docInfo.getType()) == false)
981 		{
982 			// Skip unsupported types ?
983 			if (m_allowAllMIMETypes == false)
984 			{
985 				m_errorNum = UNSUPPORTED_TYPE;
986 				m_errorParam = m_docInfo.getType();
987 
988 				return;
989 			}
990 
991 			// Let FilterWrapper handle unspported documents
992 		}
993 		else if ((PinotSettings::getInstance().m_ignoreRobotsDirectives == false) &&
994 			(thisUrl.isLocal() == false) &&
995 			(m_docInfo.getType().length() >= 9) &&
996 			(m_docInfo.getType().substr(9) == "text/html"))
997 		{
998 			Dijon::HtmlFilter htmlFilter;
999 
1000 			htmlFilter.set_mime_type(m_docInfo.getType());
1001 
1002 			if ((FilterUtils::feedFilter(*m_pDoc, &htmlFilter) == true) &&
1003 				(htmlFilter.next_document() == true))
1004 			{
1005 				const map<string, string> &metaData = htmlFilter.get_meta_data();
1006 
1007 				// See if the document has a ROBOTS META tag
1008 				map<string, string>::const_iterator robotsIter = metaData.find("robots");
1009 				if (robotsIter != metaData.end())
1010 				{
1011 					string robotsDirectives(robotsIter->second);
1012 
1013 					// Is indexing allowed ?
1014 					string::size_type pos1 = robotsDirectives.find("none");
1015 					string::size_type pos2 = robotsDirectives.find("noindex");
1016 					if ((pos1 != string::npos) ||
1017 						(pos2 != string::npos))
1018 					{
1019 						// No, it isn't
1020 						m_errorNum = ROBOTS_FORBIDDEN;
1021 						m_errorParam = m_docInfo.getLocation();
1022 
1023 						return;
1024 					}
1025 				}
1026 			}
1027 #ifdef DEBUG
1028 			else clog << "IndexingThread::doWork: couldn't check document for ROBOTS directive" << endl;
1029 #endif
1030 		}
1031 
1032 		if (m_done == false)
1033 		{
1034 			FilterWrapper wrapFilter(m_pIndex);
1035 
1036 			// Update an existing document or add to the index ?
1037 			if (m_update == true)
1038 			{
1039 				set<string> labels;
1040 
1041 				// Make sure labels are preserved
1042 				m_pIndex->getDocumentLabels(m_docId, labels);
1043 				m_pDoc->setLabels(labels);
1044 
1045 				// Update the document
1046 				if (wrapFilter.updateDocument(*m_pDoc, m_docId) == true)
1047 				{
1048 #ifdef DEBUG
1049 					clog << "IndexingThread::doWork: updated " << m_pDoc->getLocation()
1050 						<< " at " << m_docId << endl;
1051 #endif
1052 					success = true;
1053 				}
1054 #ifdef DEBUG
1055 				else clog << "IndexingThread::doWork: couldn't update " << m_pDoc->getLocation() << endl;
1056 #endif
1057 			}
1058 			else
1059 			{
1060 				unsigned int docId = 0;
1061 #ifdef DEBUG
1062 				clog << "IndexingThread::doWork: " << m_docInfo.getLabels().size()
1063 					<< " labels for URL " << m_pDoc->getLocation() << endl;
1064 #endif
1065 
1066 				// Index the document
1067 				success = wrapFilter.indexDocument(*m_pDoc, m_docInfo.getLabels(), docId);
1068 				if (success == true)
1069 				{
1070 					m_docId = docId;
1071 #ifdef DEBUG
1072 					clog << "IndexingThread::doWork: indexed " << m_pDoc->getLocation()
1073 						<< " to " << m_docId << endl;
1074 #endif
1075 				}
1076 #ifdef DEBUG
1077 				else clog << "IndexingThread::doWork: couldn't index " << m_pDoc->getLocation() << endl;
1078 #endif
1079 			}
1080 
1081 			if (success == false)
1082 			{
1083 				m_errorNum = INDEXING_FAILED;
1084 				m_errorParam = m_docInfo.getLocation();
1085 			}
1086 			else
1087 			{
1088 				// Flush the index ?
1089 				if (m_immediateFlush == true)
1090 				{
1091 					m_pIndex->flush();
1092 				}
1093 
1094 				// The document properties may have changed
1095 				m_pIndex->getDocumentInfo(m_docId, m_docInfo);
1096 				m_docInfo.setIsIndexed(
1097 					PinotSettings::getInstance().getIndexPropertiesByLocation(m_indexLocation).m_id,
1098 					m_docId);
1099 
1100 				clog << "Indexed " << m_docInfo.getLocation() << " in " << indexTimer.stop() << " ms" << endl;
1101 			}
1102 		}
1103 	}
1104 #ifdef DEBUG
1105 	else clog << "IndexingThread::doWork: couldn't download " << m_docInfo.getLocation() << endl;
1106 #endif
1107 }
1108 
UnindexingThread(const set<unsigned int> & docIdList)1109 UnindexingThread::UnindexingThread(const set<unsigned int> &docIdList) :
1110 	WorkerThread(),
1111 	m_indexLocation(PinotSettings::getInstance().m_docsIndexLocation),
1112 	m_docsCount(0)
1113 {
1114 	copy(docIdList.begin(), docIdList.end(), inserter(m_docIdList, m_docIdList.begin()));
1115 }
1116 
UnindexingThread(const set<string> & labelNames,const string & indexLocation)1117 UnindexingThread::UnindexingThread(const set<string> &labelNames, const string &indexLocation) :
1118 	WorkerThread(),
1119 	m_indexLocation(indexLocation),
1120 	m_docsCount(0)
1121 {
1122 	copy(labelNames.begin(), labelNames.end(), inserter(m_labelNames, m_labelNames.begin()));
1123 	if (indexLocation.empty() == true)
1124 	{
1125 		m_indexLocation = PinotSettings::getInstance().m_docsIndexLocation;
1126 	}
1127 }
1128 
// Nothing to release explicitly here
UnindexingThread::~UnindexingThread()
{
}
1132 
getType(void) const1133 string UnindexingThread::getType(void) const
1134 {
1135 	return "UnindexingThread";
1136 }
1137 
getDocumentsCount(void) const1138 unsigned int UnindexingThread::getDocumentsCount(void) const
1139 {
1140 	return m_docsCount;
1141 }
1142 
doWork(void)1143 void UnindexingThread::doWork(void)
1144 {
1145 	IndexInterface *pIndex = PinotSettings::getInstance().getIndex(m_indexLocation);
1146 
1147 	if ((pIndex == NULL) ||
1148 		(pIndex->isGood() == false))
1149 	{
1150 		m_errorNum = INDEX_ERROR;
1151 		m_errorParam = m_indexLocation;
1152 		if (pIndex != NULL)
1153 		{
1154 			delete pIndex;
1155 		}
1156 		return;
1157 	}
1158 
1159 	// Be pessimistic and assume something will go wrong ;-)
1160 	m_errorNum = UNINDEXING_FAILED;
1161 
1162 	// Are we supposed to remove documents based on labels ?
1163 	if (m_docIdList.empty() == true)
1164 	{
1165 		// Yep, delete documents one label at a time
1166 		for (set<string>::iterator iter = m_labelNames.begin(); iter != m_labelNames.end(); ++iter)
1167 		{
1168 			string labelName = (*iter);
1169 
1170 			// By unindexing all documents that match the label,
1171 			// we effectively delete the label from the index
1172 			if (pIndex->unindexDocuments(labelName, IndexInterface::BY_LABEL) == true)
1173 			{
1174 #ifdef DEBUG
1175 				clog << "UnindexingThread::doWork: removed label " << labelName << endl;
1176 #endif
1177 				// OK
1178 				++m_docsCount;
1179 			}
1180 #ifdef DEBUG
1181 			else clog << "UnindexingThread::doWork: couldn't remove label " << labelName << endl;
1182 #endif
1183 		}
1184 
1185 		// Nothing to report
1186 		m_errorNum = 0;
1187 	}
1188 	else
1189 	{
1190 		for (set<unsigned int>::iterator iter = m_docIdList.begin(); iter != m_docIdList.end(); ++iter)
1191 		{
1192 			unsigned int docId = (*iter);
1193 
1194 			if (pIndex->unindexDocument(docId) == true)
1195 			{
1196 #ifdef DEBUG
1197 				clog << "UnindexingThread::doWork: removed " << docId << endl;
1198 #endif
1199 				// OK
1200 				++m_docsCount;
1201 			}
1202 #ifdef DEBUG
1203 			else clog << "UnindexingThread::doWork: couldn't remove " << docId << endl;
1204 #endif
1205 		}
1206 #ifdef DEBUG
1207 		clog << "UnindexingThread::doWork: removed " << m_docsCount << " documents" << endl;
1208 #endif
1209 	}
1210 
1211 	if (m_docsCount > 0)
1212 	{
1213 		// Flush the index ?
1214 		if (m_immediateFlush == true)
1215 		{
1216 			pIndex->flush();
1217 		}
1218 
1219 		// Nothing to report
1220 		m_errorNum = 0;
1221 	}
1222 
1223 	delete pIndex;
1224 }
1225 
// Hands the monitor and handler over to MonitorThread, and builds the crawl
// history from the history database name found in PinotSettings.
HistoryMonitorThread::HistoryMonitorThread(MonitorInterface *pMonitor, MonitorHandler *pHandler) :
	MonitorThread(pMonitor, pHandler),
	m_crawlHistory(PinotSettings::getInstance().getHistoryDatabaseName())
{
}
1231 
// Nothing to release explicitly here
HistoryMonitorThread::~HistoryMonitorThread()
{
}
1235 
isFileBlacklisted(const string & location)1236 bool HistoryMonitorThread::isFileBlacklisted(const string &location)
1237 {
1238 	return PinotSettings::getInstance().isBlackListed(location);
1239 }
1240 
fileModified(const string & location)1241 void HistoryMonitorThread::fileModified(const string &location)
1242 {
1243 	CrawlHistory::CrawlStatus status = CrawlHistory::UNKNOWN;
1244 	struct stat fileStat;
1245 	time_t itemDate = 0;
1246 
1247 	if (m_crawlHistory.hasItem("file://" + location, status, itemDate) == true)
1248 	{
1249 		// Was the file actually modified ?
1250 		if ((stat(location.c_str(), &fileStat) == 0) &&
1251 				(itemDate < fileStat.st_mtime))
1252 		{
1253 			m_pHandler->fileModified(location);
1254 		}
1255 #ifdef DEBUG
1256 		else clog << "HistoryMonitorThread::fileModified: file wasn't modified" << endl;
1257 #endif
1258 	}
1259 #ifdef DEBUG
1260 	else clog << "HistoryMonitorThread::fileModified: file wasn't crawled" << endl;
1261 #endif
1262 }
1263 
DirectoryScannerThread(const DocumentInfo & docInfo,const string & indexLocation,unsigned int maxLevel,bool inlineIndexing,bool followSymLinks)1264 DirectoryScannerThread::DirectoryScannerThread(const DocumentInfo &docInfo,
1265 	const string &indexLocation, unsigned int maxLevel,
1266 	bool inlineIndexing, bool followSymLinks) :
1267 	IndexingThread(docInfo, indexLocation),
1268 	m_currentLevel(0),
1269 	m_maxLevel(maxLevel),
1270 	m_inlineIndexing(inlineIndexing),
1271 	m_followSymLinks(followSymLinks)
1272 {
1273 	Url urlObj(docInfo.getLocation());
1274 
1275 	m_dirName = urlObj.getLocation() + "/" + urlObj.getFile();
1276 }
1277 
// Nothing to release explicitly here
DirectoryScannerThread::~DirectoryScannerThread()
{
}
1281 
getType(void) const1282 string DirectoryScannerThread::getType(void) const
1283 {
1284 	if (m_inlineIndexing == true)
1285 	{
1286 		return IndexingThread::getType();
1287 	}
1288 
1289 	return "DirectoryScannerThread";
1290 }
1291 
getDirectory(void) const1292 string DirectoryScannerThread::getDirectory(void) const
1293 {
1294 	return m_dirName;
1295 }
1296 
stop(void)1297 void DirectoryScannerThread::stop(void)
1298 {
1299 	// Disconnect the signal
1300 	sigc::signal2<void, DocumentInfo, bool>::slot_list_type slotsList = m_signalFileFound.slots();
1301 	sigc::signal2<void, DocumentInfo, bool>::slot_list_type::iterator slotIter = slotsList.begin();
1302 	if (slotIter != slotsList.end())
1303 	{
1304 		if (slotIter->empty() == false)
1305 		{
1306 			slotIter->block();
1307 			slotIter->disconnect();
1308 		}
1309 	}
1310 	WorkerThread::stop();
1311 }
1312 
getFileFoundSignal(void)1313 sigc::signal2<void, DocumentInfo, bool>& DirectoryScannerThread::getFileFoundSignal(void)
1314 {
1315 	return m_signalFileFound;
1316 }
1317 
// Called by scanEntry() with the entry's location and modification time, once
// the entry has been scanned without error.
// This default implementation does nothing; presumably a hook for subclasses
// that keep a crawl history - confirm against the class declaration.
void DirectoryScannerThread::recordCrawled(const string &location, time_t itemDate)
{
	// Nothing to do by default
}
1322 
isIndexable(const string & entryName) const1323 bool DirectoryScannerThread::isIndexable(const string &entryName) const
1324 {
1325 	string entryDir(path_get_dirname(entryName) + "/");
1326 
1327 	// Is this under the directory being scanned ?
1328 	if ((entryDir.length() >= m_dirName.length()) &&
1329 		(entryDir.substr(0, m_dirName.length()) == m_dirName))
1330 	{
1331 		// Yes, it is
1332 #ifdef DEBUG
1333 		clog << "DirectoryScannerThread::isIndexable: under " << m_dirName << endl;
1334 #endif
1335 		return true;
1336 	}
1337 
1338 	return false;
1339 }
1340 
// Called by scanEntry() to find out whether a location was crawled before.
// This default implementation always returns false and leaves itemDate
// untouched; presumably a hook for subclasses that keep a crawl history -
// confirm against the class declaration.
bool DirectoryScannerThread::wasCrawled(const string &location, time_t &itemDate)
{
	// This information is unknown
	return false;
}
1346 
// Called by scanEntry() for every entry, right after wasCrawled(), with that
// call's result passed as itemExists.
// This default implementation does nothing and leaves itemDate untouched;
// presumably a hook for subclasses - confirm against the class declaration.
void DirectoryScannerThread::recordCrawling(const string &location, bool itemExists, time_t &itemDate)
{
	// Nothing to do by default
}
1351 
// Called by scanEntry() when an entry's status is not 0, with that status
// as errorCode.
// This default implementation does nothing; presumably a hook for
// subclasses - confirm against the class declaration.
void DirectoryScannerThread::recordError(const string &location, int errorCode)
{
	// Nothing to do by default
}
1356 
// Called by scanEntry() with the location a symlink refers to, just before
// that symlink is followed.
// This default implementation does nothing; presumably a hook for
// subclasses - confirm against the class declaration.
void DirectoryScannerThread::recordSymlink(const string &location, time_t itemDate)
{
	// Nothing to do by default
}
1361 
// Called by scanEntry() on a directory that was just opened, before its
// entries are read. A false return is recorded there as MONITORING_FAILED.
// This default implementation does nothing and reports success; presumably
// a hook for subclasses - confirm against the class declaration.
bool DirectoryScannerThread::monitorEntry(const string &entryName)
{
	// Nothing to do by default
	return true;
}
1367 
// Counterpart of monitorEntry().
// This default implementation does nothing; presumably a hook for
// subclasses - confirm against the class declaration.
void DirectoryScannerThread::unmonitorEntry(const string &entryName)
{
	// Nothing to do by default
}
1372 
foundFile(const DocumentInfo & docInfo)1373 void DirectoryScannerThread::foundFile(const DocumentInfo &docInfo)
1374 {
1375 	if ((docInfo.getLocation().empty() == true) ||
1376 		(m_done == true))
1377 	{
1378 		return;
1379 	}
1380 
1381 	if (m_inlineIndexing == true)
1382 	{
1383 		// Reset base class members
1384 		m_docInfo = docInfo;
1385 		m_docId = 0;
1386 		m_update = false;
1387 
1388 		IndexingThread::doWork();
1389 #ifdef DEBUG
1390 		clog << "DirectoryScannerThread::foundFile: indexed " << docInfo.getLocation() << " to " << m_docId << endl;
1391 #endif
1392 	}
1393 	else
1394 	{
1395 		// Delegate indexing
1396 		// Report everything as file to avoid triggering another crawl
1397 		m_signalFileFound(docInfo, false);
1398 	}
1399 }
1400 
scanEntry(const string & entryName,int & entryStatus,bool statLinks)1401 bool DirectoryScannerThread::scanEntry(const string &entryName,
1402 	int &entryStatus, bool statLinks)
1403 {
1404 	string location("file://" + entryName);
1405 	DocumentInfo docInfo("", location, "", "");
1406 	time_t itemDate = time(NULL);
1407 	struct stat fileStat;
1408 	bool scanSuccess = true, reportFile = false, itemExists = false;
1409 
1410 	if (entryName.empty() == true)
1411 	{
1412 		return false;
1413 	}
1414 
1415 	// Skip . .. and dotfiles
1416 	Url urlObj(location);
1417 	if (urlObj.getFile()[0] == '.')
1418 	{
1419 #ifdef DEBUG
1420 		clog << "DirectoryScannerThread::scanEntry: skipped dotfile " << urlObj.getFile() << endl;
1421 #endif
1422 		return false;
1423 	}
1424 #ifdef DEBUG
1425 	clog << "DirectoryScannerThread::scanEntry: checking " << entryName << endl;
1426 #endif
1427 
1428 #ifdef HAVE_LSTAT
1429 	// Stat links, or the stuff it refers to ?
1430 	if (statLinks == true)
1431 	{
1432 		entryStatus = lstat(entryName.c_str(), &fileStat);
1433 	}
1434 	else
1435 	{
1436 #endif
1437 		entryStatus = stat(entryName.c_str(), &fileStat);
1438 #ifdef HAVE_LSTAT
1439 	}
1440 #endif
1441 
1442 	if (entryStatus == -1)
1443 	{
1444 		entryStatus = errno;
1445 		scanSuccess = false;
1446 #ifdef DEBUG
1447 		clog << "DirectoryScannerThread::scanEntry: stat failed with error " << entryStatus << endl;
1448 #endif
1449 	}
1450 #ifdef HAVE_LSTAT
1451 	// Special processing applies if it's a symlink
1452 	else if (S_ISLNK(fileStat.st_mode))
1453 	{
1454 		string realEntryName(entryName);
1455 		string entryNameReferree;
1456 		bool isInIndexableLocation = false;
1457 
1458 		// If symlinks are followed, check if this symlink is blacklisted
1459 		if ((m_followSymLinks == false) ||
1460 			(PinotSettings::getInstance().isBlackListed(entryName) == true))
1461 		{
1462 #ifdef DEBUG
1463 			clog << "DirectoryScannerThread::scanEntry: skipped symlink " << entryName << endl;
1464 #endif
1465 			return false;
1466 		}
1467 
1468 		// Are we already following a symlink to a directory ?
1469 		if (m_currentLinks.empty() == false)
1470 		{
1471 			string linkToDir(m_currentLinks.top() + "/");
1472 
1473 			// Yes, we are
1474 			if ((entryName.length() > linkToDir.length()) &&
1475 				(entryName.substr(0, linkToDir.length()) == linkToDir))
1476 			{
1477 				// ...and this entry is below it
1478 				realEntryName.replace(0, linkToDir.length() - 1, m_currentLinkReferrees.top());
1479 #ifdef DEBUG
1480 				clog << "DirectoryScannerThread::scanEntry: really at " << realEntryName << endl;
1481 #endif
1482 				isInIndexableLocation = isIndexable(realEntryName);
1483 			}
1484 		}
1485 
1486 		char *pBuf = g_file_read_link(realEntryName.c_str(), NULL);
1487 		if (pBuf != NULL)
1488 		{
1489 			string linkLocation(filename_to_utf8(pBuf));
1490 			if (path_is_absolute(linkLocation) == true)
1491 			{
1492 				entryNameReferree = linkLocation;
1493 			}
1494 			else
1495 			{
1496 				string entryDir(path_get_dirname(realEntryName));
1497 
1498 				entryNameReferree = Url::resolvePath(entryDir, linkLocation);
1499 			}
1500 
1501 			if (entryNameReferree[entryNameReferree.length() - 1] == '/')
1502 			{
1503 				// Drop the terminating slash
1504 				entryNameReferree.resize(entryNameReferree.length() - 1);
1505 			}
1506 #ifdef DEBUG
1507 			clog << "DirectoryScannerThread::scanEntry: symlink resolved to " << entryNameReferree << endl;
1508 #endif
1509 
1510 			g_free(pBuf);
1511 		}
1512 
1513 		string referreeLocation("file://" + entryNameReferree);
1514 		time_t referreeItemDate;
1515 
1516 		// Check whether this will be, or has already been crawled
1517 		// Referrees in indexable locations will be indexed later on
1518 		if ((isInIndexableLocation == false) &&
1519 			(isIndexable(entryNameReferree) == false) &&
1520 			(wasCrawled(referreeLocation, referreeItemDate) == false))
1521 		{
1522 			m_currentLinks.push(entryName);
1523 			m_currentLinkReferrees.push(entryNameReferree);
1524 
1525 			// Add a dummy entry for this referree
1526 			// It will ensure it's not indexed more than once and it shouldn't do any harm
1527 			recordSymlink(referreeLocation, itemDate);
1528 
1529 			// Do it again, this time by stat'ing what the link refers to
1530 			bool scannedReferree = scanEntry(entryName, entryStatus, false);
1531 
1532 			m_currentLinks.pop();
1533 			m_currentLinkReferrees.pop();
1534 
1535 			return scannedReferree;
1536 		}
1537 		else
1538 		{
1539 			clog << "Skipping " << entryName << ": it links to " << entryNameReferree
1540 				<< " which will be crawled, or has already been crawled" << endl;
1541 
1542 			// This should ensure that only metadata is indexed
1543 			docInfo.setType("inode/symlink");
1544 			reportFile = true;
1545 		}
1546 	}
1547 #endif
1548 
1549 	// Is this item in the database already ?
1550 	itemExists = wasCrawled(location, itemDate);
1551 	// Put it in if necessary
1552 	recordCrawling(location, itemExists, itemDate);
1553 
1554 	// If stat'ing didn't fail, see if it's a file or a directory
1555 	if ((entryStatus == 0) &&
1556 		(S_ISREG(fileStat.st_mode)))
1557 	{
1558 		// Is this file blacklisted ?
1559 		// We have to check early so that if necessary the file's status stays at TO_CRAWL
1560 		// and it is removed from the index at the end of this crawl
1561 		if (PinotSettings::getInstance().isBlackListed(entryName) == false)
1562 		{
1563 			reportFile = true;
1564 		}
1565 	}
1566 	else if ((entryStatus == 0) &&
1567 		(S_ISDIR(fileStat.st_mode)))
1568 	{
1569 		docInfo.setType("x-directory/normal");
1570 
1571 		// Can we scan this directory ?
1572 		if (((m_maxLevel == 0) ||
1573 			(m_currentLevel < m_maxLevel)) &&
1574 			(PinotSettings::getInstance().isBlackListed(entryName) == false))
1575 		{
1576 			++m_currentLevel;
1577 
1578 			// Open the directory
1579 			DIR *pDir = opendir(entryName.c_str());
1580 			if (pDir != NULL)
1581 			{
1582 				// Monitor first so that we don't miss events
1583 				// If monitoring is not possible, record the first case
1584 				if ((monitorEntry(entryName) == false) &&
1585 					(entryStatus != MONITORING_FAILED))
1586 				{
1587 					entryStatus = MONITORING_FAILED;
1588 				}
1589 #ifdef DEBUG
1590 				clog << "DirectoryScannerThread::scanEntry: entering " << entryName << endl;
1591 #endif
1592 
1593 				// Iterate through this directory's entries
1594 				struct dirent *pDirEntry = readdir(pDir);
1595 				while ((m_done == false) &&
1596 					(pDirEntry != NULL))
1597 				{
1598 					char *pEntryName = pDirEntry->d_name;
1599 
1600 					// Skip . .. and dotfiles
1601 					if ((pEntryName != NULL) &&
1602 						(pEntryName[0] != '.'))
1603 					{
1604 						string subEntryName(entryName);
1605 						int subEntryStatus = 0;
1606 
1607 						if (entryName[entryName.length() - 1] != '/')
1608 						{
1609 							subEntryName += "/";
1610 						}
1611 						subEntryName += pEntryName;
1612 
1613 						// Scan this entry
1614 						scanEntry(subEntryName, subEntryStatus);
1615 					}
1616 
1617 					// Next entry
1618 					pDirEntry = readdir(pDir);
1619 				}
1620 #ifdef DEBUG
1621 				clog << "DirectoryScannerThread::scanEntry: leaving " << entryName << endl;
1622 #endif
1623 
1624 				// Close the directory
1625 				closedir(pDir);
1626 				--m_currentLevel;
1627 				reportFile = true;
1628 			}
1629 			else
1630 			{
1631 				entryStatus = errno;
1632 				scanSuccess = false;
1633 #ifdef DEBUG
1634 				clog << "DirectoryScannerThread::scanEntry: opendir failed with error " << entryStatus << endl;
1635 #endif
1636 			}
1637 		}
1638 	}
1639 	// Is it some unknown type ?
1640 	else if ((entryStatus == 0)
1641 #ifdef HAVE_LSTAT
1642 		&& (!S_ISLNK(fileStat.st_mode))
1643 #endif
1644 		)
1645 	{
1646 #ifdef DEBUG
1647 		clog << "DirectoryScannerThread::scanEntry: unknown entry type" << endl;
1648 #endif
1649 		entryStatus = ENOENT;
1650 		scanSuccess = false;
1651 	}
1652 
1653 	// Was it modified after the last crawl ?
1654 	if ((itemExists == true) &&
1655 		(itemDate >= fileStat.st_mtime))
1656 	{
1657 		// No, it wasn't
1658 #ifdef DEBUG
1659 		clog << "DirectoryScannerThread::scanEntry: no change to " << location << endl;
1660 #endif
1661 		reportFile = false;
1662 	}
1663 
1664 	if (m_done == true)
1665 	{
1666 		// Don't record or report the file
1667 		reportFile = false;
1668 	}
1669 	// Did an error occur ?
1670 	else if (entryStatus != 0)
1671 	{
1672 		// Record this error
1673 		recordError(location, entryStatus);
1674 
1675 		if (scanSuccess == false)
1676 		{
1677 			return scanSuccess;
1678 		}
1679 	}
1680 	// History of new or modified files, especially their timestamp, is always updated
1681 	// Others' are updated only if we are doing a full scan because
1682 	// the status has to be reset to CRAWLED, so that they are not unindexed
1683 	else if ((itemExists == false) ||
1684 		(reportFile == true))
1685 	{
1686 		recordCrawled(location, fileStat.st_mtime);
1687 	}
1688 
1689 	// If a major error occurred, this won't be true
1690 	if (reportFile == true)
1691 	{
1692 		if (docInfo.getType().empty() == true)
1693 		{
1694 			// Scan the file
1695 			docInfo.setType(MIMEScanner::scanFile(entryName));
1696 		}
1697 		docInfo.setTimestamp(TimeConverter::toTimestamp(fileStat.st_mtime));
1698 		docInfo.setSize(fileStat.st_size);
1699 
1700 		foundFile(docInfo);
1701 	}
1702 
1703 	return scanSuccess;
1704 }
1705 
doWork(void)1706 void DirectoryScannerThread::doWork(void)
1707 {
1708 	Timer scanTimer;
1709 	int entryStatus = 0;
1710 
1711 	if (m_dirName.empty() == true)
1712 	{
1713 		return;
1714 	}
1715 	scanTimer.start();
1716 
1717 	if (scanEntry(m_dirName, entryStatus) == false)
1718 	{
1719 		if (entryStatus == 0)
1720 		{
1721 			m_errorNum = OPENDIR_FAILED;
1722 		}
1723 		else
1724 		{
1725 			m_errorNum = entryStatus;
1726 		}
1727 		m_errorParam = m_dirName;
1728 	}
1729 	clog << "Scanned " << m_dirName << " in " << scanTimer.stop() << " ms" << endl;
1730 }
1731 
1732