1 /*
2 * Copyright 2005-2021 Fabrice Colin
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18
19 #include <sys/types.h>
20 #include <dirent.h>
21 #include <sys/stat.h>
22 #include <unistd.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <fcntl.h>
26 #include <string.h>
27 #include <signal.h>
28 #include <time.h>
29 #include <errno.h>
30 #ifdef __OpenBSD__
31 #include <sys/param.h>
32 #include <sys/sysctl.h>
33 #endif
34 #include <exception>
35 #include <iostream>
36 #include <fstream>
37 #include <algorithm>
38 #include <glibmm/miscutils.h>
39 #include <glibmm/convert.h>
40 #include <glibmm/exception.h>
41
42 #include "config.h"
43 #include "NLS.h"
44 #include "Languages.h"
45 #include "MIMEScanner.h"
46 #include "TimeConverter.h"
47 #include "Timer.h"
48 #include "Url.h"
49 #include "HtmlFilter.h"
50 #include "FilterUtils.h"
51 #include "DownloaderFactory.h"
52 #include "FilterWrapper.h"
53 #include "ModuleFactory.h"
54 #include "WebEngine.h"
55 #include "WorkerThreads.h"
56
57 using namespace std;
58 using namespace Glib;
59
QueueManager(const string & defaultIndexLocation,unsigned int maxThreadsTime,bool scanLocalFiles)60 QueueManager::QueueManager(const string &defaultIndexLocation,
61 unsigned int maxThreadsTime, bool scanLocalFiles) :
62 ThreadsManager(defaultIndexLocation, maxThreadsTime),
63 m_scanLocalFiles(scanLocalFiles),
64 m_stopIndexing(false),
65 m_actionQueue(PinotSettings::getInstance().getHistoryDatabaseName(), get_application_name())
66 {
67 }
68
~QueueManager()69 QueueManager::~QueueManager()
70 {
71 }
72
index_document(const DocumentInfo & docInfo)73 ustring QueueManager::index_document(const DocumentInfo &docInfo)
74 {
75 string location(docInfo.getLocation());
76
77 #ifdef DEBUG
78 clog << "ThreadsManager::index_document: called with " << location << endl;
79 #endif
80 if (m_stopIndexing == true)
81 {
82 #ifdef DEBUG
83 clog << "ThreadsManager::index_document: stopped indexing" << endl;
84 #endif
85 return _("Indexing was stopped");
86 }
87
88 if (location.empty() == true)
89 {
90 // Nothing to do
91 return "";
92 }
93
94 // If the document is a mail message, we can't index it again
95 Url urlObj(location);
96 if (urlObj.getProtocol() == "mailbox")
97 {
98 return _("Can't index mail here");
99 }
100
101 // Is the document being indexed/updated ?
102 if (write_lock_lists() == true)
103 {
104 bool beingProcessed = true;
105
106 if (m_beingIndexed.find(location) == m_beingIndexed.end())
107 {
108 m_beingIndexed.insert(location);
109 beingProcessed = false;
110 }
111
112 unlock_lists();
113
114 if (beingProcessed == true)
115 {
116 // FIXME: we may have to set labels on this document
117 // FIXME: fix this for RTL languages
118 ustring status(location);
119 status += " ";
120 status += _("is already being indexed");
121 return status;
122 }
123 }
124
125 // Is the document blacklisted ?
126 if (PinotSettings::getInstance().isBlackListed(location) == true)
127 {
128 // FIXME: fix this for RTL languages
129 ustring status(location);
130 status += " ";
131 status += _("is blacklisted");
132 return status;
133 }
134
135 if ((m_scanLocalFiles == true) &&
136 (urlObj.isLocal() == true))
137 {
138 #ifdef DEBUG
139 clog << "ThreadsManager::index_document: scanning " <<
140 urlObj.getLocation() + "/" + urlObj.getFile() << endl;
141 #endif
142 // This handles both directories and files
143 start_thread(new DirectoryScannerThread(docInfo,
144 m_defaultIndexLocation, 0, true, true));
145 }
146 else
147 {
148 start_thread(new IndexingThread(docInfo, m_defaultIndexLocation));
149 }
150
151 return "";
152 }
153
clear_queues(void)154 void QueueManager::clear_queues(void)
155 {
156 if (write_lock_lists() == true)
157 {
158 m_beingIndexed.clear();
159
160 unlock_lists();
161
162 m_actionQueue.expireItems(time(NULL));
163 }
164 }
165
queue_index(const DocumentInfo & docInfo)166 ustring QueueManager::queue_index(const DocumentInfo &docInfo)
167 {
168 bool addToQueue = false;
169
170 if (get_threads_count() >= m_maxIndexThreads)
171 {
172 #ifdef DEBUG
173 clog << "QueueManager::queue_index: too many threads" << endl;
174 #endif
175 addToQueue = true;
176 }
177 #ifdef HAVE_GETLOADAVG
178 // Get the load averaged over the last minute
179 else
180 {
181 double averageLoad[3];
182
183 if (getloadavg(averageLoad, 3) != -1)
184 {
185 // FIXME: is LOADAVG_1MIN Solaris specific ?
186 if (averageLoad[0] >= (double)m_numCPUs * 4)
187 {
188 // Don't add to the load, queue this
189 addToQueue = true;
190 }
191 }
192 }
193 #endif
194
195 if (addToQueue == true)
196 {
197 m_actionQueue.pushItem(ActionQueue::INDEX, docInfo);
198
199 return "";
200 }
201
202 return index_document(docInfo);
203 }
204
pop_queue(const string & urlWasIndexed)205 bool QueueManager::pop_queue(const string &urlWasIndexed)
206 {
207 bool getItem = true;
208 bool emptyQueue = false;
209
210 #ifdef DEBUG
211 clog << "QueueManager::pop_queue: called with " << urlWasIndexed << endl;
212 #endif
213 if (get_threads_count() >= m_maxIndexThreads)
214 {
215 #ifdef DEBUG
216 clog << "QueueManager::pop_queue: too many threads" << endl;
217 #endif
218 getItem = false;
219 }
220
221 if (write_lock_lists() == true)
222 {
223 // Update the in-progress list
224 if (urlWasIndexed.empty() == false)
225 {
226 set<string>::iterator urlIter = m_beingIndexed.find(urlWasIndexed);
227 if (urlIter != m_beingIndexed.end())
228 {
229 m_beingIndexed.erase(urlIter);
230 }
231 }
232
233 unlock_lists();
234
235 // Get an item ?
236 if (getItem == true)
237 {
238 ActionQueue::ActionType type;
239 DocumentInfo docInfo;
240 string previousLocation;
241
242 // Assume the queue is empty
243 emptyQueue = true;
244
245 while (m_actionQueue.popItem(type, docInfo) == true)
246 {
247 ustring status;
248
249 if (type != ActionQueue::INDEX)
250 {
251 continue;
252 }
253
254 // The queue isn't actually empty
255 emptyQueue = false;
256
257 if (docInfo.getLocation() == previousLocation)
258 {
259 // Something dodgy is going on, we got the same item twice !
260 // FIXME: fix this for RTL languages
261 status = previousLocation;
262 status += " ";
263 status += _("is already being indexed");
264 }
265 else
266 {
267 status = index_document(docInfo);
268 }
269
270 if (status.empty() == true)
271 {
272 break;
273 }
274
275 previousLocation = docInfo.getLocation();
276 }
277 }
278 }
279
280 return emptyQueue;
281 }
282
ListerThread(const PinotSettings::IndexProperties & indexProps,unsigned int startDoc)283 ListerThread::ListerThread(const PinotSettings::IndexProperties &indexProps,
284 unsigned int startDoc) :
285 WorkerThread(),
286 m_indexProps(indexProps),
287 m_startDoc(startDoc),
288 m_documentsCount(0)
289 {
290 }
291
~ListerThread()292 ListerThread::~ListerThread()
293 {
294 }
295
getType(void) const296 string ListerThread::getType(void) const
297 {
298 return "ListerThread";
299 }
300
getIndexProperties(void) const301 PinotSettings::IndexProperties ListerThread::getIndexProperties(void) const
302 {
303 return m_indexProps;
304 }
305
getStartDoc(void) const306 unsigned int ListerThread::getStartDoc(void) const
307 {
308 return m_startDoc;
309 }
310
getDocuments(void) const311 const vector<DocumentInfo> &ListerThread::getDocuments(void) const
312 {
313 return m_documentsList;
314 }
315
getDocumentsCount(void) const316 unsigned int ListerThread::getDocumentsCount(void) const
317 {
318 return m_documentsCount;
319 }
320
QueryingThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,unsigned int startDoc,bool listingIndex)321 QueryingThread::QueryingThread(const PinotSettings::IndexProperties &indexProps,
322 const QueryProperties &queryProps, unsigned int startDoc, bool listingIndex) :
323 ListerThread(indexProps, startDoc),
324 m_engineName(PinotSettings::getInstance().m_defaultBackend),
325 m_engineDisplayableName(indexProps.m_name),
326 m_engineOption(indexProps.m_location),
327 m_queryProps(queryProps),
328 m_listingIndex(listingIndex),
329 m_correctedSpelling(false),
330 m_isLive(true)
331 {
332 #ifdef DEBUG
333 clog << "QueryingThread: engine " << m_engineName << ", " << m_engineOption
334 << ", mode " << m_listingIndex << endl;
335 #endif
336 }
337
QueryingThread(const string & engineName,const string & engineDisplayableName,const string & engineOption,const QueryProperties & queryProps,unsigned int startDoc)338 QueryingThread::QueryingThread(const string &engineName, const string &engineDisplayableName,
339 const string &engineOption, const QueryProperties &queryProps,
340 unsigned int startDoc) :
341 ListerThread(PinotSettings::IndexProperties(engineDisplayableName, engineOption, 0, false), startDoc),
342 m_engineName(engineName),
343 m_engineDisplayableName(engineDisplayableName),
344 m_engineOption(engineOption),
345 m_queryProps(queryProps),
346 m_listingIndex(false),
347 m_correctedSpelling(false),
348 m_isLive(true)
349 {
350 #ifdef DEBUG
351 clog << "QueryingThread: engine " << m_engineName << ", " << m_engineOption
352 << ", mode 0" << endl;
353 #endif
354 }
355
~QueryingThread()356 QueryingThread::~QueryingThread()
357 {
358 }
359
getType(void) const360 string QueryingThread::getType(void) const
361 {
362 if (m_listingIndex == true)
363 {
364 return ListerThread::getType();
365 }
366
367 return "QueryingThread";
368 }
369
isLive(void) const370 bool QueryingThread::isLive(void) const
371 {
372 return m_isLive;
373 }
374
getEngineName(void) const375 string QueryingThread::getEngineName(void) const
376 {
377 return m_engineDisplayableName;
378 }
379
getQuery(bool & wasCorrected) const380 QueryProperties QueryingThread::getQuery(bool &wasCorrected) const
381 {
382 wasCorrected = m_correctedSpelling;
383 return m_queryProps;
384 }
385
getCharset(void) const386 string QueryingThread::getCharset(void) const
387 {
388 return m_resultsCharset;
389 }
390
findPlugin(void)391 bool QueryingThread::findPlugin(void)
392 {
393 string pluginName;
394
395 if ((m_engineName.empty() == true) &&
396 (m_engineOption.empty() == false))
397 {
398 pluginName = m_engineOption;
399 }
400 else if ((m_engineName.empty() == false) &&
401 (m_engineOption.empty() == true))
402 {
403 pluginName = m_engineName;
404 }
405
406 if (pluginName.empty() == false)
407 {
408 set<ModuleProperties> engines;
409 PinotSettings::getInstance().getSearchEngines(engines, "");
410 #ifdef DEBUG
411 clog << "QueryingThread::findPlugin: looking for a plugin named " << pluginName << endl;
412 #endif
413
414 // Is there a plugin with such a name ?
415 ModuleProperties modProps("sherlock", pluginName, "", "");
416 set<ModuleProperties>::const_iterator engineIter = engines.find(modProps);
417 if (engineIter == engines.end())
418 {
419 // Try again
420 modProps.m_name = "opensearch";
421 engineIter = engines.find(modProps);
422 }
423
424 if (engineIter != engines.end())
425 {
426 // Yes, there is !
427 m_engineName = engineIter->m_name;
428 m_engineDisplayableName = engineIter->m_longName;
429 m_engineOption = engineIter->m_option;
430 #ifdef DEBUG
431 clog << "QueryingThread::findPlugin: found " << m_engineName << ", " << m_engineDisplayableName << ", " << m_engineOption << endl;
432 #endif
433
434 return true;
435 }
436 }
437
438 return false;
439 }
440
EngineQueryThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,unsigned int startDoc,bool listingIndex)441 EngineQueryThread::EngineQueryThread(const PinotSettings::IndexProperties &indexProps,
442 const QueryProperties &queryProps, unsigned int startDoc, bool listingIndex) :
443 QueryingThread(indexProps, queryProps, startDoc, listingIndex)
444 {
445 }
446
EngineQueryThread(const PinotSettings::IndexProperties & indexProps,const QueryProperties & queryProps,const set<string> & limitToDocsSet,unsigned int startDoc)447 EngineQueryThread::EngineQueryThread(const PinotSettings::IndexProperties &indexProps,
448 const QueryProperties &queryProps, const set<string> &limitToDocsSet,
449 unsigned int startDoc) :
450 QueryingThread(indexProps, queryProps, startDoc, false)
451 {
452 copy(limitToDocsSet.begin(), limitToDocsSet.end(),
453 inserter(m_limitToDocsSet, m_limitToDocsSet.begin()));
454 }
455
EngineQueryThread(const string & engineName,const string & engineDisplayableName,const string & engineOption,const QueryProperties & queryProps,unsigned int startDoc)456 EngineQueryThread::EngineQueryThread(const string &engineName, const string &engineDisplayableName,
457 const string &engineOption, const QueryProperties &queryProps, unsigned int startDoc) :
458 QueryingThread(engineName, engineDisplayableName, engineOption, queryProps, startDoc)
459 {
460 }
461
~EngineQueryThread()462 EngineQueryThread::~EngineQueryThread()
463 {
464 }
465
processResults(const vector<DocumentInfo> & resultsList)466 void EngineQueryThread::processResults(const vector<DocumentInfo> &resultsList)
467 {
468 PinotSettings &settings = PinotSettings::getInstance();
469 IndexInterface *pDocsIndex = NULL;
470 IndexInterface *pDaemonIndex = NULL;
471 unsigned int indexId = 0;
472 bool isIndexQuery = false;
473
474 // Are we querying an index ?
475 if (ModuleFactory::isSupported(m_engineName, true) == true)
476 {
477 // Internal index ?
478 if ((m_engineOption == settings.m_docsIndexLocation) ||
479 (m_engineOption == settings.m_daemonIndexLocation))
480 {
481 indexId = settings.getIndexPropertiesByLocation(m_engineOption).m_id;
482 isIndexQuery = true;
483 }
484 }
485
486 // Will we have to query internal indices ?
487 if (isIndexQuery == false)
488 {
489 pDocsIndex = settings.getIndex(settings.m_docsIndexLocation);
490 pDaemonIndex = settings.getIndex(settings.m_daemonIndexLocation);
491 }
492
493 // Copy the results list
494 for (vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
495 resultIter != resultsList.end(); ++resultIter)
496 {
497 DocumentInfo currentDoc(*resultIter);
498 string title(_("No title"));
499 string location(currentDoc.getLocation(true));
500 string language(currentDoc.getLanguage());
501 unsigned int docId = 0;
502
503 // The title may contain formatting
504 if (currentDoc.getTitle().empty() == false)
505 {
506 title = FilterUtils::stripMarkup(currentDoc.getTitle());
507 }
508 currentDoc.setTitle(title);
509 #ifdef DEBUG
510 clog << "EngineQueryThread::processResults: title is " << title << endl;
511 #endif
512
513 // Use the query's language if the result's is unknown
514 if (language.empty() == true)
515 {
516 language = m_queryProps.getStemmingLanguage();
517 }
518 currentDoc.setLanguage(language);
519
520 if (isIndexQuery == true)
521 {
522 unsigned int tmpId = 0;
523
524 // The index engine should have set this
525 docId = currentDoc.getIsIndexed(tmpId);
526 }
527
528 // Is this in one of the indexes ?
529 if ((pDocsIndex != NULL) &&
530 (pDocsIndex->isGood() == true))
531 {
532 docId = pDocsIndex->hasDocument(location);
533 if (docId > 0)
534 {
535 indexId = settings.getIndexPropertiesByName(_("My Web Pages")).m_id;
536 }
537 }
538 if ((pDaemonIndex != NULL) &&
539 (pDaemonIndex->isGood() == true) &&
540 (docId == 0))
541 {
542 docId = pDaemonIndex->hasDocument(location);
543 if (docId > 0)
544 {
545 indexId = settings.getIndexPropertiesByName(_("My Documents")).m_id;
546 }
547 }
548
549 if (docId > 0)
550 {
551 currentDoc.setIsIndexed(indexId, docId);
552 #ifdef DEBUG
553 clog << "EngineQueryThread::processResults: found in index " << indexId << endl;
554 #endif
555 }
556 #ifdef DEBUG
557 else clog << "EngineQueryThread::processResults: not found in any index" << endl;
558 #endif
559
560 m_documentsList.push_back(currentDoc);
561 }
562
563 if (pDocsIndex != NULL)
564 {
565 delete pDocsIndex;
566 }
567 if (pDaemonIndex != NULL)
568 {
569 delete pDaemonIndex;
570 }
571 }
572
processResults(const vector<DocumentInfo> & resultsList,unsigned int indexId)573 void EngineQueryThread::processResults(const vector<DocumentInfo> &resultsList,
574 unsigned int indexId)
575 {
576 unsigned int zeroId = 0;
577
578 // Copy the results list
579 for (vector<DocumentInfo>::const_iterator resultIter = resultsList.begin();
580 resultIter != resultsList.end(); ++resultIter)
581 {
582 DocumentInfo currentDoc(*resultIter);
583
584 // The engine has no notion of index IDs
585 unsigned int docId = currentDoc.getIsIndexed(zeroId);
586 currentDoc.setIsIndexed(indexId, docId);
587
588 m_documentsList.push_back(currentDoc);
589 }
590 }
591
doWork(void)592 void EngineQueryThread::doWork(void)
593 {
594 PinotSettings &settings = PinotSettings::getInstance();
595
596 // Get the SearchEngine
597 SearchEngineInterface *pEngine = ModuleFactory::getSearchEngine(m_engineName, m_engineOption);
598 if (pEngine == NULL)
599 {
600 // Try again
601 if (findPlugin() == true)
602 {
603 pEngine = ModuleFactory::getSearchEngine(m_engineName, m_engineOption);
604 }
605
606 if (pEngine == NULL)
607 {
608 m_errorNum = UNKNOWN_ENGINE;
609 m_errorParam = m_engineDisplayableName;
610 return;
611 }
612 }
613
614 // Set up the proxy
615 WebEngine *pWebEngine = dynamic_cast<WebEngine *>(pEngine);
616 if (pWebEngine != NULL)
617 {
618 DownloaderInterface *pDownloader = pWebEngine->getDownloader();
619 if ((pDownloader != NULL) &&
620 (settings.m_proxyEnabled == true) &&
621 (settings.m_proxyAddress.empty() == false))
622 {
623 char portStr[64];
624
625 pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
626 snprintf(portStr, 64, "%u", settings.m_proxyPort);
627 pDownloader->setSetting("proxyport", portStr);
628 pDownloader->setSetting("proxytype", settings.m_proxyType);
629 }
630
631 pWebEngine->setEditableValues(settings.m_editablePluginValues);
632 }
633
634 if (m_listingIndex == false)
635 {
636 pEngine->setLimitSet(m_limitToDocsSet);
637 }
638
639 // Run the query
640 pEngine->setDefaultOperator(SearchEngineInterface::DEFAULT_OP_AND);
641 if (pEngine->runQuery(m_queryProps, m_startDoc) == false)
642 {
643 m_errorNum = QUERY_FAILED;
644 m_errorParam = m_engineDisplayableName;
645 }
646 else
647 {
648 const vector<DocumentInfo> &resultsList = pEngine->getResults();
649
650 m_documentsList.clear();
651 m_documentsList.reserve(resultsList.size());
652 m_documentsCount = pEngine->getResultsCountEstimate();
653 #ifdef DEBUG
654 clog << "EngineQueryThread::doWork: " << resultsList.size() << " off " << m_documentsCount
655 << " results to process, starting at position " << m_startDoc << endl;
656 #endif
657
658 m_resultsCharset = pEngine->getResultsCharset();
659 if (m_listingIndex == false)
660 {
661 processResults(resultsList);
662 }
663 else
664 {
665 processResults(resultsList,
666 PinotSettings::getInstance().getIndexPropertiesByName(m_engineDisplayableName).m_id);
667 }
668
669 // Don't spellcheck if the query was modified in any way
670 if (m_queryProps.getModified() == false)
671 {
672 string correctedFreeQuery(pEngine->getSpellingCorrection());
673
674 // Any spelling correction ?
675 if (correctedFreeQuery.empty() == false)
676 {
677 m_correctedSpelling = true;
678 m_queryProps.setFreeQuery(correctedFreeQuery);
679 }
680 }
681 }
682
683 delete pEngine;
684 }
685
DownloadingThread(const DocumentInfo & docInfo)686 DownloadingThread::DownloadingThread(const DocumentInfo &docInfo) :
687 WorkerThread(),
688 m_docInfo(docInfo),
689 m_pDoc(NULL),
690 m_pDownloader(NULL)
691 {
692 }
693
DownloadingThread()694 DownloadingThread::DownloadingThread() :
695 WorkerThread(),
696 m_docInfo("", "", "", ""),
697 m_pDoc(NULL),
698 m_pDownloader(NULL)
699 {
700 }
701
~DownloadingThread()702 DownloadingThread::~DownloadingThread()
703 {
704 if (m_pDoc != NULL)
705 {
706 delete m_pDoc;
707 }
708 if (m_pDownloader != NULL)
709 {
710 delete m_pDownloader;
711 }
712 }
713
getType(void) const714 string DownloadingThread::getType(void) const
715 {
716 return "DownloadingThread";
717 }
718
getURL(void) const719 string DownloadingThread::getURL(void) const
720 {
721 return m_docInfo.getLocation();
722 }
723
getDocument(void) const724 const Document *DownloadingThread::getDocument(void) const
725 {
726 return m_pDoc;
727 }
728
doWork(void)729 void DownloadingThread::doWork(void)
730 {
731 Url thisUrl(m_docInfo.getLocation());
732 bool getDownloader = true;
733
734 if (m_pDoc != NULL)
735 {
736 delete m_pDoc;
737 m_pDoc = NULL;
738 }
739
740 // Get a Downloader
741 if (m_pDownloader != NULL)
742 {
743 // Same protocol as what we now need ?
744 if (m_protocol == thisUrl.getProtocol())
745 {
746 getDownloader = false;
747 }
748 else
749 {
750 delete m_pDownloader;
751 m_pDownloader = NULL;
752 m_protocol.clear();
753 }
754 }
755 if (getDownloader == true)
756 {
757 m_protocol = thisUrl.getProtocol();
758 m_pDownloader = DownloaderFactory::getDownloader(m_protocol);
759 }
760
761 if (m_pDownloader == NULL)
762 {
763 m_errorNum = UNSUPPORTED_PROTOCOL;
764 m_errorParam = thisUrl.getProtocol();
765 }
766 else if (m_done == false)
767 {
768 Timer collectTimer;
769 PinotSettings &settings = PinotSettings::getInstance();
770
771 // Set up the proxy
772 if ((getDownloader == true) &&
773 (settings.m_proxyEnabled == true) &&
774 (settings.m_proxyAddress.empty() == false))
775 {
776 char portStr[64];
777
778 m_pDownloader->setSetting("proxyaddress", settings.m_proxyAddress);
779 snprintf(portStr, 64, "%u", settings.m_proxyPort);
780 m_pDownloader->setSetting("proxyport", portStr);
781 m_pDownloader->setSetting("proxytype", settings.m_proxyType);
782 }
783
784 collectTimer.start();
785
786 m_pDoc = m_pDownloader->retrieveUrl(m_docInfo);
787
788 clog << "Retrieved " << m_docInfo.getLocation() << " in " << collectTimer.stop() << " ms" << endl;
789 }
790
791 if (m_pDoc == NULL)
792 {
793 m_errorNum = DOWNLOAD_FAILED;
794 m_errorParam = m_docInfo.getLocation();
795 }
796 }
797
IndexingThread(const DocumentInfo & docInfo,const string & indexLocation,bool allowAllMIMETypes)798 IndexingThread::IndexingThread(const DocumentInfo &docInfo, const string &indexLocation,
799 bool allowAllMIMETypes) :
800 DownloadingThread(docInfo),
801 m_pIndex(NULL),
802 m_indexLocation(indexLocation),
803 m_allowAllMIMETypes(allowAllMIMETypes),
804 m_update(false),
805 m_docId(0)
806 {
807 }
808
~IndexingThread()809 IndexingThread::~IndexingThread()
810 {
811 if (m_pIndex != NULL)
812 {
813 delete m_pIndex;
814 }
815 }
816
getType(void) const817 string IndexingThread::getType(void) const
818 {
819 return "IndexingThread";
820 }
821
getDocumentInfo(void) const822 const DocumentInfo &IndexingThread::getDocumentInfo(void) const
823 {
824 return m_docInfo;
825 }
826
getDocumentID(void) const827 unsigned int IndexingThread::getDocumentID(void) const
828 {
829 return m_docId;
830 }
831
isNewDocument(void) const832 bool IndexingThread::isNewDocument(void) const
833 {
834 // If the thread is set to perform an update, the document isn't new
835 if (m_update == true)
836 {
837 return false;
838 }
839 return true;
840 }
841
doWork(void)842 void IndexingThread::doWork(void)
843 {
844 Url thisUrl(m_docInfo.getLocation());
845 bool reliableType = false, doDownload = true;
846
847 // First things first, get the index
848 if (m_pIndex == NULL)
849 {
850 m_pIndex = PinotSettings::getInstance().getIndex(m_indexLocation);
851 }
852 if ((m_pIndex == NULL) ||
853 (m_pIndex->isGood() == false))
854 {
855 m_errorNum = INDEX_ERROR;
856 m_errorParam = m_indexLocation;
857 return;
858 }
859
860 // Is it an update ?
861 m_docId = m_pIndex->hasDocument(m_docInfo.getLocation(true));
862 if (m_docId > 0)
863 {
864 // Ignore robots directives on updates
865 m_update = true;
866 }
867
868 if (m_docInfo.getType().empty() == true)
869 {
870 m_docInfo.setType(MIMEScanner::scanUrl(thisUrl));
871 }
872 else if (thisUrl.isLocal() == true)
873 {
874 // There's a good chance the supplied type is accurate
875 // if the document is a local file
876 reliableType = true;
877 }
878
879 if (m_docInfo.getIsDirectory() == true)
880 {
881 doDownload = false;
882 #ifdef DEBUG
883 clog << "IndexingThread::doWork: skipping download of directory " << m_docInfo.getLocation() << endl;
884 #endif
885 }
886 else if (FilterUtils::isSupportedType(m_docInfo.getType()) == false)
887 {
888 // Skip unsupported types ?
889 if (m_allowAllMIMETypes == false)
890 {
891 m_errorNum = UNSUPPORTED_TYPE;
892 m_errorParam = m_docInfo.getType();
893
894 return;
895 }
896
897 if (reliableType == true)
898 {
899 doDownload = false;
900 #ifdef DEBUG
901 clog << "IndexingThread::doWork: skipping download of unsupported type " << m_docInfo.getLocation() << endl;
902 #endif
903 }
904 }
905 else
906 {
907 Dijon::Filter *pFilter = FilterUtils::getFilter(m_docInfo.getType());
908
909 if (pFilter != NULL)
910 {
911 // We may be able to feed the document directly to the filter
912 if (((pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME) == true) &&
913 (thisUrl.getProtocol() == "file")) ||
914 ((pFilter->is_data_input_ok(Dijon::Filter::DOCUMENT_URI) == true) &&
915 (thisUrl.isLocal() == false)))
916 {
917 doDownload = false;
918 #ifdef DEBUG
919 clog << "IndexingThread::doWork: let filter download " << m_docInfo.getLocation() << endl;
920 #endif
921 }
922
923 delete pFilter;
924 }
925 }
926
927 // We may not have to download the document
928 if (doDownload == true)
929 {
930 DownloadingThread::doWork();
931 }
932 else
933 {
934 if (m_pDoc != NULL)
935 {
936 delete m_pDoc;
937 m_pDoc = NULL;
938 }
939 m_pDoc = new Document(m_docInfo);
940
941 m_pDoc->setTimestamp(m_docInfo.getTimestamp());
942 m_pDoc->setSize(m_docInfo.getSize());
943 }
944
945 if (m_pDoc != NULL)
946 {
947 Timer indexTimer;
948 string docType(m_pDoc->getType());
949 bool success = false;
950
951 indexTimer.start();
952
953 // The type may have been obtained when downloading
954 if (docType.empty() == false)
955 {
956 // Use the document's type
957 m_docInfo.setType(docType);
958 }
959 else
960 {
961 // Use the type we were supplied with
962 m_pDoc->setType(m_docInfo.getType());
963 }
964
965 if (m_docInfo.getTitle().empty() == false)
966 {
967 // Use the title we were supplied with
968 m_pDoc->setTitle(m_docInfo.getTitle());
969 }
970 else
971 {
972 // Use the document's
973 m_docInfo.setTitle(m_pDoc->getTitle());
974 }
975 #ifdef DEBUG
976 clog << "IndexingThread::doWork: title is " << m_pDoc->getTitle() << endl;
977 #endif
978
979 // Check again as the downloader may have altered the MIME type
980 if (FilterUtils::isSupportedType(m_docInfo.getType()) == false)
981 {
982 // Skip unsupported types ?
983 if (m_allowAllMIMETypes == false)
984 {
985 m_errorNum = UNSUPPORTED_TYPE;
986 m_errorParam = m_docInfo.getType();
987
988 return;
989 }
990
991 // Let FilterWrapper handle unspported documents
992 }
993 else if ((PinotSettings::getInstance().m_ignoreRobotsDirectives == false) &&
994 (thisUrl.isLocal() == false) &&
995 (m_docInfo.getType().length() >= 9) &&
996 (m_docInfo.getType().substr(9) == "text/html"))
997 {
998 Dijon::HtmlFilter htmlFilter;
999
1000 htmlFilter.set_mime_type(m_docInfo.getType());
1001
1002 if ((FilterUtils::feedFilter(*m_pDoc, &htmlFilter) == true) &&
1003 (htmlFilter.next_document() == true))
1004 {
1005 const map<string, string> &metaData = htmlFilter.get_meta_data();
1006
1007 // See if the document has a ROBOTS META tag
1008 map<string, string>::const_iterator robotsIter = metaData.find("robots");
1009 if (robotsIter != metaData.end())
1010 {
1011 string robotsDirectives(robotsIter->second);
1012
1013 // Is indexing allowed ?
1014 string::size_type pos1 = robotsDirectives.find("none");
1015 string::size_type pos2 = robotsDirectives.find("noindex");
1016 if ((pos1 != string::npos) ||
1017 (pos2 != string::npos))
1018 {
1019 // No, it isn't
1020 m_errorNum = ROBOTS_FORBIDDEN;
1021 m_errorParam = m_docInfo.getLocation();
1022
1023 return;
1024 }
1025 }
1026 }
1027 #ifdef DEBUG
1028 else clog << "IndexingThread::doWork: couldn't check document for ROBOTS directive" << endl;
1029 #endif
1030 }
1031
1032 if (m_done == false)
1033 {
1034 FilterWrapper wrapFilter(m_pIndex);
1035
1036 // Update an existing document or add to the index ?
1037 if (m_update == true)
1038 {
1039 set<string> labels;
1040
1041 // Make sure labels are preserved
1042 m_pIndex->getDocumentLabels(m_docId, labels);
1043 m_pDoc->setLabels(labels);
1044
1045 // Update the document
1046 if (wrapFilter.updateDocument(*m_pDoc, m_docId) == true)
1047 {
1048 #ifdef DEBUG
1049 clog << "IndexingThread::doWork: updated " << m_pDoc->getLocation()
1050 << " at " << m_docId << endl;
1051 #endif
1052 success = true;
1053 }
1054 #ifdef DEBUG
1055 else clog << "IndexingThread::doWork: couldn't update " << m_pDoc->getLocation() << endl;
1056 #endif
1057 }
1058 else
1059 {
1060 unsigned int docId = 0;
1061 #ifdef DEBUG
1062 clog << "IndexingThread::doWork: " << m_docInfo.getLabels().size()
1063 << " labels for URL " << m_pDoc->getLocation() << endl;
1064 #endif
1065
1066 // Index the document
1067 success = wrapFilter.indexDocument(*m_pDoc, m_docInfo.getLabels(), docId);
1068 if (success == true)
1069 {
1070 m_docId = docId;
1071 #ifdef DEBUG
1072 clog << "IndexingThread::doWork: indexed " << m_pDoc->getLocation()
1073 << " to " << m_docId << endl;
1074 #endif
1075 }
1076 #ifdef DEBUG
1077 else clog << "IndexingThread::doWork: couldn't index " << m_pDoc->getLocation() << endl;
1078 #endif
1079 }
1080
1081 if (success == false)
1082 {
1083 m_errorNum = INDEXING_FAILED;
1084 m_errorParam = m_docInfo.getLocation();
1085 }
1086 else
1087 {
1088 // Flush the index ?
1089 if (m_immediateFlush == true)
1090 {
1091 m_pIndex->flush();
1092 }
1093
1094 // The document properties may have changed
1095 m_pIndex->getDocumentInfo(m_docId, m_docInfo);
1096 m_docInfo.setIsIndexed(
1097 PinotSettings::getInstance().getIndexPropertiesByLocation(m_indexLocation).m_id,
1098 m_docId);
1099
1100 clog << "Indexed " << m_docInfo.getLocation() << " in " << indexTimer.stop() << " ms" << endl;
1101 }
1102 }
1103 }
1104 #ifdef DEBUG
1105 else clog << "IndexingThread::doWork: couldn't download " << m_docInfo.getLocation() << endl;
1106 #endif
1107 }
1108
UnindexingThread(const set<unsigned int> & docIdList)1109 UnindexingThread::UnindexingThread(const set<unsigned int> &docIdList) :
1110 WorkerThread(),
1111 m_indexLocation(PinotSettings::getInstance().m_docsIndexLocation),
1112 m_docsCount(0)
1113 {
1114 copy(docIdList.begin(), docIdList.end(), inserter(m_docIdList, m_docIdList.begin()));
1115 }
1116
UnindexingThread(const set<string> & labelNames,const string & indexLocation)1117 UnindexingThread::UnindexingThread(const set<string> &labelNames, const string &indexLocation) :
1118 WorkerThread(),
1119 m_indexLocation(indexLocation),
1120 m_docsCount(0)
1121 {
1122 copy(labelNames.begin(), labelNames.end(), inserter(m_labelNames, m_labelNames.begin()));
1123 if (indexLocation.empty() == true)
1124 {
1125 m_indexLocation = PinotSettings::getInstance().m_docsIndexLocation;
1126 }
1127 }
1128
~UnindexingThread()1129 UnindexingThread::~UnindexingThread()
1130 {
1131 }
1132
getType(void) const1133 string UnindexingThread::getType(void) const
1134 {
1135 return "UnindexingThread";
1136 }
1137
getDocumentsCount(void) const1138 unsigned int UnindexingThread::getDocumentsCount(void) const
1139 {
1140 return m_docsCount;
1141 }
1142
doWork(void)1143 void UnindexingThread::doWork(void)
1144 {
1145 IndexInterface *pIndex = PinotSettings::getInstance().getIndex(m_indexLocation);
1146
1147 if ((pIndex == NULL) ||
1148 (pIndex->isGood() == false))
1149 {
1150 m_errorNum = INDEX_ERROR;
1151 m_errorParam = m_indexLocation;
1152 if (pIndex != NULL)
1153 {
1154 delete pIndex;
1155 }
1156 return;
1157 }
1158
1159 // Be pessimistic and assume something will go wrong ;-)
1160 m_errorNum = UNINDEXING_FAILED;
1161
1162 // Are we supposed to remove documents based on labels ?
1163 if (m_docIdList.empty() == true)
1164 {
1165 // Yep, delete documents one label at a time
1166 for (set<string>::iterator iter = m_labelNames.begin(); iter != m_labelNames.end(); ++iter)
1167 {
1168 string labelName = (*iter);
1169
1170 // By unindexing all documents that match the label,
1171 // we effectively delete the label from the index
1172 if (pIndex->unindexDocuments(labelName, IndexInterface::BY_LABEL) == true)
1173 {
1174 #ifdef DEBUG
1175 clog << "UnindexingThread::doWork: removed label " << labelName << endl;
1176 #endif
1177 // OK
1178 ++m_docsCount;
1179 }
1180 #ifdef DEBUG
1181 else clog << "UnindexingThread::doWork: couldn't remove label " << labelName << endl;
1182 #endif
1183 }
1184
1185 // Nothing to report
1186 m_errorNum = 0;
1187 }
1188 else
1189 {
1190 for (set<unsigned int>::iterator iter = m_docIdList.begin(); iter != m_docIdList.end(); ++iter)
1191 {
1192 unsigned int docId = (*iter);
1193
1194 if (pIndex->unindexDocument(docId) == true)
1195 {
1196 #ifdef DEBUG
1197 clog << "UnindexingThread::doWork: removed " << docId << endl;
1198 #endif
1199 // OK
1200 ++m_docsCount;
1201 }
1202 #ifdef DEBUG
1203 else clog << "UnindexingThread::doWork: couldn't remove " << docId << endl;
1204 #endif
1205 }
1206 #ifdef DEBUG
1207 clog << "UnindexingThread::doWork: removed " << m_docsCount << " documents" << endl;
1208 #endif
1209 }
1210
1211 if (m_docsCount > 0)
1212 {
1213 // Flush the index ?
1214 if (m_immediateFlush == true)
1215 {
1216 pIndex->flush();
1217 }
1218
1219 // Nothing to report
1220 m_errorNum = 0;
1221 }
1222
1223 delete pIndex;
1224 }
1225
HistoryMonitorThread(MonitorInterface * pMonitor,MonitorHandler * pHandler)1226 HistoryMonitorThread::HistoryMonitorThread(MonitorInterface *pMonitor, MonitorHandler *pHandler) :
1227 MonitorThread(pMonitor, pHandler),
1228 m_crawlHistory(PinotSettings::getInstance().getHistoryDatabaseName())
1229 {
1230 }
1231
~HistoryMonitorThread()1232 HistoryMonitorThread::~HistoryMonitorThread()
1233 {
1234 }
1235
isFileBlacklisted(const string & location)1236 bool HistoryMonitorThread::isFileBlacklisted(const string &location)
1237 {
1238 return PinotSettings::getInstance().isBlackListed(location);
1239 }
1240
fileModified(const string & location)1241 void HistoryMonitorThread::fileModified(const string &location)
1242 {
1243 CrawlHistory::CrawlStatus status = CrawlHistory::UNKNOWN;
1244 struct stat fileStat;
1245 time_t itemDate = 0;
1246
1247 if (m_crawlHistory.hasItem("file://" + location, status, itemDate) == true)
1248 {
1249 // Was the file actually modified ?
1250 if ((stat(location.c_str(), &fileStat) == 0) &&
1251 (itemDate < fileStat.st_mtime))
1252 {
1253 m_pHandler->fileModified(location);
1254 }
1255 #ifdef DEBUG
1256 else clog << "HistoryMonitorThread::fileModified: file wasn't modified" << endl;
1257 #endif
1258 }
1259 #ifdef DEBUG
1260 else clog << "HistoryMonitorThread::fileModified: file wasn't crawled" << endl;
1261 #endif
1262 }
1263
DirectoryScannerThread(const DocumentInfo & docInfo,const string & indexLocation,unsigned int maxLevel,bool inlineIndexing,bool followSymLinks)1264 DirectoryScannerThread::DirectoryScannerThread(const DocumentInfo &docInfo,
1265 const string &indexLocation, unsigned int maxLevel,
1266 bool inlineIndexing, bool followSymLinks) :
1267 IndexingThread(docInfo, indexLocation),
1268 m_currentLevel(0),
1269 m_maxLevel(maxLevel),
1270 m_inlineIndexing(inlineIndexing),
1271 m_followSymLinks(followSymLinks)
1272 {
1273 Url urlObj(docInfo.getLocation());
1274
1275 m_dirName = urlObj.getLocation() + "/" + urlObj.getFile();
1276 }
1277
~DirectoryScannerThread()1278 DirectoryScannerThread::~DirectoryScannerThread()
1279 {
1280 }
1281
getType(void) const1282 string DirectoryScannerThread::getType(void) const
1283 {
1284 if (m_inlineIndexing == true)
1285 {
1286 return IndexingThread::getType();
1287 }
1288
1289 return "DirectoryScannerThread";
1290 }
1291
getDirectory(void) const1292 string DirectoryScannerThread::getDirectory(void) const
1293 {
1294 return m_dirName;
1295 }
1296
stop(void)1297 void DirectoryScannerThread::stop(void)
1298 {
1299 // Disconnect the signal
1300 sigc::signal2<void, DocumentInfo, bool>::slot_list_type slotsList = m_signalFileFound.slots();
1301 sigc::signal2<void, DocumentInfo, bool>::slot_list_type::iterator slotIter = slotsList.begin();
1302 if (slotIter != slotsList.end())
1303 {
1304 if (slotIter->empty() == false)
1305 {
1306 slotIter->block();
1307 slotIter->disconnect();
1308 }
1309 }
1310 WorkerThread::stop();
1311 }
1312
getFileFoundSignal(void)1313 sigc::signal2<void, DocumentInfo, bool>& DirectoryScannerThread::getFileFoundSignal(void)
1314 {
1315 return m_signalFileFound;
1316 }
1317
recordCrawled(const string & location,time_t itemDate)1318 void DirectoryScannerThread::recordCrawled(const string &location, time_t itemDate)
1319 {
1320 // Nothing to do by default
1321 }
1322
isIndexable(const string & entryName) const1323 bool DirectoryScannerThread::isIndexable(const string &entryName) const
1324 {
1325 string entryDir(path_get_dirname(entryName) + "/");
1326
1327 // Is this under the directory being scanned ?
1328 if ((entryDir.length() >= m_dirName.length()) &&
1329 (entryDir.substr(0, m_dirName.length()) == m_dirName))
1330 {
1331 // Yes, it is
1332 #ifdef DEBUG
1333 clog << "DirectoryScannerThread::isIndexable: under " << m_dirName << endl;
1334 #endif
1335 return true;
1336 }
1337
1338 return false;
1339 }
1340
wasCrawled(const string & location,time_t & itemDate)1341 bool DirectoryScannerThread::wasCrawled(const string &location, time_t &itemDate)
1342 {
1343 // This information is unknown
1344 return false;
1345 }
1346
recordCrawling(const string & location,bool itemExists,time_t & itemDate)1347 void DirectoryScannerThread::recordCrawling(const string &location, bool itemExists, time_t &itemDate)
1348 {
1349 // Nothing to do by default
1350 }
1351
recordError(const string & location,int errorCode)1352 void DirectoryScannerThread::recordError(const string &location, int errorCode)
1353 {
1354 // Nothing to do by default
1355 }
1356
recordSymlink(const string & location,time_t itemDate)1357 void DirectoryScannerThread::recordSymlink(const string &location, time_t itemDate)
1358 {
1359 // Nothing to do by default
1360 }
1361
monitorEntry(const string & entryName)1362 bool DirectoryScannerThread::monitorEntry(const string &entryName)
1363 {
1364 // Nothing to do by default
1365 return true;
1366 }
1367
unmonitorEntry(const string & entryName)1368 void DirectoryScannerThread::unmonitorEntry(const string &entryName)
1369 {
1370 // Nothing to do by default
1371 }
1372
foundFile(const DocumentInfo & docInfo)1373 void DirectoryScannerThread::foundFile(const DocumentInfo &docInfo)
1374 {
1375 if ((docInfo.getLocation().empty() == true) ||
1376 (m_done == true))
1377 {
1378 return;
1379 }
1380
1381 if (m_inlineIndexing == true)
1382 {
1383 // Reset base class members
1384 m_docInfo = docInfo;
1385 m_docId = 0;
1386 m_update = false;
1387
1388 IndexingThread::doWork();
1389 #ifdef DEBUG
1390 clog << "DirectoryScannerThread::foundFile: indexed " << docInfo.getLocation() << " to " << m_docId << endl;
1391 #endif
1392 }
1393 else
1394 {
1395 // Delegate indexing
1396 // Report everything as file to avoid triggering another crawl
1397 m_signalFileFound(docInfo, false);
1398 }
1399 }
1400
scanEntry(const string & entryName,int & entryStatus,bool statLinks)1401 bool DirectoryScannerThread::scanEntry(const string &entryName,
1402 int &entryStatus, bool statLinks)
1403 {
1404 string location("file://" + entryName);
1405 DocumentInfo docInfo("", location, "", "");
1406 time_t itemDate = time(NULL);
1407 struct stat fileStat;
1408 bool scanSuccess = true, reportFile = false, itemExists = false;
1409
1410 if (entryName.empty() == true)
1411 {
1412 return false;
1413 }
1414
1415 // Skip . .. and dotfiles
1416 Url urlObj(location);
1417 if (urlObj.getFile()[0] == '.')
1418 {
1419 #ifdef DEBUG
1420 clog << "DirectoryScannerThread::scanEntry: skipped dotfile " << urlObj.getFile() << endl;
1421 #endif
1422 return false;
1423 }
1424 #ifdef DEBUG
1425 clog << "DirectoryScannerThread::scanEntry: checking " << entryName << endl;
1426 #endif
1427
1428 #ifdef HAVE_LSTAT
1429 // Stat links, or the stuff it refers to ?
1430 if (statLinks == true)
1431 {
1432 entryStatus = lstat(entryName.c_str(), &fileStat);
1433 }
1434 else
1435 {
1436 #endif
1437 entryStatus = stat(entryName.c_str(), &fileStat);
1438 #ifdef HAVE_LSTAT
1439 }
1440 #endif
1441
1442 if (entryStatus == -1)
1443 {
1444 entryStatus = errno;
1445 scanSuccess = false;
1446 #ifdef DEBUG
1447 clog << "DirectoryScannerThread::scanEntry: stat failed with error " << entryStatus << endl;
1448 #endif
1449 }
1450 #ifdef HAVE_LSTAT
1451 // Special processing applies if it's a symlink
1452 else if (S_ISLNK(fileStat.st_mode))
1453 {
1454 string realEntryName(entryName);
1455 string entryNameReferree;
1456 bool isInIndexableLocation = false;
1457
1458 // If symlinks are followed, check if this symlink is blacklisted
1459 if ((m_followSymLinks == false) ||
1460 (PinotSettings::getInstance().isBlackListed(entryName) == true))
1461 {
1462 #ifdef DEBUG
1463 clog << "DirectoryScannerThread::scanEntry: skipped symlink " << entryName << endl;
1464 #endif
1465 return false;
1466 }
1467
1468 // Are we already following a symlink to a directory ?
1469 if (m_currentLinks.empty() == false)
1470 {
1471 string linkToDir(m_currentLinks.top() + "/");
1472
1473 // Yes, we are
1474 if ((entryName.length() > linkToDir.length()) &&
1475 (entryName.substr(0, linkToDir.length()) == linkToDir))
1476 {
1477 // ...and this entry is below it
1478 realEntryName.replace(0, linkToDir.length() - 1, m_currentLinkReferrees.top());
1479 #ifdef DEBUG
1480 clog << "DirectoryScannerThread::scanEntry: really at " << realEntryName << endl;
1481 #endif
1482 isInIndexableLocation = isIndexable(realEntryName);
1483 }
1484 }
1485
1486 char *pBuf = g_file_read_link(realEntryName.c_str(), NULL);
1487 if (pBuf != NULL)
1488 {
1489 string linkLocation(filename_to_utf8(pBuf));
1490 if (path_is_absolute(linkLocation) == true)
1491 {
1492 entryNameReferree = linkLocation;
1493 }
1494 else
1495 {
1496 string entryDir(path_get_dirname(realEntryName));
1497
1498 entryNameReferree = Url::resolvePath(entryDir, linkLocation);
1499 }
1500
1501 if (entryNameReferree[entryNameReferree.length() - 1] == '/')
1502 {
1503 // Drop the terminating slash
1504 entryNameReferree.resize(entryNameReferree.length() - 1);
1505 }
1506 #ifdef DEBUG
1507 clog << "DirectoryScannerThread::scanEntry: symlink resolved to " << entryNameReferree << endl;
1508 #endif
1509
1510 g_free(pBuf);
1511 }
1512
1513 string referreeLocation("file://" + entryNameReferree);
1514 time_t referreeItemDate;
1515
1516 // Check whether this will be, or has already been crawled
1517 // Referrees in indexable locations will be indexed later on
1518 if ((isInIndexableLocation == false) &&
1519 (isIndexable(entryNameReferree) == false) &&
1520 (wasCrawled(referreeLocation, referreeItemDate) == false))
1521 {
1522 m_currentLinks.push(entryName);
1523 m_currentLinkReferrees.push(entryNameReferree);
1524
1525 // Add a dummy entry for this referree
1526 // It will ensure it's not indexed more than once and it shouldn't do any harm
1527 recordSymlink(referreeLocation, itemDate);
1528
1529 // Do it again, this time by stat'ing what the link refers to
1530 bool scannedReferree = scanEntry(entryName, entryStatus, false);
1531
1532 m_currentLinks.pop();
1533 m_currentLinkReferrees.pop();
1534
1535 return scannedReferree;
1536 }
1537 else
1538 {
1539 clog << "Skipping " << entryName << ": it links to " << entryNameReferree
1540 << " which will be crawled, or has already been crawled" << endl;
1541
1542 // This should ensure that only metadata is indexed
1543 docInfo.setType("inode/symlink");
1544 reportFile = true;
1545 }
1546 }
1547 #endif
1548
1549 // Is this item in the database already ?
1550 itemExists = wasCrawled(location, itemDate);
1551 // Put it in if necessary
1552 recordCrawling(location, itemExists, itemDate);
1553
1554 // If stat'ing didn't fail, see if it's a file or a directory
1555 if ((entryStatus == 0) &&
1556 (S_ISREG(fileStat.st_mode)))
1557 {
1558 // Is this file blacklisted ?
1559 // We have to check early so that if necessary the file's status stays at TO_CRAWL
1560 // and it is removed from the index at the end of this crawl
1561 if (PinotSettings::getInstance().isBlackListed(entryName) == false)
1562 {
1563 reportFile = true;
1564 }
1565 }
1566 else if ((entryStatus == 0) &&
1567 (S_ISDIR(fileStat.st_mode)))
1568 {
1569 docInfo.setType("x-directory/normal");
1570
1571 // Can we scan this directory ?
1572 if (((m_maxLevel == 0) ||
1573 (m_currentLevel < m_maxLevel)) &&
1574 (PinotSettings::getInstance().isBlackListed(entryName) == false))
1575 {
1576 ++m_currentLevel;
1577
1578 // Open the directory
1579 DIR *pDir = opendir(entryName.c_str());
1580 if (pDir != NULL)
1581 {
1582 // Monitor first so that we don't miss events
1583 // If monitoring is not possible, record the first case
1584 if ((monitorEntry(entryName) == false) &&
1585 (entryStatus != MONITORING_FAILED))
1586 {
1587 entryStatus = MONITORING_FAILED;
1588 }
1589 #ifdef DEBUG
1590 clog << "DirectoryScannerThread::scanEntry: entering " << entryName << endl;
1591 #endif
1592
1593 // Iterate through this directory's entries
1594 struct dirent *pDirEntry = readdir(pDir);
1595 while ((m_done == false) &&
1596 (pDirEntry != NULL))
1597 {
1598 char *pEntryName = pDirEntry->d_name;
1599
1600 // Skip . .. and dotfiles
1601 if ((pEntryName != NULL) &&
1602 (pEntryName[0] != '.'))
1603 {
1604 string subEntryName(entryName);
1605 int subEntryStatus = 0;
1606
1607 if (entryName[entryName.length() - 1] != '/')
1608 {
1609 subEntryName += "/";
1610 }
1611 subEntryName += pEntryName;
1612
1613 // Scan this entry
1614 scanEntry(subEntryName, subEntryStatus);
1615 }
1616
1617 // Next entry
1618 pDirEntry = readdir(pDir);
1619 }
1620 #ifdef DEBUG
1621 clog << "DirectoryScannerThread::scanEntry: leaving " << entryName << endl;
1622 #endif
1623
1624 // Close the directory
1625 closedir(pDir);
1626 --m_currentLevel;
1627 reportFile = true;
1628 }
1629 else
1630 {
1631 entryStatus = errno;
1632 scanSuccess = false;
1633 #ifdef DEBUG
1634 clog << "DirectoryScannerThread::scanEntry: opendir failed with error " << entryStatus << endl;
1635 #endif
1636 }
1637 }
1638 }
1639 // Is it some unknown type ?
1640 else if ((entryStatus == 0)
1641 #ifdef HAVE_LSTAT
1642 && (!S_ISLNK(fileStat.st_mode))
1643 #endif
1644 )
1645 {
1646 #ifdef DEBUG
1647 clog << "DirectoryScannerThread::scanEntry: unknown entry type" << endl;
1648 #endif
1649 entryStatus = ENOENT;
1650 scanSuccess = false;
1651 }
1652
1653 // Was it modified after the last crawl ?
1654 if ((itemExists == true) &&
1655 (itemDate >= fileStat.st_mtime))
1656 {
1657 // No, it wasn't
1658 #ifdef DEBUG
1659 clog << "DirectoryScannerThread::scanEntry: no change to " << location << endl;
1660 #endif
1661 reportFile = false;
1662 }
1663
1664 if (m_done == true)
1665 {
1666 // Don't record or report the file
1667 reportFile = false;
1668 }
1669 // Did an error occur ?
1670 else if (entryStatus != 0)
1671 {
1672 // Record this error
1673 recordError(location, entryStatus);
1674
1675 if (scanSuccess == false)
1676 {
1677 return scanSuccess;
1678 }
1679 }
1680 // History of new or modified files, especially their timestamp, is always updated
1681 // Others' are updated only if we are doing a full scan because
1682 // the status has to be reset to CRAWLED, so that they are not unindexed
1683 else if ((itemExists == false) ||
1684 (reportFile == true))
1685 {
1686 recordCrawled(location, fileStat.st_mtime);
1687 }
1688
1689 // If a major error occurred, this won't be true
1690 if (reportFile == true)
1691 {
1692 if (docInfo.getType().empty() == true)
1693 {
1694 // Scan the file
1695 docInfo.setType(MIMEScanner::scanFile(entryName));
1696 }
1697 docInfo.setTimestamp(TimeConverter::toTimestamp(fileStat.st_mtime));
1698 docInfo.setSize(fileStat.st_size);
1699
1700 foundFile(docInfo);
1701 }
1702
1703 return scanSuccess;
1704 }
1705
doWork(void)1706 void DirectoryScannerThread::doWork(void)
1707 {
1708 Timer scanTimer;
1709 int entryStatus = 0;
1710
1711 if (m_dirName.empty() == true)
1712 {
1713 return;
1714 }
1715 scanTimer.start();
1716
1717 if (scanEntry(m_dirName, entryStatus) == false)
1718 {
1719 if (entryStatus == 0)
1720 {
1721 m_errorNum = OPENDIR_FAILED;
1722 }
1723 else
1724 {
1725 m_errorNum = entryStatus;
1726 }
1727 m_errorParam = m_dirName;
1728 }
1729 clog << "Scanned " << m_dirName << " in " << scanTimer.stop() << " ms" << endl;
1730 }
1731
1732