1 /*
2  *  Copyright 2005-2021 Fabrice Colin
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, write to the Free Software
16  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  */
18 
19 #include <fcntl.h>
20 #include <ctype.h>
21 #include <unistd.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <stdarg.h>
25 #include <strings.h>
26 #include <time.h>
27 #include <ctype.h>
28 #include <vector>
29 #include <iostream>
30 #include <fstream>
31 #include <algorithm>
32 #include <utility>
33 #include <cstring>
34 #include <xapian.h>
35 
36 #include "Languages.h"
37 #include "StringManip.h"
38 #include "TimeConverter.h"
39 #include "Url.h"
40 #include "FieldMapperInterface.h"
41 #include "LanguageDetector.h"
42 #include "XapianDatabaseFactory.h"
43 #include "XapianIndex.h"
44 
45 #define MAGIC_TERM "X-MetaSE-Doc"
46 
47 using std::clog;
48 using std::clog;
49 using std::endl;
50 using std::ios;
51 using std::ifstream;
52 using std::ofstream;
53 using std::string;
54 using std::vector;
55 using std::set;
56 using std::map;
57 using std::min;
58 using std::max;
59 using std::pair;
60 
61 extern FieldMapperInterface *g_pMapper;
62 
63 class TokensIndexer : public Dijon::CJKVTokenizer::TokensHandler
64 {
65 	public:
TokensIndexer(Xapian::Stem * pStemmer,Xapian::Document & doc,const Xapian::WritableDatabase & db,const string & prefix,unsigned int nGramSize,bool & doSpelling,Xapian::termcount & termPos)66 		TokensIndexer(Xapian::Stem *pStemmer, Xapian::Document &doc,
67 			const Xapian::WritableDatabase &db,
68 			const string &prefix, unsigned int nGramSize,
69 			bool &doSpelling, Xapian::termcount &termPos) :
70 			Dijon::CJKVTokenizer::TokensHandler(),
71 			m_pStemmer(pStemmer),
72 			m_doc(doc),
73 			m_db(db),
74 			m_prefix(prefix),
75 			m_nGramSize(nGramSize),
76 			m_nGramCount(0),
77 			m_doSpelling(doSpelling),
78 			m_termPos(termPos),
79 			m_hasCJKV(false)
80 		{
81 		}
82 
~TokensIndexer()83 		virtual ~TokensIndexer()
84 		{
85 			if (m_hasCJKV == true)
86 			{
87 				// This will help identify CJKV documents
88 				m_doc.add_term("XTOK:CJKV");
89 			}
90 		}
91 
handle_token(const string & tok,bool is_cjkv)92 		virtual bool handle_token(const string &tok, bool is_cjkv)
93 		{
94 			bool addSpelling = false;
95 
96 			if (tok.empty() == true)
97 			{
98 				return false;
99 			}
100 
101 			// Lower case the term and trim spaces
102 			string term(StringManip::toLowerCase(tok));
103 			StringManip::trimSpaces(term);
104 
105 			if (term.empty() == true)
106 			{
107 				return true;
108 			}
109 
110 			// Does it end with a dot ?
111 			if (term[term.length() - 1] == '.')
112 			{
113 				bool foundNonDot = false;
114 
115 				string::size_type pos = term.length() - 1;
116 				while (pos >= 0)
117 				{
118 					if (term[pos] != '.')
119 					{
120 						foundNonDot = true;
121 
122 						// Any dot before that ?
123 						if ((pos == 0) ||
124 							(term.find_last_of(".", pos - 1) == string::npos))
125 						{
126 							// No, all dots are at the end, trim them
127 							term.erase(pos + 1);
128 						}
129 						// Else, it's probably an acronym
130 						break;
131 					}
132 
133 					if (pos == 0)
134 					{
135 						break;
136 					}
137 					--pos;
138 				}
139 
140 				if (foundNonDot == false)
141 				{
142 					// It's all dots !
143 					return true;
144 				}
145 			}
146 			m_doc.add_posting(m_prefix + XapianDatabase::limitTermLength(term), m_termPos);
147 
148 			// Is this CJKV ?
149 			if (is_cjkv == false)
150 			{
151 #ifndef _DIACRITICS_SENSITIVE
152 				bool hasDiacritics = false;
153 
154 				// Remove accents and other diacritics
155 				string unaccentedTerm(Dijon::CJKVTokenizer::strip_marks(term));
156 				if (unaccentedTerm != term)
157 				{
158 					m_doc.add_posting(m_prefix + XapianDatabase::limitTermLength(unaccentedTerm), m_termPos);
159 					hasDiacritics = true;
160 				}
161 #endif
162 
163 				// Don't stem if the term starts with a digit
164 				if ((m_pStemmer != NULL) &&
165 					(isdigit((int)term[0]) == 0))
166 				{
167 					string stemmedTerm((*m_pStemmer)(term));
168 
169 					m_doc.add_term("Z" + XapianDatabase::limitTermLength(stemmedTerm));
170 #ifndef _DIACRITICS_SENSITIVE
171 					if (hasDiacritics == true)
172 					{
173 						stemmedTerm = (*m_pStemmer)(unaccentedTerm);
174 
175 						m_doc.add_term("Z" + XapianDatabase::limitTermLength(stemmedTerm));
176 					}
177 #endif
178 				}
179 
180 				// Does it include dots ?
181 				string::size_type dotPos = term.find('.');
182 				if (dotPos != string::npos)
183 				{
184 					string::size_type startPos = 0;
185 					bool addRemainder = true;
186 
187 					while (dotPos != string::npos)
188 					{
189 						string component(term.substr(startPos, dotPos - startPos));
190 
191 						if (component.empty() == false)
192 						{
193 							m_doc.add_posting(m_prefix + XapianDatabase::limitTermLength(component), m_termPos);
194 							++m_termPos;
195 						}
196 
197 						// Next
198 						if (dotPos == term.length() - 1)
199 						{
200 							addRemainder = false;
201 							break;
202 						}
203 						startPos = dotPos + 1;
204 						dotPos = term.find('.', startPos);
205 					}
206 
207 					if (addRemainder == true)
208 					{
209 						string lastComponent(term.substr(startPos));
210 
211 						m_doc.add_posting(m_prefix + XapianDatabase::limitTermLength(lastComponent), m_termPos);
212 					}
213 				}
214 
215 				addSpelling = m_doSpelling;
216 				++m_termPos;
217 				m_nGramCount = 0;
218 			}
219 			else
220 			{
221 				if (m_nGramCount % m_nGramSize == 0)
222 				{
223 					++m_termPos;
224 				}
225 				else if ((m_nGramCount + 1) % m_nGramSize == 0)
226 				{
227 					addSpelling = m_doSpelling;
228 				}
229 				++m_nGramCount;
230 				m_hasCJKV = true;
231 			}
232 
233 			if (addSpelling == true)
234 			{
235 				try
236 				{
237 					m_db.add_spelling(XapianDatabase::limitTermLength(term));
238 				}
239 				catch (const Xapian::UnimplementedError &error)
240 				{
241 					clog << "Couldn't index with spelling correction: " << error.get_type() << ": " << error.get_msg() << endl;
242 
243 					m_doSpelling = false;
244 				}
245 			}
246 
247 			return true;
248 		}
249 
250 	protected:
251 		Xapian::Stem *m_pStemmer;
252 		Xapian::Document &m_doc;
253 		const Xapian::WritableDatabase &m_db;
254 		string m_prefix;
255 		unsigned int m_nGramSize;
256 		unsigned int m_nGramCount;
257 		bool &m_doSpelling;
258 		Xapian::termcount &m_termPos;
259 		bool m_hasCJKV;
260 
261 };
262 
XapianIndex(const string & indexName)263 XapianIndex::XapianIndex(const string &indexName) :
264 	IndexInterface(),
265 	m_databaseName(indexName),
266 	m_goodIndex(false),
267 	m_doSpelling(true)
268 {
269 	// Open in read-only mode
270 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
271 	if ((pDatabase != NULL) &&
272 		(pDatabase->isOpen() == true))
273 	{
274 		m_goodIndex = true;
275 		m_doSpelling = pDatabase->withSpelling();
276 	}
277 }
278 
// Copy constructor: takes over the other object's database name, state flags
// and stemming language.
XapianIndex::XapianIndex(const XapianIndex &other) :
	IndexInterface(other),
	m_databaseName(other.m_databaseName),
	m_goodIndex(other.m_goodIndex),
	m_doSpelling(other.m_doSpelling),
	m_stemLanguage(other.m_stemLanguage)
{
	// Nothing else to do
}
287 
~XapianIndex()288 XapianIndex::~XapianIndex()
289 {
290 }
291 
// Assignment: copies the base class state, the database name, the state flags
// and the stemming language. Assigning an object to itself changes nothing.
XapianIndex &XapianIndex::operator=(const XapianIndex &other)
{
	if (this == &other)
	{
		// Self-assignment
		return *this;
	}

	IndexInterface::operator=(other);
	m_databaseName = other.m_databaseName;
	m_goodIndex = other.m_goodIndex;
	m_doSpelling = other.m_doSpelling;
	m_stemLanguage = other.m_stemLanguage;

	return *this;
}
305 
listDocumentsWithTerm(const string & term,set<unsigned int> & docIds,unsigned int maxDocsCount,unsigned int startDoc) const306 bool XapianIndex::listDocumentsWithTerm(const string &term, set<unsigned int> &docIds,
307 	unsigned int maxDocsCount, unsigned int startDoc) const
308 {
309 	unsigned int docCount = 0;
310 
311 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
312 	if (pDatabase == NULL)
313 	{
314 		clog << "Couldn't get index " << m_databaseName << endl;
315 		return 0;
316 	}
317 
318 	docIds.clear();
319 	try
320 	{
321 		Xapian::Database *pIndex = pDatabase->readLock();
322 		if (pIndex != NULL)
323 		{
324 #ifdef DEBUG
325 			clog << "XapianIndex::listDocumentsWithTerm: term " << term << endl;
326 #endif
327 			// Get a list of documents that have the term
328 			for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
329 				(postingIter != pIndex->postlist_end(term)) &&
330 					((maxDocsCount == 0) || (docIds.size() < maxDocsCount));
331 				++postingIter)
332 			{
333 				Xapian::docid docId = *postingIter;
334 
335 				// We cannot use postingIter->skip_to() because startDoc isn't an ID
336 				if (docCount >= startDoc)
337 				{
338 					docIds.insert(docId);
339 				}
340 				++docCount;
341 			}
342 		}
343 	}
344 	catch (const Xapian::Error &error)
345 	{
346 		clog << "Couldn't get document list: " << error.get_type() << ": " << error.get_msg() << endl;
347 	}
348 	catch (...)
349 	{
350 		clog << "Couldn't get document list, unknown exception occurred" << endl;
351 	}
352 	pDatabase->unlock();
353 
354 	return docIds.size();
355 }
356 
addPostingsToDocument(const Xapian::Utf8Iterator & itor,Xapian::Document & doc,const Xapian::WritableDatabase & db,const string & prefix,bool noStemming,bool & doSpelling,Xapian::termcount & termPos) const357 void XapianIndex::addPostingsToDocument(const Xapian::Utf8Iterator &itor, Xapian::Document &doc,
358 	const Xapian::WritableDatabase &db, const string &prefix, bool noStemming, bool &doSpelling,
359 	Xapian::termcount &termPos) const
360 {
361 	Xapian::Stem *pStemmer = NULL;
362 	bool isCJKV = false;
363 
364 	// Do we know what language to use for stemming ?
365 	if ((noStemming == false) &&
366 		(m_stemLanguage.empty() == false) &&
367 		(m_stemLanguage != "unknown"))
368 	{
369 		try
370 		{
371 			pStemmer = new Xapian::Stem(StringManip::toLowerCase(m_stemLanguage));
372 		}
373 		catch (const Xapian::Error &error)
374 		{
375 			clog << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
376 		}
377 	}
378 
379 	const char *pRawData = itor.raw();
380 	if (pRawData != NULL)
381 	{
382 		Dijon::CJKVTokenizer tokenizer;
383 		string text(pRawData);
384 
385 #ifdef _DIACRITICS_SENSITIVE
386 		if (tokenizer.has_cjkv(text) == true)
387 		{
388 #endif
389 			// Use overload
390 			addPostingsToDocument(tokenizer, pStemmer, text, doc, db,
391 				prefix, doSpelling, termPos);
392 			isCJKV = true;
393 #ifdef _DIACRITICS_SENSITIVE
394 		}
395 #endif
396 	}
397 
398 #ifdef _DIACRITICS_SENSITIVE
399 	if (isCJKV == false)
400 	{
401 		Xapian::TermGenerator generator;
402 
403 		// Set the stemmer
404 		if (pStemmer != NULL)
405 		{
406 			generator.set_stemmer(*pStemmer);
407 		}
408 
409 		generator.set_termpos(termPos);
410 		try
411 		{
412 			// Older Xapian backends don't support spelling correction
413 			if (doSpelling == true)
414 			{
415 				// The database is required for the spelling dictionary
416 				generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING);
417 				generator.set_database(db);
418 			}
419 			generator.set_document(doc);
420 			generator.index_text(itor, 1, prefix);
421 		}
422 		catch (const Xapian::UnimplementedError &error)
423 		{
424 			clog << "Couldn't index with spelling correction: " << error.get_type() << ": " << error.get_msg() << endl;
425 
426 			if (doSpelling == true)
427 			{
428 				doSpelling = false;
429 
430 				// Try again without spelling correction
431 				// Let the caller catch the exception
432 				generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING, Xapian::TermGenerator::FLAG_SPELLING);
433 				generator.set_document(doc);
434 				generator.index_text(itor, 1, prefix);
435 			}
436 		}
437 		termPos = generator.get_termpos();
438 	}
439 #endif
440 
441 	if (pStemmer != NULL)
442 	{
443 		delete pStemmer;
444 	}
445 }
446 
addPostingsToDocument(Dijon::CJKVTokenizer & tokenizer,Xapian::Stem * pStemmer,const string & text,Xapian::Document & doc,const Xapian::WritableDatabase & db,const string & prefix,bool & doSpelling,Xapian::termcount & termPos) const447 void XapianIndex::addPostingsToDocument(Dijon::CJKVTokenizer &tokenizer, Xapian::Stem *pStemmer,
448 	const string &text, Xapian::Document &doc, const Xapian::WritableDatabase &db,
449 	const string &prefix, bool &doSpelling, Xapian::termcount &termPos) const
450 {
451 	TokensIndexer handler(pStemmer, doc, db, prefix, tokenizer.get_ngram_size(),
452 		doSpelling, termPos);
453 
454 	// Get the terms
455 	tokenizer.tokenize(text, handler, true);
456 #ifdef DEBUG
457 	clog << "XapianIndex::addPostingsToDocument: terms to position " << termPos << endl;
458 #endif
459 }
460 
addLabelsToDocument(Xapian::Document & doc,const set<string> & labels,bool skipInternals)461 void XapianIndex::addLabelsToDocument(Xapian::Document &doc, const set<string> &labels,
462 	bool skipInternals)
463 {
464 	if (labels.empty() == true)
465 	{
466 		return;
467 	}
468 
469 	for (set<string>::const_iterator labelIter = labels.begin(); labelIter != labels.end();
470 		++labelIter)
471 	{
472 		string labelName(*labelIter);
473 
474 		// Prevent from setting internal labels ?
475 		if ((labelName.empty() == true) ||
476 			((skipInternals == true) && (labelName.substr(0, 2) == "X-")))
477 		{
478 			continue;
479 		}
480 
481 #ifdef DEBUG
482 		clog << "XapianIndex::addLabelsToDocument: label \"" << labelName << "\"" << endl;
483 #endif
484 		doc.add_term(string("XLABEL:") + XapianDatabase::limitTermLength(Url::escapeUrl(labelName)));
485 	}
486 }
487 
removePostingsFromDocument(const Xapian::Utf8Iterator & itor,Xapian::Document & doc,const Xapian::WritableDatabase & db,const string & prefix,bool noStemming,bool & doSpelling) const488 void XapianIndex::removePostingsFromDocument(const Xapian::Utf8Iterator &itor, Xapian::Document &doc,
489 	const Xapian::WritableDatabase &db, const string &prefix,
490 	bool noStemming, bool &doSpelling) const
491 {
492 	Xapian::Document termsDoc;
493 	Xapian::termcount termPos = 0;
494 	bool addDoSpelling = false;
495 
496 	// Get the terms, without populating the spelling database
497 	addPostingsToDocument(itor, termsDoc, db, prefix, noStemming, addDoSpelling, termPos);
498 
499 	// Get the terms and remove the first posting for each
500 	for (Xapian::TermIterator termListIter = termsDoc.termlist_begin();
501 		termListIter != termsDoc.termlist_end(); ++termListIter)
502 	{
503 		Xapian::termcount postingsCount = termListIter.positionlist_count();
504 		Xapian::termcount postingNum = 0;
505 		bool removeTerm = false;
506 
507 #ifdef DEBUG
508 		clog << "XapianIndex::removePostingsFromDocument: term " << *termListIter
509 			<< " has " << postingsCount << " postings" << endl;
510 #endif
511 		// If a prefix is defined, or there are no postings, we can afford removing the term
512 		if ((prefix.empty() == false) ||
513 			(postingsCount == 0))
514 		{
515 			removeTerm = true;
516 		}
517 		else
518 		{
519 			// Check whether this term is in the original document and how many postings it has
520 			Xapian::TermIterator termIter = doc.termlist_begin();
521 			if (termIter != doc.termlist_end())
522 			{
523 				termIter.skip_to(*termListIter);
524 				if (termIter != doc.termlist_end())
525 				{
526 					if (*termIter != *termListIter)
527 					{
528 						// This term doesn't exist in the document !
529 #ifdef DEBUG
530 						clog << "XapianIndex::removePostingsFromDocument: no such term" << endl;
531 #endif
532 						continue;
533 					}
534 
535 					if (termIter.positionlist_count() <= postingsCount)
536 					{
537 						// All postings are to be removed, so we can remove the term
538 #ifdef DEBUG
539 						clog << "XapianIndex::removePostingsFromDocument: no extra posting" << endl;
540 #endif
541 						removeTerm = true;
542 					}
543 				}
544 			}
545 		}
546 
547 		if (removeTerm == true)
548 		{
549 			try
550 			{
551 				doc.remove_term(*termListIter);
552 			}
553 			catch (const Xapian::Error &error)
554 			{
555 #ifdef DEBUG
556 				clog << "XapianIndex::removePostingsFromDocument: " << error.get_msg() << endl;
557 #endif
558 			}
559 
560 			try
561 			{
562 				// Decrease this term's frequency in the spelling dictionary
563 				if (doSpelling == true)
564 				{
565 					db.remove_spelling(*termListIter);
566 				}
567 			}
568 			catch (const Xapian::UnimplementedError &error)
569 			{
570 				clog << "Couldn't remove spelling correction: " << error.get_type() << ": " << error.get_msg() << endl;
571 				doSpelling = false;
572 			}
573 			catch (const Xapian::Error &error)
574 			{
575 #ifdef DEBUG
576 				clog << "XapianIndex::removePostingsFromDocument: " << error.get_msg() << endl;
577 #endif
578 			}
579 			continue;
580 		}
581 
582 		// Otherwise, remove the first N postings
583 		// FIXME: if all the postings are in the range associated with the metadata
584 		// as opposed to the actual data, the term can be removed altogether
585 		for (Xapian::PositionIterator firstPosIter = termListIter.positionlist_begin();
586 			firstPosIter != termListIter.positionlist_end(); ++firstPosIter)
587 		{
588 			if (postingNum >= postingsCount)
589 			{
590 				break;
591 			}
592 			++postingNum;
593 
594 			try
595 			{
596 				doc.remove_posting(*termListIter, *firstPosIter);
597 			}
598 			catch (const Xapian::Error &error)
599 			{
600 				// This posting may have been removed already
601 #ifdef DEBUG
602 				clog << "XapianIndex::removePostingsFromDocument: " << error.get_msg() << endl;
603 #endif
604 			}
605 		}
606 	}
607 }
608 
addCommonTerms(const DocumentInfo & docInfo,Xapian::Document & doc,const Xapian::WritableDatabase & db,Xapian::termcount & termPos)609 void XapianIndex::addCommonTerms(const DocumentInfo &docInfo, Xapian::Document &doc,
610 	const Xapian::WritableDatabase &db, Xapian::termcount &termPos)
611 {
612 	string title(docInfo.getTitle());
613 	string location(docInfo.getLocation());
614 	string type(docInfo.getType(false));
615 	Url urlObj(location);
616 
617 	// Add a magic term :-)
618 	doc.add_term(MAGIC_TERM);
619 
620 	// Index the title with prefix S
621 	if (title.empty() == false)
622 	{
623 		addPostingsToDocument(Xapian::Utf8Iterator(title), doc, db, "S",
624 			false, m_doSpelling, termPos);
625 	}
626 
627 	string hostName, tree, fileName;
628 
629 	if (g_pMapper != NULL)
630 	{
631 		hostName = g_pMapper->getHost(docInfo);
632 		tree = g_pMapper->getDirectory(docInfo);
633 		fileName = g_pMapper->getFile(docInfo);
634 	}
635 	else
636 	{
637 		hostName = StringManip::toLowerCase(urlObj.getHost());
638 		tree = urlObj.getLocation();
639 		fileName = urlObj.getFile();
640 	}
641 #ifdef DEBUG
642 	clog << "XapianIndex::addCommonTerms: called for " << docInfo.getLocation()
643 		<< " (" << docInfo.getInternalPath() << ")" << endl;
644 #endif
645 
646 	// Index the full URL with prefix U
647 	doc.add_term(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(docInfo.getLocation(true)), true));
648 	// And for containers, the base file with XFILE:
649 	if ((urlObj.isLocal() == true) &&
650 		(docInfo.getInternalPath().empty() == false))
651 	{
652 		string protocol(urlObj.getProtocol());
653 
654 		doc.add_term(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
655 		if ((urlObj.isLocal() == true) &&
656 			(protocol != "file"))
657 		{
658 			string fileUrl(location);
659 
660 			// Add another term with file as protocol
661 			fileUrl.replace(0, protocol.length(), "file");
662 			doc.add_term(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(fileUrl), true));
663 		}
664 	}
665 	// ...the host name and included domains with prefix H
666 	if (hostName.empty() == false)
667 	{
668 		doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName, true));
669 		string::size_type dotPos = hostName.find('.');
670 		while (dotPos != string::npos)
671 		{
672 			doc.add_term(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));
673 
674 			// Next
675 			dotPos = hostName.find('.', dotPos + 1);
676 		}
677 	}
678 	// ...the location (as is) and all directories with prefix XDIR:
679 	if (tree.empty() == false)
680 	{
681 		if ((urlObj.isLocal() == true) &&
682 			(docInfo.getIsDirectory() == true))
683 		{
684 			doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(docInfo.getLocation().substr(7)), true));
685 #ifdef DEBUG
686 			clog << "XapianIndex::addCommonTerms: full XDIR" << docInfo.getLocation().substr(7) << endl;
687 #endif
688 		}
689 		doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
690 #ifdef DEBUG
691 		clog << "XapianIndex::addCommonTerms: first XDIR" << tree << endl;
692 #endif
693 		if (tree[0] == '/')
694 		{
695 			doc.add_term("XDIR:/");
696 #ifdef DEBUG
697 			clog << "XapianIndex::addCommonTerms: top-level XDIR" << endl;
698 #endif
699 		}
700 		string::size_type slashPos = tree.find('/', 1);
701 		while (slashPos != string::npos)
702 		{
703 			doc.add_term(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));
704 #ifdef DEBUG
705 			clog << "XapianIndex::addCommonTerms: component XDIR" << tree.substr(0, slashPos) << endl;
706 #endif
707 
708 			// Next
709 			slashPos = tree.find('/', slashPos + 1);
710 		}
711 
712 		// ...and all components as XPATH:
713 		bool doSpellingOnPaths = false;
714 		addPostingsToDocument(Xapian::Utf8Iterator(tree), doc, db, "XPATH:",
715 			true, doSpellingOnPaths, termPos);
716 	}
717 	else
718 	{
719 		doc.add_term("XDIR:/");
720 #ifdef DEBUG
721 		clog << "XapianIndex::addCommonTerms: single top-level XDIR" << endl;
722 #endif
723 	}
724 	// ...and the file name with prefix P
725 	if (fileName.empty() == false)
726 	{
727 		string extension;
728 
729 		doc.add_term(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));
730 		if (fileName.find(' ') != string::npos)
731 		{
732 			bool doSpellingOnPaths = false;
733 
734 			// Add more XPATH: terms if there's a space in the file name
735 			addPostingsToDocument(Xapian::Utf8Iterator(fileName), doc, db, "XPATH:",
736 				true, doSpellingOnPaths, termPos);
737 		}
738 
739 		// Does it have an extension ?
740 		string::size_type extPos = fileName.rfind('.');
741 		if ((extPos != string::npos) &&
742 			(extPos + 1 < fileName.length()))
743 		{
744 			extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
745 		}
746 		doc.add_term(string("E") + XapianDatabase::limitTermLength(extension));
747 	}
748 	// Add the language code with prefix L
749 	doc.add_term(string("L") + Languages::toCode(m_stemLanguage));
750 	// ...and the MIME type with prefix T
751 	doc.add_term(string("T") + type);
752 	string::size_type slashPos = type.find('/');
753 	if (slashPos != string::npos)
754 	{
755 		doc.add_term(string("XCLASS:") + type.substr(0, slashPos));
756 	}
757 	// Others
758 	if (g_pMapper != NULL)
759 	{
760 		vector<pair<string, string> > prefixedTerms;
761 
762 		g_pMapper->getTerms(docInfo, prefixedTerms);
763 
764 		for (vector<pair<string, string> >::const_iterator termIter = prefixedTerms.begin();
765 			termIter != prefixedTerms.end(); ++termIter)
766 		{
767 			doc.add_term(termIter->second + XapianDatabase::limitTermLength(termIter->first));
768 		}
769 	}
770 }
771 
removeCommonTerms(Xapian::Document & doc,const Xapian::WritableDatabase & db)772 void XapianIndex::removeCommonTerms(Xapian::Document &doc, const Xapian::WritableDatabase &db)
773 {
774 	DocumentInfo docInfo;
775 	set<string> commonTerms;
776 	string record(doc.get_data());
777 
778 	// First, remove the magic term
779 	commonTerms.insert(MAGIC_TERM);
780 
781 	if (record.empty() == true)
782         {
783 		// Nothing else we can do
784 		return;
785 	}
786 
787 	XapianDatabase::recordToProps(record, &docInfo);
788 	// XapianDatabase expects the language in English, which is okay here
789 	string language(docInfo.getLanguage());
790 	Url urlObj(docInfo.getLocation());
791 
792 	// FIXME: remove terms extracted from the title if they don't have more than one posting
793 	string title(docInfo.getTitle());
794 	if (title.empty() == false)
795 	{
796 		removePostingsFromDocument(Xapian::Utf8Iterator(title), doc, db, "S",
797 			false, m_doSpelling);
798 	}
799 
800 	// Location
801 	string location(docInfo.getLocation());
802 	commonTerms.insert(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(docInfo.getLocation(true)), true));
803 	// Containers' base file
804 	if ((urlObj.isLocal() == true) &&
805 		(docInfo.getInternalPath().empty() == false))
806 	{
807 		string protocol(urlObj.getProtocol());
808 
809 		commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(location), true));
810 
811 		if ((urlObj.isLocal() == true) &&
812 			(protocol != "file"))
813 		{
814 			string fileUrl(location);
815 
816 			// Add another term with file as protocol
817 			fileUrl.replace(0, protocol.length(), "file");
818 			commonTerms.insert(string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(fileUrl), true));
819 		}
820 	}
821 	// Host name
822 	string hostName(StringManip::toLowerCase(urlObj.getHost()));
823 	if (hostName.empty() == false)
824 	{
825 		commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName, true));
826 		string::size_type dotPos = hostName.find('.');
827 		while (dotPos != string::npos)
828 		{
829 			commonTerms.insert(string("H") + XapianDatabase::limitTermLength(hostName.substr(dotPos + 1), true));
830 
831 			// Next
832 			dotPos = hostName.find('.', dotPos + 1);
833 		}
834 	}
835 	// ...location
836 	string tree(urlObj.getLocation());
837 	if (tree.empty() == false)
838 	{
839 		if ((urlObj.isLocal() == true) &&
840 			(docInfo.getIsDirectory() == true))
841 		{
842 			commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(docInfo.getLocation().substr(7)), true));
843 		}
844 		commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree), true));
845 		if (tree[0] == '/')
846 		{
847 			commonTerms.insert("XDIR:/");
848 		}
849 		string::size_type slashPos = tree.find('/', 1);
850 		while (slashPos != string::npos)
851 		{
852 			commonTerms.insert(string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(tree.substr(0, slashPos)), true));
853 
854 			// Next
855 			slashPos = tree.find('/', slashPos + 1);
856 		}
857 
858 		// ...paths
859 		bool doSpellingOnPaths = false;
860 		removePostingsFromDocument(Xapian::Utf8Iterator(tree), doc, db, "XPATH:",
861 			true, doSpellingOnPaths);
862 	}
863 	else
864 	{
865 		commonTerms.insert("XDIR:/");
866 	}
867 	// ...and file name
868 	string fileName(urlObj.getFile());
869 	if (fileName.empty() == false)
870 	{
871 		string extension;
872 
873 		commonTerms.insert(string("P") + XapianDatabase::limitTermLength(Url::escapeUrl(fileName), true));
874 		if (fileName.find(' ') != string::npos)
875 		{
876 			bool doSpellingOnPaths = false;
877 
878 			removePostingsFromDocument(Xapian::Utf8Iterator(fileName), doc, db, "XPATH:",
879 				true, doSpellingOnPaths);
880 		}
881 
882 		// Does it have an extension ?
883 		string::size_type extPos = fileName.rfind('.');
884 		if ((extPos != string::npos) &&
885 			(extPos + 1 < fileName.length()))
886 		{
887 			extension = StringManip::toLowerCase(fileName.substr(extPos + 1));
888 		}
889 		commonTerms.insert(string("E") + XapianDatabase::limitTermLength(extension));
890 	}
891 	// Language code
892 	commonTerms.insert(string("L") + Languages::toCode(language));
893 	// MIME type
894 	string type(docInfo.getType(false));
895 	commonTerms.insert(string("T") + type);
896 	string::size_type slashPos = type.find('/');
897 	if (slashPos != string::npos)
898 	{
899 		commonTerms.insert(string("XCLASS:") + type.substr(0, slashPos));
900 	}
901 	// Others
902 	if (g_pMapper != NULL)
903 	{
904 		vector<pair<string, string> > prefixedTerms;
905 
906 		g_pMapper->getTerms(docInfo, prefixedTerms);
907 
908 		for (vector<pair<string, string> >::const_iterator termIter = prefixedTerms.begin();
909 			termIter != prefixedTerms.end(); ++termIter)
910 		{
911 			commonTerms.insert(termIter->second + XapianDatabase::limitTermLength(termIter->first));
912 		}
913 	}
914 
915 	for (set<string>::const_iterator termIter = commonTerms.begin(); termIter != commonTerms.end(); ++termIter)
916 	{
917 		try
918 		{
919 			doc.remove_term(*termIter);
920 		}
921 		catch (const Xapian::Error &error)
922 		{
923 #ifdef DEBUG
924 			clog << "XapianIndex::removeCommonTerms: " << error.get_msg() << endl;
925 #endif
926 		}
927 	}
928 }
929 
scanDocument(const string & suggestedLanguage,const char * pData,off_t dataLength)930 string XapianIndex::scanDocument(const string &suggestedLanguage,
931 	const char *pData, off_t dataLength)
932 {
933 	vector<string> candidates;
934 	string language;
935 	bool scannedDocument = false;
936 
937 	if (suggestedLanguage.empty() == false)
938 	{
939 		// See first if this is suitable
940 		candidates.push_back(suggestedLanguage);
941 	}
942 	else
943 	{
944 		// Try to determine the document's language right away
945 		LanguageDetector::getInstance().guessLanguage(pData, max(dataLength, (off_t)2048), candidates);
946 
947 		scannedDocument = true;
948 	}
949 
950 	// See which of these languages is suitable for stemming
951 	vector<string>::iterator langIter = candidates.begin();
952 	while (langIter != candidates.end())
953 	{
954 		if (*langIter == "unknown")
955 		{
956 			++langIter;
957 			continue;
958 		}
959 
960 		try
961 		{
962 			Xapian::Stem stemmer(StringManip::toLowerCase(*langIter));
963 		}
964 		catch (const Xapian::Error &error)
965 		{
966 			clog << "Invalid language: " << error.get_type() << ": " << error.get_msg() << endl;
967 
968 			if (scannedDocument == false)
969 			{
970 				// The suggested language is not suitable
971 				candidates.clear();
972 				LanguageDetector::getInstance().guessLanguage(pData, max(dataLength, (off_t)2048), candidates);
973 
974 				langIter = candidates.begin();
975 				scannedDocument = true;
976 			}
977 			else
978 			{
979 				++langIter;
980 			}
981 			continue;
982 		}
983 
984 		language = *langIter;
985 		break;
986 	}
987 #ifdef DEBUG
988 	clog << "XapianIndex::scanDocument: language " << language << endl;
989 #endif
990 
991 	return language;
992 }
993 
setDocumentData(const DocumentInfo & docInfo,Xapian::Document & doc,const string & language) const994 void XapianIndex::setDocumentData(const DocumentInfo &docInfo, Xapian::Document &doc,
995 	const string &language) const
996 {
997 	time_t timeT = TimeConverter::fromTimestamp(docInfo.getTimestamp());
998 	struct tm *tm = localtime(&timeT);
999 	string yyyymmdd(TimeConverter::toYYYYMMDDString(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday));
1000 	string hhmmss(TimeConverter::toHHMMSSString(tm->tm_hour, tm->tm_min, tm->tm_sec));
1001 
1002 	// Date
1003 	doc.add_value(0, yyyymmdd);
1004 	// FIXME: checksum in value 1
1005 	// Size
1006 	doc.add_value(2, Xapian::sortable_serialise((double )docInfo.getSize()));
1007 	// Time
1008 	doc.add_value(3, hhmmss);
1009 	// Date and time, for results sorting
1010 	doc.add_value(4, yyyymmdd + hhmmss);
1011 	// Number of seconds to January 1st, 10000
1012 	doc.add_value(5, Xapian::sortable_serialise((double )253402300800 - timeT));
1013 	// Any custom value ?
1014 	if (g_pMapper != NULL)
1015 	{
1016 		map<unsigned int, string> values;
1017 
1018 		g_pMapper->getValues(docInfo, values);
1019 		for (map<unsigned int, string>::const_iterator valIter = values.begin();
1020 			valIter != values.end(); ++valIter)
1021 		{
1022 			doc.add_value(valIter->first, valIter->second);
1023 		}
1024 	}
1025 
1026 	DocumentInfo docCopy(docInfo);
1027 	// XapianDatabase expects the language in English, which is okay here
1028 	docCopy.setLanguage(language);
1029 	doc.set_data(XapianDatabase::propsToRecord(&docCopy));
1030 }
1031 
deleteDocuments(const string & term)1032 bool XapianIndex::deleteDocuments(const string &term)
1033 {
1034 	bool unindexed = false;
1035 
1036 	if (term.empty() == true)
1037 	{
1038 		return false;
1039 	}
1040 
1041 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1042 	if (pDatabase == NULL)
1043 	{
1044 		clog << "Couldn't get index " << m_databaseName << endl;
1045 		return false;
1046 	}
1047 
1048 	try
1049 	{
1050 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
1051 		if (pIndex != NULL)
1052 		{
1053 #ifdef DEBUG
1054 			clog << "XapianIndex::deleteDocuments: term is " << term << endl;
1055 #endif
1056 
1057 			// Delete documents from the index
1058 			pIndex->delete_document(term);
1059 
1060 			unindexed = true;
1061 		}
1062 	}
1063 	catch (const Xapian::Error &error)
1064 	{
1065 		clog << "Couldn't unindex documents: " << error.get_type() << ": " << error.get_msg() << endl;
1066 	}
1067 	catch (...)
1068 	{
1069 		clog << "Couldn't unindex documents, unknown exception occurred" << endl;
1070 	}
1071 	pDatabase->unlock();
1072 
1073 	return unindexed;
1074 }
1075 
1076 //
1077 // Implementation of IndexInterface
1078 //
1079 
1080 /// Returns false if the index couldn't be opened.
isGood(void) const1081 bool XapianIndex::isGood(void) const
1082 {
1083 	return m_goodIndex;
1084 }
1085 
1086 /// Gets metadata.
getMetadata(const string & name) const1087 string XapianIndex::getMetadata(const string &name) const
1088 {
1089 	string metadataValue;
1090 
1091 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1092 	if (pDatabase == NULL)
1093 	{
1094 		clog << "Couldn't get index " << m_databaseName << endl;
1095 		return "";
1096 	}
1097 
1098 	try
1099 	{
1100 		Xapian::Database *pIndex = pDatabase->readLock();
1101 		if (pIndex != NULL)
1102 		{
1103 			// If this index type doesn't support metadata, no exception will be thrown
1104 			// We will just get an empty string
1105 			metadataValue = pIndex->get_metadata(name);
1106 		}
1107 	}
1108 	catch (const Xapian::Error &error)
1109 	{
1110 		clog << "Couldn't get metadata: " << error.get_type() << ": " << error.get_msg() << endl;
1111 	}
1112 	catch (...)
1113 	{
1114 		clog << "Couldn't get metadata, unknown exception occurred" << endl;
1115 	}
1116 	pDatabase->unlock();
1117 
1118 	return metadataValue;
1119 }
1120 
1121 /// Sets metadata.
setMetadata(const string & name,const string & value) const1122 bool XapianIndex::setMetadata(const string &name, const string &value) const
1123 {
1124 	bool setMetadata = false;
1125 
1126 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1127 	if (pDatabase == NULL)
1128 	{
1129 		clog << "Couldn't get index " << m_databaseName << endl;
1130 		return false;
1131 	}
1132 
1133 	try
1134 	{
1135 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
1136 		if (pIndex != NULL)
1137 		{
1138 			pIndex->set_metadata(name, value);
1139 			setMetadata = true;
1140 		}
1141 	}
1142 	catch (const Xapian::UnimplementedError &error)
1143 	{
1144 		clog << "Couldn't set metadata: " << error.get_type() << ": " << error.get_msg() << endl;
1145 	}
1146 	catch (const Xapian::Error &error)
1147 	{
1148 		clog << "Couldn't set metadata: " << error.get_type() << ": " << error.get_msg() << endl;
1149 	}
1150 	catch (...)
1151 	{
1152 		clog << "Couldn't set metadata, unknown exception occurred" << endl;
1153 	}
1154 	pDatabase->unlock();
1155 
1156 	return setMetadata;
1157 }
1158 
1159 /// Gets the index location.
getLocation(void) const1160 string XapianIndex::getLocation(void) const
1161 {
1162 	return m_databaseName;
1163 }
1164 
1165 /// Returns a document's properties.
getDocumentInfo(unsigned int docId,DocumentInfo & docInfo) const1166 bool XapianIndex::getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const
1167 {
1168 	bool foundDocument = false;
1169 
1170 	if (docId == 0)
1171 	{
1172 		return false;
1173 	}
1174 
1175 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1176 	if (pDatabase == NULL)
1177 	{
1178 		clog << "Couldn't get index " << m_databaseName << endl;
1179 		return false;
1180 	}
1181 
1182 	try
1183 	{
1184 		Xapian::Database *pIndex = pDatabase->readLock();
1185 		if (pIndex != NULL)
1186 		{
1187 			Xapian::Document doc = pIndex->get_document(docId);
1188 			string record(doc.get_data());
1189 
1190 			// Get the current document data
1191 			if (record.empty() == false)
1192 			{
1193 				XapianDatabase::recordToProps(record, &docInfo);
1194 				// XapianDatabase stored the language in English
1195 				docInfo.setLanguage(Languages::toLocale(docInfo.getLanguage()));
1196 				foundDocument = true;
1197 			}
1198 		}
1199 	}
1200 	catch (const Xapian::Error &error)
1201 	{
1202 		clog << "Couldn't get document properties: " << error.get_type() << ": " << error.get_msg() << endl;
1203 	}
1204 	catch (...)
1205 	{
1206 		clog << "Couldn't get document properties, unknown exception occurred" << endl;
1207 	}
1208 	pDatabase->unlock();
1209 
1210 	return foundDocument;
1211 }
1212 
1213 /// Returns a document's terms count.
getDocumentTermsCount(unsigned int docId) const1214 unsigned int XapianIndex::getDocumentTermsCount(unsigned int docId) const
1215 {
1216 	unsigned int termsCount = 0;
1217 
1218 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1219 	if (pDatabase == NULL)
1220 	{
1221 		clog << "Couldn't get index " << m_databaseName << endl;
1222 		return 0;
1223 	}
1224 
1225 	try
1226 	{
1227 		Xapian::Database *pIndex = pDatabase->readLock();
1228 		if (pIndex != NULL)
1229 		{
1230 			Xapian::Document doc = pIndex->get_document(docId);
1231 
1232 			termsCount = doc.termlist_count();
1233 #ifdef DEBUG
1234 			clog << "XapianIndex::getDocumentTermsCount: " << termsCount << " terms in document " << docId << endl;
1235 #endif
1236 		}
1237 	}
1238 	catch (const Xapian::Error &error)
1239 	{
1240 		clog << "Couldn't get document terms count: " << error.get_type() << ": " << error.get_msg() << endl;
1241 	}
1242 	catch (...)
1243 	{
1244 		clog << "Couldn't get document terms count, unknown exception occurred" << endl;
1245 	}
1246 	pDatabase->unlock();
1247 
1248 	return termsCount;
1249 }
1250 
1251 /// Returns a document's terms.
getDocumentTerms(unsigned int docId,map<unsigned int,string> & wordsBuffer) const1252 bool XapianIndex::getDocumentTerms(unsigned int docId, map<unsigned int, string> &wordsBuffer) const
1253 {
1254 	vector<string> noPosTerms;
1255 	bool gotTerms = false;
1256 
1257 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1258 	if (pDatabase == NULL)
1259 	{
1260 		clog << "Couldn't get index " << m_databaseName << endl;
1261 		return false;
1262 	}
1263 
1264 	try
1265 	{
1266 		Xapian::Database *pIndex = pDatabase->readLock();
1267 		if (pIndex != NULL)
1268 		{
1269 			unsigned int lastPos = 0;
1270 
1271 			// Go through the position list of each term
1272 			for (Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
1273 				termIter != pIndex->termlist_end(docId); ++termIter)
1274 			{
1275 				string termName(*termIter);
1276 				char firstChar = termName[0];
1277 				bool hasPositions = false;
1278 
1279 				// Is it prefixed ?
1280 				if (isupper((int)firstChar) != 0)
1281 				{
1282 					// Skip X-prefixed terms
1283 					if (firstChar == 'X')
1284 					{
1285 #ifdef DEBUG
1286 						clog << "XapianIndex::getDocumentTerms: skipping " << termName << endl;
1287 #endif
1288 						continue;
1289 					}
1290 
1291 					// Keep other prefixed terms (S, U, H, P, L, T...)
1292 					termName.erase(0, 1);
1293 				}
1294 
1295 				for (Xapian::PositionIterator positionIter = pIndex->positionlist_begin(docId, *termIter);
1296 					positionIter != pIndex->positionlist_end(docId, *termIter); ++positionIter)
1297 				{
1298 					wordsBuffer[*positionIter] = termName;
1299 					if (*positionIter > lastPos)
1300 					{
1301 						lastPos = *positionIter;
1302 					}
1303 					hasPositions = true;
1304 				}
1305 
1306 				if (hasPositions == false)
1307 				{
1308 					noPosTerms.push_back(termName);
1309 				}
1310 
1311 				gotTerms = true;
1312 			}
1313 
1314 			// Append terms without positional docInformation as if they were at the end of the document
1315 			for (vector<string>::const_iterator noPosIter = noPosTerms.begin();
1316 				noPosIter != noPosTerms.end(); ++noPosIter)
1317 			{
1318 				wordsBuffer[lastPos] = *noPosIter;
1319 				++lastPos;
1320 			}
1321 		}
1322 	}
1323 	catch (const Xapian::Error &error)
1324 	{
1325 		clog << "Couldn't get document terms: " << error.get_type() << ": " << error.get_msg() << endl;
1326 	}
1327 	catch (...)
1328 	{
1329 		clog << "Couldn't get document terms, unknown exception occurred" << endl;
1330 	}
1331 	pDatabase->unlock();
1332 
1333 	return gotTerms;
1334 }
1335 
1336 /// Sets the list of known labels.
setLabels(const set<string> & labels,bool resetLabels)1337 bool XapianIndex::setLabels(const set<string> &labels, bool resetLabels)
1338 {
1339 	string labelsString;
1340 
1341 	// Whether labels are reset or not doesn't make any difference
1342 	for (set<string>::const_iterator labelIter = labels.begin();
1343 		labelIter != labels.end(); ++labelIter)
1344 	{
1345 		// Prevent from setting internal labels
1346 		if (labelIter->substr(0, 2) == "X-")
1347 		{
1348 			continue;
1349 		}
1350 
1351 		labelsString += "[";
1352 		labelsString += Url::escapeUrl(*labelIter);
1353 		labelsString += "]";
1354 	}
1355 
1356 	return setMetadata("labels", labelsString);
1357 }
1358 
1359 /// Gets the list of known labels.
getLabels(set<string> & labels) const1360 bool XapianIndex::getLabels(set<string> &labels) const
1361 {
1362 	string labelsString(getMetadata("labels"));
1363 
1364 	if (labelsString.empty() == true)
1365 	{
1366 		return false;
1367 	}
1368 
1369 	string::size_type endPos = 0;
1370 	string label(StringManip::extractField(labelsString, "[", "]", endPos));
1371 
1372 	while (label.empty() == false)
1373 	{
1374 		labels.insert(Url::unescapeUrl(label));
1375 
1376 		if (endPos == string::npos)
1377 		{
1378 			break;
1379 		}
1380 		label = StringManip::extractField(labelsString, "[", "]", endPos);
1381 	}
1382 
1383 	return true;
1384 }
1385 
1386 /// Adds a label.
addLabel(const string & name)1387 bool XapianIndex::addLabel(const string &name)
1388 {
1389 	set<string> labels;
1390 
1391 	if (getLabels(labels) == true)
1392 	{
1393 		labels.insert(name);
1394 
1395 		if (setLabels(labels, true) == true)
1396 		{
1397 			return true;
1398 		}
1399 	}
1400 
1401 	return false;
1402 }
1403 
1404 /// Deletes all references to a label.
deleteLabel(const string & name)1405 bool XapianIndex::deleteLabel(const string &name)
1406 {
1407 	bool deletedLabel = false;
1408 
1409 	// Prevent from deleting internal labels
1410 	if (name.substr(0, 2) == "X-")
1411 	{
1412 		return false;
1413 	}
1414 
1415 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1416 	if (pDatabase == NULL)
1417 	{
1418 		clog << "Couldn't get index " << m_databaseName << endl;
1419 		return false;
1420 	}
1421 
1422 	try
1423 	{
1424 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
1425 		if (pIndex != NULL)
1426 		{
1427 			string term("XLABEL:");
1428 
1429 			// Get documents that have this label
1430 			term += XapianDatabase::limitTermLength(Url::escapeUrl(name));
1431 			for (Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
1432 				postingIter != pIndex->postlist_end(term); ++postingIter)
1433 			{
1434 				Xapian::docid docId = *postingIter;
1435 
1436 				// Get the document
1437 				Xapian::Document doc = pIndex->get_document(docId);
1438 				// Remove the term
1439 				doc.remove_term(term);
1440 				// ...and update the document
1441 				pIndex->replace_document(docId, doc);
1442 			}
1443 			deletedLabel = true;
1444 		}
1445 	}
1446 	catch (const Xapian::Error &error)
1447 	{
1448 		clog << "Couldn't delete label: " << error.get_type() << ": " << error.get_msg() << endl;
1449 	}
1450 	catch (...)
1451 	{
1452 		clog << "Couldn't delete label, unknown exception occurred" << endl;
1453 	}
1454 	pDatabase->unlock();
1455 
1456 	return deletedLabel;
1457 }
1458 
1459 /// Determines whether a document has a label.
hasLabel(unsigned int docId,const string & name) const1460 bool XapianIndex::hasLabel(unsigned int docId, const string &name) const
1461 {
1462 	bool foundLabel = false;
1463 
1464 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1465 	if (pDatabase == NULL)
1466 	{
1467 		clog << "Couldn't get index " << m_databaseName << endl;
1468 		return false;
1469 	}
1470 
1471 	try
1472 	{
1473 		Xapian::Database *pIndex = pDatabase->readLock();
1474 		if (pIndex != NULL)
1475 		{
1476 			string term("XLABEL:");
1477 
1478 			// Get documents that have this label
1479 			// FIXME: would it be faster to get the document's terms ?
1480 			term += XapianDatabase::limitTermLength(Url::escapeUrl(name));
1481 			Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
1482 			if (postingIter != pIndex->postlist_end(term))
1483 			{
1484 				// Is this document in the list ?
1485 				postingIter.skip_to(docId);
1486 				if ((postingIter != pIndex->postlist_end(term)) &&
1487 					(docId == (*postingIter)))
1488 				{
1489 					foundLabel = true;
1490 				}
1491 			}
1492 		}
1493 	}
1494 	catch (const Xapian::Error &error)
1495 	{
1496 		clog << "Couldn't check document labels: " << error.get_type() << ": " << error.get_msg() << endl;
1497 	}
1498 	catch (...)
1499 	{
1500 		clog << "Couldn't check document labels, unknown exception occurred" << endl;
1501 	}
1502 	pDatabase->unlock();
1503 
1504 	return foundLabel;
1505 }
1506 
1507 /// Returns a document's labels.
getDocumentLabels(unsigned int docId,set<string> & labels) const1508 bool XapianIndex::getDocumentLabels(unsigned int docId, set<string> &labels) const
1509 {
1510 	bool gotLabels = false;
1511 
1512 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1513 	if (pDatabase == NULL)
1514 	{
1515 		clog << "Couldn't get index " << m_databaseName << endl;
1516 		return false;
1517 	}
1518 
1519 	labels.clear();
1520 	try
1521 	{
1522 		Xapian::Database *pIndex = pDatabase->readLock();
1523 		if (pIndex != NULL)
1524 		{
1525 			Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
1526 			if (termIter != pIndex->termlist_end(docId))
1527 			{
1528 				for (termIter.skip_to("XLABEL:");
1529 					termIter != pIndex->termlist_end(docId); ++termIter)
1530 				{
1531 					if ((*termIter).length() < 7)
1532 					{
1533 						break;
1534 					}
1535 
1536 					// Is this a label ?
1537 					if (strncasecmp((*termIter).c_str(), "XLABEL:", min(7, (int)(*termIter).length())) == 0)
1538 					{
1539 						labels.insert(Url::unescapeUrl((*termIter).substr(7)));
1540 					}
1541 				}
1542 				gotLabels = true;
1543 			}
1544 		}
1545 	}
1546 	catch (const Xapian::Error &error)
1547 	{
1548 		clog << "Couldn't get document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
1549 	}
1550 	catch (...)
1551 	{
1552 		clog << "Couldn't get document's labels, unknown exception occurred" << endl;
1553 	}
1554 	pDatabase->unlock();
1555 
1556 	return gotLabels;
1557 }
1558 
1559 /// Sets a document's labels.
setDocumentLabels(unsigned int docId,const set<string> & labels,bool resetLabels)1560 bool XapianIndex::setDocumentLabels(unsigned int docId, const set<string> &labels,
1561 	bool resetLabels)
1562 {
1563 	set<unsigned int> docIds;
1564 
1565 	docIds.insert(docId);
1566 	return setDocumentsLabels(docIds, labels, resetLabels);
1567 }
1568 
1569 /// Sets documents' labels.
setDocumentsLabels(const set<unsigned int> & docIds,const set<string> & labels,bool resetLabels)1570 bool XapianIndex::setDocumentsLabels(const set<unsigned int> &docIds,
1571 	const set<string> &labels, bool resetLabels)
1572 {
1573 	bool updatedLabels = false;
1574 
1575 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1576 	if (pDatabase == NULL)
1577 	{
1578 		clog << "Couldn't get index " << m_databaseName << endl;
1579 		return false;
1580 	}
1581 
1582 	for (set<unsigned int>::const_iterator docIter = docIds.begin();
1583 		docIter != docIds.end(); ++docIter)
1584 	{
1585 		try
1586 		{
1587 			Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
1588 			if (pIndex == NULL)
1589 			{
1590 				break;
1591 			}
1592 
1593 			unsigned int docId = (*docIter);
1594 			Xapian::Document doc = pIndex->get_document(docId);
1595 
1596 			// Reset existing labels ?
1597 			if (resetLabels == true)
1598 			{
1599 				Xapian::TermIterator termIter = pIndex->termlist_begin(docId);
1600 				if (termIter != pIndex->termlist_end(docId))
1601 				{
1602 					for (termIter.skip_to("XLABEL:");
1603 						termIter != pIndex->termlist_end(docId); ++termIter)
1604 					{
1605 						string term(*termIter);
1606 
1607 						// Is this a non-internal label ?
1608 						if ((strncasecmp(term.c_str(), "XLABEL:", min(7, (int)term.length())) == 0) &&
1609 							(strncasecmp(term.c_str(), "XLABEL:X-", min(9, (int)term.length())) != 0))
1610 						{
1611 							doc.remove_term(term);
1612 						}
1613 					}
1614 				}
1615 			}
1616 
1617 			// Set new labels
1618 			addLabelsToDocument(doc, labels, true);
1619 
1620 			pIndex->replace_document(docId, doc);
1621 			updatedLabels = true;
1622 		}
1623 		catch (const Xapian::Error &error)
1624 		{
1625 			clog << "Couldn't update document's labels: " << error.get_type() << ": " << error.get_msg() << endl;
1626 		}
1627 		catch (...)
1628 		{
1629 			clog << "Couldn't update document's labels, unknown exception occurred" << endl;
1630 		}
1631 
1632 		pDatabase->unlock();
1633 	}
1634 
1635 	return updatedLabels;
1636 }
1637 
1638 /// Checks whether the given URL is in the index.
hasDocument(const string & url) const1639 unsigned int XapianIndex::hasDocument(const string &url) const
1640 {
1641 	unsigned int docId = 0;
1642 
1643 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1644 	if (pDatabase == NULL)
1645 	{
1646 		clog << "Couldn't get index " << m_databaseName << endl;
1647 		return 0;
1648 	}
1649 
1650 	try
1651 	{
1652 		Xapian::Database *pIndex = pDatabase->readLock();
1653 		if (pIndex != NULL)
1654 		{
1655 			string term = string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(url)), true);
1656 
1657 			// Get documents that have this term
1658 			Xapian::PostingIterator postingIter = pIndex->postlist_begin(term);
1659 			if (postingIter != pIndex->postlist_end(term))
1660 			{
1661 				// This URL was indexed
1662 				docId = *postingIter;
1663 #ifdef DEBUG
1664 				clog << "XapianIndex::hasDocument: " << term << " in document "
1665 					<< docId << " " << postingIter.get_wdf() << " time(s)" << endl;
1666 #endif
1667 			}
1668 			// FIXME: what if the term exists in more than one document ?
1669 		}
1670 	}
1671 	catch (const Xapian::Error &error)
1672 	{
1673 		clog << "Couldn't look for document: " << error.get_type() << ": " << error.get_msg() << endl;
1674 	}
1675 	catch (...)
1676 	{
1677 		clog << "Couldn't look for document, unknown exception occurred" << endl;
1678 	}
1679 	pDatabase->unlock();
1680 
1681 	return docId;
1682 }
1683 
1684 /// Gets terms with the same root.
getCloseTerms(const string & term,set<string> & suggestions)1685 unsigned int XapianIndex::getCloseTerms(const string &term, set<string> &suggestions)
1686 {
1687 	Dijon::CJKVTokenizer tokenizer;
1688 
1689 	// Only offer suggestions for non CJKV terms
1690 	if (tokenizer.has_cjkv(term) == true)
1691 	{
1692 		return 0;
1693 	}
1694 
1695 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1696 	if (pDatabase == NULL)
1697 	{
1698 		clog << "Couldn't get index " << m_databaseName << endl;
1699 		return 0;
1700 	}
1701 
1702 	suggestions.clear();
1703 	try
1704 	{
1705 		Xapian::Database *pIndex = pDatabase->readLock();
1706 		if (pIndex != NULL)
1707 		{
1708 			Xapian::TermIterator termIter = pIndex->allterms_begin();
1709 
1710 			if (termIter != pIndex->allterms_end())
1711 			{
1712 				string baseTerm(StringManip::toLowerCase(term));
1713 				unsigned int count = 0;
1714 
1715 				// Get the next 10 terms
1716 				for (termIter.skip_to(baseTerm);
1717 					(termIter != pIndex->allterms_end()) && (count < 10); ++termIter)
1718 				{
1719 					string suggestedTerm(*termIter);
1720 
1721 					// Does this term have the same root ?
1722 					if (suggestedTerm.find(baseTerm) != 0)
1723 					{
1724 						break;
1725 					}
1726 
1727 					suggestions.insert(suggestedTerm);
1728 					++count;
1729 				}
1730 			}
1731 		}
1732 	}
1733 	catch (const Xapian::Error &error)
1734 	{
1735 		clog << "Couldn't get terms: " << error.get_type() << ": " << error.get_msg() << endl;
1736 	}
1737 	catch (...)
1738 	{
1739 		clog << "Couldn't get terms, unknown exception occurred" << endl;
1740 	}
1741 	pDatabase->unlock();
1742 
1743 	return suggestions.size();
1744 }
1745 
1746 /// Returns the ID of the last document.
getLastDocumentID(void) const1747 unsigned int XapianIndex::getLastDocumentID(void) const
1748 {
1749 	unsigned int docId = 0;
1750 
1751 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1752 	if (pDatabase == NULL)
1753 	{
1754 		clog << "Couldn't get index " << m_databaseName << endl;
1755 		return 0;
1756 	}
1757 
1758 	try
1759 	{
1760 		Xapian::Database *pIndex = pDatabase->readLock();
1761 		if (pIndex != NULL)
1762 		{
1763 			docId = pIndex->get_lastdocid();
1764 		}
1765 	}
1766 	catch (const Xapian::Error &error)
1767 	{
1768 		clog << "Couldn't get last document ID: " << error.get_type() << ": " << error.get_msg() << endl;
1769 	}
1770 	catch (...)
1771 	{
1772 		clog << "Couldn't get last document ID, unknown exception occurred" << endl;
1773 	}
1774 	pDatabase->unlock();
1775 
1776 	return docId;
1777 }
1778 
1779 /// Returns the number of documents.
getDocumentsCount(const string & labelName) const1780 unsigned int XapianIndex::getDocumentsCount(const string &labelName) const
1781 {
1782 	unsigned int docCount = 0;
1783 
1784 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
1785 	if (pDatabase == NULL)
1786 	{
1787 		clog << "Couldn't get index " << m_databaseName << endl;
1788 		return 0;
1789 	}
1790 
1791 	try
1792 	{
1793 		Xapian::Database *pIndex = pDatabase->readLock();
1794 		if (pIndex != NULL)
1795 		{
1796 			if (labelName.empty() == true)
1797 			{
1798 				docCount = pIndex->get_doccount();
1799 			}
1800 			else
1801 			{
1802 				string term("XLABEL:");
1803 
1804 				// Each label appears only one per document so the collection frequency
1805 				// is the number of documents that have this label
1806 				term += XapianDatabase::limitTermLength(Url::escapeUrl(labelName));
1807 				docCount = pIndex->get_collection_freq(term);
1808 			}
1809 		}
1810 	}
1811 	catch (const Xapian::Error &error)
1812 	{
1813 		clog << "Couldn't count documents: " << error.get_type() << ": " << error.get_msg() << endl;
1814 	}
1815 	catch (...)
1816 	{
1817 		clog << "Couldn't count documents, unknown exception occurred" << endl;
1818 	}
1819 	pDatabase->unlock();
1820 
1821 	return docCount;
1822 }
1823 
1824 /// Lists document IDs.
listDocuments(set<unsigned int> & docIds,unsigned int maxDocsCount,unsigned int startDoc) const1825 unsigned int XapianIndex::listDocuments(set<unsigned int> &docIds,
1826 	unsigned int maxDocsCount, unsigned int startDoc) const
1827 {
1828 	// All documents have the magic term
1829 	if (listDocumentsWithTerm("", docIds, maxDocsCount, startDoc) == true)
1830 	{
1831 		return docIds.size();
1832 	}
1833 
1834 	return 0;
1835 }
1836 
1837 /// Lists documents.
listDocuments(const string & name,set<unsigned int> & docIds,NameType type,unsigned int maxDocsCount,unsigned int startDoc) const1838 bool XapianIndex::listDocuments(const string &name, set<unsigned int> &docIds,
1839 	NameType type, unsigned int maxDocsCount, unsigned int startDoc) const
1840 {
1841 	string term;
1842 
1843 	if (type == BY_LABEL)
1844 	{
1845 		term = string("XLABEL:") + XapianDatabase::limitTermLength(Url::escapeUrl(name));
1846 	}
1847 	else if (type == BY_DIRECTORY)
1848 	{
1849 		term = string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
1850 	}
1851 	else if (type == BY_FILE)
1852 	{
1853 		term = string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
1854 	}
1855 	else if (type == BY_CONTAINER_FILE)
1856 	{
1857 		term = string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
1858 	}
1859 
1860 	return listDocumentsWithTerm(term, docIds, maxDocsCount, startDoc);
1861 }
1862 
1863 /// Indexes the given data.
indexDocument(const Document & document,const std::set<std::string> & labels,unsigned int & docId)1864 bool XapianIndex::indexDocument(const Document &document, const std::set<std::string> &labels,
1865 	unsigned int &docId)
1866 {
1867 	bool indexed = false;
1868 
1869 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1870 	if (pDatabase == NULL)
1871 	{
1872 		clog << "Couldn't get index " << m_databaseName << endl;
1873 		return false;
1874 	}
1875 
1876 	// Cache the document's properties
1877 	DocumentInfo docInfo(document);
1878 	docInfo.setLocation(Url::canonicalizeUrl(document.getLocation()));
1879 
1880 	off_t dataLength = 0;
1881 	const char *pData = document.getData(dataLength);
1882 
1883 	// Don't scan the document if a language is specified
1884 	m_stemLanguage = Languages::toEnglish(docInfo.getLanguage());
1885 	if ((pData != NULL) &&
1886 		(dataLength > 0))
1887 	{
1888 		m_stemLanguage = scanDocument(m_stemLanguage, pData, dataLength);
1889 		docInfo.setLanguage(Languages::toLocale(m_stemLanguage));
1890 	}
1891 
1892 	try
1893 	{
1894 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
1895 		if (pIndex != NULL)
1896 		{
1897 			Xapian::Document doc;
1898 			Xapian::termcount termPos = 0;
1899 
1900 			// Populate the Xapian document
1901 			addCommonTerms(docInfo, doc, *pIndex, termPos);
1902 			if ((pData != NULL) &&
1903 				(dataLength > 0))
1904 			{
1905 				Xapian::Utf8Iterator itor(pData, dataLength);
1906 				addPostingsToDocument(itor, doc, *pIndex, "",
1907 					false, m_doSpelling, termPos);
1908 			}
1909 #ifdef DEBUG
1910 			clog << "XapianIndex::indexDocument: " << labels.size() << " labels for URL " << docInfo.getLocation(true) << endl;
1911 #endif
1912 
1913 			// Add labels
1914 			addLabelsToDocument(doc, labels, false);
1915 
1916 			// Set data
1917 			setDocumentData(docInfo, doc, m_stemLanguage);
1918 
1919 			// Add this document to the Xapian index
1920 			docId = pIndex->add_document(doc);
1921 			indexed = true;
1922 		}
1923 	}
1924 	catch (const Xapian::Error &error)
1925 	{
1926 		clog << "Couldn't index document: " << error.get_type() << ": " << error.get_msg() << endl;
1927 	}
1928 	catch (...)
1929 	{
1930 		clog << "Couldn't index document, unknown exception occurred" << endl;
1931 	}
1932 	pDatabase->unlock();
1933 
1934 	return indexed;
1935 }
1936 
1937 /// Updates the given document; true if success.
updateDocument(unsigned int docId,const Document & document)1938 bool XapianIndex::updateDocument(unsigned int docId, const Document &document)
1939 {
1940 	bool updated = false;
1941 
1942 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
1943 	if (pDatabase == NULL)
1944 	{
1945 		clog << "Couldn't get index " << m_databaseName << endl;
1946 		return false;
1947 	}
1948 
1949 	// Cache the document's properties
1950 	DocumentInfo docInfo(document);
1951 	set<string> labels(document.getLabels());
1952 	docInfo.setLocation(Url::canonicalizeUrl(document.getLocation()));
1953 
1954 	off_t dataLength = 0;
1955 	const char *pData = document.getData(dataLength);
1956 
1957 	// Don't scan the document if a language is specified
1958 	m_stemLanguage = Languages::toEnglish(docInfo.getLanguage());
1959 	if ((pData != NULL) &&
1960 		(dataLength > 0))
1961 	{
1962 		m_stemLanguage = scanDocument(m_stemLanguage, pData, dataLength);
1963 		docInfo.setLanguage(Languages::toLocale(m_stemLanguage));
1964 	}
1965 
1966 	Xapian::WritableDatabase *pIndex = NULL;
1967 
1968 	try
1969 	{
1970 		pIndex = pDatabase->writeLock();
1971 		if (pIndex != NULL)
1972 		{
1973 			Xapian::Document doc;
1974 			Xapian::termcount termPos = 0;
1975 
1976 			// Populate the Xapian document
1977 			addCommonTerms(docInfo, doc, *pIndex, termPos);
1978 			if ((pData != NULL) &&
1979 				(dataLength > 0))
1980 			{
1981 				Xapian::Utf8Iterator itor(pData, dataLength);
1982 				addPostingsToDocument(itor, doc, *pIndex, "",
1983 					false, m_doSpelling, termPos);
1984 			}
1985 
1986 			// Add labels
1987 			addLabelsToDocument(doc, labels, false);
1988 
1989 			// Set data
1990 			setDocumentData(docInfo, doc, m_stemLanguage);
1991 
1992 			// Update the document in the database
1993 			pIndex->replace_document(docId, doc);
1994 			updated = true;
1995 		}
1996 	}
1997 	catch (const Xapian::Error &error)
1998 	{
1999 		clog << "Couldn't update document: " << error.get_type() << ": " << error.get_msg() << endl;
2000 	}
2001 	catch (...)
2002 	{
2003 		clog << "Couldn't update document, unknown exception occurred" << endl;
2004 	}
2005 	if (pIndex != NULL)
2006 	{
2007 		pDatabase->unlock();
2008 	}
2009 
2010 	return updated;
2011 }
2012 
2013 /// Updates a document's properties.
updateDocumentInfo(unsigned int docId,const DocumentInfo & docInfo)2014 bool XapianIndex::updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo)
2015 {
2016 	bool updated = false;
2017 
2018 	if (docId == 0)
2019 	{
2020 		return false;
2021 	}
2022 
2023 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
2024 	if (pDatabase == NULL)
2025 	{
2026 		clog << "Couldn't get index " << m_databaseName << endl;
2027 		return false;
2028 	}
2029 
2030 	try
2031 	{
2032 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
2033 		if (pIndex != NULL)
2034 		{
2035 			Xapian::Document doc = pIndex->get_document(docId);
2036 			Xapian::termcount termPos = 0;
2037 
2038 			// Update the document data with the current language
2039 			m_stemLanguage = Languages::toEnglish(docInfo.getLanguage());
2040 			removeCommonTerms(doc, *pIndex);
2041 			addCommonTerms(docInfo, doc, *pIndex, termPos);
2042 			setDocumentData(docInfo, doc, m_stemLanguage);
2043 
2044 			pIndex->replace_document(docId, doc);
2045 			updated = true;
2046 		}
2047 	}
2048 	catch (const Xapian::Error &error)
2049 	{
2050 		clog << "Couldn't update document properties: " << error.get_type() << ": " << error.get_msg() << endl;
2051 	}
2052 	catch (...)
2053 	{
2054 		clog << "Couldn't update document properties, unknown exception occurred" << endl;
2055 	}
2056 	pDatabase->unlock();
2057 
2058 	return updated;
2059 }
2060 
2061 /// Unindexes the given document; true if success.
unindexDocument(unsigned int docId)2062 bool XapianIndex::unindexDocument(unsigned int docId)
2063 {
2064 	bool unindexed = false;
2065 
2066 	if (docId == 0)
2067 	{
2068 		return false;
2069 	}
2070 
2071 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
2072 	if (pDatabase == NULL)
2073 	{
2074 		clog << "Couldn't get index " << m_databaseName << endl;
2075 		return false;
2076 	}
2077 
2078 	try
2079 	{
2080 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
2081 		if (pIndex != NULL)
2082 		{
2083 			// Delete the document from the index
2084 			pIndex->delete_document(docId);
2085 			unindexed = true;
2086 		}
2087 	}
2088 	catch (const Xapian::Error &error)
2089 	{
2090 		clog << "Couldn't unindex document: " << error.get_type() << ": " << error.get_msg() << endl;
2091 	}
2092 	catch (...)
2093 	{
2094 		clog << "Couldn't unindex document, unknown exception occurred" << endl;
2095 	}
2096 	pDatabase->unlock();
2097 
2098 	return unindexed;
2099 }
2100 
2101 /// Unindexes the given document.
unindexDocument(const string & location)2102 bool XapianIndex::unindexDocument(const string &location)
2103 {
2104 	string term(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(location)), true));
2105 
2106 	return deleteDocuments(term);
2107 }
2108 
2109 /// Unindexes documents.
unindexDocuments(const string & name,NameType type)2110 bool XapianIndex::unindexDocuments(const string &name, NameType type)
2111 {
2112 	string term;
2113 
2114 	if (type == BY_LABEL)
2115 	{
2116 		term = string("XLABEL:") + XapianDatabase::limitTermLength(Url::escapeUrl(name));
2117 	}
2118 	else if (type == BY_DIRECTORY)
2119 	{
2120 		term = string("XDIR:") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
2121 	}
2122 	else if (type == BY_FILE)
2123 	{
2124 		term = string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
2125 	}
2126 	else if (type == BY_CONTAINER_FILE)
2127 	{
2128 		term = string("XFILE:") + XapianDatabase::limitTermLength(Url::escapeUrl(name), true);
2129 	}
2130 
2131 	return deleteDocuments(term);
2132 }
2133 
2134 /// Unindexes all documents.
unindexAllDocuments(void)2135 bool XapianIndex::unindexAllDocuments(void)
2136 {
2137 	// All documents have the magic term
2138 	return deleteDocuments(MAGIC_TERM);
2139 }
2140 
2141 /// Flushes recent changes to the disk.
flush(void)2142 bool XapianIndex::flush(void)
2143 {
2144 	bool flushed = false;
2145 
2146 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false);
2147 	if (pDatabase == NULL)
2148 	{
2149 		clog << "Couldn't get index " << m_databaseName << endl;
2150 		return false;
2151 	}
2152 
2153 	try
2154 	{
2155 		Xapian::WritableDatabase *pIndex = pDatabase->writeLock();
2156 		if (pIndex != NULL)
2157 		{
2158 			pIndex->commit();
2159 			flushed = true;
2160 		}
2161 	}
2162 	catch (const Xapian::Error &error)
2163 	{
2164 		clog << "Couldn't flush database: " << error.get_type() << ": " << error.get_msg() << endl;
2165 	}
2166 	catch (...)
2167 	{
2168 		clog << "Couldn't flush database, unknown exception occurred" << endl;
2169 	}
2170 	pDatabase->unlock();
2171 
2172 	return flushed;
2173 }
2174 
2175 /// Reopens the index.
reopen(void) const2176 bool XapianIndex::reopen(void) const
2177 {
2178 	// Reopen
2179 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName);
2180 	if (pDatabase == NULL)
2181 	{
2182 		clog << "Couldn't get index " << m_databaseName << endl;
2183 		return false;
2184 	}
2185 	pDatabase->reopen();
2186 
2187 	return true;
2188 }
2189 
2190 /// Resets the index.
reset(void)2191 bool XapianIndex::reset(void)
2192 {
2193 	// Overwrite and reopen
2194 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, false, true);
2195 	if (pDatabase == NULL)
2196 	{
2197 		clog << "Couldn't get index " << m_databaseName << endl;
2198 		return false;
2199 	}
2200 
2201 	return true;
2202 }
2203 
2204