1 /*
2  *  Copyright 2005-2021 Fabrice Colin
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, write to the Free Software
16  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  */
18 
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #include <unistd.h>
22 #include <time.h>
23 #include <string>
24 #include <cstring>
25 #include <vector>
26 #include <iostream>
27 #include <fstream>
28 #include <algorithm>
29 
30 #include "config.h"
31 #include "Languages.h"
32 #include "StringManip.h"
33 #include "TimeConverter.h"
34 #include "Timer.h"
35 #include "Url.h"
36 #include "CJKVTokenizer.h"
37 #include "FieldMapperInterface.h"
38 #include "XapianDatabaseFactory.h"
39 #include "AbstractGenerator.h"
40 #include "XapianEngine.h"
41 
42 using std::string;
43 using std::multimap;
44 using std::vector;
45 using std::clog;
46 using std::clog;
47 using std::endl;
48 using std::inserter;
49 using std::getline;
50 using std::ifstream;
51 using namespace Dijon;
52 
53 extern FieldMapperInterface *g_pMapper;
54 
checkFilter(const string & freeQuery,string::size_type filterValueStart,bool & escapeValue,bool & hashValue)55 static void checkFilter(const string &freeQuery, string::size_type filterValueStart,
56 	bool &escapeValue, bool &hashValue)
57 {
58 	string filterName;
59 	string::size_type filterNameStart = freeQuery.rfind(' ', filterValueStart);
60 
61 	escapeValue = hashValue = false;
62 
63 	if (filterNameStart == string::npos)
64 	{
65 		filterName = freeQuery.substr(0, filterValueStart);
66 	}
67 	else
68 	{
69 		filterName = freeQuery.substr(filterNameStart + 1, filterValueStart - filterNameStart - 1);
70 	}
71 #ifdef DEBUG
72 	clog << "checkFilter: filter " << filterName << endl;
73 #endif
74 
75 	// In XapianIndex, these are escaped and hashed
76 	if ((filterName == "file") ||
77 		(filterName =="dir") ||
78 		(filterName == "url") ||
79 		(filterName == "path"))
80 	{
81 		escapeValue = hashValue = true;
82 	}
83 	// except label which is only escaped
84 	else if (filterName == "label")
85 	{
86 		escapeValue = true;
87 	}
88 	else if (g_pMapper != NULL)
89 	{
90 		escapeValue = g_pMapper->isEscaped(filterName);
91 	}
92 }
93 
94 class TimeValueRangeProcessor : public Xapian::RangeProcessor
95 {
96 	public:
TimeValueRangeProcessor(Xapian::valueno valueNumber)97 		TimeValueRangeProcessor(Xapian::valueno valueNumber) :
98 			Xapian::RangeProcessor(),
99 			m_valueNumber(valueNumber)
100 		{
101 		}
~TimeValueRangeProcessor()102 		~TimeValueRangeProcessor()
103 		{
104 		}
105 
operator ()(const std::string & begin,const std::string & end)106 		virtual Xapian::Query operator()(const std::string &begin, const std::string &end)
107 		{
108 			if ((begin.size() == 6) &&
109 				(end.size() == 6))
110 			{
111 				// HHMMSS
112 #ifdef DEBUG
113 				clog << "TimeValueRangeProcessor::operator: accepting " << begin << ".." << end << endl;
114 #endif
115 
116 				return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
117 					m_valueNumber,
118 					begin,end);
119 			}
120 			if ((begin.size() == 8) && (end.size() == 8) &&
121 				(begin[2] == begin[5]) && (end[2] == end[5]) && (begin[2] == end[2]) &&
122 				(end[4] == ':'))
123 			{
124 				std::string lower(begin), upper(end);
125 
126 				// HH:MM:SS
127 				lower.erase(2, 1);
128 				lower.erase(5, 1);
129 				upper.erase(2, 1);
130 				upper.erase(5, 1);
131 #ifdef DEBUG
132 				clog << "TimeValueRangeProcessor::operator: accepting " << lower << ".." << upper << endl;
133 #endif
134 
135 				return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
136 					m_valueNumber,
137 					lower, upper);
138 			}
139 #ifdef DEBUG
140 			clog << "TimeValueRangeProcessor::operator: rejecting " << begin << ".." << end << endl;
141 #endif
142 
143 			return Xapian::Query(Xapian::Query::OP_INVALID);
144 		}
145 
146 	protected:
147 		Xapian::valueno m_valueNumber;
148 
149 };
150 
151 class TermDecider : public Xapian::ExpandDecider
152 {
153 	public:
TermDecider(Xapian::Database * pIndex,Xapian::Stem * pStemmer,Xapian::Stopper * pStopper,const string & allowedPrefixes,Xapian::Query & query)154 		TermDecider(Xapian::Database *pIndex,
155 			Xapian::Stem *pStemmer,
156 			Xapian::Stopper *pStopper,
157 			const string &allowedPrefixes,
158 			Xapian::Query &query) :
159 			Xapian::ExpandDecider(),
160 			m_pIndex(pIndex),
161 			m_pStemmer(pStemmer),
162 			m_pStopper(pStopper),
163 			m_allowedPrefixes(allowedPrefixes),
164 			m_pTermsToAvoid(NULL)
165 		{
166 			m_pTermsToAvoid = new set<string>();
167 
168 			for (Xapian::TermIterator termIter = query.get_terms_begin();
169 				termIter != query.get_terms_end(); ++termIter)
170 			{
171 				string term(*termIter);
172 
173 				if (isupper((int)(term[0])) == 0)
174 				{
175 					m_pTermsToAvoid->insert(term);
176 					if (m_pStemmer != NULL)
177 					{
178 						string stem((*m_pStemmer)(term));
179 						m_pTermsToAvoid->insert(stem);
180 					}
181 				}
182 				else if (term[0] == 'Z')
183 				{
184 					m_pTermsToAvoid->insert(term.substr(1));
185 				}
186 			}
187 #ifdef DEBUG
188 			clog << "TermDecider: avoiding " << m_pTermsToAvoid->size() << " terms" << endl;
189 #endif
190 		}
~TermDecider()191 		~TermDecider()
192 		{
193 			if (m_pTermsToAvoid != NULL)
194 			{
195 				delete m_pTermsToAvoid;
196 			}
197 		}
198 
operator ()(const std::string & term) const199 		virtual bool operator()(const std::string &term) const
200 		{
201 			CJKVTokenizer tokenizer;
202 			bool isPrefixed = false;
203 
204 			// Reject short terms
205 			if ((tokenizer.has_cjkv(term) == false) &&
206 				(term.length() < 3))
207 			{
208 				return false;
209 			}
210 
211 			// Reject terms with prefixes we don't want
212 			if (isupper((int)(term[0])) != 0)
213 			{
214 				isPrefixed = true;
215 
216 				if (m_allowedPrefixes.find(term[0]) == string::npos)
217 				{
218 					return false;
219 				}
220 			}
221 
222 			// Reject terms with spaces
223 			if (term.find_first_of(" \t\r\n") != string::npos)
224 			{
225 				return false;
226 			}
227 
228 			// Reject terms that occur only once
229 			if ((m_pIndex != NULL) &&
230 				(m_pIndex->get_termfreq(term) <= 1))
231 			{
232 				return false;
233 			}
234 
235 			// Reject stop words
236 			if ((m_pStopper != NULL) &&
237 				((*m_pStopper)(term) == true))
238 			{
239 				return false;
240 			}
241 
242 			// Stop here if there's no specific terms to avoid
243 			if (m_pTermsToAvoid->empty() == true)
244 			{
245 				return true;
246 			}
247 
248 			// Reject query terms
249 			if (m_pTermsToAvoid->find(term) != m_pTermsToAvoid->end())
250 			{
251 				return false;
252 			}
253 
254 			// Stop here is there's no stemmer
255 			if (m_pStemmer == NULL)
256 			{
257 				return true;
258 			}
259 
260 			// Reject terms that stem to the same as query terms
261 			// or previously validated terms
262 			string stem;
263 			if (isPrefixed == true)
264 			{
265 				stem = (*m_pStemmer)(term.substr(1));
266 			}
267 			else
268 			{
269 				stem = (*m_pStemmer)(term);
270 			}
271 			if (m_pTermsToAvoid->find(stem) != m_pTermsToAvoid->end())
272 			{
273 				return false;
274 			}
275 			m_pTermsToAvoid->insert(stem);
276 
277 			return true;
278 		}
279 
280 	protected:
281 		Xapian::Database *m_pIndex;
282 		Xapian::Stem *m_pStemmer;
283 		Xapian::Stopper *m_pStopper;
284 		string m_allowedPrefixes;
285 		set<string> *m_pTermsToAvoid;
286 
287 };
288 
289 class FileStopper : public Xapian::SimpleStopper
290 {
291 	public:
FileStopper(const string & languageCode)292 		FileStopper(const string &languageCode) :
293 			Xapian::SimpleStopper(),
294 			m_languageCode(languageCode),
295 			m_stopwordsCount(0)
296 		{
297 			if (languageCode.empty() == false)
298 			{
299 				ifstream inputFile;
300 				string fileName(PREFIX);
301 
302 				fileName += "/share/pinot/stopwords/stopwords.";
303 				fileName += languageCode;
304 				inputFile.open(fileName.c_str());
305 				if (inputFile.good() == true)
306 				{
307 					string line;
308 
309 					// Each line is a stopword
310 					while (getline(inputFile, line).eof() == false)
311 					{
312 						add(line);
313 						++m_stopwordsCount;
314 					}
315 				}
316 				inputFile.close();
317 
318 #ifdef DEBUG
319 				clog << "FileStopper: " << m_stopwordsCount << " stopwords for language code " << languageCode << endl;
320 #endif
321 			}
322 		}
~FileStopper()323 		virtual ~FileStopper()
324 		{
325 		}
326 
get_stopwords_count(void) const327 		unsigned int get_stopwords_count(void) const
328 		{
329 			return m_stopwordsCount;
330 		}
331 
get_stopper(const string & languageCode)332 		static FileStopper *get_stopper(const string &languageCode)
333 		{
334 			if (m_pStopper == NULL)
335 			{
336 				m_pStopper = new FileStopper(languageCode);
337 			}
338 			else if (m_pStopper->m_languageCode != languageCode)
339 			{
340 				delete m_pStopper;
341 
342 				m_pStopper = new FileStopper(languageCode);
343 			}
344 
345 			return m_pStopper;
346 		}
347 
free_stopper(void)348 		static void free_stopper(void)
349 		{
350 			if (m_pStopper != NULL)
351 			{
352 				delete m_pStopper;
353 				m_pStopper = NULL;
354 			}
355 		}
356 
357 	protected:
358 		string m_languageCode;
359 		unsigned int m_stopwordsCount;
360 		static FileStopper *m_pStopper;
361 
362 };
363 
364 FileStopper *FileStopper::m_pStopper = NULL;
365 
366 class QueryModifier : public Dijon::CJKVTokenizer::TokensHandler
367 {
368 	public:
369 		typedef enum { NONE = 0, BRACKETS } CJKVWrap;
370 
QueryModifier(const string & query,bool diacriticSensitive,unsigned int nGramSize)371 		QueryModifier(const string &query,
372 			bool diacriticSensitive, unsigned int nGramSize) :
373 			m_query(query),
374 			m_diacriticSensitive(diacriticSensitive),
375 			m_pos(0),
376 			m_wrap(BRACKETS),
377 			m_wrapped(false),
378 			m_nGramCount(0),
379 			m_nGramSize(nGramSize),
380 			m_tokensCount(0),
381 			m_hasCJKV(false),
382 			m_hasNonCJKV(false)
383 		{
384 		}
385 
~QueryModifier()386 		virtual ~QueryModifier()
387 		{
388 		}
389 
handle_token(const string & tok,bool is_cjkv)390 		virtual bool handle_token(const string &tok, bool is_cjkv)
391 		{
392 			if (tok.empty() == true)
393 			{
394 				return false;
395 			}
396 #ifdef DEBUG
397 			clog << "QueryModifier::handle_token: " << tok << endl;
398 #endif
399 
400 			// Where is this token in the original query ?
401 			string::size_type tokPos = m_query.find(tok, m_pos);
402 			++m_tokensCount;
403 
404 			// Is this CJKV ?
405 			if (is_cjkv == false)
406 			{
407 				char lastChar = tok[tok.length() - 1];
408 
409 				if (tokPos == string::npos)
410 				{
411 					// This should have been found
412 					return false;
413 				}
414 
415 				if (m_nGramCount > 0)
416 				{
417 					wrapClose();
418 
419 					m_nGramCount = 0;
420 					m_pos = tokPos;
421 				}
422 
423 				m_currentFilter.clear();
424 				if (lastChar == '"')
425 				{
426 					// It's a quoted string
427 					m_wrap = NONE;
428 				}
429 				else if (lastChar == ':')
430 				{
431 					// It's a filter
432 					m_wrap = NONE;
433 					m_currentFilter = tok;
434 				}
435 				else
436 				{
437 					m_wrap = BRACKETS;
438 				}
439 
440 				if (m_currentFilter.empty() == true)
441 				{
442 					m_hasNonCJKV = true;
443 				}
444 
445 				if (m_diacriticSensitive == false)
446 				{
447 					// Strip accents and other diacritics from terms
448 					string unaccentedTok(Dijon::CJKVTokenizer::strip_marks(tok));
449 					if (tok != unaccentedTok)
450 					{
451 #ifdef DEBUG
452 						clog << "QueryModifier::handle_token: " << tok << " stripped to " << unaccentedTok << endl;
453 #endif
454 						m_query.replace(tokPos, tok.length(), unaccentedTok);
455 					}
456 				}
457 
458 				// Return right away
459 				return true;
460 			}
461 
462 			// First n-gram ?
463 			if (m_nGramCount == 0)
464 			{
465 				if (tokPos == string::npos)
466 				{
467 					// That's definitely not right
468 					return false;
469 				}
470 
471 				// Append non-CJKV text that precedes and start wrapping CJKV tokens
472 				if (tokPos > m_pos)
473 				{
474 					m_modifiedQuery += " " + m_query.substr(m_pos, tokPos - m_pos);
475 				}
476 				m_pos += tok.length();
477 
478 				wrapOpen();
479 			}
480 			else
481 			{
482 				m_modifiedQuery += " ";
483 				if (m_currentFilter.empty() == false)
484 				{
485 					m_modifiedQuery += m_currentFilter;
486 				}
487 			}
488 			m_modifiedQuery += tok;
489 #ifdef DEBUG
490 			clog << "QueryModifier::handle_token: " << m_modifiedQuery << endl;
491 #endif
492 
493 			if (tokPos != string::npos)
494 			{
495 				m_pos = tokPos + tok.length();
496 			}
497 			++m_nGramCount;
498 			m_hasCJKV = true;
499 
500 			return true;
501 		}
502 
get_tokens_count(void) const503 		unsigned int get_tokens_count(void) const
504 		{
505 			return m_tokensCount;
506 		}
507 
get_modified_query(bool & pureCJKV)508 		string get_modified_query(bool &pureCJKV)
509 		{
510 #ifdef DEBUG
511 			clog << "QueryModifier::get_modified_query: " << m_pos << "/" << m_query.length() << endl;
512 #endif
513 
514 			// Anything left ?
515 			if (m_pos < m_query.length() - 1)
516 			{
517 				m_modifiedQuery += " " + m_query.substr(m_pos);
518 			}
519 			wrapClose();
520 #ifdef DEBUG
521 			clog << "QueryModifier::get_modified_query: " << m_modifiedQuery << endl;
522 #endif
523 
524 			if ((m_hasCJKV == true) &&
525 				(m_hasNonCJKV == false))
526 			{
527 				pureCJKV = true;
528 			}
529 			else
530 			{
531 				pureCJKV = false;
532 			}
533 
534 			return m_modifiedQuery;
535 		}
536 
537 	protected:
538 		string m_query;
539 		bool m_diacriticSensitive;
540 		string m_modifiedQuery;
541 		string::size_type m_pos;
542 		CJKVWrap m_wrap;
543 		bool m_wrapped;
544 		string m_currentFilter;
545 		unsigned int m_nGramCount;
546 		unsigned int m_nGramSize;
547 		unsigned int m_tokensCount;
548 		bool m_hasCJKV;
549 		bool m_hasNonCJKV;
550 
wrapOpen(void)551 		void wrapOpen(void)
552 		{
553 			switch (m_wrap)
554 			{
555 				case BRACKETS:
556 					m_modifiedQuery += " (";
557 					break;
558 				case NONE:
559 				default:
560 					break;
561 			}
562 			m_wrapped = true;
563 		}
564 
wrapClose(void)565 		void wrapClose(void)
566 		{
567 			if (m_wrapped == false)
568 			{
569 				return;
570 			}
571 
572 			// Finish wrapping CJKV tokens
573 			switch (m_wrap)
574 			{
575 				case BRACKETS:
576 					m_modifiedQuery += ')';
577 					break;
578 				case NONE:
579 				default:
580 					break;
581 			}
582 			m_wrapped = false;
583 		}
584 
585 };
586 
XapianEngine(const string & database)587 XapianEngine::XapianEngine(const string &database) :
588 	SearchEngineInterface()
589 {
590 	// We expect documents to have been converted to UTF-8 at indexing time
591 	m_charset = "UTF-8";
592 
593 	// If the database name ends with a slash, remove it
594 	if (database[database.length() - 1] == '/')
595 	{
596 		m_databaseName = database.substr(0, database.length() - 1);
597 	}
598 	else
599 	{
600 		m_databaseName = database;
601 	}
602 }
603 
~XapianEngine()604 XapianEngine::~XapianEngine()
605 {
606 }
607 
parseQuery(Xapian::Database * pIndex,const QueryProperties & queryProps,const string & stemLanguage,DefaultOperator defaultOperator,string & correctedFreeQuery,bool minimal)608 Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
609 	const string &stemLanguage, DefaultOperator defaultOperator,
610 	string &correctedFreeQuery, bool minimal)
611 {
612 	Xapian::QueryParser parser;
613 	CJKVTokenizer tokenizer;
614 	string freeQuery(queryProps.getFreeQuery());
615 	unsigned int tokensCount = 1;
616 	bool diacriticSensitive = queryProps.getDiacriticSensitive();
617 
618 	// Modifying the query is necessary if it's CJKV or diacritics are off
619 	if ((tokenizer.has_cjkv(freeQuery) == true) ||
620 		(diacriticSensitive == false))
621 	{
622 		QueryModifier handler(freeQuery,
623 			diacriticSensitive,
624 			tokenizer.get_ngram_size());
625 
626 		tokenizer.tokenize(freeQuery, handler, true);
627 
628 		tokensCount = handler.get_tokens_count();
629 
630 		// We can disable stemming and spelling correction for pure CJKV queries
631 		string cjkvQuery(handler.get_modified_query(minimal));
632 #ifdef DEBUG
633 		clog << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
634 #endif
635 
636 		// Do as if the user had given this as input
637 		freeQuery = cjkvQuery;
638 	}
639 	else
640 	{
641 		string::size_type spacePos = freeQuery.find(' ');
642 		while (spacePos != string::npos)
643 		{
644 			++tokensCount;
645 
646 			if (spacePos + 1 >= freeQuery.length())
647 			{
648 				break;
649 			}
650 
651 			// Next
652 			spacePos = freeQuery.find(' ', spacePos + 1);
653 		}
654 	}
655 #ifdef DEBUG
656 	clog << "XapianEngine::parseQuery: " << tokensCount << " tokens" << endl;
657 #endif
658 
659 	if (pIndex != NULL)
660 	{
661 		// The database is required for wildcards and spelling
662 		parser.set_database(*pIndex);
663 	}
664 
665 	// Set things up
666 	if ((minimal == false) &&
667 		(stemLanguage.empty() == false))
668 	{
669 		parser.set_stemmer(m_stemmer);
670 		parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
671 
672 		// Don't bother loading the stopwords list if there's only one token
673 		if (tokensCount > 1)
674 		{
675 			FileStopper *pStopper = FileStopper::get_stopper(Languages::toCode(stemLanguage));
676 			if ((pStopper != NULL) &&
677 				(pStopper->get_stopwords_count() > 0))
678 			{
679 				parser.set_stopper(pStopper);
680 			}
681 		}
682 	}
683 	else
684 	{
685 #ifdef DEBUG
686 		clog << "XapianEngine::parseQuery: no stemming" << endl;
687 #endif
688 		parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
689 	}
690 	// What's the default operator ?
691 	if (defaultOperator == DEFAULT_OP_AND)
692 	{
693 		parser.set_default_op(Xapian::Query::OP_AND);
694 	}
695 	else
696 	{
697 		parser.set_default_op(Xapian::Query::OP_OR);
698 	}
699 	// Search across text body and title
700 	parser.add_prefix("", "");
701 	parser.add_prefix("", "S");
702 	// X prefixes should always include a colon
703 	parser.add_boolean_prefix("site", "H");
704 	parser.add_boolean_prefix("file", "P");
705 	parser.add_boolean_prefix("ext", "E");
706 	parser.add_prefix("title", "S");
707 	parser.add_boolean_prefix("url", "U");
708 	parser.add_boolean_prefix("dir", "XDIR:");
709 	parser.add_boolean_prefix("inurl", "XFILE:");
710 	parser.add_prefix("path", "XPATH:");
711 	parser.add_boolean_prefix("lang", "L");
712 	parser.add_boolean_prefix("type", "T");
713 	parser.add_boolean_prefix("class", "XCLASS:");
714 	parser.add_boolean_prefix("label", "XLABEL:");
715 	parser.add_boolean_prefix("tokens", "XTOK:");
716 	if (g_pMapper != NULL)
717 	{
718 		map<string, string> filters;
719 
720 		g_pMapper->getBooleanFilters(filters);
721 
722 		for (map<string, string>::const_iterator filterIter = filters.begin();
723 			filterIter != filters.end(); ++filterIter)
724 		{
725 			parser.add_boolean_prefix(filterIter->first, filterIter->second);
726 		}
727 	}
728 
729 	// Date range
730 	Xapian::DateRangeProcessor dateProcessor(0);
731 	parser.add_rangeprocessor(&dateProcessor);
732 
733 	// Size with a "b" suffix, ie 1024..10240b
734 	Xapian::NumberRangeProcessor sizeProcessor(2, "b", Xapian::RP_SUFFIX);
735 	parser.add_rangeprocessor(&sizeProcessor);
736 
737 	// Time range
738 	TimeValueRangeProcessor timeProcessor(3);
739 	parser.add_rangeprocessor(&timeProcessor);
740 
741 	// What type of query is this ?
742 	QueryProperties::QueryType type = queryProps.getType();
743 	if (type != QueryProperties::XAPIAN_QP)
744 	{
745 		// This isn't supported
746 		return Xapian::Query();
747 	}
748 
749 	// Do some pre-processing : look for filters with quoted values
750 	string::size_type escapedFilterEnd = 0;
751 	string::size_type escapedFilterStart = freeQuery.find(":\"");
752 	while ((escapedFilterStart != string::npos) &&
753 		(escapedFilterStart < freeQuery.length() - 2))
754 	{
755 		escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
756 		if (escapedFilterEnd == string::npos)
757 		{
758 			break;
759 		}
760 
761 		string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
762 		if (filterValue.empty() == false)
763 		{
764 			string escapedValue(Url::escapeUrl(filterValue));
765 			bool escapeValue = false, hashValue = false;
766 
767 			// The value should be escaped and length-limited as done at indexing time
768 			checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);
769 
770 			if (escapeValue == false)
771 			{
772 				// No escaping
773 				escapedValue = filterValue;
774 			}
775 			if (hashValue == true)
776 			{
777 				// Partially hash if necessary
778 				escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
779 			}
780 			else
781 			{
782 				escapedValue = XapianDatabase::limitTermLength(escapedValue);
783 			}
784 
785 #ifdef DEBUG
786 			clog << "XapianEngine::parseQuery: escaping to " << escapedValue << endl;
787 #endif
788 			freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
789 				escapedValue);
790 			escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
791 		}
792 		else
793 		{
794 			// No value !
795 			freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
796 			escapedFilterEnd -= 2;
797 		}
798 #ifdef DEBUG
799 		clog << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
800 #endif
801 
802 		// Next
803 		escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
804 	}
805 
806 	// Parse the query string with all necessary options
807 	unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
808 		Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_PURE_NOT;
809 	if (minimal == false)
810 	{
811 		flags |= Xapian::QueryParser::FLAG_WILDCARD;
812 #if ENABLE_XAPIAN_SPELLING_CORRECTION>0
813 		flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
814 #endif
815 	}
816 	Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
817 #ifdef DEBUG
818 	clog << "XapianEngine::parseQuery: query is " << parsedQuery.get_description() << endl;
819 #endif
820 
821 	// Any limit on what documents should be searched ?
822 	if (m_limitDocuments.empty() == false)
823 	{
824 		Xapian::Query filterQuery(Xapian::Query::OP_OR,
825 			m_limitDocuments.begin(), m_limitDocuments.end());
826 
827 		parsedQuery = Xapian::Query(Xapian::Query::OP_FILTER,
828 			parsedQuery, filterQuery);
829 #ifdef DEBUG
830 		clog << "XapianEngine::parseQuery: limited query is " << parsedQuery.get_description() << endl;
831 #endif
832 	}
833 
834 	if (minimal == false)
835 	{
836 #if ENABLE_XAPIAN_SPELLING_CORRECTION>0
837 		// Any correction ?
838 		correctedFreeQuery = parser.get_corrected_query_string();
839 #ifdef DEBUG
840 		if (correctedFreeQuery.empty() == false)
841 		{
842 			clog << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
843 		}
844 #endif
845 #endif
846 	}
847 
848 	return parsedQuery;
849 }
850 
queryDatabase(Xapian::Database * pIndex,Xapian::Query & query,const string & stemLanguage,unsigned int startDoc,const QueryProperties & queryProps)851 bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query,
852 	const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps)
853 {
854 	Timer timer;
855 	unsigned int maxResultsCount = queryProps.getMaximumResultsCount();
856 	bool completedQuery = false;
857 
858 	if (pIndex == NULL)
859 	{
860 		return false;
861 	}
862 
863 	// Start an enquire session on the database
864 	Xapian::Enquire enquire(*pIndex);
865 
866 	timer.start();
867 	try
868 	{
869 		AbstractGenerator abstractGen(pIndex, 50);
870 		vector<string> seedTerms;
871 
872 		// Give the query object to the enquire session
873 		enquire.set_query(query);
874 		// How should results be sorted ?
875 		if (queryProps.getSortOrder() == QueryProperties::RELEVANCE)
876 		{
877 			// By relevance, then date
878 			enquire.set_sort_by_relevance_then_value(4, true);
879 #ifdef DEBUG
880 			clog << "XapianEngine::queryDatabase: sorting by relevance first" << endl;
881 #endif
882 		}
883 		else if (queryProps.getSortOrder() == QueryProperties::DATE_DESC)
884 		{
885 			// By date, and then by relevance
886 			enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
887 			enquire.set_sort_by_value_then_relevance(4, true);
888 #ifdef DEBUG
889 			clog << "XapianEngine::queryDatabase: sorting by date and time desc" << endl;
890 #endif
891 		}
892 		else if (queryProps.getSortOrder() == QueryProperties::DATE_ASC)
893 		{
894 			// By date, and then by relevance
895 			enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
896 			enquire.set_sort_by_value_then_relevance(5, true);
897 #ifdef DEBUG
898 			clog << "XapianEngine::queryDatabase: sorting by date and time asc" << endl;
899 #endif
900 		}
901 		else if (queryProps.getSortOrder() == QueryProperties::SIZE_DESC)
902 		{
903 			// By date, and then by relevance
904 			enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
905 			enquire.set_sort_by_value_then_relevance(2, true);
906 #ifdef DEBUG
907 			clog << "XapianEngine::queryDatabase: sorting by size asc" << endl;
908 #endif
909 		}
910 
911 		// Collapse results ?
912 		if (g_pMapper != NULL)
913 		{
914 			unsigned int valueNumber;
915 
916 			if (g_pMapper->collapseOnValue(valueNumber) == true)
917 			{
918 				enquire.set_collapse_key(valueNumber, 1);
919 			}
920 		}
921 
922 		// Get the top results of the query
923 		Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1);
924 		m_resultsCountEstimate = matches.get_matches_estimated();
925 		if (matches.empty() == false)
926 		{
927 #ifdef DEBUG
928 			clog << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount
929 				<< " results found from position " << startDoc << endl;
930 			clog << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound()
931 				<< "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound()
932 				<< ", " << matches.get_description() << endl;
933 #endif
934 
935 			// Get the results
936 			for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
937 			{
938 				Xapian::docid docId = *mIter;
939 				Xapian::Document doc(mIter.get_document());
940 
941 				// What terms did this document match ?
942 				seedTerms.clear();
943 				for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
944 					termIter != enquire.get_matching_terms_end(docId); ++termIter)
945 				{
946 					char firstChar = (*termIter)[0];
947 
948 					if (isupper(((int)firstChar)) == 0)
949 					{
950 						seedTerms.push_back(*termIter);
951 #ifdef DEBUG
952 						clog << "XapianEngine::queryDatabase: matched term " << *termIter << endl;
953 #endif
954 					}
955 					else if (firstChar == 'Z')
956 					{
957 						string stemmed((*termIter).substr(1));
958 						string::size_type stemmedLen = stemmed.length();
959 
960 						// Which of this document's terms stem to this ?
961 						Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId);
962 						if (docTermIter != pIndex->termlist_end(docId))
963 						{
964 							for (docTermIter.skip_to(stemmed);
965 								docTermIter != pIndex->termlist_end(docId); ++docTermIter)
966 							{
967 								// Is this a potential unstem ?
968 								if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0)
969 								{
970 									// No, no point looking at the next terms
971 									break;
972 								}
973 #ifdef DEBUG
974 								clog << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl;
975 #endif
976 
977 								// FIXME: check this term stems to stemmed !
978 								seedTerms.push_back(*docTermIter);
979 							}
980 						}
981 					}
982 				}
983 
984 				if (docId <= 0)
985 				{
986 #ifdef DEBUG
987 					clog << "XapianEngine::queryDatabase: bogus document ID " << docId << endl;
988 #endif
989 					continue;
990 				}
991 
992 				DocumentInfo thisResult;
993 				thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
994 				thisResult.setScore((float)mIter.get_percent());
995 
996 #ifdef DEBUG
997 				clog << "XapianEngine::queryDatabase: found document ID " << docId << endl;
998 #endif
999 				XapianDatabase::recordToProps(doc.get_data(), &thisResult);
1000 				// XapianDatabase stored the language in English
1001 				thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage()));
1002 
1003 				string url(thisResult.getLocation());
1004 				if (url.empty() == true)
1005 				{
1006 					// Hmmm this shouldn't be empty...
1007 					// Use this instead, even though the document isn't cached in the index
1008 					thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
1009 				}
1010 
1011 				// We don't know the index ID, just the document ID
1012 				thisResult.setIsIndexed(0, docId);
1013 
1014 				// Add this result
1015 				m_resultsList.push_back(thisResult);
1016 			}
1017 		}
1018 
1019 		completedQuery = true;
1020 	}
1021 	catch (const Xapian::Error &error)
1022 	{
1023 		clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1024 	}
1025 	clog << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl;
1026 
1027 	try
1028 	{
1029 		m_expandTerms.clear();
1030 
1031 		// Expand the query ?
1032 		if (m_expandDocuments.empty() == false)
1033 		{
1034 			Xapian::RSet expandDocs;
1035 
1036 			for (set<string>::const_iterator docIter = m_expandDocuments.begin();
1037 				docIter != m_expandDocuments.end(); ++docIter)
1038 			{
1039 				string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true));
1040 
1041 				// Only one document may have this term
1042 				Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm);
1043 				if (postingIter != pIndex->postlist_end(uniqueTerm))
1044 				{
1045 					expandDocs.add_document(*postingIter);
1046 				}
1047 			}
1048 #ifdef DEBUG
1049 			clog << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl;
1050 #endif
1051 
1052 			// Get 10 non-prefixed terms
1053 			string allowedPrefixes("RS");
1054 			TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer),
1055 				FileStopper::get_stopper(Languages::toCode(stemLanguage)),
1056 				allowedPrefixes, query);
1057 			Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider);
1058 #ifdef DEBUG
1059 			clog << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl;
1060 #endif
1061 			for (Xapian::ESetIterator termIter = expandTerms.begin();
1062 				termIter != expandTerms.end(); ++termIter)
1063 			{
1064 				string expandTerm(*termIter);
1065 				char firstChar = expandTerm[0];
1066 
1067 				// Is this prefixed ?
1068 				if (allowedPrefixes.find(firstChar) != string::npos)
1069 				{
1070 					expandTerm.erase(0, 1);
1071 				}
1072 
1073 				m_expandTerms.insert(expandTerm);
1074 			}
1075 		}
1076 	}
1077 	catch (const Xapian::Error &error)
1078 	{
1079 		clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1080 	}
1081 
1082 	// Be tolerant of errors as long as we got some results
1083 	if ((completedQuery == true) ||
1084 		(m_resultsList.empty() == false))
1085 	{
1086 		return true;
1087 	}
1088 
1089 	return false;
1090 }
1091 
1092 /// Frees all objects.
freeAll(void)1093 void XapianEngine::freeAll(void)
1094 {
1095 	FileStopper::free_stopper();
1096 }
1097 
1098 //
1099 // Implementation of SearchEngineInterface
1100 //
1101 
1102 /// Sets the set of documents to limit to.
setLimitSet(const set<string> & docsSet)1103 bool XapianEngine::setLimitSet(const set<string> &docsSet)
1104 {
1105 	for (set<string>::const_iterator docIter = docsSet.begin();
1106 		docIter != docsSet.end(); ++docIter)
1107 	{
1108 		string urlFilter("U");
1109 
1110 		// Escape and hash
1111 		urlFilter += XapianDatabase::limitTermLength(Url::escapeUrl(*docIter), true);
1112 		m_limitDocuments.insert(urlFilter);
1113 	}
1114 #ifdef DEBUG
1115 	clog << "XapianEngine::setLimitSet: " << m_limitDocuments.size() << " documents" << endl;
1116 #endif
1117 
1118 	return true;
1119 }
1120 
1121 /// Sets the set of documents to expand from.
setExpandSet(const set<string> & docsSet)1122 bool XapianEngine::setExpandSet(const set<string> &docsSet)
1123 {
1124 	copy(docsSet.begin(), docsSet.end(),
1125 		inserter(m_expandDocuments, m_expandDocuments.begin()));
1126 #ifdef DEBUG
1127 	clog << "XapianEngine::setExpandSet: " << m_expandDocuments.size() << " documents" << endl;
1128 #endif
1129 
1130 	return true;
1131 }
1132 
1133 /// Runs a query; true if success.
runQuery(QueryProperties & queryProps,unsigned int startDoc)1134 bool XapianEngine::runQuery(QueryProperties& queryProps,
1135 	unsigned int startDoc)
1136 {
1137 	string stemLanguage(Languages::toEnglish(queryProps.getStemmingLanguage()));
1138 
1139 	// Clear the results list
1140 	m_resultsList.clear();
1141 	m_resultsCountEstimate = 0;
1142 	m_correctedFreeQuery.clear();
1143 
1144 	if (queryProps.isEmpty() == true)
1145 	{
1146 #ifdef DEBUG
1147 		clog << "XapianEngine::runQuery: query is empty" << endl;
1148 #endif
1149 		return false;
1150 	}
1151 
1152 	XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true);
1153 	if (pDatabase == NULL)
1154 	{
1155 		clog << "Couldn't get index " << m_databaseName << endl;
1156 		return false;
1157 	}
1158 
1159 	if ((stemLanguage.empty() == false) &&
1160 		(stemLanguage != "unknown"))
1161 	{
1162 #ifdef DEBUG
1163 		clog << "XapianEngine::runQuery: " << stemLanguage << " stemming" << endl;
1164 #endif
1165 		try
1166 		{
1167 			m_stemmer = Xapian::Stem(StringManip::toLowerCase(stemLanguage));
1168 		}
1169 		catch (const Xapian::Error &error)
1170 		{
1171 			clog << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
1172 		}
1173 	}
1174 
1175 	// Get the latest revision...
1176 	pDatabase->reopen();
1177 	Xapian::Database *pIndex = pDatabase->readLock();
1178 	try
1179 	{
1180 		unsigned int searchStep = 1;
1181 
1182 		// Searches are run in this order :
1183 		// 1. no stemming, exact matches only
1184 		// 2. stem terms if a language is defined for the query
1185 		Xapian::Query fullQuery = parseQuery(pIndex, queryProps, "",
1186 			m_defaultOperator, m_correctedFreeQuery);
1187 		while (fullQuery.empty() == false)
1188 		{
1189 			// Query the database
1190 			if (queryDatabase(pIndex, fullQuery, stemLanguage, startDoc, queryProps) == false)
1191 			{
1192 				break;
1193 			}
1194 
1195 			if (m_resultsList.empty() == true)
1196 			{
1197 				// The search did succeed but didn't return anything
1198 				if ((searchStep == 1) &&
1199 					(stemLanguage.empty() == false))
1200 				{
1201 #ifdef DEBUG
1202 					clog << "XapianEngine::runQuery: trying again with stemming" << endl;
1203 #endif
1204 					fullQuery = parseQuery(pIndex, queryProps, stemLanguage,
1205 						m_defaultOperator, m_correctedFreeQuery);
1206 					++searchStep;
1207 					continue;
1208 				}
1209 			}
1210 			else
1211 			{
1212 				// We have results, don't bother about correcting the query
1213 				m_correctedFreeQuery.clear();
1214 			}
1215 
1216 			pDatabase->unlock();
1217 			return true;
1218 		}
1219 	}
1220 	catch (const Xapian::Error &error)
1221 	{
1222 		clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1223 	}
1224 	pDatabase->unlock();
1225 
1226 	return false;
1227 }
1228