1 /*
2 * Copyright 2005-2021 Fabrice Colin
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #include <unistd.h>
22 #include <time.h>
23 #include <string>
24 #include <cstring>
25 #include <vector>
26 #include <iostream>
27 #include <fstream>
28 #include <algorithm>
29
30 #include "config.h"
31 #include "Languages.h"
32 #include "StringManip.h"
33 #include "TimeConverter.h"
34 #include "Timer.h"
35 #include "Url.h"
36 #include "CJKVTokenizer.h"
37 #include "FieldMapperInterface.h"
38 #include "XapianDatabaseFactory.h"
39 #include "AbstractGenerator.h"
40 #include "XapianEngine.h"
41
42 using std::string;
43 using std::multimap;
44 using std::vector;
45 using std::clog;
46 using std::clog;
47 using std::endl;
48 using std::inserter;
49 using std::getline;
50 using std::ifstream;
51 using namespace Dijon;
52
53 extern FieldMapperInterface *g_pMapper;
54
checkFilter(const string & freeQuery,string::size_type filterValueStart,bool & escapeValue,bool & hashValue)55 static void checkFilter(const string &freeQuery, string::size_type filterValueStart,
56 bool &escapeValue, bool &hashValue)
57 {
58 string filterName;
59 string::size_type filterNameStart = freeQuery.rfind(' ', filterValueStart);
60
61 escapeValue = hashValue = false;
62
63 if (filterNameStart == string::npos)
64 {
65 filterName = freeQuery.substr(0, filterValueStart);
66 }
67 else
68 {
69 filterName = freeQuery.substr(filterNameStart + 1, filterValueStart - filterNameStart - 1);
70 }
71 #ifdef DEBUG
72 clog << "checkFilter: filter " << filterName << endl;
73 #endif
74
75 // In XapianIndex, these are escaped and hashed
76 if ((filterName == "file") ||
77 (filterName =="dir") ||
78 (filterName == "url") ||
79 (filterName == "path"))
80 {
81 escapeValue = hashValue = true;
82 }
83 // except label which is only escaped
84 else if (filterName == "label")
85 {
86 escapeValue = true;
87 }
88 else if (g_pMapper != NULL)
89 {
90 escapeValue = g_pMapper->isEscaped(filterName);
91 }
92 }
93
94 class TimeValueRangeProcessor : public Xapian::RangeProcessor
95 {
96 public:
TimeValueRangeProcessor(Xapian::valueno valueNumber)97 TimeValueRangeProcessor(Xapian::valueno valueNumber) :
98 Xapian::RangeProcessor(),
99 m_valueNumber(valueNumber)
100 {
101 }
~TimeValueRangeProcessor()102 ~TimeValueRangeProcessor()
103 {
104 }
105
operator ()(const std::string & begin,const std::string & end)106 virtual Xapian::Query operator()(const std::string &begin, const std::string &end)
107 {
108 if ((begin.size() == 6) &&
109 (end.size() == 6))
110 {
111 // HHMMSS
112 #ifdef DEBUG
113 clog << "TimeValueRangeProcessor::operator: accepting " << begin << ".." << end << endl;
114 #endif
115
116 return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
117 m_valueNumber,
118 begin,end);
119 }
120 if ((begin.size() == 8) && (end.size() == 8) &&
121 (begin[2] == begin[5]) && (end[2] == end[5]) && (begin[2] == end[2]) &&
122 (end[4] == ':'))
123 {
124 std::string lower(begin), upper(end);
125
126 // HH:MM:SS
127 lower.erase(2, 1);
128 lower.erase(5, 1);
129 upper.erase(2, 1);
130 upper.erase(5, 1);
131 #ifdef DEBUG
132 clog << "TimeValueRangeProcessor::operator: accepting " << lower << ".." << upper << endl;
133 #endif
134
135 return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
136 m_valueNumber,
137 lower, upper);
138 }
139 #ifdef DEBUG
140 clog << "TimeValueRangeProcessor::operator: rejecting " << begin << ".." << end << endl;
141 #endif
142
143 return Xapian::Query(Xapian::Query::OP_INVALID);
144 }
145
146 protected:
147 Xapian::valueno m_valueNumber;
148
149 };
150
151 class TermDecider : public Xapian::ExpandDecider
152 {
153 public:
TermDecider(Xapian::Database * pIndex,Xapian::Stem * pStemmer,Xapian::Stopper * pStopper,const string & allowedPrefixes,Xapian::Query & query)154 TermDecider(Xapian::Database *pIndex,
155 Xapian::Stem *pStemmer,
156 Xapian::Stopper *pStopper,
157 const string &allowedPrefixes,
158 Xapian::Query &query) :
159 Xapian::ExpandDecider(),
160 m_pIndex(pIndex),
161 m_pStemmer(pStemmer),
162 m_pStopper(pStopper),
163 m_allowedPrefixes(allowedPrefixes),
164 m_pTermsToAvoid(NULL)
165 {
166 m_pTermsToAvoid = new set<string>();
167
168 for (Xapian::TermIterator termIter = query.get_terms_begin();
169 termIter != query.get_terms_end(); ++termIter)
170 {
171 string term(*termIter);
172
173 if (isupper((int)(term[0])) == 0)
174 {
175 m_pTermsToAvoid->insert(term);
176 if (m_pStemmer != NULL)
177 {
178 string stem((*m_pStemmer)(term));
179 m_pTermsToAvoid->insert(stem);
180 }
181 }
182 else if (term[0] == 'Z')
183 {
184 m_pTermsToAvoid->insert(term.substr(1));
185 }
186 }
187 #ifdef DEBUG
188 clog << "TermDecider: avoiding " << m_pTermsToAvoid->size() << " terms" << endl;
189 #endif
190 }
~TermDecider()191 ~TermDecider()
192 {
193 if (m_pTermsToAvoid != NULL)
194 {
195 delete m_pTermsToAvoid;
196 }
197 }
198
operator ()(const std::string & term) const199 virtual bool operator()(const std::string &term) const
200 {
201 CJKVTokenizer tokenizer;
202 bool isPrefixed = false;
203
204 // Reject short terms
205 if ((tokenizer.has_cjkv(term) == false) &&
206 (term.length() < 3))
207 {
208 return false;
209 }
210
211 // Reject terms with prefixes we don't want
212 if (isupper((int)(term[0])) != 0)
213 {
214 isPrefixed = true;
215
216 if (m_allowedPrefixes.find(term[0]) == string::npos)
217 {
218 return false;
219 }
220 }
221
222 // Reject terms with spaces
223 if (term.find_first_of(" \t\r\n") != string::npos)
224 {
225 return false;
226 }
227
228 // Reject terms that occur only once
229 if ((m_pIndex != NULL) &&
230 (m_pIndex->get_termfreq(term) <= 1))
231 {
232 return false;
233 }
234
235 // Reject stop words
236 if ((m_pStopper != NULL) &&
237 ((*m_pStopper)(term) == true))
238 {
239 return false;
240 }
241
242 // Stop here if there's no specific terms to avoid
243 if (m_pTermsToAvoid->empty() == true)
244 {
245 return true;
246 }
247
248 // Reject query terms
249 if (m_pTermsToAvoid->find(term) != m_pTermsToAvoid->end())
250 {
251 return false;
252 }
253
254 // Stop here is there's no stemmer
255 if (m_pStemmer == NULL)
256 {
257 return true;
258 }
259
260 // Reject terms that stem to the same as query terms
261 // or previously validated terms
262 string stem;
263 if (isPrefixed == true)
264 {
265 stem = (*m_pStemmer)(term.substr(1));
266 }
267 else
268 {
269 stem = (*m_pStemmer)(term);
270 }
271 if (m_pTermsToAvoid->find(stem) != m_pTermsToAvoid->end())
272 {
273 return false;
274 }
275 m_pTermsToAvoid->insert(stem);
276
277 return true;
278 }
279
280 protected:
281 Xapian::Database *m_pIndex;
282 Xapian::Stem *m_pStemmer;
283 Xapian::Stopper *m_pStopper;
284 string m_allowedPrefixes;
285 set<string> *m_pTermsToAvoid;
286
287 };
288
289 class FileStopper : public Xapian::SimpleStopper
290 {
291 public:
FileStopper(const string & languageCode)292 FileStopper(const string &languageCode) :
293 Xapian::SimpleStopper(),
294 m_languageCode(languageCode),
295 m_stopwordsCount(0)
296 {
297 if (languageCode.empty() == false)
298 {
299 ifstream inputFile;
300 string fileName(PREFIX);
301
302 fileName += "/share/pinot/stopwords/stopwords.";
303 fileName += languageCode;
304 inputFile.open(fileName.c_str());
305 if (inputFile.good() == true)
306 {
307 string line;
308
309 // Each line is a stopword
310 while (getline(inputFile, line).eof() == false)
311 {
312 add(line);
313 ++m_stopwordsCount;
314 }
315 }
316 inputFile.close();
317
318 #ifdef DEBUG
319 clog << "FileStopper: " << m_stopwordsCount << " stopwords for language code " << languageCode << endl;
320 #endif
321 }
322 }
~FileStopper()323 virtual ~FileStopper()
324 {
325 }
326
get_stopwords_count(void) const327 unsigned int get_stopwords_count(void) const
328 {
329 return m_stopwordsCount;
330 }
331
get_stopper(const string & languageCode)332 static FileStopper *get_stopper(const string &languageCode)
333 {
334 if (m_pStopper == NULL)
335 {
336 m_pStopper = new FileStopper(languageCode);
337 }
338 else if (m_pStopper->m_languageCode != languageCode)
339 {
340 delete m_pStopper;
341
342 m_pStopper = new FileStopper(languageCode);
343 }
344
345 return m_pStopper;
346 }
347
free_stopper(void)348 static void free_stopper(void)
349 {
350 if (m_pStopper != NULL)
351 {
352 delete m_pStopper;
353 m_pStopper = NULL;
354 }
355 }
356
357 protected:
358 string m_languageCode;
359 unsigned int m_stopwordsCount;
360 static FileStopper *m_pStopper;
361
362 };
363
364 FileStopper *FileStopper::m_pStopper = NULL;
365
366 class QueryModifier : public Dijon::CJKVTokenizer::TokensHandler
367 {
368 public:
369 typedef enum { NONE = 0, BRACKETS } CJKVWrap;
370
QueryModifier(const string & query,bool diacriticSensitive,unsigned int nGramSize)371 QueryModifier(const string &query,
372 bool diacriticSensitive, unsigned int nGramSize) :
373 m_query(query),
374 m_diacriticSensitive(diacriticSensitive),
375 m_pos(0),
376 m_wrap(BRACKETS),
377 m_wrapped(false),
378 m_nGramCount(0),
379 m_nGramSize(nGramSize),
380 m_tokensCount(0),
381 m_hasCJKV(false),
382 m_hasNonCJKV(false)
383 {
384 }
385
~QueryModifier()386 virtual ~QueryModifier()
387 {
388 }
389
handle_token(const string & tok,bool is_cjkv)390 virtual bool handle_token(const string &tok, bool is_cjkv)
391 {
392 if (tok.empty() == true)
393 {
394 return false;
395 }
396 #ifdef DEBUG
397 clog << "QueryModifier::handle_token: " << tok << endl;
398 #endif
399
400 // Where is this token in the original query ?
401 string::size_type tokPos = m_query.find(tok, m_pos);
402 ++m_tokensCount;
403
404 // Is this CJKV ?
405 if (is_cjkv == false)
406 {
407 char lastChar = tok[tok.length() - 1];
408
409 if (tokPos == string::npos)
410 {
411 // This should have been found
412 return false;
413 }
414
415 if (m_nGramCount > 0)
416 {
417 wrapClose();
418
419 m_nGramCount = 0;
420 m_pos = tokPos;
421 }
422
423 m_currentFilter.clear();
424 if (lastChar == '"')
425 {
426 // It's a quoted string
427 m_wrap = NONE;
428 }
429 else if (lastChar == ':')
430 {
431 // It's a filter
432 m_wrap = NONE;
433 m_currentFilter = tok;
434 }
435 else
436 {
437 m_wrap = BRACKETS;
438 }
439
440 if (m_currentFilter.empty() == true)
441 {
442 m_hasNonCJKV = true;
443 }
444
445 if (m_diacriticSensitive == false)
446 {
447 // Strip accents and other diacritics from terms
448 string unaccentedTok(Dijon::CJKVTokenizer::strip_marks(tok));
449 if (tok != unaccentedTok)
450 {
451 #ifdef DEBUG
452 clog << "QueryModifier::handle_token: " << tok << " stripped to " << unaccentedTok << endl;
453 #endif
454 m_query.replace(tokPos, tok.length(), unaccentedTok);
455 }
456 }
457
458 // Return right away
459 return true;
460 }
461
462 // First n-gram ?
463 if (m_nGramCount == 0)
464 {
465 if (tokPos == string::npos)
466 {
467 // That's definitely not right
468 return false;
469 }
470
471 // Append non-CJKV text that precedes and start wrapping CJKV tokens
472 if (tokPos > m_pos)
473 {
474 m_modifiedQuery += " " + m_query.substr(m_pos, tokPos - m_pos);
475 }
476 m_pos += tok.length();
477
478 wrapOpen();
479 }
480 else
481 {
482 m_modifiedQuery += " ";
483 if (m_currentFilter.empty() == false)
484 {
485 m_modifiedQuery += m_currentFilter;
486 }
487 }
488 m_modifiedQuery += tok;
489 #ifdef DEBUG
490 clog << "QueryModifier::handle_token: " << m_modifiedQuery << endl;
491 #endif
492
493 if (tokPos != string::npos)
494 {
495 m_pos = tokPos + tok.length();
496 }
497 ++m_nGramCount;
498 m_hasCJKV = true;
499
500 return true;
501 }
502
get_tokens_count(void) const503 unsigned int get_tokens_count(void) const
504 {
505 return m_tokensCount;
506 }
507
get_modified_query(bool & pureCJKV)508 string get_modified_query(bool &pureCJKV)
509 {
510 #ifdef DEBUG
511 clog << "QueryModifier::get_modified_query: " << m_pos << "/" << m_query.length() << endl;
512 #endif
513
514 // Anything left ?
515 if (m_pos < m_query.length() - 1)
516 {
517 m_modifiedQuery += " " + m_query.substr(m_pos);
518 }
519 wrapClose();
520 #ifdef DEBUG
521 clog << "QueryModifier::get_modified_query: " << m_modifiedQuery << endl;
522 #endif
523
524 if ((m_hasCJKV == true) &&
525 (m_hasNonCJKV == false))
526 {
527 pureCJKV = true;
528 }
529 else
530 {
531 pureCJKV = false;
532 }
533
534 return m_modifiedQuery;
535 }
536
537 protected:
538 string m_query;
539 bool m_diacriticSensitive;
540 string m_modifiedQuery;
541 string::size_type m_pos;
542 CJKVWrap m_wrap;
543 bool m_wrapped;
544 string m_currentFilter;
545 unsigned int m_nGramCount;
546 unsigned int m_nGramSize;
547 unsigned int m_tokensCount;
548 bool m_hasCJKV;
549 bool m_hasNonCJKV;
550
wrapOpen(void)551 void wrapOpen(void)
552 {
553 switch (m_wrap)
554 {
555 case BRACKETS:
556 m_modifiedQuery += " (";
557 break;
558 case NONE:
559 default:
560 break;
561 }
562 m_wrapped = true;
563 }
564
wrapClose(void)565 void wrapClose(void)
566 {
567 if (m_wrapped == false)
568 {
569 return;
570 }
571
572 // Finish wrapping CJKV tokens
573 switch (m_wrap)
574 {
575 case BRACKETS:
576 m_modifiedQuery += ')';
577 break;
578 case NONE:
579 default:
580 break;
581 }
582 m_wrapped = false;
583 }
584
585 };
586
XapianEngine(const string & database)587 XapianEngine::XapianEngine(const string &database) :
588 SearchEngineInterface()
589 {
590 // We expect documents to have been converted to UTF-8 at indexing time
591 m_charset = "UTF-8";
592
593 // If the database name ends with a slash, remove it
594 if (database[database.length() - 1] == '/')
595 {
596 m_databaseName = database.substr(0, database.length() - 1);
597 }
598 else
599 {
600 m_databaseName = database;
601 }
602 }
603
~XapianEngine()604 XapianEngine::~XapianEngine()
605 {
606 }
607
parseQuery(Xapian::Database * pIndex,const QueryProperties & queryProps,const string & stemLanguage,DefaultOperator defaultOperator,string & correctedFreeQuery,bool minimal)608 Xapian::Query XapianEngine::parseQuery(Xapian::Database *pIndex, const QueryProperties &queryProps,
609 const string &stemLanguage, DefaultOperator defaultOperator,
610 string &correctedFreeQuery, bool minimal)
611 {
612 Xapian::QueryParser parser;
613 CJKVTokenizer tokenizer;
614 string freeQuery(queryProps.getFreeQuery());
615 unsigned int tokensCount = 1;
616 bool diacriticSensitive = queryProps.getDiacriticSensitive();
617
618 // Modifying the query is necessary if it's CJKV or diacritics are off
619 if ((tokenizer.has_cjkv(freeQuery) == true) ||
620 (diacriticSensitive == false))
621 {
622 QueryModifier handler(freeQuery,
623 diacriticSensitive,
624 tokenizer.get_ngram_size());
625
626 tokenizer.tokenize(freeQuery, handler, true);
627
628 tokensCount = handler.get_tokens_count();
629
630 // We can disable stemming and spelling correction for pure CJKV queries
631 string cjkvQuery(handler.get_modified_query(minimal));
632 #ifdef DEBUG
633 clog << "XapianEngine::parseQuery: CJKV query is " << cjkvQuery << endl;
634 #endif
635
636 // Do as if the user had given this as input
637 freeQuery = cjkvQuery;
638 }
639 else
640 {
641 string::size_type spacePos = freeQuery.find(' ');
642 while (spacePos != string::npos)
643 {
644 ++tokensCount;
645
646 if (spacePos + 1 >= freeQuery.length())
647 {
648 break;
649 }
650
651 // Next
652 spacePos = freeQuery.find(' ', spacePos + 1);
653 }
654 }
655 #ifdef DEBUG
656 clog << "XapianEngine::parseQuery: " << tokensCount << " tokens" << endl;
657 #endif
658
659 if (pIndex != NULL)
660 {
661 // The database is required for wildcards and spelling
662 parser.set_database(*pIndex);
663 }
664
665 // Set things up
666 if ((minimal == false) &&
667 (stemLanguage.empty() == false))
668 {
669 parser.set_stemmer(m_stemmer);
670 parser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
671
672 // Don't bother loading the stopwords list if there's only one token
673 if (tokensCount > 1)
674 {
675 FileStopper *pStopper = FileStopper::get_stopper(Languages::toCode(stemLanguage));
676 if ((pStopper != NULL) &&
677 (pStopper->get_stopwords_count() > 0))
678 {
679 parser.set_stopper(pStopper);
680 }
681 }
682 }
683 else
684 {
685 #ifdef DEBUG
686 clog << "XapianEngine::parseQuery: no stemming" << endl;
687 #endif
688 parser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
689 }
690 // What's the default operator ?
691 if (defaultOperator == DEFAULT_OP_AND)
692 {
693 parser.set_default_op(Xapian::Query::OP_AND);
694 }
695 else
696 {
697 parser.set_default_op(Xapian::Query::OP_OR);
698 }
699 // Search across text body and title
700 parser.add_prefix("", "");
701 parser.add_prefix("", "S");
702 // X prefixes should always include a colon
703 parser.add_boolean_prefix("site", "H");
704 parser.add_boolean_prefix("file", "P");
705 parser.add_boolean_prefix("ext", "E");
706 parser.add_prefix("title", "S");
707 parser.add_boolean_prefix("url", "U");
708 parser.add_boolean_prefix("dir", "XDIR:");
709 parser.add_boolean_prefix("inurl", "XFILE:");
710 parser.add_prefix("path", "XPATH:");
711 parser.add_boolean_prefix("lang", "L");
712 parser.add_boolean_prefix("type", "T");
713 parser.add_boolean_prefix("class", "XCLASS:");
714 parser.add_boolean_prefix("label", "XLABEL:");
715 parser.add_boolean_prefix("tokens", "XTOK:");
716 if (g_pMapper != NULL)
717 {
718 map<string, string> filters;
719
720 g_pMapper->getBooleanFilters(filters);
721
722 for (map<string, string>::const_iterator filterIter = filters.begin();
723 filterIter != filters.end(); ++filterIter)
724 {
725 parser.add_boolean_prefix(filterIter->first, filterIter->second);
726 }
727 }
728
729 // Date range
730 Xapian::DateRangeProcessor dateProcessor(0);
731 parser.add_rangeprocessor(&dateProcessor);
732
733 // Size with a "b" suffix, ie 1024..10240b
734 Xapian::NumberRangeProcessor sizeProcessor(2, "b", Xapian::RP_SUFFIX);
735 parser.add_rangeprocessor(&sizeProcessor);
736
737 // Time range
738 TimeValueRangeProcessor timeProcessor(3);
739 parser.add_rangeprocessor(&timeProcessor);
740
741 // What type of query is this ?
742 QueryProperties::QueryType type = queryProps.getType();
743 if (type != QueryProperties::XAPIAN_QP)
744 {
745 // This isn't supported
746 return Xapian::Query();
747 }
748
749 // Do some pre-processing : look for filters with quoted values
750 string::size_type escapedFilterEnd = 0;
751 string::size_type escapedFilterStart = freeQuery.find(":\"");
752 while ((escapedFilterStart != string::npos) &&
753 (escapedFilterStart < freeQuery.length() - 2))
754 {
755 escapedFilterEnd = freeQuery.find("\"", escapedFilterStart + 2);
756 if (escapedFilterEnd == string::npos)
757 {
758 break;
759 }
760
761 string filterValue = freeQuery.substr(escapedFilterStart + 2, escapedFilterEnd - escapedFilterStart - 2);
762 if (filterValue.empty() == false)
763 {
764 string escapedValue(Url::escapeUrl(filterValue));
765 bool escapeValue = false, hashValue = false;
766
767 // The value should be escaped and length-limited as done at indexing time
768 checkFilter(freeQuery, escapedFilterStart, escapeValue, hashValue);
769
770 if (escapeValue == false)
771 {
772 // No escaping
773 escapedValue = filterValue;
774 }
775 if (hashValue == true)
776 {
777 // Partially hash if necessary
778 escapedValue = XapianDatabase::limitTermLength(escapedValue, true);
779 }
780 else
781 {
782 escapedValue = XapianDatabase::limitTermLength(escapedValue);
783 }
784
785 #ifdef DEBUG
786 clog << "XapianEngine::parseQuery: escaping to " << escapedValue << endl;
787 #endif
788 freeQuery.replace(escapedFilterStart + 1, escapedFilterEnd - escapedFilterStart,
789 escapedValue);
790 escapedFilterEnd = escapedFilterEnd + escapedValue.length() - filterValue.length();
791 }
792 else
793 {
794 // No value !
795 freeQuery.replace(escapedFilterStart, escapedFilterEnd - escapedFilterStart + 1, ":");
796 escapedFilterEnd -= 2;
797 }
798 #ifdef DEBUG
799 clog << "XapianEngine::parseQuery: replaced filter: " << freeQuery << endl;
800 #endif
801
802 // Next
803 escapedFilterStart = freeQuery.find(":\"", escapedFilterEnd);
804 }
805
806 // Parse the query string with all necessary options
807 unsigned int flags = Xapian::QueryParser::FLAG_BOOLEAN|Xapian::QueryParser::FLAG_PHRASE|
808 Xapian::QueryParser::FLAG_LOVEHATE|Xapian::QueryParser::FLAG_PURE_NOT;
809 if (minimal == false)
810 {
811 flags |= Xapian::QueryParser::FLAG_WILDCARD;
812 #if ENABLE_XAPIAN_SPELLING_CORRECTION>0
813 flags |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
814 #endif
815 }
816 Xapian::Query parsedQuery = parser.parse_query(freeQuery, flags);
817 #ifdef DEBUG
818 clog << "XapianEngine::parseQuery: query is " << parsedQuery.get_description() << endl;
819 #endif
820
821 // Any limit on what documents should be searched ?
822 if (m_limitDocuments.empty() == false)
823 {
824 Xapian::Query filterQuery(Xapian::Query::OP_OR,
825 m_limitDocuments.begin(), m_limitDocuments.end());
826
827 parsedQuery = Xapian::Query(Xapian::Query::OP_FILTER,
828 parsedQuery, filterQuery);
829 #ifdef DEBUG
830 clog << "XapianEngine::parseQuery: limited query is " << parsedQuery.get_description() << endl;
831 #endif
832 }
833
834 if (minimal == false)
835 {
836 #if ENABLE_XAPIAN_SPELLING_CORRECTION>0
837 // Any correction ?
838 correctedFreeQuery = parser.get_corrected_query_string();
839 #ifdef DEBUG
840 if (correctedFreeQuery.empty() == false)
841 {
842 clog << "XapianEngine::parseQuery: corrected spelling to: " << correctedFreeQuery << endl;
843 }
844 #endif
845 #endif
846 }
847
848 return parsedQuery;
849 }
850
queryDatabase(Xapian::Database * pIndex,Xapian::Query & query,const string & stemLanguage,unsigned int startDoc,const QueryProperties & queryProps)851 bool XapianEngine::queryDatabase(Xapian::Database *pIndex, Xapian::Query &query,
852 const string &stemLanguage, unsigned int startDoc, const QueryProperties &queryProps)
853 {
854 Timer timer;
855 unsigned int maxResultsCount = queryProps.getMaximumResultsCount();
856 bool completedQuery = false;
857
858 if (pIndex == NULL)
859 {
860 return false;
861 }
862
863 // Start an enquire session on the database
864 Xapian::Enquire enquire(*pIndex);
865
866 timer.start();
867 try
868 {
869 AbstractGenerator abstractGen(pIndex, 50);
870 vector<string> seedTerms;
871
872 // Give the query object to the enquire session
873 enquire.set_query(query);
874 // How should results be sorted ?
875 if (queryProps.getSortOrder() == QueryProperties::RELEVANCE)
876 {
877 // By relevance, then date
878 enquire.set_sort_by_relevance_then_value(4, true);
879 #ifdef DEBUG
880 clog << "XapianEngine::queryDatabase: sorting by relevance first" << endl;
881 #endif
882 }
883 else if (queryProps.getSortOrder() == QueryProperties::DATE_DESC)
884 {
885 // By date, and then by relevance
886 enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
887 enquire.set_sort_by_value_then_relevance(4, true);
888 #ifdef DEBUG
889 clog << "XapianEngine::queryDatabase: sorting by date and time desc" << endl;
890 #endif
891 }
892 else if (queryProps.getSortOrder() == QueryProperties::DATE_ASC)
893 {
894 // By date, and then by relevance
895 enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
896 enquire.set_sort_by_value_then_relevance(5, true);
897 #ifdef DEBUG
898 clog << "XapianEngine::queryDatabase: sorting by date and time asc" << endl;
899 #endif
900 }
901 else if (queryProps.getSortOrder() == QueryProperties::SIZE_DESC)
902 {
903 // By date, and then by relevance
904 enquire.set_docid_order(Xapian::Enquire::DONT_CARE);
905 enquire.set_sort_by_value_then_relevance(2, true);
906 #ifdef DEBUG
907 clog << "XapianEngine::queryDatabase: sorting by size asc" << endl;
908 #endif
909 }
910
911 // Collapse results ?
912 if (g_pMapper != NULL)
913 {
914 unsigned int valueNumber;
915
916 if (g_pMapper->collapseOnValue(valueNumber) == true)
917 {
918 enquire.set_collapse_key(valueNumber, 1);
919 }
920 }
921
922 // Get the top results of the query
923 Xapian::MSet matches = enquire.get_mset(startDoc, maxResultsCount, (2 * maxResultsCount) + 1);
924 m_resultsCountEstimate = matches.get_matches_estimated();
925 if (matches.empty() == false)
926 {
927 #ifdef DEBUG
928 clog << "XapianEngine::queryDatabase: found " << matches.size() << "/" << maxResultsCount
929 << " results found from position " << startDoc << endl;
930 clog << "XapianEngine::queryDatabase: estimated " << matches.get_matches_lower_bound()
931 << "/" << m_resultsCountEstimate << "/" << matches.get_matches_upper_bound()
932 << ", " << matches.get_description() << endl;
933 #endif
934
935 // Get the results
936 for (Xapian::MSetIterator mIter = matches.begin(); mIter != matches.end(); ++mIter)
937 {
938 Xapian::docid docId = *mIter;
939 Xapian::Document doc(mIter.get_document());
940
941 // What terms did this document match ?
942 seedTerms.clear();
943 for (Xapian::TermIterator termIter = enquire.get_matching_terms_begin(docId);
944 termIter != enquire.get_matching_terms_end(docId); ++termIter)
945 {
946 char firstChar = (*termIter)[0];
947
948 if (isupper(((int)firstChar)) == 0)
949 {
950 seedTerms.push_back(*termIter);
951 #ifdef DEBUG
952 clog << "XapianEngine::queryDatabase: matched term " << *termIter << endl;
953 #endif
954 }
955 else if (firstChar == 'Z')
956 {
957 string stemmed((*termIter).substr(1));
958 string::size_type stemmedLen = stemmed.length();
959
960 // Which of this document's terms stem to this ?
961 Xapian::TermIterator docTermIter = pIndex->termlist_begin(docId);
962 if (docTermIter != pIndex->termlist_end(docId))
963 {
964 for (docTermIter.skip_to(stemmed);
965 docTermIter != pIndex->termlist_end(docId); ++docTermIter)
966 {
967 // Is this a potential unstem ?
968 if (strncasecmp((*docTermIter).c_str(), stemmed.c_str(), stemmedLen) != 0)
969 {
970 // No, no point looking at the next terms
971 break;
972 }
973 #ifdef DEBUG
974 clog << "XapianEngine::queryDatabase: matched unstem " << *docTermIter << endl;
975 #endif
976
977 // FIXME: check this term stems to stemmed !
978 seedTerms.push_back(*docTermIter);
979 }
980 }
981 }
982 }
983
984 if (docId <= 0)
985 {
986 #ifdef DEBUG
987 clog << "XapianEngine::queryDatabase: bogus document ID " << docId << endl;
988 #endif
989 continue;
990 }
991
992 DocumentInfo thisResult;
993 thisResult.setExtract(abstractGen.generateAbstract(docId, seedTerms));
994 thisResult.setScore((float)mIter.get_percent());
995
996 #ifdef DEBUG
997 clog << "XapianEngine::queryDatabase: found document ID " << docId << endl;
998 #endif
999 XapianDatabase::recordToProps(doc.get_data(), &thisResult);
1000 // XapianDatabase stored the language in English
1001 thisResult.setLanguage(Languages::toLocale(thisResult.getLanguage()));
1002
1003 string url(thisResult.getLocation());
1004 if (url.empty() == true)
1005 {
1006 // Hmmm this shouldn't be empty...
1007 // Use this instead, even though the document isn't cached in the index
1008 thisResult.setLocation(XapianDatabase::buildUrl(m_databaseName, docId));
1009 }
1010
1011 // We don't know the index ID, just the document ID
1012 thisResult.setIsIndexed(0, docId);
1013
1014 // Add this result
1015 m_resultsList.push_back(thisResult);
1016 }
1017 }
1018
1019 completedQuery = true;
1020 }
1021 catch (const Xapian::Error &error)
1022 {
1023 clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1024 }
1025 clog << "Ran query \"" << queryProps.getFreeQuery() << "\" in " << timer.stop() << " ms" << endl;
1026
1027 try
1028 {
1029 m_expandTerms.clear();
1030
1031 // Expand the query ?
1032 if (m_expandDocuments.empty() == false)
1033 {
1034 Xapian::RSet expandDocs;
1035
1036 for (set<string>::const_iterator docIter = m_expandDocuments.begin();
1037 docIter != m_expandDocuments.end(); ++docIter)
1038 {
1039 string uniqueTerm(string("U") + XapianDatabase::limitTermLength(Url::escapeUrl(Url::canonicalizeUrl(*docIter)), true));
1040
1041 // Only one document may have this term
1042 Xapian::PostingIterator postingIter = pIndex->postlist_begin(uniqueTerm);
1043 if (postingIter != pIndex->postlist_end(uniqueTerm))
1044 {
1045 expandDocs.add_document(*postingIter);
1046 }
1047 }
1048 #ifdef DEBUG
1049 clog << "XapianEngine::queryDatabase: expand from " << expandDocs.size() << " documents" << endl;
1050 #endif
1051
1052 // Get 10 non-prefixed terms
1053 string allowedPrefixes("RS");
1054 TermDecider expandDecider(pIndex, ((stemLanguage.empty() == true) ? NULL : &m_stemmer),
1055 FileStopper::get_stopper(Languages::toCode(stemLanguage)),
1056 allowedPrefixes, query);
1057 Xapian::ESet expandTerms = enquire.get_eset(10, expandDocs, &expandDecider);
1058 #ifdef DEBUG
1059 clog << "XapianEngine::queryDatabase: " << expandTerms.size() << " expand terms" << endl;
1060 #endif
1061 for (Xapian::ESetIterator termIter = expandTerms.begin();
1062 termIter != expandTerms.end(); ++termIter)
1063 {
1064 string expandTerm(*termIter);
1065 char firstChar = expandTerm[0];
1066
1067 // Is this prefixed ?
1068 if (allowedPrefixes.find(firstChar) != string::npos)
1069 {
1070 expandTerm.erase(0, 1);
1071 }
1072
1073 m_expandTerms.insert(expandTerm);
1074 }
1075 }
1076 }
1077 catch (const Xapian::Error &error)
1078 {
1079 clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1080 }
1081
1082 // Be tolerant of errors as long as we got some results
1083 if ((completedQuery == true) ||
1084 (m_resultsList.empty() == false))
1085 {
1086 return true;
1087 }
1088
1089 return false;
1090 }
1091
1092 /// Frees all objects.
freeAll(void)1093 void XapianEngine::freeAll(void)
1094 {
1095 FileStopper::free_stopper();
1096 }
1097
1098 //
1099 // Implementation of SearchEngineInterface
1100 //
1101
1102 /// Sets the set of documents to limit to.
setLimitSet(const set<string> & docsSet)1103 bool XapianEngine::setLimitSet(const set<string> &docsSet)
1104 {
1105 for (set<string>::const_iterator docIter = docsSet.begin();
1106 docIter != docsSet.end(); ++docIter)
1107 {
1108 string urlFilter("U");
1109
1110 // Escape and hash
1111 urlFilter += XapianDatabase::limitTermLength(Url::escapeUrl(*docIter), true);
1112 m_limitDocuments.insert(urlFilter);
1113 }
1114 #ifdef DEBUG
1115 clog << "XapianEngine::setLimitSet: " << m_limitDocuments.size() << " documents" << endl;
1116 #endif
1117
1118 return true;
1119 }
1120
1121 /// Sets the set of documents to expand from.
setExpandSet(const set<string> & docsSet)1122 bool XapianEngine::setExpandSet(const set<string> &docsSet)
1123 {
1124 copy(docsSet.begin(), docsSet.end(),
1125 inserter(m_expandDocuments, m_expandDocuments.begin()));
1126 #ifdef DEBUG
1127 clog << "XapianEngine::setExpandSet: " << m_expandDocuments.size() << " documents" << endl;
1128 #endif
1129
1130 return true;
1131 }
1132
1133 /// Runs a query; true if success.
runQuery(QueryProperties & queryProps,unsigned int startDoc)1134 bool XapianEngine::runQuery(QueryProperties& queryProps,
1135 unsigned int startDoc)
1136 {
1137 string stemLanguage(Languages::toEnglish(queryProps.getStemmingLanguage()));
1138
1139 // Clear the results list
1140 m_resultsList.clear();
1141 m_resultsCountEstimate = 0;
1142 m_correctedFreeQuery.clear();
1143
1144 if (queryProps.isEmpty() == true)
1145 {
1146 #ifdef DEBUG
1147 clog << "XapianEngine::runQuery: query is empty" << endl;
1148 #endif
1149 return false;
1150 }
1151
1152 XapianDatabase *pDatabase = XapianDatabaseFactory::getDatabase(m_databaseName, true);
1153 if (pDatabase == NULL)
1154 {
1155 clog << "Couldn't get index " << m_databaseName << endl;
1156 return false;
1157 }
1158
1159 if ((stemLanguage.empty() == false) &&
1160 (stemLanguage != "unknown"))
1161 {
1162 #ifdef DEBUG
1163 clog << "XapianEngine::runQuery: " << stemLanguage << " stemming" << endl;
1164 #endif
1165 try
1166 {
1167 m_stemmer = Xapian::Stem(StringManip::toLowerCase(stemLanguage));
1168 }
1169 catch (const Xapian::Error &error)
1170 {
1171 clog << "Couldn't create stemmer: " << error.get_type() << ": " << error.get_msg() << endl;
1172 }
1173 }
1174
1175 // Get the latest revision...
1176 pDatabase->reopen();
1177 Xapian::Database *pIndex = pDatabase->readLock();
1178 try
1179 {
1180 unsigned int searchStep = 1;
1181
1182 // Searches are run in this order :
1183 // 1. no stemming, exact matches only
1184 // 2. stem terms if a language is defined for the query
1185 Xapian::Query fullQuery = parseQuery(pIndex, queryProps, "",
1186 m_defaultOperator, m_correctedFreeQuery);
1187 while (fullQuery.empty() == false)
1188 {
1189 // Query the database
1190 if (queryDatabase(pIndex, fullQuery, stemLanguage, startDoc, queryProps) == false)
1191 {
1192 break;
1193 }
1194
1195 if (m_resultsList.empty() == true)
1196 {
1197 // The search did succeed but didn't return anything
1198 if ((searchStep == 1) &&
1199 (stemLanguage.empty() == false))
1200 {
1201 #ifdef DEBUG
1202 clog << "XapianEngine::runQuery: trying again with stemming" << endl;
1203 #endif
1204 fullQuery = parseQuery(pIndex, queryProps, stemLanguage,
1205 m_defaultOperator, m_correctedFreeQuery);
1206 ++searchStep;
1207 continue;
1208 }
1209 }
1210 else
1211 {
1212 // We have results, don't bother about correcting the query
1213 m_correctedFreeQuery.clear();
1214 }
1215
1216 pDatabase->unlock();
1217 return true;
1218 }
1219 }
1220 catch (const Xapian::Error &error)
1221 {
1222 clog << "Couldn't run query: " << error.get_type() << ": " << error.get_msg() << endl;
1223 }
1224 pDatabase->unlock();
1225
1226 return false;
1227 }
1228