1 #include <errno.h>
2 #include <config.h>
3 #include <string.h>
4 #include <sys/types.h>
5 #include <syslog.h>
6 
7 #include <fstream>
8 #include <sstream>
9 #include <algorithm>
10 #include <memory>
11 
12 extern "C" {
13 #include <assert.h>
14 #include "libconfig.h"
15 #include "util.h"
16 #include "search_engines.h"
17 #include "search_part.h"
18 #include "xmalloc.h"
19 #include "xapian_wrap.h"
20 #include "charset.h"
21 #include "ptrarray.h"
22 #include "parseaddr.h"
23 
24 
25 /* generated headers are not necessarily in current directory */
26 #include "imap/imap_err.h"
27 };
28 
29 #include <unicode/unistr.h>
30 #include <unicode/locid.h>
31 
32 #include <xapian.h>
33 
34 #ifdef HAVE_CLD2
35 #include <cld2/public/compact_lang_det.h>
36 #endif
37 
38 // from global.h
39 extern int charset_flags;
40 
41 #define SLOT_CYRUSID        0
42 #define SLOT_DOCLANGS       1
43 #define SLOT_INDEXLEVEL     2
44 #define SLOT_INDEXVERSION   3
45 
46 static const unsigned XAPIAN_MAX_TERM_LENGTH = 200; /* in UTF-8 bytes */
47 
48 /* ====================================================================== */
49 
make_cyrusid(struct buf * dst,const struct message_guid * guid,char doctype)50 static void make_cyrusid(struct buf *dst, const struct message_guid *guid, char doctype)
51 {
52     buf_reset(dst);
53     buf_putc(dst, '*');
54     buf_putc(dst, doctype);
55     buf_putc(dst, '*');
56     buf_appendcstr(dst, message_guid_encode(guid));
57     buf_cstring(dst);
58 }
59 
60 /* ====================================================================== */
61 
62 /*
63  * A brief history of Xapian db versions:
64  * Version 0: uses STEM_ALL for all terms, term prefixes don't start with 'X'
65  * Version 1: term prefixes start with 'X'
66  * Version 2: uses STEM_SOME for some terms
67  * Version 3: removes all use of STEM_ALL
68  * Version 4: indexes headers and bodies in separate documents
69  * Version 5: indexes headers and bodies together and stems by language
70  * Version 6: stores all detected languages of a document in slot SLOT_DOCLANGS (deprecated)
71  * Version 7: indexes new DELIVEREDTO search part
72  * Version 8: reintroduces language indexing for non-English text
73  * Version 9: introduces index levels as keys to cyrusid metadata
74  * Version 10: indexes new PRIORITY search part
75  * Version 11: indexes LIST-ID as single value
76  * Version 12: indexes email domains as single values. Supports subdomain search.
77  * Version 13: indexes content-type and subtype separately
78  * Version 14: adds SLOT_INDEXVERSION to documents
79  * Version 15: receives indexed header fields and text in original format (rather than search form)
80  * Version 16: indexes entire addr-spec as a single value.  Prevents cross-matching localparts and domains
81  */
82 #define XAPIAN_DB_CURRENT_VERSION 16
83 #define XAPIAN_DB_MIN_SUPPORTED_VERSION 5
84 
read_db_versions(const Xapian::Database & database)85 static std::set<int> read_db_versions(const Xapian::Database &database)
86 {
87     std::set<int> versions;
88 
89     // db_version is a comma-separated list of version numbers
90     std::string val = database.get_metadata("cyrus.db_version");
91     if (!val.empty()) {
92         strarray_t *vstr = strarray_split(val.c_str(), ",", 0);
93         for (int i = 0; i < strarray_size(vstr); i++) {
94             int version = std::atoi(strarray_nth(vstr, i));
95             if (version) versions.insert(version);
96         }
97         strarray_free(vstr);
98     }
99     // Up to version 3 this was named stem version.
100     val = database.get_metadata("cyrus.stem-version");
101     if (!val.empty()) {
102         versions.insert(std::stoi(val));
103     }
104 
105     return versions;
106 }
107 
write_db_versions(Xapian::WritableDatabase & database,std::set<int> & versions)108 static void write_db_versions(Xapian::WritableDatabase &database, std::set<int> &versions)
109 {
110     std::ostringstream val;
111     for (std::set<int>::iterator it = versions.begin(); it != versions.end(); ++it) {
112         if (it != versions.begin()) val << ",";
113         val << *it;
114     }
115     database.set_metadata("cyrus.db_version", val.str());
116     database.set_metadata("cyrus.stem-version", "");
117 }
118 
119 /* ====================================================================== */
120 #define XAPIAN_LANG_COUNT_KEYPREFIX "lang.count"
121 #define XAPIAN_LANG_DOC_KEYPREFIX "lang.doc"
122 
/*
 * Build the upper-cased term prefix for a language term: prefix,
 * followed by "XI", followed by the language code iso_lang.
 */
static std::string lang_prefix(const std::string& iso_lang, const char *prefix)
{
    std::string ustr = std::string(prefix) + "XI" + iso_lang;
    // toupper() is undefined for negative values other than EOF, so each
    // byte is passed as unsigned char.
    std::transform(ustr.begin(), ustr.end(), ustr.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    return ustr;
}
129 
lang_doc_key(const char * cyrusid)130 static std::string lang_doc_key(const char *cyrusid)
131 {
132     std::string key(XAPIAN_LANG_DOC_KEYPREFIX ".");
133     key += cyrusid;
134     return key;
135 }
136 
lang_count_key(const std::string & iso_lang)137 static std::string lang_count_key(const std::string& iso_lang)
138 {
139     std::string key(XAPIAN_LANG_COUNT_KEYPREFIX ".");
140     key += iso_lang;
141     return key;
142 }
143 
calculate_language_counts(const Xapian::Database & db,std::map<const std::string,unsigned> & lang_counts)144 static int calculate_language_counts(const Xapian::Database& db,
145                                      std::map<const std::string, unsigned>& lang_counts)
146 {
147     std::set<int> db_versions = read_db_versions(db);
148 
149     if (db_versions.lower_bound(8) == db_versions.begin()) {
150         // count all indexed body parts
151         size_t nparts = 0;
152         for (Xapian::TermIterator it = db.metadata_keys_begin("cyrusid.*P*");
153                 it != db.metadata_keys_end("cyrusid.*P*"); ++it) {
154             nparts++;
155         }
156         // count body parts with language metadata
157         const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX ".*P*"};
158         size_t nlangparts = 0;
159         for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
160                 it != db.metadata_keys_end(prefix); ++it) {
161             lang_counts[db.get_metadata(*it)] += 1;
162             nlangparts++;
163         }
164         // English or unknown language body parts have no metadata.
165         lang_counts["en"] += nparts - nlangparts;
166         // Sanity check data
167         if (nparts < nlangparts) {
168             return IMAP_IOERROR;
169         }
170     }
171 
172     return 0;
173 }
174 
remove_legacy_metadata(Xapian::WritableDatabase & db)175 static void remove_legacy_metadata(Xapian::WritableDatabase& db)
176 {
177     const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX "."};
178     for (Xapian::TermIterator key = db.metadata_keys_begin(prefix);
179             key != db.metadata_keys_end(prefix); ++key) {
180 
181         const std::string& val = db.get_metadata(*key);
182         // Remove legacy keys and values.
183         if ((*key).find('.') != std::string::npos ||
184             (!val.empty() && !isalpha(val[0]))) {
185             db.set_metadata(*key, "");
186         }
187     }
188     for (Xapian::docid docid = 1; docid <= db.get_lastdocid(); ++docid) {
189         try {
190             Xapian::Document doc = db.get_document(docid);
191             const std::string& val = doc.get_value(SLOT_DOCLANGS);
192             // Remove legacy doclang slot values.
193             if (!val.empty() && !isalpha(val[0])) {
194                 doc.remove_value(SLOT_DOCLANGS);
195             }
196         }
197         catch (Xapian::DocNotFoundError e) {
198             // ignore
199         }
200     }
201 }
202 
write_language_counts(Xapian::WritableDatabase & db,const std::map<const std::string,unsigned> & lang_counts)203 static void write_language_counts(Xapian::WritableDatabase& db,
204                                   const std::map<const std::string, unsigned>& lang_counts)
205 {
206     for (Xapian::TermIterator it = db.metadata_keys_begin(XAPIAN_LANG_COUNT_KEYPREFIX);
207             it != db.metadata_keys_end(XAPIAN_LANG_COUNT_KEYPREFIX); ++it) {
208         db.set_metadata(*it, "");
209     }
210     for (const std::pair<std::string, unsigned>& it : lang_counts) {
211         db.set_metadata(lang_count_key(it.first), std::to_string(it.second));
212     }
213 }
214 
read_language_counts(const Xapian::Database & db,std::map<const std::string,unsigned> & lang_counts)215 static void read_language_counts(const Xapian::Database& db,
216                                  std::map<const std::string, unsigned>& lang_counts)
217 {
218     std::set<int> db_versions = read_db_versions(db);
219 
220     if (db_versions.lower_bound(8) == db_versions.begin()) {
221         const std::string prefix(XAPIAN_LANG_COUNT_KEYPREFIX ".");
222         for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
223                 it != db.metadata_keys_end(prefix); ++it) {
224             std::string iso_lang = (*it).substr(prefix.length());
225             unsigned count = std::stol(db.get_metadata(*it));
226             lang_counts[iso_lang] += count;
227         }
228     }
229 }
230 
/*
 * Split the comma-separated language list val and insert each item
 * into doclangs. Values that are empty or do not start with a letter
 * are ignored.
 */
static void parse_doclangs(const std::string& val, std::set<std::string>& doclangs)
{
    if (val.empty()) return;
    if (!isalpha(val[0])) return;

    std::string::size_type start = 0;
    for (;;) {
        const std::string::size_type comma = val.find(',', start);
        if (comma == std::string::npos) {
            // last (or only) item
            doclangs.insert(val.substr(start));
            return;
        }
        doclangs.insert(val.substr(start, comma - start));
        start = comma + 1;
    }
}
242 
/* Join the languages in doclangs into a comma-separated list, in the
 * iteration order of the set. */
static std::string format_doclangs(const std::set<std::string>& doclangs)
{
    std::ostringstream joined;
    const char *separator = "";
    for (const std::string& lang : doclangs) {
        joined << separator << lang;
        separator = ",";
    }
    return joined.str();
}
253 
/*
 * Validate and normalize a language code.
 *
 * Returns str in lower case if it consists of exactly two or three
 * letters (the syntax of ISO 639 codes), otherwise the empty string.
 * A NULL str also yields the empty string.
 */
static std::string parse_langcode(const char *str)
{
    if (!str) return std::string();

    std::string lstr(str);
    // accept syntax for two and three letter ISO 639 codes
    if (lstr.length() < 2 || lstr.length() > 3) {
        return std::string();
    }
    for (char& c : lstr) {
        // The ctype functions are undefined for negative char values,
        // so go through unsigned char.
        const unsigned char uc = static_cast<unsigned char>(c);
        if (!isalpha(uc)) return std::string();
        c = static_cast<char>(::tolower(uc));
    }
    return lstr;
}
265 
266 // Process-scoped, thread-unsafe cache of stoppers by ISO 639 code.
267 static std::map<const std::string, std::unique_ptr<Xapian::Stopper>> stoppers;
268 
get_stopper(const std::string & iso)269 static const Xapian::Stopper* get_stopper(const std::string& iso)
270 {
271     // Lookup cached entry.
272     try {
273         return stoppers.at(iso).get();
274     } catch (const std::out_of_range&) {};
275 
276     // Lookup language name by ISO code.
277     icu::Locale loc(iso.c_str());
278     if (loc.isBogus()) return NULL;
279 
280     // Read stopper file and add to cache.
281     const char *swpath = config_getstring(IMAPOPT_SEARCH_STOPWORD_PATH);
282     if (!swpath) return NULL;
283 
284     std::string lang_name;
285     icu::UnicodeString ulang_name;
286     loc.getDisplayLanguage(icu::Locale("en"), ulang_name);
287     ulang_name.toLower();
288     ulang_name.toUTF8String(lang_name);
289 
290     // Open stopword file
291     // XXX doesn't play nice with WIN32 paths
292     std::string fname(std::string(swpath) + "/" + lang_name + ".txt");
293     errno = 0;
294     std::ifstream inFile (fname);
295     if (inFile.fail()) {
296         syslog(LOG_DEBUG, "Xapian: could not open stopword file %s: %s",
297                 fname.c_str(), errno ? strerror(errno) : "unknown error");
298         return NULL;
299     }
300 
301     // Create and store the Xapian stopper
302     stoppers[iso].reset(new Xapian::SimpleStopper(
303                 std::istream_iterator<std::string>(inFile),
304                 std::istream_iterator<std::string>()));
305     return stoppers[iso].get();
306 }
307 
308 class CyrusSearchStemmer : public Xapian::StemImplementation
309 {
310     charset_t utf8 {charset_lookupname("utf-8")};
311     std::map<const std::string, std::string> cache;
312     Xapian::Stem stem {"en"};
313 
314     public:
~CyrusSearchStemmer()315     virtual ~CyrusSearchStemmer() { charset_free(&utf8); }
316 
operator ()(const std::string & word)317     virtual std::string operator() (const std::string &word) override {
318         // Is this word already in the cache?
319         try {
320             return cache.at(word);
321         } catch (const std::out_of_range&) {}
322 
323         // Convert the word to search form
324         std::unique_ptr<char, decltype(std::free)*>
325             q {charset_convert(word.c_str(), utf8, charset_flags), std::free};
326         std::string s = q ? stem(Xapian::Unicode::tolower(q.get())) : stem(word);
327         if (s.size() > XAPIAN_MAX_TERM_LENGTH) return std::string{};
328 
329         // Store the normalized word in the cache
330         return cache[word] = s;
331     }
332 
get_description() const333     virtual std::string get_description () const override {
334         return "Cyrus";
335     }
336 };
337 
338 
339 class FrenchContractionStemmer : public Xapian::StemImplementation
340 {
341     Xapian::Stem stem {"fr"};
342 
343     public:
344 
operator ()(const std::string & word)345     virtual std::string operator() (const std::string &word) override {
346 
347         size_t pos = 0;
348         switch (word[0]) {
349             case 'q':
350                 if (word.length() <= 3 || word[1] != 'u') {
351                     break;
352                 }
353                 pos++;
354                 // fall through
355             case 'c':
356             case 'd':
357             case 'j':
358             case 'l':
359             case 'm':
360             case 'n':
361             case 's':
362             case 't':
363                 // APOSTROPHE (U+0027)
364                 if (word.length() > pos + 2 && word[pos+1] == 0x27) {
365                     return stem(word.substr(pos + 2));
366                 }
367                 // RIGHT SINGLE QUOTATION MARK (U+2019)
368                 // FULLWIDTH APOSTROPHE (U+FF07)
369                 else if (!word.compare(pos + 1, 3, "\xe2\x80\x99") ||
370                          !word.compare(pos + 1, 3, "\xef\xbc\x87")) {
371                     return stem(word.substr(pos + 4));
372                 }
373                 // fall through
374         }
375         // not a contraction
376         return stem(word);
377     }
378 
get_description() const379     virtual std::string get_description () const override {
380         return "fr-contraction";
381     }
382 };
383 
get_stemmer(const std::string & iso_lang)384 static Xapian::Stem get_stemmer(const std::string& iso_lang)
385 {
386     return iso_lang == "fr" ?
387         Xapian::Stem{new FrenchContractionStemmer} :
388         Xapian::Stem{iso_lang};
389 }
390 
391 #ifdef HAVE_CLD2
detect_language(const struct buf * part)392 static std::string detect_language(const struct buf *part)
393 {
394     std::string iso_lang;
395     bool reliable = false;
396     CLD2::Language lang = CLD2::DetectLanguage(part->s, part->len, 1, &reliable);
397 
398     if (reliable && lang != CLD2::UNKNOWN_LANGUAGE) {
399         std::string code(CLD2::LanguageCode(lang));
400         std::transform(code.begin(), code.end(), code.begin(), ::tolower);
401         // Map CLD2 special codes to ISO 639.
402         if (!code.compare("zh-Hant")) {
403             code = "zh";
404         }
405         else if (!code.compare("sr-ME" )) {
406             code = "sr"; // not a political statement!
407         }
408         else if (!code.compare("xxx")) {
409             code = "";
410         }
411         iso_lang = parse_langcode(code.c_str());
412     }
413 
414     return iso_lang;
415 }
416 #endif /* HAVE_CLD2 */
417 
418 /* ====================================================================== */
419 
better_indexlevel(uint8_t levela,uint8_t levelb)420 static uint8_t better_indexlevel(uint8_t levela, uint8_t levelb)
421 {
422     uint8_t a = levela & ~SEARCH_INDEXLEVEL_PARTIAL;
423     uint8_t b = levelb & ~SEARCH_INDEXLEVEL_PARTIAL;
424     if (a > b) return levela;
425     if (a < b) return levelb;
426     return (levela & SEARCH_INDEXLEVEL_PARTIAL) ? levelb : levela;
427 }
428 
parse_indexlevel(const std::string & s)429 static uint8_t parse_indexlevel(const std::string& s)
430 {
431     uint8_t level = 0;
432     if (hex_to_bin(s.c_str(), s.length(), &level) != 1) {
433         return 0;
434     }
435     return level;
436 }
437 
format_indexlevel(uint8_t level)438 static std::string format_indexlevel(uint8_t level)
439 {
440     char hex[4];
441     bin_to_lchex(&level, 1, hex);
442     return std::string(hex, 2);
443 }
444 
445 /* ====================================================================== */
446 
447 class CyrusMetadataCompactor : public Xapian::Compactor
448 {
449     public:
450 
CyrusMetadataCompactor()451         CyrusMetadataCompactor() { }
452 
resolve_duplicate_metadata(const std::string & key,size_t num_tags,const std::string tags[])453         std::string resolve_duplicate_metadata(const std::string &key,
454                                                size_t num_tags,
455                                                const std::string tags[])
456         {
457             if (key.rfind("cyrusid.", 0) == 0) {
458                 uint8_t indexlevel = parse_indexlevel(tags[0]);
459                 size_t bestpos = 0;
460                 for (size_t i = 1; i < num_tags; i++) {
461                     uint8_t level = parse_indexlevel(tags[i]);
462                     if (better_indexlevel(indexlevel, level) == level) {
463                         indexlevel = level;
464                         bestpos = i;
465                     }
466                 }
467                 return tags[bestpos];
468             }
469 
470             return tags[0];
471         }
472 };
473 
474 
xapian_compact_dbs(const char * dest,const char ** sources)475 EXPORTED int xapian_compact_dbs(const char *dest, const char **sources)
476 {
477     int r = 0;
478     Xapian::Database db;
479     const char *thispath = "(unknown path)";
480 
481     try {
482         std::set<int> db_versions;
483         std::map<const std::string, unsigned> lang_counts;
484         std::vector<Xapian::Database> subdbs;
485 
486         while (*sources) {
487             thispath = *sources;
488             Xapian::Database subdb(*sources++);
489             db.add_database(subdb);
490             subdbs.push_back(subdb);
491 
492             // Aggregate db versions.
493             bool need_metadata = false;
494             for (Xapian::docid docid = 1; docid <= subdb.get_lastdocid(); ++docid) {
495                 try {
496                     Xapian::Document doc = subdb.get_document(docid);
497                     const std::string& val = doc.get_value(SLOT_INDEXVERSION);
498                     if (!val.empty()) {
499                         int version = std::atoi(val.c_str());
500                         if (version) db_versions.insert(version);
501                     }
502                     else need_metadata = true;
503                 }
504                 catch (Xapian::DocNotFoundError e) {
505                     // ignore
506                 }
507             }
508             if (need_metadata) {
509                 /* At least one document didn't have its index version set.
510                  * Read the legacy version from the metadata. */
511                 std::set<int> md_versions = read_db_versions(subdb);
512                 db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
513             }
514 
515             // Aggregate language counts.
516             r = calculate_language_counts(subdb, lang_counts);
517             if (r) {
518                 xsyslog(LOG_ERR, "IOERROR: corrupt language metadata",
519                                  "path=<%s>", thispath);
520                 return r;
521             }
522         }
523         thispath = "(unknown path)";
524 
525         // Compact database.
526         static CyrusMetadataCompactor comp;
527         // FULLER because we never write to compression targets again.
528         db.compact(dest, Xapian::Compactor::FULLER | Xapian::DBCOMPACT_MULTIPASS, 0, comp);
529 
530         Xapian::WritableDatabase newdb(dest);
531         write_db_versions(newdb, db_versions);
532 
533         // Clean metadata.
534         remove_legacy_metadata(newdb);
535 
536         // Reset language counts.
537         write_language_counts(newdb, lang_counts);
538     }
539     catch (const Xapian::Error &err) {
540         xsyslog(LOG_ERR, "IOERROR: caught exception",
541                          "exception=<%s> path=<%s>",
542                          err.get_description().c_str(), thispath);
543         r = IMAP_IOERROR;
544     }
545 
546     return r;
547 }
548 
549 /* ====================================================================== */
550 
get_term_prefix(int db_version,int partnum)551 static const char *get_term_prefix(int db_version, int partnum)
552 {
553     /*
554      * We use term prefixes to store terms per search part.
555      * In addition, each Xapian document contains a "XE"
556      * prefix to indicate its document type, listed in
557      * the XAPIAN_WRAP_DOCTYPE definitions. The "XE" prefix
558      * MUST not be used for any search part.
559      *
560      */
561     static const char * const term_prefixes[SEARCH_NUM_PARTS] = {
562         NULL,                /* ANY */
563         "XF",                /* FROM */
564         "XT",                /* TO */
565         "XC",                /* CC */
566         "XB",                /* BCC */
567         "XS",                /* SUBJECT */
568         "XL",                /* LISTID */
569         "XY",                /* TYPE */
570         "XH",                /* HEADERS */
571         "",                  /* BODY */
572         "XO",                /* LOCATION */
573         "XA",                /* ATTACHMENTNAME */
574         "XAB",               /* ATTACHMENTBODY */
575         "XDT",               /* DELIVEREDTO */
576         "XI",                /* LANGUAGE */
577         "XP"                 /* PRIORITY */
578     };
579 
580     static const char * const term_prefixes_v0[SEARCH_NUM_PARTS] = {
581         NULL,               /* ANY */
582         "F",                /* FROM */
583         "T",                /* TO */
584         "C",                /* CC */
585         "B",                /* BCC */
586         "S",                /* SUBJECT */
587         "L",                /* LISTID */
588         "Y",                /* TYPE */
589         "H",                /* HEADERS */
590         "D",                /* BODY */
591         "O",                /* LOCATION */
592         "A",                /* ATTACHMENTNAME */
593         "AB",               /* ATTACHMENTBODY */
594         "E",                /* DELIVEREDTO */
595         NULL,               /* LANGUAGE */
596         NULL                /* PRIORITY */
597     };
598 
599     return db_version > 0 ? term_prefixes[partnum] : term_prefixes_v0[partnum];
600 }
601 
get_stem_strategy(int db_version,int partnum)602 static Xapian::TermGenerator::stem_strategy get_stem_strategy(int db_version, int partnum)
603 {
604     static Xapian::TermGenerator::stem_strategy stem_strategy[SEARCH_NUM_PARTS] = {
605         // Version 2 and higher
606         Xapian::TermGenerator::STEM_NONE,  /* ANY */
607         Xapian::TermGenerator::STEM_NONE,  /* FROM */
608         Xapian::TermGenerator::STEM_NONE,  /* TO */
609         Xapian::TermGenerator::STEM_NONE,  /* CC */
610         Xapian::TermGenerator::STEM_NONE,  /* BCC */
611         Xapian::TermGenerator::STEM_SOME,  /* SUBJECT */
612         Xapian::TermGenerator::STEM_NONE,  /* LISTID */
613         Xapian::TermGenerator::STEM_NONE,  /* TYPE */
614         Xapian::TermGenerator::STEM_NONE,  /* HEADERS */
615         Xapian::TermGenerator::STEM_SOME,  /* BODY */
616         Xapian::TermGenerator::STEM_SOME,  /* LOCATION */
617         Xapian::TermGenerator::STEM_NONE,  /* ATTACHMENTNAME */
618         Xapian::TermGenerator::STEM_SOME,  /* ATTACHMENTBODY */
619         Xapian::TermGenerator::STEM_NONE,  /* DELIVEREDTO */
620         Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
621         Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
622     };
623 
624     static Xapian::TermGenerator::stem_strategy stem_strategy_v1[SEARCH_NUM_PARTS] = {
625         // Version 1: Stem bodies using STEM_SOME with stopwords
626         Xapian::TermGenerator::STEM_NONE,  /* ANY */
627         Xapian::TermGenerator::STEM_ALL,   /* FROM */
628         Xapian::TermGenerator::STEM_ALL,   /* TO */
629         Xapian::TermGenerator::STEM_ALL,   /* CC */
630         Xapian::TermGenerator::STEM_ALL,   /* BCC */
631         Xapian::TermGenerator::STEM_ALL,   /* SUBJECT */
632         Xapian::TermGenerator::STEM_ALL,   /* LISTID */
633         Xapian::TermGenerator::STEM_ALL,   /* TYPE */
634         Xapian::TermGenerator::STEM_ALL,   /* HEADERS */
635         Xapian::TermGenerator::STEM_SOME,  /* BODY */
636         Xapian::TermGenerator::STEM_SOME,  /* LOCATION */
637         Xapian::TermGenerator::STEM_NONE,  /* ATTACHMENTNAME */
638         Xapian::TermGenerator::STEM_SOME,  /* ATTACHMENTBODY */
639         Xapian::TermGenerator::STEM_ALL,   /* DELIVEREDTO */
640         Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
641         Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
642     };
643 
644     static Xapian::TermGenerator::stem_strategy stem_strategy_v0[SEARCH_NUM_PARTS] = {
645         // Version 0: Initial version
646         Xapian::TermGenerator::STEM_NONE,  /* ANY */
647         Xapian::TermGenerator::STEM_ALL,   /* FROM */
648         Xapian::TermGenerator::STEM_ALL,   /* TO */
649         Xapian::TermGenerator::STEM_ALL,   /* CC */
650         Xapian::TermGenerator::STEM_ALL,   /* BCC */
651         Xapian::TermGenerator::STEM_ALL,   /* SUBJECT */
652         Xapian::TermGenerator::STEM_ALL,   /* LISTID */
653         Xapian::TermGenerator::STEM_ALL,   /* TYPE */
654         Xapian::TermGenerator::STEM_ALL,   /* HEADERS */
655         Xapian::TermGenerator::STEM_ALL,   /* BODY */
656         Xapian::TermGenerator::STEM_ALL,   /* LOCATION */
657         Xapian::TermGenerator::STEM_ALL,   /* ATTACHMENTNAME */
658         Xapian::TermGenerator::STEM_ALL,   /* ATTACHMENTBODY */
659         Xapian::TermGenerator::STEM_ALL,   /* DELIVEREDTO */
660         Xapian::TermGenerator::STEM_NONE,  /* LANGUAGE */
661         Xapian::TermGenerator::STEM_NONE   /* PRIORITY */
662     };
663 
664     switch (db_version) {
665         case 0:
666             return stem_strategy_v0[partnum];
667         case 1:
668             return stem_strategy_v1[partnum];
669         default:
670             return stem_strategy[partnum];
671     }
672 }
673 
674 /* For all db paths in sources that are not using the latest database
675  * version or not readable, report their paths in toreindex */
xapian_check_if_needs_reindex(const strarray_t * sources,strarray_t * toreindex,int always_upgrade)676 EXPORTED void xapian_check_if_needs_reindex(const strarray_t *sources,
677                                             strarray_t *toreindex,
678                                             int always_upgrade)
679 {
680     // Check the version of all dbs in sources
681     for (int i = 0; i < sources->count; i++) {
682         const char *thispath = strarray_nth(sources, i);
683         try {
684             for (const int& it: read_db_versions(Xapian::Database{thispath})) {
685                 if (it < XAPIAN_DB_MIN_SUPPORTED_VERSION ||
686                         (always_upgrade && (it != XAPIAN_DB_CURRENT_VERSION))) {
687                     strarray_add(toreindex, thispath);
688                 }
689             }
690         }
691         catch (const Xapian::Error &err) {
692             strarray_add(toreindex, thispath);
693         }
694     }
695 }
696 
697 /* ====================================================================== */
698 
add_boolean_nterm(Xapian::Document & doc,const std::string & term,size_t n=XAPIAN_MAX_TERM_LENGTH)699 static inline void add_boolean_nterm(Xapian::Document& doc,
700                                      const std::string& term,
701                                      size_t n = XAPIAN_MAX_TERM_LENGTH)
702 {
703     if (term.size() && term.size() < n) {
704         doc.add_boolean_term(term);
705     }
706 }
707 
/* State of a writable Xapian database handle. Allocated zero-filled by
 * xapian_dbw_open() and released by xapian_dbw_close(). */
struct xapian_dbw
{
    // Database context.
    Xapian::WritableDatabase *database;     // the database being written to
    ptrarray_t otherdbs;                    // read-only Xapian::Database pointers
    Xapian::TermGenerator *term_generator;
    Xapian::Stem *default_stemmer;          // wraps a CyrusSearchStemmer
    const Xapian::Stopper* default_stopper; // result of get_stopper("en"), not deleted on close
    // Document context.
    Xapian::Document *document;
    char doctype;
    char *cyrusid;                          // released with free() on close
    std::set<std::string> *doclangs;
    std::vector<std::string> *subjects;
};
723 
724 
xapian_dbw_init(xapian_dbw_t * dbw)725 static int xapian_dbw_init(xapian_dbw_t *dbw)
726 {
727     dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
728     dbw->default_stopper = get_stopper("en");
729     dbw->term_generator = new Xapian::TermGenerator;
730     dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
731     /* Always enable CJK word tokenization */
732 #ifdef USE_XAPIAN_CJK_WORDS
733     dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
734             ~Xapian::TermGenerator::FLAG_CJK_WORDS);
735 #else
736     dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
737             ~Xapian::TermGenerator::FLAG_CJK_NGRAM);
738 #endif
739     dbw->doclangs = new std::set<std::string>;
740     dbw->subjects = new std::vector<std::string>;
741     return 0;
742 }
743 
xapian_dbw_open(const char ** paths,xapian_dbw_t ** dbwp,int mode,int nosync)744 EXPORTED int xapian_dbw_open(const char **paths, xapian_dbw_t **dbwp,
745                              int mode, int nosync)
746 {
747     xapian_dbw_t *dbw = (xapian_dbw_t *)xzmalloc(sizeof(xapian_dbw_t));
748     int r = 0;
749     const char *thispath = *paths++;
750 
751     std::set<int> db_versions;
752     try {
753         int flags = Xapian::DB_BACKEND_GLASS|Xapian::DB_RETRY_LOCK;
754         if (nosync) flags |= Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
755         try {
756             dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_OPEN};
757             db_versions = read_db_versions(*dbw->database);
758         } catch (Xapian::DatabaseOpeningError &e) {
759             /* It's OK not to atomically create or open, since we can assume
760              * the xapianactive file items to be locked. */
761             dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_CREATE};
762         }
763         if (db_versions.find(XAPIAN_DB_CURRENT_VERSION) == db_versions.end()) {
764             // Always index using latest database version.
765             db_versions.insert(XAPIAN_DB_CURRENT_VERSION);
766             write_db_versions(*dbw->database, db_versions);
767         }
768 
769         r = xapian_dbw_init(dbw);
770 
771     }
772     catch (const Xapian::DatabaseLockError &err) {
773         /* somebody else is already indexing this user.  They may be doing a different
774          * mailbox, so we need to re-insert this mailbox into the queue! */
775         r = IMAP_MAILBOX_LOCKED;
776     }
777     catch (const Xapian::Error &err) {
778         xsyslog(LOG_ERR, "IOERROR: caught exception",
779                          "exception=<%s> path=<%s>",
780                          err.get_description().c_str(), thispath);
781         r = IMAP_IOERROR;
782     }
783 
784     if (r) {
785         xapian_dbw_close(dbw);
786         return r;
787     }
788 
789     /* open the read-only databases */
790     if (mode == XAPIAN_DBW_XAPINDEXED) {
791         while (*paths) {
792             try {
793                 thispath = *paths;
794                 ptrarray_append(&dbw->otherdbs, new Xapian::Database{*paths++});
795             }
796             catch (const Xapian::Error &err) {
797                 xsyslog(LOG_ERR, "IOERROR: reading database",
798                                  "exception=<%s> path=<%s>",
799                                  err.get_description().c_str(), thispath);
800             }
801         }
802     }
803 
804     *dbwp = dbw;
805 
806     return 0;
807 }
808 
xapian_dbw_close(xapian_dbw_t * dbw)809 EXPORTED void xapian_dbw_close(xapian_dbw_t *dbw)
810 {
811     if (!dbw) return;
812     try {
813         delete dbw->database;
814         delete dbw->term_generator;
815         delete dbw->document;
816         delete dbw->default_stemmer;
817         delete dbw->doclangs;
818         delete dbw->subjects;
819         for (int i = 0; i < dbw->otherdbs.count; i++) {
820             delete (Xapian::Database *)ptrarray_nth(&dbw->otherdbs, i);
821         }
822         ptrarray_fini(&dbw->otherdbs);
823         free(dbw->cyrusid);
824         free(dbw);
825     }
826     catch (const Xapian::Error &err) {
827         xsyslog(LOG_ERR, "IOERROR: caught exception",
828                          "exception=<%s>",
829                          err.get_description().c_str());
830     }
831 }
832 
xapian_dbw_begin_txn(xapian_dbw_t * dbw)833 EXPORTED int xapian_dbw_begin_txn(xapian_dbw_t *dbw)
834 {
835     int r = 0;
836     try {
837         dbw->database->begin_transaction();
838     }
839     catch (const Xapian::Error &err) {
840         xsyslog(LOG_ERR, "IOERROR: caught exception",
841                          "exception=<%s>",
842                          err.get_description().c_str());
843         r = IMAP_IOERROR;
844     }
845     return r;
846 }
847 
xapian_dbw_commit_txn(xapian_dbw_t * dbw)848 EXPORTED int xapian_dbw_commit_txn(xapian_dbw_t *dbw)
849 {
850     int r = 0;
851     try {
852         dbw->database->commit_transaction();
853     }
854     catch (const Xapian::Error &err) {
855         xsyslog(LOG_ERR, "IOERROR: caught exception",
856                          "exception=<%s>",
857                          err.get_description().c_str());
858         r = IMAP_IOERROR;
859     }
860     return r;
861 }
862 
xapian_dbw_cancel_txn(xapian_dbw_t * dbw)863 EXPORTED int xapian_dbw_cancel_txn(xapian_dbw_t *dbw)
864 {
865     int r = 0;
866     try {
867         dbw->database->cancel_transaction();
868     }
869     catch (const Xapian::Error &err) {
870         xsyslog(LOG_ERR, "IOERROR: caught exception",
871                          "exception=<%s>",
872                          err.get_description().c_str());
873         r = IMAP_IOERROR;
874     }
875     return r;
876 }
877 
xapian_dbw_begin_doc(xapian_dbw_t * dbw,const struct message_guid * guid,char doctype)878 EXPORTED int xapian_dbw_begin_doc(xapian_dbw_t *dbw,
879                                   const struct message_guid *guid,
880                                   char doctype)
881 {
882     int r = 0;
883 
884     try {
885         delete dbw->document;
886         dbw->document = new Xapian::Document;
887         dbw->doctype = doctype;
888         /* Set document id and type */
889         struct buf buf = BUF_INITIALIZER;
890         make_cyrusid(&buf, guid, doctype);
891         dbw->document->add_value(SLOT_CYRUSID, buf_cstring(&buf));
892         dbw->cyrusid = buf_release(&buf);
893         add_boolean_nterm(*dbw->document, std::string("XE") + doctype);
894         /* Initialize term generator */
895         dbw->term_generator->set_document(*dbw->document);
896         dbw->term_generator->set_termpos(1);
897     }
898     catch (const Xapian::Error &err) {
899         xsyslog(LOG_ERR, "IOERROR: caught exception",
900                          "exception=<%s>",
901                          err.get_description().c_str());
902         r = IMAP_IOERROR;
903     }
904     return r;
905 }
906 
add_language_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)907 static int add_language_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
908 {
909     std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
910     std::string val = parse_langcode(buf_cstring(part));
911     if (val.empty()) {
912         syslog(LOG_INFO, "Xapian: not a valid ISO 639 code: %s",
913                 buf_cstring(part));
914         return 0;
915     }
916     add_boolean_nterm(*dbw->document, prefix + val);
917     return 0;
918 }
919 
parse_priority(const char * str)920 static std::string parse_priority(const char *str)
921 {
922     const char *err;
923     uint32_t u;
924     if (parseuint32(str, &err, &u) == -1 || *err || u == 0) {
925         return std::string();
926     }
927     return std::to_string(u);
928 }
929 
add_priority_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)930 static int add_priority_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
931 {
932     std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
933     if (buf_len(part)) {
934         std::string val = parse_priority(buf_cstring(part));
935         if (val.empty()) {
936             syslog(LOG_DEBUG, "Xapian: not a valid priority: %s",
937                     buf_cstring(part));
938             return 0;
939         }
940         add_boolean_nterm(*dbw->document, prefix + val);
941     }
942     return 0;
943 }
944 
/*
 * Extract and normalize the list identifier from a List-Id style header
 * value. Three shapes are recognized:
 *
 *   - "... <list-id>"  : the text after the LAST '<', up to the next '>'
 *                        (or to the end of the string if '>' is missing);
 *   - "list list-id[; contact ...]" : the text after the leading "list"
 *                        keyword, up to the first ';';
 *   - anything else    : the first whitespace-delimited token.
 *
 * The result has all whitespace removed and is lower-cased. It is empty
 * when no identifier could be extracted.
 */
static std::string parse_listid(const char *str)
{
    /* The <ctype.h> functions are only defined for EOF and values that fit
     * in an unsigned char, so plain (possibly negative) chars, e.g. bytes
     * of a UTF-8 sequence, must be converted before being passed in. */
    auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };
    auto to_lower = [](char c) {
        return static_cast<char>(tolower(static_cast<unsigned char>(c)));
    };

    std::string val;

    /* Extract list-id */
    const char *start = strrchr(str, '<');
    if (start) {
        /* RFC2919 list-id header (with optional closing bracket) */
        const char *end = strchr(++start, '>');
        if (end)
            val = std::string(start, end - start);
        else
            val = std::string(start);
    }
    else {
        /* Groups-style header: 'list list-id[; contact list-contact]'
         * As seen at Google Group, Yahoo, et al. */
        for (start = str; is_space(*start); start++) {}
        if (!strncasecmp("list", start, 4) && is_space(start[4])) {
            for (start = start + 4; is_space(*start); start++) {}
            if (*start) {
                const char *end = strchr(start, ';');
                /* a ';' directly at start means there is no list-id */
                if (!end || end - start) {
                    val = end ? std::string(start, end - start) : std::string{start};
                }
            }
        }
        /* just raw value, that's OK too, like sentry creates.  Parse up to first whitespace */
        else {
            const char *end;
            for (end = start; *end && !is_space(*end); end++) {}
            val = std::string(start, end - start);
        }
    }

    /* Normalize list-id */
    val.erase(std::remove_if(val.begin(), val.end(), is_space), val.end());
    std::transform(val.begin(), val.end(), val.begin(), to_lower);
    return val;
}
985 
add_listid_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)986 static int add_listid_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
987 {
988     std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
989 
990     /* Normalize list-id */
991     std::string val = parse_listid(buf_cstring(part));
992     val.erase(std::remove_if(val.begin(), val.end(), isspace), val.end());
993     std::transform(val.begin(), val.end(), val.begin(), ::tolower);
994     if (val.empty()) {
995         syslog(LOG_WARNING, "Xapian: not a valid list-id: %s",
996                 buf_cstring(part));
997         return 0;
998     }
999 
1000     add_boolean_nterm(*dbw->document, prefix + val);
1001     return 0;
1002 }
1003 
add_email_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1004 static int add_email_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1005 {
1006     std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
1007     std::string lpart = Xapian::Unicode::tolower(buf_cstring(part));
1008     struct address_itr itr;
1009     address_itr_init(&itr, lpart.c_str(), 0);
1010 
1011     const struct address *addr;
1012     while ((addr = address_itr_next(&itr))) {
1013         if (addr->invalid) {
1014             continue;
1015         }
1016         if (addr->name) {
1017             dbw->term_generator->set_stemmer(Xapian::Stem());
1018             dbw->term_generator->set_stopper(NULL);
1019             dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->name), 1, prefix + 'N');
1020 
1021             dbw->term_generator->set_stemmer(Xapian::Stem());
1022             dbw->term_generator->set_stopper(NULL);
1023             dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->name), 1, prefix);
1024         }
1025         if (addr->mailbox) {
1026             // index mailbox as single value
1027             std::string val(addr->mailbox);
1028             // ignore whitespace (as seen in quoted mailboxes)
1029             val.erase(std::remove_if(val.begin(), val.end(), isspace), val.end());
1030             add_boolean_nterm(*dbw->document, prefix + 'L' + val);
1031             // index individual terms
1032             dbw->term_generator->set_stemmer(Xapian::Stem());
1033             dbw->term_generator->set_stopper(NULL);
1034             dbw->term_generator->index_text(Xapian::Utf8Iterator(val), 1, prefix);
1035         }
1036         if (addr->domain && strcmp(addr->domain, "unspecified-domain")) {
1037             // index reversed domain
1038             std::string val;
1039             strarray_t *sa = strarray_split(addr->domain, ".", 0);
1040             val.reserve(buf_len(part));
1041             for (int i = strarray_size(sa) - 1; i >= 0; i--) {
1042                 val.append(strarray_nth(sa, i));
1043                 if (i > 0) {
1044                     val.append(1, '.');
1045                 }
1046             }
1047             strarray_free(sa);
1048             add_boolean_nterm(*dbw->document, prefix + "D" + val);
1049             // index individual terms
1050             dbw->term_generator->set_stemmer(Xapian::Stem());
1051             dbw->term_generator->set_stopper(NULL);
1052             dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->domain,
1053                         strlen(addr->domain)), 1, prefix);
1054         }
1055 
1056         // index entire addr-spec
1057         char *a = address_get_all(addr, /*canon_domain*/1);
1058         if (a) {
1059             add_boolean_nterm(*dbw->document, prefix + 'A' + std::string(a));
1060             free(a);
1061         }
1062     }
1063 
1064     address_itr_fini(&itr);
1065     return 0;
1066 }
1067 
parse_content_type(const char * str)1068 static std::pair<std::string, std::string> parse_content_type(const char *str)
1069 {
1070     std::pair<std::string, std::string> ret;
1071     struct buf buf = BUF_INITIALIZER;
1072 
1073     const char *sep = strchr(str, '/');
1074     if (sep) {
1075         /* type */
1076         buf_setmap(&buf, str, sep - str);
1077         buf_lcase(&buf);
1078         buf_trim(&buf);
1079         ret.first = std::string(buf_cstring(&buf));
1080         /* subtype */
1081         buf_setcstr(&buf, sep + 1);
1082         buf_lcase(&buf);
1083         buf_trim(&buf);
1084         ret.second = std::string(buf_cstring(&buf));
1085     }
1086     else {
1087         /* type or subtype */
1088         buf_setcstr(&buf, str);
1089         buf_lcase(&buf);
1090         buf_trim(&buf);
1091         ret.first = std::string(buf_cstring(&buf));
1092     }
1093 
1094     buf_free(&buf);
1095     return ret;
1096 }
1097 
add_type_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1098 static int add_type_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1099 {
1100     std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
1101     std::pair<std::string, std::string> ct = parse_content_type(buf_cstring(part));
1102     if (!ct.first.empty()) {
1103         add_boolean_nterm(*dbw->document, prefix + "T" + ct.first);
1104     }
1105     if (!ct.second.empty()) {
1106         add_boolean_nterm(*dbw->document, prefix + "S" + ct.second);
1107     }
1108     if (!ct.first.empty() && !ct.second.empty()) {
1109         add_boolean_nterm(*dbw->document, prefix + ct.first + '/' + ct.second);
1110     }
1111     return 0;
1112 }
1113 
add_text_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1114 static int add_text_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1115 {
1116     const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1117     int r = 0;
1118 
1119     // Index text.
1120     Xapian::TermGenerator::stem_strategy stem_strategy =
1121         get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);
1122     dbw->term_generator->set_stemming_strategy(stem_strategy);
1123 
1124     if (stem_strategy != Xapian::TermGenerator::STEM_NONE) {
1125         if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
1126             // Index by language.
1127 #ifndef HAVE_CLD2
1128             // XXX is this really an "IOERROR"?
1129             xsyslog(LOG_ERR, "IOERROR: language indexing requires CLD2 library",
1130                              NULL);
1131             return IMAP_IOERROR;
1132 #else
1133 
1134             if (search_part_is_body(partnum)) {
1135                 const std::string iso_lang = detect_language(part);
1136                 if (!iso_lang.empty()) {
1137                     if (iso_lang.compare("en")) {
1138                         // Stem and index by non-default language.
1139                         try {
1140                             dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
1141                             dbw->term_generator->set_stopper(get_stopper(iso_lang));
1142                             dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len),
1143                                     1, lang_prefix(iso_lang, prefix));
1144                         } catch (const Xapian::InvalidArgumentError &err) {
1145                             syslog(LOG_DEBUG, "Xapian: no stemmer for language %s",
1146                                     iso_lang.c_str());
1147                         }
1148                     }
1149                     if (dbw->doctype == 'P') {
1150                         // Keep track of stemmer language.
1151                         std::string key = lang_doc_key(dbw->cyrusid);
1152                         dbw->database->set_metadata(key, iso_lang);
1153                         dbw->document->add_value(SLOT_DOCLANGS, iso_lang);
1154                         // Update language counts for body parts.
1155                         key = lang_count_key(iso_lang);
1156                         const std::string val = dbw->database->get_metadata(key);
1157                         dbw->database->set_metadata(key, val.empty() ?
1158                                 "1" : std::to_string(std::stoi(val) + 1));
1159                     }
1160                     // Store detected languages in document.
1161                     dbw->doclangs->insert(iso_lang.c_str());
1162                     add_boolean_nterm(*dbw->document, std::string("XI") + iso_lang);
1163                 }
1164             }
1165             else if (partnum == SEARCH_PART_SUBJECT) {
1166                 // Keep subject text to index by language later.
1167                 dbw->subjects->push_back(buf_cstring(part));
1168             }
1169 #endif /* HAVE_CLD2 */
1170         }
1171 
1172         // Index with default stemmer.
1173         dbw->term_generator->set_stemmer(*dbw->default_stemmer);
1174         dbw->term_generator->set_stopper(dbw->default_stopper);
1175     } else {
1176         // Index with no stemming.
1177         dbw->term_generator->set_stemmer(Xapian::Stem());
1178         dbw->term_generator->set_stopper(NULL);
1179     }
1180     dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len), 1, prefix);
1181 
1182     return r;
1183 }
1184 
xapian_dbw_doc_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1185 EXPORTED int xapian_dbw_doc_part(xapian_dbw_t *dbw,
1186                                  const struct buf *part,
1187                                  int partnum)
1188 {
1189     int r = 0;
1190 
1191     if (!get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum)) {
1192         syslog(LOG_ERR, "xapian_wrapper: no prefix for partnum %d", partnum);
1193         return IMAP_INTERNAL;
1194     }
1195 
1196     try {
1197         // Handle search parts.
1198         switch (partnum) {
1199             case SEARCH_PART_PRIORITY:
1200                 r = add_priority_part(dbw, part, partnum);
1201                 break;
1202             case SEARCH_PART_LISTID:
1203                 r = add_listid_part(dbw, part, partnum);
1204                 break;
1205             case SEARCH_PART_LANGUAGE:
1206                 r = add_language_part(dbw, part, partnum);
1207                 break;
1208             case SEARCH_PART_FROM:
1209             case SEARCH_PART_TO:
1210             case SEARCH_PART_CC:
1211             case SEARCH_PART_BCC:
1212             case SEARCH_PART_DELIVEREDTO:
1213                 r = add_email_part(dbw, part, partnum);
1214                 break;
1215             case SEARCH_PART_TYPE:
1216                 r = add_type_part(dbw, part, partnum);
1217                 break;
1218             default:
1219                 r = add_text_part(dbw, part, partnum);
1220         }
1221         // Finalize index.
1222         dbw->term_generator->increase_termpos();
1223     }
1224     catch (const Xapian::Error &err) {
1225         xsyslog(LOG_ERR, "IOERROR: caught exception",
1226                          "exception=<%s>",
1227                          err.get_description().c_str());
1228         r = IMAP_IOERROR;
1229     }
1230     return r;
1231 }
1232 
xapian_dbw_end_doc(xapian_dbw_t * dbw,uint8_t indexlevel)1233 EXPORTED int xapian_dbw_end_doc(xapian_dbw_t *dbw, uint8_t indexlevel)
1234 {
1235     int r = 0;
1236 
1237     assert(indexlevel > 0);
1238 
1239     try {
1240         if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
1241             // Keep track of languages used in this message.
1242             if (dbw->doctype == 'G') {
1243                 std::string val = format_doclangs(*dbw->doclangs);
1244                 dbw->database->set_metadata(lang_doc_key(dbw->cyrusid), val);
1245                 dbw->document->add_value(SLOT_DOCLANGS, val);
1246             }
1247             // Index subjects by detected document languages.
1248             for (std::set<std::string>::iterator it = dbw->doclangs->begin(); it != dbw->doclangs->end(); ++it) {
1249                 std::string iso_lang = *it;
1250                 if (iso_lang.compare("en")) {
1251                     try {
1252                         const char *tp = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, SEARCH_PART_SUBJECT);
1253                         std::string prefix = lang_prefix(iso_lang, tp);
1254                         dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
1255                         dbw->term_generator->set_stopper(get_stopper(iso_lang));
1256                         for (const std::string& subject : *dbw->subjects)
1257                             dbw->term_generator->index_text(Xapian::Utf8Iterator(subject), 1, prefix);
1258                     } catch (const Xapian::InvalidArgumentError &err) {
1259                         // ignore unknown stemmer
1260                     }
1261                 }
1262             }
1263         }
1264         dbw->document->add_value(SLOT_INDEXLEVEL, format_indexlevel(indexlevel));
1265         dbw->document->add_value(SLOT_INDEXVERSION,
1266                 std::to_string(XAPIAN_DB_CURRENT_VERSION));
1267         dbw->database->add_document(*dbw->document);
1268         dbw->database->set_metadata("cyrusid." + std::string(dbw->cyrusid),
1269                                     format_indexlevel(indexlevel));
1270         delete dbw->document;
1271         dbw->document = 0;
1272         dbw->doctype = 0;
1273         free(dbw->cyrusid);
1274         dbw->cyrusid = NULL;
1275         dbw->doclangs->clear();
1276         dbw->subjects->clear();
1277     }
1278     catch (const Xapian::Error &err) {
1279         xsyslog(LOG_ERR, "IOERROR: caught exception",
1280                          "exception=<%s>",
1281                          err.get_description().c_str());
1282         r = IMAP_IOERROR;
1283     }
1284     return r;
1285 }
1286 
xapian_dbw_total_length(xapian_dbw_t * dbw)1287 EXPORTED unsigned long xapian_dbw_total_length(xapian_dbw_t *dbw)
1288 {
1289     unsigned long res = 0;
1290     try {
1291         res = dbw->database->get_total_length();
1292     }
1293     catch (const Xapian::Error &err) {
1294         xsyslog(LOG_ERR, "IOERROR: caught exception",
1295                          "exception=<%s>",
1296                          err.get_description().c_str());
1297     }
1298     return res;
1299 }
1300 
xapian_dbw_is_indexed(xapian_dbw_t * dbw,const struct message_guid * guid,char doctype)1301 EXPORTED uint8_t xapian_dbw_is_indexed(xapian_dbw_t *dbw,
1302                                        const struct message_guid *guid,
1303                                        char doctype)
1304 {
1305     struct buf buf = BUF_INITIALIZER;
1306     make_cyrusid(&buf, guid, doctype);
1307     std::string key = "cyrusid." + std::string(buf_cstring(&buf));
1308     buf_free(&buf);
1309 
1310     /* indexed in the current DB? */
1311     uint8_t indexlevel = parse_indexlevel(dbw->database->get_metadata(key));
1312     if (indexlevel == SEARCH_INDEXLEVEL_BEST ||
1313             (indexlevel && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
1314         return indexlevel;
1315     }
1316 
1317     /* indexed in other DBs? */
1318     for (int i = 0; i < dbw->otherdbs.count; i++) {
1319         Xapian::Database *database = (Xapian::Database *)ptrarray_nth(&dbw->otherdbs, i);
1320         uint8_t level = parse_indexlevel(database->get_metadata(key));
1321         if (level == SEARCH_INDEXLEVEL_BEST ||
1322                 (level && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
1323             return level;
1324         }
1325         else indexlevel = better_indexlevel(indexlevel, level);
1326     }
1327 
1328     return indexlevel;
1329 }
1330 
1331 /* ====================================================================== */
1332 
/*
 * Read-side handle used for querying.
 *
 * Instances are allocated zero-filled with xzmalloc() and released with
 * xapian_db_close(), so every member is a plain pointer that may be NULL.
 */
struct xapian_db
{
    // Opened database paths, each followed by a single space. Only set by
    // xapian_db_open().
    std::string *paths;
    // Borrowed from the writer (not deleted on close) when dbw is set.
    Xapian::Database *database; // all but version 4 databases
    Xapian::Database *legacydbv4; // version 4 databases
    // Kept separately because the Xapian database API gives no access to
    // the sub-databases of a combined database.
    std::vector<Xapian::Database> *subdbs; // all database subdbs
    // Allocated by xapian_db_init().
    Xapian::Stem *default_stemmer;
    // Result of get_stopper("en"); xapian_db_close() does not delete it.
    const Xapian::Stopper* default_stopper;
    // Languages other than "en" for which queries are stemmed as well.
    std::set<std::string> *stem_languages;
    // Query parser bound to database, or to legacydbv4 if database is NULL.
    Xapian::QueryParser *parser;
    // Union of the index versions found in the opened databases.
    std::set<int> *db_versions;
    // Writer this handle was opened on by xapian_db_opendbw(), else NULL.
    xapian_dbw_t *dbw;
};
1346 
xapian_db_init(xapian_db_t * db)1347 static int xapian_db_init(xapian_db_t *db)
1348 {
1349     int r = 0;
1350 
1351     try {
1352         db->parser = new Xapian::QueryParser;
1353         db->parser->set_default_op(Xapian::Query::OP_AND);
1354         db->parser->set_database(db->database ? *db->database : *db->legacydbv4);
1355         db->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
1356         db->default_stopper = get_stopper("en");
1357 
1358         // Determine stemmer languages (in addition to English).
1359         db->stem_languages = new std::set<std::string>;
1360         std::map<const std::string, unsigned> lang_counts;
1361         size_t total_doccount = 0;
1362         for (const Xapian::Database& subdb : *db->subdbs) {
1363             read_language_counts(subdb, lang_counts);
1364             total_doccount += subdb.get_doccount();
1365         }
1366         total_doccount /= 2; // Crude estimate.
1367         for (std::pair<const std::string, unsigned>& it : lang_counts) {
1368             if (it.first.compare("en") && ((double) it.second / total_doccount) >= 0.05) {
1369                 db->stem_languages->insert(it.first);
1370             }
1371         }
1372     }
1373     catch (const Xapian::Error &err) {
1374         xsyslog(LOG_ERR, "IOERROR: caught exception",
1375                          "exception=<%s>",
1376                          err.get_description().c_str());
1377         r = IMAP_IOERROR;
1378     }
1379 
1380     return r;
1381 }
1382 
xapian_db_open(const char ** paths,xapian_db_t ** dbp)1383 EXPORTED int xapian_db_open(const char **paths, xapian_db_t **dbp)
1384 {
1385     xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
1386     const char *thispath = "(unknown)";
1387     int r = 0;
1388 
1389     try {
1390         db->paths = new std::string;
1391         while (paths && *paths) {
1392             thispath = *paths++;
1393             Xapian::Database subdb {thispath};
1394             std::set<int> db_versions = read_db_versions(subdb);
1395             if (db_versions.empty()) {
1396                 syslog(LOG_ERR, "xapian_wrapper: invalid db version in %s", thispath);
1397                 r = IMAP_INTERNAL;
1398                 goto done;
1399             }
1400             if (!db->db_versions)
1401                 db->db_versions = new std::set<int>;
1402             db->db_versions->insert(db_versions.begin(), db_versions.end());
1403             // Databases with version 4 split indexing by doctype.
1404             if (db_versions.find(4) != db_versions.end()) {
1405                 if (!db->legacydbv4) db->legacydbv4 = new Xapian::Database;
1406                 db->legacydbv4->add_database(subdb);
1407             }
1408             // Databases with any but version 4 are regular dbs.
1409             if (db_versions.size() > 1 || db_versions.find(4) == db_versions.end()) {
1410                 if (!db->database) db->database = new Xapian::Database;
1411                 db->database->add_database(subdb);
1412             }
1413 
1414             // Xapian database has no API to access subdbs.
1415             if (!db->subdbs) db->subdbs = new std::vector<Xapian::Database>;
1416             db->subdbs->push_back(subdb);
1417 
1418             db->paths->append(thispath).push_back(' ');
1419         }
1420         thispath = "(unknown)";
1421 
1422         if (!db->database && !db->legacydbv4) {
1423             r = IMAP_NOTFOUND;
1424             goto done;
1425         }
1426 
1427         r = xapian_db_init(db);
1428         if (r) goto done;
1429     }
1430     catch (const Xapian::Error &err) {
1431         xsyslog(LOG_ERR, "IOERROR: caught exception",
1432                          "exception=<%s> path=<%s>",
1433                          err.get_description().c_str(), thispath);
1434         r = IMAP_IOERROR;
1435     }
1436 
1437 done:
1438     if (r)
1439         xapian_db_close(db);
1440     else
1441         *dbp = db;
1442 
1443     return r;
1444 }
1445 
xapian_db_opendbw(struct xapian_dbw * dbw,xapian_db_t ** dbp)1446 EXPORTED int xapian_db_opendbw(struct xapian_dbw *dbw, xapian_db_t **dbp)
1447 {
1448     xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
1449 
1450     db->dbw = dbw;
1451     db->database = dbw->database;
1452     db->db_versions = new std::set<int>();
1453     std::set<int> dbw_versions = read_db_versions(*dbw->database);
1454     db->db_versions->insert(dbw_versions.begin(), dbw_versions.end());
1455     db->subdbs = new std::vector<Xapian::Database>;
1456     db->subdbs->push_back(*dbw->database);
1457 
1458     int r = xapian_db_init(db);
1459     if (r) {
1460         xapian_db_close(db);
1461         db = NULL;
1462     }
1463 
1464     *dbp = db;
1465     return r;
1466 }
1467 
xapian_db_close(xapian_db_t * db)1468 EXPORTED void xapian_db_close(xapian_db_t *db)
1469 {
1470     if (!db) return;
1471     try {
1472         if (!db->dbw) delete db->database;
1473         delete db->legacydbv4;
1474         delete db->parser;
1475         delete db->paths;
1476         delete db->db_versions;
1477         delete db->default_stemmer;
1478         delete db->stem_languages;
1479         delete db->subdbs;
1480         free(db);
1481     }
1482     catch (const Xapian::Error &err) {
1483         /* XXX - memory leak? */
1484         xsyslog(LOG_ERR, "IOERROR: caught exception",
1485                          "exception=<%s>",
1486                          err.get_description().c_str());
1487     }
1488 }
1489 
xapian_db_langstats(xapian_db_t * db,ptrarray_t * lstats,size_t * nolang)1490 EXPORTED int xapian_db_langstats(xapian_db_t *db, ptrarray_t* lstats,
1491                                  size_t *nolang)
1492 {
1493     std::map<const std::string, unsigned> lang_counts;
1494     size_t total_part = 0;
1495     size_t total_lang = 0;
1496 
1497     for (const Xapian::Database& subdb : *db->subdbs) {
1498         // count body parts
1499         for (Xapian::TermIterator it = subdb.metadata_keys_begin("cyrusid.*P*");
1500                 it != subdb.metadata_keys_end("cyrusid.*P*"); ++it) {
1501             total_part++;
1502         }
1503         // cummulate language counts
1504         read_language_counts(subdb, lang_counts);
1505     }
1506     for (const std::pair<const std::string, unsigned>& counts : lang_counts) {
1507         struct search_langstat *stat = (struct search_langstat*)
1508                                        xzmalloc(sizeof(struct search_langstat));
1509         stat->iso_lang = xstrdup(counts.first.c_str());
1510         stat->count = counts.second;
1511         ptrarray_append(lstats, stat);
1512         total_lang += counts.second;
1513     }
1514     *nolang = total_part > total_lang ? total_part - total_lang : 0;
1515 
1516     return 0;
1517 }
1518 
xapian_query_add_stemmer(xapian_db_t * db,const char * iso_lang)1519 EXPORTED void xapian_query_add_stemmer(xapian_db_t *db, const char *iso_lang)
1520 {
1521     if (strcmp(iso_lang, "en")) db->stem_languages->insert(iso_lang);
1522 }
1523 
xapian_db_has_otherthan_v4_index(const xapian_db_t * db)1524 int xapian_db_has_otherthan_v4_index(const xapian_db_t *db)
1525 {
1526     return db->database != NULL;
1527 }
1528 
xapian_db_has_legacy_v4_index(const xapian_db_t * db)1529 int xapian_db_has_legacy_v4_index(const xapian_db_t *db)
1530 {
1531     return db->legacydbv4 != NULL;
1532 }
1533 
query_new_textmatch(const xapian_db_t * db,const char * match,const char * prefix,Xapian::TermGenerator::stem_strategy tg_stem_strategy)1534 static Xapian::Query* query_new_textmatch(const xapian_db_t *db,
1535                                           const char *match,
1536                                           const char *prefix,
1537                                           Xapian::TermGenerator::stem_strategy tg_stem_strategy)
1538 {
1539     unsigned flags = Xapian::QueryParser::FLAG_PHRASE |
1540                      Xapian::QueryParser::FLAG_WILDCARD;
1541 
1542     std::string lmatch = Xapian::Unicode::tolower(match);
1543 
1544     if (tg_stem_strategy != Xapian::TermGenerator::STEM_NONE) {
1545 
1546         // Query without any stemmer.
1547         db->parser->set_stemmer(Xapian::Stem());
1548         db->parser->set_stopper(NULL);
1549         db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1550         Xapian::Query q = db->parser->parse_query(lmatch, flags, prefix);
1551 
1552         // Query with default stemmer. But don't stem stopwords.
1553         if (!db->default_stopper || !(*db->default_stopper)(lmatch)) {
1554             db->parser->set_stemmer(*db->default_stemmer);
1555             db->parser->set_stopper(db->default_stopper);
1556             db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
1557             q |= db->parser->parse_query(lmatch, flags, prefix);
1558         }
1559 
1560         // Stem query for each language detected in the index.
1561         for (const std::string& iso_lang : *db->stem_languages) {
1562             try {
1563                 const Xapian::Stopper *stopper = get_stopper(iso_lang);
1564                 db->parser->set_stemmer(get_stemmer(iso_lang));
1565                 db->parser->set_stopper(stopper);
1566                 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
1567                 if (!stopper || !(*stopper)(lmatch)) {
1568                     q |= db->parser->parse_query(lmatch, flags, lang_prefix(iso_lang, prefix));
1569                 }
1570             } catch (const Xapian::InvalidArgumentError &err) {
1571                 syslog(LOG_INFO, "Xapian: no stemmer for language %s", iso_lang.c_str());
1572             }
1573         }
1574 
1575         return new Xapian::Query(q);
1576     }
1577     else {
1578         db->parser->set_stemmer(Xapian::Stem());
1579         db->parser->set_stopper(NULL);
1580         db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1581         return new Xapian::Query {db->parser->parse_query(lmatch, flags, prefix)};
1582     }
1583 }
1584 
query_new_language(const xapian_db_t * db,const char * prefix,const char * str)1585 static Xapian::Query *query_new_language(const xapian_db_t *db __attribute__((unused)),
1586                                          const char *prefix,
1587                                          const char *str)
1588 {
1589     std::string val = parse_langcode(str);
1590     if (val.empty()) {
1591         syslog(LOG_DEBUG, "Xapian: invalid language in query: %s", str);
1592         return new Xapian::Query(Xapian::Query::MatchNothing);
1593     }
1594     return new Xapian::Query(std::string(prefix) + val);
1595 }
1596 
query_new_priority(const xapian_db_t * db,const char * prefix,const char * str)1597 static Xapian::Query *query_new_priority(const xapian_db_t *db __attribute__((unused)),
1598                                          const char *prefix,
1599                                          const char *str)
1600 {
1601     std::string val = parse_priority(str);
1602     if (val.empty()) {
1603         syslog(LOG_DEBUG, "Xapian: invalid priority in query: %s", str);
1604         return new Xapian::Query(Xapian::Query::MatchNothing);
1605     }
1606     return new Xapian::Query(std::string(prefix) + val);
1607 }
1608 
query_new_listid(const xapian_db_t * db,const char * prefix,const char * str)1609 static Xapian::Query *query_new_listid(const xapian_db_t *db,
1610                                        const char *prefix,
1611                                        const char *str)
1612 {
1613     Xapian::Query *q = NULL;
1614 
1615     std::string val = parse_listid(str);
1616     if (!val.empty()) {
1617         q = new Xapian::Query(std::string(prefix) + val);
1618     }
1619     else {
1620         syslog(LOG_DEBUG, "Xapian: invalid listid in query: %s", str);
1621         q = new Xapian::Query(Xapian::Query::MatchNothing);
1622     }
1623 
1624     if (db->db_versions->lower_bound(11) != db->db_versions->begin()) {
1625         // query in legacy format
1626         db->parser->set_stemmer(Xapian::Stem());
1627         db->parser->set_stopper(NULL);
1628         db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1629         q = new Xapian::Query(Xapian::Query::OP_OR, *q,
1630                 db->parser->parse_query(str, 0, prefix));
1631     }
1632 
1633     return q;
1634 }
1635 
query_new_email(const xapian_db_t * db,const char * _prefix,const char * str)1636 static Xapian::Query *query_new_email(const xapian_db_t *db,
1637                                       const char *_prefix,
1638                                       const char *str)
1639 {
1640     std::string prefix(_prefix);
1641 
1642     unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
1643                        Xapian::QueryParser::FLAG_WILDCARD;
1644 
1645     db->parser->set_stemmer(Xapian::Stem());
1646     db->parser->set_stopper(NULL);
1647     db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1648 
1649     std::string mystr = Xapian::Unicode::tolower(str);
1650     str = mystr.c_str();
1651 
1652     const char *atsign = strchr(str, '@');
1653 
1654     if (!atsign) {
1655         // query free text
1656         return new Xapian::Query{db->parser->parse_query(str, qpflags, prefix)};
1657     }
1658 
1659     Xapian::Query q = Xapian::Query::MatchNothing;
1660 
1661     // query name and mailbox (unless just searching for '@domain')
1662     if (atsign > str) {
1663         struct address *addr = NULL;
1664         parseaddr_list(str, &addr);
1665         if (addr && addr->name) {
1666             Xapian::Query qq = db->parser->parse_query(addr->name, qpflags, prefix + 'N');
1667             if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1668                 q &= qq;
1669             }
1670             else q = qq;
1671         }
1672         if (addr && addr->mailbox) {
1673             // strip the domain from the mailbox
1674             std::string mail(addr->mailbox);
1675             mail.erase(std::remove_if(mail.begin(), mail.end(), isspace), mail.end());
1676             int wildcard = mail[mail.size()-1] == '*';
1677             if (wildcard) {
1678                 mail.resize(mail.size()-1);
1679             }
1680             if (!mail.empty()) {
1681                 std::string term(prefix + 'L' + mail);
1682                 Xapian::Query qq = wildcard ?
1683                     Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
1684                     Xapian::Query(term);
1685                 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1686                     q &= qq;
1687                 }
1688                 else q = qq;
1689             }
1690         }
1691         // ignore @domain - it's being handled below
1692         if (addr) parseaddr_free(addr);
1693     }
1694 
1695     // query domain
1696     if (atsign[1]) {
1697         std::string domain;
1698         const char *dstart = atsign + 1;
1699         bool wildcard = *dstart == '*';
1700         if (wildcard) dstart++;
1701         const char *dend;
1702         for (dend = dstart; *dend; dend++) {
1703             char c = *dend;
1704             if (Uisalnum(c) || c == '-' || c == '[' || c == ']' || c == ':') {
1705                 continue;
1706             }
1707             else if (c == '.' && (dend-1 == dstart || dend[-2] != '.')) {
1708                 continue;
1709             }
1710             else {
1711                 break;
1712             }
1713         }
1714         if (dend > dstart) {
1715             strarray_t *sa = strarray_nsplit(dstart, dend - dstart, ".", 0);
1716             for (int i = strarray_size(sa) - 1; i >= 0; i--) {
1717                 domain.append(strarray_nth(sa, i));
1718                 if (i > 0) {
1719                     domain.append(1, '.');
1720                 }
1721             }
1722             strarray_free(sa);
1723             if (*dstart == '.') {
1724                 domain.append(1, '.');
1725             }
1726         }
1727         if (!domain.empty()) {
1728             std::string term(prefix + 'D' + domain);
1729             Xapian::Query qq = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
1730                                           Xapian::Query(term);
1731             {
1732                 // FIXME - temporarily also search for '@' prefix
1733                 std::string term2(prefix + '@' + domain);
1734                 Xapian::Query qq2 = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term2) :
1735                                                Xapian::Query(term2);
1736                 qq |= qq2;
1737             }
1738             if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1739                 q &= qq;
1740             }
1741             else q = qq;
1742         }
1743     }
1744 
1745     if (q.get_type() == q.LEAF_MATCH_ALL) {
1746         q = Xapian::Query::MatchNothing;
1747     }
1748 
1749     // query in legacy format as well!
1750     if (db->db_versions->lower_bound(12) != db->db_versions->begin()) {
1751         q |= db->parser->parse_query(str, qpflags, prefix);
1752     }
1753 
1754     // query localpart@domain (ONLY if no wildcards)
1755     if ((atsign > str) && atsign[1] && !strchr(str, '*')) {
1756         struct address *addr = NULL;
1757 
1758         parseaddr_list(str, &addr);
1759         if (addr) {
1760             char *a = address_get_all(addr, /*canon_domain*/1);
1761             if (a) {
1762                 // query 'A' term for index >= 16
1763                 std::string term(prefix + 'A' + std::string(a));
1764                 Xapian::Query qq =
1765                     Xapian::Query(Xapian::Query::OP_AND,
1766                                   Xapian::Query(Xapian::Query::OP_VALUE_GE,
1767                                                 Xapian::valueno(SLOT_INDEXVERSION),
1768                                                 std::string("16")),
1769                                   Xapian::Query(term));
1770                 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1771                     // otherwise, query 'L' + 'D' terms (as per above)
1772                     Xapian::Query qq2 =
1773                         Xapian::Query(Xapian::Query::OP_AND,
1774                                       Xapian::Query(Xapian::Query::OP_VALUE_LE,
1775                                                     Xapian::valueno(SLOT_INDEXVERSION),
1776                                                     std::string("15")),
1777                                       q);
1778                     qq |= qq2;
1779                 }
1780 
1781                 q = qq;
1782             }
1783 
1784             parseaddr_free(addr);
1785             free(a);
1786         }
1787     }
1788 
1789     return new Xapian::Query(q);
1790 }
1791 
append_alnum(struct buf * buf,const char * ss)1792 static void append_alnum(struct buf *buf, const char *ss)
1793 {
1794     const unsigned char *s = (const unsigned char *)ss;
1795 
1796     for ( ; *s ; ++s) {
1797         if (Uisalnum(*s))
1798             buf_putc(buf, *s);
1799     }
1800 }
1801 
query_new_type(const xapian_db_t * db,const char * _prefix,const char * str)1802 static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused)),
1803                                      const char *_prefix,
1804                                      const char *str)
1805 {
1806 
1807     std::pair<std::string, std::string> ct = parse_content_type(str);
1808     std::string prefix(_prefix);
1809     Xapian::Query q = Xapian::Query::MatchNothing;
1810 
1811     bool query_legacy = db->db_versions->lower_bound(13) != db->db_versions->begin();
1812     struct buf buf = BUF_INITIALIZER;
1813     unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
1814                        Xapian::QueryParser::FLAG_WILDCARD;
1815 
1816     if (!ct.first.empty() && ct.second.empty()) {
1817         /* Match either type or subtype */
1818         if (ct.first != "*") {
1819             q = Xapian::Query(Xapian::Query::OP_OR,
1820                     Xapian::Query(prefix + 'T' + ct.first),
1821                     Xapian::Query(prefix + 'S' + ct.first));
1822             if (query_legacy) {
1823                 append_alnum(&buf, ct.first.c_str());
1824                 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1825             }
1826         }
1827     }
1828     else if (ct.first == "*" || ct.second == "*") {
1829         /* Wildcard query */
1830         if (!ct.first.empty() && ct.first != "*") {
1831             /* Match type */
1832             q = Xapian::Query(prefix + 'T' + ct.first);
1833             if (query_legacy) {
1834                 append_alnum(&buf, ct.first.c_str());
1835                 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1836             }
1837         }
1838         if (!ct.second.empty() && ct.second != "*") {
1839             /* Match subtype */
1840             q = Xapian::Query(prefix + 'S' + ct.second);
1841             if (query_legacy) {
1842                 append_alnum(&buf, ct.second.c_str());
1843                 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1844             }
1845         }
1846     }
1847     else if (!ct.first.empty() && !ct.second.empty()) {
1848         /* Verbatim search */
1849         q = Xapian::Query(prefix + ct.first + '/' + ct.second);
1850         if (query_legacy) {
1851             append_alnum(&buf, ct.first.c_str());
1852             buf_putc(&buf, '_');
1853             append_alnum(&buf, ct.second.c_str());
1854             q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1855         }
1856     }
1857 
1858     buf_free(&buf);
1859     return new Xapian::Query(q);
1860 }
1861 
1862 EXPORTED Xapian::Query *
xapian_query_new_match_internal(const xapian_db_t * db,int partnum,const char * str)1863 xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
1864 {
1865     const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1866 
1867     try {
1868         // Handle special value search parts.
1869         if (partnum == SEARCH_PART_LANGUAGE) {
1870             return query_new_language(db, prefix, str);
1871         }
1872         else if (partnum == SEARCH_PART_PRIORITY) {
1873             return query_new_priority(db, prefix, str);
1874         }
1875         else if (partnum == SEARCH_PART_LISTID) {
1876             return query_new_listid(db, prefix, str);
1877         }
1878         else if (partnum == SEARCH_PART_FROM ||
1879                  partnum == SEARCH_PART_TO ||
1880                  partnum == SEARCH_PART_CC ||
1881                  partnum == SEARCH_PART_BCC ||
1882                  partnum == SEARCH_PART_DELIVEREDTO) {
1883             return query_new_email(db, prefix, str);
1884         }
1885         else if (partnum == SEARCH_PART_TYPE) {
1886             return query_new_type(db, prefix, str);
1887         }
1888 
1889         // Don't stem queries for Thaana codepage (0780) or higher.
1890         for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
1891             if (*p > 221) //has highbit
1892                 return new Xapian::Query {db->parser->parse_query(
1893                     str,
1894 #ifdef USE_XAPIAN_CJK_WORDS
1895                     Xapian::QueryParser::FLAG_CJK_WORDS,
1896 #else
1897                     Xapian::QueryParser::FLAG_CJK_NGRAM,
1898 #endif
1899                     prefix)};
1900         }
1901 
1902         // Stemable codepage.
1903         Xapian::TermGenerator::stem_strategy stem_strategy =
1904             get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);
1905 
1906         Xapian::Query *qq = query_new_textmatch(db, str, prefix, stem_strategy);
1907         if (qq->get_type() == Xapian::Query::LEAF_MATCH_NOTHING) {
1908             delete qq;
1909             qq = NULL;
1910         }
1911         return qq;
1912 
1913     } catch (const Xapian::Error &err) {
1914         xsyslog(LOG_ERR, "IOERROR: caught exception",
1915                          "exception=<%s>",
1916                          err.get_description().c_str());
1917         return 0;
1918     }
1919 }
1920 
1921 EXPORTED xapian_query_t *
xapian_query_new_match(const xapian_db_t * db,int partnum,const char * str)1922 xapian_query_new_match(const xapian_db_t *db, int partnum, const char *str)
1923 {
1924     if (db->subdbs->empty()) {
1925         // no database to query
1926         return NULL;
1927     }
1928 
1929     const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1930     if (!prefix) {
1931         return NULL;
1932     }
1933 
1934     int min_version = *db->db_versions->begin();
1935     if (min_version < XAPIAN_DB_MIN_SUPPORTED_VERSION) {
1936         xsyslog(LOG_WARNING,
1937                 "deprecated database version, reindex required",
1938                 "version=<%d> min_supported_version=<%d> paths=<%s>",
1939                 min_version, XAPIAN_DB_MIN_SUPPORTED_VERSION,
1940                 db->paths->c_str());
1941     }
1942 
1943     Xapian::Query *q = xapian_query_new_match_internal(db, partnum, str);
1944     if (min_version < 15) {
1945         /* Older versions indexed header fields in Cyrus search form */
1946         charset_t utf8 = charset_lookupname("utf-8");
1947         char *mystr = charset_convert(str, utf8, charset_flags);
1948         if (mystr) {
1949             Xapian::Query *qq = xapian_query_new_match_internal(db, partnum, mystr);
1950             if (qq && q) {
1951                 *q |= *qq;
1952             }
1953             else if (!q) q = qq;
1954         }
1955         free(mystr);
1956         charset_free(&utf8);
1957     }
1958     return (xapian_query_t*) q;
1959 }
1960 
1961 EXPORTED xapian_query_t *
xapian_query_new_compound(const xapian_db_t * db,int is_or,xapian_query_t ** children,int n)1962 xapian_query_new_compound(const xapian_db_t *db __attribute__((unused)),
1963                           int is_or, xapian_query_t **children, int n)
1964 {
1965     try {
1966         // I want to use std::initializer_list<Xapian::Query*> here
1967         // but that requires "experimental" gcc C++0x support :(
1968         // 'compound' owns a refcount on each child.  We need to
1969         // drop the one we got when we allocated the children
1970         Xapian::Query* compound = new Xapian::Query;
1971         if (is_or)
1972             for (int i = 0 ; i < n ; i++) {
1973                 *compound |= *(Xapian::Query*)children[i];
1974                 delete (Xapian::Query*)children[i];
1975             }
1976         else
1977             for (int i = 0 ; i < n ; i++) {
1978                 if (compound->empty())
1979                     *compound = *(Xapian::Query*)children[i];
1980                 else
1981                     *compound &= *(Xapian::Query*)children[i];
1982                 delete (Xapian::Query*)children[i];
1983             }
1984         return (xapian_query_t *)compound;
1985     }
1986     catch (const Xapian::Error &err) {
1987         xsyslog(LOG_ERR, "IOERROR: caught exception",
1988                          "exception=<%s>",
1989                          err.get_description().c_str());
1990         return 0;
1991     }
1992 }
1993 
1994 /* Xapian does not have an OP_NOT.  WTF?  We fake it with
1995  * OP_AND_NOT where the left child is MatchAll */
1996 EXPORTED xapian_query_t *
xapian_query_new_not(const xapian_db_t * db,xapian_query_t * child)1997 xapian_query_new_not(const xapian_db_t *db __attribute__((unused)),
1998                      xapian_query_t *child)
1999 {
2000     if (!child) return (xapian_query_t*) new Xapian::Query(Xapian::Query::MatchAll);
2001 
2002     try {
2003         Xapian::Query *qq = new Xapian::Query(
2004                                         Xapian::Query::OP_AND_NOT,
2005                                         Xapian::Query::MatchAll,
2006                                         *(Xapian::Query *)child);
2007         // 'compound' owns a refcount on each child.  We need to
2008         // drop the one we got when we allocated the children
2009         delete (Xapian::Query *)child;
2010         return (xapian_query_t *)qq;
2011     }
2012     catch (const Xapian::Error &err) {
2013         xsyslog(LOG_ERR, "IOERROR: caught exception",
2014                          "exception=<%s>",
2015                          err.get_description().c_str());
2016         return 0;
2017     }
2018 }
2019 
2020 EXPORTED xapian_query_t *
xapian_query_new_matchall(const xapian_db_t * db)2021 xapian_query_new_matchall(const xapian_db_t *db __attribute__((unused)))
2022 {
2023     return (xapian_query_t *) new Xapian::Query(Xapian::Query::MatchAll);
2024 }
2025 
2026 EXPORTED xapian_query_t *
xapian_query_new_has_doctype(const xapian_db_t * db,char doctype,xapian_query_t * child)2027 xapian_query_new_has_doctype(const xapian_db_t *db __attribute__((unused)),
2028                              char doctype, xapian_query_t *child)
2029 {
2030     try {
2031         Xapian::Query *qq = new Xapian::Query(
2032                                         Xapian::Query::OP_FILTER,
2033                                         child ? *(Xapian::Query *)child : Xapian::Query::MatchAll,
2034                                         std::string("XE") + doctype);
2035         // 'compound' owns a refcount on each child.  We need to
2036         // drop the one we got when we allocated the children
2037         delete (Xapian::Query *)child;
2038         return (xapian_query_t *)qq;
2039     }
2040     catch (const Xapian::Error &err) {
2041         xsyslog(LOG_ERR, "IOERROR: caught exception",
2042                          "exception=<%s>",
2043                          err.get_description().c_str());
2044         return 0;
2045     }
2046 }
2047 
xapian_query_free(xapian_query_t * qq)2048 EXPORTED void xapian_query_free(xapian_query_t *qq)
2049 {
2050     try {
2051         delete (Xapian::Query *)qq;
2052     }
2053     catch (const Xapian::Error &err) {
2054         xsyslog(LOG_ERR, "IOERROR: caught exception",
2055                          "exception=<%s>",
2056                          err.get_description().c_str());
2057     }
2058 }
2059 
xapian_query_run(const xapian_db_t * db,const xapian_query_t * qq,int is_legacy,int (* cb)(void * data,size_t n,void * rock),void * rock)2060 EXPORTED int xapian_query_run(const xapian_db_t *db, const xapian_query_t *qq,
2061                               int is_legacy,
2062                               int (*cb)(void *data, size_t n, void *rock), void *rock)
2063 {
2064     const Xapian::Query *query = (const Xapian::Query *)qq;
2065     void *data = NULL;
2066     size_t n = 0;
2067 
2068     if ((is_legacy && !db->legacydbv4) || (!is_legacy && !db->database)) return 0;
2069 
2070     try {
2071         Xapian::Database *database = is_legacy ? db->legacydbv4 : db->database;
2072         Xapian::Enquire enquire(*database);
2073         enquire.set_query(*query);
2074         enquire.set_sort_by_value(0, false); // sort by cyrusid ascending
2075         Xapian::MSet matches = enquire.get_mset(0, database->get_doccount());
2076         size_t size = matches.size();
2077         if (size) data = xzmalloc(size * 41);
2078         for (Xapian::MSetIterator i = matches.begin() ; i != matches.end() ; ++i) {
2079             const Xapian::Document& d = i.get_document();
2080             const std::string& cyrusid = d.get_value(SLOT_CYRUSID);
2081 
2082             /* ignore documents with no cyrusid.  Shouldn't happen, but has been seen */
2083             if (cyrusid.length() != 43) {
2084                 xsyslog(LOG_ERR, "IOERROR: skipping document with zero-length cyrusid",
2085                                  "documentid=<%u> paths=<%s>",
2086                                  d.get_docid(), db->paths->c_str());
2087                 continue;
2088             }
2089             const char *cstr = cyrusid.c_str();
2090             if (cstr[0] != '*' || !isalpha(cstr[1]) || cstr[2] != '*') {
2091                 xsyslog(LOG_ERR, "IOERROR: skipping document with invalid cyrusid",
2092                                  "cyrusid=<%s> documentid=<%u> paths=<%s>",
2093                                  cstr, d.get_docid(), db->paths->c_str());
2094                 continue;
2095             }
2096             if (n >= size) throw Xapian::DatabaseError("Too many records in MSet");
2097             char *entry = (char *) data + (41*n);
2098             memcpy(entry, cstr+3, 40);
2099             entry[40] = '\0';
2100             ++n;
2101         }
2102     }
2103     catch (const Xapian::Error &err) {
2104         xsyslog(LOG_ERR, "IOERROR: caught exception",
2105                          "exception=<%s> query=<%s>",
2106                          err.get_description().c_str(),
2107                          query ? query->get_description().c_str() : "");
2108         free(data);
2109         return IMAP_IOERROR;
2110     }
2111 
2112     if (!n) {
2113         free(data);
2114         return 0;
2115     }
2116 
2117     int r = cb(data, n, rock);
2118     free(data);
2119     return r;
2120 }
2121 
2122 /* ====================================================================== */
2123 
/* State of a snippet generator: the match terms to highlight, the
 * document currently being processed and the snippet text built so far.
 * Allocated zero-filled by xapian_snipgen_new(), released by
 * xapian_snipgen_free(). */
struct xapian_snipgen
{
    Xapian::Stem *default_stemmer;          /* fallback stemmer, owned */
    xapian_db_t *db;                        /* database passed to xapian_snipgen_new(), not owned */
    Xapian::Database *memdb;                /* in-memory database the snippet Enquire runs on, owned */
    std::vector<std::string> *loose_terms;  /* plain match strings, allocated on first use */
    std::vector<std::string> *queries;      /* quoted or wildcard ('*') match strings, allocated on first use */
    char *cyrusid;                          /* cyrusid of the current document, set by xapian_snipgen_begin_doc() */
    char doctype;                           /* doctype of the current document */
    struct buf *buf;                        /* snippet text accumulated for the current document */
    const char *hi_start;                   /* passed to MSet::snippet() as highlight start, not copied */
    const char *hi_end;                     /* passed to MSet::snippet() as highlight end, not copied */
    const char *omit;                       /* passed to MSet::snippet() as omission marker, not copied */
    size_t max_len;                         /* snippet length limit, from IMAPOPT_SEARCH_SNIPPET_LENGTH */
};
2139 
2140 EXPORTED xapian_snipgen_t *
xapian_snipgen_new(xapian_db_t * db,const char * hi_start,const char * hi_end,const char * omit)2141 xapian_snipgen_new(xapian_db_t *db,
2142                    const char *hi_start,
2143                    const char *hi_end,
2144                    const char *omit)
2145 {
2146     xapian_snipgen_t *snipgen = (xapian_snipgen_t *)xzmalloc(sizeof(xapian_snipgen_t));
2147     snipgen->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
2148     snipgen->db = db;
2149     snipgen->memdb = new Xapian::WritableDatabase(std::string(), Xapian::DB_BACKEND_INMEMORY);
2150     snipgen->buf = buf_new();
2151     snipgen->hi_start = hi_start;
2152     snipgen->hi_end = hi_end;
2153     snipgen->omit = omit;
2154     snipgen->max_len = (size_t) config_getint(IMAPOPT_SEARCH_SNIPPET_LENGTH);
2155 
2156     return snipgen;
2157 }
2158 
xapian_snipgen_free(xapian_snipgen_t * snipgen)2159 EXPORTED void xapian_snipgen_free(xapian_snipgen_t *snipgen)
2160 {
2161     if (!snipgen) return;
2162     delete snipgen->default_stemmer;
2163     delete snipgen->loose_terms;
2164     delete snipgen->queries;
2165     delete snipgen->memdb;
2166     free(snipgen->cyrusid);
2167     buf_destroy(snipgen->buf);
2168     free(snipgen);
2169 }
2170 
xapian_snipgen_build_query(xapian_snipgen_t * snipgen,Xapian::Stem & stemmer)2171 static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapian::Stem& stemmer)
2172 {
2173     Xapian::TermGenerator term_generator;
2174     Xapian::Query q;
2175 
2176     if (snipgen->loose_terms) {
2177         /* Add loose query terms */
2178         term_generator.set_stemmer(stemmer);
2179 #ifdef USE_XAPIAN_CJK_WORDS
2180         term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
2181                 ~Xapian::TermGenerator::FLAG_CJK_WORDS);
2182 #else
2183         term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
2184                 ~Xapian::TermGenerator::FLAG_CJK_NGRAM);
2185 #endif
2186 
2187         for(size_t i = 0; i < snipgen->loose_terms->size(); ++i)
2188         {
2189             term_generator.index_text(Xapian::Utf8Iterator((*snipgen->loose_terms)[i]));
2190         }
2191 
2192         const Xapian::Document& doc = term_generator.get_document();
2193         q = Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end());
2194     }
2195 
2196     if (snipgen->queries) {
2197         /* Add phrase queries */
2198         unsigned flags = Xapian::QueryParser::FLAG_PHRASE|
2199                          Xapian::QueryParser::FLAG_WILDCARD|
2200 #ifdef USE_XAPIAN_CJK_WORDS
2201                          Xapian::QueryParser::FLAG_CJK_WORDS;
2202 #else
2203                          Xapian::QueryParser::FLAG_CJK_NGRAM;
2204 #endif
2205         Xapian::QueryParser queryparser;
2206         queryparser.set_stemmer(stemmer);
2207         for(size_t i = 0; i < snipgen->queries->size(); ++i) {
2208             q |= queryparser.parse_query((*snipgen->queries)[i], flags);;
2209         }
2210     }
2211 
2212     return q;
2213 }
2214 
xapian_snipgen_add_match(xapian_snipgen_t * snipgen,const char * match)2215 EXPORTED int xapian_snipgen_add_match(xapian_snipgen_t *snipgen,
2216                                       const char *match)
2217 {
2218     size_t len = strlen(match);
2219     bool is_query = len > 1 && ((match[0] == '"' && match[len-1] == '"') ||
2220                                 (strchr(match, '*') != NULL));
2221 
2222     if (is_query) {
2223         if (!snipgen->queries) {
2224             snipgen->queries = new std::vector<std::string>;
2225         }
2226         snipgen->queries->push_back(match);
2227     } else {
2228         if (!snipgen->loose_terms) {
2229             snipgen->loose_terms = new std::vector<std::string>;
2230         }
2231         snipgen->loose_terms->push_back(match);
2232     }
2233 
2234     return 0;
2235 }
2236 
xapian_snipgen_begin_doc(xapian_snipgen_t * snipgen,const struct message_guid * guid,char doctype)2237 EXPORTED int xapian_snipgen_begin_doc(xapian_snipgen_t *snipgen,
2238                                       const struct message_guid *guid,
2239                                       char doctype)
2240 {
2241     struct buf buf = BUF_INITIALIZER;
2242     make_cyrusid(&buf, guid, doctype);
2243     snipgen->cyrusid = buf_release(&buf);
2244     snipgen->doctype = doctype;
2245 
2246     buf_reset(snipgen->buf);
2247     return 0;
2248 }
2249 
xapian_snipgen_make_snippet(xapian_snipgen_t * snipgen,const struct buf * part,Xapian::Stem * stemmer)2250 EXPORTED int xapian_snipgen_make_snippet(xapian_snipgen_t *snipgen,
2251                                          const struct buf *part,
2252                                          Xapian::Stem* stemmer)
2253 {
2254     int r = 0;
2255     try {
2256         std::string text {buf_base(part), buf_len(part)};
2257         Xapian::Enquire enquire(*snipgen->memdb);
2258         Xapian::Query qq = xapian_snipgen_build_query(snipgen, *stemmer);
2259         if (qq.empty()) return 0;
2260         enquire.set_query(qq);
2261 
2262         unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE |
2263                          Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
2264 #ifdef USE_XAPIAN_CJK_WORDS
2265         flags |= Xapian::MSet::SNIPPET_CJK_WORDS;
2266 #endif
2267 
2268         const std::string snippet = enquire.get_mset(0, 0).snippet(text,
2269                 snipgen->max_len - buf_len(snipgen->buf),
2270                 *stemmer, flags,
2271                 snipgen->hi_start,
2272                 snipgen->hi_end,
2273                 snipgen->omit);
2274         if (!snippet.empty()) {
2275             if (buf_len(snipgen->buf)) {
2276                 buf_appendoverlap(snipgen->buf, snipgen->omit);
2277             }
2278             buf_appendcstr(snipgen->buf, snippet.c_str());
2279         }
2280     } catch (const Xapian::Error &err) {
2281         xsyslog(LOG_ERR, "IOERROR: caught exception",
2282                          "exception=<%s>",
2283                          err.get_description().c_str());
2284         r = IMAP_IOERROR;
2285     }
2286     return r;
2287 }
2288 
xapian_snipgen_doc_part(xapian_snipgen_t * snipgen,const struct buf * part,int partnum)2289 EXPORTED int xapian_snipgen_doc_part(xapian_snipgen_t *snipgen,
2290                                      const struct buf *part,
2291                                      int partnum __attribute__((unused)))
2292 {
2293     // Ignore empty queries.
2294     if (!snipgen->loose_terms && !snipgen->queries) return 0;
2295 
2296     // Don't exceed allowed snippet length.
2297     if (buf_len(snipgen->buf) >= snipgen->max_len) return 0;
2298 
2299     if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE) &&
2300         snipgen->db->database && snipgen->cyrusid) {
2301         std::set<std::string> doclangs;
2302 
2303         // Lookup stemmer language for this document part, if any.
2304         std::string key = lang_doc_key(snipgen->cyrusid);
2305         for (const Xapian::Database& subdb : *snipgen->db->subdbs) {
2306             std::string val = subdb.get_metadata(key);
2307             if (!val.empty()) parse_doclangs(val, doclangs);
2308             break;
2309         }
2310 
2311         // Generate snippets for each detected message language.
2312         // The first non-empty snippet wins.
2313         size_t prev_size = buf_len(snipgen->buf);
2314         for (std::set<std::string>::iterator it = doclangs.begin(); it != doclangs.end(); ++it) {
2315             const std::string& iso_lang = *it;
2316             if (iso_lang.compare("en")) {
2317                 try {
2318                     Xapian::Stem stemmer = get_stemmer(iso_lang);
2319                     int r = xapian_snipgen_make_snippet(snipgen, part, &stemmer);
2320                     if (!r && prev_size != buf_len(snipgen->buf)) {
2321                         return 0;
2322                     }
2323                 } catch (const Xapian::InvalidArgumentError &err) {
2324                     // ignore unknown stemmer
2325                 }
2326             }
2327         }
2328     }
2329 
2330     /* Using a custom stemmer did not generate a snippet.
2331      * This could be because the query matched using the
2332      * default stemmer, so try generating a snippet with
2333      * that stemmer instead.*/
2334     return xapian_snipgen_make_snippet(snipgen, part, snipgen->default_stemmer);
2335 }
2336 
xapian_snipgen_end_doc(xapian_snipgen_t * snipgen,struct buf * buf)2337 EXPORTED int xapian_snipgen_end_doc(xapian_snipgen_t *snipgen, struct buf *buf)
2338 {
2339     buf_reset(buf);
2340     buf_copy(buf, snipgen->buf);
2341     buf_cstring(buf);
2342     buf_reset(snipgen->buf);
2343 
2344     delete snipgen->loose_terms;
2345     snipgen->loose_terms = NULL;
2346 
2347     delete snipgen->queries;
2348     snipgen->queries = NULL;
2349 
2350     free(snipgen->cyrusid);
2351     snipgen->cyrusid = NULL;
2352     snipgen->doctype = 0;
2353 
2354     return 0;
2355 }
2356 
2357 /* cb returns true if document should be copied, false if not */
xapian_filter(const char * dest,const char ** sources,int (* cb)(const char * cyrusid,void * rock),void * rock)2358 EXPORTED int xapian_filter(const char *dest, const char **sources,
2359                            int (*cb)(const char *cyrusid, void *rock),
2360                            void *rock)
2361 {
2362     int r = 0;
2363     const char *thispath = "(unknown path)";
2364 
2365     try {
2366         /* create a destination database */
2367         Xapian::WritableDatabase destdb {dest, Xapian::DB_CREATE|Xapian::DB_BACKEND_GLASS};
2368 
2369         /* With multiple databases as above, the docids are interleaved, so it
2370          * might be worth trying to open each source and copy its documents to
2371          * destdb in turn for better locality of reference, and so better cache
2372          * use. -- Olly on the mailing list */
2373 
2374         std::vector<Xapian::Database> srcdbs;
2375 
2376         // Open databases and aggregate database-level metadata.
2377         while (*sources) {
2378             thispath = *sources++;
2379             const Xapian::Database srcdb {thispath};
2380             srcdbs.push_back(srcdb);
2381         }
2382 
2383         // Copy all matching documents.
2384         std::set<int> db_versions;
2385 
2386         for (size_t i = 0; i < srcdbs.size(); ++i) {
2387             const Xapian::Database& srcdb = srcdbs.at(i);
2388             bool need_md_versions = false;
2389             std::set<int> md_versions = read_db_versions(srcdb);
2390 
2391             /* copy all matching documents to the new DB */
2392             for (Xapian::ValueIterator it = srcdb.valuestream_begin(SLOT_CYRUSID);
2393                     it != srcdb.valuestream_end(SLOT_CYRUSID); ++it) {
2394                 const std::string& cyrusid = *it;
2395                 const std::string idkey {"cyrusid." + cyrusid};
2396 
2397                 // check if caller wants this cyrusid
2398                 if (!cb(cyrusid.c_str(), rock)) {
2399                     continue;
2400                 }
2401 
2402                 // is it already indexed?
2403                 if (!destdb.get_metadata(idkey).empty()) {
2404                     continue;
2405                 }
2406 
2407                 // is there a subsequent db with a better index level? (only for G docs)
2408                 uint8_t indexlevel = parse_indexlevel(srcdb.get_metadata(idkey));
2409                 if (cyrusid[1] == XAPIAN_WRAP_DOCTYPE_MSG) {
2410                     int found_better = 0;
2411                     for (size_t j = i + 1; !found_better && j < srcdbs.size(); ++j) {
2412                         uint8_t level = parse_indexlevel(srcdbs[j].get_metadata(idkey));
2413                         found_better = better_indexlevel(indexlevel, level) != indexlevel;
2414                     }
2415                     if (found_better) {
2416                         continue;
2417                     }
2418                 }
2419 
2420                 // add document
2421                 Xapian::Document srcdoc = srcdb.get_document(it.get_docid());
2422                 Xapian::docid docid = destdb.add_document(srcdoc);
2423                 destdb.set_metadata(idkey, format_indexlevel(indexlevel));
2424 
2425                 // copy document language metadata
2426                 const std::string& langkey = lang_doc_key(cyrusid.c_str());
2427                 if (destdb.get_metadata(langkey).empty()) {
2428                     std::string val = srcdb.get_metadata(langkey);
2429                     if (!val.empty() && isalpha(val[0])) {
2430                         destdb.set_metadata(langkey, val);
2431                     }
2432                 }
2433                 const std::string& langval = srcdoc.get_value(SLOT_DOCLANGS);
2434                 if (!langval.empty() && !isalpha(langval[0])) {
2435                     destdb.get_document(docid).remove_value(SLOT_DOCLANGS);
2436                 }
2437                 // add document index version
2438                 const std::string& verval = srcdoc.get_value(SLOT_INDEXVERSION);
2439                 if (!verval.empty()) {
2440                     int version = std::atoi(verval.c_str());
2441                     if (version) db_versions.insert(version);
2442                 }
2443                 else need_md_versions = true;
2444             }
2445 
2446             if (need_md_versions) {
2447                 /* At least one added document didn't have its index
2448                  * version slot set in this subdb. Read legacy versions. */
2449                 std::set<int> md_versions = read_db_versions(srcdb);
2450                 db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
2451             }
2452         }
2453 
2454         thispath = "(unknown path)";
2455 
2456         // set the versions
2457         write_db_versions(destdb, db_versions);
2458 
2459         // recalculate language counts
2460         std::map<const std::string, unsigned> lang_counts;
2461         r = calculate_language_counts(destdb, lang_counts);
2462         if (r) {
2463             xsyslog(LOG_ERR, "IOERROR: corrupt metadata",
2464                              "filter=<%s>",
2465                              dest);
2466             return r;
2467         }
2468         write_language_counts(destdb, lang_counts);
2469 
2470         /* commit all changes explicitly */
2471         destdb.commit();
2472     }
2473     catch (const Xapian::Error &err) {
2474         xsyslog(LOG_ERR, "IOERROR: caught exception",
2475                          "exception=<%s> path=<%s>",
2476                          err.get_description().c_str(), thispath);
2477         r = IMAP_IOERROR;
2478     }
2479 
2480     return r;
2481 }
2482 
xapian_version_string()2483 EXPORTED const char *xapian_version_string()
2484 {
2485     return Xapian::version_string();
2486 }
2487 
2488 struct xapian_doc {
2489     Xapian::TermGenerator *termgen;
2490     Xapian::Document *doc;
2491 };
2492 
xapian_doc_new(void)2493 EXPORTED xapian_doc_t *xapian_doc_new(void)
2494 {
2495     xapian_doc_t *doc = (xapian_doc_t *) xzmalloc(sizeof(struct xapian_doc));
2496     doc->doc = new Xapian::Document;
2497     doc->termgen = new Xapian::TermGenerator;
2498     doc->termgen->set_document(*doc->doc);
2499     return doc;
2500 }
2501 
xapian_doc_index_text(xapian_doc_t * doc,const char * text,size_t len)2502 EXPORTED void xapian_doc_index_text(xapian_doc_t *doc,
2503                                     const char *text, size_t len)
2504 {
2505     doc->termgen->index_text(Xapian::Utf8Iterator(text, len));
2506 }
2507 
xapian_doc_termcount(xapian_doc_t * doc)2508 EXPORTED size_t xapian_doc_termcount(xapian_doc_t *doc)
2509 {
2510     return doc->doc->termlist_count();
2511 }
2512 
xapian_doc_foreach_term(xapian_doc_t * doc,int (* cb)(const char *,void *),void * rock)2513 EXPORTED int xapian_doc_foreach_term(xapian_doc_t *doc,
2514                                      int(*cb)(const char*, void*),
2515                                      void *rock)
2516 {
2517     for (Xapian::TermIterator ti = doc->doc->termlist_begin();
2518             ti != doc->doc->termlist_end(); ++ti) {
2519         int r = cb((*ti).c_str(), rock);
2520         if (r) return r;
2521     }
2522     return 0;
2523 }
2524 
xapian_doc_reset(xapian_doc_t * doc)2525 EXPORTED void xapian_doc_reset(xapian_doc_t *doc)
2526 {
2527     doc->doc->clear_values();
2528 }
2529 
xapian_doc_close(xapian_doc_t * doc)2530 EXPORTED void xapian_doc_close(xapian_doc_t *doc)
2531 {
2532     delete doc->termgen;
2533     delete doc->doc;
2534     free(doc);
2535 }
2536