1 #include <errno.h>
2 #include <config.h>
3 #include <string.h>
4 #include <sys/types.h>
5 #include <syslog.h>
6
7 #include <fstream>
8 #include <sstream>
9 #include <algorithm>
10 #include <memory>
11
12 extern "C" {
13 #include <assert.h>
14 #include "libconfig.h"
15 #include "util.h"
16 #include "search_engines.h"
17 #include "search_part.h"
18 #include "xmalloc.h"
19 #include "xapian_wrap.h"
20 #include "charset.h"
21 #include "ptrarray.h"
22 #include "parseaddr.h"
23
24
25 /* generated headers are not necessarily in current directory */
26 #include "imap/imap_err.h"
27 };
28
29 #include <unicode/unistr.h>
30 #include <unicode/locid.h>
31
32 #include <xapian.h>
33
34 #ifdef HAVE_CLD2
35 #include <cld2/public/compact_lang_det.h>
36 #endif
37
38 // from global.h
39 extern int charset_flags;
40
41 #define SLOT_CYRUSID 0
42 #define SLOT_DOCLANGS 1
43 #define SLOT_INDEXLEVEL 2
44 #define SLOT_INDEXVERSION 3
45
46 static const unsigned XAPIAN_MAX_TERM_LENGTH = 200; /* in UTF-8 bytes */
47
48 /* ====================================================================== */
49
make_cyrusid(struct buf * dst,const struct message_guid * guid,char doctype)50 static void make_cyrusid(struct buf *dst, const struct message_guid *guid, char doctype)
51 {
52 buf_reset(dst);
53 buf_putc(dst, '*');
54 buf_putc(dst, doctype);
55 buf_putc(dst, '*');
56 buf_appendcstr(dst, message_guid_encode(guid));
57 buf_cstring(dst);
58 }
59
60 /* ====================================================================== */
61
62 /*
63 * A brief history of Xapian db versions:
64 * Version 0: uses STEM_ALL for all terms, term prefixes don't start with 'X'
65 * Version 1: term prefixes start with 'X'
66 * Version 2: uses STEM_SOME for some terms
67 * Version 3: removes all use of STEM_ALL
68 * Version 4: indexes headers and bodies in separate documents
69 * Version 5: indexes headers and bodies together and stems by language
70 * Version 6: stores all detected languages of a document in slot SLOT_DOCLANGS (deprecated)
71 * Version 7: indexes new DELIVEREDTO search part
72 * Version 8: reintroduces language indexing for non-English text
73 * Version 9: introduces index levels as keys to cyrusid metadata
74 * Version 10: indexes new PRIORITY search part
75 * Version 11: indexes LIST-ID as single value
76 * Version 12: indexes email domains as single values. Supports subdomain search.
77 * Version 13: indexes content-type and subtype separately
78 * Version 14: adds SLOT_INDEXVERSION to documents
79 * Version 15: receives indexed header fields and text in original format (rather than search form)
80 * Version 16: indexes entire addr-spec as a single value. Prevents cross-matching localparts and domains
81 */
82 #define XAPIAN_DB_CURRENT_VERSION 16
83 #define XAPIAN_DB_MIN_SUPPORTED_VERSION 5
84
read_db_versions(const Xapian::Database & database)85 static std::set<int> read_db_versions(const Xapian::Database &database)
86 {
87 std::set<int> versions;
88
89 // db_version is a comma-separated list of version numbers
90 std::string val = database.get_metadata("cyrus.db_version");
91 if (!val.empty()) {
92 strarray_t *vstr = strarray_split(val.c_str(), ",", 0);
93 for (int i = 0; i < strarray_size(vstr); i++) {
94 int version = std::atoi(strarray_nth(vstr, i));
95 if (version) versions.insert(version);
96 }
97 strarray_free(vstr);
98 }
99 // Up to version 3 this was named stem version.
100 val = database.get_metadata("cyrus.stem-version");
101 if (!val.empty()) {
102 versions.insert(std::stoi(val));
103 }
104
105 return versions;
106 }
107
write_db_versions(Xapian::WritableDatabase & database,std::set<int> & versions)108 static void write_db_versions(Xapian::WritableDatabase &database, std::set<int> &versions)
109 {
110 std::ostringstream val;
111 for (std::set<int>::iterator it = versions.begin(); it != versions.end(); ++it) {
112 if (it != versions.begin()) val << ",";
113 val << *it;
114 }
115 database.set_metadata("cyrus.db_version", val.str());
116 database.set_metadata("cyrus.stem-version", "");
117 }
118
119 /* ====================================================================== */
120 #define XAPIAN_LANG_COUNT_KEYPREFIX "lang.count"
121 #define XAPIAN_LANG_DOC_KEYPREFIX "lang.doc"
122
/*
 * Build the term prefix for language iso_lang below the search part
 * prefix: the uppercased concatenation of prefix, "XI" and iso_lang.
 */
static std::string lang_prefix(const std::string& iso_lang, const char *prefix)
{
    std::string term(prefix);
    term.append("XI").append(iso_lang);
    for (char& ch : term) {
        ch = ::toupper(ch);
    }
    return term;
}
129
lang_doc_key(const char * cyrusid)130 static std::string lang_doc_key(const char *cyrusid)
131 {
132 std::string key(XAPIAN_LANG_DOC_KEYPREFIX ".");
133 key += cyrusid;
134 return key;
135 }
136
lang_count_key(const std::string & iso_lang)137 static std::string lang_count_key(const std::string& iso_lang)
138 {
139 std::string key(XAPIAN_LANG_COUNT_KEYPREFIX ".");
140 key += iso_lang;
141 return key;
142 }
143
calculate_language_counts(const Xapian::Database & db,std::map<const std::string,unsigned> & lang_counts)144 static int calculate_language_counts(const Xapian::Database& db,
145 std::map<const std::string, unsigned>& lang_counts)
146 {
147 std::set<int> db_versions = read_db_versions(db);
148
149 if (db_versions.lower_bound(8) == db_versions.begin()) {
150 // count all indexed body parts
151 size_t nparts = 0;
152 for (Xapian::TermIterator it = db.metadata_keys_begin("cyrusid.*P*");
153 it != db.metadata_keys_end("cyrusid.*P*"); ++it) {
154 nparts++;
155 }
156 // count body parts with language metadata
157 const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX ".*P*"};
158 size_t nlangparts = 0;
159 for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
160 it != db.metadata_keys_end(prefix); ++it) {
161 lang_counts[db.get_metadata(*it)] += 1;
162 nlangparts++;
163 }
164 // English or unknown language body parts have no metadata.
165 lang_counts["en"] += nparts - nlangparts;
166 // Sanity check data
167 if (nparts < nlangparts) {
168 return IMAP_IOERROR;
169 }
170 }
171
172 return 0;
173 }
174
remove_legacy_metadata(Xapian::WritableDatabase & db)175 static void remove_legacy_metadata(Xapian::WritableDatabase& db)
176 {
177 const std::string prefix{XAPIAN_LANG_DOC_KEYPREFIX "."};
178 for (Xapian::TermIterator key = db.metadata_keys_begin(prefix);
179 key != db.metadata_keys_end(prefix); ++key) {
180
181 const std::string& val = db.get_metadata(*key);
182 // Remove legacy keys and values.
183 if ((*key).find('.') != std::string::npos ||
184 (!val.empty() && !isalpha(val[0]))) {
185 db.set_metadata(*key, "");
186 }
187 }
188 for (Xapian::docid docid = 1; docid <= db.get_lastdocid(); ++docid) {
189 try {
190 Xapian::Document doc = db.get_document(docid);
191 const std::string& val = doc.get_value(SLOT_DOCLANGS);
192 // Remove legacy doclang slot values.
193 if (!val.empty() && !isalpha(val[0])) {
194 doc.remove_value(SLOT_DOCLANGS);
195 }
196 }
197 catch (Xapian::DocNotFoundError e) {
198 // ignore
199 }
200 }
201 }
202
write_language_counts(Xapian::WritableDatabase & db,const std::map<const std::string,unsigned> & lang_counts)203 static void write_language_counts(Xapian::WritableDatabase& db,
204 const std::map<const std::string, unsigned>& lang_counts)
205 {
206 for (Xapian::TermIterator it = db.metadata_keys_begin(XAPIAN_LANG_COUNT_KEYPREFIX);
207 it != db.metadata_keys_end(XAPIAN_LANG_COUNT_KEYPREFIX); ++it) {
208 db.set_metadata(*it, "");
209 }
210 for (const std::pair<std::string, unsigned>& it : lang_counts) {
211 db.set_metadata(lang_count_key(it.first), std::to_string(it.second));
212 }
213 }
214
read_language_counts(const Xapian::Database & db,std::map<const std::string,unsigned> & lang_counts)215 static void read_language_counts(const Xapian::Database& db,
216 std::map<const std::string, unsigned>& lang_counts)
217 {
218 std::set<int> db_versions = read_db_versions(db);
219
220 if (db_versions.lower_bound(8) == db_versions.begin()) {
221 const std::string prefix(XAPIAN_LANG_COUNT_KEYPREFIX ".");
222 for (Xapian::TermIterator it = db.metadata_keys_begin(prefix);
223 it != db.metadata_keys_end(prefix); ++it) {
224 std::string iso_lang = (*it).substr(prefix.length());
225 unsigned count = std::stol(db.get_metadata(*it));
226 lang_counts[iso_lang] += count;
227 }
228 }
229 }
230
/*
 * Split the comma-separated list val and insert each item into doclangs.
 *
 * Values that are empty or do not start with a letter are ignored as a
 * whole. Empty items (e.g. after a trailing comma) are inserted as empty
 * strings.
 */
static void parse_doclangs(const std::string& val, std::set<std::string>& doclangs)
{
    if (val.empty()) return;
    if (!isalpha(val[0])) return;

    size_t start = 0;
    for (;;) {
        const size_t comma = val.find(',', start);
        if (comma == std::string::npos) {
            // last item: everything after the final comma
            doclangs.insert(val.substr(start));
            break;
        }
        doclangs.insert(val.substr(start, comma - start));
        start = comma + 1;
    }
}
242
/* Join the entries of doclangs, in set order, separated by commas. */
static std::string format_doclangs(const std::set<std::string>& doclangs)
{
    std::string joined;
    bool first = true;
    for (const std::string& lang : doclangs) {
        if (!first) joined += ',';
        joined += lang;
        first = false;
    }
    return joined;
}
253
/*
 * Validate and normalize a language code.
 *
 * Returns the lowercased str if it has the syntax of a two or three letter
 * ISO 639 code, and the empty string otherwise. str must not be NULL.
 */
static std::string parse_langcode(const char *str)
{
    std::string lstr(str);

    // The <ctype.h> functions are only defined for values representable as
    // unsigned char (or EOF), so bytes >= 0x80 must not be passed as plain
    // (possibly negative) char.
    std::transform(lstr.begin(), lstr.end(), lstr.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    // accept syntax for two and three letter ISO 639 codes
    if (lstr.length() != 2 && lstr.length() != 3) {
        return std::string();
    }
    for (unsigned char c : lstr) {
        if (!isalpha(c)) return std::string();
    }
    return lstr;
}
265
266 // Process-scoped, thread-unsafe cache of stoppers by ISO 639 code.
267 static std::map<const std::string, std::unique_ptr<Xapian::Stopper>> stoppers;
268
get_stopper(const std::string & iso)269 static const Xapian::Stopper* get_stopper(const std::string& iso)
270 {
271 // Lookup cached entry.
272 try {
273 return stoppers.at(iso).get();
274 } catch (const std::out_of_range&) {};
275
276 // Lookup language name by ISO code.
277 icu::Locale loc(iso.c_str());
278 if (loc.isBogus()) return NULL;
279
280 // Read stopper file and add to cache.
281 const char *swpath = config_getstring(IMAPOPT_SEARCH_STOPWORD_PATH);
282 if (!swpath) return NULL;
283
284 std::string lang_name;
285 icu::UnicodeString ulang_name;
286 loc.getDisplayLanguage(icu::Locale("en"), ulang_name);
287 ulang_name.toLower();
288 ulang_name.toUTF8String(lang_name);
289
290 // Open stopword file
291 // XXX doesn't play nice with WIN32 paths
292 std::string fname(std::string(swpath) + "/" + lang_name + ".txt");
293 errno = 0;
294 std::ifstream inFile (fname);
295 if (inFile.fail()) {
296 syslog(LOG_DEBUG, "Xapian: could not open stopword file %s: %s",
297 fname.c_str(), errno ? strerror(errno) : "unknown error");
298 return NULL;
299 }
300
301 // Create and store the Xapian stopper
302 stoppers[iso].reset(new Xapian::SimpleStopper(
303 std::istream_iterator<std::string>(inFile),
304 std::istream_iterator<std::string>()));
305 return stoppers[iso].get();
306 }
307
308 class CyrusSearchStemmer : public Xapian::StemImplementation
309 {
310 charset_t utf8 {charset_lookupname("utf-8")};
311 std::map<const std::string, std::string> cache;
312 Xapian::Stem stem {"en"};
313
314 public:
~CyrusSearchStemmer()315 virtual ~CyrusSearchStemmer() { charset_free(&utf8); }
316
operator ()(const std::string & word)317 virtual std::string operator() (const std::string &word) override {
318 // Is this word already in the cache?
319 try {
320 return cache.at(word);
321 } catch (const std::out_of_range&) {}
322
323 // Convert the word to search form
324 std::unique_ptr<char, decltype(std::free)*>
325 q {charset_convert(word.c_str(), utf8, charset_flags), std::free};
326 std::string s = q ? stem(Xapian::Unicode::tolower(q.get())) : stem(word);
327 if (s.size() > XAPIAN_MAX_TERM_LENGTH) return std::string{};
328
329 // Store the normalized word in the cache
330 return cache[word] = s;
331 }
332
get_description() const333 virtual std::string get_description () const override {
334 return "Cyrus";
335 }
336 };
337
338
339 class FrenchContractionStemmer : public Xapian::StemImplementation
340 {
341 Xapian::Stem stem {"fr"};
342
343 public:
344
operator ()(const std::string & word)345 virtual std::string operator() (const std::string &word) override {
346
347 size_t pos = 0;
348 switch (word[0]) {
349 case 'q':
350 if (word.length() <= 3 || word[1] != 'u') {
351 break;
352 }
353 pos++;
354 // fall through
355 case 'c':
356 case 'd':
357 case 'j':
358 case 'l':
359 case 'm':
360 case 'n':
361 case 's':
362 case 't':
363 // APOSTROPHE (U+0027)
364 if (word.length() > pos + 2 && word[pos+1] == 0x27) {
365 return stem(word.substr(pos + 2));
366 }
367 // RIGHT SINGLE QUOTATION MARK (U+2019)
368 // FULLWIDTH APOSTROPHE (U+FF07)
369 else if (!word.compare(pos + 1, 3, "\xe2\x80\x99") ||
370 !word.compare(pos + 1, 3, "\xef\xbc\x87")) {
371 return stem(word.substr(pos + 4));
372 }
373 // fall through
374 }
375 // not a contraction
376 return stem(word);
377 }
378
get_description() const379 virtual std::string get_description () const override {
380 return "fr-contraction";
381 }
382 };
383
get_stemmer(const std::string & iso_lang)384 static Xapian::Stem get_stemmer(const std::string& iso_lang)
385 {
386 return iso_lang == "fr" ?
387 Xapian::Stem{new FrenchContractionStemmer} :
388 Xapian::Stem{iso_lang};
389 }
390
391 #ifdef HAVE_CLD2
detect_language(const struct buf * part)392 static std::string detect_language(const struct buf *part)
393 {
394 std::string iso_lang;
395 bool reliable = false;
396 CLD2::Language lang = CLD2::DetectLanguage(part->s, part->len, 1, &reliable);
397
398 if (reliable && lang != CLD2::UNKNOWN_LANGUAGE) {
399 std::string code(CLD2::LanguageCode(lang));
400 std::transform(code.begin(), code.end(), code.begin(), ::tolower);
401 // Map CLD2 special codes to ISO 639.
402 if (!code.compare("zh-Hant")) {
403 code = "zh";
404 }
405 else if (!code.compare("sr-ME" )) {
406 code = "sr"; // not a political statement!
407 }
408 else if (!code.compare("xxx")) {
409 code = "";
410 }
411 iso_lang = parse_langcode(code.c_str());
412 }
413
414 return iso_lang;
415 }
416 #endif /* HAVE_CLD2 */
417
418 /* ====================================================================== */
419
better_indexlevel(uint8_t levela,uint8_t levelb)420 static uint8_t better_indexlevel(uint8_t levela, uint8_t levelb)
421 {
422 uint8_t a = levela & ~SEARCH_INDEXLEVEL_PARTIAL;
423 uint8_t b = levelb & ~SEARCH_INDEXLEVEL_PARTIAL;
424 if (a > b) return levela;
425 if (a < b) return levelb;
426 return (levela & SEARCH_INDEXLEVEL_PARTIAL) ? levelb : levela;
427 }
428
parse_indexlevel(const std::string & s)429 static uint8_t parse_indexlevel(const std::string& s)
430 {
431 uint8_t level = 0;
432 if (hex_to_bin(s.c_str(), s.length(), &level) != 1) {
433 return 0;
434 }
435 return level;
436 }
437
format_indexlevel(uint8_t level)438 static std::string format_indexlevel(uint8_t level)
439 {
440 char hex[4];
441 bin_to_lchex(&level, 1, hex);
442 return std::string(hex, 2);
443 }
444
445 /* ====================================================================== */
446
447 class CyrusMetadataCompactor : public Xapian::Compactor
448 {
449 public:
450
CyrusMetadataCompactor()451 CyrusMetadataCompactor() { }
452
resolve_duplicate_metadata(const std::string & key,size_t num_tags,const std::string tags[])453 std::string resolve_duplicate_metadata(const std::string &key,
454 size_t num_tags,
455 const std::string tags[])
456 {
457 if (key.rfind("cyrusid.", 0) == 0) {
458 uint8_t indexlevel = parse_indexlevel(tags[0]);
459 size_t bestpos = 0;
460 for (size_t i = 1; i < num_tags; i++) {
461 uint8_t level = parse_indexlevel(tags[i]);
462 if (better_indexlevel(indexlevel, level) == level) {
463 indexlevel = level;
464 bestpos = i;
465 }
466 }
467 return tags[bestpos];
468 }
469
470 return tags[0];
471 }
472 };
473
474
xapian_compact_dbs(const char * dest,const char ** sources)475 EXPORTED int xapian_compact_dbs(const char *dest, const char **sources)
476 {
477 int r = 0;
478 Xapian::Database db;
479 const char *thispath = "(unknown path)";
480
481 try {
482 std::set<int> db_versions;
483 std::map<const std::string, unsigned> lang_counts;
484 std::vector<Xapian::Database> subdbs;
485
486 while (*sources) {
487 thispath = *sources;
488 Xapian::Database subdb(*sources++);
489 db.add_database(subdb);
490 subdbs.push_back(subdb);
491
492 // Aggregate db versions.
493 bool need_metadata = false;
494 for (Xapian::docid docid = 1; docid <= subdb.get_lastdocid(); ++docid) {
495 try {
496 Xapian::Document doc = subdb.get_document(docid);
497 const std::string& val = doc.get_value(SLOT_INDEXVERSION);
498 if (!val.empty()) {
499 int version = std::atoi(val.c_str());
500 if (version) db_versions.insert(version);
501 }
502 else need_metadata = true;
503 }
504 catch (Xapian::DocNotFoundError e) {
505 // ignore
506 }
507 }
508 if (need_metadata) {
509 /* At least one document didn't have its index version set.
510 * Read the legacy version from the metadata. */
511 std::set<int> md_versions = read_db_versions(subdb);
512 db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
513 }
514
515 // Aggregate language counts.
516 r = calculate_language_counts(subdb, lang_counts);
517 if (r) {
518 xsyslog(LOG_ERR, "IOERROR: corrupt language metadata",
519 "path=<%s>", thispath);
520 return r;
521 }
522 }
523 thispath = "(unknown path)";
524
525 // Compact database.
526 static CyrusMetadataCompactor comp;
527 // FULLER because we never write to compression targets again.
528 db.compact(dest, Xapian::Compactor::FULLER | Xapian::DBCOMPACT_MULTIPASS, 0, comp);
529
530 Xapian::WritableDatabase newdb(dest);
531 write_db_versions(newdb, db_versions);
532
533 // Clean metadata.
534 remove_legacy_metadata(newdb);
535
536 // Reset language counts.
537 write_language_counts(newdb, lang_counts);
538 }
539 catch (const Xapian::Error &err) {
540 xsyslog(LOG_ERR, "IOERROR: caught exception",
541 "exception=<%s> path=<%s>",
542 err.get_description().c_str(), thispath);
543 r = IMAP_IOERROR;
544 }
545
546 return r;
547 }
548
549 /* ====================================================================== */
550
get_term_prefix(int db_version,int partnum)551 static const char *get_term_prefix(int db_version, int partnum)
552 {
553 /*
554 * We use term prefixes to store terms per search part.
555 * In addition, each Xapian document contains a "XE"
556 * prefix to indicate its document type, listed in
557 * the XAPIAN_WRAP_DOCTYPE definitions. The "XE" prefix
558 * MUST not be used for any search part.
559 *
560 */
561 static const char * const term_prefixes[SEARCH_NUM_PARTS] = {
562 NULL, /* ANY */
563 "XF", /* FROM */
564 "XT", /* TO */
565 "XC", /* CC */
566 "XB", /* BCC */
567 "XS", /* SUBJECT */
568 "XL", /* LISTID */
569 "XY", /* TYPE */
570 "XH", /* HEADERS */
571 "", /* BODY */
572 "XO", /* LOCATION */
573 "XA", /* ATTACHMENTNAME */
574 "XAB", /* ATTACHMENTBODY */
575 "XDT", /* DELIVEREDTO */
576 "XI", /* LANGUAGE */
577 "XP" /* PRIORITY */
578 };
579
580 static const char * const term_prefixes_v0[SEARCH_NUM_PARTS] = {
581 NULL, /* ANY */
582 "F", /* FROM */
583 "T", /* TO */
584 "C", /* CC */
585 "B", /* BCC */
586 "S", /* SUBJECT */
587 "L", /* LISTID */
588 "Y", /* TYPE */
589 "H", /* HEADERS */
590 "D", /* BODY */
591 "O", /* LOCATION */
592 "A", /* ATTACHMENTNAME */
593 "AB", /* ATTACHMENTBODY */
594 "E", /* DELIVEREDTO */
595 NULL, /* LANGUAGE */
596 NULL /* PRIORITY */
597 };
598
599 return db_version > 0 ? term_prefixes[partnum] : term_prefixes_v0[partnum];
600 }
601
get_stem_strategy(int db_version,int partnum)602 static Xapian::TermGenerator::stem_strategy get_stem_strategy(int db_version, int partnum)
603 {
604 static Xapian::TermGenerator::stem_strategy stem_strategy[SEARCH_NUM_PARTS] = {
605 // Version 2 and higher
606 Xapian::TermGenerator::STEM_NONE, /* ANY */
607 Xapian::TermGenerator::STEM_NONE, /* FROM */
608 Xapian::TermGenerator::STEM_NONE, /* TO */
609 Xapian::TermGenerator::STEM_NONE, /* CC */
610 Xapian::TermGenerator::STEM_NONE, /* BCC */
611 Xapian::TermGenerator::STEM_SOME, /* SUBJECT */
612 Xapian::TermGenerator::STEM_NONE, /* LISTID */
613 Xapian::TermGenerator::STEM_NONE, /* TYPE */
614 Xapian::TermGenerator::STEM_NONE, /* HEADERS */
615 Xapian::TermGenerator::STEM_SOME, /* BODY */
616 Xapian::TermGenerator::STEM_SOME, /* LOCATION */
617 Xapian::TermGenerator::STEM_NONE, /* ATTACHMENTNAME */
618 Xapian::TermGenerator::STEM_SOME, /* ATTACHMENTBODY */
619 Xapian::TermGenerator::STEM_NONE, /* DELIVEREDTO */
620 Xapian::TermGenerator::STEM_NONE, /* LANGUAGE */
621 Xapian::TermGenerator::STEM_NONE /* PRIORITY */
622 };
623
624 static Xapian::TermGenerator::stem_strategy stem_strategy_v1[SEARCH_NUM_PARTS] = {
625 // Version 1: Stem bodies using STEM_SOME with stopwords
626 Xapian::TermGenerator::STEM_NONE, /* ANY */
627 Xapian::TermGenerator::STEM_ALL, /* FROM */
628 Xapian::TermGenerator::STEM_ALL, /* TO */
629 Xapian::TermGenerator::STEM_ALL, /* CC */
630 Xapian::TermGenerator::STEM_ALL, /* BCC */
631 Xapian::TermGenerator::STEM_ALL, /* SUBJECT */
632 Xapian::TermGenerator::STEM_ALL, /* LISTID */
633 Xapian::TermGenerator::STEM_ALL, /* TYPE */
634 Xapian::TermGenerator::STEM_ALL, /* HEADERS */
635 Xapian::TermGenerator::STEM_SOME, /* BODY */
636 Xapian::TermGenerator::STEM_SOME, /* LOCATION */
637 Xapian::TermGenerator::STEM_NONE, /* ATTACHMENTNAME */
638 Xapian::TermGenerator::STEM_SOME, /* ATTACHMENTBODY */
639 Xapian::TermGenerator::STEM_ALL, /* DELIVEREDTO */
640 Xapian::TermGenerator::STEM_NONE, /* LANGUAGE */
641 Xapian::TermGenerator::STEM_NONE /* PRIORITY */
642 };
643
644 static Xapian::TermGenerator::stem_strategy stem_strategy_v0[SEARCH_NUM_PARTS] = {
645 // Version 0: Initial version
646 Xapian::TermGenerator::STEM_NONE, /* ANY */
647 Xapian::TermGenerator::STEM_ALL, /* FROM */
648 Xapian::TermGenerator::STEM_ALL, /* TO */
649 Xapian::TermGenerator::STEM_ALL, /* CC */
650 Xapian::TermGenerator::STEM_ALL, /* BCC */
651 Xapian::TermGenerator::STEM_ALL, /* SUBJECT */
652 Xapian::TermGenerator::STEM_ALL, /* LISTID */
653 Xapian::TermGenerator::STEM_ALL, /* TYPE */
654 Xapian::TermGenerator::STEM_ALL, /* HEADERS */
655 Xapian::TermGenerator::STEM_ALL, /* BODY */
656 Xapian::TermGenerator::STEM_ALL, /* LOCATION */
657 Xapian::TermGenerator::STEM_ALL, /* ATTACHMENTNAME */
658 Xapian::TermGenerator::STEM_ALL, /* ATTACHMENTBODY */
659 Xapian::TermGenerator::STEM_ALL, /* DELIVEREDTO */
660 Xapian::TermGenerator::STEM_NONE, /* LANGUAGE */
661 Xapian::TermGenerator::STEM_NONE /* PRIORITY */
662 };
663
664 switch (db_version) {
665 case 0:
666 return stem_strategy_v0[partnum];
667 case 1:
668 return stem_strategy_v1[partnum];
669 default:
670 return stem_strategy[partnum];
671 }
672 }
673
674 /* For all db paths in sources that are not using the latest database
675 * version or not readable, report their paths in toreindex */
xapian_check_if_needs_reindex(const strarray_t * sources,strarray_t * toreindex,int always_upgrade)676 EXPORTED void xapian_check_if_needs_reindex(const strarray_t *sources,
677 strarray_t *toreindex,
678 int always_upgrade)
679 {
680 // Check the version of all dbs in sources
681 for (int i = 0; i < sources->count; i++) {
682 const char *thispath = strarray_nth(sources, i);
683 try {
684 for (const int& it: read_db_versions(Xapian::Database{thispath})) {
685 if (it < XAPIAN_DB_MIN_SUPPORTED_VERSION ||
686 (always_upgrade && (it != XAPIAN_DB_CURRENT_VERSION))) {
687 strarray_add(toreindex, thispath);
688 }
689 }
690 }
691 catch (const Xapian::Error &err) {
692 strarray_add(toreindex, thispath);
693 }
694 }
695 }
696
697 /* ====================================================================== */
698
add_boolean_nterm(Xapian::Document & doc,const std::string & term,size_t n=XAPIAN_MAX_TERM_LENGTH)699 static inline void add_boolean_nterm(Xapian::Document& doc,
700 const std::string& term,
701 size_t n = XAPIAN_MAX_TERM_LENGTH)
702 {
703 if (term.size() && term.size() < n) {
704 doc.add_boolean_term(term);
705 }
706 }
707
708 struct xapian_dbw
709 {
710 // Database context.
711 Xapian::WritableDatabase *database;
712 ptrarray_t otherdbs;
713 Xapian::TermGenerator *term_generator;
714 Xapian::Stem *default_stemmer;
715 const Xapian::Stopper* default_stopper;
716 // Document context.
717 Xapian::Document *document;
718 char doctype;
719 char *cyrusid;
720 std::set<std::string> *doclangs;
721 std::vector<std::string> *subjects;
722 };
723
724
xapian_dbw_init(xapian_dbw_t * dbw)725 static int xapian_dbw_init(xapian_dbw_t *dbw)
726 {
727 dbw->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
728 dbw->default_stopper = get_stopper("en");
729 dbw->term_generator = new Xapian::TermGenerator;
730 dbw->term_generator->set_max_word_length(XAPIAN_MAX_TERM_LENGTH);
731 /* Always enable CJK word tokenization */
732 #ifdef USE_XAPIAN_CJK_WORDS
733 dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
734 ~Xapian::TermGenerator::FLAG_CJK_WORDS);
735 #else
736 dbw->term_generator->set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
737 ~Xapian::TermGenerator::FLAG_CJK_NGRAM);
738 #endif
739 dbw->doclangs = new std::set<std::string>;
740 dbw->subjects = new std::vector<std::string>;
741 return 0;
742 }
743
xapian_dbw_open(const char ** paths,xapian_dbw_t ** dbwp,int mode,int nosync)744 EXPORTED int xapian_dbw_open(const char **paths, xapian_dbw_t **dbwp,
745 int mode, int nosync)
746 {
747 xapian_dbw_t *dbw = (xapian_dbw_t *)xzmalloc(sizeof(xapian_dbw_t));
748 int r = 0;
749 const char *thispath = *paths++;
750
751 std::set<int> db_versions;
752 try {
753 int flags = Xapian::DB_BACKEND_GLASS|Xapian::DB_RETRY_LOCK;
754 if (nosync) flags |= Xapian::DB_DANGEROUS|Xapian::DB_NO_SYNC;
755 try {
756 dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_OPEN};
757 db_versions = read_db_versions(*dbw->database);
758 } catch (Xapian::DatabaseOpeningError &e) {
759 /* It's OK not to atomically create or open, since we can assume
760 * the xapianactive file items to be locked. */
761 dbw->database = new Xapian::WritableDatabase{thispath, flags|Xapian::DB_CREATE};
762 }
763 if (db_versions.find(XAPIAN_DB_CURRENT_VERSION) == db_versions.end()) {
764 // Always index using latest database version.
765 db_versions.insert(XAPIAN_DB_CURRENT_VERSION);
766 write_db_versions(*dbw->database, db_versions);
767 }
768
769 r = xapian_dbw_init(dbw);
770
771 }
772 catch (const Xapian::DatabaseLockError &err) {
773 /* somebody else is already indexing this user. They may be doing a different
774 * mailbox, so we need to re-insert this mailbox into the queue! */
775 r = IMAP_MAILBOX_LOCKED;
776 }
777 catch (const Xapian::Error &err) {
778 xsyslog(LOG_ERR, "IOERROR: caught exception",
779 "exception=<%s> path=<%s>",
780 err.get_description().c_str(), thispath);
781 r = IMAP_IOERROR;
782 }
783
784 if (r) {
785 xapian_dbw_close(dbw);
786 return r;
787 }
788
789 /* open the read-only databases */
790 if (mode == XAPIAN_DBW_XAPINDEXED) {
791 while (*paths) {
792 try {
793 thispath = *paths;
794 ptrarray_append(&dbw->otherdbs, new Xapian::Database{*paths++});
795 }
796 catch (const Xapian::Error &err) {
797 xsyslog(LOG_ERR, "IOERROR: reading database",
798 "exception=<%s> path=<%s>",
799 err.get_description().c_str(), thispath);
800 }
801 }
802 }
803
804 *dbwp = dbw;
805
806 return 0;
807 }
808
xapian_dbw_close(xapian_dbw_t * dbw)809 EXPORTED void xapian_dbw_close(xapian_dbw_t *dbw)
810 {
811 if (!dbw) return;
812 try {
813 delete dbw->database;
814 delete dbw->term_generator;
815 delete dbw->document;
816 delete dbw->default_stemmer;
817 delete dbw->doclangs;
818 delete dbw->subjects;
819 for (int i = 0; i < dbw->otherdbs.count; i++) {
820 delete (Xapian::Database *)ptrarray_nth(&dbw->otherdbs, i);
821 }
822 ptrarray_fini(&dbw->otherdbs);
823 free(dbw->cyrusid);
824 free(dbw);
825 }
826 catch (const Xapian::Error &err) {
827 xsyslog(LOG_ERR, "IOERROR: caught exception",
828 "exception=<%s>",
829 err.get_description().c_str());
830 }
831 }
832
xapian_dbw_begin_txn(xapian_dbw_t * dbw)833 EXPORTED int xapian_dbw_begin_txn(xapian_dbw_t *dbw)
834 {
835 int r = 0;
836 try {
837 dbw->database->begin_transaction();
838 }
839 catch (const Xapian::Error &err) {
840 xsyslog(LOG_ERR, "IOERROR: caught exception",
841 "exception=<%s>",
842 err.get_description().c_str());
843 r = IMAP_IOERROR;
844 }
845 return r;
846 }
847
xapian_dbw_commit_txn(xapian_dbw_t * dbw)848 EXPORTED int xapian_dbw_commit_txn(xapian_dbw_t *dbw)
849 {
850 int r = 0;
851 try {
852 dbw->database->commit_transaction();
853 }
854 catch (const Xapian::Error &err) {
855 xsyslog(LOG_ERR, "IOERROR: caught exception",
856 "exception=<%s>",
857 err.get_description().c_str());
858 r = IMAP_IOERROR;
859 }
860 return r;
861 }
862
xapian_dbw_cancel_txn(xapian_dbw_t * dbw)863 EXPORTED int xapian_dbw_cancel_txn(xapian_dbw_t *dbw)
864 {
865 int r = 0;
866 try {
867 dbw->database->cancel_transaction();
868 }
869 catch (const Xapian::Error &err) {
870 xsyslog(LOG_ERR, "IOERROR: caught exception",
871 "exception=<%s>",
872 err.get_description().c_str());
873 r = IMAP_IOERROR;
874 }
875 return r;
876 }
877
xapian_dbw_begin_doc(xapian_dbw_t * dbw,const struct message_guid * guid,char doctype)878 EXPORTED int xapian_dbw_begin_doc(xapian_dbw_t *dbw,
879 const struct message_guid *guid,
880 char doctype)
881 {
882 int r = 0;
883
884 try {
885 delete dbw->document;
886 dbw->document = new Xapian::Document;
887 dbw->doctype = doctype;
888 /* Set document id and type */
889 struct buf buf = BUF_INITIALIZER;
890 make_cyrusid(&buf, guid, doctype);
891 dbw->document->add_value(SLOT_CYRUSID, buf_cstring(&buf));
892 dbw->cyrusid = buf_release(&buf);
893 add_boolean_nterm(*dbw->document, std::string("XE") + doctype);
894 /* Initialize term generator */
895 dbw->term_generator->set_document(*dbw->document);
896 dbw->term_generator->set_termpos(1);
897 }
898 catch (const Xapian::Error &err) {
899 xsyslog(LOG_ERR, "IOERROR: caught exception",
900 "exception=<%s>",
901 err.get_description().c_str());
902 r = IMAP_IOERROR;
903 }
904 return r;
905 }
906
add_language_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)907 static int add_language_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
908 {
909 std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
910 std::string val = parse_langcode(buf_cstring(part));
911 if (val.empty()) {
912 syslog(LOG_INFO, "Xapian: not a valid ISO 639 code: %s",
913 buf_cstring(part));
914 return 0;
915 }
916 add_boolean_nterm(*dbw->document, prefix + val);
917 return 0;
918 }
919
parse_priority(const char * str)920 static std::string parse_priority(const char *str)
921 {
922 const char *err;
923 uint32_t u;
924 if (parseuint32(str, &err, &u) == -1 || *err || u == 0) {
925 return std::string();
926 }
927 return std::to_string(u);
928 }
929
add_priority_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)930 static int add_priority_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
931 {
932 std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
933 if (buf_len(part)) {
934 std::string val = parse_priority(buf_cstring(part));
935 if (val.empty()) {
936 syslog(LOG_DEBUG, "Xapian: not a valid priority: %s",
937 buf_cstring(part));
938 return 0;
939 }
940 add_boolean_nterm(*dbw->document, prefix + val);
941 }
942 return 0;
943 }
944
/*
 * Extract and normalize the list-id from the header value str.
 *
 * Three forms are recognized:
 *  - RFC 2919: the text after the last '<', up to an optional '>'
 *  - Groups-style: 'list list-id[; contact list-contact]'
 *  - a raw value: everything up to the first whitespace
 *
 * The result has all whitespace removed and is lowercased. It is empty if
 * no list-id was found. str must not be NULL.
 */
static std::string parse_listid(const char *str)
{
    // Header values may contain bytes >= 0x80. The <ctype.h> functions are
    // only defined for unsigned char values, so convert before calling.
    const auto is_space = [](char c) {
        return isspace(static_cast<unsigned char>(c)) != 0;
    };

    std::string val;

    /* Extract list-id */
    const char *start = strrchr(str, '<');
    if (start) {
        /* RFC2919 list-id header (with optional closing bracket) */
        const char *end = strchr(++start, '>');
        if (end)
            val = std::string(start, end - start);
        else
            val = std::string(start);
    }
    else {
        /* Groups-style header: 'list list-id[; contact list-contact]'
         * As seen at Google Group, Yahoo, et al. */
        for (start = str; is_space(*start); start++) {}
        if (!strncasecmp("list", start, 4) && is_space(start[4])) {
            for (start = start + 4; is_space(*start); start++) {}
            if (*start) {
                const char *end = strchr(start, ';');
                if (!end) {
                    val = std::string(start);
                }
                else if (end != start) {
                    val = std::string(start, end - start);
                }
            }
        }
        /* just raw value, that's OK too, like sentry creates. Parse up to first whitespace */
        else {
            const char *end;
            for (end = start; *end && !is_space(*end); end++) {}
            val = std::string(start, end - start);
        }
    }

    /* Normalize list-id */
    val.erase(std::remove_if(val.begin(), val.end(), is_space), val.end());
    std::transform(val.begin(), val.end(), val.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });
    return val;
}
985
add_listid_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)986 static int add_listid_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
987 {
988 std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
989
990 /* Normalize list-id */
991 std::string val = parse_listid(buf_cstring(part));
992 val.erase(std::remove_if(val.begin(), val.end(), isspace), val.end());
993 std::transform(val.begin(), val.end(), val.begin(), ::tolower);
994 if (val.empty()) {
995 syslog(LOG_WARNING, "Xapian: not a valid list-id: %s",
996 buf_cstring(part));
997 return 0;
998 }
999
1000 add_boolean_nterm(*dbw->document, prefix + val);
1001 return 0;
1002 }
1003
add_email_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1004 static int add_email_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1005 {
1006 std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
1007 std::string lpart = Xapian::Unicode::tolower(buf_cstring(part));
1008 struct address_itr itr;
1009 address_itr_init(&itr, lpart.c_str(), 0);
1010
1011 const struct address *addr;
1012 while ((addr = address_itr_next(&itr))) {
1013 if (addr->invalid) {
1014 continue;
1015 }
1016 if (addr->name) {
1017 dbw->term_generator->set_stemmer(Xapian::Stem());
1018 dbw->term_generator->set_stopper(NULL);
1019 dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->name), 1, prefix + 'N');
1020
1021 dbw->term_generator->set_stemmer(Xapian::Stem());
1022 dbw->term_generator->set_stopper(NULL);
1023 dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->name), 1, prefix);
1024 }
1025 if (addr->mailbox) {
1026 // index mailbox as single value
1027 std::string val(addr->mailbox);
1028 // ignore whitespace (as seen in quoted mailboxes)
1029 val.erase(std::remove_if(val.begin(), val.end(), isspace), val.end());
1030 add_boolean_nterm(*dbw->document, prefix + 'L' + val);
1031 // index individual terms
1032 dbw->term_generator->set_stemmer(Xapian::Stem());
1033 dbw->term_generator->set_stopper(NULL);
1034 dbw->term_generator->index_text(Xapian::Utf8Iterator(val), 1, prefix);
1035 }
1036 if (addr->domain && strcmp(addr->domain, "unspecified-domain")) {
1037 // index reversed domain
1038 std::string val;
1039 strarray_t *sa = strarray_split(addr->domain, ".", 0);
1040 val.reserve(buf_len(part));
1041 for (int i = strarray_size(sa) - 1; i >= 0; i--) {
1042 val.append(strarray_nth(sa, i));
1043 if (i > 0) {
1044 val.append(1, '.');
1045 }
1046 }
1047 strarray_free(sa);
1048 add_boolean_nterm(*dbw->document, prefix + "D" + val);
1049 // index individual terms
1050 dbw->term_generator->set_stemmer(Xapian::Stem());
1051 dbw->term_generator->set_stopper(NULL);
1052 dbw->term_generator->index_text(Xapian::Utf8Iterator(addr->domain,
1053 strlen(addr->domain)), 1, prefix);
1054 }
1055
1056 // index entire addr-spec
1057 char *a = address_get_all(addr, /*canon_domain*/1);
1058 if (a) {
1059 add_boolean_nterm(*dbw->document, prefix + 'A' + std::string(a));
1060 free(a);
1061 }
1062 }
1063
1064 address_itr_fini(&itr);
1065 return 0;
1066 }
1067
parse_content_type(const char * str)1068 static std::pair<std::string, std::string> parse_content_type(const char *str)
1069 {
1070 std::pair<std::string, std::string> ret;
1071 struct buf buf = BUF_INITIALIZER;
1072
1073 const char *sep = strchr(str, '/');
1074 if (sep) {
1075 /* type */
1076 buf_setmap(&buf, str, sep - str);
1077 buf_lcase(&buf);
1078 buf_trim(&buf);
1079 ret.first = std::string(buf_cstring(&buf));
1080 /* subtype */
1081 buf_setcstr(&buf, sep + 1);
1082 buf_lcase(&buf);
1083 buf_trim(&buf);
1084 ret.second = std::string(buf_cstring(&buf));
1085 }
1086 else {
1087 /* type or subtype */
1088 buf_setcstr(&buf, str);
1089 buf_lcase(&buf);
1090 buf_trim(&buf);
1091 ret.first = std::string(buf_cstring(&buf));
1092 }
1093
1094 buf_free(&buf);
1095 return ret;
1096 }
1097
add_type_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1098 static int add_type_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1099 {
1100 std::string prefix(get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum));
1101 std::pair<std::string, std::string> ct = parse_content_type(buf_cstring(part));
1102 if (!ct.first.empty()) {
1103 add_boolean_nterm(*dbw->document, prefix + "T" + ct.first);
1104 }
1105 if (!ct.second.empty()) {
1106 add_boolean_nterm(*dbw->document, prefix + "S" + ct.second);
1107 }
1108 if (!ct.first.empty() && !ct.second.empty()) {
1109 add_boolean_nterm(*dbw->document, prefix + ct.first + '/' + ct.second);
1110 }
1111 return 0;
1112 }
1113
add_text_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1114 static int add_text_part(xapian_dbw_t *dbw, const struct buf *part, int partnum)
1115 {
1116 const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1117 int r = 0;
1118
1119 // Index text.
1120 Xapian::TermGenerator::stem_strategy stem_strategy =
1121 get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);
1122 dbw->term_generator->set_stemming_strategy(stem_strategy);
1123
1124 if (stem_strategy != Xapian::TermGenerator::STEM_NONE) {
1125 if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
1126 // Index by language.
1127 #ifndef HAVE_CLD2
1128 // XXX is this really an "IOERROR"?
1129 xsyslog(LOG_ERR, "IOERROR: language indexing requires CLD2 library",
1130 NULL);
1131 return IMAP_IOERROR;
1132 #else
1133
1134 if (search_part_is_body(partnum)) {
1135 const std::string iso_lang = detect_language(part);
1136 if (!iso_lang.empty()) {
1137 if (iso_lang.compare("en")) {
1138 // Stem and index by non-default language.
1139 try {
1140 dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
1141 dbw->term_generator->set_stopper(get_stopper(iso_lang));
1142 dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len),
1143 1, lang_prefix(iso_lang, prefix));
1144 } catch (const Xapian::InvalidArgumentError &err) {
1145 syslog(LOG_DEBUG, "Xapian: no stemmer for language %s",
1146 iso_lang.c_str());
1147 }
1148 }
1149 if (dbw->doctype == 'P') {
1150 // Keep track of stemmer language.
1151 std::string key = lang_doc_key(dbw->cyrusid);
1152 dbw->database->set_metadata(key, iso_lang);
1153 dbw->document->add_value(SLOT_DOCLANGS, iso_lang);
1154 // Update language counts for body parts.
1155 key = lang_count_key(iso_lang);
1156 const std::string val = dbw->database->get_metadata(key);
1157 dbw->database->set_metadata(key, val.empty() ?
1158 "1" : std::to_string(std::stoi(val) + 1));
1159 }
1160 // Store detected languages in document.
1161 dbw->doclangs->insert(iso_lang.c_str());
1162 add_boolean_nterm(*dbw->document, std::string("XI") + iso_lang);
1163 }
1164 }
1165 else if (partnum == SEARCH_PART_SUBJECT) {
1166 // Keep subject text to index by language later.
1167 dbw->subjects->push_back(buf_cstring(part));
1168 }
1169 #endif /* HAVE_CLD2 */
1170 }
1171
1172 // Index with default stemmer.
1173 dbw->term_generator->set_stemmer(*dbw->default_stemmer);
1174 dbw->term_generator->set_stopper(dbw->default_stopper);
1175 } else {
1176 // Index with no stemming.
1177 dbw->term_generator->set_stemmer(Xapian::Stem());
1178 dbw->term_generator->set_stopper(NULL);
1179 }
1180 dbw->term_generator->index_text(Xapian::Utf8Iterator(part->s, part->len), 1, prefix);
1181
1182 return r;
1183 }
1184
xapian_dbw_doc_part(xapian_dbw_t * dbw,const struct buf * part,int partnum)1185 EXPORTED int xapian_dbw_doc_part(xapian_dbw_t *dbw,
1186 const struct buf *part,
1187 int partnum)
1188 {
1189 int r = 0;
1190
1191 if (!get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum)) {
1192 syslog(LOG_ERR, "xapian_wrapper: no prefix for partnum %d", partnum);
1193 return IMAP_INTERNAL;
1194 }
1195
1196 try {
1197 // Handle search parts.
1198 switch (partnum) {
1199 case SEARCH_PART_PRIORITY:
1200 r = add_priority_part(dbw, part, partnum);
1201 break;
1202 case SEARCH_PART_LISTID:
1203 r = add_listid_part(dbw, part, partnum);
1204 break;
1205 case SEARCH_PART_LANGUAGE:
1206 r = add_language_part(dbw, part, partnum);
1207 break;
1208 case SEARCH_PART_FROM:
1209 case SEARCH_PART_TO:
1210 case SEARCH_PART_CC:
1211 case SEARCH_PART_BCC:
1212 case SEARCH_PART_DELIVEREDTO:
1213 r = add_email_part(dbw, part, partnum);
1214 break;
1215 case SEARCH_PART_TYPE:
1216 r = add_type_part(dbw, part, partnum);
1217 break;
1218 default:
1219 r = add_text_part(dbw, part, partnum);
1220 }
1221 // Finalize index.
1222 dbw->term_generator->increase_termpos();
1223 }
1224 catch (const Xapian::Error &err) {
1225 xsyslog(LOG_ERR, "IOERROR: caught exception",
1226 "exception=<%s>",
1227 err.get_description().c_str());
1228 r = IMAP_IOERROR;
1229 }
1230 return r;
1231 }
1232
xapian_dbw_end_doc(xapian_dbw_t * dbw,uint8_t indexlevel)1233 EXPORTED int xapian_dbw_end_doc(xapian_dbw_t *dbw, uint8_t indexlevel)
1234 {
1235 int r = 0;
1236
1237 assert(indexlevel > 0);
1238
1239 try {
1240 if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE)){
1241 // Keep track of languages used in this message.
1242 if (dbw->doctype == 'G') {
1243 std::string val = format_doclangs(*dbw->doclangs);
1244 dbw->database->set_metadata(lang_doc_key(dbw->cyrusid), val);
1245 dbw->document->add_value(SLOT_DOCLANGS, val);
1246 }
1247 // Index subjects by detected document languages.
1248 for (std::set<std::string>::iterator it = dbw->doclangs->begin(); it != dbw->doclangs->end(); ++it) {
1249 std::string iso_lang = *it;
1250 if (iso_lang.compare("en")) {
1251 try {
1252 const char *tp = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, SEARCH_PART_SUBJECT);
1253 std::string prefix = lang_prefix(iso_lang, tp);
1254 dbw->term_generator->set_stemmer(get_stemmer(iso_lang));
1255 dbw->term_generator->set_stopper(get_stopper(iso_lang));
1256 for (const std::string& subject : *dbw->subjects)
1257 dbw->term_generator->index_text(Xapian::Utf8Iterator(subject), 1, prefix);
1258 } catch (const Xapian::InvalidArgumentError &err) {
1259 // ignore unknown stemmer
1260 }
1261 }
1262 }
1263 }
1264 dbw->document->add_value(SLOT_INDEXLEVEL, format_indexlevel(indexlevel));
1265 dbw->document->add_value(SLOT_INDEXVERSION,
1266 std::to_string(XAPIAN_DB_CURRENT_VERSION));
1267 dbw->database->add_document(*dbw->document);
1268 dbw->database->set_metadata("cyrusid." + std::string(dbw->cyrusid),
1269 format_indexlevel(indexlevel));
1270 delete dbw->document;
1271 dbw->document = 0;
1272 dbw->doctype = 0;
1273 free(dbw->cyrusid);
1274 dbw->cyrusid = NULL;
1275 dbw->doclangs->clear();
1276 dbw->subjects->clear();
1277 }
1278 catch (const Xapian::Error &err) {
1279 xsyslog(LOG_ERR, "IOERROR: caught exception",
1280 "exception=<%s>",
1281 err.get_description().c_str());
1282 r = IMAP_IOERROR;
1283 }
1284 return r;
1285 }
1286
xapian_dbw_total_length(xapian_dbw_t * dbw)1287 EXPORTED unsigned long xapian_dbw_total_length(xapian_dbw_t *dbw)
1288 {
1289 unsigned long res = 0;
1290 try {
1291 res = dbw->database->get_total_length();
1292 }
1293 catch (const Xapian::Error &err) {
1294 xsyslog(LOG_ERR, "IOERROR: caught exception",
1295 "exception=<%s>",
1296 err.get_description().c_str());
1297 }
1298 return res;
1299 }
1300
xapian_dbw_is_indexed(xapian_dbw_t * dbw,const struct message_guid * guid,char doctype)1301 EXPORTED uint8_t xapian_dbw_is_indexed(xapian_dbw_t *dbw,
1302 const struct message_guid *guid,
1303 char doctype)
1304 {
1305 struct buf buf = BUF_INITIALIZER;
1306 make_cyrusid(&buf, guid, doctype);
1307 std::string key = "cyrusid." + std::string(buf_cstring(&buf));
1308 buf_free(&buf);
1309
1310 /* indexed in the current DB? */
1311 uint8_t indexlevel = parse_indexlevel(dbw->database->get_metadata(key));
1312 if (indexlevel == SEARCH_INDEXLEVEL_BEST ||
1313 (indexlevel && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
1314 return indexlevel;
1315 }
1316
1317 /* indexed in other DBs? */
1318 for (int i = 0; i < dbw->otherdbs.count; i++) {
1319 Xapian::Database *database = (Xapian::Database *)ptrarray_nth(&dbw->otherdbs, i);
1320 uint8_t level = parse_indexlevel(database->get_metadata(key));
1321 if (level == SEARCH_INDEXLEVEL_BEST ||
1322 (level && doctype == XAPIAN_WRAP_DOCTYPE_PART)) {
1323 return level;
1324 }
1325 else indexlevel = better_indexlevel(indexlevel, level);
1326 }
1327
1328 return indexlevel;
1329 }
1330
1331 /* ====================================================================== */
1332
/*
 * Read-only search handle over a set of Xapian databases.
 *
 * Instances are allocated zero-filled with xzmalloc(), so any member not
 * explicitly set by xapian_db_open() or xapian_db_opendbw() is NULL.
 */
struct xapian_db
{
    // Paths of the opened databases, each followed by a space. Only set by
    // xapian_db_open(); stays NULL for handles from xapian_db_opendbw().
    std::string *paths;
    // All but version 4 databases. Borrowed from the writer (not deleted on
    // close) when dbw is set.
    Xapian::Database *database; // all but version 4 databases
    // Databases that contain version 4 data.
    Xapian::Database *legacydbv4; // version 4 databases
    // Every opened database individually, kept because a combined
    // Xapian::Database gives no access to its sub-databases.
    std::vector<Xapian::Database> *subdbs; // all database subdbs
    // Stemmer and stopper used for default-language queries; the stopper
    // comes from get_stopper("en") and is not deleted by xapian_db_close().
    Xapian::Stem *default_stemmer;
    const Xapian::Stopper* default_stopper;
    // Languages other than "en" for which queries are additionally stemmed.
    std::set<std::string> *stem_languages;
    // Query parser shared by all query builders; they reconfigure its
    // stemmer, stopper and stemming strategy before each use.
    Xapian::QueryParser *parser;
    // Union of the index versions found in all opened databases.
    std::set<int> *db_versions;
    // Writer this handle was opened on by xapian_db_opendbw(), else NULL.
    xapian_dbw_t *dbw;
};
1346
xapian_db_init(xapian_db_t * db)1347 static int xapian_db_init(xapian_db_t *db)
1348 {
1349 int r = 0;
1350
1351 try {
1352 db->parser = new Xapian::QueryParser;
1353 db->parser->set_default_op(Xapian::Query::OP_AND);
1354 db->parser->set_database(db->database ? *db->database : *db->legacydbv4);
1355 db->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
1356 db->default_stopper = get_stopper("en");
1357
1358 // Determine stemmer languages (in addition to English).
1359 db->stem_languages = new std::set<std::string>;
1360 std::map<const std::string, unsigned> lang_counts;
1361 size_t total_doccount = 0;
1362 for (const Xapian::Database& subdb : *db->subdbs) {
1363 read_language_counts(subdb, lang_counts);
1364 total_doccount += subdb.get_doccount();
1365 }
1366 total_doccount /= 2; // Crude estimate.
1367 for (std::pair<const std::string, unsigned>& it : lang_counts) {
1368 if (it.first.compare("en") && ((double) it.second / total_doccount) >= 0.05) {
1369 db->stem_languages->insert(it.first);
1370 }
1371 }
1372 }
1373 catch (const Xapian::Error &err) {
1374 xsyslog(LOG_ERR, "IOERROR: caught exception",
1375 "exception=<%s>",
1376 err.get_description().c_str());
1377 r = IMAP_IOERROR;
1378 }
1379
1380 return r;
1381 }
1382
xapian_db_open(const char ** paths,xapian_db_t ** dbp)1383 EXPORTED int xapian_db_open(const char **paths, xapian_db_t **dbp)
1384 {
1385 xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
1386 const char *thispath = "(unknown)";
1387 int r = 0;
1388
1389 try {
1390 db->paths = new std::string;
1391 while (paths && *paths) {
1392 thispath = *paths++;
1393 Xapian::Database subdb {thispath};
1394 std::set<int> db_versions = read_db_versions(subdb);
1395 if (db_versions.empty()) {
1396 syslog(LOG_ERR, "xapian_wrapper: invalid db version in %s", thispath);
1397 r = IMAP_INTERNAL;
1398 goto done;
1399 }
1400 if (!db->db_versions)
1401 db->db_versions = new std::set<int>;
1402 db->db_versions->insert(db_versions.begin(), db_versions.end());
1403 // Databases with version 4 split indexing by doctype.
1404 if (db_versions.find(4) != db_versions.end()) {
1405 if (!db->legacydbv4) db->legacydbv4 = new Xapian::Database;
1406 db->legacydbv4->add_database(subdb);
1407 }
1408 // Databases with any but version 4 are regular dbs.
1409 if (db_versions.size() > 1 || db_versions.find(4) == db_versions.end()) {
1410 if (!db->database) db->database = new Xapian::Database;
1411 db->database->add_database(subdb);
1412 }
1413
1414 // Xapian database has no API to access subdbs.
1415 if (!db->subdbs) db->subdbs = new std::vector<Xapian::Database>;
1416 db->subdbs->push_back(subdb);
1417
1418 db->paths->append(thispath).push_back(' ');
1419 }
1420 thispath = "(unknown)";
1421
1422 if (!db->database && !db->legacydbv4) {
1423 r = IMAP_NOTFOUND;
1424 goto done;
1425 }
1426
1427 r = xapian_db_init(db);
1428 if (r) goto done;
1429 }
1430 catch (const Xapian::Error &err) {
1431 xsyslog(LOG_ERR, "IOERROR: caught exception",
1432 "exception=<%s> path=<%s>",
1433 err.get_description().c_str(), thispath);
1434 r = IMAP_IOERROR;
1435 }
1436
1437 done:
1438 if (r)
1439 xapian_db_close(db);
1440 else
1441 *dbp = db;
1442
1443 return r;
1444 }
1445
xapian_db_opendbw(struct xapian_dbw * dbw,xapian_db_t ** dbp)1446 EXPORTED int xapian_db_opendbw(struct xapian_dbw *dbw, xapian_db_t **dbp)
1447 {
1448 xapian_db_t *db = (xapian_db_t *)xzmalloc(sizeof(xapian_db_t));
1449
1450 db->dbw = dbw;
1451 db->database = dbw->database;
1452 db->db_versions = new std::set<int>();
1453 std::set<int> dbw_versions = read_db_versions(*dbw->database);
1454 db->db_versions->insert(dbw_versions.begin(), dbw_versions.end());
1455 db->subdbs = new std::vector<Xapian::Database>;
1456 db->subdbs->push_back(*dbw->database);
1457
1458 int r = xapian_db_init(db);
1459 if (r) {
1460 xapian_db_close(db);
1461 db = NULL;
1462 }
1463
1464 *dbp = db;
1465 return r;
1466 }
1467
xapian_db_close(xapian_db_t * db)1468 EXPORTED void xapian_db_close(xapian_db_t *db)
1469 {
1470 if (!db) return;
1471 try {
1472 if (!db->dbw) delete db->database;
1473 delete db->legacydbv4;
1474 delete db->parser;
1475 delete db->paths;
1476 delete db->db_versions;
1477 delete db->default_stemmer;
1478 delete db->stem_languages;
1479 delete db->subdbs;
1480 free(db);
1481 }
1482 catch (const Xapian::Error &err) {
1483 /* XXX - memory leak? */
1484 xsyslog(LOG_ERR, "IOERROR: caught exception",
1485 "exception=<%s>",
1486 err.get_description().c_str());
1487 }
1488 }
1489
xapian_db_langstats(xapian_db_t * db,ptrarray_t * lstats,size_t * nolang)1490 EXPORTED int xapian_db_langstats(xapian_db_t *db, ptrarray_t* lstats,
1491 size_t *nolang)
1492 {
1493 std::map<const std::string, unsigned> lang_counts;
1494 size_t total_part = 0;
1495 size_t total_lang = 0;
1496
1497 for (const Xapian::Database& subdb : *db->subdbs) {
1498 // count body parts
1499 for (Xapian::TermIterator it = subdb.metadata_keys_begin("cyrusid.*P*");
1500 it != subdb.metadata_keys_end("cyrusid.*P*"); ++it) {
1501 total_part++;
1502 }
1503 // cummulate language counts
1504 read_language_counts(subdb, lang_counts);
1505 }
1506 for (const std::pair<const std::string, unsigned>& counts : lang_counts) {
1507 struct search_langstat *stat = (struct search_langstat*)
1508 xzmalloc(sizeof(struct search_langstat));
1509 stat->iso_lang = xstrdup(counts.first.c_str());
1510 stat->count = counts.second;
1511 ptrarray_append(lstats, stat);
1512 total_lang += counts.second;
1513 }
1514 *nolang = total_part > total_lang ? total_part - total_lang : 0;
1515
1516 return 0;
1517 }
1518
xapian_query_add_stemmer(xapian_db_t * db,const char * iso_lang)1519 EXPORTED void xapian_query_add_stemmer(xapian_db_t *db, const char *iso_lang)
1520 {
1521 if (strcmp(iso_lang, "en")) db->stem_languages->insert(iso_lang);
1522 }
1523
xapian_db_has_otherthan_v4_index(const xapian_db_t * db)1524 int xapian_db_has_otherthan_v4_index(const xapian_db_t *db)
1525 {
1526 return db->database != NULL;
1527 }
1528
xapian_db_has_legacy_v4_index(const xapian_db_t * db)1529 int xapian_db_has_legacy_v4_index(const xapian_db_t *db)
1530 {
1531 return db->legacydbv4 != NULL;
1532 }
1533
query_new_textmatch(const xapian_db_t * db,const char * match,const char * prefix,Xapian::TermGenerator::stem_strategy tg_stem_strategy)1534 static Xapian::Query* query_new_textmatch(const xapian_db_t *db,
1535 const char *match,
1536 const char *prefix,
1537 Xapian::TermGenerator::stem_strategy tg_stem_strategy)
1538 {
1539 unsigned flags = Xapian::QueryParser::FLAG_PHRASE |
1540 Xapian::QueryParser::FLAG_WILDCARD;
1541
1542 std::string lmatch = Xapian::Unicode::tolower(match);
1543
1544 if (tg_stem_strategy != Xapian::TermGenerator::STEM_NONE) {
1545
1546 // Query without any stemmer.
1547 db->parser->set_stemmer(Xapian::Stem());
1548 db->parser->set_stopper(NULL);
1549 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1550 Xapian::Query q = db->parser->parse_query(lmatch, flags, prefix);
1551
1552 // Query with default stemmer. But don't stem stopwords.
1553 if (!db->default_stopper || !(*db->default_stopper)(lmatch)) {
1554 db->parser->set_stemmer(*db->default_stemmer);
1555 db->parser->set_stopper(db->default_stopper);
1556 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
1557 q |= db->parser->parse_query(lmatch, flags, prefix);
1558 }
1559
1560 // Stem query for each language detected in the index.
1561 for (const std::string& iso_lang : *db->stem_languages) {
1562 try {
1563 const Xapian::Stopper *stopper = get_stopper(iso_lang);
1564 db->parser->set_stemmer(get_stemmer(iso_lang));
1565 db->parser->set_stopper(stopper);
1566 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
1567 if (!stopper || !(*stopper)(lmatch)) {
1568 q |= db->parser->parse_query(lmatch, flags, lang_prefix(iso_lang, prefix));
1569 }
1570 } catch (const Xapian::InvalidArgumentError &err) {
1571 syslog(LOG_INFO, "Xapian: no stemmer for language %s", iso_lang.c_str());
1572 }
1573 }
1574
1575 return new Xapian::Query(q);
1576 }
1577 else {
1578 db->parser->set_stemmer(Xapian::Stem());
1579 db->parser->set_stopper(NULL);
1580 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1581 return new Xapian::Query {db->parser->parse_query(lmatch, flags, prefix)};
1582 }
1583 }
1584
query_new_language(const xapian_db_t * db,const char * prefix,const char * str)1585 static Xapian::Query *query_new_language(const xapian_db_t *db __attribute__((unused)),
1586 const char *prefix,
1587 const char *str)
1588 {
1589 std::string val = parse_langcode(str);
1590 if (val.empty()) {
1591 syslog(LOG_DEBUG, "Xapian: invalid language in query: %s", str);
1592 return new Xapian::Query(Xapian::Query::MatchNothing);
1593 }
1594 return new Xapian::Query(std::string(prefix) + val);
1595 }
1596
query_new_priority(const xapian_db_t * db,const char * prefix,const char * str)1597 static Xapian::Query *query_new_priority(const xapian_db_t *db __attribute__((unused)),
1598 const char *prefix,
1599 const char *str)
1600 {
1601 std::string val = parse_priority(str);
1602 if (val.empty()) {
1603 syslog(LOG_DEBUG, "Xapian: invalid priority in query: %s", str);
1604 return new Xapian::Query(Xapian::Query::MatchNothing);
1605 }
1606 return new Xapian::Query(std::string(prefix) + val);
1607 }
1608
query_new_listid(const xapian_db_t * db,const char * prefix,const char * str)1609 static Xapian::Query *query_new_listid(const xapian_db_t *db,
1610 const char *prefix,
1611 const char *str)
1612 {
1613 Xapian::Query *q = NULL;
1614
1615 std::string val = parse_listid(str);
1616 if (!val.empty()) {
1617 q = new Xapian::Query(std::string(prefix) + val);
1618 }
1619 else {
1620 syslog(LOG_DEBUG, "Xapian: invalid listid in query: %s", str);
1621 q = new Xapian::Query(Xapian::Query::MatchNothing);
1622 }
1623
1624 if (db->db_versions->lower_bound(11) != db->db_versions->begin()) {
1625 // query in legacy format
1626 db->parser->set_stemmer(Xapian::Stem());
1627 db->parser->set_stopper(NULL);
1628 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1629 q = new Xapian::Query(Xapian::Query::OP_OR, *q,
1630 db->parser->parse_query(str, 0, prefix));
1631 }
1632
1633 return q;
1634 }
1635
query_new_email(const xapian_db_t * db,const char * _prefix,const char * str)1636 static Xapian::Query *query_new_email(const xapian_db_t *db,
1637 const char *_prefix,
1638 const char *str)
1639 {
1640 std::string prefix(_prefix);
1641
1642 unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
1643 Xapian::QueryParser::FLAG_WILDCARD;
1644
1645 db->parser->set_stemmer(Xapian::Stem());
1646 db->parser->set_stopper(NULL);
1647 db->parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
1648
1649 std::string mystr = Xapian::Unicode::tolower(str);
1650 str = mystr.c_str();
1651
1652 const char *atsign = strchr(str, '@');
1653
1654 if (!atsign) {
1655 // query free text
1656 return new Xapian::Query{db->parser->parse_query(str, qpflags, prefix)};
1657 }
1658
1659 Xapian::Query q = Xapian::Query::MatchNothing;
1660
1661 // query name and mailbox (unless just searching for '@domain')
1662 if (atsign > str) {
1663 struct address *addr = NULL;
1664 parseaddr_list(str, &addr);
1665 if (addr && addr->name) {
1666 Xapian::Query qq = db->parser->parse_query(addr->name, qpflags, prefix + 'N');
1667 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1668 q &= qq;
1669 }
1670 else q = qq;
1671 }
1672 if (addr && addr->mailbox) {
1673 // strip the domain from the mailbox
1674 std::string mail(addr->mailbox);
1675 mail.erase(std::remove_if(mail.begin(), mail.end(), isspace), mail.end());
1676 int wildcard = mail[mail.size()-1] == '*';
1677 if (wildcard) {
1678 mail.resize(mail.size()-1);
1679 }
1680 if (!mail.empty()) {
1681 std::string term(prefix + 'L' + mail);
1682 Xapian::Query qq = wildcard ?
1683 Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
1684 Xapian::Query(term);
1685 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1686 q &= qq;
1687 }
1688 else q = qq;
1689 }
1690 }
1691 // ignore @domain - it's being handled below
1692 if (addr) parseaddr_free(addr);
1693 }
1694
1695 // query domain
1696 if (atsign[1]) {
1697 std::string domain;
1698 const char *dstart = atsign + 1;
1699 bool wildcard = *dstart == '*';
1700 if (wildcard) dstart++;
1701 const char *dend;
1702 for (dend = dstart; *dend; dend++) {
1703 char c = *dend;
1704 if (Uisalnum(c) || c == '-' || c == '[' || c == ']' || c == ':') {
1705 continue;
1706 }
1707 else if (c == '.' && (dend-1 == dstart || dend[-2] != '.')) {
1708 continue;
1709 }
1710 else {
1711 break;
1712 }
1713 }
1714 if (dend > dstart) {
1715 strarray_t *sa = strarray_nsplit(dstart, dend - dstart, ".", 0);
1716 for (int i = strarray_size(sa) - 1; i >= 0; i--) {
1717 domain.append(strarray_nth(sa, i));
1718 if (i > 0) {
1719 domain.append(1, '.');
1720 }
1721 }
1722 strarray_free(sa);
1723 if (*dstart == '.') {
1724 domain.append(1, '.');
1725 }
1726 }
1727 if (!domain.empty()) {
1728 std::string term(prefix + 'D' + domain);
1729 Xapian::Query qq = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term) :
1730 Xapian::Query(term);
1731 {
1732 // FIXME - temporarily also search for '@' prefix
1733 std::string term2(prefix + '@' + domain);
1734 Xapian::Query qq2 = wildcard ? Xapian::Query(Xapian::Query::OP_WILDCARD, term2) :
1735 Xapian::Query(term2);
1736 qq |= qq2;
1737 }
1738 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1739 q &= qq;
1740 }
1741 else q = qq;
1742 }
1743 }
1744
1745 if (q.get_type() == q.LEAF_MATCH_ALL) {
1746 q = Xapian::Query::MatchNothing;
1747 }
1748
1749 // query in legacy format as well!
1750 if (db->db_versions->lower_bound(12) != db->db_versions->begin()) {
1751 q |= db->parser->parse_query(str, qpflags, prefix);
1752 }
1753
1754 // query localpart@domain (ONLY if no wildcards)
1755 if ((atsign > str) && atsign[1] && !strchr(str, '*')) {
1756 struct address *addr = NULL;
1757
1758 parseaddr_list(str, &addr);
1759 if (addr) {
1760 char *a = address_get_all(addr, /*canon_domain*/1);
1761 if (a) {
1762 // query 'A' term for index >= 16
1763 std::string term(prefix + 'A' + std::string(a));
1764 Xapian::Query qq =
1765 Xapian::Query(Xapian::Query::OP_AND,
1766 Xapian::Query(Xapian::Query::OP_VALUE_GE,
1767 Xapian::valueno(SLOT_INDEXVERSION),
1768 std::string("16")),
1769 Xapian::Query(term));
1770 if (q.get_type() != q.LEAF_MATCH_NOTHING) {
1771 // otherwise, query 'L' + 'D' terms (as per above)
1772 Xapian::Query qq2 =
1773 Xapian::Query(Xapian::Query::OP_AND,
1774 Xapian::Query(Xapian::Query::OP_VALUE_LE,
1775 Xapian::valueno(SLOT_INDEXVERSION),
1776 std::string("15")),
1777 q);
1778 qq |= qq2;
1779 }
1780
1781 q = qq;
1782 }
1783
1784 parseaddr_free(addr);
1785 free(a);
1786 }
1787 }
1788
1789 return new Xapian::Query(q);
1790 }
1791
append_alnum(struct buf * buf,const char * ss)1792 static void append_alnum(struct buf *buf, const char *ss)
1793 {
1794 const unsigned char *s = (const unsigned char *)ss;
1795
1796 for ( ; *s ; ++s) {
1797 if (Uisalnum(*s))
1798 buf_putc(buf, *s);
1799 }
1800 }
1801
query_new_type(const xapian_db_t * db,const char * _prefix,const char * str)1802 static Xapian::Query *query_new_type(const xapian_db_t *db __attribute__((unused)),
1803 const char *_prefix,
1804 const char *str)
1805 {
1806
1807 std::pair<std::string, std::string> ct = parse_content_type(str);
1808 std::string prefix(_prefix);
1809 Xapian::Query q = Xapian::Query::MatchNothing;
1810
1811 bool query_legacy = db->db_versions->lower_bound(13) != db->db_versions->begin();
1812 struct buf buf = BUF_INITIALIZER;
1813 unsigned qpflags = Xapian::QueryParser::FLAG_PHRASE |
1814 Xapian::QueryParser::FLAG_WILDCARD;
1815
1816 if (!ct.first.empty() && ct.second.empty()) {
1817 /* Match either type or subtype */
1818 if (ct.first != "*") {
1819 q = Xapian::Query(Xapian::Query::OP_OR,
1820 Xapian::Query(prefix + 'T' + ct.first),
1821 Xapian::Query(prefix + 'S' + ct.first));
1822 if (query_legacy) {
1823 append_alnum(&buf, ct.first.c_str());
1824 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1825 }
1826 }
1827 }
1828 else if (ct.first == "*" || ct.second == "*") {
1829 /* Wildcard query */
1830 if (!ct.first.empty() && ct.first != "*") {
1831 /* Match type */
1832 q = Xapian::Query(prefix + 'T' + ct.first);
1833 if (query_legacy) {
1834 append_alnum(&buf, ct.first.c_str());
1835 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1836 }
1837 }
1838 if (!ct.second.empty() && ct.second != "*") {
1839 /* Match subtype */
1840 q = Xapian::Query(prefix + 'S' + ct.second);
1841 if (query_legacy) {
1842 append_alnum(&buf, ct.second.c_str());
1843 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1844 }
1845 }
1846 }
1847 else if (!ct.first.empty() && !ct.second.empty()) {
1848 /* Verbatim search */
1849 q = Xapian::Query(prefix + ct.first + '/' + ct.second);
1850 if (query_legacy) {
1851 append_alnum(&buf, ct.first.c_str());
1852 buf_putc(&buf, '_');
1853 append_alnum(&buf, ct.second.c_str());
1854 q |= db->parser->parse_query(buf_cstring(&buf), qpflags, prefix);
1855 }
1856 }
1857
1858 buf_free(&buf);
1859 return new Xapian::Query(q);
1860 }
1861
1862 EXPORTED Xapian::Query *
xapian_query_new_match_internal(const xapian_db_t * db,int partnum,const char * str)1863 xapian_query_new_match_internal(const xapian_db_t *db, int partnum, const char *str)
1864 {
1865 const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1866
1867 try {
1868 // Handle special value search parts.
1869 if (partnum == SEARCH_PART_LANGUAGE) {
1870 return query_new_language(db, prefix, str);
1871 }
1872 else if (partnum == SEARCH_PART_PRIORITY) {
1873 return query_new_priority(db, prefix, str);
1874 }
1875 else if (partnum == SEARCH_PART_LISTID) {
1876 return query_new_listid(db, prefix, str);
1877 }
1878 else if (partnum == SEARCH_PART_FROM ||
1879 partnum == SEARCH_PART_TO ||
1880 partnum == SEARCH_PART_CC ||
1881 partnum == SEARCH_PART_BCC ||
1882 partnum == SEARCH_PART_DELIVEREDTO) {
1883 return query_new_email(db, prefix, str);
1884 }
1885 else if (partnum == SEARCH_PART_TYPE) {
1886 return query_new_type(db, prefix, str);
1887 }
1888
1889 // Don't stem queries for Thaana codepage (0780) or higher.
1890 for (const unsigned char *p = (const unsigned char *)str; *p; p++) {
1891 if (*p > 221) //has highbit
1892 return new Xapian::Query {db->parser->parse_query(
1893 str,
1894 #ifdef USE_XAPIAN_CJK_WORDS
1895 Xapian::QueryParser::FLAG_CJK_WORDS,
1896 #else
1897 Xapian::QueryParser::FLAG_CJK_NGRAM,
1898 #endif
1899 prefix)};
1900 }
1901
1902 // Stemable codepage.
1903 Xapian::TermGenerator::stem_strategy stem_strategy =
1904 get_stem_strategy(XAPIAN_DB_CURRENT_VERSION, partnum);
1905
1906 Xapian::Query *qq = query_new_textmatch(db, str, prefix, stem_strategy);
1907 if (qq->get_type() == Xapian::Query::LEAF_MATCH_NOTHING) {
1908 delete qq;
1909 qq = NULL;
1910 }
1911 return qq;
1912
1913 } catch (const Xapian::Error &err) {
1914 xsyslog(LOG_ERR, "IOERROR: caught exception",
1915 "exception=<%s>",
1916 err.get_description().c_str());
1917 return 0;
1918 }
1919 }
1920
1921 EXPORTED xapian_query_t *
xapian_query_new_match(const xapian_db_t * db,int partnum,const char * str)1922 xapian_query_new_match(const xapian_db_t *db, int partnum, const char *str)
1923 {
1924 if (db->subdbs->empty()) {
1925 // no database to query
1926 return NULL;
1927 }
1928
1929 const char *prefix = get_term_prefix(XAPIAN_DB_CURRENT_VERSION, partnum);
1930 if (!prefix) {
1931 return NULL;
1932 }
1933
1934 int min_version = *db->db_versions->begin();
1935 if (min_version < XAPIAN_DB_MIN_SUPPORTED_VERSION) {
1936 xsyslog(LOG_WARNING,
1937 "deprecated database version, reindex required",
1938 "version=<%d> min_supported_version=<%d> paths=<%s>",
1939 min_version, XAPIAN_DB_MIN_SUPPORTED_VERSION,
1940 db->paths->c_str());
1941 }
1942
1943 Xapian::Query *q = xapian_query_new_match_internal(db, partnum, str);
1944 if (min_version < 15) {
1945 /* Older versions indexed header fields in Cyrus search form */
1946 charset_t utf8 = charset_lookupname("utf-8");
1947 char *mystr = charset_convert(str, utf8, charset_flags);
1948 if (mystr) {
1949 Xapian::Query *qq = xapian_query_new_match_internal(db, partnum, mystr);
1950 if (qq && q) {
1951 *q |= *qq;
1952 }
1953 else if (!q) q = qq;
1954 }
1955 free(mystr);
1956 charset_free(&utf8);
1957 }
1958 return (xapian_query_t*) q;
1959 }
1960
1961 EXPORTED xapian_query_t *
xapian_query_new_compound(const xapian_db_t * db,int is_or,xapian_query_t ** children,int n)1962 xapian_query_new_compound(const xapian_db_t *db __attribute__((unused)),
1963 int is_or, xapian_query_t **children, int n)
1964 {
1965 try {
1966 // I want to use std::initializer_list<Xapian::Query*> here
1967 // but that requires "experimental" gcc C++0x support :(
1968 // 'compound' owns a refcount on each child. We need to
1969 // drop the one we got when we allocated the children
1970 Xapian::Query* compound = new Xapian::Query;
1971 if (is_or)
1972 for (int i = 0 ; i < n ; i++) {
1973 *compound |= *(Xapian::Query*)children[i];
1974 delete (Xapian::Query*)children[i];
1975 }
1976 else
1977 for (int i = 0 ; i < n ; i++) {
1978 if (compound->empty())
1979 *compound = *(Xapian::Query*)children[i];
1980 else
1981 *compound &= *(Xapian::Query*)children[i];
1982 delete (Xapian::Query*)children[i];
1983 }
1984 return (xapian_query_t *)compound;
1985 }
1986 catch (const Xapian::Error &err) {
1987 xsyslog(LOG_ERR, "IOERROR: caught exception",
1988 "exception=<%s>",
1989 err.get_description().c_str());
1990 return 0;
1991 }
1992 }
1993
1994 /* Xapian does not have an OP_NOT. WTF? We fake it with
1995 * OP_AND_NOT where the left child is MatchAll */
1996 EXPORTED xapian_query_t *
xapian_query_new_not(const xapian_db_t * db,xapian_query_t * child)1997 xapian_query_new_not(const xapian_db_t *db __attribute__((unused)),
1998 xapian_query_t *child)
1999 {
2000 if (!child) return (xapian_query_t*) new Xapian::Query(Xapian::Query::MatchAll);
2001
2002 try {
2003 Xapian::Query *qq = new Xapian::Query(
2004 Xapian::Query::OP_AND_NOT,
2005 Xapian::Query::MatchAll,
2006 *(Xapian::Query *)child);
2007 // 'compound' owns a refcount on each child. We need to
2008 // drop the one we got when we allocated the children
2009 delete (Xapian::Query *)child;
2010 return (xapian_query_t *)qq;
2011 }
2012 catch (const Xapian::Error &err) {
2013 xsyslog(LOG_ERR, "IOERROR: caught exception",
2014 "exception=<%s>",
2015 err.get_description().c_str());
2016 return 0;
2017 }
2018 }
2019
2020 EXPORTED xapian_query_t *
xapian_query_new_matchall(const xapian_db_t * db)2021 xapian_query_new_matchall(const xapian_db_t *db __attribute__((unused)))
2022 {
2023 return (xapian_query_t *) new Xapian::Query(Xapian::Query::MatchAll);
2024 }
2025
2026 EXPORTED xapian_query_t *
xapian_query_new_has_doctype(const xapian_db_t * db,char doctype,xapian_query_t * child)2027 xapian_query_new_has_doctype(const xapian_db_t *db __attribute__((unused)),
2028 char doctype, xapian_query_t *child)
2029 {
2030 try {
2031 Xapian::Query *qq = new Xapian::Query(
2032 Xapian::Query::OP_FILTER,
2033 child ? *(Xapian::Query *)child : Xapian::Query::MatchAll,
2034 std::string("XE") + doctype);
2035 // 'compound' owns a refcount on each child. We need to
2036 // drop the one we got when we allocated the children
2037 delete (Xapian::Query *)child;
2038 return (xapian_query_t *)qq;
2039 }
2040 catch (const Xapian::Error &err) {
2041 xsyslog(LOG_ERR, "IOERROR: caught exception",
2042 "exception=<%s>",
2043 err.get_description().c_str());
2044 return 0;
2045 }
2046 }
2047
xapian_query_free(xapian_query_t * qq)2048 EXPORTED void xapian_query_free(xapian_query_t *qq)
2049 {
2050 try {
2051 delete (Xapian::Query *)qq;
2052 }
2053 catch (const Xapian::Error &err) {
2054 xsyslog(LOG_ERR, "IOERROR: caught exception",
2055 "exception=<%s>",
2056 err.get_description().c_str());
2057 }
2058 }
2059
xapian_query_run(const xapian_db_t * db,const xapian_query_t * qq,int is_legacy,int (* cb)(void * data,size_t n,void * rock),void * rock)2060 EXPORTED int xapian_query_run(const xapian_db_t *db, const xapian_query_t *qq,
2061 int is_legacy,
2062 int (*cb)(void *data, size_t n, void *rock), void *rock)
2063 {
2064 const Xapian::Query *query = (const Xapian::Query *)qq;
2065 void *data = NULL;
2066 size_t n = 0;
2067
2068 if ((is_legacy && !db->legacydbv4) || (!is_legacy && !db->database)) return 0;
2069
2070 try {
2071 Xapian::Database *database = is_legacy ? db->legacydbv4 : db->database;
2072 Xapian::Enquire enquire(*database);
2073 enquire.set_query(*query);
2074 enquire.set_sort_by_value(0, false); // sort by cyrusid ascending
2075 Xapian::MSet matches = enquire.get_mset(0, database->get_doccount());
2076 size_t size = matches.size();
2077 if (size) data = xzmalloc(size * 41);
2078 for (Xapian::MSetIterator i = matches.begin() ; i != matches.end() ; ++i) {
2079 const Xapian::Document& d = i.get_document();
2080 const std::string& cyrusid = d.get_value(SLOT_CYRUSID);
2081
2082 /* ignore documents with no cyrusid. Shouldn't happen, but has been seen */
2083 if (cyrusid.length() != 43) {
2084 xsyslog(LOG_ERR, "IOERROR: skipping document with zero-length cyrusid",
2085 "documentid=<%u> paths=<%s>",
2086 d.get_docid(), db->paths->c_str());
2087 continue;
2088 }
2089 const char *cstr = cyrusid.c_str();
2090 if (cstr[0] != '*' || !isalpha(cstr[1]) || cstr[2] != '*') {
2091 xsyslog(LOG_ERR, "IOERROR: skipping document with invalid cyrusid",
2092 "cyrusid=<%s> documentid=<%u> paths=<%s>",
2093 cstr, d.get_docid(), db->paths->c_str());
2094 continue;
2095 }
2096 if (n >= size) throw Xapian::DatabaseError("Too many records in MSet");
2097 char *entry = (char *) data + (41*n);
2098 memcpy(entry, cstr+3, 40);
2099 entry[40] = '\0';
2100 ++n;
2101 }
2102 }
2103 catch (const Xapian::Error &err) {
2104 xsyslog(LOG_ERR, "IOERROR: caught exception",
2105 "exception=<%s> query=<%s>",
2106 err.get_description().c_str(),
2107 query ? query->get_description().c_str() : "");
2108 free(data);
2109 return IMAP_IOERROR;
2110 }
2111
2112 if (!n) {
2113 free(data);
2114 return 0;
2115 }
2116
2117 int r = cb(data, n, rock);
2118 free(data);
2119 return r;
2120 }
2121
2122 /* ====================================================================== */
2123
/* State of a snippet generator; see the xapian_snipgen_* functions. */
struct xapian_snipgen
{
    Xapian::Stem *default_stemmer;          /* fallback stemmer, created in xapian_snipgen_new */
    xapian_db_t *db;                        /* database consulted for per-document language metadata */
    Xapian::Database *memdb;                /* in-memory database backing the Enquire used for snippets */
    std::vector<std::string> *loose_terms;  /* plain match strings; NULL until one is added */
    std::vector<std::string> *queries;      /* quoted or wildcard match strings; NULL until one is added */
    char *cyrusid;                          /* id of the current document, set by begin_doc */
    char doctype;                           /* doctype of the current document, set by begin_doc */
    struct buf *buf;                        /* snippet text accumulated for the current document */
    const char *hi_start;                   /* passed to MSet::snippet as highlight start marker */
    const char *hi_end;                     /* passed to MSet::snippet as highlight end marker */
    const char *omit;                       /* passed to MSet::snippet as omission marker; also joins parts */
    size_t max_len;                         /* snippet length limit, from IMAPOPT_SEARCH_SNIPPET_LENGTH */
};
2139
2140 EXPORTED xapian_snipgen_t *
xapian_snipgen_new(xapian_db_t * db,const char * hi_start,const char * hi_end,const char * omit)2141 xapian_snipgen_new(xapian_db_t *db,
2142 const char *hi_start,
2143 const char *hi_end,
2144 const char *omit)
2145 {
2146 xapian_snipgen_t *snipgen = (xapian_snipgen_t *)xzmalloc(sizeof(xapian_snipgen_t));
2147 snipgen->default_stemmer = new Xapian::Stem(new CyrusSearchStemmer);
2148 snipgen->db = db;
2149 snipgen->memdb = new Xapian::WritableDatabase(std::string(), Xapian::DB_BACKEND_INMEMORY);
2150 snipgen->buf = buf_new();
2151 snipgen->hi_start = hi_start;
2152 snipgen->hi_end = hi_end;
2153 snipgen->omit = omit;
2154 snipgen->max_len = (size_t) config_getint(IMAPOPT_SEARCH_SNIPPET_LENGTH);
2155
2156 return snipgen;
2157 }
2158
xapian_snipgen_free(xapian_snipgen_t * snipgen)2159 EXPORTED void xapian_snipgen_free(xapian_snipgen_t *snipgen)
2160 {
2161 if (!snipgen) return;
2162 delete snipgen->default_stemmer;
2163 delete snipgen->loose_terms;
2164 delete snipgen->queries;
2165 delete snipgen->memdb;
2166 free(snipgen->cyrusid);
2167 buf_destroy(snipgen->buf);
2168 free(snipgen);
2169 }
2170
xapian_snipgen_build_query(xapian_snipgen_t * snipgen,Xapian::Stem & stemmer)2171 static Xapian::Query xapian_snipgen_build_query(xapian_snipgen_t *snipgen, Xapian::Stem& stemmer)
2172 {
2173 Xapian::TermGenerator term_generator;
2174 Xapian::Query q;
2175
2176 if (snipgen->loose_terms) {
2177 /* Add loose query terms */
2178 term_generator.set_stemmer(stemmer);
2179 #ifdef USE_XAPIAN_CJK_WORDS
2180 term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_WORDS,
2181 ~Xapian::TermGenerator::FLAG_CJK_WORDS);
2182 #else
2183 term_generator.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM,
2184 ~Xapian::TermGenerator::FLAG_CJK_NGRAM);
2185 #endif
2186
2187 for(size_t i = 0; i < snipgen->loose_terms->size(); ++i)
2188 {
2189 term_generator.index_text(Xapian::Utf8Iterator((*snipgen->loose_terms)[i]));
2190 }
2191
2192 const Xapian::Document& doc = term_generator.get_document();
2193 q = Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end());
2194 }
2195
2196 if (snipgen->queries) {
2197 /* Add phrase queries */
2198 unsigned flags = Xapian::QueryParser::FLAG_PHRASE|
2199 Xapian::QueryParser::FLAG_WILDCARD|
2200 #ifdef USE_XAPIAN_CJK_WORDS
2201 Xapian::QueryParser::FLAG_CJK_WORDS;
2202 #else
2203 Xapian::QueryParser::FLAG_CJK_NGRAM;
2204 #endif
2205 Xapian::QueryParser queryparser;
2206 queryparser.set_stemmer(stemmer);
2207 for(size_t i = 0; i < snipgen->queries->size(); ++i) {
2208 q |= queryparser.parse_query((*snipgen->queries)[i], flags);;
2209 }
2210 }
2211
2212 return q;
2213 }
2214
xapian_snipgen_add_match(xapian_snipgen_t * snipgen,const char * match)2215 EXPORTED int xapian_snipgen_add_match(xapian_snipgen_t *snipgen,
2216 const char *match)
2217 {
2218 size_t len = strlen(match);
2219 bool is_query = len > 1 && ((match[0] == '"' && match[len-1] == '"') ||
2220 (strchr(match, '*') != NULL));
2221
2222 if (is_query) {
2223 if (!snipgen->queries) {
2224 snipgen->queries = new std::vector<std::string>;
2225 }
2226 snipgen->queries->push_back(match);
2227 } else {
2228 if (!snipgen->loose_terms) {
2229 snipgen->loose_terms = new std::vector<std::string>;
2230 }
2231 snipgen->loose_terms->push_back(match);
2232 }
2233
2234 return 0;
2235 }
2236
xapian_snipgen_begin_doc(xapian_snipgen_t * snipgen,const struct message_guid * guid,char doctype)2237 EXPORTED int xapian_snipgen_begin_doc(xapian_snipgen_t *snipgen,
2238 const struct message_guid *guid,
2239 char doctype)
2240 {
2241 struct buf buf = BUF_INITIALIZER;
2242 make_cyrusid(&buf, guid, doctype);
2243 snipgen->cyrusid = buf_release(&buf);
2244 snipgen->doctype = doctype;
2245
2246 buf_reset(snipgen->buf);
2247 return 0;
2248 }
2249
xapian_snipgen_make_snippet(xapian_snipgen_t * snipgen,const struct buf * part,Xapian::Stem * stemmer)2250 EXPORTED int xapian_snipgen_make_snippet(xapian_snipgen_t *snipgen,
2251 const struct buf *part,
2252 Xapian::Stem* stemmer)
2253 {
2254 int r = 0;
2255 try {
2256 std::string text {buf_base(part), buf_len(part)};
2257 Xapian::Enquire enquire(*snipgen->memdb);
2258 Xapian::Query qq = xapian_snipgen_build_query(snipgen, *stemmer);
2259 if (qq.empty()) return 0;
2260 enquire.set_query(qq);
2261
2262 unsigned flags = Xapian::MSet::SNIPPET_EXHAUSTIVE |
2263 Xapian::MSet::SNIPPET_EMPTY_WITHOUT_MATCH;
2264 #ifdef USE_XAPIAN_CJK_WORDS
2265 flags |= Xapian::MSet::SNIPPET_CJK_WORDS;
2266 #endif
2267
2268 const std::string snippet = enquire.get_mset(0, 0).snippet(text,
2269 snipgen->max_len - buf_len(snipgen->buf),
2270 *stemmer, flags,
2271 snipgen->hi_start,
2272 snipgen->hi_end,
2273 snipgen->omit);
2274 if (!snippet.empty()) {
2275 if (buf_len(snipgen->buf)) {
2276 buf_appendoverlap(snipgen->buf, snipgen->omit);
2277 }
2278 buf_appendcstr(snipgen->buf, snippet.c_str());
2279 }
2280 } catch (const Xapian::Error &err) {
2281 xsyslog(LOG_ERR, "IOERROR: caught exception",
2282 "exception=<%s>",
2283 err.get_description().c_str());
2284 r = IMAP_IOERROR;
2285 }
2286 return r;
2287 }
2288
xapian_snipgen_doc_part(xapian_snipgen_t * snipgen,const struct buf * part,int partnum)2289 EXPORTED int xapian_snipgen_doc_part(xapian_snipgen_t *snipgen,
2290 const struct buf *part,
2291 int partnum __attribute__((unused)))
2292 {
2293 // Ignore empty queries.
2294 if (!snipgen->loose_terms && !snipgen->queries) return 0;
2295
2296 // Don't exceed allowed snippet length.
2297 if (buf_len(snipgen->buf) >= snipgen->max_len) return 0;
2298
2299 if (config_getswitch(IMAPOPT_SEARCH_INDEX_LANGUAGE) &&
2300 snipgen->db->database && snipgen->cyrusid) {
2301 std::set<std::string> doclangs;
2302
2303 // Lookup stemmer language for this document part, if any.
2304 std::string key = lang_doc_key(snipgen->cyrusid);
2305 for (const Xapian::Database& subdb : *snipgen->db->subdbs) {
2306 std::string val = subdb.get_metadata(key);
2307 if (!val.empty()) parse_doclangs(val, doclangs);
2308 break;
2309 }
2310
2311 // Generate snippets for each detected message language.
2312 // The first non-empty snippet wins.
2313 size_t prev_size = buf_len(snipgen->buf);
2314 for (std::set<std::string>::iterator it = doclangs.begin(); it != doclangs.end(); ++it) {
2315 const std::string& iso_lang = *it;
2316 if (iso_lang.compare("en")) {
2317 try {
2318 Xapian::Stem stemmer = get_stemmer(iso_lang);
2319 int r = xapian_snipgen_make_snippet(snipgen, part, &stemmer);
2320 if (!r && prev_size != buf_len(snipgen->buf)) {
2321 return 0;
2322 }
2323 } catch (const Xapian::InvalidArgumentError &err) {
2324 // ignore unknown stemmer
2325 }
2326 }
2327 }
2328 }
2329
2330 /* Using a custom stemmer did not generate a snippet.
2331 * This could be because the query matched using the
2332 * default stemmer, so try generating a snippet with
2333 * that stemmer instead.*/
2334 return xapian_snipgen_make_snippet(snipgen, part, snipgen->default_stemmer);
2335 }
2336
xapian_snipgen_end_doc(xapian_snipgen_t * snipgen,struct buf * buf)2337 EXPORTED int xapian_snipgen_end_doc(xapian_snipgen_t *snipgen, struct buf *buf)
2338 {
2339 buf_reset(buf);
2340 buf_copy(buf, snipgen->buf);
2341 buf_cstring(buf);
2342 buf_reset(snipgen->buf);
2343
2344 delete snipgen->loose_terms;
2345 snipgen->loose_terms = NULL;
2346
2347 delete snipgen->queries;
2348 snipgen->queries = NULL;
2349
2350 free(snipgen->cyrusid);
2351 snipgen->cyrusid = NULL;
2352 snipgen->doctype = 0;
2353
2354 return 0;
2355 }
2356
2357 /* cb returns true if document should be copied, false if not */
xapian_filter(const char * dest,const char ** sources,int (* cb)(const char * cyrusid,void * rock),void * rock)2358 EXPORTED int xapian_filter(const char *dest, const char **sources,
2359 int (*cb)(const char *cyrusid, void *rock),
2360 void *rock)
2361 {
2362 int r = 0;
2363 const char *thispath = "(unknown path)";
2364
2365 try {
2366 /* create a destination database */
2367 Xapian::WritableDatabase destdb {dest, Xapian::DB_CREATE|Xapian::DB_BACKEND_GLASS};
2368
2369 /* With multiple databases as above, the docids are interleaved, so it
2370 * might be worth trying to open each source and copy its documents to
2371 * destdb in turn for better locality of reference, and so better cache
2372 * use. -- Olly on the mailing list */
2373
2374 std::vector<Xapian::Database> srcdbs;
2375
2376 // Open databases and aggregate database-level metadata.
2377 while (*sources) {
2378 thispath = *sources++;
2379 const Xapian::Database srcdb {thispath};
2380 srcdbs.push_back(srcdb);
2381 }
2382
2383 // Copy all matching documents.
2384 std::set<int> db_versions;
2385
2386 for (size_t i = 0; i < srcdbs.size(); ++i) {
2387 const Xapian::Database& srcdb = srcdbs.at(i);
2388 bool need_md_versions = false;
2389 std::set<int> md_versions = read_db_versions(srcdb);
2390
2391 /* copy all matching documents to the new DB */
2392 for (Xapian::ValueIterator it = srcdb.valuestream_begin(SLOT_CYRUSID);
2393 it != srcdb.valuestream_end(SLOT_CYRUSID); ++it) {
2394 const std::string& cyrusid = *it;
2395 const std::string idkey {"cyrusid." + cyrusid};
2396
2397 // check if caller wants this cyrusid
2398 if (!cb(cyrusid.c_str(), rock)) {
2399 continue;
2400 }
2401
2402 // is it already indexed?
2403 if (!destdb.get_metadata(idkey).empty()) {
2404 continue;
2405 }
2406
2407 // is there a subsequent db with a better index level? (only for G docs)
2408 uint8_t indexlevel = parse_indexlevel(srcdb.get_metadata(idkey));
2409 if (cyrusid[1] == XAPIAN_WRAP_DOCTYPE_MSG) {
2410 int found_better = 0;
2411 for (size_t j = i + 1; !found_better && j < srcdbs.size(); ++j) {
2412 uint8_t level = parse_indexlevel(srcdbs[j].get_metadata(idkey));
2413 found_better = better_indexlevel(indexlevel, level) != indexlevel;
2414 }
2415 if (found_better) {
2416 continue;
2417 }
2418 }
2419
2420 // add document
2421 Xapian::Document srcdoc = srcdb.get_document(it.get_docid());
2422 Xapian::docid docid = destdb.add_document(srcdoc);
2423 destdb.set_metadata(idkey, format_indexlevel(indexlevel));
2424
2425 // copy document language metadata
2426 const std::string& langkey = lang_doc_key(cyrusid.c_str());
2427 if (destdb.get_metadata(langkey).empty()) {
2428 std::string val = srcdb.get_metadata(langkey);
2429 if (!val.empty() && isalpha(val[0])) {
2430 destdb.set_metadata(langkey, val);
2431 }
2432 }
2433 const std::string& langval = srcdoc.get_value(SLOT_DOCLANGS);
2434 if (!langval.empty() && !isalpha(langval[0])) {
2435 destdb.get_document(docid).remove_value(SLOT_DOCLANGS);
2436 }
2437 // add document index version
2438 const std::string& verval = srcdoc.get_value(SLOT_INDEXVERSION);
2439 if (!verval.empty()) {
2440 int version = std::atoi(verval.c_str());
2441 if (version) db_versions.insert(version);
2442 }
2443 else need_md_versions = true;
2444 }
2445
2446 if (need_md_versions) {
2447 /* At least one added document didn't have its index
2448 * version slot set in this subdb. Read legacy versions. */
2449 std::set<int> md_versions = read_db_versions(srcdb);
2450 db_versions.insert(md_versions.begin(), md_versions.lower_bound(14));
2451 }
2452 }
2453
2454 thispath = "(unknown path)";
2455
2456 // set the versions
2457 write_db_versions(destdb, db_versions);
2458
2459 // recalculate language counts
2460 std::map<const std::string, unsigned> lang_counts;
2461 r = calculate_language_counts(destdb, lang_counts);
2462 if (r) {
2463 xsyslog(LOG_ERR, "IOERROR: corrupt metadata",
2464 "filter=<%s>",
2465 dest);
2466 return r;
2467 }
2468 write_language_counts(destdb, lang_counts);
2469
2470 /* commit all changes explicitly */
2471 destdb.commit();
2472 }
2473 catch (const Xapian::Error &err) {
2474 xsyslog(LOG_ERR, "IOERROR: caught exception",
2475 "exception=<%s> path=<%s>",
2476 err.get_description().c_str(), thispath);
2477 r = IMAP_IOERROR;
2478 }
2479
2480 return r;
2481 }
2482
xapian_version_string()2483 EXPORTED const char *xapian_version_string()
2484 {
2485 return Xapian::version_string();
2486 }
2487
/* A standalone document together with the term generator that fills it;
 * both are allocated in xapian_doc_new and released in xapian_doc_close. */
struct xapian_doc {
    Xapian::TermGenerator *termgen;  /* indexes text into 'doc' */
    Xapian::Document *doc;           /* document whose terms are counted and iterated */
};
2492
xapian_doc_new(void)2493 EXPORTED xapian_doc_t *xapian_doc_new(void)
2494 {
2495 xapian_doc_t *doc = (xapian_doc_t *) xzmalloc(sizeof(struct xapian_doc));
2496 doc->doc = new Xapian::Document;
2497 doc->termgen = new Xapian::TermGenerator;
2498 doc->termgen->set_document(*doc->doc);
2499 return doc;
2500 }
2501
xapian_doc_index_text(xapian_doc_t * doc,const char * text,size_t len)2502 EXPORTED void xapian_doc_index_text(xapian_doc_t *doc,
2503 const char *text, size_t len)
2504 {
2505 doc->termgen->index_text(Xapian::Utf8Iterator(text, len));
2506 }
2507
xapian_doc_termcount(xapian_doc_t * doc)2508 EXPORTED size_t xapian_doc_termcount(xapian_doc_t *doc)
2509 {
2510 return doc->doc->termlist_count();
2511 }
2512
xapian_doc_foreach_term(xapian_doc_t * doc,int (* cb)(const char *,void *),void * rock)2513 EXPORTED int xapian_doc_foreach_term(xapian_doc_t *doc,
2514 int(*cb)(const char*, void*),
2515 void *rock)
2516 {
2517 for (Xapian::TermIterator ti = doc->doc->termlist_begin();
2518 ti != doc->doc->termlist_end(); ++ti) {
2519 int r = cb((*ti).c_str(), rock);
2520 if (r) return r;
2521 }
2522 return 0;
2523 }
2524
xapian_doc_reset(xapian_doc_t * doc)2525 EXPORTED void xapian_doc_reset(xapian_doc_t *doc)
2526 {
2527 doc->doc->clear_values();
2528 }
2529
xapian_doc_close(xapian_doc_t * doc)2530 EXPORTED void xapian_doc_close(xapian_doc_t *doc)
2531 {
2532 delete doc->termgen;
2533 delete doc->doc;
2534 free(doc);
2535 }
2536