1 /* Copyright (C) 2004 J.F.Dockes
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the
14 * Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17 #ifndef _DB_H_INCLUDED_
18 #define _DB_H_INCLUDED_
19
20 #include "autoconfig.h"
21
22 #include <stdint.h>
23 #include <string>
24 #include <vector>
25 #include <memory>
26
27 #include "cstr.h"
28 #include "rcldoc.h"
29 #include "stoplist.h"
30 #include "rclconfig.h"
31 #include "utf8iter.h"
32 #include "textsplit.h"
33 #include "syngroups.h"
34
35 using std::string;
36 using std::vector;
37
// rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and the xapian-related code is in rcldb.cpp.
// If support were added for another backend, the xapian code would be moved to
// rclxapian.cpp, another file would be created for the new backend, and the
// configuration/compile/link code would be adjusted to allow choosing. There
// is no plan for supporting multiple different backends.
44 //
45 // In no case does this try to implement a useful virtualized text-db interface
46 // The main goal is simplicity and good matching to usage inside the recoll
47 // user interface. In other words, this is not exhaustive or well-designed or
48 // reusable.
49 //
// Unique Document Identifier ("udi"): uniquely identifies a document in its
// source storage (file system or other). Used for up to date checks
// etc. Our user is responsible for making sure that it is not too
// big, because it is stored as a Xapian term (< 150 bytes would be
// reasonable).
55
56 class RclConfig;
57 class Aspell;
58
59 namespace Rcl {
60
61 // Omega compatible values. We leave a hole for future omega values. Not sure
62 // it makes any sense to keep any level of omega compat given that the index
63 // is incompatible anyway.
enum value_slot {
    // Slots shared with Omega:

    // Last modification time: 4 byte big endian value, seconds since 1970.
    VALUE_LASTMOD = 0,
    // 16 byte MD5 checksum of the original document.
    VALUE_MD5 = 1,
    // sortable_serialise(<file size in bytes>)
    VALUE_SIZE = 2,

    // Slots used by Recoll only:

    // Document signature as chosen by the application (e.g.: mtime+size).
    VALUE_SIG = 10,
};
74
75 class SearchData;
76 class TermIter;
77 class Query;
78
/** Used for returning result lists for index terms matching some criteria.
 *
 * Equality and ordering only look at the term text, never at the counts.
 */
class TermMatchEntry {
public:
    /** Build an empty entry: empty term, both counts zero. */
    TermMatchEntry() {}
    /** Build a fully specified entry.
     * @param t the term text.
     * @param f total count of occurrences within the collection.
     * @param d number of documents containing the term.
     */
    TermMatchEntry(const std::string& t, int f, int d)
        : term(t), wcf(f), docs(d) {}
    /** Build an entry for a term without statistics: both counts zero. */
    TermMatchEntry(const std::string& t)
        : term(t) {}
    /** Entries are equal when their terms are equal. */
    bool operator==(const TermMatchEntry &o) const {
        return term == o.term;
    }
    /** Entries sort by term text. */
    bool operator<(const TermMatchEntry &o) const {
        return term < o.term;
    }

    std::string term;
    // The counts are initialized here so that no constructor can leave one
    // of them indeterminate.
    int wcf{0};  // Total count of occurrences within collection.
    int docs{0}; // Number of documents containing term.
};
99
100 /** Term match result list header: statistics and global info */
101 class TermMatchResult {
102 public:
TermMatchResult()103 TermMatchResult() {
104 clear();
105 }
clear()106 void clear() {
107 entries.clear();
108 }
109 // Term expansion
110 vector<TermMatchEntry> entries;
111 // If a field was specified, this is the corresponding index prefix
112 string prefix;
113 };
114
/** Index-wide statistics, the output parameter type of Db::dbStats(). */
class DbStats {
public:
    DbStats() {}

    // All the numeric values start out as zero.
    unsigned int dbdoccount = 0;
    double dbavgdoclen = 0;
    size_t mindoclen = 0;
    size_t maxdoclen = 0;
    // Only set if requested
    std::vector<std::string> failedurls;
};
125
has_prefix(const string & trm)126 inline bool has_prefix(const string& trm)
127 {
128 if (o_index_stripchars) {
129 return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
130 } else {
131 return !trm.empty() && trm[0] == ':';
132 }
133 }
134
strip_prefix(const string & trm)135 inline string strip_prefix(const string& trm)
136 {
137 if (!has_prefix(trm))
138 return trm;
139 string::size_type st = 0;
140 if (o_index_stripchars) {
141 st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
142 #ifdef _WIN32
143 // We have a problem there because we forgot to lowercase the drive
144 // name. So if the found character is a colon consider the drive name as
145 // the first non capital even if it is uppercase
146 if (st != string::npos && st >= 2 && trm[st] == ':') {
147 st -= 1;
148 }
149 #endif
150 } else {
151 st = trm.find_first_of(":", 1) + 1;
152 }
153 if (st == string::npos) {
154 return string(); // ??
155 }
156 return trm.substr(st);
157 }
158
get_prefix(const string & trm)159 inline string get_prefix(const string& trm)
160 {
161 if (!has_prefix(trm))
162 return string();
163 string::size_type st = 0;
164 if (o_index_stripchars) {
165 st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
166 if (st == string::npos) {
167 return string(); // ??
168 }
169 #ifdef _WIN32
170 // We have a problem there because we forgot to lowercase the drive
171 // name. So if the found character is a colon consider the drive name as
172 // the first non capital even if it is uppercase
173 if (st >= 2 && trm[st] == ':') {
174 st -= 1;
175 }
176 #endif
177 return trm.substr(0, st);
178 } else {
179 st = trm.find_first_of(":", 1) + 1;
180 if (st == string::npos) {
181 return string(); // ??
182 }
183 return trm.substr(1, st-2);
184 }
185 }
186
wrap_prefix(const string & pfx)187 inline string wrap_prefix(const string& pfx)
188 {
189 if (o_index_stripchars) {
190 return pfx;
191 } else {
192 return cstr_colon + pfx + cstr_colon;
193 }
194 }
195
196 /**
197 * Wrapper class for the native database.
198 */
199 class Db {
200 public:
201 // A place for things we don't want visible here.
202 class Native;
203 friend class Native;
204
205 /* General stuff (valid for query or update) ****************************/
206 Db(const RclConfig *cfp);
207 ~Db();
208
209 enum OpenMode {DbRO, DbUpd, DbTrunc};
isWriteMode(OpenMode mode)210 bool isWriteMode(OpenMode mode) {
211 return mode == DbUpd || mode == DbTrunc;
212 }
213 enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
214 bool open(OpenMode mode, OpenError *error = 0);
215 bool close();
216 bool isopen();
217
218 /** Get explanation about last error */
getReason()219 string getReason() const {return m_reason;}
220
221 /** Return all possible stemmer names */
222 static vector<string> getStemmerNames();
223
224 /** Return existing stemming databases */
225 vector<string> getStemLangs();
226
227 /** Check if index stores the documents' texts. Only valid after open */
228 bool storesDocText();
229
230 /** Test word for spelling correction candidate: not too long, no
231 * special chars...
232 * @param with_aspell test for use with aspell, else for xapian speller
233 */
234 static bool isSpellingCandidate(const string& term, bool with_aspell=true) {
235 if (term.empty() || term.length() > 50 || has_prefix(term))
236 return false;
237
238 Utf8Iter u8i(term);
239 if (with_aspell) {
240 // If spelling with aspell, CJK scripts are not candidates
241 if (TextSplit::isCJK(*u8i))
242 return false;
243 } else {
244 #ifdef TESTING_XAPIAN_SPELL
245 // The Xapian speller (purely proximity-based) can be used
246 // for Katakana (when split as words which is not always
247 // completely feasible because of separator-less
248 // compounds). Currently we don't try to use the Xapian
249 // speller with other scripts with which it would be usable
250 // in the absence of aspell (it would indeed be better
251 // than nothing with e.g. european languages). This would
252 // require a few more config variables, maybe one day.
253 if (!TextSplit::isKATAKANA(*u8i)) {
254 return false;
255 }
256 #else
257 return false;
258 #endif
259 }
260 if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
261 != string::npos)
262 return false;
263 return true;
264 }
265
266 /** Return spelling suggestion */
267 bool getSpellingSuggestions(const string& word,
268 std::vector<std::string>& suggs);
269
270 /* The next two, only for searchdata, should be somehow hidden */
271 /* Return configured stop words */
getStopList()272 const StopList& getStopList() const {return m_stops;}
273 /* Field name to prefix translation (ie: author -> 'A') */
274 bool fieldToTraits(const string& fldname, const FieldTraits **ftpp,
275 bool isquery = false);
276
277 /* Update-related methods ******************************************/
278
279 /** Test if the db entry for the given udi is up to date.
280 *
281 * This is done by comparing the input and stored sigs. This is
282 * used both when indexing and querying (before opening a document
283 * using stale info).
284 *
285 * **This assumes that the udi pertains to the main index (idxi==0).**
286 *
287 * Side-effect when the db is writeable and the document up to
288 * date: set the existence flag for the file document and all
289 * subdocs if any (for later use by 'purge()')
290 *
291 * @param udi Unique Document Identifier (as chosen by indexer).
292 * @param sig New signature (as computed by indexer).
293 * @param xdocid[output] Non-zero if doc existed. Should be considered
294 * as opaque, to be used for a possible later call to setExistingFlags()
295 * Note that if inplaceReset is set, the return value is non-zero but not
296 * an actual docid, it's only used as a flag in this case.
297 * @param osig[output] old signature.
298 */
299 bool needUpdate(const string &udi, const string& sig,
300 unsigned int *xdocid = 0, std::string *osig = 0);
301
302 /** Set the existance flags for the document and its eventual subdocuments
303 *
304 * This can be called by the indexer after needUpdate() has returned true,
305 * if the indexer does not wish to actually re-index (e.g.: the doc is
306 * known to cause errors).
307 */
308 void setExistingFlags(const string& udi, unsigned int docid);
309
310 /** Indicate if we are doing a systematic reindex. This complements
311 needUpdate() return */
inFullReset()312 bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
313
314 /** Add or update document identified by unique identifier.
315 * @param config Config object to use. Can be the same as the member config
316 * or a clone, to avoid sharing when called in multithread context.
317 * @param udi the Unique Document Identifier is opaque to us.
318 * Maximum size 150 bytes.
319 * @param parent_udi the UDI for the container document. In case of complex
320 * embedding, this is not always the immediate parent but the UDI for
321 * the container file (which may be a farther ancestor). It is
322 * used for purging subdocuments when a file ceases to exist and
323 * to set the existence flags of all subdocuments of a container
324 * that is found to be up to date. In other words, the
325 * parent_udi is the UDI for the ancestor of the document which
326 * is subject to needUpdate() and physical existence tests (some
327 * kind of file equivalent). Empty for top-level docs. Should
328 * probably be renamed container_udi.
329 * @param doc container for document data. Should have been filled as
330 * much as possible depending on the document type.
331 * ** doc will be modified in a destructive way **
332 */
333 bool addOrUpdate(const string &udi, const string &parent_udi, Doc &doc);
334
335 #ifdef IDX_THREADS
336 void waitUpdIdle();
337 #endif
338
339 /** Delete document(s) for given UDI, including subdocs */
340 bool purgeFile(const string &udi, bool *existed = 0);
341 /** Delete subdocs with an out of date sig. We do this to purge
342 obsolete subdocs during a partial update where no general purge
343 will be done */
344 bool purgeOrphans(const string &udi);
345
346 /** Remove documents that no longer exist in the file system. This
347 * depends on the update map, which is built during
348 * indexing (needUpdate() / addOrUpdate()).
349 *
350 * This should only be called after a full walk of
351 * the file system, else the update map will not be complete, and
352 * many documents will be deleted that shouldn't, which is why this
353 * has to be called externally, rcldb can't know if the indexing
354 * pass was complete or partial.
355 */
356 bool purge();
357
358 /** Create stem expansion database for given languages. */
359 bool createStemDbs(const std::vector<std::string> &langs);
360 /** Delete stem expansion database for given language. */
361 bool deleteStemDb(const string &lang);
362
363 /* Query-related methods ************************************/
364
365 /** Return total docs in db */
366 int docCnt();
367 /** Return count of docs which have an occurrence of term */
368 int termDocCnt(const string& term);
369 /** Add extra Xapian database for querying.
370 * @param dir must point to something which can be passed as parameter
371 * to a Xapian::Database constructor (directory or stub).
372 */
373 bool addQueryDb(const string &dir);
374 /** Remove extra database. if dir == "", remove all. */
375 bool rmQueryDb(const string &dir);
376 /** Set the extra indexes to the input list. */
377 bool setExtraQueryDbs(const std::vector<std::string>& dbs);
378
379 /** Check if document comes from the main index (this is used to
380 decide if we can update the index for it */
381 bool fromMainIndex(const Doc& doc);
382
383 /** Retrieve the stored doc text. This returns false if the index does not
384 store raw text or other problems (discriminate with storesDocText().
385 On success, the data is stored in doc.text
386 */
387 bool getDocRawText(Doc& doc);
388
389 /** Retrieve an index designator for the document result. This is used
390 * by the GUI document history feature for remembering where a
391 * doc comes from and allowing later retrieval (if the ext index
392 * is still active...).
393 */
394 std::string whatIndexForResultDoc(const Doc& doc);
395
396 /** Tell if directory seems to hold xapian db */
397 static bool testDbDir(const string &dir, bool *stripped = 0);
398
399 /** Return the index terms that match the input string
400 * Expansion is performed either with either wildcard or regexp processing
401 * Stem expansion is performed if lang is not empty
402 *
403 * @param typ_sens defines the kind of expansion: none, wildcard,
404 * regexp or stemming. "none" may still expand case,
405 * diacritics and synonyms, depending on the casesens, diacsens and
406 * synexp flags.
407 * @param lang sets the stemming language(s). Can be a space-separated list
408 * @param term is the term to expand
409 * @param result is the main output
410 * @param max defines the maximum result count
411 * @param field if set, defines the field within with the expansion should
412 * be performed. Only used for wildcards and regexps, stemming is
413 * always global. If this is set, the resulting output terms
414 * will be appropriately prefixed and the prefix value will be set
415 * in the TermMatchResult header
416 */
417 enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
418 ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64};
matchTypeTp(int tp)419 int matchTypeTp(int tp) {
420 return tp & 7;
421 }
422 bool termMatch(int typ_sens, const string &lang, const string &term,
423 TermMatchResult& result, int max = -1,
424 const string& field = "", vector<string> *multiwords = 0);
425 bool dbStats(DbStats& stats, bool listFailed);
426 /** Return min and max years for doc mod times in db */
427 bool maxYearSpan(int *minyear, int *maxyear);
428 /** Return all mime types in index. This can be different from the
429 ones defined in the config because of 'file' command
430 usage. Inserts the types at the end of the parameter */
431 bool getAllDbMimeTypes(std::vector<std::string>&);
432
433 /** Wildcard expansion specific to file names. Internal/sdata use only */
434 bool filenameWildExp(const string& exp, vector<string>& names, int max);
435
436 /** Set parameters for synthetic abstract generation */
437 void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
getAbsCtxLen()438 int getAbsCtxLen() const {
439 return m_synthAbsWordCtxLen;
440 }
getAbsLen()441 int getAbsLen() const {
442 return m_synthAbsLen;
443 }
444
445 /** Get document for given udi and db index
446 *
447 * Used to retrieve ancestor documents.
448 * @param udi The unique document identifier.
449 * @param idxdoc A document from the same database as an opaque way to pass
450 * the database id (e.g.: when looking for parent in a multi-database
451 * context).
452 * @param[out] doc The output Recoll document.
453 * @return True for success.
454 */
455 bool getDoc(const string &udi, const Doc& idxdoc, Doc &doc);
456
457 /** Get document for given udi and index directory.
458 *
459 * Used by the 'history' feature. This supposes that the extra db
460 * is still active.
461 * @param udi The unique document identifier.
462 * @param dbdir The index directory, from storage, as returned by
463 * whatIndexForResultDoc() at the time of the query. Can be
464 * empty to mean "main index" (allows the history to avoid
465 * storing the main dbdir value).
466 * @param[out] doc The output Recoll document.
467 * @return True for success.
468 */
469 bool getDoc(const string &udi, const std::string& dbdir, Doc &doc);
470
471 /** Test if documents has sub-documents.
472 *
473 * This can always be detected for file-level documents, using the
474 * postlist for the parent term constructed with udi.
475 *
476 * For non file-level documents (e.g.: does an email inside an
477 * mbox have attachments ?), detection is dependant on the filter
478 * having set an appropriate flag at index time. Higher level code
479 * can't detect it because the doc for the parent may have been
480 * seen before any children. The flag is stored as a value in the
481 * index.
482 */
483 bool hasSubDocs(const Doc &idoc);
484
485 /** Get subdocuments of given document.
486 *
487 * For file-level documents, these are all docs indexed by the
488 * parent term built on idoc.udi. For embedded documents, the
489 * parent doc is looked for, then its subdocs list is
490 * filtered using the idoc ipath as a prefix.
491 */
492 bool getSubDocs(const Doc& idoc, vector<Doc>& subdocs);
493
494 /** Get container (top level file) document.
495 *
496 * If the input is not a subdocument, this returns a copy of the input.
497 */
498 bool getContainerDoc(const Doc &idoc, Doc& ctdoc);
499
500 /** Get duplicates (md5) of document */
501 bool docDups(const Doc& idoc, std::vector<Doc>& odocs);
502
503 /* The following are mainly for the aspell module */
504 /** Whole term list walking. */
505 TermIter *termWalkOpen();
506 bool termWalkNext(TermIter *, string &term);
507 void termWalkClose(TermIter *);
508 /** Test term existence */
509 bool termExists(const string& term);
510 /** Test if terms stem to different roots. */
511 bool stemDiffers(const string& lang, const string& term,
512 const string& base);
513
getConf()514 const RclConfig *getConf() {return m_config;}
515
516 /**
517 Activate the "in place reset" mode where all documents are
518 considered as needing update. This is a global/per-process
519 option, and can't be reset. It should be set at the start of
520 the indexing pass. 2012-10: no idea why this is done this way...
521 */
setInPlaceReset()522 static void setInPlaceReset() {o_inPlaceReset = true;}
523
524 /** Flush interval get/set. This is used by the first indexing
525 pass to override the config value and flush more rapidly
526 initially so that the user can quickly play with queries */
getFlushMb()527 int getFlushMb() {
528 return m_flushMb;
529 }
setFlushMb(int mb)530 void setFlushMb(int mb) {
531 m_flushMb = mb;
532 }
533 bool doFlush();
534
535 // Use empty fn for no synonyms
536 bool setSynGroupsFile(const std::string& fn);
getSynGroups()537 const SynGroups& getSynGroups() {return m_syngroups;}
538
539 // Mark all documents with an UDI having input as prefix as
540 // existing. Only works if the UDIs for the store are
541 // hierarchical of course. Used by FsIndexer to avoid purging
542 // files for a topdir which is on a removable file system and
543 // currently unmounted (topdir does not exist or is empty.
544 bool udiTreeMarkExisting(const string& udi);
545
546 /* This has to be public for access by embedded Query::Native */
547 Native *m_ndb{nullptr};
548
549 private:
550 const RclConfig *m_config;
551 string m_reason; // Error explanation
552
553 // Xapian directories for additional databases to query
554 vector<string> m_extraDbs;
555 OpenMode m_mode{Db::DbRO};
556 // File existence vector: this is filled during the indexing pass. Any
557 // document whose bit is not set at the end is purged
558 vector<bool> updated;
559 // Text bytes indexed since beginning
560 long long m_curtxtsz{0};
561 // Text bytes at last flush
562 long long m_flushtxtsz{0};
563 // Text bytes at last fsoccup check
564 long long m_occtxtsz{0};
565 // First fs occup check ?
566 int m_occFirstCheck{1};
567
568 // Synonym groups. There is no strict reason that this has to be
569 // an Rcl::Db member, as it is only used when building each It
570 // could be a SearchData member, or even a parameter to
571 // Query::setQuery(). Otoh, building the syngroups structure from
572 // a file may be expensive and it's unlikely to change with every
573 // query, so it makes sense to cache it, and Rcl::Db is not a bad
574 // place for this.
575 SynGroups m_syngroups;
576
577 // Aspell object if needed
578 Aspell *m_aspell{nullptr};
579
580 /***************
581 * Parameters cached out of the configuration files. Logically const
582 * after init */
583 // Stop terms: those don't get indexed.
584 StopList m_stops;
585 // Truncation length for stored meta fields
586 int m_idxMetaStoredLen{150};
587 // This is how long an abstract we keep or build from beginning of
588 // text when indexing. It only has an influence on the size of the
589 // db as we are free to shorten it again when displaying
590 int m_idxAbsTruncLen{250};
591 // Document text truncation length
592 int m_idxTextTruncateLen{0};
593 // This is the size of the abstract that we synthetize out of query
594 // term contexts at *query time*
595 int m_synthAbsLen{250};
596 // This is how many words (context size) we keep around query terms
597 // when building the abstract
598 int m_synthAbsWordCtxLen{4};
599 // Flush threshold. Megabytes of text indexed before we flush.
600 int m_flushMb{-1};
601 // Maximum file system occupation percentage
602 int m_maxFsOccupPc{0};
603 // Database directory
604 string m_basedir;
605 // When this is set, all documents are considered as needing a reindex.
606 // This implements an alternative to just erasing the index before
607 // beginning, with the advantage that, for small index formats updates,
608 // between releases the index remains available while being recreated.
609 static bool o_inPlaceReset;
610 /******* End logical constnesss */
611
612 #ifdef IDX_THREADS
613 friend void *DbUpdWorker(void*);
614 #endif // IDX_THREADS
615
616 // Internal form of setExistingFlags: no locking
617 void i_setExistingFlags(const string& udi, unsigned int docid);
618 // Internal form of close, can be called during destruction
619 bool i_close(bool final);
620 // Reinitialize when adding/removing additional dbs
621 bool adjustdbs();
622 bool idxTermMatch(int typ_sens, const string &lang, const string &term,
623 TermMatchResult& result, int max = -1,
624 const string& field = cstr_null);
625
626 // Flush when idxflushmb is reached
627 bool maybeflush(int64_t moretext);
628 bool docExists(const string& uniterm);
629
630 bool getDoc(const std::string& udi, int idxi, Doc& doc);
631
632 /* Copyconst and assignment private and forbidden */
Db(const Db &)633 Db(const Db &) {}
634 Db& operator=(const Db &) {return *this;};
635 };
636
// This has to go somewhere, and this is the most reasonable place because
// it needs the Xapian version.
std::string version_string();

// Values defined out of this header.
extern const std::string pathelt_prefix;
extern const std::string mimetype_prefix;
extern const std::string unsplitFilenameFieldName;
extern std::string start_of_field_term;
extern std::string end_of_field_term;
646
647 }
648
649 #endif /* _DB_H_INCLUDED_ */
650