1 /* Copyright (C) 2004 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 #ifndef _DB_H_INCLUDED_
18 #define _DB_H_INCLUDED_
19 
20 #include "autoconfig.h"
21 
22 #include <stdint.h>
23 #include <string>
24 #include <vector>
25 #include <memory>
26 
27 #include "cstr.h"
28 #include "rcldoc.h"
29 #include "stoplist.h"
30 #include "rclconfig.h"
31 #include "utf8iter.h"
32 #include "textsplit.h"
33 #include "syngroups.h"
34 
35 using std::string;
36 using std::vector;
37 
// rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and xapian-related code is in rcldb.cpp.
// If support were added for another backend, the xapian code would be moved
// to rclxapian.cpp, another file would be created for the new backend, and
// the configuration/compile/link code would be adjusted to allow choosing.
// There is no plan for supporting multiple different backends.
44 //
45 // In no case does this try to implement a useful virtualized text-db interface
46 // The main goal is simplicity and good matching to usage inside the recoll
47 // user interface. In other words, this is not exhaustive or well-designed or
48 // reusable.
49 //
// Unique Document Identifier ("udi"): uniquely identifies a document in its
// source storage (file system or other). Used for up-to-date checks
// etc. Our user is responsible for making sure it's not too
// big, because it's stored as a Xapian term (< 150 bytes would be
// reasonable).
55 
56 class RclConfig;
57 class Aspell;
58 
59 namespace Rcl {
60 
// Omega compatible values. We leave a hole for future omega values. Not sure
// it makes any sense to keep any level of omega compat given that the index
// is incompatible anyway.
enum value_slot {
    ////////// Omega-compatible values:
    // 4 byte big endian value - seconds since 1970.
    VALUE_LASTMOD = 0,
    // 16 byte MD5 checksum of original document.
    VALUE_MD5 = 1,
    // sortable_serialise(<file size in bytes>)
    VALUE_SIZE = 2,

    ////////// Recoll only:
    // Doc sig as chosen by app (ex: mtime+size)
    VALUE_SIG = 10,
};
74 
75 class SearchData;
76 class TermIter;
77 class Query;
78 
/**
 * Used for returning result lists for index terms matching some criteria.
 *
 * Each entry holds one term and its frequency counters. Equality and
 * ordering only compare the term text, never the counters.
 */
class TermMatchEntry {
public:
    /** Empty term, both counters zero. */
    TermMatchEntry() {}
    /**
     * @param t the term text.
     * @param f total count of occurrences within the collection.
     * @param d number of documents containing the term.
     */
    TermMatchEntry(const std::string& t, int f, int d)
        : term(t), wcf(f), docs(d) {}
    /** Term only: both counters are zero. */
    TermMatchEntry(const std::string& t)
        : term(t) {}
    bool operator==(const TermMatchEntry &o) const {
        return term == o.term;
    }
    bool operator<(const TermMatchEntry &o) const {
        return term < o.term;
    }

    std::string term;
    // The in-class initializers guarantee that no constructor leaves a
    // counter indeterminate.
    int    wcf{0};  // Total count of occurrences within collection.
    int    docs{0}; // Number of documents containing term.
};
99 
100 /** Term match result list header: statistics and global info */
101 class TermMatchResult {
102 public:
TermMatchResult()103     TermMatchResult() {
104         clear();
105     }
clear()106     void clear() {
107         entries.clear();
108     }
109     // Term expansion
110     vector<TermMatchEntry> entries;
111     // If a field was specified, this is the corresponding index prefix
112     string prefix;
113 };
114 
/**
 * Index statistics container (see the DbStats& parameter of Db::dbStats()).
 * All numeric members start at zero and the URL list starts empty.
 */
class DbStats {
public:
    DbStats() {}
    // Index-wide stats
    unsigned int dbdoccount{0}; // Document count
    double       dbavgdoclen{0}; // Average document length
    size_t       mindoclen{0};  // Smallest document length
    size_t       maxdoclen{0};  // Biggest document length
    std::vector<std::string> failedurls; /* Only set if requested */
};
125 
has_prefix(const string & trm)126 inline bool has_prefix(const string& trm)
127 {
128     if (o_index_stripchars) {
129         return !trm.empty() && 'A' <= trm[0] && trm[0] <= 'Z';
130     } else {
131         return !trm.empty() && trm[0] == ':';
132     }
133 }
134 
strip_prefix(const string & trm)135 inline string strip_prefix(const string& trm)
136 {
137     if (!has_prefix(trm))
138         return trm;
139     string::size_type st = 0;
140     if (o_index_stripchars) {
141         st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
142 #ifdef _WIN32
143         // We have a problem there because we forgot to lowercase the drive
144         // name. So if the found character is a colon consider the drive name as
145         // the first non capital even if it is uppercase
146         if (st != string::npos && st >= 2 && trm[st] == ':') {
147             st -= 1;
148         }
149 #endif
150     } else {
151         st = trm.find_first_of(":", 1) + 1;
152     }
153     if (st == string::npos) {
154         return string(); // ??
155     }
156     return trm.substr(st);
157 }
158 
get_prefix(const string & trm)159 inline string get_prefix(const string& trm)
160 {
161     if (!has_prefix(trm))
162         return string();
163     string::size_type st = 0;
164     if (o_index_stripchars) {
165         st = trm.find_first_not_of("ABCDEFIJKLMNOPQRSTUVWXYZ");
166         if (st == string::npos) {
167             return string(); // ??
168         }
169 #ifdef _WIN32
170         // We have a problem there because we forgot to lowercase the drive
171         // name. So if the found character is a colon consider the drive name as
172         // the first non capital even if it is uppercase
173         if (st >= 2 && trm[st] == ':') {
174             st -= 1;
175         }
176 #endif
177         return trm.substr(0, st);
178     } else {
179         st = trm.find_first_of(":", 1) + 1;
180         if (st == string::npos) {
181             return string(); // ??
182         }
183         return trm.substr(1, st-2);
184     }
185 }
186 
wrap_prefix(const string & pfx)187 inline string wrap_prefix(const string& pfx)
188 {
189     if (o_index_stripchars) {
190         return pfx;
191     } else {
192         return cstr_colon + pfx + cstr_colon;
193     }
194 }
195 
196 /**
197  * Wrapper class for the native database.
198  */
199 class Db {
200 public:
201     // A place for things we don't want visible here.
202     class Native;
203     friend class Native;
204 
205     /* General stuff (valid for query or update) ****************************/
206     Db(const RclConfig *cfp);
207     ~Db();
208 
209     enum OpenMode {DbRO, DbUpd, DbTrunc};
isWriteMode(OpenMode mode)210     bool isWriteMode(OpenMode mode) {
211         return mode == DbUpd || mode == DbTrunc;
212     }
213     enum OpenError {DbOpenNoError, DbOpenMainDb, DbOpenExtraDb};
214     bool open(OpenMode mode, OpenError *error = 0);
215     bool close();
216     bool isopen();
217 
218     /** Get explanation about last error */
getReason()219     string getReason() const {return m_reason;}
220 
221     /** Return all possible stemmer names */
222     static vector<string> getStemmerNames();
223 
224     /** Return existing stemming databases */
225     vector<string> getStemLangs();
226 
227     /** Check if index stores the documents' texts. Only valid after open */
228     bool storesDocText();
229 
230     /** Test word for spelling correction candidate: not too long, no
231      * special chars...
232      * @param with_aspell test for use with aspell, else for xapian speller
233      */
234     static bool isSpellingCandidate(const string& term, bool with_aspell=true) {
235         if (term.empty() || term.length() > 50 || has_prefix(term))
236             return false;
237 
238         Utf8Iter u8i(term);
239         if (with_aspell) {
240             // If spelling with aspell, CJK scripts are not candidates
241             if (TextSplit::isCJK(*u8i))
242                 return false;
243         } else {
244 #ifdef TESTING_XAPIAN_SPELL
245             // The Xapian speller (purely proximity-based) can be used
246             // for Katakana (when split as words which is not always
247             // completely feasible because of separator-less
248             // compounds). Currently we don't try to use the Xapian
249             // speller with other scripts with which it would be usable
250             // in the absence of aspell (it would indeed be better
251             // than nothing with e.g. european languages). This would
252             // require a few more config variables, maybe one day.
253             if (!TextSplit::isKATAKANA(*u8i)) {
254                 return false;
255             }
256 #else
257             return false;
258 #endif
259         }
260         if (term.find_first_of(" !\"#$%&()*+,-./0123456789:;<=>?@[\\]^_`{|}~")
261             != string::npos)
262             return false;
263         return true;
264     }
265 
266     /** Return spelling suggestion */
267     bool getSpellingSuggestions(const string& word,
268                                 std::vector<std::string>& suggs);
269 
270     /* The next two, only for searchdata, should be somehow hidden */
271     /* Return configured stop words */
getStopList()272     const StopList& getStopList() const {return m_stops;}
273     /* Field name to prefix translation (ie: author -> 'A') */
274     bool fieldToTraits(const string& fldname, const FieldTraits **ftpp,
275                        bool isquery = false);
276 
277     /* Update-related methods ******************************************/
278 
279     /** Test if the db entry for the given udi is up to date.
280      *
281      * This is done by comparing the input and stored sigs. This is
282      * used both when indexing and querying (before opening a document
283      * using stale info).
284      *
285      * **This assumes that the udi pertains to the main index (idxi==0).**
286      *
287      * Side-effect when the db is writeable and the document up to
288      * date: set the existence flag for the file document and all
289      * subdocs if any (for later use by 'purge()')
290      *
291      * @param udi Unique Document Identifier (as chosen by indexer).
292      * @param sig New signature (as computed by indexer).
293      * @param xdocid[output] Non-zero if doc existed. Should be considered
294      *    as opaque, to be used for a possible later call to setExistingFlags()
295      *    Note that if inplaceReset is set, the return value is non-zero but not
296      *    an actual docid, it's only used as a flag in this case.
297      * @param osig[output] old signature.
298      */
299     bool needUpdate(const string &udi, const string& sig,
300                     unsigned int *xdocid = 0, std::string *osig = 0);
301 
302     /** Set the existance flags for the document and its eventual subdocuments
303      *
304      * This can be called by the indexer after needUpdate() has returned true,
305      * if the indexer does not wish to actually re-index (e.g.: the doc is
306      * known to cause errors).
307      */
308     void setExistingFlags(const string& udi, unsigned int docid);
309 
310     /** Indicate if we are doing a systematic reindex. This complements
311         needUpdate() return */
inFullReset()312     bool inFullReset() {return o_inPlaceReset || m_mode == DbTrunc;}
313 
314     /** Add or update document identified by unique identifier.
315      * @param config Config object to use. Can be the same as the member config
316      *   or a clone, to avoid sharing when called in multithread context.
317      * @param udi the Unique Document Identifier is opaque to us.
318      *   Maximum size 150 bytes.
319      * @param parent_udi the UDI for the container document. In case of complex
320      *  embedding, this is not always the immediate parent but the UDI for
321      *  the container file (which may be a farther ancestor). It is
322      *  used for purging subdocuments when a file ceases to exist and
323      *  to set the existence flags of all subdocuments of a container
324      *  that is found to be up to date. In other words, the
325      *  parent_udi is the UDI for the ancestor of the document which
326      *  is subject to needUpdate() and physical existence tests (some
327      *  kind of file equivalent). Empty for top-level docs. Should
328      *  probably be renamed container_udi.
329      * @param doc container for document data. Should have been filled as
330      *   much as possible depending on the document type.
331      *   ** doc will be modified in a destructive way **
332      */
333     bool addOrUpdate(const string &udi, const string &parent_udi, Doc &doc);
334 
335 #ifdef IDX_THREADS
336     void waitUpdIdle();
337 #endif
338 
339     /** Delete document(s) for given UDI, including subdocs */
340     bool purgeFile(const string &udi, bool *existed = 0);
341     /** Delete subdocs with an out of date sig. We do this to purge
342         obsolete subdocs during a partial update where no general purge
343         will be done */
344     bool purgeOrphans(const string &udi);
345 
346     /** Remove documents that no longer exist in the file system. This
347      * depends on the update map, which is built during
348      * indexing (needUpdate() / addOrUpdate()).
349      *
350      * This should only be called after a full walk of
351      * the file system, else the update map will not be complete, and
352      * many documents will be deleted that shouldn't, which is why this
353      * has to be called externally, rcldb can't know if the indexing
354      * pass was complete or partial.
355      */
356     bool purge();
357 
358     /** Create stem expansion database for given languages. */
359     bool createStemDbs(const std::vector<std::string> &langs);
360     /** Delete stem expansion database for given language. */
361     bool deleteStemDb(const string &lang);
362 
363     /* Query-related methods ************************************/
364 
365     /** Return total docs in db */
366     int  docCnt();
367     /** Return count of docs which have an occurrence of term */
368     int termDocCnt(const string& term);
369     /** Add extra Xapian database for querying.
370      * @param dir must point to something which can be passed as parameter
371      *      to a Xapian::Database constructor (directory or stub).
372      */
373     bool addQueryDb(const string &dir);
374     /** Remove extra database. if dir == "", remove all. */
375     bool rmQueryDb(const string &dir);
376     /** Set the extra indexes to the input list. */
377     bool setExtraQueryDbs(const std::vector<std::string>& dbs);
378 
379     /** Check if document comes from the main index (this is used to
380         decide if we can update the index for it */
381     bool fromMainIndex(const Doc& doc);
382 
383     /** Retrieve the stored doc text. This returns false if the index does not
384         store raw text or other problems (discriminate with storesDocText().
385         On success, the data is stored in doc.text
386     */
387     bool getDocRawText(Doc& doc);
388 
389     /** Retrieve an index designator for the document result. This is used
390      * by the GUI document history feature for remembering where a
391      * doc comes from and allowing later retrieval (if the ext index
392      * is still active...).
393      */
394     std::string whatIndexForResultDoc(const Doc& doc);
395 
396     /** Tell if directory seems to hold xapian db */
397     static bool testDbDir(const string &dir, bool *stripped = 0);
398 
399     /** Return the index terms that match the input string
400      * Expansion is performed either with either wildcard or regexp processing
401      * Stem expansion is performed if lang is not empty
402      *
403      * @param typ_sens defines the kind of expansion: none, wildcard,
404      *    regexp or stemming. "none" may still expand case,
405      *    diacritics and synonyms, depending on the casesens, diacsens and
406      *    synexp flags.
407      * @param lang sets the stemming language(s). Can be a space-separated list
408      * @param term is the term to expand
409      * @param result is the main output
410      * @param max defines the maximum result count
411      * @param field if set, defines the field within with the expansion should
412      *        be performed. Only used for wildcards and regexps, stemming is
413      *        always global. If this is set, the resulting output terms
414      *        will be appropriately prefixed and the prefix value will be set
415      *        in the TermMatchResult header
416      */
417     enum MatchType {ET_NONE=0, ET_WILD=1, ET_REGEXP=2, ET_STEM=3,
418                     ET_DIACSENS=8, ET_CASESENS=16, ET_SYNEXP=32, ET_PATHELT=64};
matchTypeTp(int tp)419     int matchTypeTp(int tp) {
420         return tp & 7;
421     }
422     bool termMatch(int typ_sens, const string &lang, const string &term,
423                    TermMatchResult& result, int max = -1,
424                    const string& field = "", vector<string> *multiwords = 0);
425     bool dbStats(DbStats& stats, bool listFailed);
426     /** Return min and max years for doc mod times in db */
427     bool maxYearSpan(int *minyear, int *maxyear);
428     /** Return all mime types in index. This can be different from the
429         ones defined in the config because of 'file' command
430         usage. Inserts the types at the end of the parameter */
431     bool getAllDbMimeTypes(std::vector<std::string>&);
432 
433     /** Wildcard expansion specific to file names. Internal/sdata use only */
434     bool filenameWildExp(const string& exp, vector<string>& names, int max);
435 
436     /** Set parameters for synthetic abstract generation */
437     void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
getAbsCtxLen()438     int getAbsCtxLen() const {
439         return m_synthAbsWordCtxLen;
440     }
getAbsLen()441     int getAbsLen() const {
442         return m_synthAbsLen;
443     }
444 
445     /** Get document for given udi and db index
446      *
447      * Used to retrieve ancestor documents.
448      * @param udi The unique document identifier.
449      * @param idxdoc A document from the same database as an opaque way to pass
450      *   the database id (e.g.: when looking for parent in a multi-database
451      *   context).
452      * @param[out] doc The output Recoll document.
453      * @return True for success.
454      */
455     bool getDoc(const string &udi, const Doc& idxdoc, Doc &doc);
456 
457     /** Get document for given udi and index directory.
458      *
459      * Used by the 'history' feature. This supposes that the extra db
460      * is still active.
461      * @param udi The unique document identifier.
462      * @param dbdir The index directory, from storage, as returned by
463      *   whatIndexForResultDoc() at the time of the query. Can be
464      *   empty to mean "main index" (allows the history to avoid
465      *   storing the main dbdir value).
466      * @param[out] doc The output Recoll document.
467      * @return True for success.
468      */
469     bool getDoc(const string &udi, const std::string& dbdir, Doc &doc);
470 
471     /** Test if documents has sub-documents.
472      *
473      * This can always be detected for file-level documents, using the
474      * postlist for the parent term constructed with udi.
475      *
476      * For non file-level documents (e.g.: does an email inside an
477      * mbox have attachments ?), detection is dependant on the filter
478      * having set an appropriate flag at index time. Higher level code
479      * can't detect it because the doc for the parent may have been
480      * seen before any children. The flag is stored as a value in the
481      * index.
482      */
483     bool hasSubDocs(const Doc &idoc);
484 
485     /** Get subdocuments of given document.
486      *
487      * For file-level documents, these are all docs indexed by the
488      * parent term built on idoc.udi. For embedded documents, the
489      * parent doc is looked for, then its subdocs list is
490      * filtered using the idoc ipath as a prefix.
491      */
492     bool getSubDocs(const Doc& idoc, vector<Doc>& subdocs);
493 
494     /** Get container (top level file) document.
495      *
496      * If the input is not a subdocument, this returns a copy of the input.
497      */
498     bool getContainerDoc(const Doc &idoc, Doc& ctdoc);
499 
500     /** Get duplicates (md5) of document */
501     bool docDups(const Doc& idoc, std::vector<Doc>& odocs);
502 
503     /* The following are mainly for the aspell module */
504     /** Whole term list walking. */
505     TermIter *termWalkOpen();
506     bool termWalkNext(TermIter *, string &term);
507     void termWalkClose(TermIter *);
508     /** Test term existence */
509     bool termExists(const string& term);
510     /** Test if terms stem to different roots. */
511     bool stemDiffers(const string& lang, const string& term,
512                      const string& base);
513 
getConf()514     const RclConfig *getConf() {return m_config;}
515 
516     /**
517         Activate the "in place reset" mode where all documents are
518         considered as needing update. This is a global/per-process
519         option, and can't be reset. It should be set at the start of
520         the indexing pass. 2012-10: no idea why this is done this way...
521     */
setInPlaceReset()522     static void setInPlaceReset() {o_inPlaceReset = true;}
523 
524     /** Flush interval get/set. This is used by the first indexing
525         pass to override the config value and flush more rapidly
526         initially so that the user can quickly play with queries */
getFlushMb()527     int getFlushMb() {
528         return  m_flushMb;
529     }
setFlushMb(int mb)530     void setFlushMb(int mb) {
531         m_flushMb = mb;
532     }
533     bool doFlush();
534 
535     // Use empty fn for no synonyms
536     bool setSynGroupsFile(const std::string& fn);
getSynGroups()537     const SynGroups& getSynGroups() {return m_syngroups;}
538 
539     // Mark all documents with an UDI having input as prefix as
540     // existing.  Only works if the UDIs for the store are
541     // hierarchical of course.  Used by FsIndexer to avoid purging
542     // files for a topdir which is on a removable file system and
543     // currently unmounted (topdir does not exist or is empty.
544     bool udiTreeMarkExisting(const string& udi);
545 
546     /* This has to be public for access by embedded Query::Native */
547     Native *m_ndb{nullptr};
548 
549 private:
550     const RclConfig *m_config;
551     string     m_reason; // Error explanation
552 
553     // Xapian directories for additional databases to query
554     vector<string> m_extraDbs;
555     OpenMode m_mode{Db::DbRO};
556     // File existence vector: this is filled during the indexing pass. Any
557     // document whose bit is not set at the end is purged
558     vector<bool> updated;
559     // Text bytes indexed since beginning
560     long long    m_curtxtsz{0};
561     // Text bytes at last flush
562     long long    m_flushtxtsz{0};
563     // Text bytes at last fsoccup check
564     long long    m_occtxtsz{0};
565     // First fs occup check ?
566     int         m_occFirstCheck{1};
567 
568     // Synonym groups. There is no strict reason that this has to be
569     // an Rcl::Db member, as it is only used when building each It
570     // could be a SearchData member, or even a parameter to
571     // Query::setQuery(). Otoh, building the syngroups structure from
572     // a file may be expensive and it's unlikely to change with every
573     // query, so it makes sense to cache it, and Rcl::Db is not a bad
574     // place for this.
575     SynGroups m_syngroups;
576 
577     // Aspell object if needed
578     Aspell *m_aspell{nullptr};
579 
580     /***************
581      * Parameters cached out of the configuration files. Logically const
582      * after init */
583     // Stop terms: those don't get indexed.
584     StopList m_stops;
585     // Truncation length for stored meta fields
586     int         m_idxMetaStoredLen{150};
587     // This is how long an abstract we keep or build from beginning of
588     // text when indexing. It only has an influence on the size of the
589     // db as we are free to shorten it again when displaying
590     int          m_idxAbsTruncLen{250};
591     // Document text truncation length
592     int          m_idxTextTruncateLen{0};
593     // This is the size of the abstract that we synthetize out of query
594     // term contexts at *query time*
595     int          m_synthAbsLen{250};
596     // This is how many words (context size) we keep around query terms
597     // when building the abstract
598     int          m_synthAbsWordCtxLen{4};
599     // Flush threshold. Megabytes of text indexed before we flush.
600     int          m_flushMb{-1};
601     // Maximum file system occupation percentage
602     int          m_maxFsOccupPc{0};
603     // Database directory
604     string       m_basedir;
605     // When this is set, all documents are considered as needing a reindex.
606     // This implements an alternative to just erasing the index before
607     // beginning, with the advantage that, for small index formats updates,
608     // between releases the index remains available while being recreated.
609     static bool o_inPlaceReset;
610     /******* End logical constnesss */
611 
612 #ifdef IDX_THREADS
613     friend void *DbUpdWorker(void*);
614 #endif // IDX_THREADS
615 
616     // Internal form of setExistingFlags: no locking
617     void i_setExistingFlags(const string& udi, unsigned int docid);
618     // Internal form of close, can be called during destruction
619     bool i_close(bool final);
620     // Reinitialize when adding/removing additional dbs
621     bool adjustdbs();
622     bool idxTermMatch(int typ_sens, const string &lang, const string &term,
623                       TermMatchResult& result, int max = -1,
624                       const string& field = cstr_null);
625 
626     // Flush when idxflushmb is reached
627     bool maybeflush(int64_t moretext);
628     bool docExists(const string& uniterm);
629 
630     bool getDoc(const std::string& udi, int idxi, Doc& doc);
631 
632     /* Copyconst and assignment private and forbidden */
Db(const Db &)633     Db(const Db &) {}
634     Db& operator=(const Db &) {return *this;};
635 };
636 
// This has to go somewhere, and as it needs the Xapian version, this is
// the most reasonable place.
std::string version_string();

// Special term and field name values. The actual values are defined outside
// of this header. Note that the last two are not declared const.
extern const std::string pathelt_prefix;
extern const std::string mimetype_prefix;
extern const std::string unsplitFilenameFieldName;
extern std::string start_of_field_term;
extern std::string end_of_field_term;
646 
647 }
648 
649 #endif /* _DB_H_INCLUDED_ */
650