1 /*
2  *  Copyright 2005-2021 Fabrice Colin
3  *
4  *  This program is free software; you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation; either version 2 of the License, or
7  *  (at your option) any later version.
8  *
9  *  This program is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with this program; if not, write to the Free Software
16  *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17  */
18 
19 #ifndef _XAPIAN_INDEX_H
20 #define _XAPIAN_INDEX_H
21 
22 #include <string>
23 #include <set>
24 #include <map>
25 
26 #include "config.h"
27 #include "CJKVTokenizer.h"
28 #include "XapianDatabase.h"
29 #include "IndexInterface.h"
30 
31 /// A Xapian-based index: the IndexInterface implementation declared against XapianDatabase.h.
32 class XapianIndex : public IndexInterface
33 {
34 	public:
		/// Builds an index object for the index identified by indexName; isGood() reports whether it could be opened.
35 		XapianIndex(const std::string &indexName);
		/// Copy constructor.
36 		XapianIndex(const XapianIndex &other);
37 		virtual ~XapianIndex();
38 
		/// Copy assignment.
39 		XapianIndex &operator=(const XapianIndex &other);
40 
41 		/// Tells whether the index is usable; returns false if the index couldn't be opened.
42 		virtual bool isGood(void) const;
43 
44 		/// Gets the metadata value associated with the given name.
45 		virtual std::string getMetadata(const std::string &name) const;
46 
47 		/// Sets the metadata value for the given name. Declared const, so it is callable on a const index.
48 		virtual bool setMetadata(const std::string &name, const std::string &value) const;
49 
50 		/// Gets the index location, as a string.
51 		virtual std::string getLocation(void) const;
52 
53 		/// Retrieves the properties of document docId into docInfo.
54 		virtual bool getDocumentInfo(unsigned int docId, DocumentInfo &docInfo) const;
55 
56 		/// Returns the number of terms in document docId.
57 		virtual unsigned int getDocumentTermsCount(unsigned int docId) const;
58 
59 		/// Retrieves document docId's terms into wordsBuffer (keyed by unsigned int, presumably the term position - confirm).
60 		virtual bool getDocumentTerms(unsigned int docId,
61 			std::map<unsigned int, std::string> &wordsBuffer) const;
62 
63 		/// Sets the list of known labels (resetLabels presumably discards the existing list first - confirm).
64 		virtual bool setLabels(const std::set<std::string> &labels, bool resetLabels);
65 
66 		/// Gets the list of known labels into labels.
67 		virtual bool getLabels(std::set<std::string> &labels) const;
68 
69 		/// Adds a label called name.
70 		virtual bool addLabel(const std::string &name);
71 
72 		/// Deletes all references to the label called name.
73 		virtual bool deleteLabel(const std::string &name);
74 
75 		/// Determines whether document docId has the label called name.
76 		virtual bool hasLabel(unsigned int docId, const std::string &name) const;
77 
78 		/// Retrieves document docId's labels into labels.
79 		virtual bool getDocumentLabels(unsigned int docId, std::set<std::string> &labels) const;
80 
81 		/// Sets document docId's labels; resetLabels defaults to true.
82 		virtual bool setDocumentLabels(unsigned int docId, const std::set<std::string> &labels,
83 			bool resetLabels = true);
84 
85 		/// Sets the labels of every document listed in docIds; resetLabels defaults to true.
86 		virtual bool setDocumentsLabels(const std::set<unsigned int> &docIds,
87 			const std::set<std::string> &labels, bool resetLabels = true);
88 
89 		/// Checks whether the given URL is in the index (the unsigned int result is presumably the document ID, 0 if absent - confirm).
90 		virtual unsigned int hasDocument(const std::string &url) const;
91 
92 		/// Gets terms with the same root as term into suggestions (the result is presumably the number found - confirm).
93 		virtual unsigned int getCloseTerms(const std::string &term, std::set<std::string> &suggestions);
94 
95 		/// Returns the ID of the last document.
96 		virtual unsigned int getLastDocumentID(void) const;
97 
98 		/// Returns the number of documents; labelName defaults to "" (presumably meaning no label filter - confirm).
99 		virtual unsigned int getDocumentsCount(const std::string &labelName = "") const;
100 
101 		/// Lists document IDs into docIDList; maxDocsCount and startDoc default to 0 (presumably a paging window - confirm).
102 		virtual unsigned int listDocuments(std::set<unsigned int> &docIDList,
103 			unsigned int maxDocsCount = 0, unsigned int startDoc = 0) const;
104 
105 		/// Lists into docIds the documents matching name, interpreted according to type; maxDocsCount and startDoc default to 0.
106 		virtual bool listDocuments(const std::string &name, std::set<unsigned int> &docIds,
107 			NameType type, unsigned int maxDocsCount = 0, unsigned int startDoc = 0) const;
108 
109 		/// Indexes the given document with the given labels (docId is an output reference, presumably set to the new document's ID - confirm).
110 		virtual bool indexDocument(const Document &doc, const std::set<std::string> &labels,
111 			unsigned int &docId);
112 
113 		/// Updates document docId with the given document.
114 		virtual bool updateDocument(unsigned int docId, const Document &doc);
115 
116 		/// Updates document docId's properties from docInfo.
117 		virtual bool updateDocumentInfo(unsigned int docId, const DocumentInfo &docInfo);
118 
119 		/// Unindexes the document whose ID is docId.
120 		virtual bool unindexDocument(unsigned int docId);
121 
122 		/// Unindexes the document at the given location.
123 		virtual bool unindexDocument(const std::string &location);
124 
125 		/// Unindexes the documents matching name, interpreted according to type.
126 		virtual bool unindexDocuments(const std::string &name, NameType type);
127 
128 		/// Unindexes all documents.
129 		virtual bool unindexAllDocuments(void);
130 
131 		/// Flushes recent changes to the disk.
132 		virtual bool flush(void);
133 
134 		/// Reopens the index. Declared const, so it is callable on a const index.
135 		virtual bool reopen(void) const;
136 
137 		/// Resets the index.
138 		virtual bool reset(void);
139 
140 	protected:
141 		std::string m_databaseName; ///< Presumably the database name/location given to the constructor - confirm.
142 		bool m_goodIndex; ///< Presumably the state reported by isGood() - confirm.
143 		bool m_doSpelling; ///< Presumably whether spelling data is maintained - confirm.
144 		std::string m_stemLanguage; ///< Presumably the language used for stemming - confirm.
145 
		/// Lists into docIds the documents for the given term (presumably those indexed with it - confirm);
		/// maxDocsCount and startDoc default to 0.
146 		bool listDocumentsWithTerm(const std::string &term, std::set<unsigned int> &docIds,
147 			unsigned int maxDocsCount = 0, unsigned int startDoc = 0) const;
148 
		/// Adds postings to doc from the UTF-8 iterator itor, using the given term prefix.
		/// doSpelling and termPos are non-const references, so the callee may modify them
		/// (termPos is presumably advanced past the added terms - confirm).
149 		void addPostingsToDocument(const Xapian::Utf8Iterator &itor, Xapian::Document &doc,
150 			const Xapian::WritableDatabase &db, const std::string &prefix,
151 			bool noStemming, bool &doSpelling,  Xapian::termcount &termPos) const;
152 
		/// Overload that takes the text itself plus a CJKV tokenizer and a stemmer pointer.
		/// NOTE(review): whether pStemmer may be NULL isn't visible from this header - confirm.
153 		void addPostingsToDocument(Dijon::CJKVTokenizer &tokenizer, Xapian::Stem *pStemmer,
154 			const std::string &text, Xapian::Document &doc,
155 			const Xapian::WritableDatabase &db, const std::string &prefix,
156 			bool &doSpelling, Xapian::termcount &termPos) const;
157 
		/// Adds the given labels to doc. Static: needs no XapianIndex instance.
		/// skipInternals presumably leaves out internal labels - confirm.
158 		static void addLabelsToDocument(Xapian::Document &doc,
159 			const std::set<std::string> &labels, bool skipInternals);
160 
		/// Counterpart of the Utf8Iterator addPostingsToDocument overload: removes postings from doc.
161 		void removePostingsFromDocument(const Xapian::Utf8Iterator &itor, Xapian::Document &doc,
162 			const Xapian::WritableDatabase &db, const std::string &prefix,
163 			bool noStemming, bool &doSpelling) const;
164 
		/// Adds to doc the common terms derived from info (presumably title, location and the like - confirm);
		/// termPos is a non-const reference, so the callee may modify it.
165 		void addCommonTerms(const DocumentInfo &info, Xapian::Document &doc,
166 			const Xapian::WritableDatabase &db, Xapian::termcount &termPos);
167 
		/// Removes the common terms from doc; counterpart of addCommonTerms.
168 		void removeCommonTerms(Xapian::Document &doc, const Xapian::WritableDatabase &db);
169 
		/// Scans the dataLength bytes at pData and returns a string
		/// (presumably the language detected, with suggestedLanguage as a hint - confirm).
170 		std::string scanDocument(const std::string &suggestedLanguage,
171 			const char *pData, off_t dataLength);
172 
		/// Sets doc's data from info and the given language.
173 		void setDocumentData(const DocumentInfo &info, Xapian::Document &doc,
174 			const std::string &language) const;
175 
		/// Deletes documents by term (presumably every document indexed with it - confirm).
176 		bool deleteDocuments(const std::string &term);
177 
178 };
179 
180 #endif // _XAPIAN_INDEX_H
181