/////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2009-2014 Alan Wright. All rights reserved.
// Distributable under the terms of either the Apache License (Version 2.0)
// or the GNU Lesser General Public License.
/////////////////////////////////////////////////////////////////////////////

#ifndef MEMORYINDEX_H
#define MEMORYINDEX_H

#include "LuceneContrib.h"
#include "IndexReader.h"
#include "TermEnum.h"
#include "Collector.h"
#include "TermPositions.h"
#include "TermPositionVector.h"

namespace Lucene {

/// High-performance single-document main memory Lucene fulltext search index.
///
/// Overview
///
/// This class is a replacement/substitute for a large subset of {@link RAMDirectory} functionality.
/// It is designed to enable maximum efficiency for on-the-fly matchmaking combining structured and
/// fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML message
/// queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and
/// distribution systems, application level routers, firewalls, classifiers, etc.  Rather than
/// targeting fulltext search of infrequent queries over huge persistent data archives (historic
/// search), this class targets fulltext search of huge numbers of queries over comparatively small
/// transient realtime data (prospective search).
///
/// For example as in
/// <pre>
/// double score = search(const String& text, const QueryPtr& query)
/// </pre>
///
/// Each instance can hold at most one Lucene "document", with a document containing zero or more
/// "fields", each field having a name and a fulltext value.  The fulltext value is tokenized
/// (split and transformed) into zero or more index terms (aka words) on addField(), according to
/// the policy implemented by an Analyzer.  For example, Lucene analyzers can split on whitespace,
/// normalize to lower case for case insensitivity, ignore common terms with little discriminatory
/// value such as "he", "in", "and" (stop words), reduce the terms to their natural linguistic root
/// form such as "fishing" being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri
/// (upon indexing and/or querying), etc.
///
/// Note that a Lucene query selects on the field names and associated (indexed) tokenized terms,
/// not on the original fulltext(s) - the latter are not stored but rather thrown away immediately
/// after tokenization.
///
/// For some interesting background information on search technology, see Bob Wyman's <a target="_blank"
/// href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>,
/// Jim Gray's <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&pa=showpage&pid=293&page=4">
/// A Call to Arms - Custom subscriptions</a>, and Tim Bray's <a target="_blank"
/// href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>.
///
///
/// Example Usage
/// <pre>
/// AnalyzerPtr analyzer = newLucene<SimpleAnalyzer>();
/// MemoryIndexPtr index = newLucene<MemoryIndex>();
/// index->addField(L"content", L"Readings about Salmons and other select Alaska fishing Manuals", analyzer);
/// index->addField(L"author", L"Tales of James", analyzer);
/// QueryParserPtr parser = newLucene<QueryParser>(LuceneVersion::LUCENE_CURRENT, L"content", analyzer);
/// double score = index->search(parser->parse(L"+author:james +salmon~ +fish* manual~"));
/// if (score > 0.0)
/// {
///     // it's a match
/// }
/// else
/// {
///     // no match found
/// }
/// </pre>
///
///
/// Performance Notes
///
/// Internally there's a new data structure geared towards efficient indexing and searching, plus
/// the necessary support code to seamlessly plug into the Lucene framework.
///
/// This class performs very well for very small texts (e.g. 10 chars) as well as for large texts
/// (e.g. 10 MB) and everything in between.  Typically, it is about 10-100 times faster than
/// RAMDirectory.  Note that RAMDirectory has particularly large efficiency overheads for small to
/// medium sized texts, both in time and space.  Indexing a field with N tokens takes O(N) in the
/// best case, and O(N log N) in the worst case.  Memory consumption is probably larger than for
/// RAMDirectory.
///
class LPPCONTRIBAPI MemoryIndex : public LuceneObject {
public:
    /// Constructs an empty instance that can optionally store the start and end character offset
    /// of each token term in the text.  This can be useful for highlighting of hit locations with
    /// the Lucene highlighter package.  Private until the highlighter package matures, so that
    /// this can actually be meaningfully integrated.
    /// @param storeOffsets Whether or not to store the start and end character offset of each
    /// token term in the text.
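    ///
    /// A minimal usage sketch (enabling offsets here is purely illustrative):
    /// <pre>
    /// // store start/end character offsets so hit locations can later be highlighted
    /// MemoryIndexPtr index = newLucene<MemoryIndex>(true);
    /// </pre>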
    MemoryIndex(bool storeOffsets = false);

    virtual ~MemoryIndex();

    LUCENE_CLASS(MemoryIndex);

protected:
    /// info for each field
    MapStringMemoryIndexInfo fields;

    /// fields sorted ascending by fieldName; lazily computed on demand
    CollectionStringMemoryIndexInfo sortedFields;

    /// number of array slots per token position: 1 without stored offsets, 3 with them
    /// pos: positions[3 * i], startOffset: positions[3 * i + 1], endOffset: positions[3 * i + 2]
    int32_t stride;

    static const double docBoost;

public:
    /// Convenience method; Tokenizes the given field text and adds the resulting terms to the
    /// index; Equivalent to adding an indexed non-keyword Lucene {@link Field} that is {@link
    /// Field::INDEX_ANALYZED tokenized}, {@link Field::STORE_NO not stored}, {@link
    /// Field::TERM_VECTOR_WITH_POSITIONS termVectorStored with positions} (or {@link
    /// Field::TERM_VECTOR_WITH_POSITIONS_OFFSETS termVectorStored with positions and offsets})
    /// @param fieldName A name to be associated with the text
    /// @param text The text to tokenize and index.
    /// @param analyzer The analyzer to use for tokenization
    void addField(const String& fieldName, const String& text, const AnalyzerPtr& analyzer);

    /// Iterates over the given token stream and adds the resulting terms to the index;
    /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
    /// Field}.  Finally closes the token stream.  Note that untokenized keywords can be added
    /// with this method via {@link #keywordTokenStream(Collection)}, the Lucene contrib
    /// KeywordTokenizer or similar utilities.
    /// @param fieldName A name to be associated with the text.
    /// @param stream The token stream to retrieve tokens from.
    /// @param boost The boost factor for hits for this field.
    /// @see Field#setBoost(double)
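    ///
    /// A minimal sketch, assuming an analyzer, an index instance and a text string as in the
    /// class example above (the use of StringReader here is only illustrative):
    /// <pre>
    /// TokenStreamPtr stream = analyzer->tokenStream(L"content", newLucene<StringReader>(text));
    /// index->addField(L"content", stream, 2.0); // double the weight of hits in this field
    /// </pre>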
    void addField(const String& fieldName, const TokenStreamPtr& stream, double boost = 1.0);

    /// Creates and returns a searcher that can be used to execute arbitrary Lucene queries
    /// and to collect the resulting query results as hits.
    /// @return a searcher
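    ///
    /// A minimal sketch, assuming a query parsed as in the class example above:
    /// <pre>
    /// IndexSearcherPtr searcher = index->createSearcher();
    /// TopDocsPtr hits = searcher->search(query, 1); // at most one document can match
    /// </pre>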
    IndexSearcherPtr createSearcher();

    /// Convenience method that efficiently returns the relevance score by matching this index
    /// against the given Lucene query expression.
    /// @param query An arbitrary Lucene query to run against this index
    /// @return the relevance score of the matchmaking; A number in the range [0.0 .. 1.0],
    /// with 0.0 indicating no match. The higher the number the better the match.
    double search(const QueryPtr& query);

protected:
    int32_t numPositions(Collection<int32_t> positions);

    /// sorts into ascending order (on demand), reusing memory along the way
    void sortFields();

    friend class MemoryIndexReader;
    friend class MemoryIndexInfo;
    friend class MemoryIndexTermEnum;
    friend class MemoryIndexTermPositions;
    friend class MemoryIndexTermPositionVector;
};

/// Index data structure for a field; Contains the tokenized term texts and their positions.
class LPPCONTRIBAPI MemoryIndexInfo : public LuceneObject {
public:
    MemoryIndexInfo(MapStringIntCollection terms, int32_t numTokens, int32_t numOverlapTokens, double boost);
    virtual ~MemoryIndexInfo();

    LUCENE_CLASS(MemoryIndexInfo);

protected:
    /// Term strings and their positions for this field
    MapStringIntCollection terms;

    /// Terms sorted ascending by term text; computed on demand
    CollectionStringIntCollection sortedTerms;

    /// Number of added tokens for this field
    int32_t numTokens;

    /// Number of overlapping tokens for this field
    int32_t numOverlapTokens;

    /// Boost factor for hits for this field
    double boost;

    /// Term for this field's fieldName, lazily computed on demand
    TermPtr _template;

public:
    /// Sorts hashed terms into ascending order, reusing memory along the way.  Note that
    /// sorting is lazily delayed until required (often it's not required at all).
    void sortTerms();

    /// Note that the frequency can be calculated as numPositions(getPositions(x))
    Collection<int32_t> getPositions(const String& term);

    /// Note that the frequency can be calculated as numPositions(getPositions(x))
    Collection<int32_t> getPositions(int32_t pos);

    double getBoost();

    friend class MemoryIndexReader;
    friend class MemoryIndexTermEnum;
    friend class MemoryIndexTermPositions;
    friend class MemoryIndexTermPositionVector;
};

/// Search support for Lucene framework integration; implements all methods required by the
/// Lucene IndexReader contracts.
class LPPCONTRIBAPI MemoryIndexReader : public IndexReader {
public:
    MemoryIndexReader(const MemoryIndexPtr& memoryIndex);
    virtual ~MemoryIndexReader();

    LUCENE_CLASS(MemoryIndexReader);

public:
    static TermPtr MATCH_ALL_TERM();

protected:
    MemoryIndexPtr memoryIndex;
    SearcherWeakPtr _searcher; // needed to find searcher.getSimilarity()

    /// cache norms to avoid repeated expensive calculations
    ByteArray cachedNorms;
    String cachedFieldName;
    SimilarityPtr cachedSimilarity;

protected:
    MemoryIndexInfoPtr getInfo(const String& fieldName);
    MemoryIndexInfoPtr getInfo(int32_t pos);

    SimilarityPtr getSimilarity();
    void setSearcher(const SearcherPtr& searcher);

public:
    virtual int32_t docFreq(const TermPtr& t);
    virtual TermEnumPtr terms();
    virtual TermEnumPtr terms(const TermPtr& t);
    virtual TermPositionsPtr termPositions();
    virtual TermDocsPtr termDocs();
    virtual Collection<TermFreqVectorPtr> getTermFreqVectors(int32_t docNumber);
    virtual void getTermFreqVector(int32_t docNumber, const String& field, const TermVectorMapperPtr& mapper);
    virtual void getTermFreqVector(int32_t docNumber, const TermVectorMapperPtr& mapper);
    virtual TermFreqVectorPtr getTermFreqVector(int32_t docNumber, const String& field);
    virtual ByteArray norms(const String& field);
    virtual void norms(const String& field, ByteArray norms, int32_t offset);
    virtual void doSetNorm(int32_t doc, const String& field, uint8_t value);
    virtual int32_t numDocs();
    virtual int32_t maxDoc();
    virtual DocumentPtr document(int32_t n);
    virtual DocumentPtr document(int32_t n, const FieldSelectorPtr& fieldSelector);
    virtual bool isDeleted(int32_t n);
    virtual bool hasDeletions();
    virtual void doDelete(int32_t docNum);
    virtual void doUndeleteAll();
    virtual void doCommit(MapStringString commitUserData);
    virtual void doClose();
    virtual HashSet<String> getFieldNames(FieldOption fieldOption);

    friend class MemoryIndex;
    friend class MemoryIndexTermEnum;
    friend class MemoryIndexTermPositions;
    friend class MemoryIndexTermPositionVector;
};

class LPPCONTRIBAPI MemoryIndexTermEnum : public TermEnum {
public:
    MemoryIndexTermEnum(const MemoryIndexReaderPtr& reader, int32_t ix, int32_t jx);
    virtual ~MemoryIndexTermEnum();

    LUCENE_CLASS(MemoryIndexTermEnum);

protected:
    MemoryIndexReaderWeakPtr _reader;
    int32_t i;
    int32_t j;

public:
    virtual bool next();
    virtual TermPtr term();
    virtual int32_t docFreq();
    virtual void close();

protected:
    TermPtr createTerm(const MemoryIndexInfoPtr& info, int32_t pos, const String& text);
};

class LPPCONTRIBAPI MemoryIndexCollector : public Collector {
public:
    MemoryIndexCollector(Collection<double> scores);
    virtual ~MemoryIndexCollector();

    LUCENE_CLASS(MemoryIndexCollector);

protected:
    Collection<double> scores;
    ScorerPtr scorer;

public:
    virtual void collect(int32_t doc);
    virtual void setScorer(const ScorerPtr& scorer);
    virtual bool acceptsDocsOutOfOrder();
    virtual void setNextReader(const IndexReaderPtr& reader, int32_t docBase);
};

class LPPCONTRIBAPI MemoryIndexTermPositions : public TermPositions, public LuceneObject {
public:
    MemoryIndexTermPositions(const MemoryIndexReaderPtr& reader);
    virtual ~MemoryIndexTermPositions();

    LUCENE_CLASS(MemoryIndexTermPositions);

protected:
    MemoryIndexReaderWeakPtr _reader;
    bool hasNext;
    int32_t cursor;
    Collection<int32_t> current;
    TermPtr term;

public:
    virtual void seek(const TermPtr& term);
    virtual void seek(const TermEnumPtr& termEnum);
    virtual int32_t doc();
    virtual int32_t freq();
    virtual bool next();
    virtual int32_t read(Collection<int32_t> docs, Collection<int32_t> freqs);
    virtual bool skipTo(int32_t target);
    virtual void close();

    virtual int32_t nextPosition();
    virtual int32_t getPayloadLength();
    virtual ByteArray getPayload(ByteArray data, int32_t offset);
    virtual bool isPayloadAvailable();
};

class MemoryIndexTermPositionVector : public TermPositionVector, public LuceneObject {
public:
    MemoryIndexTermPositionVector(const MemoryIndexReaderPtr& reader, const MemoryIndexInfoPtr& info, const String& fieldName);
    virtual ~MemoryIndexTermPositionVector();

    LUCENE_CLASS(MemoryIndexTermPositionVector);

protected:
    MemoryIndexReaderWeakPtr _reader;
    CollectionStringIntCollection sortedTerms;
    String fieldName;

public:
    virtual String getField();
    virtual int32_t size();
    virtual Collection<String> getTerms();
    virtual Collection<int32_t> getTermFrequencies();
    virtual int32_t indexOf(const String& term);
    virtual Collection<int32_t> indexesOf(Collection<String> terms, int32_t start, int32_t length);

    virtual Collection<int32_t> getTermPositions(int32_t index);
    virtual Collection<TermVectorOffsetInfoPtr> getOffsets(int32_t index);
};

}

#endif