1 ///////////////////////////////////////////////////////////////////////////// 2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved. 3 // Distributable under the terms of either the Apache License (Version 2.0) 4 // or the GNU Lesser General Public License. 5 ///////////////////////////////////////////////////////////////////////////// 6 7 #ifndef MEMORYINDEX_H 8 #define MEMORYINDEX_H 9 10 #include "LuceneContrib.h" 11 #include "IndexReader.h" 12 #include "TermEnum.h" 13 #include "Collector.h" 14 #include "TermPositions.h" 15 #include "TermPositionVector.h" 16 17 namespace Lucene { 18 19 /// High-performance single-document main memory Lucene fulltext search index. 20 /// 21 /// Overview 22 /// 23 /// This class is a replacement/substitute for a large subset of {@link RAMDirectory} functionality. 24 /// It is designed to enable maximum efficiency for on-the-fly matchmaking combining structured and 25 /// fuzzy fulltext search in realtime streaming applications such as Nux XQuery based XML message 26 /// queues, publish-subscribe systems for Blogs/newsfeeds, text chat, data acquisition and 27 /// distribution systems, application level routers, firewalls, classifiers, etc. Rather than 28 /// targeting fulltext search of infrequent queries over huge persistent data archives (historic 29 /// search), this class targets fulltext search of huge numbers of queries over comparatively small 30 /// transient realtime data (prospective search). 31 /// 32 /// For example as in 33 /// <pre> 34 /// double score = search(const String& text, const QueryPtr& query) 35 /// </pre> 36 /// 37 /// Each instance can hold at most one Lucene "document", with a document containing zero or more 38 /// "fields", each field having a name and a fulltext value. The fulltext value is tokenized 39 /// (split and transformed) into zero or more index terms (aka words) on addField(), according to 40 /// the policy implemented by an Analyzer. For example, Lucene analyzers can split on whitespace, 41 /// normalize to lower case for case insensitivity, ignore common terms with little discriminatory 42 /// value such as "he", "in", "and" (stop words), reduce the terms to their natural linguistic root 43 /// form such as "fishing" being reduced to "fish" (stemming), resolve synonyms/inflexions/thesauri 44 /// (upon indexing and/or querying), etc. 45 /// 46 /// Note that a Lucene query selects on the field names and associated (indexed) tokenized terms, 47 /// not on the original fulltext(s) - the latter are not stored but rather thrown away immediately 48 /// after tokenization. 49 /// 50 /// For some interesting background information on search technology, see Bob Wyman's <a target="_blank" 51 /// href="http://bobwyman.pubsub.com/main/2005/05/mary_hodder_poi.html">Prospective Search</a>, 52 /// Jim Gray's <a target="_blank" href="http://www.acmqueue.org/modules.php?name=Content&pa=showpage&pid=293&page=4"> 53 /// A Call to Arms - Custom subscriptions</a>, and Tim Bray's <a target="_blank" 54 /// href="http://www.tbray.org/ongoing/When/200x/2003/07/30/OnSearchTOC">On Search, the Series</a>. 55 /// 56 /// 57 /// Example Usage 58 /// <pre> 59 /// AnalyzerPtr analyzer = newLucene<SimpleAnalyzer>(); 60 /// MemoryIndexPtr index = newLucene<MemoryIndex>(); 61 /// index->addField(L"content", L"Readings about Salmons and other select Alaska fishing Manuals", analyzer); 62 /// index->addField(L"author", L"Tales of James", analyzer); 63 /// QueryParserPtr parser = newLucene<QueryParser>(L"content", analyzer); 64 /// double score = index->search(parser->parse(L"+author:james +salmon~ +fish* manual~")); 65 /// if (score > 0.0) 66 /// { 67 /// // it's a match 68 /// } 69 /// else 70 /// { 71 /// // no match found 72 /// } 73 /// </pre> 74 /// 75 /// 76 /// Performance Notes 77 /// 78 /// Internally there's a new data structure geared towards efficient indexing and searching, plus 79 /// the necessary support code to seamlessly plug into the Lucene framework. 80 /// 81 /// This class performs very well for very small texts (eg. 10 chars) as well as for large texts 82 /// (eg. 10 MB) and everything in between. Typically, it is about 10-100 times faster than 83 /// RAMDirectory. Note that RAMDirectory has particularly large efficiency overheads for small to 84 /// medium sized texts, both in time and space. Indexing a field with N tokens takes O(N) in the 85 /// best case, and O(N logN) in the worst case. Memory consumption is probably larger than for 86 /// RAMDirectory. 87 /// 88 class LPPCONTRIBAPI MemoryIndex : public LuceneObject { 89 public: 90 /// Constructs an empty instance that can optionally store the start and end character offset 91 /// of each token term in the text. This can be useful for highlighting of hit locations with 92 /// the Lucene highlighter package. Private until the highlighter package matures, so that 93 /// this can actually be meaningfully integrated. 94 /// @param storeOffsets Whether or not to store the start and end character offset of each 95 /// token term in the text. 96 MemoryIndex(bool storeOffsets = false); 97 98 virtual ~MemoryIndex(); 99 100 LUCENE_CLASS(MemoryIndex); 101 102 protected: 103 /// info for each field 104 MapStringMemoryIndexInfo fields; 105 106 /// fields sorted ascending by fieldName; lazily computed on demand 107 CollectionStringMemoryIndexInfo sortedFields; 108 109 /// pos: positions[3 * i], startOffset: positions[3 * i + 1], endOffset: positions[3 * i + 2] 110 int32_t stride; 111 112 static const double docBoost; 113 114 public: 115 /// Convenience method; Tokenizes the given field text and adds the resulting terms to the 116 /// index; Equivalent to adding an indexed non-keyword Lucene {@link Field} that is {@link 117 /// Field::INDEX_ANALYZED tokenized}, {@link Field::STORE_NO not stored}, {@link 118 /// Field::TERM_VECTOR_WITH_POSITIONS termVectorStored with positions} (or {@link 119 /// Field::TERM_VECTOR_WITH_POSITIONS_OFFSETS termVectorStored with positions and offsets}) 120 /// @param fieldName A name to be associated with the text 121 /// @param text The text to tokenize and index. 122 /// @param analyzer The analyzer to use for tokenization 123 void addField(const String& fieldName, const String& text, const AnalyzerPtr& analyzer); 124 125 /// Iterates over the given token stream and adds the resulting terms to the index; 126 /// Equivalent to adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link 127 /// Field}. Finally closes the token stream. Note that untokenized keywords can be added 128 /// with this method via {@link #keywordTokenStream(Collection)}, the Lucene contrib 129 /// KeywordTokenizer or similar utilities. 130 /// @param fieldName A name to be associated with the text. 131 /// @param stream The token stream to retrieve tokens from. 132 /// @param boost The boost factor for hits for this field. 133 /// @see Field#setBoost(double) 134 void addField(const String& fieldName, const TokenStreamPtr& stream, double boost = 1.0); 135 136 /// Creates and returns a searcher that can be used to execute arbitrary Lucene queries 137 /// and to collect the resulting query results as hits. 138 /// @return a searcher 139 IndexSearcherPtr createSearcher(); 140 141 /// Convenience method that efficiently returns the relevance score by matching this index 142 /// against the given Lucene query expression. 143 /// @param query An arbitrary Lucene query to run against this index 144 /// @return the relevance score of the matchmaking; A number in the range [0.0 .. 1.0], 145 /// with 0.0 indicating no match. The higher the number the better the match. 146 double search(const QueryPtr& query); 147 148 protected: 149 int32_t numPositions(Collection<int32_t> positions); 150 151 /// sorts into ascending order (on demand), reusing memory along the way 152 void sortFields(); 153 154 friend class MemoryIndexReader; 155 friend class MemoryIndexInfo; 156 friend class MemoryIndexTermEnum; 157 friend class MemoryIndexTermPositions; 158 friend class MemoryIndexTermPositionVector; 159 }; 160 161 /// Index data structure for a field; Contains the tokenized term texts and their positions. 162 class LPPCONTRIBAPI MemoryIndexInfo : public LuceneObject { 163 public: 164 MemoryIndexInfo(MapStringIntCollection terms, int32_t numTokens, int32_t numOverlapTokens, double boost); 165 virtual ~MemoryIndexInfo(); 166 167 LUCENE_CLASS(MemoryIndexInfo); 168 169 protected: 170 /// Term strings and their positions for this field 171 MapStringIntCollection terms; 172 173 /// Terms sorted ascending by term text; computed on demand 174 CollectionStringIntCollection sortedTerms; 175 176 /// Number of added tokens for this field 177 int32_t numTokens; 178 179 /// Number of overlapping tokens for this field 180 int32_t numOverlapTokens; 181 182 /// Boost factor for hits for this field 183 double boost; 184 185 /// Term for this field's fieldName, lazily computed on demand 186 TermPtr _template; 187 188 public: 189 /// Sorts hashed terms into ascending order, reusing memory along the way. Note that 190 /// sorting is lazily delayed until required (often it's not required at all). 191 void sortTerms(); 192 193 /// Note that the frequency can be calculated as numPosition(getPositions(x)) 194 Collection<int32_t> getPositions(const String& term); 195 196 /// Note that the frequency can be calculated as numPosition(getPositions(x)) 197 Collection<int32_t> getPositions(int32_t pos); 198 199 double getBoost(); 200 201 friend class MemoryIndexReader; 202 friend class MemoryIndexTermEnum; 203 friend class MemoryIndexTermPositions; 204 friend class MemoryIndexTermPositionVector; 205 }; 206 207 /// Search support for Lucene framework integration; implements all methods required by the 208 /// Lucene IndexReader contracts. 209 class LPPCONTRIBAPI MemoryIndexReader : public IndexReader { 210 public: 211 MemoryIndexReader(const MemoryIndexPtr& memoryIndex); 212 virtual ~MemoryIndexReader(); 213 214 LUCENE_CLASS(MemoryIndexReader); 215 216 public: 217 static TermPtr MATCH_ALL_TERM(); 218 219 protected: 220 MemoryIndexPtr memoryIndex; 221 SearcherWeakPtr _searcher; // needed to find searcher.getSimilarity() 222 223 /// cache norms to avoid repeated expensive calculations 224 ByteArray cachedNorms; 225 String cachedFieldName; 226 SimilarityPtr cachedSimilarity; 227 228 protected: 229 MemoryIndexInfoPtr getInfo(const String& fieldName); 230 MemoryIndexInfoPtr getInfo(int32_t pos); 231 232 SimilarityPtr getSimilarity(); 233 void setSearcher(const SearcherPtr& searcher); 234 235 public: 236 virtual int32_t docFreq(const TermPtr& t); 237 virtual TermEnumPtr terms(); 238 virtual TermEnumPtr terms(const TermPtr& t); 239 virtual TermPositionsPtr termPositions(); 240 virtual TermDocsPtr termDocs(); 241 virtual Collection<TermFreqVectorPtr> getTermFreqVectors(int32_t docNumber); 242 virtual void getTermFreqVector(int32_t docNumber, const String& field, const TermVectorMapperPtr& mapper); 243 virtual void getTermFreqVector(int32_t docNumber, const TermVectorMapperPtr& mapper); 244 virtual TermFreqVectorPtr getTermFreqVector(int32_t docNumber, const String& field); 245 virtual ByteArray norms(const String& field); 246 virtual void norms(const String& field, ByteArray norms, int32_t offset); 247 virtual void doSetNorm(int32_t doc, const String& field, uint8_t value); 248 virtual int32_t numDocs(); 249 virtual int32_t maxDoc(); 250 virtual DocumentPtr document(int32_t n); 251 virtual DocumentPtr document(int32_t n, const FieldSelectorPtr& fieldSelector); 252 virtual bool isDeleted(int32_t n); 253 virtual bool hasDeletions(); 254 virtual void doDelete(int32_t docNum); 255 virtual void doUndeleteAll(); 256 virtual void doCommit(MapStringString commitUserData); 257 virtual void doClose(); 258 virtual HashSet<String> getFieldNames(FieldOption fieldOption); 259 260 friend class MemoryIndex; 261 friend class MemoryIndexTermEnum; 262 friend class MemoryIndexTermPositions; 263 friend class MemoryIndexTermPositionVector; 264 }; 265 266 class LPPCONTRIBAPI MemoryIndexTermEnum : public TermEnum { 267 public: 268 MemoryIndexTermEnum(const MemoryIndexReaderPtr& reader, int32_t ix, int32_t jx); 269 virtual ~MemoryIndexTermEnum(); 270 271 LUCENE_CLASS(MemoryIndexTermEnum); 272 273 protected: 274 MemoryIndexReaderWeakPtr _reader; 275 int32_t i; 276 int32_t j; 277 278 public: 279 virtual bool next(); 280 virtual TermPtr term(); 281 virtual int32_t docFreq(); 282 virtual void close(); 283 284 protected: 285 TermPtr createTerm(const MemoryIndexInfoPtr& info, int32_t pos, const String& text); 286 }; 287 288 class LPPCONTRIBAPI MemoryIndexCollector : public Collector { 289 public: 290 MemoryIndexCollector(Collection<double> scores); 291 virtual ~MemoryIndexCollector(); 292 293 LUCENE_CLASS(MemoryIndexCollector); 294 295 protected: 296 Collection<double> scores; 297 ScorerPtr scorer; 298 299 public: 300 virtual void collect(int32_t doc); 301 virtual void setScorer(const ScorerPtr& scorer); 302 virtual bool acceptsDocsOutOfOrder(); 303 virtual void setNextReader(const IndexReaderPtr& reader, int32_t docBase); 304 }; 305 306 class LPPCONTRIBAPI MemoryIndexTermPositions : public TermPositions, public LuceneObject { 307 public: 308 MemoryIndexTermPositions(const MemoryIndexReaderPtr& reader); 309 virtual ~MemoryIndexTermPositions(); 310 311 LUCENE_CLASS(MemoryIndexTermPositions); 312 313 protected: 314 MemoryIndexReaderWeakPtr _reader; 315 bool hasNext; 316 int32_t cursor; 317 Collection<int32_t> current; 318 TermPtr term; 319 320 public: 321 virtual void seek(const TermPtr& term); 322 virtual void seek(const TermEnumPtr& termEnum); 323 virtual int32_t doc(); 324 virtual int32_t freq(); 325 virtual bool next(); 326 virtual int32_t read(Collection<int32_t> docs, Collection<int32_t> freqs); 327 virtual bool skipTo(int32_t target); 328 virtual void close(); 329 330 virtual int32_t nextPosition(); 331 virtual int32_t getPayloadLength(); 332 virtual ByteArray getPayload(ByteArray data, int32_t offset); 333 virtual bool isPayloadAvailable(); 334 }; 335 336 class MemoryIndexTermPositionVector : public TermPositionVector, public LuceneObject { 337 public: 338 MemoryIndexTermPositionVector(const MemoryIndexReaderPtr& reader, const MemoryIndexInfoPtr& info, const String& fieldName); 339 virtual ~MemoryIndexTermPositionVector(); 340 341 LUCENE_CLASS(MemoryIndexTermPositionVector); 342 343 protected: 344 MemoryIndexReaderWeakPtr _reader; 345 CollectionStringIntCollection sortedTerms; 346 String fieldName; 347 348 public: 349 virtual String getField(); 350 virtual int32_t size(); 351 virtual Collection<String> getTerms(); 352 virtual Collection<int32_t> getTermFrequencies(); 353 virtual int32_t indexOf(const String& term); 354 virtual Collection<int32_t> indexesOf(Collection<String> terms, int32_t start, int32_t length); 355 356 virtual Collection<int32_t> getTermPositions(int32_t index); 357 virtual Collection<TermVectorOffsetInfoPtr> getOffsets(int32_t index); 358 }; 359 360 } 361 362 #endif 363