/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
7 #ifndef _lucene_index_SegmentInfos_
8 #define _lucene_index_SegmentInfos_
9
10
11 //#include "IndexReader.h"
12 #include "CLucene/util/Misc.h"
13 #include "_IndexFileNames.h"
CL_CLASS_DEF(store,Directory)14 CL_CLASS_DEF(store,Directory)
15 CL_CLASS_DEF(store,IndexInput)
16 CL_CLASS_DEF(store,IndexOutput)
17
18 CL_NS_DEF(index)
19
20 class SegmentInfo :LUCENE_BASE{
21 public:
22
23 LUCENE_STATIC_CONSTANT(int32_t, NO = -1); // e.g. no norms; no deletes;
24 LUCENE_STATIC_CONSTANT(int32_t, YES = 1); // e.g. have norms; have deletes;
25 LUCENE_STATIC_CONSTANT(int32_t, CHECK_DIR = 0); // e.g. must check dir to see if there are norms/deletions
26 LUCENE_STATIC_CONSTANT(int32_t, WITHOUT_GEN = 0); // a file name that has no GEN in it.
27
28 std::string name; // unique name in dir
29 int32_t docCount; // number of docs in seg
30 CL_NS(store)::Directory* dir; // where segment resides
31
32 private:
33 bool preLockless; // true if this is a segments file written before
34 // lock-less commits (2.1)
35
36 int64_t delGen; // current generation of del file; NO if there
37 // are no deletes; CHECK_DIR if it's a pre-2.1 segment
38 // (and we must check filesystem); YES or higher if
39 // there are deletes at generation N
40
41 CL_NS(util)::ValueArray<int64_t> normGen; // current generation of each field's norm file.
42 // If this array is null, for lockLess this means no
43 // separate norms. For preLockLess this means we must
44 // check filesystem. If this array is not null, its
45 // values mean: NO says this field has no separate
46 // norms; CHECK_DIR says it is a preLockLess segment and
47 // filesystem must be checked; >= YES says this field
48 // has separate norms with the specified generation
49
50 int8_t isCompoundFile; // NO if it is not; YES if it is; CHECK_DIR if it's
51 // pre-2.1 (ie, must check file system to see
52 // if <name>.cfs and <name>.nrm exist)
53
54 bool hasSingleNormFile; // true if this segment maintains norms in a single file;
55 // false otherwise
56 // this is currently false for segments populated by DocumentWriter
57 // and true for newly created merged segments (both
58 // compound and non compound).
59
60 private:
61
62 std::vector<std::string> _files; // cached list of files that this segment uses
63 // in the Directory
64
65 int64_t _sizeInBytes; // total byte size of all of our files (computed on demand)
66
67 int32_t docStoreOffset; // if this segment shares stored fields & vectors, this
68 // offset is where in that file this segment's docs begin
69 std::string docStoreSegment; // name used to derive fields/vectors file we share with
70 // other segments
71 // This string is being interned. There might be a way around this,
72 // and if found, this would greatly improve perfomance.
73
74 bool docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
75
76 /* Called whenever any change is made that affects which
77 * files this segment has. */
78 void clearFiles();
79
80 void addIfExists(std::vector<std::string>& files, const std::string& fileName);
81
82 public:
83 SegmentInfo(const char* _name, const int32_t _docCount, CL_NS(store)::Directory* _dir,
84 bool _isCompoundFile=SegmentInfo::CHECK_DIR,
85 bool _hasSingleNormFile=false,
86 int32_t _docStoreOffset = -1,
87 const char* _docStoreSegment = NULL,
88 bool _docStoreIsCompoundFile = false);
89
90 /**
91 * Construct a new SegmentInfo instance by reading a
92 * previously saved SegmentInfo from input.
93 *
94 * @param dir directory to load from
95 * @param format format of the segments info file
96 * @param input input handle to read segment info from
97 */
98 SegmentInfo(CL_NS(store)::Directory* dir, int32_t format, CL_NS(store)::IndexInput* input);
99
100 ~SegmentInfo();
101
102 void setNumFields(const int32_t numFields);
103 int64_t sizeInBytes();
104 bool hasDeletions() const;
105
106 void advanceDelGen();
107 void clearDelGen();
108
109 SegmentInfo* clone ();
110
111 std::string getDelFileName() const;
112
113 /**
114 * Returns true if this field for this segment has saved a separate norms file (_<segment>_N.sX).
115 *
116 * @param fieldNumber the field index to check
117 */
118 bool hasSeparateNorms(const int32_t fieldNumber) const;
119
120 /**
121 * Returns true if any fields in this segment have separate norms.
122 */
123 bool hasSeparateNorms() const;
124
125 /**
126 * Get the file name for the norms file for this field.
127 *
128 * @param number field index
129 */
130 std::string getNormFileName(const int32_t number) const;
131
132 /**
133 * Increment the generation count for the norms file for
134 * this field.
135 *
136 * @param fieldIndex field whose norm file will be rewritten
137 */
138 void advanceNormGen(const int32_t fieldIndex);
139
140 /**
141 * Mark whether this segment is stored as a compound file.
142 *
143 * @param isCompoundFile true if this is a compound file;
144 * else, false
145 */
146 void setUseCompoundFile(const bool isCompoundFile);
147
148 /**
149 * Returns true if this segment is stored as a compound
150 * file; else, false.
151 */
152 bool getUseCompoundFile() const;
153
154 /*
155 * Return all files referenced by this SegmentInfo. The
156 * returns List is a locally cached List so you should not
157 * modify it.
158 */
159 const std::vector<std::string>& files();
160
161 /**
162 * Copy everything from src SegmentInfo into our instance.
163 */
164 void reset(const SegmentInfo* src);
165
166 /**
167 * Save this segment's info.
168 */
169 void write(CL_NS(store)::IndexOutput* output);
170
171 int32_t getDocStoreOffset() const;
172
173 bool getDocStoreIsCompoundFile() const;
174
175 void setDocStoreIsCompoundFile(const bool v);
176
177 /**
178 * Returns a reference to docStoreSegment
179 */
180 const std::string& getDocStoreSegment() const;
181
182 void setDocStoreOffset(const int32_t offset);
183
184 /** We consider another SegmentInfo instance equal if it
185 * has the same dir and same name. */
186 bool equals(const SegmentInfo* obj);
187
188 ///Gets the Directory where the segment resides
189 CL_NS(store)::Directory* getDir() const{ return dir; } //todo: since dir is public, consider removing this function
190
191 friend class SegmentReader;
192
193 /** Used for debugging */
194 std::string segString(CL_NS(store)::Directory* dir);
195 };
196
197 typedef CL_NS(util)::CLVector<SegmentInfo*,CL_NS(util)::Deletor::Object<SegmentInfo> > segmentInfosType;
//SegmentInfos manages a list of SegmentInfo instances.
//Each SegmentInfo contains information about a segment in a directory.
//
//The active segments in the index are stored in the segment info file.
//An index only has a single file in this format, and it is named "segments".
//This lists each segment by name, and also contains the size of each segment.
//The format of the file segments is defined as follows
//(X^SegCount means X repeated SegCount times):
//
//Segments --> SegCount, <SegName, SegSize>^SegCount
//
//SegCount, SegSize --> UInt32
//
//SegName --> String
//
//SegName is the name of the segment, and is used as the file name prefix
//for all of the files that compose the segment's index.
//
//SegSize is the number of documents contained in the segment index.
//
//Note:
//At http://jakarta.apache.org/lucene/docs/fileformats.html the definition
//of all file formats can be found. Note that java lucene currently
//defines Segments as follows:
//
//Segments --> Format, Version, SegCount, <SegName, SegSize>^SegCount
//
//Format, SegCount, SegSize --> UInt32
//
//Format and Version have not been implemented yet
228
229 class IndexReader;
230
231 class SegmentInfos: LUCENE_BASE {
232 public:
233 DEFINE_MUTEX(THIS_LOCK)
234
235 /** The file format version, a negative number. */
236 /* Works since counter, the old 1st entry, is always >= 0 */
237 LUCENE_STATIC_CONSTANT(int32_t,FORMAT=-1);
238
239 /** This format adds details used for lockless commits. It differs
240 * slightly from the previous format in that file names
241 * are never re-used (write once). Instead, each file is
242 * written to the next generation. For example,
243 * segments_1, segments_2, etc. This allows us to not use
244 * a commit lock. See <a
245 * href="http://lucene.apache.org/java/docs/fileformats.html">file
246 * formats</a> for details.
247 */
248 LUCENE_STATIC_CONSTANT(int32_t,FORMAT_LOCKLESS=-2);
249
250 /** This format adds a "hasSingleNormFile" flag into each segment info.
251 * See <a href="http://issues.apache.org/jira/browse/LUCENE-756">LUCENE-756</a>
252 * for details.
253 */
254 LUCENE_STATIC_CONSTANT(int32_t,FORMAT_SINGLE_NORM_FILE=-3);
255
256 /** This format allows multiple segments to share a single
257 * vectors and stored fields file. */
258 LUCENE_STATIC_CONSTANT(int32_t,FORMAT_SHARED_DOC_STORE=-4);
259
260 private:
261 /* This must always point to the most recent file format. */
262 LUCENE_STATIC_CONSTANT(int32_t,CURRENT_FORMAT=FORMAT_SHARED_DOC_STORE);
263
264 public:
265 int32_t counter; // used to name new segments
266
267 /**
268 * counts how often the index has been changed by adding or deleting docs.
269 * starting with the current time in milliseconds forces to create unique version numbers.
270 */
271 int64_t version;
272
273 private:
274 int64_t generation; // generation of the "segments_N" for the next commit
275 int64_t lastGeneration; // generation of the "segments_N" file we last successfully read
276 // or wrote; this is normally the same as generation except if
277 // there was an IOException that had interrupted a commit
278
279 /**
280 * If non-null, information about loading segments_N files
281 * will be printed here. @see #setInfoStream.
282 */
283 static std::ostream* infoStream;
284
285 LUCENE_STATIC_CONSTANT(int32_t,defaultGenFileRetryCount=10);
286 LUCENE_STATIC_CONSTANT(int32_t,defaultGenFileRetryPauseMsec=50);
287 LUCENE_STATIC_CONSTANT(int32_t,defaultGenLookaheadCount=10);
288
289 segmentInfosType infos;
290
291 friend class IndexWriter; //allow IndexWriter to use counter
292
293 static void message(const char* _message, ...);
294
295 public:
296 SegmentInfos(bool deleteMembers=true, int32_t reserveCount=0);
297 ~SegmentInfos();
298
299 //Returns a reference to the i-th SegmentInfo in the list.
300 SegmentInfo* info(int32_t i) const;
301
302 /**
303 * Get the generation (N) of the current segments_N file
304 * from a list of files.
305 *
306 * @param files -- array of file names to check
307 */
308 static int64_t getCurrentSegmentGeneration( std::vector<std::string>& files );
309
310 /**
311 * Get the generation (N) of the current segments_N file
312 * in the directory.
313 *
314 * @param directory -- directory to search for the latest segments_N file
315 */
316 static int64_t getCurrentSegmentGeneration( const CL_NS(store)::Directory* directory );
317
318 /**
319 * Get the filename of the current segments_N file
320 * from a list of files.
321 *
322 * @param files -- array of file names to check
323 */
324 static std::string getCurrentSegmentFileName( std::vector<std::string>& files );
325
326 /**
327 * Get the filename of the current segments_N file
328 * in the directory.
329 *
330 * @param directory -- directory to search for the latest segments_N file
331 */
332 static std::string getCurrentSegmentFileName( CL_NS(store)::Directory* directory );
333
334 /**
335 * Get the segments_N filename in use by this segment infos.
336 */
337 std::string getCurrentSegmentFileName();
338
339 /**
340 * Parse the generation off the segments file name and
341 * return it.
342 */
343 static int64_t generationFromSegmentsFileName( const char* fileName );
344
345 /**
346 * Get the next segments_N filename that will be written.
347 */
348 std::string getNextSegmentFileName();
349
350 /* public vector-like operations */
351 //delete and clears objects 'from' from to 'to'
352 void clearto(size_t to, size_t end);
353 //count of segment infos
354 int32_t size() const;
355 /** add a segment info
356 * @param pos position to add the info at. -1 for last position
357 */
358 void add(SegmentInfo* info, int32_t pos=-1);
359 SegmentInfo* elementAt(int32_t pos);
360 void setElementAt(SegmentInfo* si, int32_t pos);
361 void clear();
362
363 void insert(SegmentInfos* infos, bool takeMemory);
364 void insert(SegmentInfo* info);
365 int32_t indexOf(const SegmentInfo* info) const;
366 void range(size_t from, size_t to, SegmentInfos& ret) const;
367 void remove(size_t index, bool dontDelete=false);
368
369 /**
370 * Read a particular segmentFileName. Note that this may
371 * throw an IOException if a commit is in process.
372 *
373 * @param directory -- directory containing the segments file
374 * @param segmentFileName -- segment file to load
375 * @throws CorruptIndexException if the index is corrupt
376 * @throws IOException if there is a low-level IO error
377 */
378 void read(CL_NS(store)::Directory* directory, const char* segmentFileName);
379
380 /**
381 * This version of read uses the retry logic (for lock-less
382 * commits) to find the right segments file to load.
383 * @throws CorruptIndexException if the index is corrupt
384 * @throws IOException if there is a low-level IO error
385 */
386 void read(CL_NS(store)::Directory* directory);
387
388 //Writes a new segments file based upon the SegmentInfo instances it manages
389 //note: still does not support lock-less writes (still pre-2.1 format)
390 void write(CL_NS(store)::Directory* directory);
391
392 /**
393 * Returns a copy of this instance, also copying each
394 * SegmentInfo.
395 */
396 SegmentInfos* clone() const;
397
398 /**
399 * version number when this SegmentInfos was generated.
400 */
401 int64_t getVersion() const;
402 int64_t getGeneration() const;
403 int64_t getLastGeneration() const;
404
405 /**
406 * Current version number from segments file.
407 * @throws CorruptIndexException if the index is corrupt
408 * @throws IOException if there is a low-level IO error
409 */
410 static int64_t readCurrentVersion(CL_NS(store)::Directory* directory);
411
412
413 /** If non-null, information about retries when loading
414 * the segments file will be printed to this.
415 */
416 static void setInfoStream(std::ostream* infoStream);
417
418 /**
419 * @see #setInfoStream
420 */
421 static std::ostream* getInfoStream();
422
423 /**
424 * Advanced: set how many times to try loading the
425 * segments.gen file contents to determine current segment
426 * generation. This file is only referenced when the
427 * primary method (listing the directory) fails.
428 */
429 //static void setDefaultGenFileRetryCount(const int32_t count);
430 /**
431 * @see #setDefaultGenFileRetryCount
432 */
433 static int32_t getDefaultGenFileRetryCount();
434
435 /**
436 * Advanced: set how many milliseconds to pause in between
437 * attempts to load the segments.gen file.
438 */
439 //static void setDefaultGenFileRetryPauseMsec(const int32_t msec);
440 /**
441 * @see #setDefaultGenFileRetryPauseMsec
442 */
443 static int32_t getDefaultGenFileRetryPauseMsec();
444
445 /**
446 * Advanced: set how many times to try incrementing the
447 * gen when loading the segments file. This only runs if
448 * the primary (listing directory) and secondary (opening
449 * segments.gen file) methods fail to find the segments
450 * file.
451 */
452 //static void setDefaultGenLookaheadCount(const int32_t count);
453 /**
454 * @see #setDefaultGenLookaheadCount
455 */
456 static int32_t getDefaultGenLookahedCount();
457
458 class _FindSegmentsFile: LUCENE_BASE{
459 protected:
460 const char* fileDirectory;
461 CL_NS(store)::Directory* directory;
462
463 void doRun();
464 virtual bool tryDoBody(const char* segmentFileName, CLuceneError& ret_err) = 0;
465 };
466
467 /**
468 * Utility class for executing code that needs to do
469 * something with the current segments file. This is
470 * necessary with lock-less commits because from the time
471 * you locate the current segments file name, until you
472 * actually open it, read its contents, or check modified
473 * time, etc., it could have been deleted due to a writer
474 * commit finishing.
475 */
476 template<typename RET>
477 class FindSegmentsFile: public _FindSegmentsFile{
478 protected:
479 virtual RET doBody(const char* segmentFileName) = 0;
480 RET result;
481
482 //catch only IO errors, return true on success...
tryDoBody(const char * segmentFileName,CLuceneError & ret_err)483 bool tryDoBody(const char* segmentFileName, CLuceneError& ret_err){
484 try{
485 result = doBody(segmentFileName);
486 return true;
487 } catch (CLuceneError& err) {
488 result = 0;
489 ret_err.set(err.number(),err.what());
490 }
491 return false;
492 }
493 public:
FindSegmentsFile(CL_NS (store)::Directory * dir)494 FindSegmentsFile( CL_NS(store)::Directory* dir ){
495 this->directory = dir;
496 this->fileDirectory = NULL;
497 this->result = 0;
498 }
FindSegmentsFile(const char * dir)499 FindSegmentsFile( const char* dir ){
500 this->directory = NULL;
501 this->fileDirectory = dir;
502 this->result = 0;
503 }
~FindSegmentsFile()504 ~FindSegmentsFile(){
505 }
506
run()507 RET run(){
508 doRun();
509 return result;
510 };
511 };
512 //friend class SegmentInfos::FindSegmentsFile;
513
514 class FindSegmentsVersion: public FindSegmentsFile<int64_t> {
515 public:
516 FindSegmentsVersion( CL_NS(store)::Directory* dir );
517 FindSegmentsVersion( const char* dir );
518 int64_t doBody( const char* segmentFileName );
519 };
520 friend class SegmentInfos::FindSegmentsVersion;
521
522 class FindSegmentsRead: public FindSegmentsFile<bool> {
523 SegmentInfos* _this;
524 public:
525 FindSegmentsRead( CL_NS(store)::Directory* dir, SegmentInfos* _this );
526 FindSegmentsRead( const char* dir, SegmentInfos* _this );
527 bool doBody( const char* segmentFileName );
528 };
529 friend class SegmentInfos::FindSegmentsRead;
530 };
531 CL_NS_END
532 #endif
533