1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #ifndef _lucene_index_SegmentInfos_
8 #define _lucene_index_SegmentInfos_
9 
10 
11 //#include "IndexReader.h"
12 #include "CLucene/util/Misc.h"
13 #include "_IndexFileNames.h"
CL_CLASS_DEF(store,Directory)14 CL_CLASS_DEF(store,Directory)
15 CL_CLASS_DEF(store,IndexInput)
16 CL_CLASS_DEF(store,IndexOutput)
17 
18 CL_NS_DEF(index)
19 
/**
 * Metadata record for one index segment: its name, document count and
 * directory, together with the generation bookkeeping for its deletion and
 * norm files, its compound-file status and its shared doc-store settings.
 *
 * Apart from getDir(), every member function is only declared in this class
 * body; the definitions are not part of this block, so notes about their
 * behaviour below are marked as assumptions where they go beyond the
 * declarations.
 */
20 	class SegmentInfo :LUCENE_BASE{
21 	public:
22 
		// NO / YES / CHECK_DIR are the marker values that the delGen, normGen and
		// isCompoundFile member comments below refer to.
23 		LUCENE_STATIC_CONSTANT(int32_t, NO = -1);			// e.g. no norms; no deletes;
24 		LUCENE_STATIC_CONSTANT(int32_t, YES = 1);			// e.g. have norms; have deletes;
25 		LUCENE_STATIC_CONSTANT(int32_t, CHECK_DIR = 0);		// e.g. must check dir to see if there are norms/deletions
26 		LUCENE_STATIC_CONSTANT(int32_t, WITHOUT_GEN = 0);	// a file name that has no GEN in it.
27 
28     std::string name;									// unique name in dir
29 		int32_t docCount;							// number of docs in seg
30 		CL_NS(store)::Directory* dir;				// where segment resides
31 
32 	private:
33 		bool preLockless;						  // true if this is a segments file written before
34                                                   // lock-less commits (2.1)
35 
36 		int64_t delGen;                            // current generation of del file; NO if there
37                                                   // are no deletes; CHECK_DIR if it's a pre-2.1 segment
38                                                   // (and we must check filesystem); YES or higher if
39                                                   // there are deletes at generation N
40 
41     CL_NS(util)::ValueArray<int64_t> normGen;     // current generation of each field's norm file.
42                                                   // If this array is null, for lockLess this means no
43                                                   // separate norms.  For preLockLess this means we must
44                                                   // check filesystem. If this array is not null, its
45                                                   // values mean: NO says this field has no separate
46                                                   // norms; CHECK_DIR says it is a preLockLess segment and
47                                                   // filesystem must be checked; >= YES says this field
48                                                   // has separate norms with the specified generation
49 
50 		int8_t isCompoundFile;					  // NO if it is not; YES if it is; CHECK_DIR if it's
51                                                   // pre-2.1 (ie, must check file system to see
52                                                   // if <name>.cfs and <name>.nrm exist)
53 
54 		bool hasSingleNormFile;					  // true if this segment maintains norms in a single file;
55                                                   // false otherwise
56                                                   // this is currently false for segments populated by DocumentWriter
57                                                   // and true for newly created merged segments (both
58                                                   // compound and non compound).
59 
	// (access is already private at this point, so the specifier below does not change anything)
60 	private:
61 
62     std::vector<std::string> _files;                               // cached list of files that this segment uses
63                                                   // in the Directory
64 
65 		int64_t _sizeInBytes;					  // total byte size of all of our files (computed on demand)
66 
67 		int32_t docStoreOffset;					  // if this segment shares stored fields & vectors, this
68                                                   // offset is where in that file this segment's docs begin
69     std::string docStoreSegment;					  // name used to derive fields/vectors file we share with
70                                                   // other segments
71 												  // This string is being interned. There might be a way around this,
72 												  // and if found, this would greatly improve performance.
73 
74 		bool docStoreIsCompoundFile;			  // whether doc store files are stored in compound file (*.cfx)
75 
76 		/* Called whenever any change is made that affects which
77 		* files this segment has. */
78 		void clearFiles();
79 
		// presumably appends fileName to files only when that file exists -- confirm in the definition
80     void addIfExists(std::vector<std::string>& files, const std::string& fileName);
81 
82 	public:
		/**
		* Construct a SegmentInfo from caller-supplied values.  The parameters
		* presumably initialise the like-named members (name, docCount, dir, ...);
		* the definition is not in this block -- confirm there.
		*
		* NOTE(review): _isCompoundFile is declared bool, so its default
		* SegmentInfo::CHECK_DIR (0) converts to false, and the three-valued
		* NO/YES/CHECK_DIR state described on the isCompoundFile member cannot
		* be passed through this parameter as written.
		*/
83 		SegmentInfo(const char* _name, const int32_t _docCount, CL_NS(store)::Directory* _dir,
84 			bool _isCompoundFile=SegmentInfo::CHECK_DIR,
85       bool _hasSingleNormFile=false,
86 			int32_t _docStoreOffset = -1,
87       const char* _docStoreSegment = NULL,
88       bool _docStoreIsCompoundFile = false);
89 
90 		/**
91 		* Construct a new SegmentInfo instance by reading a
92 		* previously saved SegmentInfo from input.
93 		*
94 		* @param dir directory to load from
95 		* @param format format of the segments info file
96 		* @param input input handle to read segment info from
97 		*/
98 		SegmentInfo(CL_NS(store)::Directory* dir, int32_t format, CL_NS(store)::IndexInput* input);
99 
100 		~SegmentInfo();
101 
		// presumably sizes the per-field normGen array for numFields fields -- confirm in the definition
102 		void setNumFields(const int32_t numFields);
		// not const: the _sizeInBytes member comment says the size is computed on demand
103     int64_t sizeInBytes();
104 		bool hasDeletions() const;
105 
		// presumably step / reset the delGen deletion generation -- confirm in the definitions
106 		void advanceDelGen();
107 		void clearDelGen();
108 
		// NOTE(review): returns a raw SegmentInfo*; ownership of the result is not stated here -- confirm
109 		SegmentInfo* clone ();
110 
111     std::string getDelFileName() const;
112 
113 		/**
114 		* Returns true if this field for this segment has saved a separate norms file (_<segment>_N.sX).
115 		*
116 		* @param fieldNumber the field index to check
117 		*/
118 		bool hasSeparateNorms(const int32_t fieldNumber) const;
119 
120 		/**
121 		* Returns true if any fields in this segment have separate norms.
122 		*/
123 		bool hasSeparateNorms() const;
124 
125 		/**
126 		* Get the file name for the norms file for this field.
127 		*
128 		* @param number field index
129 		*/
130     std::string getNormFileName(const int32_t number) const;
131 
132 		/**
133 		* Increment the generation count for the norms file for
134 		* this field.
135 		*
136 		* @param fieldIndex field whose norm file will be rewritten
137 		*/
138 		void advanceNormGen(const int32_t fieldIndex);
139 
140 		/**
141 		* Mark whether this segment is stored as a compound file.
142 		*
143 		* @param isCompoundFile true if this is a compound file;
144 		* else, false
145 		*/
146 		void setUseCompoundFile(const bool isCompoundFile);
147 
148 		/**
149 		* Returns true if this segment is stored as a compound
150 		* file; else, false.
151 		*/
152 		bool getUseCompoundFile() const;
153 
154     /*
155     * Return all files referenced by this SegmentInfo.  The
156     * returned list is a locally cached list (returned by const
157     * reference), so you should not modify it.
158     */
159     const std::vector<std::string>& files();
160 
161 		/**
162 		* Copy everything from src SegmentInfo into our instance.
163 		*/
164 		void reset(const SegmentInfo* src);
165 
166 		/**
167 		* Save this segment's info.
168 		*/
169 		void write(CL_NS(store)::IndexOutput* output);
170 
		// Accessors for the shared doc-store settings (docStoreOffset,
		// docStoreIsCompoundFile, docStoreSegment) declared above.
171 		int32_t getDocStoreOffset() const;
172 
173 		bool getDocStoreIsCompoundFile() const;
174 
175 		void setDocStoreIsCompoundFile(const bool v);
176 
177 		/**
178 		* Returns a reference to docStoreSegment
179 		*/
180     const std::string& getDocStoreSegment() const;
181 
182 		void setDocStoreOffset(const int32_t offset);
183 
184 		/** We consider another SegmentInfo instance equal if it
185 		*  has the same dir and same name. */
186 		bool equals(const SegmentInfo* obj);
187 
188 		///Gets the Directory where the segment resides
189 		CL_NS(store)::Directory* getDir() const{ return dir; } //todo: since dir is public, consider removing this function
190 
	    // gives SegmentReader access to the private members of this class
191 	    friend class SegmentReader;
192 
193 	    /** Used for debugging */
194 	    std::string segString(CL_NS(store)::Directory* dir);
195 	};
196 
// List type holding SegmentInfo pointers; its second template argument names
// Deletor::Object<SegmentInfo>.  SegmentInfos keeps its segments in a member
// of this type (infos).
197 	typedef CL_NS(util)::CLVector<SegmentInfo*,CL_NS(util)::Deletor::Object<SegmentInfo> > segmentInfosType;
198   //SegmentInfos manages a list of SegmentInfo instances
199   //Each SegmentInfo contains information about a segment in a directory.
200   //
201   //The active segments in the index are stored in the segment info file.
202   //An index only has a single file in this format, and it is named "segments".
203   //This lists each segment by name, and also contains the size of each segment.
204   //The format of the file segments is defined as follows:
205   //
206   //Segments --> SegCount, <SegName, SegSize>^SegCount
207   //(the ^SegCount superscript means the <SegName, SegSize> pair is repeated SegCount times)
208   //
209   //SegCount, SegSize --> UInt32
210   //
211   //SegName --> String
212   //
213   //SegName is the name of the segment, and is used as the file name prefix
214   //for all of the files that compose the segment's index.
215   //
216   //SegSize is the number of documents contained in the segment index.
217   //
218   //Note:
219   //At http://jakarta.apache.org/lucene/docs/fileformats.html the definition
220   //of all file formats can be found. Note that java lucene currently
221   //defines Segments as follows:
222   //
223   //Segments --> Format, Version, SegCount, <SegName, SegSize>^SegCount
224   //
225   //Format, SegCount, SegSize --> UInt32
226   //
227   //Format and Version have not been implemented yet
  //NOTE(review): the SegmentInfos class that follows declares FORMAT* constants
  //and a version member, so the statement above may be outdated -- confirm.
228 
	// Forward declaration only (the IndexReader.h include near the top of the file is commented out).
229 	class IndexReader;
230 
/**
 * Holds the list of SegmentInfo objects of an index (member infos) plus the
 * state tied to the segments file: the counter used to name new segments, the
 * index version, and the generation / lastGeneration of the "segments_N"
 * file.  The nested FindSegmentsFile helper classes declared at the end wrap
 * work that has to be done against the current segments file.
 *
 * Only the FindSegmentsFile<RET> template has inline bodies here; all other
 * member functions are merely declared in this block.
 */
231 	class SegmentInfos: LUCENE_BASE {
232 	public:
	  // project macro; presumably declares a mutex member named THIS_LOCK -- confirm against the macro definition
233 	  DEFINE_MUTEX(THIS_LOCK)
234 
235 		/** The file format version, a negative number. */
236 		/* Works since counter, the old 1st entry, is always >= 0 */
237 		LUCENE_STATIC_CONSTANT(int32_t,FORMAT=-1);
238 
239 		/** This format adds details used for lockless commits.  It differs
240 		* slightly from the previous format in that file names
241 		* are never re-used (write once).  Instead, each file is
242 		* written to the next generation.  For example,
243 		* segments_1, segments_2, etc.  This allows us to not use
244 		* a commit lock.  See <a
245 		* href="http://lucene.apache.org/java/docs/fileformats.html">file
246 		* formats</a> for details.
247 		*/
248 		LUCENE_STATIC_CONSTANT(int32_t,FORMAT_LOCKLESS=-2);
249 
250 		/** This format adds a "hasSingleNormFile" flag into each segment info.
251 		* See <a href="http://issues.apache.org/jira/browse/LUCENE-756">LUCENE-756</a>
252 		* for details.
253 		*/
254 		LUCENE_STATIC_CONSTANT(int32_t,FORMAT_SINGLE_NORM_FILE=-3);
255 
256 		/** This format allows multiple segments to share a single
257 		* vectors and stored fields file. */
258 		LUCENE_STATIC_CONSTANT(int32_t,FORMAT_SHARED_DOC_STORE=-4);
259 
260 	private:
261 		/* This must always point to the most recent file format. */
262 		LUCENE_STATIC_CONSTANT(int32_t,CURRENT_FORMAT=FORMAT_SHARED_DOC_STORE);
263 
264 	public:
265 		int32_t counter;  // used to name new segments
266 
267 		/**
268 		* counts how often the index has been changed by adding or deleting docs.
269 		* starting with the current time in milliseconds forces to create unique version numbers.
270 		*/
271 		int64_t version;
272 
273 	private:
274 		int64_t generation;					// generation of the "segments_N" for the next commit
275 		int64_t lastGeneration;				// generation of the "segments_N" file we last successfully read
276 											// or wrote; this is normally the same as generation except if
277 											// there was an IOException that had interrupted a commit
278 
279 		/**
280 		* If non-null, information about loading segments_N files
281 		* will be printed here.  @see #setInfoStream.
282 		*/
283 		static std::ostream* infoStream;
284 
		// Defaults reported by the getDefaultGen* accessors declared further down
		// (presumably -- the accessor definitions are not in this block).
285 		LUCENE_STATIC_CONSTANT(int32_t,defaultGenFileRetryCount=10);
286 		LUCENE_STATIC_CONSTANT(int32_t,defaultGenFileRetryPauseMsec=50);
287 		LUCENE_STATIC_CONSTANT(int32_t,defaultGenLookaheadCount=10);
288 
		// the managed SegmentInfo pointers
289 		segmentInfosType infos;
290 
291 		friend class IndexWriter; //allow IndexWriter to use counter
292 
    // variadic helper taking a C string followed by extra arguments;
    // presumably writes to infoStream -- confirm in the definition
293     static void message(const char* _message, ...);
294 
295   public:
      // presumably deleteMembers decides whether the contained SegmentInfo objects
      // are deleted together with the list and reserveCount pre-sizes it -- confirm
296       SegmentInfos(bool deleteMembers=true, int32_t reserveCount=0);
297       ~SegmentInfos();
298 
299 		//Returns a pointer to the i-th SegmentInfo in the list.
300 		SegmentInfo* info(int32_t i) const;
301 
302 		/**
303 		* Get the generation (N) of the current segments_N file
304 		* from a list of files.
305 		*
306 		* @param files -- array of file names to check
307 		*/
308     static int64_t getCurrentSegmentGeneration( std::vector<std::string>& files );
309 
310 		/**
311 		* Get the generation (N) of the current segments_N file
312 		* in the directory.
313 		*
314 		* @param directory -- directory to search for the latest segments_N file
315 		*/
316 		static int64_t getCurrentSegmentGeneration( const CL_NS(store)::Directory* directory );
317 
318 		/**
319 		* Get the filename of the current segments_N file
320 		* from a list of files.
321 		*
322 		* @param files -- array of file names to check
323 		*/
324     static std::string getCurrentSegmentFileName( std::vector<std::string>& files );
325 
326 		/**
327 		* Get the filename of the current segments_N file
328 		* in the directory.
329 		*
330 		* @param directory -- directory to search for the latest segments_N file
331 		*/
332 		static std::string getCurrentSegmentFileName( CL_NS(store)::Directory* directory );
333 
334 		/**
335 		* Get the segments_N filename in use by this segment infos.
336 		*/
337 		std::string getCurrentSegmentFileName();
338 
339 		/**
340 		* Parse the generation off the segments file name and
341 		* return it.
342 		*/
343 		static int64_t generationFromSegmentsFileName( const char* fileName );
344 
345 		/**
346 		* Get the next segments_N filename that will be written.
347 		*/
348 		std::string getNextSegmentFileName();
349 
350 		/* public vector-like operations */
351 		//deletes and clears the objects in the index range given by the two arguments
		//NOTE(review): the parameters are named (to, end) although this comment used to
		//speak of 'from' and 'to' -- confirm against the definition which bound each one is
352 		void clearto(size_t to, size_t end);
353 		//count of segment infos
354 		int32_t size() const;
355 		/** add a segment info
356     * @param pos position to add the info at. -1 for last position
357     */
358 		void add(SegmentInfo* info, int32_t pos=-1);
359 		SegmentInfo* elementAt(int32_t pos);
360 		void setElementAt(SegmentInfo* si, int32_t pos);
361 		void clear();
362 
		//NOTE(review): takeMemory presumably says whether ownership of the inserted
		//SegmentInfo objects moves to this list -- confirm in the definition
363 		void insert(SegmentInfos* infos, bool takeMemory);
364 		void insert(SegmentInfo* info);
365 		int32_t indexOf(const SegmentInfo* info) const;
366 		void range(size_t from, size_t to, SegmentInfos& ret) const;
    //dontDelete presumably suppresses deletion of the removed SegmentInfo -- confirm in the definition
367     void remove(size_t index, bool dontDelete=false);
368 
369 		/**
370 		* Read a particular segmentFileName.  Note that this may
371 		* throw an IOException if a commit is in process.
372 		*
373 		* @param directory -- directory containing the segments file
374 		* @param segmentFileName -- segment file to load
375 		* @throws CorruptIndexException if the index is corrupt
376 		* @throws IOException if there is a low-level IO error
377 		*/
378 		void read(CL_NS(store)::Directory* directory, const char* segmentFileName);
379 
380 		/**
381 		* This version of read uses the retry logic (for lock-less
382 		* commits) to find the right segments file to load.
383 		* @throws CorruptIndexException if the index is corrupt
384 		* @throws IOException if there is a low-level IO error
385 		*/
386 		void read(CL_NS(store)::Directory* directory);
387 
388 		//Writes a new segments file based upon the SegmentInfo instances it manages
389 		//note: still does not support lock-less writes (still pre-2.1 format)
390         void write(CL_NS(store)::Directory* directory);
391 
392 		/**
393 		* Returns a copy of this instance, also copying each
394 		* SegmentInfo.
395 		*/
396 		SegmentInfos* clone() const;
397 
398 		/**
399 		* version number when this SegmentInfos was generated.
400 		*/
401 		int64_t getVersion() const;
402 		int64_t getGeneration() const;
403 		int64_t getLastGeneration() const;
404 
405 		/**
406 		* Current version number from segments file.
407 		* @throws CorruptIndexException if the index is corrupt
408 		* @throws IOException if there is a low-level IO error
409 		*/
410 		static int64_t readCurrentVersion(CL_NS(store)::Directory* directory);
411 
412 
413     /** If non-null, information about retries when loading
414     * the segments file will be printed to this.
415     */
416     static void setInfoStream(std::ostream* infoStream);
417 
418     /**
419     * @see #setInfoStream
420     */
421     static std::ostream* getInfoStream();
422 
423 		/**
424 		* Advanced: set how many times to try loading the
425 		* segments.gen file contents to determine current segment
426 		* generation.  This file is only referenced when the
427 		* primary method (listing the directory) fails.
428 		*/
429 		//static void setDefaultGenFileRetryCount(const int32_t count);
		// (the setter above is commented out; only the getter below is declared)
430 		/**
431 		* @see #setDefaultGenFileRetryCount
432 		*/
433 		static int32_t getDefaultGenFileRetryCount();
434 
435 		/**
436 		* Advanced: set how many milliseconds to pause in between
437 		* attempts to load the segments.gen file.
438 		*/
439 		//static void setDefaultGenFileRetryPauseMsec(const int32_t msec);
		// (the setter above is commented out; only the getter below is declared)
440 		/**
441 		* @see #setDefaultGenFileRetryPauseMsec
442 		*/
443 		static int32_t getDefaultGenFileRetryPauseMsec();
444 
445 		/**
446 		* Advanced: set how many times to try incrementing the
447 		* gen when loading the segments file.  This only runs if
448 		* the primary (listing directory) and secondary (opening
449 		* segments.gen file) methods fail to find the segments
450 		* file.
451 		*/
452 		//static void setDefaultGenLookaheadCount(const int32_t count);
		// (the setter above is commented out; only the getter below is declared)
453 		/**
454 		* @see #setDefaultGenLookaheadCount
455 		*/
		// NOTE(review): the name below is spelled "Lookahed" (sic); it is part of the declared interface
456 		static int32_t getDefaultGenLookahedCount();
457 
    // Non-template base shared by all FindSegmentsFile<RET> instantiations.
    // It stores where to look -- a directory path string (fileDirectory) or a
    // Directory object (directory); the FindSegmentsFile constructors below set
    // one of them and NULL the other -- and declares doRun() (defined
    // elsewhere) plus the pure virtual tryDoBody() hook.
458     class _FindSegmentsFile: LUCENE_BASE{
459     protected:
460       const char* fileDirectory;
461       CL_NS(store)::Directory* directory;
462 
      // presumably drives tryDoBody() with the retry logic -- confirm in the definition
463       void doRun();
464       virtual bool tryDoBody(const char* segmentFileName, CLuceneError& ret_err) = 0;
465     };
466 
467 		/**
468 		* Utility class for executing code that needs to do
469 		* something with the current segments file.  This is
470 		* necessary with lock-less commits because from the time
471 		* you locate the current segments file name, until you
472 		* actually open it, read its contents, or check modified
473 		* time, etc., it could have been deleted due to a writer
474 		* commit finishing.
475 		*/
476     template<typename RET>
477 		class FindSegmentsFile: public _FindSegmentsFile{
478     protected:
      // work to perform on one candidate segments file; implemented by subclasses
479       virtual RET doBody(const char* segmentFileName) = 0;
      // value of the last successful doBody() call; 0 initially and after a failure
480       RET result;
481 
482       //Runs doBody() and stores its value in result; returns true on success.
      //Any CLuceneError thrown by doBody() is caught here: result is reset to 0,
      //the error's number and message are copied into ret_err, and false is returned.
      //NOTE(review): the handler catches every CLuceneError, not only IO errors
      //as this comment originally stated.
tryDoBody(const char * segmentFileName,CLuceneError & ret_err)483       bool tryDoBody(const char* segmentFileName, CLuceneError& ret_err){
484         try{
485           result = doBody(segmentFileName);
486           return true;
487         } catch (CLuceneError& err) {
488           result = 0;
489           ret_err.set(err.number(),err.what());
490         }
491         return false;
492       }
493     public:
        // search via a Directory object; the path-string form is left NULL
FindSegmentsFile(CL_NS (store)::Directory * dir)494     		FindSegmentsFile( CL_NS(store)::Directory* dir ){
495 	        this->directory = dir;
496           this->fileDirectory = NULL;
497           this->result = 0;
498         }
        // search via a directory path string; the Directory pointer is left NULL
FindSegmentsFile(const char * dir)499     		FindSegmentsFile( const char* dir ){
500 	        this->directory = NULL;
501           this->fileDirectory = dir;
502           this->result = 0;
503         }
~FindSegmentsFile()504         ~FindSegmentsFile(){
505         }
506 
        // calls doRun() and then returns whatever is stored in result
run()507         RET run(){
508           doRun();
509           return result;
510         };
511     	};
512     	//friend class SegmentInfos::FindSegmentsFile;
513 
    	// FindSegmentsFile instantiation whose doBody() yields an int64_t;
    	// presumably the version read from the segments file -- confirm in the definition
514     	class FindSegmentsVersion: public FindSegmentsFile<int64_t> {
515     	public:
516     		FindSegmentsVersion( CL_NS(store)::Directory* dir );
517     		FindSegmentsVersion( const char* dir );
518     		int64_t doBody( const char* segmentFileName );
519     	};
520     	friend class SegmentInfos::FindSegmentsVersion;
521 
		// FindSegmentsFile instantiation that carries a SegmentInfos pointer (_this);
		// presumably reads the located segments file into that object -- confirm in the definition
522 		class FindSegmentsRead: public FindSegmentsFile<bool> {
523       	  SegmentInfos* _this;
524     	public:
525 		  FindSegmentsRead( CL_NS(store)::Directory* dir, SegmentInfos* _this );
526     	  FindSegmentsRead( const char* dir, SegmentInfos* _this );
527     	  bool doBody( const char* segmentFileName );
528     	};
529     	friend class SegmentInfos::FindSegmentsRead;
530   };
531 CL_NS_END
532 #endif
533