1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8 
9 #include "_SegmentInfos.h"
10 #include "_IndexFileNames.h"
11 #include "_SegmentHeader.h"
12 #include "MultiReader.h"
13 #include <assert.h>
14 #include <iostream>
15 
16 #include "CLucene/store/Directory.h"
17 #include "CLucene/util/Misc.h"
18 
19 CL_NS_USE(store)
CL_NS_USE(util)20 CL_NS_USE(util)
21 
22 CL_NS_DEF(index)
23 
24 SegmentInfo::SegmentInfo(const char* _name, const int32_t _docCount, CL_NS(store)::Directory* _dir,
25 			bool _isCompoundFile, bool _hasSingleNormFile,
26 			int32_t _docStoreOffset, const char* _docStoreSegment, bool _docStoreIsCompoundFile)
27 			:
28 			docCount(_docCount),
29 			preLockless(false),
30 			delGen(SegmentInfo::NO),
31 			isCompoundFile(_isCompoundFile ? SegmentInfo::YES : SegmentInfo::NO),
32 			hasSingleNormFile(_hasSingleNormFile),
33 			_sizeInBytes(-1),
34 			docStoreOffset(_docStoreOffset),
35       docStoreSegment( _docStoreSegment == NULL ? "" : _docStoreSegment ),
36 			docStoreIsCompoundFile(_docStoreIsCompoundFile)
37 {
38 	CND_PRECONDITION(docStoreOffset == -1 || !docStoreSegment.empty(), "failed testing for (docStoreOffset == -1 || docStoreSegment != NULL)");
39 
40 	this->name = _name;
41 	this->dir = _dir;
42 }
43 
segString(Directory * dir)44 string SegmentInfo::segString(Directory* dir) {
45   string cfs;
46   try {
47     if (getUseCompoundFile())
48       cfs = "c";
49     else
50       cfs = "C";
51   } catch (CLuceneError& ioe) {
52     if ( ioe.number() != CL_ERR_IO ) throw ioe;
53     cfs = "?";
54   }
55 
56   string docStore;
57 
58   if (docStoreOffset != -1)
59     docStore = string("->") + docStoreSegment;
60   else
61     docStore = "";
62 
63   return string(name) + ":" +
64     cfs +
65     string(this->dir == dir ? "" : "x") +
66     Misc::toString(docCount) + docStore;
67 }
SegmentInfo(CL_NS (store)::Directory * _dir,int32_t format,CL_NS (store)::IndexInput * input)68    SegmentInfo::SegmentInfo(CL_NS(store)::Directory* _dir, int32_t format, CL_NS(store)::IndexInput* input):
69      _sizeInBytes(-1)
70    {
71 	   this->dir = _dir;
72 
73 	   {
74 		   char aname[CL_MAX_PATH];
75        input->readString(aname, CL_MAX_PATH);
76 		   this->name = aname;
77 	   }
78 
79 	   docCount = input->readInt();
80 	   if (format <= SegmentInfos::FORMAT_LOCKLESS) {
81 		   delGen = input->readLong();
82 		   if (format <= SegmentInfos::FORMAT_SHARED_DOC_STORE) {
83 			   docStoreOffset = input->readInt();
84 			   if (docStoreOffset != -1) {
85 				   char aname[CL_MAX_PATH];
86 			     input->readString(aname, CL_MAX_PATH);
87 				   docStoreSegment = aname;
88 				   docStoreIsCompoundFile = (1 == input->readByte());
89 			   } else {
90 				   docStoreSegment = name;
91 				   docStoreIsCompoundFile = false;
92 			   }
93 		   } else {
94 			   docStoreOffset = -1;
95 			   docStoreSegment = name;
96 			   docStoreIsCompoundFile = false;
97 		   }
98 		   if (format <= SegmentInfos::FORMAT_SINGLE_NORM_FILE) {
99 			   hasSingleNormFile = (1 == input->readByte());
100 		   } else {
101 			   hasSingleNormFile = false;
102 		   }
103 		   int32_t numNormGen = input->readInt();
104 		   normGen.deleteValues();
105 		   if (numNormGen == NO) {
106 			   // normGen is already NULL, we'll just set normGenLen to 0
107 		   } else {
108 			   normGen.values = _CL_NEWARRAY(int64_t, numNormGen);
109          normGen.length = numNormGen;
110 			   for(int32_t j=0;j<numNormGen;j++) {
111 				   normGen.values[j] = input->readLong();
112 			   }
113 		   }
114 		   isCompoundFile = input->readByte();
115 		   preLockless = (isCompoundFile == CHECK_DIR);
116 	   } else {
117 		   delGen = CHECK_DIR;
118 		   //normGen=NULL; normGenLen=0;
119 		   isCompoundFile = CHECK_DIR;
120 		   preLockless = true;
121 		   hasSingleNormFile = false;
122 		   docStoreOffset = -1;
123 		   docStoreIsCompoundFile = false;
124 	   }
125    }
126 
reset(const SegmentInfo * src)127    void SegmentInfo::reset(const SegmentInfo* src) {
128 	   clearFiles();
129 	   this->name = src->name;
130 	   docCount = src->docCount;
131 	   dir = src->dir;
132 	   preLockless = src->preLockless;
133 	   delGen = src->delGen;
134 	   docStoreOffset = src->docStoreOffset;
135 	   docStoreIsCompoundFile = src->docStoreIsCompoundFile;
136 	   if (src->normGen.values == NULL) {
137        this->normGen.deleteValues();
138      }else{
139 		   // optimized case to allocate new array only if current memory buffer is too small
140        if (this->normGen.length < src->normGen.length) {
141          normGen.resize(src->normGen.length);
142        }else{
143         this->normGen.length = src->normGen.length;
144        }
145        memcpy(this->normGen.values, src->normGen.values, sizeof(int64_t) * this->normGen.length);
146 	   }
147 	   isCompoundFile = src->isCompoundFile;
148 	   hasSingleNormFile = src->hasSingleNormFile;
149    }
150 
~SegmentInfo()151    SegmentInfo::~SegmentInfo(){
152      normGen.deleteValues();
153    }
154 
setNumFields(const int32_t numFields)155    void SegmentInfo::setNumFields(const int32_t numFields) {
156      if (normGen.values == NULL) {
157 		   // normGen is null if we loaded a pre-2.1 segment
158 		   // file, or, if this segments file hasn't had any
159 		   // norms set against it yet:
160            normGen.resize(numFields);
161 
162 		   if (preLockless) {
163 			   // Do nothing: thus leaving normGen[k]==CHECK_DIR (==0), so that later we know
164 			   // we have to check filesystem for norm files, because this is prelockless.
165 
166 		   } else {
167 			   // This is a FORMAT_LOCKLESS segment, which means
168 			   // there are no separate norms:
169 			   for(int32_t i=0;i<numFields;i++) {
170 				   normGen.values[i] = NO;
171 			   }
172 		   }
173 	   }
174    }
175    /** Returns total size in bytes of all of files used by
176    *  this segment. */
sizeInBytes()177   int64_t SegmentInfo::sizeInBytes(){
178     if (_sizeInBytes == -1) {
179       const vector<string>& __files = files();
180       size_t size = __files.size();
181       _sizeInBytes = 0;
182       for(size_t i=0;i<size;i++) {
183         const char* fileName = __files[i].c_str();
184         // We don't count bytes used by a shared doc store
185         // against this segment:
186         if (docStoreOffset == -1 || !IndexFileNames::isDocStoreFile(fileName))
187           _sizeInBytes += dir->fileLength(fileName);
188       }
189     }
190     return _sizeInBytes;
191   }
192 
addIfExists(std::vector<std::string> & files,const std::string & fileName)193   void SegmentInfo::addIfExists(std::vector<std::string>& files, const std::string& fileName){
194     if (dir->fileExists(fileName.c_str()))
195       files.push_back(fileName);
196   }
197 
files()198   const vector<string>& SegmentInfo::files(){
199     if (!_files.empty()) {
200       // Already cached:
201       return _files;
202     }
203 
204     bool useCompoundFile = getUseCompoundFile();
205 
206     if (useCompoundFile) {
207       _files.push_back( string(name) + "." + IndexFileNames::COMPOUND_FILE_EXTENSION);
208     } else {
209       ConstValueArray<const char*>& exts = IndexFileNames::NON_STORE_INDEX_EXTENSIONS();
210       for(size_t i=0;i<exts.length;i++){
211         addIfExists(_files, name + "." + exts[i]);
212       }
213     }
214 
215     if (docStoreOffset != -1) {
216       // We are sharing doc stores (stored fields, term
217       // vectors) with other segments
218       assert (!docStoreSegment.empty());
219       if (docStoreIsCompoundFile) {
220         _files.push_back(docStoreSegment + "." + IndexFileNames::COMPOUND_FILE_STORE_EXTENSION);
221       } else {
222         ConstValueArray<const char*>& exts = IndexFileNames::STORE_INDEX_EXTENSIONS();
223         for(size_t i=0;i<exts.length;i++)
224           addIfExists(_files, docStoreSegment + "." + exts[i]);
225       }
226     } else if (!useCompoundFile) {
227       // We are not sharing, and, these files were not
228       // included in the compound file
229       ConstValueArray<const char*>& exts = IndexFileNames::STORE_INDEX_EXTENSIONS();
230       for(size_t i=0;i<exts.length;i++)
231         addIfExists(_files, name + "." + exts[i]);
232     }
233 
234     string delFileName = IndexFileNames::fileNameFromGeneration(name.c_str(), (string(".") + IndexFileNames::DELETES_EXTENSION).c_str(), delGen);
235     if ( !delFileName.empty() && (delGen >= YES || dir->fileExists(delFileName.c_str()))) {
236       _files.push_back(delFileName);
237     }
238 
239     // Careful logic for norms files
240     if (normGen.values != NULL) {
241       for(size_t i=0;i<normGen.length;i++) {
242         int64_t gen = normGen[i];
243         if (gen >= YES) {
244           // Definitely a separate norm file, with generation:
245           string gens = string(".") + IndexFileNames::SEPARATE_NORMS_EXTENSION;
246           gens += Misc::toString((int64_t)i);
247           _files.push_back(IndexFileNames::fileNameFromGeneration(name.c_str(), gens.c_str(), gen));
248         } else if (NO == gen) {
249           // No separate norms but maybe plain norms
250           // in the non compound file case:
251           if (!hasSingleNormFile && !useCompoundFile) {
252             string fileName = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
253             fileName += i;
254             if (dir->fileExists(fileName.c_str())) {
255               _files.push_back(fileName);
256             }
257           }
258         } else if (CHECK_DIR == gen) {
259           // Pre-2.1: we have to check file existence
260           string fileName;
261           if (useCompoundFile) {
262             fileName = name + "." + IndexFileNames::SEPARATE_NORMS_EXTENSION;
263             fileName += Misc::toString((int64_t)i);
264           } else if (!hasSingleNormFile) {
265             fileName = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
266             fileName += Misc::toString((int64_t)i);
267           }
268           if ( !fileName.empty() && dir->fileExists(fileName.c_str())) {
269             _files.push_back(fileName);
270           }
271         }
272       }
273     } else if (preLockless || (!hasSingleNormFile && !useCompoundFile)) {
274       // Pre-2.1: we have to scan the dir to find all
275       // matching _X.sN/_X.fN files for our segment:
276       string prefix;
277       if (useCompoundFile)
278         prefix = name + "." + IndexFileNames::SEPARATE_NORMS_EXTENSION;
279       else
280         prefix = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
281       size_t prefixLength = prefix.length();
282       vector<string> allFiles;
283       if (dir->list(allFiles) == false ){
284         string err = "cannot read directory ";
285         err += dir->toString();
286         err += ": list() returned null";
287         _CLTHROWA(CL_ERR_IO, err.c_str());
288       }
289       for(size_t i=0;i<allFiles.size();i++) {
290         string& fileName = allFiles[i];
291         if (fileName.length() > prefixLength && _istdigit(fileName[prefixLength]) && fileName.compare(0,prefix.length(),prefix)==0 ) {
292           _files.push_back(fileName);
293         }
294       }
295     }
296     return _files;
297   }
298 
299 
300 
hasDeletions() const301    bool SegmentInfo::hasDeletions() const {
302 	   // Cases:
303 	   //
304 	   //   delGen == NO: this means this segment was written
305 	   //     by the LOCKLESS code and for certain does not have
306 	   //     deletions yet
307 	   //
308 	   //   delGen == CHECK_DIR: this means this segment was written by
309 	   //     pre-LOCKLESS code which means we must check
310 	   //     directory to see if .del file exists
311 	   //
312 	   //   delGen >= YES: this means this segment was written by
313 	   //     the LOCKLESS code and for certain has
314 	   //     deletions
315 	   //
316 	   if (delGen == NO) {
317 		   return false;
318 	   } else if (delGen >= YES) {
319 		   return true;
320 	   } else {
321 		   return dir->fileExists(getDelFileName().c_str());
322 	   }
323    }
324 
advanceDelGen()325    void SegmentInfo::advanceDelGen() {
326 	   // delGen 0 is reserved for pre-LOCKLESS format
327 	   if (delGen == NO) {
328 		   delGen = YES;
329 	   } else {
330 		   delGen++;
331 	   }
332 	   clearFiles();
333    }
334 
clearDelGen()335    void SegmentInfo::clearDelGen() {
336 	   delGen = NO;
337 	   clearFiles();
338    }
339 
clone()340    SegmentInfo* SegmentInfo::clone () {
341 	   SegmentInfo* si = _CLNEW SegmentInfo(name.c_str(), docCount, dir);
342 	   si->isCompoundFile = isCompoundFile;
343 	   si->delGen = delGen;
344 	   si->preLockless = preLockless;
345 	   si->hasSingleNormFile = hasSingleNormFile;
346 	   if (this->normGen.values != NULL) {
347        si->normGen.resize(this->normGen.length);
348        memcpy(si->normGen.values, this->normGen.values, sizeof(int64_t) * this->normGen.length);
349 	   }
350      si->docStoreOffset = docStoreOffset;
351      si->docStoreSegment = docStoreSegment;
352      si->docStoreIsCompoundFile = docStoreIsCompoundFile;
353 
354 	   return si;
355    }
356 
getDelFileName() const357    string SegmentInfo::getDelFileName() const {
358 	   if (delGen == NO) {
359 		   // In this case we know there is no deletion filename
360 		   // against this segment
361 		   return NULL;
362 	   } else {
363 		   // If delGen is CHECK_DIR, it's the pre-lockless-commit file format
364 		   return IndexFileNames::fileNameFromGeneration(name.c_str(), (string(".") + IndexFileNames::DELETES_EXTENSION).c_str(), delGen);
365 	   }
366    }
367 
hasSeparateNorms(const int32_t fieldNumber) const368    bool SegmentInfo::hasSeparateNorms(const int32_t fieldNumber) const {
369 	   if ((normGen.values == NULL && preLockless) || (normGen.values != NULL && normGen[fieldNumber] == CHECK_DIR)) {
370 		   // Must fallback to directory file exists check:
371 		   return dir->fileExists( (name + string(".s") + Misc::toString(fieldNumber)).c_str() );
372 	   } else if (normGen.values == NULL || normGen[fieldNumber] == NO) {
373 		   return false;
374 	   } else {
375 		   return true;
376 	   }
377    }
378 
hasSeparateNorms() const379    bool SegmentInfo::hasSeparateNorms() const {
380 	   if (normGen.values == NULL) {
381 		   if (!preLockless) {
382 			   // This means we were created w/ LOCKLESS code and no
383 			   // norms are written yet:
384 			   return false;
385 		   } else {
386 			   // This means this segment was saved with pre-LOCKLESS
387 			   // code.  So we must fallback to the original
388 			   // directory list check:
389 			   vector<string> result;
390 			   if ( !dir->list(result) ) {
391 				   _CLTHROWA(CL_ERR_IO, (string("cannot read directory: ") + dir->toString() + string(" list() returned NULL")).c_str() );
392 			   }
393 
394          string pattern = name + string(".s");
395 			   for ( vector<string>::iterator itr = result.begin();
396                itr != result.end() ; itr ++ ){
397 				   if(strncmp(itr->c_str(), pattern.c_str(), pattern.length() ) == 0 &&
398               isdigit( (*itr)[pattern.length()])) {
399 					   return true;
400 				   }
401 			   }
402 			   return false;
403 		   }
404 	   } else {
405 		   // This means this segment was saved with LOCKLESS
406 		   // code so we first check whether any normGen's are >= 1
407 		   // (meaning they definitely have separate norms):
408        for(size_t i=0;i<normGen.length;i++) {
409 			   if (normGen[i] >= YES) {
410 				   return true;
411 			   }
412 		   }
413 		   // Next we look for any == 0.  These cases were
414 		   // pre-LOCKLESS and must be checked in directory:
415        for(size_t j=0;j<normGen.length;j++) {
416 			   if (normGen[j] == CHECK_DIR) {
417 				   if (hasSeparateNorms(j)) {
418 					   return true;
419 				   }
420 			   }
421 		   }
422 	   }
423 
424 	   return false;
425    }
426 
advanceNormGen(const int32_t fieldIndex)427    void SegmentInfo::advanceNormGen(const int32_t fieldIndex) {
428 	   if (normGen[fieldIndex] == NO) {
429 		   normGen.values[fieldIndex] = YES;
430 	   } else {
431 		   normGen.values[fieldIndex]++;
432 	   }
433 	   clearFiles();
434    }
435 
getNormFileName(const int32_t number) const436    string SegmentInfo::getNormFileName(const int32_t number) const {
437 	   char prefix[10];
438 
439 	   int64_t gen;
440 	   if (normGen.values == NULL) {
441 		   gen = CHECK_DIR;
442 	   } else {
443 		   gen = normGen[number];
444 	   }
445 
446 	   if (hasSeparateNorms(number)) {
447 		   // case 1: separate norm
448 		   cl_sprintf(prefix, 10, ".s%d", number);
449 		   return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, gen);
450 	   }
451 
452 	   if (hasSingleNormFile) {
453 		   // case 2: lockless (or nrm file exists) - single file for all norms
454 		   cl_sprintf(prefix, 10, ".%s", IndexFileNames::NORMS_EXTENSION);
455 		   return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, WITHOUT_GEN);
456 	   }
457 
458 	   // case 3: norm file for each field
459 	   cl_sprintf(prefix, 10, ".f%d", number);
460 	   return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, WITHOUT_GEN);
461    }
462 
setUseCompoundFile(const bool isCompoundFile)463    void SegmentInfo::setUseCompoundFile(const bool isCompoundFile) {
464 	   if (isCompoundFile) {
465 		   this->isCompoundFile = YES;
466 	   } else {
467 		   this->isCompoundFile = NO;
468 	   }
469 	   clearFiles();
470    }
471 
getUseCompoundFile() const472    bool SegmentInfo::getUseCompoundFile() const {
473 	   if (isCompoundFile == NO) {
474 		   return false;
475 	   } else if (isCompoundFile == YES) {
476 		   return true;
477 	   } else {
478 		   return dir->fileExists( ((string)name + "." + IndexFileNames::COMPOUND_FILE_EXTENSION).c_str() );
479 	   }
480    }
481 
getDocStoreOffset() const482    int32_t SegmentInfo::getDocStoreOffset() const { return docStoreOffset; }
483 
getDocStoreIsCompoundFile() const484    bool SegmentInfo::getDocStoreIsCompoundFile() const { return docStoreIsCompoundFile; }
485 
setDocStoreIsCompoundFile(const bool v)486    void SegmentInfo::setDocStoreIsCompoundFile(const bool v) {
487 	   docStoreIsCompoundFile = v;
488 	   clearFiles();
489    }
490 
getDocStoreSegment() const491    const string& SegmentInfo::getDocStoreSegment() const {
492      return docStoreSegment;
493    }
494 
setDocStoreOffset(const int32_t offset)495    void SegmentInfo::setDocStoreOffset(const int32_t offset) {
496 	   docStoreOffset = offset;
497 	   clearFiles();
498    }
499 
write(CL_NS (store)::IndexOutput * output)500    void SegmentInfo::write(CL_NS(store)::IndexOutput* output) {
501      output->writeString(name);
502 	   output->writeInt(docCount);
503 	   output->writeLong(delGen);
504 	   output->writeInt(docStoreOffset);
505 	   if (docStoreOffset != -1) {
506 		   output->writeString(docStoreSegment);
507 		   output->writeByte(static_cast<uint8_t>(docStoreIsCompoundFile ? 1:0));
508 	   }
509 
510 	   output->writeByte(static_cast<uint8_t>(hasSingleNormFile ? 1:0));
511 	   if (normGen.values == NULL) {
512 		   output->writeInt(NO);
513 	   } else {
514        output->writeInt(normGen.length);
515        for(size_t j = 0; j < normGen.length; j++) {
516 			   output->writeLong(normGen[j]);
517 		   }
518 	   }
519 	   output->writeByte(isCompoundFile);
520    }
521 
clearFiles()522    void SegmentInfo::clearFiles() {
523 	   _files.clear();
524 	   _sizeInBytes = -1;
525    }
526 
527    /** We consider another SegmentInfo instance equal if it
528    *  has the same dir and same name. */
equals(const SegmentInfo * obj)529    bool SegmentInfo::equals(const SegmentInfo* obj) {
530 	   return (obj->dir == this->dir && obj->name.compare(this->name) == 0 );
531    }
532 
533 
534 
535 
536 
537   std::ostream* SegmentInfos::infoStream = NULL;
538 
539   /** If non-null, information about retries when loading
540   * the segments file will be printed to this.
541   */
setInfoStream(std::ostream * infoStream)542   void SegmentInfos::setInfoStream(std::ostream* infoStream) {
543     SegmentInfos::infoStream = infoStream;
544   }
545 
546   /**
547   * @see #setInfoStream
548   */
getInfoStream()549   std::ostream* SegmentInfos::getInfoStream() {
550     return infoStream;
551   }
552 
SegmentInfos(bool deleteMembers,int32_t reserveCount)553   SegmentInfos::SegmentInfos(bool deleteMembers, int32_t reserveCount) :
554       generation(0),lastGeneration(0), infos(deleteMembers) {
555   //Func - Constructor
556   //Pre  - deleteMembers indicates if the instance to be created must delete
557   //       all SegmentInfo instances it manages when the instance is destroyed or not
558   //       true -> must delete, false may not delete
559   //Post - An instance of SegmentInfos has been created.
560 
561       //initialize counter to 0
562       counter = 0;
563       version = Misc::currentTimeMillis();
564 	  if (reserveCount > 1)
565 		  infos.reserve(reserveCount);
566   }
567 
~SegmentInfos()568   SegmentInfos::~SegmentInfos(){
569   //Func - Destructor
570   //Pre  - true
571   //Post - The instance has been destroyed. Depending on the constructor used
572   //       the SegmentInfo instances that this instance managed have been deleted or not.
573 
574 	  //Clear the list of SegmentInfo instances - make sure everything is deleted
575       infos.clear();
576   }
577 
info(int32_t i) const578   SegmentInfo* SegmentInfos::info(int32_t i) const {
579   //Func - Returns a reference to the i-th SegmentInfo in the list.
580   //Pre  - i >= 0
581   //Post - A reference to the i-th SegmentInfo instance has been returned
582 
583       CND_PRECONDITION(i >= 0 && i < infos.size(), "i is out of bounds");
584 
585 	  //Get the i-th SegmentInfo instance
586       SegmentInfo *ret = infos[i];
587 
588       //Condition check to see if the i-th SegmentInfo has been retrieved
589       CND_CONDITION(ret != NULL,"No SegmentInfo instance found");
590 
591       return ret;
592   }
593 
getCurrentSegmentGeneration(std::vector<std::string> & files)594   int64_t SegmentInfos::getCurrentSegmentGeneration( std::vector<std::string>& files ) {
595     if ( files.size() == 0 ) {
596 		  return -1;
597 	  }
598 
599 	  int64_t max = -1;
600 
601     vector<string>::iterator itr = files.begin();
602 		const char* file;
603     size_t seglen = strlen(IndexFileNames::SEGMENTS);
604 	  while ( itr != files.end() ) {
605       file = itr->c_str();
606 		  if ( strncmp( file, IndexFileNames::SEGMENTS, seglen ) == 0 && strcmp( file, IndexFileNames::SEGMENTS_GEN ) != 0 ) {
607 			  int64_t gen = generationFromSegmentsFileName( file );
608 			  if ( gen > max ) {
609 				  max = gen;
610 			  }
611 		  }
612 
613       itr++;
614 	  }
615 
616 	  return max;
617   }
618 
getCurrentSegmentGeneration(const CL_NS (store)::Directory * directory)619   int64_t SegmentInfos::getCurrentSegmentGeneration( const CL_NS(store)::Directory* directory ) {
620 	  vector<string> files;
621     if ( !directory->list(&files) ){
622 		  _CLTHROWA(CL_ERR_IO, (string("cannot read directory ") + directory->toString() + string(": list() returned NULL")).c_str() );
623 	  }
624 	  int64_t gen = getCurrentSegmentGeneration( files );
625 	  return gen;
626   }
627 
getCurrentSegmentFileName(vector<string> & files)628   string SegmentInfos::getCurrentSegmentFileName( vector<string>& files ) {
629 	  return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", getCurrentSegmentGeneration( files ));
630   }
631 
getCurrentSegmentFileName(CL_NS (store)::Directory * directory)632   std::string SegmentInfos::getCurrentSegmentFileName( CL_NS(store)::Directory* directory ) {
633 	  return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", getCurrentSegmentGeneration( directory ));
634   }
635 
getCurrentSegmentFileName()636   std::string SegmentInfos::getCurrentSegmentFileName() {
637 	  return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", lastGeneration );
638   }
639 
generationFromSegmentsFileName(const char * fileName)640   int64_t SegmentInfos::generationFromSegmentsFileName( const char* fileName ) {
641 	  if ( strcmp( fileName, IndexFileNames::SEGMENTS ) == 0 ) {
642 		  return 0;
643 	  } else if ( strncmp( fileName, IndexFileNames::SEGMENTS, strlen(IndexFileNames::SEGMENTS) ) == 0 ) {
644 		  return CL_NS(util)::Misc::base36ToLong( fileName + strlen( IndexFileNames::SEGMENTS )+1 );
645 	  } else {
646 		  TCHAR err[CL_MAX_PATH + 35];
647 		  _sntprintf(err,CL_MAX_PATH + 35,_T("fileName \"%s\" is not a segments file"), fileName);
648 		  _CLTHROWA(CL_ERR_IllegalArgument, err);
649 		  return 0;
650 	  }
651   }
652 
getNextSegmentFileName()653   std::string SegmentInfos::getNextSegmentFileName() {
654 	  int64_t nextGeneration;
655 
656 	  if ( generation == -1 ) {
657 		  nextGeneration = 1;
658 	  } else {
659 		  nextGeneration = generation+1;
660 	  }
661 
662 	  return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", nextGeneration );
663   }
664 
clearto(size_t from,size_t end)665   void SegmentInfos::clearto(size_t from, size_t end){
666 	size_t range = end - from;
667       if ( (infos.size() - from) >= range) { // Make sure we actually need to remove
668         segmentInfosType::iterator itr,bitr=infos.begin()+from,eitr=infos.end();
669         size_t count = 0;
670         for(itr=bitr;itr!=eitr && count < range;++itr, count++) {
671                 _CLLDELETE((*itr));
672             }
673             infos.erase(bitr,bitr + count);
674         }
675   }
add(SegmentInfo * info,int32_t pos)676   void SegmentInfos::add(SegmentInfo* info, int32_t pos){
677     if ( pos == -1 ){
678       infos.push_back(info);
679     }else{
680       if ( pos < 0 || pos >= (int32_t)infos.size()+1 ) _CLTHROWA(CL_ERR_IllegalArgument, "pos is out of range");
681       infos.insert( infos.begin()+pos, info );
682     }
683   }
size() const684   int32_t SegmentInfos::size() const{
685 	  return infos.size();
686   }
elementAt(int32_t pos)687   SegmentInfo* SegmentInfos::elementAt(int32_t pos) {
688 	  return infos.at(pos);
689   }
setElementAt(SegmentInfo * si,int32_t pos)690   void SegmentInfos::setElementAt(SegmentInfo* si, int32_t pos) {
691 	  infos.set(pos, si);
692   }
clear()693   void SegmentInfos::clear() { infos.clear(); }
694 
695 
insert(SegmentInfos * _infos,bool takeMemory)696   void SegmentInfos::insert(SegmentInfos* _infos, bool takeMemory){
697     infos.insert(infos.end(),_infos->infos.begin(),_infos->infos.end());
698     if ( takeMemory ){
699       while (_infos->infos.size() > 0 )
700         _infos->infos.remove(_infos->infos.begin(), true );
701     }
702   }
insert(SegmentInfo * info)703 	void SegmentInfos::insert(SegmentInfo* info){
704     infos.push_back(info);
705   }
indexOf(const SegmentInfo * info) const706 	int32_t SegmentInfos::indexOf(const SegmentInfo* info) const{
707     segmentInfosType::const_iterator itr = infos.begin();
708     int32_t c=-1;
709     while ( itr != infos.end()){
710       c++;
711       if ( *itr == info ){
712         return c;
713       }
714       itr++;
715     }
716     return -1;
717   }
range(size_t from,size_t to,SegmentInfos & ret) const718 	void SegmentInfos::range(size_t from, size_t to, SegmentInfos& ret) const{
719     segmentInfosType::const_iterator itr = infos.begin();
720     itr+= from;
721     for (size_t i=from;i<to && itr != infos.end();i++){
722       ret.infos.push_back(*itr);
723 
724       itr++;
725     }
726   }
remove(size_t index,bool dontDelete)727   void SegmentInfos::remove(size_t index, bool dontDelete){
728     infos.remove(index, dontDelete);
729   }
730 
read(Directory * directory,const char * segmentFileName)731   void SegmentInfos::read(Directory* directory, const char* segmentFileName){
732 	  bool success = false;
733 
734 	  // Clear any previous segments:
735 	  clear();
736 
737 	  IndexInput* input = directory->openInput(segmentFileName);
738 	  CND_CONDITION(input != NULL,"input == NULL");
739 
740 	  generation = generationFromSegmentsFileName( segmentFileName );
741 	  lastGeneration = generation;
742 
743 	  try {
744 		  int32_t format = input->readInt();
745 		  if(format < 0){     // file contains explicit format info
746 			  // check that it is a format we can understand
747 			  if (format < CURRENT_FORMAT){
748 				  char err[30];
749 				  cl_sprintf(err,30,"Unknown format version: %d", format);
750 				  _CLTHROWA(CL_ERR_CorruptIndex, err);
751 			  }
752 			  version = input->readLong(); // read version
753 			  counter = input->readInt(); // read counter
754 		  }
755 		  else{     // file is in old format without explicit format info
756 			  counter = format;
757 		  }
758 
759 		  for (int32_t i = input->readInt(); i > 0; i--) { // read segmentInfos
760 			  infos.push_back( _CLNEW SegmentInfo(directory, format, input) );
761 		  }
762 
763 		  if(format >= 0){    // in old format the version number may be at the end of the file
764 			  if (input->getFilePointer() >= input->length())
765 				  version = CL_NS(util)::Misc::currentTimeMillis(); // old file format without version number
766 			  else
767 				  version = input->readLong(); // read version
768 		  }
769 		  success = true;
770 	  } _CLFINALLY({
771 		  input->close();
772 		  _CLDELETE(input);
773 		  if (!success) {
774 			  // Clear any segment infos we had loaded so we
775 			  // have a clean slate on retry:
776 			  clear();
777 		  }
778 	  });
779   }
780 
read(Directory * directory)781   void SegmentInfos::read(Directory* directory) {
782 	  generation = lastGeneration = -1;
783 
784 	  FindSegmentsRead find(directory, this);
785 
786 	  find.run();
787   }
788 
789 
write(Directory * directory)790   void SegmentInfos::write(Directory* directory){
791   //Func - Writes a new segments file based upon the SegmentInfo instances it manages
792   //Pre  - directory is a valid reference to a Directory
793   //Post - The new segment has been written to disk
794 
795     string segmentFileName = getNextSegmentFileName();
796 
797     // Always advance the generation on write:
798     if (generation == -1) {
799       generation = 1;
800     } else {
801       generation++;
802     }
803 
804     IndexOutput* output = directory->createOutput(segmentFileName.c_str());
805 
806     bool success = false;
807 
808     try {
809       output->writeInt(CURRENT_FORMAT); // write FORMAT
810       output->writeLong(++version); // every write changes
811                                    // the index
812       output->writeInt(counter); // write counter
813       output->writeInt(size()); // write infos
814       for (int32_t i = 0; i < size(); i++) {
815         info(i)->write(output);
816       }
817     }_CLFINALLY (
818       try {
819         output->close();
820         _CLDELETE(output);
821         success = true;
822       } _CLFINALLY (
823         if (!success) {
824           // Try not to leave a truncated segments_N file in
825           // the index:
826           directory->deleteFile(segmentFileName.c_str());
827         }
828       )
829     )
830 
831     try {
832       output = directory->createOutput(IndexFileNames::SEGMENTS_GEN);
833       try {
834         output->writeInt(FORMAT_LOCKLESS);
835         output->writeLong(generation);
836         output->writeLong(generation);
837       } _CLFINALLY(
838         output->close();
839         _CLDELETE(output);
840       )
841     } catch (CLuceneError& e) {
842       if ( e.number() != CL_ERR_IO ) throw e;
843       // It's OK if we fail to write this file since it's
844       // used only as one of the retry fallbacks.
845     }
846 
847     lastGeneration = generation;
848   }
849 
clone() const850   SegmentInfos* SegmentInfos::clone() const{
851 	  SegmentInfos* sis = _CLNEW SegmentInfos(true, infos.size());
852 	  for(size_t i=0;i<infos.size();i++) {
853 		  sis->setElementAt(infos[i]->clone(), i);
854 	  }
855 	  return sis;
856   }
857 
  // Returns the current value of the version member (write() increments it on
  // every call).
  int64_t SegmentInfos::getVersion() const { return version; }
  // Returns the current value of the generation member (-1 until it has been
  // set by a read or write, see the reset at the start of read()).
  int64_t SegmentInfos::getGeneration() const { return generation; }
  // Returns the lastGeneration member; write() sets it to generation once it
  // has finished.
  int64_t SegmentInfos::getLastGeneration() const { return lastGeneration; }
861 
readCurrentVersion(Directory * directory)862   int64_t SegmentInfos::readCurrentVersion(Directory* directory){
863 	  FindSegmentsVersion find(directory);
864 	  return find.run();
865   }
866 
867   //void SegmentInfos::setDefaultGenFileRetryCount(const int32_t count) { defaultGenFileRetryCount = count; }
  // Returns defaultGenFileRetryCount, the upper bound on the number of attempts
  // _FindSegmentsFile::doRun() makes to open and read segments.gen.
  int32_t SegmentInfos::getDefaultGenFileRetryCount() { return defaultGenFileRetryCount; }
869 
870   //void SegmentInfos::setDefaultGenFileRetryPauseMsec(const int32_t msec) { defaultGenFileRetryPauseMsec = msec; }
  // Returns defaultGenFileRetryPauseMsec, the value passed to _LUCENE_SLEEP
  // between attempts to read segments.gen in _FindSegmentsFile::doRun().
  int32_t SegmentInfos::getDefaultGenFileRetryPauseMsec() { return defaultGenFileRetryPauseMsec; }
872 
873   //void SegmentInfos::setDefaultGenLookaheadCount(const int32_t count) { defaultGenLookaheadCount = count;}
  // Returns defaultGenLookaheadCount, the maximum number of times
  // _FindSegmentsFile::doRun() advances the generation in its fallback method.
  // NOTE(review): the method name is spelled "Lookahed"; it is left unchanged
  // here because it is part of the declared interface.
  int32_t SegmentInfos::getDefaultGenLookahedCount() { return defaultGenLookaheadCount; }
875 
doRun()876   void SegmentInfos::_FindSegmentsFile::doRun(){
877     string segmentFileName;
878     int64_t lastGen = -1;
879     int64_t gen = 0;
880     int32_t genLookaheadCount = 0;
881     bool retry = false;
882     CLuceneError exc; //saved exception
883 
884     int32_t method = 0;
885 
886     // Loop until we succeed in calling doBody() without
887     // hitting an IOException.  An IOException most likely
888     // means a commit was in process and has finished, in
889     // the time it took us to load the now-old infos files
890     // (and segments files).  It's also possible it's a
891     // true error (corrupt index).  To distinguish these,
892     // on each retry we must see "forward progress" on
893     // which generation we are trying to load.  If we
894     // don't, then the original error is real and we throw
895     // it.
896 
897     // We have three methods for determining the current
898     // generation.  We try the first two in parallel, and
899     // fall back to the third when necessary.
900 
901     while( true ) {
902 
903       if ( 0 == method ) {
904         // Method 1: list the directory and use the highest
905         // segments_N file.  This method works well as long
906         // as there is no stale caching on the directory
907         // contents (NOTE: NFS clients often have such stale
908         // caching):
909         vector<string> files;
910 
911         int64_t genA = -1;
912 
913         if (directory != NULL){
914           if (directory->list(&files)) {
915             genA = getCurrentSegmentGeneration( files );
916             files.clear();
917           }
918         }
919 
920 
921         if ( infoStream ){
922           (*infoStream) << "[SIS]: directory listing genA=" << genA << "\n";
923         }
924 
925         // Method 2: open segments.gen and read its
926         // contents.  Then we take the larger of the two
927         // gen's.  This way, if either approach is hitting
928         // a stale cache (NFS) we have a better chance of
929         // getting the right generation.
930         int64_t genB = -1;
931         if (directory != NULL) {
932           CLuceneError e;
933           for(int32_t i=0;i<defaultGenFileRetryCount;i++) {
934             IndexInput* genInput = NULL;
935             if ( ! directory->openInput(IndexFileNames::SEGMENTS_GEN, genInput, e) ){
936               if (e.number() == CL_ERR_IO ) {
937 	              if ( infoStream ){
938                   (*infoStream) << "[SIS]: segments.gen open: IOException " << e.what() << "\n";
939                 }
940                 break;
941               } else {
942 				  genInput->close();
943 	              _CLLDELETE(genInput);
944 	              throw e;
945               }
946             }
947 
948             if (genInput != NULL) {
949               try {
950 	              int32_t version = genInput->readInt();
951 	              if (version == FORMAT_LOCKLESS) {
952 		              int64_t gen0 = genInput->readLong();
953 		              int64_t gen1 = genInput->readLong();
954 		              //CL_TRACE("fallback check: %d; %d", gen0, gen1);
955 		              if (gen0 == gen1) {
956 			              // The file is consistent.
957 			              genB = gen0;
958 			              genInput->close();
959 			              _CLDELETE(genInput);
960 			              break;
961 		              }
962 	              }
963               } catch (CLuceneError &err2) {
964 	              if (err2.number() != CL_ERR_IO) {
965 					  genInput->close();
966 		              _CLLDELETE(genInput);
967 		              throw err2; // retry only for IOException
968 	              }
969               } _CLFINALLY({
970 	              genInput->close();
971 	              _CLDELETE(genInput);
972               });
973             }
974 
975             _LUCENE_SLEEP(defaultGenFileRetryPauseMsec);
976             /*
977             //todo: Wrap the LUCENE_SLEEP call above with the following try/catch block if
978             //	  InterruptedException is implemented
979             try {
980             } catch (CLuceneError &e) {
981             //if (err2.number != CL_ERR_Interrupted) // retry only for InterruptedException
982             // todo: see if CL_ERR_Interrupted needs to be added...
983             throw e;
984             }*/
985 
986           }
987         }
988 
989         //CL_TRACE("%s check: genB=%d", IndexFileNames::SEGMENTS_GEN, genB);
990 
991         // Pick the larger of the two gen's:
992         if (genA > genB)
993           gen = genA;
994         else
995           gen = genB;
996 
997         if (gen == -1) {
998           // Neither approach found a generation
999           _CLTHROWA(CL_ERR_IO, (string("No segments* file found in ") + directory->toString()).c_str());
1000         }
1001       }
1002 
1003       // Third method (fallback if first & second methods
1004       // are not reliable): since both directory cache and
1005       // file contents cache seem to be stale, just
1006       // advance the generation.
1007       if ( 1 == method || ( 0 == method && lastGen == gen && retry )) {
1008 
1009         method = 1;
1010 
1011         if (genLookaheadCount < defaultGenLookaheadCount) {
1012           gen++;
1013           genLookaheadCount++;
1014           //CL_TRACE("look ahead increment gen to %d", gen);
1015         }
1016       }
1017 
1018       if (lastGen == gen) {
1019 
1020         // This means we're about to try the same
1021         // segments_N last tried.  This is allowed,
1022         // exactly once, because writer could have been in
1023         // the process of writing segments_N last time.
1024 
1025         if (retry) {
1026           // OK, we've tried the same segments_N file
1027           // twice in a row, so this must be a real
1028           // error.  We throw the original exception we
1029           // got.
1030           throw exc;
1031         } else {
1032           retry = true;
1033         }
1034 
1035       } else {
1036         // Segment file has advanced since our last loop, so
1037         // reset retry:
1038         retry = false;
1039       }
1040 
1041       lastGen = gen;
1042 
1043       segmentFileName = IndexFileNames::fileNameFromGeneration(IndexFileNames::SEGMENTS, "", gen);
1044 
1045       CLuceneError saved_error;
1046       if ( tryDoBody(segmentFileName.c_str(), saved_error) ){
1047         return;
1048       }
1049 
1050       // Save the original root cause:
1051       if (exc.number() == 0) {
1052         CND_CONDITION( saved_error.number() > 0, "Unsupported error code");
1053         exc.set(saved_error.number(),saved_error.what());
1054       }
1055 
1056       //CL_TRACE("primary Exception on '" + segmentFileName + "': " + err + "'; will retry: retry=" + retry + "; gen = " + gen);
1057 
1058       if (!retry && gen > 1) {
1059 
1060         // This is our first time trying this segments
1061         // file (because retry is false), and, there is
1062         // possibly a segments_(N-1) (because gen > 1).
1063         // So, check if the segments_(N-1) exists and
1064         // try it if so:
1065         string prevSegmentFileName = IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", gen-1 );
1066 
1067         bool prevExists=false;
1068         if (directory != NULL)
1069           prevExists = directory->fileExists(prevSegmentFileName.c_str());
1070         else
1071           prevExists = Misc::dir_Exists( (string(fileDirectory) + prevSegmentFileName).c_str() );
1072 
1073         if (prevExists) {
1074           //CL_TRACE("fallback to prior segment file '%s'", prevSegmentFileName);
1075           CLuceneError saved_error;
1076           if ( tryDoBody(prevSegmentFileName.c_str(), saved_error) ){
1077             return;
1078           }
1079           //CL_TRACE("secondary Exception on '" + prevSegmentFileName + "': " + err2 + "'; will retry");
1080         }
1081       }
1082     }
1083   }
  //Func - Constructor. Forwards dir to the FindSegmentsFile<bool> base and
  //       remembers the SegmentInfos instance that doBody() will populate.
  //Pre  - _this is the SegmentInfos whose read() is to be invoked by doBody()
  SegmentInfos::FindSegmentsRead::FindSegmentsRead( CL_NS(store)::Directory* dir, SegmentInfos* _this ) :
    SegmentInfos::FindSegmentsFile<bool>(dir) {
      this->_this = _this;
  }
  //Func - Reads the given segments file into the SegmentInfos passed to the constructor
  //Pre  - segmentFileName is the name of the segments file to read
  //Post - Always returns true; a failure is expected to surface as an error
  //       raised by read() (assumption - read() is defined outside this section)
  bool SegmentInfos::FindSegmentsRead::doBody( const char* segmentFileName ) {
	  //Have SegmentInfos read the segments file in directory
	  _this->read(directory, segmentFileName);
    return true;
  }
1093 
  //Func - Constructor. Only forwards dir to the FindSegmentsFile<int64_t> base;
  //       this class sets up no state of its own here.
  SegmentInfos::FindSegmentsVersion::FindSegmentsVersion( CL_NS(store)::Directory* dir ) :
    SegmentInfos::FindSegmentsFile<int64_t>(dir) {
  }
1097 
doBody(const char * segmentFileName)1098   int64_t SegmentInfos::FindSegmentsVersion::doBody( const char* segmentFileName ) {
1099 
1100 	  IndexInput* input = directory->openInput( segmentFileName );
1101 
1102 	  int32_t format = 0;
1103 	  int64_t version=0;
1104 	  try {
1105 		  format = input->readInt();
1106 		  if(format < 0){
1107 			  if(format < CURRENT_FORMAT){
1108 				  char err[30];
1109 				  cl_sprintf(err,30,"Unknown format version: %d",format);
1110 				  _CLTHROWA(CL_ERR_CorruptIndex,err);
1111 			  }
1112 			  version = input->readLong(); // read version
1113 		  }
1114 	  }
1115 	  _CLFINALLY( input->close(); _CLDELETE(input); );
1116 
1117 	  if(format < 0)
1118 		  return version;
1119 
1120 	  // We cannot be sure about the format of the file.
1121 	  // Therefore we have to read the whole file and cannot simply seek to the version entry.
1122 	  SegmentInfos* sis = _CLNEW SegmentInfos();
1123 	  sis->read(directory, segmentFileName);
1124 	  version = sis->getVersion();
1125 	  _CLDELETE(sis);
1126 
1127 	  return version;
1128 
1129   }
1130 
1131 CL_NS_END
1132