1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8
9 #include "_SegmentInfos.h"
10 #include "_IndexFileNames.h"
11 #include "_SegmentHeader.h"
12 #include "MultiReader.h"
13 #include <assert.h>
14 #include <iostream>
15
16 #include "CLucene/store/Directory.h"
17 #include "CLucene/util/Misc.h"
18
19 CL_NS_USE(store)
CL_NS_USE(util)20 CL_NS_USE(util)
21
22 CL_NS_DEF(index)
23
24 SegmentInfo::SegmentInfo(const char* _name, const int32_t _docCount, CL_NS(store)::Directory* _dir,
25 bool _isCompoundFile, bool _hasSingleNormFile,
26 int32_t _docStoreOffset, const char* _docStoreSegment, bool _docStoreIsCompoundFile)
27 :
28 docCount(_docCount),
29 preLockless(false),
30 delGen(SegmentInfo::NO),
31 isCompoundFile(_isCompoundFile ? SegmentInfo::YES : SegmentInfo::NO),
32 hasSingleNormFile(_hasSingleNormFile),
33 _sizeInBytes(-1),
34 docStoreOffset(_docStoreOffset),
35 docStoreSegment( _docStoreSegment == NULL ? "" : _docStoreSegment ),
36 docStoreIsCompoundFile(_docStoreIsCompoundFile)
37 {
38 CND_PRECONDITION(docStoreOffset == -1 || !docStoreSegment.empty(), "failed testing for (docStoreOffset == -1 || docStoreSegment != NULL)");
39
40 this->name = _name;
41 this->dir = _dir;
42 }
43
segString(Directory * dir)44 string SegmentInfo::segString(Directory* dir) {
45 string cfs;
46 try {
47 if (getUseCompoundFile())
48 cfs = "c";
49 else
50 cfs = "C";
51 } catch (CLuceneError& ioe) {
52 if ( ioe.number() != CL_ERR_IO ) throw ioe;
53 cfs = "?";
54 }
55
56 string docStore;
57
58 if (docStoreOffset != -1)
59 docStore = string("->") + docStoreSegment;
60 else
61 docStore = "";
62
63 return string(name) + ":" +
64 cfs +
65 string(this->dir == dir ? "" : "x") +
66 Misc::toString(docCount) + docStore;
67 }
SegmentInfo(CL_NS (store)::Directory * _dir,int32_t format,CL_NS (store)::IndexInput * input)68 SegmentInfo::SegmentInfo(CL_NS(store)::Directory* _dir, int32_t format, CL_NS(store)::IndexInput* input):
69 _sizeInBytes(-1)
70 {
71 this->dir = _dir;
72
73 {
74 char aname[CL_MAX_PATH];
75 input->readString(aname, CL_MAX_PATH);
76 this->name = aname;
77 }
78
79 docCount = input->readInt();
80 if (format <= SegmentInfos::FORMAT_LOCKLESS) {
81 delGen = input->readLong();
82 if (format <= SegmentInfos::FORMAT_SHARED_DOC_STORE) {
83 docStoreOffset = input->readInt();
84 if (docStoreOffset != -1) {
85 char aname[CL_MAX_PATH];
86 input->readString(aname, CL_MAX_PATH);
87 docStoreSegment = aname;
88 docStoreIsCompoundFile = (1 == input->readByte());
89 } else {
90 docStoreSegment = name;
91 docStoreIsCompoundFile = false;
92 }
93 } else {
94 docStoreOffset = -1;
95 docStoreSegment = name;
96 docStoreIsCompoundFile = false;
97 }
98 if (format <= SegmentInfos::FORMAT_SINGLE_NORM_FILE) {
99 hasSingleNormFile = (1 == input->readByte());
100 } else {
101 hasSingleNormFile = false;
102 }
103 int32_t numNormGen = input->readInt();
104 normGen.deleteValues();
105 if (numNormGen == NO) {
106 // normGen is already NULL, we'll just set normGenLen to 0
107 } else {
108 normGen.values = _CL_NEWARRAY(int64_t, numNormGen);
109 normGen.length = numNormGen;
110 for(int32_t j=0;j<numNormGen;j++) {
111 normGen.values[j] = input->readLong();
112 }
113 }
114 isCompoundFile = input->readByte();
115 preLockless = (isCompoundFile == CHECK_DIR);
116 } else {
117 delGen = CHECK_DIR;
118 //normGen=NULL; normGenLen=0;
119 isCompoundFile = CHECK_DIR;
120 preLockless = true;
121 hasSingleNormFile = false;
122 docStoreOffset = -1;
123 docStoreIsCompoundFile = false;
124 }
125 }
126
reset(const SegmentInfo * src)127 void SegmentInfo::reset(const SegmentInfo* src) {
128 clearFiles();
129 this->name = src->name;
130 docCount = src->docCount;
131 dir = src->dir;
132 preLockless = src->preLockless;
133 delGen = src->delGen;
134 docStoreOffset = src->docStoreOffset;
135 docStoreIsCompoundFile = src->docStoreIsCompoundFile;
136 if (src->normGen.values == NULL) {
137 this->normGen.deleteValues();
138 }else{
139 // optimized case to allocate new array only if current memory buffer is too small
140 if (this->normGen.length < src->normGen.length) {
141 normGen.resize(src->normGen.length);
142 }else{
143 this->normGen.length = src->normGen.length;
144 }
145 memcpy(this->normGen.values, src->normGen.values, sizeof(int64_t) * this->normGen.length);
146 }
147 isCompoundFile = src->isCompoundFile;
148 hasSingleNormFile = src->hasSingleNormFile;
149 }
150
~SegmentInfo()151 SegmentInfo::~SegmentInfo(){
152 normGen.deleteValues();
153 }
154
setNumFields(const int32_t numFields)155 void SegmentInfo::setNumFields(const int32_t numFields) {
156 if (normGen.values == NULL) {
157 // normGen is null if we loaded a pre-2.1 segment
158 // file, or, if this segments file hasn't had any
159 // norms set against it yet:
160 normGen.resize(numFields);
161
162 if (preLockless) {
163 // Do nothing: thus leaving normGen[k]==CHECK_DIR (==0), so that later we know
164 // we have to check filesystem for norm files, because this is prelockless.
165
166 } else {
167 // This is a FORMAT_LOCKLESS segment, which means
168 // there are no separate norms:
169 for(int32_t i=0;i<numFields;i++) {
170 normGen.values[i] = NO;
171 }
172 }
173 }
174 }
175 /** Returns total size in bytes of all of files used by
176 * this segment. */
sizeInBytes()177 int64_t SegmentInfo::sizeInBytes(){
178 if (_sizeInBytes == -1) {
179 const vector<string>& __files = files();
180 size_t size = __files.size();
181 _sizeInBytes = 0;
182 for(size_t i=0;i<size;i++) {
183 const char* fileName = __files[i].c_str();
184 // We don't count bytes used by a shared doc store
185 // against this segment:
186 if (docStoreOffset == -1 || !IndexFileNames::isDocStoreFile(fileName))
187 _sizeInBytes += dir->fileLength(fileName);
188 }
189 }
190 return _sizeInBytes;
191 }
192
addIfExists(std::vector<std::string> & files,const std::string & fileName)193 void SegmentInfo::addIfExists(std::vector<std::string>& files, const std::string& fileName){
194 if (dir->fileExists(fileName.c_str()))
195 files.push_back(fileName);
196 }
197
files()198 const vector<string>& SegmentInfo::files(){
199 if (!_files.empty()) {
200 // Already cached:
201 return _files;
202 }
203
204 bool useCompoundFile = getUseCompoundFile();
205
206 if (useCompoundFile) {
207 _files.push_back( string(name) + "." + IndexFileNames::COMPOUND_FILE_EXTENSION);
208 } else {
209 ConstValueArray<const char*>& exts = IndexFileNames::NON_STORE_INDEX_EXTENSIONS();
210 for(size_t i=0;i<exts.length;i++){
211 addIfExists(_files, name + "." + exts[i]);
212 }
213 }
214
215 if (docStoreOffset != -1) {
216 // We are sharing doc stores (stored fields, term
217 // vectors) with other segments
218 assert (!docStoreSegment.empty());
219 if (docStoreIsCompoundFile) {
220 _files.push_back(docStoreSegment + "." + IndexFileNames::COMPOUND_FILE_STORE_EXTENSION);
221 } else {
222 ConstValueArray<const char*>& exts = IndexFileNames::STORE_INDEX_EXTENSIONS();
223 for(size_t i=0;i<exts.length;i++)
224 addIfExists(_files, docStoreSegment + "." + exts[i]);
225 }
226 } else if (!useCompoundFile) {
227 // We are not sharing, and, these files were not
228 // included in the compound file
229 ConstValueArray<const char*>& exts = IndexFileNames::STORE_INDEX_EXTENSIONS();
230 for(size_t i=0;i<exts.length;i++)
231 addIfExists(_files, name + "." + exts[i]);
232 }
233
234 string delFileName = IndexFileNames::fileNameFromGeneration(name.c_str(), (string(".") + IndexFileNames::DELETES_EXTENSION).c_str(), delGen);
235 if ( !delFileName.empty() && (delGen >= YES || dir->fileExists(delFileName.c_str()))) {
236 _files.push_back(delFileName);
237 }
238
239 // Careful logic for norms files
240 if (normGen.values != NULL) {
241 for(size_t i=0;i<normGen.length;i++) {
242 int64_t gen = normGen[i];
243 if (gen >= YES) {
244 // Definitely a separate norm file, with generation:
245 string gens = string(".") + IndexFileNames::SEPARATE_NORMS_EXTENSION;
246 gens += Misc::toString((int64_t)i);
247 _files.push_back(IndexFileNames::fileNameFromGeneration(name.c_str(), gens.c_str(), gen));
248 } else if (NO == gen) {
249 // No separate norms but maybe plain norms
250 // in the non compound file case:
251 if (!hasSingleNormFile && !useCompoundFile) {
252 string fileName = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
253 fileName += i;
254 if (dir->fileExists(fileName.c_str())) {
255 _files.push_back(fileName);
256 }
257 }
258 } else if (CHECK_DIR == gen) {
259 // Pre-2.1: we have to check file existence
260 string fileName;
261 if (useCompoundFile) {
262 fileName = name + "." + IndexFileNames::SEPARATE_NORMS_EXTENSION;
263 fileName += Misc::toString((int64_t)i);
264 } else if (!hasSingleNormFile) {
265 fileName = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
266 fileName += Misc::toString((int64_t)i);
267 }
268 if ( !fileName.empty() && dir->fileExists(fileName.c_str())) {
269 _files.push_back(fileName);
270 }
271 }
272 }
273 } else if (preLockless || (!hasSingleNormFile && !useCompoundFile)) {
274 // Pre-2.1: we have to scan the dir to find all
275 // matching _X.sN/_X.fN files for our segment:
276 string prefix;
277 if (useCompoundFile)
278 prefix = name + "." + IndexFileNames::SEPARATE_NORMS_EXTENSION;
279 else
280 prefix = name + "." + IndexFileNames::PLAIN_NORMS_EXTENSION;
281 size_t prefixLength = prefix.length();
282 vector<string> allFiles;
283 if (dir->list(allFiles) == false ){
284 string err = "cannot read directory ";
285 err += dir->toString();
286 err += ": list() returned null";
287 _CLTHROWA(CL_ERR_IO, err.c_str());
288 }
289 for(size_t i=0;i<allFiles.size();i++) {
290 string& fileName = allFiles[i];
291 if (fileName.length() > prefixLength && _istdigit(fileName[prefixLength]) && fileName.compare(0,prefix.length(),prefix)==0 ) {
292 _files.push_back(fileName);
293 }
294 }
295 }
296 return _files;
297 }
298
299
300
hasDeletions() const301 bool SegmentInfo::hasDeletions() const {
302 // Cases:
303 //
304 // delGen == NO: this means this segment was written
305 // by the LOCKLESS code and for certain does not have
306 // deletions yet
307 //
308 // delGen == CHECK_DIR: this means this segment was written by
309 // pre-LOCKLESS code which means we must check
310 // directory to see if .del file exists
311 //
312 // delGen >= YES: this means this segment was written by
313 // the LOCKLESS code and for certain has
314 // deletions
315 //
316 if (delGen == NO) {
317 return false;
318 } else if (delGen >= YES) {
319 return true;
320 } else {
321 return dir->fileExists(getDelFileName().c_str());
322 }
323 }
324
advanceDelGen()325 void SegmentInfo::advanceDelGen() {
326 // delGen 0 is reserved for pre-LOCKLESS format
327 if (delGen == NO) {
328 delGen = YES;
329 } else {
330 delGen++;
331 }
332 clearFiles();
333 }
334
clearDelGen()335 void SegmentInfo::clearDelGen() {
336 delGen = NO;
337 clearFiles();
338 }
339
clone()340 SegmentInfo* SegmentInfo::clone () {
341 SegmentInfo* si = _CLNEW SegmentInfo(name.c_str(), docCount, dir);
342 si->isCompoundFile = isCompoundFile;
343 si->delGen = delGen;
344 si->preLockless = preLockless;
345 si->hasSingleNormFile = hasSingleNormFile;
346 if (this->normGen.values != NULL) {
347 si->normGen.resize(this->normGen.length);
348 memcpy(si->normGen.values, this->normGen.values, sizeof(int64_t) * this->normGen.length);
349 }
350 si->docStoreOffset = docStoreOffset;
351 si->docStoreSegment = docStoreSegment;
352 si->docStoreIsCompoundFile = docStoreIsCompoundFile;
353
354 return si;
355 }
356
getDelFileName() const357 string SegmentInfo::getDelFileName() const {
358 if (delGen == NO) {
359 // In this case we know there is no deletion filename
360 // against this segment
361 return NULL;
362 } else {
363 // If delGen is CHECK_DIR, it's the pre-lockless-commit file format
364 return IndexFileNames::fileNameFromGeneration(name.c_str(), (string(".") + IndexFileNames::DELETES_EXTENSION).c_str(), delGen);
365 }
366 }
367
hasSeparateNorms(const int32_t fieldNumber) const368 bool SegmentInfo::hasSeparateNorms(const int32_t fieldNumber) const {
369 if ((normGen.values == NULL && preLockless) || (normGen.values != NULL && normGen[fieldNumber] == CHECK_DIR)) {
370 // Must fallback to directory file exists check:
371 return dir->fileExists( (name + string(".s") + Misc::toString(fieldNumber)).c_str() );
372 } else if (normGen.values == NULL || normGen[fieldNumber] == NO) {
373 return false;
374 } else {
375 return true;
376 }
377 }
378
hasSeparateNorms() const379 bool SegmentInfo::hasSeparateNorms() const {
380 if (normGen.values == NULL) {
381 if (!preLockless) {
382 // This means we were created w/ LOCKLESS code and no
383 // norms are written yet:
384 return false;
385 } else {
386 // This means this segment was saved with pre-LOCKLESS
387 // code. So we must fallback to the original
388 // directory list check:
389 vector<string> result;
390 if ( !dir->list(result) ) {
391 _CLTHROWA(CL_ERR_IO, (string("cannot read directory: ") + dir->toString() + string(" list() returned NULL")).c_str() );
392 }
393
394 string pattern = name + string(".s");
395 for ( vector<string>::iterator itr = result.begin();
396 itr != result.end() ; itr ++ ){
397 if(strncmp(itr->c_str(), pattern.c_str(), pattern.length() ) == 0 &&
398 isdigit( (*itr)[pattern.length()])) {
399 return true;
400 }
401 }
402 return false;
403 }
404 } else {
405 // This means this segment was saved with LOCKLESS
406 // code so we first check whether any normGen's are >= 1
407 // (meaning they definitely have separate norms):
408 for(size_t i=0;i<normGen.length;i++) {
409 if (normGen[i] >= YES) {
410 return true;
411 }
412 }
413 // Next we look for any == 0. These cases were
414 // pre-LOCKLESS and must be checked in directory:
415 for(size_t j=0;j<normGen.length;j++) {
416 if (normGen[j] == CHECK_DIR) {
417 if (hasSeparateNorms(j)) {
418 return true;
419 }
420 }
421 }
422 }
423
424 return false;
425 }
426
advanceNormGen(const int32_t fieldIndex)427 void SegmentInfo::advanceNormGen(const int32_t fieldIndex) {
428 if (normGen[fieldIndex] == NO) {
429 normGen.values[fieldIndex] = YES;
430 } else {
431 normGen.values[fieldIndex]++;
432 }
433 clearFiles();
434 }
435
getNormFileName(const int32_t number) const436 string SegmentInfo::getNormFileName(const int32_t number) const {
437 char prefix[10];
438
439 int64_t gen;
440 if (normGen.values == NULL) {
441 gen = CHECK_DIR;
442 } else {
443 gen = normGen[number];
444 }
445
446 if (hasSeparateNorms(number)) {
447 // case 1: separate norm
448 cl_sprintf(prefix, 10, ".s%d", number);
449 return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, gen);
450 }
451
452 if (hasSingleNormFile) {
453 // case 2: lockless (or nrm file exists) - single file for all norms
454 cl_sprintf(prefix, 10, ".%s", IndexFileNames::NORMS_EXTENSION);
455 return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, WITHOUT_GEN);
456 }
457
458 // case 3: norm file for each field
459 cl_sprintf(prefix, 10, ".f%d", number);
460 return IndexFileNames::fileNameFromGeneration(name.c_str(), prefix, WITHOUT_GEN);
461 }
462
setUseCompoundFile(const bool isCompoundFile)463 void SegmentInfo::setUseCompoundFile(const bool isCompoundFile) {
464 if (isCompoundFile) {
465 this->isCompoundFile = YES;
466 } else {
467 this->isCompoundFile = NO;
468 }
469 clearFiles();
470 }
471
getUseCompoundFile() const472 bool SegmentInfo::getUseCompoundFile() const {
473 if (isCompoundFile == NO) {
474 return false;
475 } else if (isCompoundFile == YES) {
476 return true;
477 } else {
478 return dir->fileExists( ((string)name + "." + IndexFileNames::COMPOUND_FILE_EXTENSION).c_str() );
479 }
480 }
481
getDocStoreOffset() const482 int32_t SegmentInfo::getDocStoreOffset() const { return docStoreOffset; }
483
getDocStoreIsCompoundFile() const484 bool SegmentInfo::getDocStoreIsCompoundFile() const { return docStoreIsCompoundFile; }
485
setDocStoreIsCompoundFile(const bool v)486 void SegmentInfo::setDocStoreIsCompoundFile(const bool v) {
487 docStoreIsCompoundFile = v;
488 clearFiles();
489 }
490
getDocStoreSegment() const491 const string& SegmentInfo::getDocStoreSegment() const {
492 return docStoreSegment;
493 }
494
setDocStoreOffset(const int32_t offset)495 void SegmentInfo::setDocStoreOffset(const int32_t offset) {
496 docStoreOffset = offset;
497 clearFiles();
498 }
499
write(CL_NS (store)::IndexOutput * output)500 void SegmentInfo::write(CL_NS(store)::IndexOutput* output) {
501 output->writeString(name);
502 output->writeInt(docCount);
503 output->writeLong(delGen);
504 output->writeInt(docStoreOffset);
505 if (docStoreOffset != -1) {
506 output->writeString(docStoreSegment);
507 output->writeByte(static_cast<uint8_t>(docStoreIsCompoundFile ? 1:0));
508 }
509
510 output->writeByte(static_cast<uint8_t>(hasSingleNormFile ? 1:0));
511 if (normGen.values == NULL) {
512 output->writeInt(NO);
513 } else {
514 output->writeInt(normGen.length);
515 for(size_t j = 0; j < normGen.length; j++) {
516 output->writeLong(normGen[j]);
517 }
518 }
519 output->writeByte(isCompoundFile);
520 }
521
clearFiles()522 void SegmentInfo::clearFiles() {
523 _files.clear();
524 _sizeInBytes = -1;
525 }
526
527 /** We consider another SegmentInfo instance equal if it
528 * has the same dir and same name. */
equals(const SegmentInfo * obj)529 bool SegmentInfo::equals(const SegmentInfo* obj) {
530 return (obj->dir == this->dir && obj->name.compare(this->name) == 0 );
531 }
532
533
534
535
536
// Stream for diagnostic messages shared by all SegmentInfos instances;
// starts out NULL, and callers test it for non-NULL before writing to it.
std::ostream* SegmentInfos::infoStream = NULL;
538
539 /** If non-null, information about retries when loading
540 * the segments file will be printed to this.
541 */
setInfoStream(std::ostream * infoStream)542 void SegmentInfos::setInfoStream(std::ostream* infoStream) {
543 SegmentInfos::infoStream = infoStream;
544 }
545
546 /**
547 * @see #setInfoStream
548 */
getInfoStream()549 std::ostream* SegmentInfos::getInfoStream() {
550 return infoStream;
551 }
552
SegmentInfos(bool deleteMembers,int32_t reserveCount)553 SegmentInfos::SegmentInfos(bool deleteMembers, int32_t reserveCount) :
554 generation(0),lastGeneration(0), infos(deleteMembers) {
555 //Func - Constructor
556 //Pre - deleteMembers indicates if the instance to be created must delete
557 // all SegmentInfo instances it manages when the instance is destroyed or not
558 // true -> must delete, false may not delete
559 //Post - An instance of SegmentInfos has been created.
560
561 //initialize counter to 0
562 counter = 0;
563 version = Misc::currentTimeMillis();
564 if (reserveCount > 1)
565 infos.reserve(reserveCount);
566 }
567
~SegmentInfos()568 SegmentInfos::~SegmentInfos(){
569 //Func - Destructor
570 //Pre - true
571 //Post - The instance has been destroyed. Depending on the constructor used
572 // the SegmentInfo instances that this instance managed have been deleted or not.
573
574 //Clear the list of SegmentInfo instances - make sure everything is deleted
575 infos.clear();
576 }
577
info(int32_t i) const578 SegmentInfo* SegmentInfos::info(int32_t i) const {
579 //Func - Returns a reference to the i-th SegmentInfo in the list.
580 //Pre - i >= 0
581 //Post - A reference to the i-th SegmentInfo instance has been returned
582
583 CND_PRECONDITION(i >= 0 && i < infos.size(), "i is out of bounds");
584
585 //Get the i-th SegmentInfo instance
586 SegmentInfo *ret = infos[i];
587
588 //Condition check to see if the i-th SegmentInfo has been retrieved
589 CND_CONDITION(ret != NULL,"No SegmentInfo instance found");
590
591 return ret;
592 }
593
getCurrentSegmentGeneration(std::vector<std::string> & files)594 int64_t SegmentInfos::getCurrentSegmentGeneration( std::vector<std::string>& files ) {
595 if ( files.size() == 0 ) {
596 return -1;
597 }
598
599 int64_t max = -1;
600
601 vector<string>::iterator itr = files.begin();
602 const char* file;
603 size_t seglen = strlen(IndexFileNames::SEGMENTS);
604 while ( itr != files.end() ) {
605 file = itr->c_str();
606 if ( strncmp( file, IndexFileNames::SEGMENTS, seglen ) == 0 && strcmp( file, IndexFileNames::SEGMENTS_GEN ) != 0 ) {
607 int64_t gen = generationFromSegmentsFileName( file );
608 if ( gen > max ) {
609 max = gen;
610 }
611 }
612
613 itr++;
614 }
615
616 return max;
617 }
618
getCurrentSegmentGeneration(const CL_NS (store)::Directory * directory)619 int64_t SegmentInfos::getCurrentSegmentGeneration( const CL_NS(store)::Directory* directory ) {
620 vector<string> files;
621 if ( !directory->list(&files) ){
622 _CLTHROWA(CL_ERR_IO, (string("cannot read directory ") + directory->toString() + string(": list() returned NULL")).c_str() );
623 }
624 int64_t gen = getCurrentSegmentGeneration( files );
625 return gen;
626 }
627
getCurrentSegmentFileName(vector<string> & files)628 string SegmentInfos::getCurrentSegmentFileName( vector<string>& files ) {
629 return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", getCurrentSegmentGeneration( files ));
630 }
631
getCurrentSegmentFileName(CL_NS (store)::Directory * directory)632 std::string SegmentInfos::getCurrentSegmentFileName( CL_NS(store)::Directory* directory ) {
633 return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", getCurrentSegmentGeneration( directory ));
634 }
635
getCurrentSegmentFileName()636 std::string SegmentInfos::getCurrentSegmentFileName() {
637 return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", lastGeneration );
638 }
639
generationFromSegmentsFileName(const char * fileName)640 int64_t SegmentInfos::generationFromSegmentsFileName( const char* fileName ) {
641 if ( strcmp( fileName, IndexFileNames::SEGMENTS ) == 0 ) {
642 return 0;
643 } else if ( strncmp( fileName, IndexFileNames::SEGMENTS, strlen(IndexFileNames::SEGMENTS) ) == 0 ) {
644 return CL_NS(util)::Misc::base36ToLong( fileName + strlen( IndexFileNames::SEGMENTS )+1 );
645 } else {
646 TCHAR err[CL_MAX_PATH + 35];
647 _sntprintf(err,CL_MAX_PATH + 35,_T("fileName \"%s\" is not a segments file"), fileName);
648 _CLTHROWA(CL_ERR_IllegalArgument, err);
649 return 0;
650 }
651 }
652
getNextSegmentFileName()653 std::string SegmentInfos::getNextSegmentFileName() {
654 int64_t nextGeneration;
655
656 if ( generation == -1 ) {
657 nextGeneration = 1;
658 } else {
659 nextGeneration = generation+1;
660 }
661
662 return IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", nextGeneration );
663 }
664
clearto(size_t from,size_t end)665 void SegmentInfos::clearto(size_t from, size_t end){
666 size_t range = end - from;
667 if ( (infos.size() - from) >= range) { // Make sure we actually need to remove
668 segmentInfosType::iterator itr,bitr=infos.begin()+from,eitr=infos.end();
669 size_t count = 0;
670 for(itr=bitr;itr!=eitr && count < range;++itr, count++) {
671 _CLLDELETE((*itr));
672 }
673 infos.erase(bitr,bitr + count);
674 }
675 }
add(SegmentInfo * info,int32_t pos)676 void SegmentInfos::add(SegmentInfo* info, int32_t pos){
677 if ( pos == -1 ){
678 infos.push_back(info);
679 }else{
680 if ( pos < 0 || pos >= (int32_t)infos.size()+1 ) _CLTHROWA(CL_ERR_IllegalArgument, "pos is out of range");
681 infos.insert( infos.begin()+pos, info );
682 }
683 }
size() const684 int32_t SegmentInfos::size() const{
685 return infos.size();
686 }
elementAt(int32_t pos)687 SegmentInfo* SegmentInfos::elementAt(int32_t pos) {
688 return infos.at(pos);
689 }
setElementAt(SegmentInfo * si,int32_t pos)690 void SegmentInfos::setElementAt(SegmentInfo* si, int32_t pos) {
691 infos.set(pos, si);
692 }
clear()693 void SegmentInfos::clear() { infos.clear(); }
694
695
insert(SegmentInfos * _infos,bool takeMemory)696 void SegmentInfos::insert(SegmentInfos* _infos, bool takeMemory){
697 infos.insert(infos.end(),_infos->infos.begin(),_infos->infos.end());
698 if ( takeMemory ){
699 while (_infos->infos.size() > 0 )
700 _infos->infos.remove(_infos->infos.begin(), true );
701 }
702 }
insert(SegmentInfo * info)703 void SegmentInfos::insert(SegmentInfo* info){
704 infos.push_back(info);
705 }
indexOf(const SegmentInfo * info) const706 int32_t SegmentInfos::indexOf(const SegmentInfo* info) const{
707 segmentInfosType::const_iterator itr = infos.begin();
708 int32_t c=-1;
709 while ( itr != infos.end()){
710 c++;
711 if ( *itr == info ){
712 return c;
713 }
714 itr++;
715 }
716 return -1;
717 }
range(size_t from,size_t to,SegmentInfos & ret) const718 void SegmentInfos::range(size_t from, size_t to, SegmentInfos& ret) const{
719 segmentInfosType::const_iterator itr = infos.begin();
720 itr+= from;
721 for (size_t i=from;i<to && itr != infos.end();i++){
722 ret.infos.push_back(*itr);
723
724 itr++;
725 }
726 }
remove(size_t index,bool dontDelete)727 void SegmentInfos::remove(size_t index, bool dontDelete){
728 infos.remove(index, dontDelete);
729 }
730
read(Directory * directory,const char * segmentFileName)731 void SegmentInfos::read(Directory* directory, const char* segmentFileName){
732 bool success = false;
733
734 // Clear any previous segments:
735 clear();
736
737 IndexInput* input = directory->openInput(segmentFileName);
738 CND_CONDITION(input != NULL,"input == NULL");
739
740 generation = generationFromSegmentsFileName( segmentFileName );
741 lastGeneration = generation;
742
743 try {
744 int32_t format = input->readInt();
745 if(format < 0){ // file contains explicit format info
746 // check that it is a format we can understand
747 if (format < CURRENT_FORMAT){
748 char err[30];
749 cl_sprintf(err,30,"Unknown format version: %d", format);
750 _CLTHROWA(CL_ERR_CorruptIndex, err);
751 }
752 version = input->readLong(); // read version
753 counter = input->readInt(); // read counter
754 }
755 else{ // file is in old format without explicit format info
756 counter = format;
757 }
758
759 for (int32_t i = input->readInt(); i > 0; i--) { // read segmentInfos
760 infos.push_back( _CLNEW SegmentInfo(directory, format, input) );
761 }
762
763 if(format >= 0){ // in old format the version number may be at the end of the file
764 if (input->getFilePointer() >= input->length())
765 version = CL_NS(util)::Misc::currentTimeMillis(); // old file format without version number
766 else
767 version = input->readLong(); // read version
768 }
769 success = true;
770 } _CLFINALLY({
771 input->close();
772 _CLDELETE(input);
773 if (!success) {
774 // Clear any segment infos we had loaded so we
775 // have a clean slate on retry:
776 clear();
777 }
778 });
779 }
780
read(Directory * directory)781 void SegmentInfos::read(Directory* directory) {
782 generation = lastGeneration = -1;
783
784 FindSegmentsRead find(directory, this);
785
786 find.run();
787 }
788
789
write(Directory * directory)790 void SegmentInfos::write(Directory* directory){
791 //Func - Writes a new segments file based upon the SegmentInfo instances it manages
792 //Pre - directory is a valid reference to a Directory
793 //Post - The new segment has been written to disk
794
795 string segmentFileName = getNextSegmentFileName();
796
797 // Always advance the generation on write:
798 if (generation == -1) {
799 generation = 1;
800 } else {
801 generation++;
802 }
803
804 IndexOutput* output = directory->createOutput(segmentFileName.c_str());
805
806 bool success = false;
807
808 try {
809 output->writeInt(CURRENT_FORMAT); // write FORMAT
810 output->writeLong(++version); // every write changes
811 // the index
812 output->writeInt(counter); // write counter
813 output->writeInt(size()); // write infos
814 for (int32_t i = 0; i < size(); i++) {
815 info(i)->write(output);
816 }
817 }_CLFINALLY (
818 try {
819 output->close();
820 _CLDELETE(output);
821 success = true;
822 } _CLFINALLY (
823 if (!success) {
824 // Try not to leave a truncated segments_N file in
825 // the index:
826 directory->deleteFile(segmentFileName.c_str());
827 }
828 )
829 )
830
831 try {
832 output = directory->createOutput(IndexFileNames::SEGMENTS_GEN);
833 try {
834 output->writeInt(FORMAT_LOCKLESS);
835 output->writeLong(generation);
836 output->writeLong(generation);
837 } _CLFINALLY(
838 output->close();
839 _CLDELETE(output);
840 )
841 } catch (CLuceneError& e) {
842 if ( e.number() != CL_ERR_IO ) throw e;
843 // It's OK if we fail to write this file since it's
844 // used only as one of the retry fallbacks.
845 }
846
847 lastGeneration = generation;
848 }
849
clone() const850 SegmentInfos* SegmentInfos::clone() const{
851 SegmentInfos* sis = _CLNEW SegmentInfos(true, infos.size());
852 for(size_t i=0;i<infos.size();i++) {
853 sis->setElementAt(infos[i]->clone(), i);
854 }
855 return sis;
856 }
857
getVersion() const858 int64_t SegmentInfos::getVersion() const { return version; }
getGeneration() const859 int64_t SegmentInfos::getGeneration() const { return generation; }
getLastGeneration() const860 int64_t SegmentInfos::getLastGeneration() const { return lastGeneration; }
861
readCurrentVersion(Directory * directory)862 int64_t SegmentInfos::readCurrentVersion(Directory* directory){
863 FindSegmentsVersion find(directory);
864 return find.run();
865 }
866
867 //void SegmentInfos::setDefaultGenFileRetryCount(const int32_t count) { defaultGenFileRetryCount = count; }
getDefaultGenFileRetryCount()868 int32_t SegmentInfos::getDefaultGenFileRetryCount() { return defaultGenFileRetryCount; }
869
870 //void SegmentInfos::setDefaultGenFileRetryPauseMsec(const int32_t msec) { defaultGenFileRetryPauseMsec = msec; }
getDefaultGenFileRetryPauseMsec()871 int32_t SegmentInfos::getDefaultGenFileRetryPauseMsec() { return defaultGenFileRetryPauseMsec; }
872
873 //void SegmentInfos::setDefaultGenLookaheadCount(const int32_t count) { defaultGenLookaheadCount = count;}
getDefaultGenLookahedCount()874 int32_t SegmentInfos::getDefaultGenLookahedCount() { return defaultGenLookaheadCount; }
875
doRun()876 void SegmentInfos::_FindSegmentsFile::doRun(){
877 string segmentFileName;
878 int64_t lastGen = -1;
879 int64_t gen = 0;
880 int32_t genLookaheadCount = 0;
881 bool retry = false;
882 CLuceneError exc; //saved exception
883
884 int32_t method = 0;
885
886 // Loop until we succeed in calling doBody() without
887 // hitting an IOException. An IOException most likely
888 // means a commit was in process and has finished, in
889 // the time it took us to load the now-old infos files
890 // (and segments files). It's also possible it's a
891 // true error (corrupt index). To distinguish these,
892 // on each retry we must see "forward progress" on
893 // which generation we are trying to load. If we
894 // don't, then the original error is real and we throw
895 // it.
896
897 // We have three methods for determining the current
898 // generation. We try the first two in parallel, and
899 // fall back to the third when necessary.
900
901 while( true ) {
902
903 if ( 0 == method ) {
904 // Method 1: list the directory and use the highest
905 // segments_N file. This method works well as long
906 // as there is no stale caching on the directory
907 // contents (NOTE: NFS clients often have such stale
908 // caching):
909 vector<string> files;
910
911 int64_t genA = -1;
912
913 if (directory != NULL){
914 if (directory->list(&files)) {
915 genA = getCurrentSegmentGeneration( files );
916 files.clear();
917 }
918 }
919
920
921 if ( infoStream ){
922 (*infoStream) << "[SIS]: directory listing genA=" << genA << "\n";
923 }
924
925 // Method 2: open segments.gen and read its
926 // contents. Then we take the larger of the two
927 // gen's. This way, if either approach is hitting
928 // a stale cache (NFS) we have a better chance of
929 // getting the right generation.
930 int64_t genB = -1;
931 if (directory != NULL) {
932 CLuceneError e;
933 for(int32_t i=0;i<defaultGenFileRetryCount;i++) {
934 IndexInput* genInput = NULL;
935 if ( ! directory->openInput(IndexFileNames::SEGMENTS_GEN, genInput, e) ){
936 if (e.number() == CL_ERR_IO ) {
937 if ( infoStream ){
938 (*infoStream) << "[SIS]: segments.gen open: IOException " << e.what() << "\n";
939 }
940 break;
941 } else {
942 genInput->close();
943 _CLLDELETE(genInput);
944 throw e;
945 }
946 }
947
948 if (genInput != NULL) {
949 try {
950 int32_t version = genInput->readInt();
951 if (version == FORMAT_LOCKLESS) {
952 int64_t gen0 = genInput->readLong();
953 int64_t gen1 = genInput->readLong();
954 //CL_TRACE("fallback check: %d; %d", gen0, gen1);
955 if (gen0 == gen1) {
956 // The file is consistent.
957 genB = gen0;
958 genInput->close();
959 _CLDELETE(genInput);
960 break;
961 }
962 }
963 } catch (CLuceneError &err2) {
964 if (err2.number() != CL_ERR_IO) {
965 genInput->close();
966 _CLLDELETE(genInput);
967 throw err2; // retry only for IOException
968 }
969 } _CLFINALLY({
970 genInput->close();
971 _CLDELETE(genInput);
972 });
973 }
974
975 _LUCENE_SLEEP(defaultGenFileRetryPauseMsec);
976 /*
977 //todo: Wrap the LUCENE_SLEEP call above with the following try/catch block if
978 // InterruptedException is implemented
979 try {
980 } catch (CLuceneError &e) {
981 //if (err2.number != CL_ERR_Interrupted) // retry only for InterruptedException
982 // todo: see if CL_ERR_Interrupted needs to be added...
983 throw e;
984 }*/
985
986 }
987 }
988
989 //CL_TRACE("%s check: genB=%d", IndexFileNames::SEGMENTS_GEN, genB);
990
991 // Pick the larger of the two gen's:
992 if (genA > genB)
993 gen = genA;
994 else
995 gen = genB;
996
997 if (gen == -1) {
998 // Neither approach found a generation
999 _CLTHROWA(CL_ERR_IO, (string("No segments* file found in ") + directory->toString()).c_str());
1000 }
1001 }
1002
1003 // Third method (fallback if first & second methods
1004 // are not reliable): since both directory cache and
1005 // file contents cache seem to be stale, just
1006 // advance the generation.
1007 if ( 1 == method || ( 0 == method && lastGen == gen && retry )) {
1008
1009 method = 1;
1010
1011 if (genLookaheadCount < defaultGenLookaheadCount) {
1012 gen++;
1013 genLookaheadCount++;
1014 //CL_TRACE("look ahead increment gen to %d", gen);
1015 }
1016 }
1017
1018 if (lastGen == gen) {
1019
1020 // This means we're about to try the same
1021 // segments_N last tried. This is allowed,
1022 // exactly once, because writer could have been in
1023 // the process of writing segments_N last time.
1024
1025 if (retry) {
1026 // OK, we've tried the same segments_N file
1027 // twice in a row, so this must be a real
1028 // error. We throw the original exception we
1029 // got.
1030 throw exc;
1031 } else {
1032 retry = true;
1033 }
1034
1035 } else {
1036 // Segment file has advanced since our last loop, so
1037 // reset retry:
1038 retry = false;
1039 }
1040
1041 lastGen = gen;
1042
1043 segmentFileName = IndexFileNames::fileNameFromGeneration(IndexFileNames::SEGMENTS, "", gen);
1044
1045 CLuceneError saved_error;
1046 if ( tryDoBody(segmentFileName.c_str(), saved_error) ){
1047 return;
1048 }
1049
1050 // Save the original root cause:
1051 if (exc.number() == 0) {
1052 CND_CONDITION( saved_error.number() > 0, "Unsupported error code");
1053 exc.set(saved_error.number(),saved_error.what());
1054 }
1055
1056 //CL_TRACE("primary Exception on '" + segmentFileName + "': " + err + "'; will retry: retry=" + retry + "; gen = " + gen);
1057
1058 if (!retry && gen > 1) {
1059
1060 // This is our first time trying this segments
1061 // file (because retry is false), and, there is
1062 // possibly a segments_(N-1) (because gen > 1).
1063 // So, check if the segments_(N-1) exists and
1064 // try it if so:
1065 string prevSegmentFileName = IndexFileNames::fileNameFromGeneration( IndexFileNames::SEGMENTS, "", gen-1 );
1066
1067 bool prevExists=false;
1068 if (directory != NULL)
1069 prevExists = directory->fileExists(prevSegmentFileName.c_str());
1070 else
1071 prevExists = Misc::dir_Exists( (string(fileDirectory) + prevSegmentFileName).c_str() );
1072
1073 if (prevExists) {
1074 //CL_TRACE("fallback to prior segment file '%s'", prevSegmentFileName);
1075 CLuceneError saved_error;
1076 if ( tryDoBody(prevSegmentFileName.c_str(), saved_error) ){
1077 return;
1078 }
1079 //CL_TRACE("secondary Exception on '" + prevSegmentFileName + "': " + err2 + "'; will retry");
1080 }
1081 }
1082 }
1083 }
FindSegmentsRead(CL_NS (store)::Directory * dir,SegmentInfos * _this)1084 SegmentInfos::FindSegmentsRead::FindSegmentsRead( CL_NS(store)::Directory* dir, SegmentInfos* _this ) :
1085 SegmentInfos::FindSegmentsFile<bool>(dir) {
1086 this->_this = _this;
1087 }
doBody(const char * segmentFileName)1088 bool SegmentInfos::FindSegmentsRead::doBody( const char* segmentFileName ) {
1089 //Have SegmentInfos read the segments file in directory
1090 _this->read(directory, segmentFileName);
1091 return true;
1092 }
1093
FindSegmentsVersion(CL_NS (store)::Directory * dir)1094 SegmentInfos::FindSegmentsVersion::FindSegmentsVersion( CL_NS(store)::Directory* dir ) :
1095 SegmentInfos::FindSegmentsFile<int64_t>(dir) {
1096 }
1097
doBody(const char * segmentFileName)1098 int64_t SegmentInfos::FindSegmentsVersion::doBody( const char* segmentFileName ) {
1099
1100 IndexInput* input = directory->openInput( segmentFileName );
1101
1102 int32_t format = 0;
1103 int64_t version=0;
1104 try {
1105 format = input->readInt();
1106 if(format < 0){
1107 if(format < CURRENT_FORMAT){
1108 char err[30];
1109 cl_sprintf(err,30,"Unknown format version: %d",format);
1110 _CLTHROWA(CL_ERR_CorruptIndex,err);
1111 }
1112 version = input->readLong(); // read version
1113 }
1114 }
1115 _CLFINALLY( input->close(); _CLDELETE(input); );
1116
1117 if(format < 0)
1118 return version;
1119
1120 // We cannot be sure about the format of the file.
1121 // Therefore we have to read the whole file and cannot simply seek to the version entry.
1122 SegmentInfos* sis = _CLNEW SegmentInfos();
1123 sis->read(directory, segmentFileName);
1124 version = sis->getVersion();
1125 _CLDELETE(sis);
1126
1127 return version;
1128
1129 }
1130
1131 CL_NS_END
1132