1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8
9 #include "Term.h"
10 #include "Terms.h"
11 #include "CLucene/util/Misc.h"
12 #include "CLucene/store/Directory.h"
13 #include "CLucene/store/IndexInput.h"
14
15 #include "_TermInfo.h"
16 #include "_FieldInfos.h"
17 #include "_SegmentTermEnum.h"
18 #include "_FieldInfos.h"
19 #include "_TermInfo.h"
20 #include "_TermInfosWriter.h"
21 #include "_TermInfosReader.h"
22
23 CL_NS_USE(store)
CL_NS_USE(util)24 CL_NS_USE(util)
25 CL_NS_DEF(index)
26
27
28 TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, const int32_t readBufferSize):
29 directory (dir),fieldInfos (fis), indexTerms(NULL), indexInfos(NULL), indexPointers(NULL), indexDivisor(1)
30 {
31 //Func - Constructor.
32 // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii)
33 //Pre - dir is a reference to a valid Directory
34 // Fis contains a valid reference to an FieldInfos instance
35 // seg != NULL and contains the name of the segment
36 //Post - An instance has been created and the index named seg has been read. (Remember
37 // a segment is nothing more then an independently readable index)
38
39 CND_PRECONDITION(seg != NULL, "seg is NULL");
40
41 //Initialize the name of the segment
42 segment = seg;
43
44 //Create a filname fo a Term Info File
45 string tisFile = Misc::segmentname(segment,".tis");
46 string tiiFile = Misc::segmentname(segment,".tii");
47 bool success = false;
48 origEnum = indexEnum = NULL;
49 _size = indexTermsLength = totalIndexInterval = 0;
50
51 try {
52 //Create an SegmentTermEnum for storing all the terms read of the segment
53 origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile.c_str(), readBufferSize ), fieldInfos, false);
54 _size = origEnum->size;
55 totalIndexInterval = origEnum->indexInterval;
56 indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile.c_str(), readBufferSize ), fieldInfos, true);
57
58 //Check if enumerator points to a valid instance
59 CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
60 CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");
61
62 success = true;
63 } _CLFINALLY({
64 // With lock-less commits, it's entirely possible (and
65 // fine) to hit a FileNotFound exception above. In
66 // this case, we want to explicitly close any subset
67 // of things that were opened so that we don't have to
68 // wait for a GC to do so.
69 if (!success) {
70 close();
71 }
72 });
73
74 }
75
~TermInfosReader()76 TermInfosReader::~TermInfosReader(){
77 //Func - Destructor
78 //Pre - true
79 //Post - The instance has been destroyed
80
81 //Close the TermInfosReader to be absolutly sure that enumerator has been closed
82 //and the arrays indexTerms, indexPointers and indexInfos and their elements
83 //have been destroyed
84 close();
85 }
getSkipInterval() const86 int32_t TermInfosReader::getSkipInterval() const {
87 return origEnum->skipInterval;
88 }
89
getMaxSkipLevels() const90 int32_t TermInfosReader::getMaxSkipLevels() const {
91 return origEnum->maxSkipLevels;
92 }
93
setIndexDivisor(const int32_t _indexDivisor)94 void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) {
95 if (indexDivisor < 1)
96 _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0");
97
98 if (indexTerms != NULL)
99 _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded");
100
101 this->indexDivisor = _indexDivisor;
102 totalIndexInterval = origEnum->indexInterval * _indexDivisor;
103 }
104
getIndexDivisor() const105 int32_t TermInfosReader::getIndexDivisor() const { return indexDivisor; }
close()106 void TermInfosReader::close() {
107
108 //Check if indexTerms and indexInfos exist
109 if (indexTerms && indexInfos){
110 //Iterate through arrays indexTerms and indexPointer to
111 //destroy their elements
112 #ifdef _DEBUG
113 for ( int32_t i=0; i<indexTermsLength;++i ){
114 indexTerms[i].__cl_decref();
115 }
116 #endif
117 //Delete the arrays
118 delete [] indexTerms;
119 _CLDELETE_ARRAY(indexInfos);
120 }
121
122 //Delete the arrays
123 _CLDELETE_ARRAY(indexPointers);
124
125 if (origEnum != NULL){
126 origEnum->close();
127
128 //Get a pointer to IndexInput used by the enumeration but
129 //instantiated in the constructor by directory.open( tisFile )
130 IndexInput *is = origEnum->input;
131
132 //Delete the enumuration enumerator
133 _CLDELETE(origEnum);
134
135 //Delete the IndexInput
136 _CLDELETE(is);
137 }
138
139 if (indexEnum != NULL){
140 indexEnum->close();
141
142 //Get a pointer to IndexInput used by the enumeration but
143 //instantiated in the constructor by directory.open( tiiFile )
144 IndexInput *is = indexEnum->input;
145
146 //Delete the enumuration enumerator
147 _CLDELETE(indexEnum);
148
149 //Delete the IndexInput
150 _CLDELETE(is);
151 }
152 enumerators.setNull();
153 }
154
size() const155 int64_t TermInfosReader::size() const{
156 //Func - Return the size of the enumeration of TermInfos
157 //Pre - true
158 //Post - size has been returened
159
160 return _size;
161 }
162
163
get(const int32_t position)164 Term* TermInfosReader::get(const int32_t position) {
165 //Func - Returns the nth term in the set
166 //Pre - position > = 0
167 //Post - The n-th term in the set has been returned
168
169 //Check if the size is 0 because then there are no terms
170 if (_size == 0)
171 return NULL;
172
173 SegmentTermEnum* enumerator = getEnum();
174
175 if (
176 enumerator != NULL //an enumeration exists
177 && enumerator->term(false) != NULL // term is at or past current
178 && position >= enumerator->position
179 && position < (enumerator->position + totalIndexInterval)
180 )
181 {
182 return scanEnum(position); // can avoid seek
183 }
184
185 //random-access: must seek
186 seekEnum(position / totalIndexInterval);
187
188 //Get the Term at position
189 return scanEnum(position);
190 }
191
getEnum()192 SegmentTermEnum* TermInfosReader::getEnum(){
193 SegmentTermEnum* termEnum = enumerators.get();
194 if (termEnum == NULL){
195 termEnum = terms();
196 enumerators.set(termEnum);
197 }
198 return termEnum;
199 }
200
get(const Term * term)201 TermInfo* TermInfosReader::get(const Term* term){
202 //Func - Returns a TermInfo for a term
203 //Pre - term holds a valid reference to term
204 //Post - if term can be found its TermInfo has been returned otherwise NULL
205
206 //If the size of the enumeration is 0 then no Terms have been read
207 if (_size == 0)
208 return NULL;
209
210 ensureIndexIsRead();
211
212 // optimize sequential access: first try scanning cached enum w/o seeking
213 SegmentTermEnum* enumerator = getEnum();
214
215 // optimize sequential access: first try scanning cached enumerator w/o seeking
216 if (
217 //the current term of the enumeration enumerator is not at the end AND
218 enumerator->term(false) != NULL &&
219 (
220 //there exists a previous current called prev and term is positioned after this prev OR
221 ( enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) ||
222 //term is positioned at the same position as the current of enumerator or at a higher position
223 term->compareTo(enumerator->term(false)) >= 0 )
224 )
225 {
226
227 //Calculate the offset for the position
228 int32_t _enumOffset = (int32_t)(enumerator->position/totalIndexInterval)+1;
229
230 // but before end of block
231 if (
232 //the length of indexTerms (the number of terms in enumerator) equals
233 //_enum_offset OR
234 indexTermsLength == _enumOffset ||
235 //term is positioned in front of term found at _enumOffset in indexTerms
236 term->compareTo(&indexTerms[_enumOffset]) < 0){
237
238 //no need to seek, retrieve the TermInfo for term
239 return scanEnum(term);
240 }
241 }
242
243 //Reposition current term in the enumeration
244 seekEnum(getIndexOffset(term));
245 //Return the TermInfo for term
246 return scanEnum(term);
247 }
248
249
getPosition(const Term * term)250 int64_t TermInfosReader::getPosition(const Term* term) {
251 //Func - Returns the position of a Term in the set
252 //Pre - term holds a valid reference to a Term
253 // enumerator != NULL
254 //Post - If term was found then its position is returned otherwise -1
255
256 //if the enumeration is empty then return -1
257 if (_size == 0)
258 return -1;
259
260 ensureIndexIsRead();
261
262 //Retrieve the indexOffset for term
263 int32_t indexOffset = getIndexOffset(term);
264 seekEnum(indexOffset);
265
266 SegmentTermEnum* enumerator = getEnum();
267
268 while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}
269
270 if ( term->equals(enumerator->term(false)) ){
271 return enumerator->position;
272 }else
273 return -1;
274 }
275
terms(const Term * term)276 SegmentTermEnum* TermInfosReader::terms(const Term* term) {
277 //Func - Returns an enumeration of terms starting at or after the named term.
278 // If term is null then enumerator is set to the beginning
279 //Pre - term holds a valid reference to a Term
280 // enumerator != NULL
281 //Post - An enumeration of terms starting at or after the named term has been returned
282
283 SegmentTermEnum* enumerator = NULL;
284 if ( term != NULL ){
285 //Seek enumerator to term; delete the new TermInfo that's returned.
286 TermInfo* ti = get(term);
287 _CLLDELETE(ti);
288 enumerator = getEnum();
289 }else
290 enumerator = origEnum;
291
292 //Clone the entire enumeration
293 SegmentTermEnum* cln = enumerator->clone();
294
295 //Check if cln points to a valid instance
296 CND_CONDITION(cln != NULL,"cln is NULL");
297
298 return cln;
299 }
300
301
ensureIndexIsRead()302 void TermInfosReader::ensureIndexIsRead() {
303 //Func - Reads the term info index file or .tti file.
304 // This file contains every IndexInterval-th entry from the .tis file,
305 // along with its location in the "tis" file. This is designed to be read entirely
306 // into memory and used to provide random access to the "tis" file.
307 //Pre - indexTerms = NULL
308 // indexInfos = NULL
309 // indexPointers = NULL
310 //Post - The term info index file has been read into memory
311
312 SCOPED_LOCK_MUTEX(THIS_LOCK)
313
314 if ( indexTerms != NULL )
315 return;
316
317 try {
318 indexTermsLength = (size_t)indexEnum->size;
319
320 //Instantiate an block of Term's,so that each one doesn't have to be new'd
321 indexTerms = new Term[indexTermsLength];
322 CND_CONDITION(indexTerms != NULL,"No memory could be allocated for indexTerms");//Check if is indexTerms is a valid array
323
324 //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd
325 indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);
326 CND_CONDITION(indexInfos != NULL,"No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array
327
328 //Instantiate an array indexPointers that contains pointers to the term info index file
329 indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
330 CND_CONDITION(indexPointers != NULL,"No memory could be allocated for indexPointers");//Check if is indexPointers is a valid array
331
332 //Iterate through the terms of indexEnum
333 for (int32_t i = 0; indexEnum->next(); ++i){
334 indexTerms[i].set(indexEnum->term(false),indexEnum->term(false)->text());
335 indexEnum->getTermInfo(&indexInfos[i]);
336 indexPointers[i] = indexEnum->indexPointer;
337
338 for (int32_t j = 1; j < indexDivisor; j++)
339 if (!indexEnum->next())
340 break;
341 }
342 }_CLFINALLY(
343 indexEnum->close();
344 //Close and delete the IndexInput is. The close is done by the destructor.
345 _CLDELETE( indexEnum->input );
346 _CLDELETE( indexEnum );
347 );
348 }
349
350
getIndexOffset(const Term * term)351 int32_t TermInfosReader::getIndexOffset(const Term* term){
352 //Func - Returns the offset of the greatest index entry which is less than or equal to term.
353 //Pre - term holds a reference to a valid term
354 // indexTerms != NULL
355 //Post - The new offset has been returned
356
357 //Check if is indexTerms is a valid array
358 CND_PRECONDITION(indexTerms != NULL,"indexTerms is NULL");
359
360 int32_t lo = 0;
361 int32_t hi = indexTermsLength - 1;
362 int32_t mid;
363 int32_t delta;
364
365 while (hi >= lo) {
366 //Start in the middle betwee hi and lo
367 mid = (lo + hi) >> 1;
368
369 //Check if is indexTerms[mid] is a valid instance of Term
370 CND_PRECONDITION(&indexTerms[mid] != NULL,"indexTerms[mid] is NULL");
371 CND_PRECONDITION(mid < indexTermsLength,"mid >= indexTermsLength");
372
373 //Determine if term is before mid or after mid
374 delta = term->compareTo(&indexTerms[mid]);
375 if (delta < 0){
376 //Calculate the new hi
377 hi = mid - 1;
378 }else if (delta > 0){
379 //Calculate the new lo
380 lo = mid + 1;
381 }else{
382 //term has been found so return its position
383 return mid;
384 }
385 }
386 // the new starting offset
387 return hi;
388 }
389
seekEnum(const int32_t indexOffset)390 void TermInfosReader::seekEnum(const int32_t indexOffset) {
391 //Func - Reposition the current Term and TermInfo to indexOffset
392 //Pre - indexOffset >= 0
393 // indexTerms != NULL
394 // indexInfos != NULL
395 // indexPointers != NULL
396 //Post - The current Term and Terminfo have been repositioned to indexOffset
397
398 CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
399 CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
400 CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
401 CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
402
403 SegmentTermEnum* enumerator = getEnum();
404 enumerator->seek(
405 indexPointers[indexOffset],
406 (indexOffset * totalIndexInterval) - 1,
407 &indexTerms[indexOffset],
408 &indexInfos[indexOffset]
409 );
410 }
411
412
scanEnum(const Term * term)413 TermInfo* TermInfosReader::scanEnum(const Term* term) {
414 //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found.
415 // The search is started from the current term.
416 //Pre - term contains a valid reference to a Term
417 // enumerator != NULL
418 //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL
419 // has been returned
420
421 SegmentTermEnum* enumerator = getEnum();
422 enumerator->scanTo(term);
423
424 //Check if the at the position the Term term can be found
425 if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)) ){
426 //Return the TermInfo instance about term
427 return enumerator->getTermInfo();
428 }else{
429 //term was not found so no TermInfo can be returned
430 return NULL;
431 }
432 }
433
scanEnum(const int32_t position)434 Term* TermInfosReader::scanEnum(const int32_t position) {
435 //Func - Scans the enumeration to the requested position and returns the
436 // Term located at that position
437 //Pre - position > = 0
438 // enumerator != NULL
439 //Post - The Term at the requested position has been returned
440
441 SegmentTermEnum* enumerator = getEnum();
442
443 //As long the position of the enumeration enumerator is smaller than the requested one
444 while(enumerator->position < position){
445 //Move the current of enumerator to the next
446 if (!enumerator->next()){
447 //If there is no next it means that the requested position was to big
448 return NULL;
449 }
450 }
451
452 //Return the Term a the requested position
453 return enumerator->term();
454 }
455
456 CL_NS_END
457