1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 #include "CLucene/_ApiHeader.h"
8 
9 #include "Term.h"
10 #include "Terms.h"
11 #include "CLucene/util/Misc.h"
12 #include "CLucene/store/Directory.h"
13 #include "CLucene/store/IndexInput.h"
14 
15 #include "_TermInfo.h"
16 #include "_FieldInfos.h"
17 #include "_SegmentTermEnum.h"
18 #include "_FieldInfos.h"
19 #include "_TermInfo.h"
20 #include "_TermInfosWriter.h"
21 #include "_TermInfosReader.h"
22 
23 CL_NS_USE(store)
CL_NS_USE(util)24 CL_NS_USE(util)
25 CL_NS_DEF(index)
26 
27 
28   TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, const int32_t readBufferSize):
29       directory (dir),fieldInfos (fis), indexTerms(NULL), indexInfos(NULL), indexPointers(NULL), indexDivisor(1)
30   {
31   //Func - Constructor.
32   //       Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii)
33   //Pre  - dir is a reference to a valid Directory
34   //       Fis contains a valid reference to an FieldInfos instance
35   //       seg != NULL and contains the name of the segment
36   //Post - An instance has been created and the index named seg has been read. (Remember
37   //       a segment is nothing more then an independently readable index)
38 
39       CND_PRECONDITION(seg != NULL, "seg is NULL");
40 
41 	  //Initialize the name of the segment
42       segment    =  seg;
43 
44       //Create a filname fo a Term Info File
45 	  string tisFile = Misc::segmentname(segment,".tis");
46 	  string tiiFile = Misc::segmentname(segment,".tii");
47 	  bool success = false;
48     origEnum = indexEnum = NULL;
49     _size = indexTermsLength = totalIndexInterval = 0;
50 
51 	  try {
52 		  //Create an SegmentTermEnum for storing all the terms read of the segment
53 		  origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile.c_str(), readBufferSize ), fieldInfos, false);
54 		  _size =  origEnum->size;
55 		  totalIndexInterval = origEnum->indexInterval;
56 		  indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile.c_str(), readBufferSize ), fieldInfos, true);
57 
58 		  //Check if enumerator points to a valid instance
59 		  CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
60 		  CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");
61 
62 		  success = true;
63 	  } _CLFINALLY({
64 		  // With lock-less commits, it's entirely possible (and
65 		  // fine) to hit a FileNotFound exception above. In
66 		  // this case, we want to explicitly close any subset
67 		  // of things that were opened so that we don't have to
68 		  // wait for a GC to do so.
69 		  if (!success) {
70 			  close();
71 		  }
72 	  });
73 
74   }
75 
~TermInfosReader()76   TermInfosReader::~TermInfosReader(){
77   //Func - Destructor
78   //Pre  - true
79   //Post - The instance has been destroyed
80 
81       //Close the TermInfosReader to be absolutly sure that enumerator has been closed
82 	  //and the arrays indexTerms, indexPointers and indexInfos and  their elements
83 	  //have been destroyed
84       close();
85   }
getSkipInterval() const86   int32_t TermInfosReader::getSkipInterval() const {
87     return origEnum->skipInterval;
88   }
89 
getMaxSkipLevels() const90   int32_t TermInfosReader::getMaxSkipLevels() const {
91     return origEnum->maxSkipLevels;
92   }
93 
setIndexDivisor(const int32_t _indexDivisor)94   void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) {
95 	  if (indexDivisor < 1)
96 		  _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0");
97 
98 	  if (indexTerms != NULL)
99 		  _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded");
100 
101 	  this->indexDivisor = _indexDivisor;
102 	  totalIndexInterval = origEnum->indexInterval * _indexDivisor;
103   }
104 
getIndexDivisor() const105   int32_t TermInfosReader::getIndexDivisor() const { return indexDivisor; }
close()106   void TermInfosReader::close() {
107 
108 	  //Check if indexTerms and indexInfos exist
109      if (indexTerms && indexInfos){
110           //Iterate through arrays indexTerms and indexPointer to
111 	      //destroy their elements
112 #ifdef _DEBUG
113          for ( int32_t i=0; i<indexTermsLength;++i ){
114             indexTerms[i].__cl_decref();
115          }
116 #endif
117          //Delete the arrays
118          delete [] indexTerms;
119          _CLDELETE_ARRAY(indexInfos);
120      }
121 
122       //Delete the arrays
123       _CLDELETE_ARRAY(indexPointers);
124 
125       if (origEnum != NULL){
126         origEnum->close();
127 
128 	    //Get a pointer to IndexInput used by the enumeration but
129 	    //instantiated in the constructor by directory.open( tisFile )
130         IndexInput *is = origEnum->input;
131 
132         //Delete the enumuration enumerator
133         _CLDELETE(origEnum);
134 
135         //Delete the IndexInput
136         _CLDELETE(is);
137       }
138 
139       if (indexEnum != NULL){
140         indexEnum->close();
141 
142 	    //Get a pointer to IndexInput used by the enumeration but
143 	    //instantiated in the constructor by directory.open( tiiFile )
144         IndexInput *is = indexEnum->input;
145 
146         //Delete the enumuration enumerator
147         _CLDELETE(indexEnum);
148 
149         //Delete the IndexInput
150         _CLDELETE(is);
151       }
152 	  enumerators.setNull();
153   }
154 
size() const155   int64_t TermInfosReader::size() const{
156   //Func - Return the size of the enumeration of TermInfos
157   //Pre  - true
158   //Post - size has been returened
159 
160       return _size;
161   }
162 
163 
get(const int32_t position)164   Term* TermInfosReader::get(const int32_t position) {
165   //Func - Returns the nth term in the set
166   //Pre  - position > = 0
167   //Post - The n-th term in the set has been returned
168 
169 	  //Check if the size is 0 because then there are no terms
170       if (_size == 0)
171           return NULL;
172 
173 	  SegmentTermEnum* enumerator = getEnum();
174 
175 	  if (
176 	      enumerator != NULL //an enumeration exists
177 	      && enumerator->term(false) != NULL // term is at or past current
178 	      && position >= enumerator->position
179 		  && position < (enumerator->position + totalIndexInterval)
180 	     )
181 	  {
182 		  return scanEnum(position);			  // can avoid seek
183 	  }
184 
185     //random-access: must seek
186     seekEnum(position / totalIndexInterval);
187 
188 	//Get the Term at position
189     return scanEnum(position);
190   }
191 
getEnum()192   SegmentTermEnum* TermInfosReader::getEnum(){
193     SegmentTermEnum* termEnum = enumerators.get();
194     if (termEnum == NULL){
195       termEnum = terms();
196       enumerators.set(termEnum);
197     }
198     return termEnum;
199   }
200 
get(const Term * term)201   TermInfo* TermInfosReader::get(const Term* term){
202   //Func - Returns a TermInfo for a term
203   //Pre  - term holds a valid reference to term
204   //Post - if term can be found its TermInfo has been returned otherwise NULL
205 
206     //If the size of the enumeration is 0 then no Terms have been read
207 	if (_size == 0)
208 		return NULL;
209 
210     ensureIndexIsRead();
211 
212     // optimize sequential access: first try scanning cached enum w/o seeking
213     SegmentTermEnum* enumerator = getEnum();
214 
215     // optimize sequential access: first try scanning cached enumerator w/o seeking
216     if (
217 	      //the current term of the enumeration enumerator is not at the end AND
218       	enumerator->term(false) != NULL	 &&
219       	(
220             //there exists a previous current called prev and term is positioned after this prev OR
221             ( enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) ||
222             //term is positioned at the same position as the current of enumerator or at a higher position
223             term->compareTo(enumerator->term(false)) >= 0 )
224       	)
225      {
226 
227 		//Calculate the offset for the position
228 		int32_t _enumOffset = (int32_t)(enumerator->position/totalIndexInterval)+1;
229 
230 		// but before end of block
231 		if (
232 			//the length of indexTerms (the number of terms in enumerator) equals
233 			//_enum_offset OR
234 			indexTermsLength == _enumOffset	 ||
235 			//term is positioned in front of term found at _enumOffset in indexTerms
236 			term->compareTo(&indexTerms[_enumOffset]) < 0){
237 
238 			//no need to seek, retrieve the TermInfo for term
239 			return scanEnum(term);
240         }
241     }
242 
243     //Reposition current term in the enumeration
244     seekEnum(getIndexOffset(term));
245 	//Return the TermInfo for term
246     return scanEnum(term);
247   }
248 
249 
getPosition(const Term * term)250   int64_t TermInfosReader::getPosition(const Term* term) {
251   //Func - Returns the position of a Term in the set
252   //Pre  - term holds a valid reference to a Term
253   //       enumerator != NULL
254   //Post - If term was found then its position is returned otherwise -1
255 
256 	  //if the enumeration is empty then return -1
257 	  if (_size == 0)
258 		  return -1;
259 
260 	  ensureIndexIsRead();
261 
262       //Retrieve the indexOffset for term
263       int32_t indexOffset = getIndexOffset(term);
264       seekEnum(indexOffset);
265 
266 	  SegmentTermEnum* enumerator = getEnum();
267 
268       while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}
269 
270 	  if ( term->equals(enumerator->term(false)) ){
271           return enumerator->position;
272 	  }else
273           return -1;
274   }
275 
terms(const Term * term)276   SegmentTermEnum* TermInfosReader::terms(const Term* term) {
277   //Func - Returns an enumeration of terms starting at or after the named term.
278   //       If term is null then enumerator is set to the beginning
279   //Pre  - term holds a valid reference to a Term
280   //       enumerator != NULL
281   //Post - An enumeration of terms starting at or after the named term has been returned
282 
283 	  SegmentTermEnum* enumerator = NULL;
284 	  if ( term != NULL ){
285 		//Seek enumerator to term; delete the new TermInfo that's returned.
286 		TermInfo* ti = get(term);
287 		_CLLDELETE(ti);
288 		enumerator = getEnum();
289 	  }else
290 	    enumerator = origEnum;
291 
292       //Clone the entire enumeration
293       SegmentTermEnum* cln = enumerator->clone();
294 
295       //Check if cln points to a valid instance
296       CND_CONDITION(cln != NULL,"cln is NULL");
297 
298       return cln;
299   }
300 
301 
ensureIndexIsRead()302   void TermInfosReader::ensureIndexIsRead() {
303   //Func - Reads the term info index file or .tti file.
304   //       This file contains every IndexInterval-th entry from the .tis file,
305   //       along with its location in the "tis" file. This is designed to be read entirely
306   //       into memory and used to provide random access to the "tis" file.
307   //Pre  - indexTerms    = NULL
308   //       indexInfos    = NULL
309   //       indexPointers = NULL
310   //Post - The term info index file has been read into memory
311 
312     SCOPED_LOCK_MUTEX(THIS_LOCK)
313 
314 	  if ( indexTerms != NULL )
315 		  return;
316 
317       try {
318           indexTermsLength = (size_t)indexEnum->size;
319 
320 		      //Instantiate an block of Term's,so that each one doesn't have to be new'd
321           indexTerms    = new Term[indexTermsLength];
322           CND_CONDITION(indexTerms != NULL,"No memory could be allocated for indexTerms");//Check if is indexTerms is a valid array
323 
324 		  //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd
325           indexInfos    = _CL_NEWARRAY(TermInfo,indexTermsLength);
326           CND_CONDITION(indexInfos != NULL,"No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array
327 
328           //Instantiate an array indexPointers that contains pointers to the term info index file
329           indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
330           CND_CONDITION(indexPointers != NULL,"No memory could be allocated for indexPointers");//Check if is indexPointers is a valid array
331 
332 		  //Iterate through the terms of indexEnum
333           for (int32_t i = 0; indexEnum->next(); ++i){
334               indexTerms[i].set(indexEnum->term(false),indexEnum->term(false)->text());
335               indexEnum->getTermInfo(&indexInfos[i]);
336               indexPointers[i] = indexEnum->indexPointer;
337 
338 			        for (int32_t j = 1; j < indexDivisor; j++)
339 				        if (!indexEnum->next())
340 					        break;
341           }
342     }_CLFINALLY(
343           indexEnum->close();
344 		  //Close and delete the IndexInput is. The close is done by the destructor.
345           _CLDELETE( indexEnum->input );
346           _CLDELETE( indexEnum );
347     );
348   }
349 
350 
getIndexOffset(const Term * term)351   int32_t TermInfosReader::getIndexOffset(const Term* term){
352   //Func - Returns the offset of the greatest index entry which is less than or equal to term.
353   //Pre  - term holds a reference to a valid term
354   //       indexTerms != NULL
355   //Post - The new offset has been returned
356 
357       //Check if is indexTerms is a valid array
358       CND_PRECONDITION(indexTerms != NULL,"indexTerms is NULL");
359 
360       int32_t lo = 0;
361       int32_t hi = indexTermsLength - 1;
362 	  int32_t mid;
363 	  int32_t delta;
364 
365       while (hi >= lo) {
366           //Start in the middle betwee hi and lo
367           mid = (lo + hi) >> 1;
368 
369           //Check if is indexTerms[mid] is a valid instance of Term
370           CND_PRECONDITION(&indexTerms[mid] != NULL,"indexTerms[mid] is NULL");
371           CND_PRECONDITION(mid < indexTermsLength,"mid >= indexTermsLength");
372 
373 		  //Determine if term is before mid or after mid
374           delta = term->compareTo(&indexTerms[mid]);
375           if (delta < 0){
376               //Calculate the new hi
377               hi = mid - 1;
378           }else if (delta > 0){
379                   //Calculate the new lo
380                   lo = mid + 1;
381 			  }else{
382                   //term has been found so return its position
383                   return mid;
384           }
385      }
386      // the new starting offset
387      return hi;
388   }
389 
seekEnum(const int32_t indexOffset)390   void TermInfosReader::seekEnum(const int32_t indexOffset) {
391   //Func - Reposition the current Term and TermInfo to indexOffset
392   //Pre  - indexOffset >= 0
393   //       indexTerms    != NULL
394   //       indexInfos    != NULL
395   //       indexPointers != NULL
396   //Post - The current Term and Terminfo have been repositioned to indexOffset
397 
398       CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
399       CND_PRECONDITION(indexTerms != NULL,    "indexTerms is NULL");
400       CND_PRECONDITION(indexInfos != NULL,    "indexInfos is NULL");
401       CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
402 
403 	  SegmentTermEnum* enumerator =  getEnum();
404 	  enumerator->seek(
405           indexPointers[indexOffset],
406 		  (indexOffset * totalIndexInterval) - 1,
407           &indexTerms[indexOffset],
408 		  &indexInfos[indexOffset]
409 	      );
410   }
411 
412 
scanEnum(const Term * term)413   TermInfo* TermInfosReader::scanEnum(const Term* term) {
414   //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found.
415   //       The search is started from the current term.
416   //Pre  - term contains a valid reference to a Term
417   //       enumerator != NULL
418   //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL
419   //       has been returned
420 
421       SegmentTermEnum* enumerator = getEnum();
422 	  enumerator->scanTo(term);
423 
424       //Check if the at the position the Term term can be found
425 	  if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)) ){
426 		  //Return the TermInfo instance about term
427           return enumerator->getTermInfo();
428      }else{
429           //term was not found so no TermInfo can be returned
430           return NULL;
431      }
432   }
433 
scanEnum(const int32_t position)434   Term* TermInfosReader::scanEnum(const int32_t position) {
435   //Func - Scans the enumeration to the requested position and returns the
436   //       Term located at that position
437   //Pre  - position > = 0
438   //       enumerator != NULL
439   //Post - The Term at the requested position has been returned
440 
441       SegmentTermEnum* enumerator = getEnum();
442 
443 	  //As long the position of the enumeration enumerator is smaller than the requested one
444       while(enumerator->position < position){
445 		  //Move the current of enumerator to the next
446 		  if (!enumerator->next()){
447 			  //If there is no next it means that the requested position was to big
448               return NULL;
449           }
450 	  }
451 
452 	  //Return the Term a the requested position
453 	  return enumerator->term();
454   }
455 
456 CL_NS_END
457