1 /*
2  * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3  *
4  * Distributable under the terms of either the Apache License (Version 2.0) or
5  * the GNU Lesser General Public License, as specified in the COPYING file.
6  *
7  * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
8 */
9 #include "CLucene/StdHeader.h"
10 #include "TermInfosReader.h"
11 
12 #include "CLucene/store/Directory.h"
13 #include "CLucene/util/Misc.h"
14 #include "FieldInfos.h"
15 #include "Term.h"
16 #include "Terms.h"
17 #include "TermInfo.h"
18 #include "TermInfosWriter.h"
19 
20 CL_NS_USE(store)
CL_NS_USE(util)21 CL_NS_USE(util)
22 CL_NS_DEF(index)
23 
24 TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
25     FieldInfos* fis)
26     : directory(dir)
27     , fieldInfos (fis)
28 {
29     //Func - Constructor.
30     //       Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii)
31     //Pre  - dir is a reference to a valid Directory
32     //       Fis contains a valid reference to an FieldInfos instance
33     //       seg != NULL and contains the name of the segment
34     //Post - An instance has been created and the index named seg has been read. (Remember
35     //       a segment is nothing more then an independently readable index)
36 
37     CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");
38 
39     //Initialize the name of the segment
40     segment    =  seg;
41     //There are no indexTerms yet
42     indexTerms    = NULL;
43     //So there are no indexInfos
44     indexInfos    = NULL;
45     //So there are no indexPointers
46     indexPointers = NULL;
47     //Create a filname fo a Term Info File
48     QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
49     QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));
50 
51     //Create an SegmentTermEnum for storing all the terms read of the segment
52     origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
53     indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);
54 
55     //Check if enumerator points to a valid instance
56     CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
57     CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");
58 
59     //Get the size of the enumeration and store it in size
60     _size =  origEnum->size;
61 }
62 
~TermInfosReader()63 TermInfosReader::~TermInfosReader()
64 {
65     //Func - Destructor
66     //Pre  - true
67     //Post - The instance has been destroyed
68 
69     //Close the TermInfosReader to be absolutly sure that enumerator has been closed
70     //and the arrays indexTerms, indexPointers and indexInfos and  their elements
71     //have been destroyed
72     close();
73 }
74 
close()75 void TermInfosReader::close()
76 {
77     //Func - Close the enumeration of TermInfos
78     //Pre  - true
79     //Post - The _enumeration has been closed and the arrays
80 
81     //Check if indexTerms and indexInfos exist
82     if (indexTerms && indexInfos){
83         //Iterate through arrays indexTerms and indexPointer to
84         //destroy their elements
85 #ifdef _DEBUG
86         for (int32_t i = 0; i < indexTermsLength; ++i) {
87             if (indexTerms[i].__cl_refcount != 1) {
88                 CND_PRECONDITION(indexTerms[i].__cl_refcount == 1,
89                     "TermInfosReader term was references more than internally");
90             }
91             //   _CLDECDELETE(indexTerms[i]);
92             //_CLDELETE(indexInfos[i]);
93         }
94 #endif
95         //Delete the arrays
96         _CLDELETE_ARRAY(indexTerms);
97         _CLDELETE_ARRAY(indexInfos);
98     }
99 
100     //Delete the arrays
101     _CLDELETE_ARRAY(indexPointers);
102 
103     if (origEnum != NULL) {
104         origEnum->close();
105 
106         //Get a pointer to IndexInput used by the enumeration but
107         //instantiated in the constructor by directory.open( tisFile )
108         IndexInput *is = origEnum->input;
109 
110         //Delete the enumuration enumerator
111         _CLDELETE(origEnum);
112 
113         //Delete the IndexInput
114         _CLDELETE(is);
115     }
116 
117     if (indexEnum != NULL){
118         indexEnum->close();
119 
120         //Get a pointer to IndexInput used by the enumeration but
121         //instantiated in the constructor by directory.open( tiiFile )
122         IndexInput *is = indexEnum->input;
123 
124         //Delete the enumuration enumerator
125         _CLDELETE(indexEnum);
126 
127         //Delete the IndexInput
128         _CLDELETE(is);
129     }
130 }
131 
size() const132 int64_t TermInfosReader::size() const
133 {
134     //Func - Return the size of the enumeration of TermInfos
135     //Pre  - true
136     //Post - size has been returened
137 
138     return _size;
139 }
140 
get(const int32_t position)141 Term* TermInfosReader::get(const int32_t position)
142 {
143     //Func - Returns the nth term in the set
144     //Pre  - position > = 0
145     //Post - The n-th term in the set has been returned
146 
147     //Check if the size is 0 because then there are no terms
148     if (_size == 0)
149         return NULL;
150 
151     SegmentTermEnum* enumerator = getEnum();
152 
153     if (enumerator != NULL //an enumeration exists
154         && enumerator->term(false) != NULL // term is at or past current
155         && position >= enumerator->position
156         && position < (enumerator->position + enumerator->indexInterval)) {
157         return scanEnum(position);			  // can avoid seek
158     }
159 
160     //random-access: must seek
161     seekEnum(position / enumerator->indexInterval);
162 
163     //Get the Term at position
164     return scanEnum(position);
165 }
166 
// TODO: there is currently no way to clean up after a thread that has ended,
// so the enumerator that getEnum() cached for that thread stays allocated.
// Hopefully this won't be too big a problem... solutions anyone?
getEnum()170 SegmentTermEnum* TermInfosReader::getEnum()
171 {
172     SegmentTermEnum* termEnum = enumerators.get();
173     if (termEnum == NULL) {
174         termEnum = terms();
175         enumerators.set(termEnum);
176     }
177     return termEnum;
178 }
179 
get(const Term * term)180 TermInfo* TermInfosReader::get(const Term* term)
181 {
182     //Func - Returns a TermInfo for a term
183     //Pre  - term holds a valid reference to term
184     //Post - if term can be found its TermInfo has been returned otherwise NULL
185 
186     //If the size of the enumeration is 0 then no Terms have been read
187     if (_size == 0)
188         return NULL;
189 
190     ensureIndexIsRead();
191 
192     // optimize sequential access: first try scanning cached enum w/o seeking
193     SegmentTermEnum* enumerator = getEnum();
194 
195     // optimize sequential access: first try scanning cached enumerator w/o seeking
196     // if the current term of the enumeration enumerator is not at the end
197     if (enumerator->term(false) != NULL
198         // AND there exists a previous current called prev and term is
199         // positioned after this prev
200         && ((enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0)
201         // OR term is positioned at the same position as the current of
202         // enumerator or at a higher position
203         || term->compareTo(enumerator->term(false)) >= 0)) {
204             //Calculate the offset for the position
205             int32_t _enumOffset = (int32_t)
206                 (enumerator->position / enumerator->indexInterval) + 1;
207 
208         // but before end of block the length of indexTerms (the number of
209         // terms in enumerator) equals _enum_offset
210         if (indexTermsLength == _enumOffset
211             // OR term is positioned in front of term found at _enumOffset in
212             // indexTerms
213             || term->compareTo(&indexTerms[_enumOffset]) < 0) {
214                 //no need to seek, retrieve the TermInfo for term
215                 return scanEnum(term);
216         }
217     }
218 
219     //Reposition current term in the enumeration
220     seekEnum(getIndexOffset(term));
221     //Return the TermInfo for term
222     return scanEnum(term);
223 }
224 
getPosition(const Term * term)225 int64_t TermInfosReader::getPosition(const Term* term)
226 {
227     //Func - Returns the position of a Term in the set
228     //Pre  - term holds a valid reference to a Term
229     //       enumerator != NULL
230     //Post - If term was found then its position is returned otherwise -1
231 
232     //if the enumeration is empty then return -1
233     if (_size == 0)
234         return -1;
235 
236     ensureIndexIsRead();
237 
238     //Retrieve the indexOffset for term
239     int32_t indexOffset = getIndexOffset(term);
240     seekEnum(indexOffset);
241 
242     SegmentTermEnum* enumerator = getEnum();
243 
244     while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}
245 
246     if (term->equals(enumerator->term(false)))
247         return enumerator->position;
248 
249     return -1;
250 }
251 
terms(const Term * term)252 SegmentTermEnum* TermInfosReader::terms(const Term* term)
253 {
254     //Func - Returns an enumeration of terms starting at or after the named term.
255     //       If term is null then enumerator is set to the beginning
256     //Pre  - term holds a valid reference to a Term
257     //       enumerator != NULL
258     //Post - An enumeration of terms starting at or after the named term has been returned
259 
260     SegmentTermEnum* enumerator = NULL;
261     if (term != NULL) {
262         //Seek enumerator to term; delete the new TermInfo that's returned.
263         TermInfo* ti = get(term);
264         _CLDELETE(ti);
265         enumerator = getEnum();
266     } else {
267         enumerator = origEnum;
268     }
269     //Clone the entire enumeration
270     SegmentTermEnum* cln = enumerator->clone();
271 
272     //Check if cln points to a valid instance
273     CND_CONDITION(cln != NULL, "cln is NULL");
274 
275     return cln;
276 }
277 
ensureIndexIsRead()278 void TermInfosReader::ensureIndexIsRead()
279 {
280     //Func - Reads the term info index file or .tti file.
281     //       This file contains every IndexInterval-th entry from the .tis file,
282     //       along with its location in the "tis" file. This is designed to be
283     //       read entirely into memory and used to provide random access to the
284     //       "tis" file.
285     //Pre  - indexTerms    = NULL
286     //       indexInfos    = NULL
287     //       indexPointers = NULL
288     //Post - The term info index file has been read into memory
289 
290     SCOPED_LOCK_MUTEX(THIS_LOCK)
291 
292     if ( indexTerms != NULL )
293         return;
294 
295     try {
296         indexTermsLength = (size_t)indexEnum->size;
297 
298         // Instantiate an block of Term's,so that each one doesn't have to be new'd
299         indexTerms    = _CL_NEWARRAY(Term,indexTermsLength);
300 
301         // Check if is indexTerms is a valid array
302         CND_CONDITION(indexTerms != NULL,
303             "No memory could be allocated for indexTerms");
304 
305         // Instantiate an big block of TermInfo's, so that each one doesn't
306         // have to be new'd
307         indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);
308 
309         // Check if is indexInfos is a valid array
310         CND_CONDITION(indexInfos != NULL,
311             "No memory could be allocated for indexInfos");
312 
313         // Instantiate an array indexPointers that contains pointers to the
314         // term info index file
315         indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
316 
317         // Check if is indexPointers is a valid array
318         CND_CONDITION(indexPointers != NULL,
319             "No memory could be allocated for indexPointers");
320 
321         //Iterate through the terms of indexEnum
322         for (int32_t i = 0; indexEnum->next(); ++i) {
323             indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text());
324             indexEnum->getTermInfo(&indexInfos[i]);
325             indexPointers[i] = indexEnum->indexPointer;
326         }
327     } _CLFINALLY (
328         indexEnum->close();
329         // Close and delete the IndexInput is. The close is done by the destructor.
330         _CLDELETE( indexEnum->input );
331         _CLDELETE( indexEnum );
332     );
333 }
334 
getIndexOffset(const Term * term)335 int32_t TermInfosReader::getIndexOffset(const Term* term)
336 {
337     //Func - Returns the offset of the greatest index entry which is less than
338     //       or equal to term.
339     //Pre  - term holds a reference to a valid term
340     //       indexTerms != NULL
341     //Post - The new offset has been returned
342 
343     //Check if is indexTerms is a valid array
344     CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
345 
346     int32_t lo = 0;
347     int32_t hi = indexTermsLength - 1;
348     int32_t mid;
349     int32_t delta;
350 
351     while (hi >= lo) {
352         //Start in the middle betwee hi and lo
353         mid = (lo + hi) >> 1;
354 
355         //Check if is indexTerms[mid] is a valid instance of Term
356         CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL");
357         CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength");
358 
359         //Determine if term is before mid or after mid
360         delta = term->compareTo(&indexTerms[mid]);
361         if (delta < 0) {
362             //Calculate the new hi
363             hi = mid - 1;
364         } else if (delta > 0) {
365             //Calculate the new lo
366             lo = mid + 1;
367         } else {
368             //term has been found so return its position
369             return mid;
370         }
371     }
372     // the new starting offset
373     return hi;
374 }
375 
seekEnum(const int32_t indexOffset)376 void TermInfosReader::seekEnum(const int32_t indexOffset)
377 {
378     //Func - Reposition the current Term and TermInfo to indexOffset
379     //Pre  - indexOffset >= 0
380     //       indexTerms    != NULL
381     //       indexInfos    != NULL
382     //       indexPointers != NULL
383     //Post - The current Term and Terminfo have been repositioned to indexOffset
384 
385     CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
386     CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
387     CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
388     CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
389 
390     SegmentTermEnum* enumerator =  getEnum();
391     enumerator->seek(indexPointers[indexOffset],
392         (indexOffset * enumerator->indexInterval) - 1,
393         &indexTerms[indexOffset], &indexInfos[indexOffset]);
394 }
395 
scanEnum(const Term * term)396 TermInfo* TermInfosReader::scanEnum(const Term* term)
397 {
398     //Func - Scans the Enumeration of terms for term and returns the
399     //       corresponding TermInfo instance if found. The search is started
400     //       from the current term.
401     //Pre  - term contains a valid reference to a Term
402     //       enumerator != NULL
403     //Post - if term has been found the corresponding TermInfo has been returned
404     //       otherwise NULL has been returned
405 
406     SegmentTermEnum* enumerator = getEnum();
407     enumerator->scanTo(term);
408 
409     //Check if the at the position the Term term can be found
410     if (enumerator->term(false) != NULL && term->equals(enumerator->term(false))) {
411         //Return the TermInfo instance about term
412         return enumerator->getTermInfo();
413     }
414 
415     //term was not found so no TermInfo can be returned
416     return NULL;
417 }
418 
scanEnum(const int32_t position)419 Term* TermInfosReader::scanEnum(const int32_t position)
420 {
421     //Func - Scans the enumeration to the requested position and returns the
422     //       Term located at that position
423     //Pre  - position > = 0
424     //       enumerator != NULL
425     //Post - The Term at the requested position has been returned
426 
427     SegmentTermEnum* enumerator = getEnum();
428 
429     // As long the position of the enumeration enumerator is smaller than the
430     // requested one
431     while(enumerator->position < position) {
432         //Move the current of enumerator to the next
433         if (!enumerator->next()) {
434             //If there is no next it means that the requested position was to big
435             return NULL;
436         }
437     }
438 
439     //Return the Term a the requested position
440     return enumerator->term();
441 }
442 
443 CL_NS_END
444