1 /*
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 *
7 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
8 */
9 #include "CLucene/StdHeader.h"
10 #include "TermInfosReader.h"
11
12 #include "CLucene/store/Directory.h"
13 #include "CLucene/util/Misc.h"
14 #include "FieldInfos.h"
15 #include "Term.h"
16 #include "Terms.h"
17 #include "TermInfo.h"
18 #include "TermInfosWriter.h"
19
20 CL_NS_USE(store)
CL_NS_USE(util)21 CL_NS_USE(util)
22 CL_NS_DEF(index)
23
24 TermInfosReader::TermInfosReader(Directory* dir, const QString& seg,
25 FieldInfos* fis)
26 : directory(dir)
27 , fieldInfos (fis)
28 {
29 //Func - Constructor.
30 // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii)
31 //Pre - dir is a reference to a valid Directory
32 // Fis contains a valid reference to an FieldInfos instance
33 // seg != NULL and contains the name of the segment
34 //Post - An instance has been created and the index named seg has been read. (Remember
35 // a segment is nothing more then an independently readable index)
36
37 CND_PRECONDITION(!seg.isEmpty(), "seg is NULL");
38
39 //Initialize the name of the segment
40 segment = seg;
41 //There are no indexTerms yet
42 indexTerms = NULL;
43 //So there are no indexInfos
44 indexInfos = NULL;
45 //So there are no indexPointers
46 indexPointers = NULL;
47 //Create a filname fo a Term Info File
48 QString tisFile = Misc::segmentname(segment, QLatin1String(".tis"));
49 QString tiiFile = Misc::segmentname(segment, QLatin1String(".tii"));
50
51 //Create an SegmentTermEnum for storing all the terms read of the segment
52 origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile ), fieldInfos, false);
53 indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile ), fieldInfos, true);
54
55 //Check if enumerator points to a valid instance
56 CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator");
57 CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator");
58
59 //Get the size of the enumeration and store it in size
60 _size = origEnum->size;
61 }
62
~TermInfosReader()63 TermInfosReader::~TermInfosReader()
64 {
65 //Func - Destructor
66 //Pre - true
67 //Post - The instance has been destroyed
68
69 //Close the TermInfosReader to be absolutly sure that enumerator has been closed
70 //and the arrays indexTerms, indexPointers and indexInfos and their elements
71 //have been destroyed
72 close();
73 }
74
close()75 void TermInfosReader::close()
76 {
77 //Func - Close the enumeration of TermInfos
78 //Pre - true
79 //Post - The _enumeration has been closed and the arrays
80
81 //Check if indexTerms and indexInfos exist
82 if (indexTerms && indexInfos){
83 //Iterate through arrays indexTerms and indexPointer to
84 //destroy their elements
85 #ifdef _DEBUG
86 for (int32_t i = 0; i < indexTermsLength; ++i) {
87 if (indexTerms[i].__cl_refcount != 1) {
88 CND_PRECONDITION(indexTerms[i].__cl_refcount == 1,
89 "TermInfosReader term was references more than internally");
90 }
91 // _CLDECDELETE(indexTerms[i]);
92 //_CLDELETE(indexInfos[i]);
93 }
94 #endif
95 //Delete the arrays
96 _CLDELETE_ARRAY(indexTerms);
97 _CLDELETE_ARRAY(indexInfos);
98 }
99
100 //Delete the arrays
101 _CLDELETE_ARRAY(indexPointers);
102
103 if (origEnum != NULL) {
104 origEnum->close();
105
106 //Get a pointer to IndexInput used by the enumeration but
107 //instantiated in the constructor by directory.open( tisFile )
108 IndexInput *is = origEnum->input;
109
110 //Delete the enumuration enumerator
111 _CLDELETE(origEnum);
112
113 //Delete the IndexInput
114 _CLDELETE(is);
115 }
116
117 if (indexEnum != NULL){
118 indexEnum->close();
119
120 //Get a pointer to IndexInput used by the enumeration but
121 //instantiated in the constructor by directory.open( tiiFile )
122 IndexInput *is = indexEnum->input;
123
124 //Delete the enumuration enumerator
125 _CLDELETE(indexEnum);
126
127 //Delete the IndexInput
128 _CLDELETE(is);
129 }
130 }
131
size() const132 int64_t TermInfosReader::size() const
133 {
134 //Func - Return the size of the enumeration of TermInfos
135 //Pre - true
136 //Post - size has been returened
137
138 return _size;
139 }
140
get(const int32_t position)141 Term* TermInfosReader::get(const int32_t position)
142 {
143 //Func - Returns the nth term in the set
144 //Pre - position > = 0
145 //Post - The n-th term in the set has been returned
146
147 //Check if the size is 0 because then there are no terms
148 if (_size == 0)
149 return NULL;
150
151 SegmentTermEnum* enumerator = getEnum();
152
153 if (enumerator != NULL //an enumeration exists
154 && enumerator->term(false) != NULL // term is at or past current
155 && position >= enumerator->position
156 && position < (enumerator->position + enumerator->indexInterval)) {
157 return scanEnum(position); // can avoid seek
158 }
159
160 //random-access: must seek
161 seekEnum(position / enumerator->indexInterval);
162
163 //Get the Term at position
164 return scanEnum(position);
165 }
166
// TODO: currently there is no way of cleaning up after a thread once that
// thread ends; we are stuck with the terminfosreader data of that thread.
// Hopefully this won't be too big a problem... solutions anyone?
getEnum()170 SegmentTermEnum* TermInfosReader::getEnum()
171 {
172 SegmentTermEnum* termEnum = enumerators.get();
173 if (termEnum == NULL) {
174 termEnum = terms();
175 enumerators.set(termEnum);
176 }
177 return termEnum;
178 }
179
get(const Term * term)180 TermInfo* TermInfosReader::get(const Term* term)
181 {
182 //Func - Returns a TermInfo for a term
183 //Pre - term holds a valid reference to term
184 //Post - if term can be found its TermInfo has been returned otherwise NULL
185
186 //If the size of the enumeration is 0 then no Terms have been read
187 if (_size == 0)
188 return NULL;
189
190 ensureIndexIsRead();
191
192 // optimize sequential access: first try scanning cached enum w/o seeking
193 SegmentTermEnum* enumerator = getEnum();
194
195 // optimize sequential access: first try scanning cached enumerator w/o seeking
196 // if the current term of the enumeration enumerator is not at the end
197 if (enumerator->term(false) != NULL
198 // AND there exists a previous current called prev and term is
199 // positioned after this prev
200 && ((enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0)
201 // OR term is positioned at the same position as the current of
202 // enumerator or at a higher position
203 || term->compareTo(enumerator->term(false)) >= 0)) {
204 //Calculate the offset for the position
205 int32_t _enumOffset = (int32_t)
206 (enumerator->position / enumerator->indexInterval) + 1;
207
208 // but before end of block the length of indexTerms (the number of
209 // terms in enumerator) equals _enum_offset
210 if (indexTermsLength == _enumOffset
211 // OR term is positioned in front of term found at _enumOffset in
212 // indexTerms
213 || term->compareTo(&indexTerms[_enumOffset]) < 0) {
214 //no need to seek, retrieve the TermInfo for term
215 return scanEnum(term);
216 }
217 }
218
219 //Reposition current term in the enumeration
220 seekEnum(getIndexOffset(term));
221 //Return the TermInfo for term
222 return scanEnum(term);
223 }
224
getPosition(const Term * term)225 int64_t TermInfosReader::getPosition(const Term* term)
226 {
227 //Func - Returns the position of a Term in the set
228 //Pre - term holds a valid reference to a Term
229 // enumerator != NULL
230 //Post - If term was found then its position is returned otherwise -1
231
232 //if the enumeration is empty then return -1
233 if (_size == 0)
234 return -1;
235
236 ensureIndexIsRead();
237
238 //Retrieve the indexOffset for term
239 int32_t indexOffset = getIndexOffset(term);
240 seekEnum(indexOffset);
241
242 SegmentTermEnum* enumerator = getEnum();
243
244 while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {}
245
246 if (term->equals(enumerator->term(false)))
247 return enumerator->position;
248
249 return -1;
250 }
251
terms(const Term * term)252 SegmentTermEnum* TermInfosReader::terms(const Term* term)
253 {
254 //Func - Returns an enumeration of terms starting at or after the named term.
255 // If term is null then enumerator is set to the beginning
256 //Pre - term holds a valid reference to a Term
257 // enumerator != NULL
258 //Post - An enumeration of terms starting at or after the named term has been returned
259
260 SegmentTermEnum* enumerator = NULL;
261 if (term != NULL) {
262 //Seek enumerator to term; delete the new TermInfo that's returned.
263 TermInfo* ti = get(term);
264 _CLDELETE(ti);
265 enumerator = getEnum();
266 } else {
267 enumerator = origEnum;
268 }
269 //Clone the entire enumeration
270 SegmentTermEnum* cln = enumerator->clone();
271
272 //Check if cln points to a valid instance
273 CND_CONDITION(cln != NULL, "cln is NULL");
274
275 return cln;
276 }
277
ensureIndexIsRead()278 void TermInfosReader::ensureIndexIsRead()
279 {
280 //Func - Reads the term info index file or .tti file.
281 // This file contains every IndexInterval-th entry from the .tis file,
282 // along with its location in the "tis" file. This is designed to be
283 // read entirely into memory and used to provide random access to the
284 // "tis" file.
285 //Pre - indexTerms = NULL
286 // indexInfos = NULL
287 // indexPointers = NULL
288 //Post - The term info index file has been read into memory
289
290 SCOPED_LOCK_MUTEX(THIS_LOCK)
291
292 if ( indexTerms != NULL )
293 return;
294
295 try {
296 indexTermsLength = (size_t)indexEnum->size;
297
298 // Instantiate an block of Term's,so that each one doesn't have to be new'd
299 indexTerms = _CL_NEWARRAY(Term,indexTermsLength);
300
301 // Check if is indexTerms is a valid array
302 CND_CONDITION(indexTerms != NULL,
303 "No memory could be allocated for indexTerms");
304
305 // Instantiate an big block of TermInfo's, so that each one doesn't
306 // have to be new'd
307 indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength);
308
309 // Check if is indexInfos is a valid array
310 CND_CONDITION(indexInfos != NULL,
311 "No memory could be allocated for indexInfos");
312
313 // Instantiate an array indexPointers that contains pointers to the
314 // term info index file
315 indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength);
316
317 // Check if is indexPointers is a valid array
318 CND_CONDITION(indexPointers != NULL,
319 "No memory could be allocated for indexPointers");
320
321 //Iterate through the terms of indexEnum
322 for (int32_t i = 0; indexEnum->next(); ++i) {
323 indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text());
324 indexEnum->getTermInfo(&indexInfos[i]);
325 indexPointers[i] = indexEnum->indexPointer;
326 }
327 } _CLFINALLY (
328 indexEnum->close();
329 // Close and delete the IndexInput is. The close is done by the destructor.
330 _CLDELETE( indexEnum->input );
331 _CLDELETE( indexEnum );
332 );
333 }
334
getIndexOffset(const Term * term)335 int32_t TermInfosReader::getIndexOffset(const Term* term)
336 {
337 //Func - Returns the offset of the greatest index entry which is less than
338 // or equal to term.
339 //Pre - term holds a reference to a valid term
340 // indexTerms != NULL
341 //Post - The new offset has been returned
342
343 //Check if is indexTerms is a valid array
344 CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
345
346 int32_t lo = 0;
347 int32_t hi = indexTermsLength - 1;
348 int32_t mid;
349 int32_t delta;
350
351 while (hi >= lo) {
352 //Start in the middle betwee hi and lo
353 mid = (lo + hi) >> 1;
354
355 //Check if is indexTerms[mid] is a valid instance of Term
356 CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL");
357 CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength");
358
359 //Determine if term is before mid or after mid
360 delta = term->compareTo(&indexTerms[mid]);
361 if (delta < 0) {
362 //Calculate the new hi
363 hi = mid - 1;
364 } else if (delta > 0) {
365 //Calculate the new lo
366 lo = mid + 1;
367 } else {
368 //term has been found so return its position
369 return mid;
370 }
371 }
372 // the new starting offset
373 return hi;
374 }
375
seekEnum(const int32_t indexOffset)376 void TermInfosReader::seekEnum(const int32_t indexOffset)
377 {
378 //Func - Reposition the current Term and TermInfo to indexOffset
379 //Pre - indexOffset >= 0
380 // indexTerms != NULL
381 // indexInfos != NULL
382 // indexPointers != NULL
383 //Post - The current Term and Terminfo have been repositioned to indexOffset
384
385 CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number");
386 CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL");
387 CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL");
388 CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL");
389
390 SegmentTermEnum* enumerator = getEnum();
391 enumerator->seek(indexPointers[indexOffset],
392 (indexOffset * enumerator->indexInterval) - 1,
393 &indexTerms[indexOffset], &indexInfos[indexOffset]);
394 }
395
scanEnum(const Term * term)396 TermInfo* TermInfosReader::scanEnum(const Term* term)
397 {
398 //Func - Scans the Enumeration of terms for term and returns the
399 // corresponding TermInfo instance if found. The search is started
400 // from the current term.
401 //Pre - term contains a valid reference to a Term
402 // enumerator != NULL
403 //Post - if term has been found the corresponding TermInfo has been returned
404 // otherwise NULL has been returned
405
406 SegmentTermEnum* enumerator = getEnum();
407 enumerator->scanTo(term);
408
409 //Check if the at the position the Term term can be found
410 if (enumerator->term(false) != NULL && term->equals(enumerator->term(false))) {
411 //Return the TermInfo instance about term
412 return enumerator->getTermInfo();
413 }
414
415 //term was not found so no TermInfo can be returned
416 return NULL;
417 }
418
scanEnum(const int32_t position)419 Term* TermInfosReader::scanEnum(const int32_t position)
420 {
421 //Func - Scans the enumeration to the requested position and returns the
422 // Term located at that position
423 //Pre - position > = 0
424 // enumerator != NULL
425 //Post - The Term at the requested position has been returned
426
427 SegmentTermEnum* enumerator = getEnum();
428
429 // As long the position of the enumeration enumerator is smaller than the
430 // requested one
431 while(enumerator->position < position) {
432 //Move the current of enumerator to the next
433 if (!enumerator->next()) {
434 //If there is no next it means that the requested position was to big
435 return NULL;
436 }
437 }
438
439 //Return the Term a the requested position
440 return enumerator->term();
441 }
442
443 CL_NS_END
444