1 /*
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 *
7 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
8 */
9 #include "CLucene/StdHeader.h"
10
11 #include "DocumentWriter.h"
12 #include "FieldInfos.h"
13 #include "IndexWriter.h"
14 #include "FieldsWriter.h"
15 #include "Term.h"
16 #include "TermInfo.h"
17 #include "TermInfosWriter.h"
18
19 #include "CLucene/analysis/AnalysisHeader.h"
20
21 #include "CLucene/search/Similarity.h"
22 #include "TermInfosWriter.h"
23 #include "FieldsWriter.h"
24
25 CL_NS_USE(util)
CL_NS_USE(store)26 CL_NS_USE(store)
27 CL_NS_USE(analysis)
28 CL_NS_USE(document)
29 CL_NS_DEF(index)
30
31 /*Posting*/
32
33 DocumentWriter::Posting::Posting(Term* t, const int32_t position,
34 TermVectorOffsetInfo* offset)
35 {
36 //Func - Constructor
37 //Pre - t contains a valid reference to a Term
38 //Post - Instance has been created
39 freq = 1;
40
41 term = _CL_POINTER(t);
42 positions.values = (int32_t*)malloc(sizeof(int32_t));
43 positions.values[0] = position;
44 positions.length = 1;
45
46 if ( offset != NULL ){
47 this->offsets.values =(TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
48 this->offsets.values[0] = *offset;
49 this->offsets.length = 1;
50 }
51 }
52
~Posting()53 DocumentWriter::Posting::~Posting()
54 {
55 //Func - Destructor
56 //Pre - true
57 //Post - The instance has been destroyed
58
59 free(positions.values);
60 if ( this->offsets.values != NULL )
61 free(this->offsets.values);
62 _CLDECDELETE(term);
63 }
64
DocumentWriter(Directory * d,Analyzer * a,CL_NS (search)::Similarity * sim,const int32_t mfl)65 DocumentWriter::DocumentWriter(Directory* d, Analyzer* a,
66 CL_NS(search)::Similarity* sim, const int32_t mfl)
67 : analyzer(a)
68 , directory(d)
69 , maxFieldLength(mfl)
70 , fieldInfos(NULL)
71 , fieldLengths(NULL)
72 , similarity(sim)
73 , termIndexInterval(IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
74 , fieldPositions(NULL)
75 , fieldBoosts(NULL)
76 , termBuffer(_CLNEW Term)
77 {
78 //Pre - d contains a valid reference to a Directory
79 // d contains a valid reference to a Analyzer
80 // mfl > 0 and contains the maximum field length
81 //Post - Instance has been created
82
83 CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
84 "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
85
86 fieldInfos = NULL;
87 fieldLengths = NULL;
88 }
89
DocumentWriter(CL_NS (store)::Directory * d,CL_NS (analysis)::Analyzer * a,IndexWriter * writer)90 DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d,
91 CL_NS(analysis)::Analyzer* a, IndexWriter* writer)
92 : analyzer(a)
93 , directory(d)
94 , maxFieldLength(writer->getMaxFieldLength())
95 , fieldInfos(NULL)
96 , fieldLengths(NULL)
97 , similarity(writer->getSimilarity())
98 , termIndexInterval(writer->getTermIndexInterval())
99 , fieldPositions(NULL)
100 , fieldBoosts(NULL)
101 , termBuffer(_CLNEW Term)
102 {
103 //Pre - d contains a valid reference to a Directory
104 // d contains a valid reference to a Analyzer
105 // mfl > 0 and contains the maximum field length
106 //Post - Instance has been created
107
108 CND_PRECONDITION(((maxFieldLength > 0)
109 || (maxFieldLength == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
110 "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")
111
112 fieldInfos = NULL;
113 fieldLengths = NULL;
114
115 }
116
~DocumentWriter()117 DocumentWriter::~DocumentWriter()
118 {
119 //Func - Destructor
120 //Pre - true
121 //Post - The instance has been destroyed
122 clearPostingTable();
123 _CLDELETE( fieldInfos );
124 _CLDELETE_ARRAY(fieldLengths);
125 _CLDELETE_ARRAY(fieldPositions);
126 _CLDELETE_ARRAY(fieldBoosts);
127 _CLDELETE_ARRAY(fieldOffsets);
128
129 _CLDECDELETE(termBuffer);
130 }
131
clearPostingTable()132 void DocumentWriter::clearPostingTable()
133 {
134 PostingTableType::iterator itr = postingTable.begin();
135 while (itr != postingTable.end()){
136 _CLDELETE(itr->second);
137 _CLLDECDELETE(itr->first);
138 ++itr;
139 }
140 postingTable.clear();
141 }
142
addDocument(const QString & segment,Document * doc)143 void DocumentWriter::addDocument(const QString& segment, Document* doc)
144 {
145 CND_PRECONDITION(fieldInfos == NULL, "fieldInfos!=NULL")
146
147 // write field names
148 fieldInfos = _CLNEW FieldInfos();
149 fieldInfos->add(doc);
150
151 QString buf = Misc::segmentname(segment, QLatin1String(".fnm"));
152 fieldInfos->write(directory, buf);
153
154 // write field values
155 FieldsWriter fieldsWriter(directory, segment, fieldInfos);
156 try {
157 fieldsWriter.addDocument(doc);
158 } _CLFINALLY (
159 fieldsWriter.close()
160 );
161
162 // clear postingTable
163 clearPostingTable();
164
165 int32_t fieldInfoSize = fieldInfos->size();
166 fieldLengths = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldLengths
167 fieldPositions = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldPositions
168 fieldOffsets = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldOffsets
169 fieldBoosts = _CL_NEWARRAY(qreal, fieldInfoSize); // init fieldBoosts
170
171 qreal fbd = doc->getBoost();
172 for (int32_t i = 0; i < fieldInfoSize; ++i) {
173 fieldLengths[i] = 0;
174 fieldPositions[i] = 0;
175 fieldOffsets[i] = 0;
176 //initialise fieldBoost array with default boost
177 fieldBoosts[i] = fbd;
178 }
179
180 // invert doc into postingTable
181 invertDocument(doc);
182
183 // sort postingTable into an array
184 Posting** postings = NULL;
185 int32_t postingsLength = 0;
186 sortPostingTable(postings, postingsLength);
187
188 //DEBUG:
189 /*for (int32_t i = 0; i < postingsLength; i++) {
190 Posting* posting = postings[i];
191
192 TCHAR* b = posting->term->toString();
193 _cout << b << " freq=" << posting->freq;
194 _CLDELETE(b);
195
196 _cout << " pos=" << posting->positions[0];
197 for (int32_t j = 1; j < posting->freq; j++)
198 _cout <<"," << posting->positions[j];
199
200 _cout << endl;
201 }*/
202
203
204 // write postings
205 writePostings(postings, postingsLength, segment);
206
207 // write norms of indexed fields
208 writeNorms(segment);
209 _CLDELETE_ARRAY(postings);
210 }
211
sortPostingTable(Posting ** & array,int32_t & arraySize)212 void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize)
213 {
214 // copy postingTable into an array
215 arraySize = postingTable.size();
216 array = _CL_NEWARRAY(Posting*,arraySize);
217 PostingTableType::iterator postings = postingTable.begin();
218 int32_t i=0;
219 while ( postings != postingTable.end() ){
220 array[i] = (Posting*)postings->second;
221 postings++;
222 i++;
223 }
224 // sort the array
225 quickSort(array, 0, i - 1);
226 }
227
228
invertDocument(const Document * doc)229 void DocumentWriter::invertDocument(const Document* doc)
230 {
231 DocumentFieldEnumeration* fields = doc->fields();
232 try {
233 while (fields->hasMoreElements()) {
234 Field* field = (Field*)fields->nextElement();
235 const TCHAR* fieldName = field->name();
236 const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);
237
238 int32_t length = fieldLengths[fieldNumber]; // length of field
239 int32_t position = fieldPositions[fieldNumber]; // position in field
240 if (length>0)
241 position+=analyzer->getPositionIncrementGap(fieldName);
242 int32_t offset = fieldOffsets[fieldNumber]; // offset field
243
244 if (field->isIndexed()) {
245 if (!field->isTokenized()) { // un-tokenized field
246 //FEATURE: this is bug in java: if using a Reader, then
247 //field value will not be added. With CLucene, an untokenized
248 //field with a reader will still be added (if it isn't stored,
249 //because if it's stored, then the reader has already been read.
250 const TCHAR* charBuf = NULL;
251 int64_t dataLen = 0;
252
253 if (field->stringValue() == NULL && !field->isStored() ) {
254 CL_NS(util)::Reader* r = field->readerValue();
255 // this call tries to read the entire stream
256 // this may invalidate the string for the further calls
257 // it may be better to do this via a FilterReader
258 // TODO make a better implementation of this
259 dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
260 if (dataLen == -1)
261 dataLen = 0;
262 //todo: would be better to pass the string length, in case
263 //a null char is passed, but then would need to test the output too.
264 } else {
265 charBuf = field->stringValue();
266 dataLen = _tcslen(charBuf);
267 }
268
269 if(field->isStoreOffsetWithTermVector()){
270 TermVectorOffsetInfo tio;
271 tio.setStartOffset(offset);
272 tio.setEndOffset(offset + dataLen);
273 addPosition(fieldName, charBuf, position++, &tio );
274 }else
275 addPosition(fieldName, charBuf, position++, NULL);
276 offset += dataLen;
277 length++;
278 } else { // field must be tokenized
279 CL_NS(util)::Reader* reader; // find or make Reader
280 bool delReader = false;
281 if (field->readerValue() != NULL) {
282 reader = field->readerValue();
283 } else if (field->stringValue() != NULL) {
284 reader = _CLNEW CL_NS(util)::StringReader(
285 field->stringValue(),_tcslen(field->stringValue()),
286 false);
287 delReader = true;
288 } else {
289 _CLTHROWA(CL_ERR_IO,"field must have either String or Reader value");
290 }
291
292 try {
293 // Tokenize field and add to postingTable.
294 CL_NS(analysis)::TokenStream* stream =
295 analyzer->tokenStream(fieldName, reader);
296
297 try {
298 CL_NS(analysis)::Token t;
299 int32_t lastTokenEndOffset = -1;
300 while (stream->next(&t)) {
301 position += (t.getPositionIncrement() - 1);
302
303 if(field->isStoreOffsetWithTermVector()){
304 TermVectorOffsetInfo tio;
305 tio.setStartOffset(offset + t.startOffset());
306 tio.setEndOffset(offset + t.endOffset());
307 addPosition(fieldName, t.termText(), position++, &tio);
308 } else
309 addPosition(fieldName, t.termText(), position++, NULL);
310
311 lastTokenEndOffset = t.endOffset();
312 length++;
313 // Apply field truncation policy.
314 if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
315 // The client programmer has explicitly authorized us to
316 // truncate the token stream after maxFieldLength tokens.
317 if ( length > maxFieldLength)
318 break;
319 } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
320 const TCHAR* errMsgBase =
321 _T("Indexing a huge number of tokens from a single")
322 _T(" field (\"%s\", in this case) can cause CLucene")
323 _T(" to use memory excessively.")
324 _T(" By default, CLucene will accept only %s tokens")
325 _T(" tokens from a single field before forcing the")
326 _T(" client programmer to specify a threshold at")
327 _T(" which to truncate the token stream.")
328 _T(" You should set this threshold via")
329 _T(" IndexReader::maxFieldLength (set to LUCENE_INT32_MAX")
330 _T(" to disable truncation, or a value to specify maximum number of fields).");
331
332 TCHAR defaultMaxAsChar[34];
333 _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
334 defaultMaxAsChar, 10
335 );
336 int32_t errMsgLen = _tcslen(errMsgBase)
337 + _tcslen(fieldName)
338 + _tcslen(defaultMaxAsChar);
339 TCHAR* errMsg = _CL_NEWARRAY(TCHAR,errMsgLen+1);
340
341 _sntprintf(errMsg, errMsgLen,errMsgBase, fieldName, defaultMaxAsChar);
342
343 _CLTHROWT_DEL(CL_ERR_Runtime,errMsg);
344 }
345 } // while token->next
346
347 if(lastTokenEndOffset != -1 )
348 offset += lastTokenEndOffset + 1;
349 } _CLFINALLY (
350 stream->close();
351 _CLDELETE(stream);
352 );
353 } _CLFINALLY (
354 if (delReader) {
355 _CLDELETE(reader);
356 }
357 );
358 } // if/else field is to be tokenized
359 fieldLengths[fieldNumber] = length; // save field length
360 fieldPositions[fieldNumber] = position; // save field position
361 fieldBoosts[fieldNumber] *= field->getBoost();
362 fieldOffsets[fieldNumber] = offset;
363 } // if field is to beindexed
364 } // while more fields available
365 } _CLFINALLY (
366 _CLDELETE(fields);
367 );
368 }
369
addPosition(const TCHAR * field,const TCHAR * text,const int32_t position,TermVectorOffsetInfo * offset)370 void DocumentWriter::addPosition(const TCHAR* field, const TCHAR* text,
371 const int32_t position, TermVectorOffsetInfo* offset)
372 {
373 termBuffer->set(field,text,false);
374
375 Posting* ti = postingTable.get(termBuffer);
376 if (ti != NULL) { // word seen before
377 int32_t freq = ti->freq;
378 if (ti->positions.length == freq) {
379 // positions array is full, realloc its size
380 ti->positions.length = freq*2;
381 ti->positions.values = (int32_t*)realloc(ti->positions.values, ti->positions.length * sizeof(int32_t));
382 }
383 ti->positions.values[freq] = position; // add new position
384
385 if (offset != NULL) {
386 if (ti->offsets.length == freq){
387 ti->offsets.length = freq*2;
388 ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values, ti->offsets.length * sizeof(TermVectorOffsetInfo));
389 }
390 ti->offsets[freq] = *offset;
391 }
392
393 ti->freq = freq + 1; // update frequency
394 } else { // word not seen before
395 Term* term = _CLNEW Term( field, text, false);
396 postingTable.put(term, _CLNEW Posting(term, position, offset));
397 }
398 }
399
400 //static
quickSort(Posting ** & postings,const int32_t lo,const int32_t hi)401 void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi)
402 {
403 if(lo >= hi)
404 return;
405
406 int32_t mid = (lo + hi) / 2;
407
408 if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
409 Posting* tmp = postings[lo];
410 postings[lo] = postings[mid];
411 postings[mid] = tmp;
412 }
413
414 if(postings[mid]->term->compareTo(postings[hi]->term) > 0) {
415 Posting* tmp = postings[mid];
416 postings[mid] = postings[hi];
417 postings[hi] = tmp;
418
419 if(postings[lo]->term->compareTo(postings[mid]->term) > 0) {
420 Posting* tmp2 = postings[lo];
421 postings[lo] = postings[mid];
422 postings[mid] = tmp2;
423 }
424 }
425
426 int32_t left = lo + 1;
427 int32_t right = hi - 1;
428
429 if (left >= right)
430 return;
431
432 const Term* partition = postings[mid]->term; //not kept, so no need to finalize
433
434 for( ;; ) {
435 while(postings[right]->term->compareTo(partition) > 0)
436 --right;
437
438 while(left < right && postings[left]->term->compareTo(partition) <= 0)
439 ++left;
440
441 if(left < right) {
442 Posting* tmp = postings[left];
443 postings[left] = postings[right];
444 postings[right] = tmp;
445 --right;
446 } else {
447 break;
448 }
449 }
450
451 quickSort(postings, lo, left);
452 quickSort(postings, left + 1, hi);
453 }
454
writePostings(Posting ** postings,const int32_t postingsLength,const QString & segment)455 void DocumentWriter::writePostings(Posting** postings,
456 const int32_t postingsLength, const QString& segment)
457 {
458 #define __DOCLOSE(obj) \
459 if (obj!=NULL) { \
460 try { \
461 obj->close(); \
462 _CLDELETE(obj); \
463 } catch(CLuceneError &e) { \
464 ierr = e.number(); \
465 err = e.what(); \
466 } catch(...) { \
467 err = "Unknown error while closing posting tables"; \
468 } \
469 }
470
471 IndexOutput* freq = NULL;
472 IndexOutput* prox = NULL;
473 TermInfosWriter* tis = NULL;
474 TermVectorsWriter* termVectorWriter = NULL;
475 try {
476 //open files for inverse index storage
477 QString buf = Misc::segmentname(segment, QLatin1String(".frq"));
478 freq = directory->createOutput(buf);
479
480 buf = Misc::segmentname(segment, QLatin1String(".prx"));
481 prox = directory->createOutput(buf);
482
483 tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,
484 termIndexInterval);
485 TermInfo* ti = _CLNEW TermInfo();
486 const TCHAR* currentField = NULL;
487 for (int32_t i = 0; i < postingsLength; i++) {
488 Posting* posting = postings[i];
489
490 // add an entry to the dictionary with pointers to prox and freq files
491 ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
492 tis->add(posting->term, ti);
493
494 // add an entry to the freq file
495 int32_t postingFreq = posting->freq;
496 if (postingFreq == 1) // optimize freq=1
497 freq->writeVInt(1); // set low bit of doc num.
498 else {
499 freq->writeVInt(0); // the document number
500 freq->writeVInt(postingFreq); // frequency in doc
501 }
502
503 int32_t lastPosition = 0; // write positions
504 for (int32_t j = 0; j < postingFreq; ++j) { // use delta-encoding
505 prox->writeVInt(posting->positions.values[j] - lastPosition);
506 lastPosition = posting->positions.values[j];
507 }
508
509 // check to see if we switched to a new field
510 const TCHAR* termField = posting->term->field();
511 if ( currentField == NULL || _tcscmp(currentField,termField) != 0 ) {
512 //todo, can we do an intern'd check?
513 // changing field - see if there is something to save
514 currentField = termField;
515 FieldInfo* fi = fieldInfos->fieldInfo(currentField);
516
517 if (fi->storeTermVector) {
518 if (termVectorWriter == NULL) {
519 termVectorWriter = _CLNEW TermVectorsWriter(directory,
520 segment, fieldInfos);
521 termVectorWriter->openDocument();
522 }
523 termVectorWriter->openField(currentField);
524 } else if (termVectorWriter != NULL) {
525 termVectorWriter->closeField();
526 }
527 }
528 if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
529 termVectorWriter->addTerm(posting->term->text(), postingFreq,
530 &posting->positions, &posting->offsets);
531 }
532 }
533 if (termVectorWriter != NULL)
534 termVectorWriter->closeDocument();
535 _CLDELETE(ti);
536 } _CLFINALLY (
537 const char* err = NULL;
538 int32_t ierr = 0;
539
540 // make an effort to close all streams we can but remember and re-throw
541 // the first exception encountered in this process
542 __DOCLOSE(freq);
543 __DOCLOSE(prox);
544 __DOCLOSE(tis);
545 __DOCLOSE(termVectorWriter);
546 if (err != NULL)
547 _CLTHROWA(ierr,err);
548 );
549 }
550
writeNorms(const QString & segment)551 void DocumentWriter::writeNorms(const QString& segment)
552 {
553 for(int32_t n = 0; n < fieldInfos->size(); n++){
554 FieldInfo* fi = fieldInfos->fieldInfo(n);
555 if(fi->isIndexed && !fi->omitNorms) {
556 qreal norm = fieldBoosts[n] * similarity->lengthNorm(
557 fi->name, fieldLengths[n]);
558
559 QString fn(segment + QLatin1String(".f%1"));
560 IndexOutput* norms = directory->createOutput(fn.arg(n));
561 try {
562 norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
563 }_CLFINALLY (
564 norms->close();
565 _CLDELETE(norms);
566 )
567 }
568 }
569 }
570
571 CL_NS_END
572