/*
 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
 *
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 *
 * Changes are Copyright(C) 2007, 2008 by Nokia Corporation and/or its subsidiary(-ies), all rights reserved.
*/
#include "CLucene/StdHeader.h"

#include "DocumentWriter.h"
#include "FieldInfos.h"
#include "IndexWriter.h"
#include "FieldsWriter.h"
#include "Term.h"
#include "TermInfo.h"
#include "TermInfosWriter.h"

#include "CLucene/analysis/AnalysisHeader.h"

#include "CLucene/search/Similarity.h"

CL_NS_USE(util)
CL_NS_USE(store)
CL_NS_USE(analysis)
CL_NS_USE(document)
CL_NS_DEF(index)

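// DocumentWriter turns a single Document into the on-disk files of a
// one-document segment: field names (.fnm), stored fields (via FieldsWriter),
// the inverted postings (.frq/.prx plus the term dictionary written by
// TermInfosWriter), optional term vectors, and one norms file per indexed
// field. A minimal usage sketch, assuming an open Directory, an Analyzer and
// a Document (variable names below are illustrative, not part of this file):
//
//   DocumentWriter dw(directory, analyzer,
//       CL_NS(search)::Similarity::getDefault(),
//       IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
//   dw.addDocument(QLatin1String("_1"), doc);  // writes segment files for "_1"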
/*Posting*/

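// A Posting records, for one term of the document being inverted, its
// in-document frequency, the positions at which it occurred and, when the
// field stores offsets with its term vector, the corresponding offsets.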
DocumentWriter::Posting::Posting(Term* t, const int32_t position,
    TermVectorOffsetInfo* offset)
{
    //Func - Constructor
    //Pre  - t contains a valid reference to a Term
    //Post - Instance has been created
    freq = 1;

    term = _CL_POINTER(t);
    positions.values = (int32_t*)malloc(sizeof(int32_t));
    positions.values[0] = position;
    positions.length = 1;

    if ( offset != NULL ){
        this->offsets.values = (TermVectorOffsetInfo*)malloc(sizeof(TermVectorOffsetInfo));
        this->offsets.values[0] = *offset;
        this->offsets.length = 1;
    }
}

DocumentWriter::Posting::~Posting()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed

    free(positions.values);
    if ( this->offsets.values != NULL )
        free(this->offsets.values);
    _CLDECDELETE(term);
}

DocumentWriter::DocumentWriter(Directory* d, Analyzer* a,
    CL_NS(search)::Similarity* sim, const int32_t mfl)
    : analyzer(a)
    , directory(d)
    , maxFieldLength(mfl)
    , fieldInfos(NULL)
    , fieldLengths(NULL)
    , similarity(sim)
    , termIndexInterval(IndexWriter::DEFAULT_TERM_INDEX_INTERVAL)
    , fieldPositions(NULL)
    , fieldBoosts(NULL)
    , fieldOffsets(NULL)
    , termBuffer(_CLNEW Term)
{
    //Pre  - d contains a valid reference to a Directory
    //       a contains a valid reference to an Analyzer
    //       mfl > 0 and contains the maximum field length
    //Post - Instance has been created

    CND_PRECONDITION(((mfl > 0) || (mfl == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
        "mfl is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")

    fieldInfos     = NULL;
    fieldLengths   = NULL;
}

DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d,
    CL_NS(analysis)::Analyzer* a, IndexWriter* writer)
    : analyzer(a)
    , directory(d)
    , maxFieldLength(writer->getMaxFieldLength())
    , fieldInfos(NULL)
    , fieldLengths(NULL)
    , similarity(writer->getSimilarity())
    , termIndexInterval(writer->getTermIndexInterval())
    , fieldPositions(NULL)
    , fieldBoosts(NULL)
    , fieldOffsets(NULL)
    , termBuffer(_CLNEW Term)
{
    //Pre  - d contains a valid reference to a Directory
    //       a contains a valid reference to an Analyzer
    //       writer contains a valid reference to an IndexWriter, from which
    //       the maximum field length is taken
    //Post - Instance has been created

    CND_PRECONDITION(((maxFieldLength > 0)
        || (maxFieldLength == IndexWriter::FIELD_TRUNC_POLICY__WARN)),
        "maxFieldLength is 0 or smaller than IndexWriter::FIELD_TRUNC_POLICY__WARN")

    fieldInfos     = NULL;
    fieldLengths   = NULL;
}

DocumentWriter::~DocumentWriter()
{
    //Func - Destructor
    //Pre  - true
    //Post - The instance has been destroyed
    clearPostingTable();
    _CLDELETE( fieldInfos );
    _CLDELETE_ARRAY(fieldLengths);
    _CLDELETE_ARRAY(fieldPositions);
    _CLDELETE_ARRAY(fieldBoosts);
    _CLDELETE_ARRAY(fieldOffsets);

    _CLDECDELETE(termBuffer);
}

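// The postingTable owns both its keys and its values: each key is a Term that
// was _CLNEW'd in addPosition() and each value is the matching Posting, so
// both have to be released here before the table is cleared.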
void DocumentWriter::clearPostingTable()
{
    PostingTableType::iterator itr = postingTable.begin();
    while (itr != postingTable.end()){
        _CLDELETE(itr->second);
        _CLLDECDELETE(itr->first);
        ++itr;
    }
    postingTable.clear();
}

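// addDocument() drives the whole write of a single-document segment:
//   1. collect the field names into a FieldInfos and write "<segment>.fnm";
//   2. store the field values through a FieldsWriter;
//   3. invert the document into postingTable (invertDocument);
//   4. sort the postings by term (sortPostingTable) and write the inverted
//      files (writePostings);
//   5. write the norm files (writeNorms).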
void DocumentWriter::addDocument(const QString& segment, Document* doc)
{
    CND_PRECONDITION(fieldInfos == NULL, "fieldInfos!=NULL")

    // write field names
    fieldInfos = _CLNEW FieldInfos();
    fieldInfos->add(doc);

    QString buf = Misc::segmentname(segment, QLatin1String(".fnm"));
    fieldInfos->write(directory, buf);

    // write field values
    FieldsWriter fieldsWriter(directory, segment, fieldInfos);
    try {
        fieldsWriter.addDocument(doc);
    } _CLFINALLY (
        fieldsWriter.close()
    );

    // clear postingTable
    clearPostingTable();

    int32_t fieldInfoSize = fieldInfos->size();
    fieldLengths = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldLengths
    fieldPositions = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldPositions
    fieldOffsets = _CL_NEWARRAY(int32_t, fieldInfoSize); // init fieldOffsets
    fieldBoosts = _CL_NEWARRAY(qreal, fieldInfoSize); // init fieldBoosts

    qreal fbd = doc->getBoost();
    for (int32_t i = 0; i < fieldInfoSize; ++i) {
        fieldLengths[i] = 0;
        fieldPositions[i] = 0;
        fieldOffsets[i] = 0;
        //initialise fieldBoost array with default boost
        fieldBoosts[i] = fbd;
    }

    // invert doc into postingTable
    invertDocument(doc);

    // sort postingTable into an array
    Posting** postings = NULL;
    int32_t postingsLength = 0;
    sortPostingTable(postings, postingsLength);

    //DEBUG:
    /*for (int32_t i = 0; i < postingsLength; i++) {
        Posting* posting = postings[i];

        TCHAR* b = posting->term->toString();
        _cout << b << " freq=" << posting->freq;
        _CLDELETE(b);

        _cout << " pos=" << posting->positions[0];
        for (int32_t j = 1; j < posting->freq; j++)
            _cout << "," << posting->positions[j];

        _cout << endl;
    }*/

    // write postings
    writePostings(postings, postingsLength, segment);

    // write norms of indexed fields
    writeNorms(segment);
    _CLDELETE_ARRAY(postings);
}

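// Copies the Posting pointers out of the hash table into a flat array and
// sorts them by term, so that writePostings() can emit the term dictionary in
// lexicographic order. The array is allocated here; addDocument() frees it
// with _CLDELETE_ARRAY (the Postings themselves stay owned by postingTable).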
void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize)
{
    // copy postingTable into an array
    arraySize = postingTable.size();
    array = _CL_NEWARRAY(Posting*, arraySize);
    PostingTableType::iterator postings = postingTable.begin();
    int32_t i = 0;
    while ( postings != postingTable.end() ){
        array[i] = (Posting*)postings->second;
        postings++;
        i++;
    }
    // sort the array
    quickSort(array, 0, i - 1);
}


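// invertDocument() walks every field of the document. For an indexed,
// untokenized field the whole value becomes a single posting; for a tokenized
// field the analyzer's TokenStream is consumed and one posting is added per
// token. Per-field length, position (including the analyzer's position
// increment gap) and character offset are accumulated, so several fields with
// the same name keep a consistent position/offset sequence, and the
// maxFieldLength truncation policy is applied while tokenizing.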
void DocumentWriter::invertDocument(const Document* doc)
{
    DocumentFieldEnumeration* fields = doc->fields();
    try {
        while (fields->hasMoreElements()) {
            Field* field = (Field*)fields->nextElement();
            const TCHAR* fieldName = field->name();
            const int32_t fieldNumber = fieldInfos->fieldNumber(fieldName);

            int32_t length = fieldLengths[fieldNumber];     // length of field
            int32_t position = fieldPositions[fieldNumber]; // position in field
            if (length > 0)
                position += analyzer->getPositionIncrementGap(fieldName);
            int32_t offset = fieldOffsets[fieldNumber];     // offset in field

            if (field->isIndexed()) {
                if (!field->isTokenized()) { // un-tokenized field
                    //FEATURE: this is a bug in Java Lucene: if a Reader is used,
                    //the field value will not be added. With CLucene, an untokenized
                    //field with a Reader will still be added (if it isn't stored;
                    //if it is stored, the Reader has already been read).
                    const TCHAR* charBuf = NULL;
                    int64_t dataLen = 0;

                    if (field->stringValue() == NULL && !field->isStored() ) {
                        CL_NS(util)::Reader* r = field->readerValue();
                        // this call tries to read the entire stream
                        // this may invalidate the string for further calls
                        // it may be better to do this via a FilterReader
                        // TODO: make a better implementation of this
                        dataLen = r->read(charBuf, LUCENE_INT32_MAX_SHOULDBE);
                        if (dataLen == -1)
                            dataLen = 0;
                        //TODO: it would be better to pass the string length, in case
                        //a null char is passed, but then the output would need testing too.
                    } else {
                        charBuf = field->stringValue();
                        dataLen = _tcslen(charBuf);
                    }

                    if (field->isStoreOffsetWithTermVector()){
                        TermVectorOffsetInfo tio;
                        tio.setStartOffset(offset);
                        tio.setEndOffset(offset + dataLen);
                        addPosition(fieldName, charBuf, position++, &tio );
                    } else
                        addPosition(fieldName, charBuf, position++, NULL);
                    offset += dataLen;
                    length++;
                } else { // field must be tokenized
                    CL_NS(util)::Reader* reader; // find or make Reader
                    bool delReader = false;
                    if (field->readerValue() != NULL) {
                        reader = field->readerValue();
                    } else if (field->stringValue() != NULL) {
                        reader = _CLNEW CL_NS(util)::StringReader(
                            field->stringValue(), _tcslen(field->stringValue()),
                            false);
                        delReader = true;
                    } else {
                        _CLTHROWA(CL_ERR_IO, "field must have either String or Reader value");
                    }

                    try {
                        // Tokenize field and add to postingTable.
                        CL_NS(analysis)::TokenStream* stream =
                            analyzer->tokenStream(fieldName, reader);

                        try {
                            CL_NS(analysis)::Token t;
                            int32_t lastTokenEndOffset = -1;
                            while (stream->next(&t)) {
                                position += (t.getPositionIncrement() - 1);

                                if (field->isStoreOffsetWithTermVector()){
                                    TermVectorOffsetInfo tio;
                                    tio.setStartOffset(offset + t.startOffset());
                                    tio.setEndOffset(offset + t.endOffset());
                                    addPosition(fieldName, t.termText(), position++, &tio);
                                } else
                                    addPosition(fieldName, t.termText(), position++, NULL);

                                lastTokenEndOffset = t.endOffset();
                                length++;
                                // Apply field truncation policy.
                                if (maxFieldLength != IndexWriter::FIELD_TRUNC_POLICY__WARN) {
                                    // The client programmer has explicitly authorized us to
                                    // truncate the token stream after maxFieldLength tokens.
                                    if ( length > maxFieldLength)
                                        break;
                                } else if (length > IndexWriter::DEFAULT_MAX_FIELD_LENGTH) {
                                    const TCHAR* errMsgBase =
                                        _T("Indexing a huge number of tokens from a single")
                                        _T(" field (\"%s\", in this case) can cause CLucene")
                                        _T(" to use memory excessively.")
                                        _T("  By default, CLucene will accept only %s tokens")
                                        _T(" from a single field before forcing the")
                                        _T(" client programmer to specify a threshold at")
                                        _T(" which to truncate the token stream.")
                                        _T("  You should set this threshold via")
                                        _T(" IndexWriter::setMaxFieldLength (set to LUCENE_INT32_MAX")
                                        _T(" to disable truncation, or to the maximum number of tokens to keep).");

                                    TCHAR defaultMaxAsChar[34];
                                    _i64tot(IndexWriter::DEFAULT_MAX_FIELD_LENGTH,
                                        defaultMaxAsChar, 10
                                        );
                                    int32_t errMsgLen = _tcslen(errMsgBase)
                                        + _tcslen(fieldName)
                                        + _tcslen(defaultMaxAsChar);
                                    TCHAR* errMsg = _CL_NEWARRAY(TCHAR, errMsgLen + 1);

                                    _sntprintf(errMsg, errMsgLen, errMsgBase, fieldName, defaultMaxAsChar);

                                    _CLTHROWT_DEL(CL_ERR_Runtime, errMsg);
                                }
                            } // while token->next

                            if (lastTokenEndOffset != -1 )
                                offset += lastTokenEndOffset + 1;
                        } _CLFINALLY (
                            stream->close();
                            _CLDELETE(stream);
                        );
                    } _CLFINALLY (
                        if (delReader) {
                            _CLDELETE(reader);
                        }
                    );
                } // if/else field is to be tokenized
                fieldLengths[fieldNumber] = length;     // save field length
                fieldPositions[fieldNumber] = position; // save field position
                fieldBoosts[fieldNumber] *= field->getBoost();
                fieldOffsets[fieldNumber] = offset;
            } // if field is to be indexed
        } // while more fields available
    } _CLFINALLY (
        _CLDELETE(fields);
    );
}

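// addPosition() records one occurrence of a term. The Posting's positions
// (and, when offsets are stored, its offsets) arrays are grown geometrically:
// whenever an array is exactly full its length is doubled with realloc, so
// the amortized cost per added position stays constant.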
void DocumentWriter::addPosition(const TCHAR* field, const TCHAR* text,
    const int32_t position, TermVectorOffsetInfo* offset)
{
    termBuffer->set(field, text, false);

    Posting* ti = postingTable.get(termBuffer);
    if (ti != NULL) { // word seen before
        int32_t freq = ti->freq;
        if (ti->positions.length == freq) {
            // positions array is full: double its size
            ti->positions.length = freq * 2;
            ti->positions.values = (int32_t*)realloc(ti->positions.values,
                ti->positions.length * sizeof(int32_t));
        }
        ti->positions.values[freq] = position; // add new position

        if (offset != NULL) {
            if (ti->offsets.length == freq){
                ti->offsets.length = freq * 2;
                ti->offsets.values = (TermVectorOffsetInfo*)realloc(ti->offsets.values,
                    ti->offsets.length * sizeof(TermVectorOffsetInfo));
            }
            ti->offsets.values[freq] = *offset;
        }

        ti->freq = freq + 1; // update frequency
    } else { // word not seen before
        Term* term = _CLNEW Term( field, text, false);
        postingTable.put(term, _CLNEW Posting(term, position, offset));
    }
}

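// quickSort() orders the postings by Term::compareTo. It uses a
// median-of-three pivot (lo, mid and hi are pre-sorted before partitioning),
// which avoids the quadratic worst case on already sorted input.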
//static
void DocumentWriter::quickSort(Posting**& postings, const int32_t lo, const int32_t hi)
{
    if (lo >= hi)
        return;

    int32_t mid = (lo + hi) / 2;

    if (postings[lo]->term->compareTo(postings[mid]->term) > 0) {
        Posting* tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
    }

    if (postings[mid]->term->compareTo(postings[hi]->term) > 0) {
        Posting* tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (postings[lo]->term->compareTo(postings[mid]->term) > 0) {
            Posting* tmp2 = postings[lo];
            postings[lo] = postings[mid];
            postings[mid] = tmp2;
        }
    }

    int32_t left = lo + 1;
    int32_t right = hi - 1;

    if (left >= right)
        return;

    const Term* partition = postings[mid]->term; // not kept, so no need to finalize

    for ( ;; ) {
        while (postings[right]->term->compareTo(partition) > 0)
            --right;

        while (left < right && postings[left]->term->compareTo(partition) <= 0)
            ++left;

        if (left < right) {
            Posting* tmp = postings[left];
            postings[left] = postings[right];
            postings[right] = tmp;
            --right;
        } else {
            break;
        }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
}

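// writePostings() emits the inverted index of this one-document segment: for
// every posting (in term order) it adds a TermInfo to the dictionary via
// TermInfosWriter, writes the document/frequency entry to "<segment>.frq"
// (freq==1 is folded into the document entry as an optimization) and the
// delta-encoded positions to "<segment>.prx". Term vectors are written through
// a TermVectorsWriter for the fields that request them. The __DOCLOSE macro
// below makes sure every output is closed even if one of them throws, and
// re-throws the first error encountered.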
void DocumentWriter::writePostings(Posting** postings,
    const int32_t postingsLength, const QString& segment)
{
    #define __DOCLOSE(obj)                                          \
        if (obj != NULL) {                                          \
            try {                                                   \
                obj->close();                                       \
                _CLDELETE(obj);                                     \
            } catch(CLuceneError &e) {                              \
                ierr = e.number();                                  \
                err = e.what();                                     \
            } catch(...) {                                          \
                err = "Unknown error while closing posting tables"; \
            }                                                       \
        }

    IndexOutput* freq = NULL;
    IndexOutput* prox = NULL;
    TermInfosWriter* tis = NULL;
    TermVectorsWriter* termVectorWriter = NULL;
    try {
        // open files for inverse index storage
        QString buf = Misc::segmentname(segment, QLatin1String(".frq"));
        freq = directory->createOutput(buf);

        buf = Misc::segmentname(segment, QLatin1String(".prx"));
        prox = directory->createOutput(buf);

        tis = _CLNEW TermInfosWriter(directory, segment, fieldInfos,
            termIndexInterval);
        TermInfo* ti = _CLNEW TermInfo();
        const TCHAR* currentField = NULL;
        for (int32_t i = 0; i < postingsLength; i++) {
            Posting* posting = postings[i];

            // add an entry to the dictionary with pointers to prox and freq files
            ti->set(1, freq->getFilePointer(), prox->getFilePointer(), -1);
            tis->add(posting->term, ti);

            // add an entry to the freq file
            int32_t postingFreq = posting->freq;
            if (postingFreq == 1)               // optimize freq=1
                freq->writeVInt(1);             // set low bit of doc num.
            else {
                freq->writeVInt(0);             // the document number
                freq->writeVInt(postingFreq);   // frequency in doc
            }

            int32_t lastPosition = 0;                       // write positions
            for (int32_t j = 0; j < postingFreq; ++j) {     // use delta-encoding
                prox->writeVInt(posting->positions.values[j] - lastPosition);
                lastPosition = posting->positions.values[j];
            }

            // check to see if we switched to a new field
            const TCHAR* termField = posting->term->field();
            if ( currentField == NULL || _tcscmp(currentField, termField) != 0 ) {
                //todo: can we do an intern'd check?
                // changing field - see if there is something to save
                currentField = termField;
                FieldInfo* fi = fieldInfos->fieldInfo(currentField);

                if (fi->storeTermVector) {
                    if (termVectorWriter == NULL) {
                        termVectorWriter = _CLNEW TermVectorsWriter(directory,
                            segment, fieldInfos);
                        termVectorWriter->openDocument();
                    }
                    termVectorWriter->openField(currentField);
                } else if (termVectorWriter != NULL) {
                    termVectorWriter->closeField();
                }
            }
            if (termVectorWriter != NULL && termVectorWriter->isFieldOpen()) {
                termVectorWriter->addTerm(posting->term->text(), postingFreq,
                    &posting->positions, &posting->offsets);
            }
        }
        if (termVectorWriter != NULL)
            termVectorWriter->closeDocument();
        _CLDELETE(ti);
    } _CLFINALLY (
        const char* err = NULL;
        int32_t ierr = 0;

        // make an effort to close all streams we can but remember and re-throw
        // the first exception encountered in this process
        __DOCLOSE(freq);
        __DOCLOSE(prox);
        __DOCLOSE(tis);
        __DOCLOSE(termVectorWriter);
        if (err != NULL)
            _CLTHROWA(ierr, err);
    );
}

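// writeNorms() writes one byte per indexed field that does not omit norms:
// the field's accumulated boost multiplied by Similarity::lengthNorm() for
// the field's token count, compressed to a single byte with
// Similarity::encodeNorm() and stored in "<segment>.f<fieldNumber>".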
void DocumentWriter::writeNorms(const QString& segment)
{
    for (int32_t n = 0; n < fieldInfos->size(); n++){
        FieldInfo* fi = fieldInfos->fieldInfo(n);
        if (fi->isIndexed && !fi->omitNorms) {
            qreal norm = fieldBoosts[n] * similarity->lengthNorm(
                fi->name, fieldLengths[n]);

            QString fn(segment + QLatin1String(".f%1"));
            IndexOutput* norms = directory->createOutput(fn.arg(n));
            try {
                norms->writeByte(CL_NS(search)::Similarity::encodeNorm(norm));
            } _CLFINALLY (
                norms->close();
                _CLDELETE(norms);
            )
        }
    }
}

CL_NS_END