1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #include "sphinx.h"
17 #include "sphinxutils.h"
18 #include "sphinxint.h"
19 #include "sphinxrt.h"
20 #include <time.h>
21 
22 #if USE_WINDOWS
23 #include <io.h> // for setmode(). open() on windows
24 #define sphSeek		_lseeki64
25 #else
26 #define sphSeek		lseek
27 #endif
28 
29 
StripStdin(const char * sIndexAttrs,const char * sRemoveElements)30 void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements )
31 {
32 	CSphString sError;
33 	CSphHTMLStripper tStripper ( true );
34 	if ( !tStripper.SetIndexedAttrs ( sIndexAttrs, sError )
35 		|| !tStripper.SetRemovedElements ( sRemoveElements, sError ) )
36 			sphDie ( "failed to configure stripper: %s", sError.cstr() );
37 
38 	CSphVector<BYTE> dBuffer;
39 	while ( !feof(stdin) )
40 	{
41 		char sBuffer[1024];
42 		int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
43 		if ( !iLen )
44 			break;
45 
46 		int iPos = dBuffer.GetLength();
47 		dBuffer.Resize ( iPos+iLen );
48 		memcpy ( &dBuffer[iPos], sBuffer, iLen );
49 	}
50 	dBuffer.Add ( 0 );
51 
52 	tStripper.Strip ( &dBuffer[0] );
53 	fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] );
54 }
55 
56 
ApplyMorphology(CSphIndex * pIndex)57 void ApplyMorphology ( CSphIndex * pIndex )
58 {
59 	CSphVector<BYTE> dInBuffer, dOutBuffer;
60 	const int READ_BUFFER_SIZE = 1024;
61 	dInBuffer.Reserve ( READ_BUFFER_SIZE );
62 	char sBuffer[READ_BUFFER_SIZE];
63 	while ( !feof(stdin) )
64 	{
65 		int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
66 		if ( !iLen )
67 			break;
68 
69 		int iPos = dInBuffer.GetLength();
70 		dInBuffer.Resize ( iPos+iLen );
71 		memcpy ( &dInBuffer[iPos], sBuffer, iLen );
72 	}
73 	dInBuffer.Add(0);
74 	dOutBuffer.Reserve ( dInBuffer.GetLength() );
75 
76 	CSphScopedPtr<ISphTokenizer> pTokenizer ( pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX ) );
77 	CSphDict * pDict = pIndex->GetDictionary();
78 	BYTE * sBufferToDump = &dInBuffer[0];
79 	if ( pTokenizer.Ptr() )
80 	{
81 		pTokenizer->SetBuffer ( &dInBuffer[0], dInBuffer.GetLength() );
82 		while ( BYTE * sToken = pTokenizer->GetToken() )
83 		{
84 			if ( pDict )
85 				pDict->ApplyStemmers ( sToken );
86 
87 			int iPos = dOutBuffer.GetLength();
88 			int iLen = strlen ( (char *)sToken );
89 			sToken[iLen] = ' ';
90 			dOutBuffer.Resize ( iPos+iLen+1 );
91 			memcpy ( &dOutBuffer[iPos], sToken, iLen+1 );
92 		}
93 
94 		if ( dOutBuffer.GetLength() )
95 			dOutBuffer[dOutBuffer.GetLength()-1] = 0;
96 		else
97 			dOutBuffer.Add(0);
98 
99 		sBufferToDump = &dOutBuffer[0];
100 	}
101 
102 	fprintf ( stdout, "dumping stemmed results...\n%s\n", sBufferToDump );
103 }
104 
105 
CharsetFold(CSphIndex * pIndex,FILE * fp)106 void CharsetFold ( CSphIndex * pIndex, FILE * fp )
107 {
108 	CSphVector<BYTE> sBuf1 ( 16384 );
109 	CSphVector<BYTE> sBuf2 ( 16384 );
110 
111 	CSphLowercaser tLC = pIndex->GetTokenizer()->GetLowercaser();
112 
113 #if USE_WINDOWS
114 	setmode ( fileno(stdout), O_BINARY );
115 #endif
116 
117 	int iBuf1 = 0; // how many leftover bytes from previous iteration
118 	while ( !feof(fp) )
119 	{
120 		int iGot = fread ( sBuf1.Begin()+iBuf1, 1, sBuf1.GetLength()-iBuf1, fp );
121 		if ( iGot<0 )
122 			sphDie ( "read error: %s", strerror(errno) );
123 
124 		if ( iGot==0 )
125 			if ( feof(fp) )
126 				if ( iBuf1==0 )
127 					break;
128 
129 
130 		const BYTE * pIn = sBuf1.Begin();
131 		const BYTE * pInMax = pIn + iBuf1 + iGot;
132 
133 		if ( pIn==pInMax && feof(fp) )
134 			break;
135 
136 		// tricky bit
137 		// on full buffer, and not an eof, terminate a bit early
138 		// to avoid codepoint vs buffer boundary issue
139 		if ( ( iBuf1+iGot )==sBuf1.GetLength() && iGot!=0 )
140 			pInMax -= 16;
141 
142 		// do folding
143 		BYTE * pOut = sBuf2.Begin();
144 		BYTE * pOutMax = pOut + sBuf2.GetLength() - 16;
145 		while ( pIn < pInMax )
146 		{
147 			int iCode = sphUTF8Decode ( pIn );
148 			if ( iCode==0 )
149 				pIn++; // decoder does not do that!
150 			assert ( iCode>=0 );
151 
152 			if ( iCode!=0x09 && iCode!=0x0A && iCode!=0x0D )
153 			{
154 				iCode = tLC.ToLower ( iCode ) & 0xffffffUL;
155 				if ( !iCode )
156 					iCode = 0x20;
157 			}
158 
159 			pOut += sphUTF8Encode ( pOut, iCode );
160 			if ( pOut>=pOutMax )
161 			{
162 				fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );
163 				pOut = sBuf2.Begin();
164 			}
165 		}
166 		fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );
167 
168 		// now move around leftovers
169 		BYTE * pRealEnd = sBuf1.Begin() + iBuf1 + iGot;
170 		if ( pIn < pRealEnd )
171 		{
172 			iBuf1 = pRealEnd - pIn;
173 			memmove ( sBuf1.Begin(), pIn, iBuf1 );
174 		}
175 	}
176 }
177 
178 //////////////////////////////////////////////////////////////////////////
179 
FixupFiles(const CSphVector<CSphString> & dFiles,CSphString & sError)180 bool FixupFiles ( const CSphVector<CSphString> & dFiles, CSphString & sError )
181 {
182 	ARRAY_FOREACH ( i, dFiles )
183 	{
184 		const CSphString & sPath = dFiles[i];
185 		CSphString sKlistOld, sKlistNew, sHeader;
186 		sKlistOld.SetSprintf ( "%s.spk", sPath.cstr() );
187 		sKlistNew.SetSprintf ( "%s.new.spk", sPath.cstr() );
188 		sHeader.SetSprintf ( "%s.sph", sPath.cstr() );
189 
190 		DWORD iCount = 0;
191 		{
192 			CSphAutoreader rdHeader, rdKlistNew, rdKlistOld;
193 			if ( !rdHeader.Open ( sHeader, sError ) || !rdKlistNew.Open ( sKlistNew, sError ) || !rdKlistOld.Open ( sKlistOld, sError ) )
194 				return false;
195 
196 			const SphOffset_t iSize = rdKlistNew.GetFilesize();
197 			iCount = (DWORD)( iSize / sizeof(SphAttr_t) );
198 		}
199 
200 		if ( ::unlink ( sKlistOld.cstr() )!=0 )
201 		{
202 			sError.SetSprintf ( "file: '%s', error: '%s'", sKlistOld.cstr(), strerror(errno) );
203 			return false;
204 		}
205 
206 		if ( ::rename ( sKlistNew.cstr(), sKlistOld.cstr() )!=0 )
207 		{
208 			sError.SetSprintf ( "files: '%s'->'%s', error: '%s'", sKlistNew.cstr(), sKlistOld.cstr(), strerror(errno) );
209 			return false;
210 		}
211 
212 		int iFD = ::open ( sHeader.cstr(), SPH_O_BINARY | O_RDWR, 0644 );
213 		if ( iFD<0 )
214 		{
215 			sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
216 			return false;
217 		}
218 
219 		if ( sphSeek ( iFD, -4, SEEK_END )==-1L )
220 		{
221 			sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
222 			SafeClose ( iFD );
223 			return false;
224 		}
225 
226 		if ( ::write ( iFD, &iCount, 4 )==-1 )
227 		{
228 			sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
229 			SafeClose ( iFD );
230 			return false;
231 		}
232 
233 		SafeClose ( iFD );
234 	}
235 
236 	return true;
237 }
238 
239 
DoKlistsOptimization(int iRowSize,const char * sPath,int iChunkCount,CSphVector<CSphString> & dFiles)240 bool DoKlistsOptimization ( int iRowSize, const char * sPath, int iChunkCount, CSphVector<CSphString> & dFiles )
241 {
242 	CSphTightVector<SphDocID_t> dLiveID;
243 
244 	CSphString sError;
245 
246 	for ( int iChunk=0; iChunk<iChunkCount; iChunk++ )
247 	{
248 		const int64_t tmStart = sphMicroTimer();
249 
250 		fprintf ( stdout, "\nprocessing '%s.%d'...", sPath, iChunk );
251 
252 		CSphString sKlist, sAttr, sNew;
253 		sKlist.SetSprintf ( "%s.%d.spk", sPath, iChunk );
254 		sAttr.SetSprintf ( "%s.%d.spa", sPath, iChunk );
255 		sNew.SetSprintf ( "%s.%d.new.spk", sPath, iChunk );
256 
257 		CSphAutoreader rdKList, rdAttr;
258 		CSphWriter wrNew;
259 		if ( !rdKList.Open ( sKlist, sError ) || !rdAttr.Open ( sAttr, sError ) || !wrNew.OpenFile ( sNew, sError ) )
260 		{
261 			fprintf ( stdout, "\n%s\n", sError.cstr() );
262 			return false;
263 		}
264 
265 		CSphTightVector<SphAttr_t> dKlist;
266 
267 		if ( dLiveID.GetLength()>0 )
268 		{
269 			assert ( rdKList.GetFilesize()<INT_MAX );
270 
271 			dKlist.Resize ( (int)( rdKList.GetFilesize()/sizeof(SphAttr_t) ) );
272 			rdKList.GetBytes ( dKlist.Begin(), (int)rdKList.GetFilesize() );
273 
274 			// 1nd step kill all k-list ids not in live ids
275 
276 			ARRAY_FOREACH ( i, dKlist )
277 			{
278 				SphDocID_t uid = (SphDocID_t)dKlist[i];
279 				SphDocID_t * pInLive = sphBinarySearch ( dLiveID.Begin(), &dLiveID.Last(), uid );
280 				if ( !pInLive )
281 					dKlist.RemoveFast ( i-- );
282 			}
283 			dKlist.Sort();
284 
285 			// 2nd step kill all prev ids by this fresh k-list
286 
287 			SphDocID_t * pFirstLive = dLiveID.Begin();
288 			SphDocID_t * pLastLive = &dLiveID.Last();
289 
290 			ARRAY_FOREACH ( i, dKlist )
291 			{
292 				SphDocID_t uID = (SphDocID_t)dKlist[i];
293 				SphDocID_t * pKilled = sphBinarySearch ( pFirstLive, pLastLive, uID );
294 
295 				assert ( pKilled );
296 				pFirstLive = pKilled+1;
297 				*pKilled = 0;
298 			}
299 
300 #ifndef NDEBUG
301 			const int iWasLive = dLiveID.GetLength();
302 #endif
303 
304 			if ( dKlist.GetLength()>0 )
305 				ARRAY_FOREACH ( i, dLiveID )
306 				if ( dLiveID[i]==0 )
307 					dLiveID.RemoveFast ( i-- );
308 
309 			assert ( dLiveID.GetLength()+dKlist.GetLength()==iWasLive );
310 
311 			dLiveID.Sort();
312 		}
313 
314 		// 3d step write new k-list
315 
316 		if ( dKlist.GetLength()>0 )
317 			wrNew.PutBytes ( dKlist.Begin(), dKlist.GetLength()*sizeof(SphAttr_t) );
318 
319 		dKlist.Reset();
320 		wrNew.CloseFile();
321 
322 		// 4th step merge ID from this segment into live ids
323 		if ( iChunk!=iChunkCount-1 )
324 		{
325 			const int iWasLive = Max ( dLiveID.GetLength()-1, 0 );
326 			const int iRowCount = (int)( rdAttr.GetFilesize() / ( (DOCINFO_IDSIZE+iRowSize)*4 ) );
327 
328 			for ( int i=0; i<iRowCount; i++ )
329 			{
330 				SphDocID_t uID = 0;
331 				rdAttr.GetBytes ( &uID, DOCINFO_IDSIZE*4 );
332 				rdAttr.SkipBytes ( iRowSize*4 );
333 
334 				if ( sphBinarySearch ( dLiveID.Begin(), dLiveID.Begin()+iWasLive, uID )==NULL )
335 					dLiveID.Add ( uID );
336 			}
337 
338 			dLiveID.Sort();
339 		}
340 
341 		CSphString & sFile = dFiles.Add();
342 		sFile.SetSprintf ( "%s.%d", sPath, iChunk );
343 
344 		const int64_t tmEnd = sphMicroTimer();
345 		fprintf ( stdout, "\rprocessed '%s.%d' in %.3f sec", sPath, iChunk, float(tmEnd-tmStart )/1000000.0f );
346 	}
347 
348 	return true;
349 }
350 
351 
352 #pragma pack(push,4)
353 struct IDFWord_t
354 {
355 	uint64_t	m_uWordID;
356 	DWORD		m_iDocs;
357 };
358 #pragma pack(pop)
359 STATIC_SIZE_ASSERT	( IDFWord_t, 12 );
360 
361 
BuildIDF(const CSphString & sFilename,const CSphVector<CSphString> & dFiles,CSphString & sError,bool bSkipUnique)362 bool BuildIDF ( const CSphString & sFilename, const CSphVector<CSphString> & dFiles, CSphString & sError, bool bSkipUnique )
363 {
364 	// text dictionaries are ordered alphabetically - we can use that fact while reading
365 	// to merge duplicates, calculate total number of occurrences and process bSkipUnique
366 	// this method is about 3x faster and consumes ~2x less memory than a hash based one
367 
368 	typedef char StringBuffer_t [ 3*SPH_MAX_WORD_LEN+16+128 ]; // { dict-keyowrd, 32bit number, 32bit number, 64bit number }
369 
370 	int64_t iTotalDocuments = 0;
371 	int64_t iTotalWords = 0;
372 	int64_t iReadWords = 0;
373 	int64_t iMergedWords = 0;
374 	int64_t iSkippedWords = 0;
375 	int64_t iReadBytes = 0;
376 	int64_t iTotalBytes = 0;
377 
378 	const int64_t tmStart = sphMicroTimer ();
379 
380 	int iFiles = dFiles.GetLength ();
381 
382 	CSphVector<CSphAutoreader> dReaders ( iFiles );
383 
384 	ARRAY_FOREACH ( i, dFiles )
385 	{
386 		if ( !dReaders[i].Open ( dFiles[i], sError ) )
387 			return false;
388 		iTotalBytes += dReaders[i].GetFilesize ();
389 	}
390 
391 	// internal state
392 	CSphFixedVector<StringBuffer_t> dWords ( iFiles );
393 	CSphVector<int> dDocs ( iFiles );
394 	CSphVector<bool> dFinished ( iFiles );
395 	dFinished.Fill ( false );
396 	bool bPreread = false;
397 
398 	// current entry
399 	StringBuffer_t sWord = {0};
400 	DWORD iDocs = 0;
401 
402 	// output vector, preallocate 10M
403 	CSphTightVector<IDFWord_t> dEntries;
404 	dEntries.Reserve ( 1024*1024*10 );
405 
406 	for ( int i=0;; )
407 	{
408 		// read next input
409 		for ( ;; )
410 		{
411 			int iLen;
412 			char * sBuffer = dWords[i];
413 			if ( ( iLen = dReaders[i].GetLine ( sBuffer, sizeof(StringBuffer_t) ) )>=0 )
414 			{
415 				iReadBytes += iLen;
416 
417 				// find keyword pattern ( ^<keyword>,<docs>,... )
418 				char * p1 = strchr ( sBuffer, ',' );
419 				if ( p1 )
420 				{
421 					char * p2 = strchr ( p1+1, ',' );
422 					if ( p2 )
423 					{
424 						*p1 = *p2 = '\0';
425 						int iDocuments = atoi ( p1+1 );
426 						if ( iDocuments )
427 						{
428 							dDocs[i] = iDocuments;
429 							iReadWords++;
430 							break;
431 						}
432 					}
433 				} else
434 				{
435 					// keyword pattern not found (rather rare case), try to parse as a header, then
436 					char sSearch[] = "total-documents: ";
437 					if ( strstr ( sBuffer, sSearch )==sBuffer )
438 						iTotalDocuments += atoi ( sBuffer+strlen(sSearch) );
439 				}
440 			} else
441 			{
442 				dFinished[i] = true;
443 				break;
444 			}
445 		}
446 
447 		bool bEnd = !dFinished.Contains ( false );
448 
449 		i++;
450 		if ( !bPreread && i==iFiles )
451 			bPreread = true;
452 
453 		if ( bPreread )
454 		{
455 			// find the next smallest input
456 			i = 0;
457 			for ( int j=0; j<iFiles; j++ )
458 				if ( !dFinished[j] && ( dFinished[i] || strcmp ( dWords[i], dWords[j] )>0 ) )
459 					i = j;
460 
461 			// merge if we got the same word
462 			if ( !strcmp ( sWord, dWords[i] ) && !bEnd )
463 			{
464 				iDocs += dDocs[i];
465 				iMergedWords++;
466 			} else
467 			{
468 				if ( sWord[0]!='\0' )
469 				{
470 					if ( !bSkipUnique || iDocs>1 )
471 					{
472 						IDFWord_t & tEntry = dEntries.Add ();
473 						tEntry.m_uWordID = sphFNV64 ( sWord );
474 						tEntry.m_iDocs = iDocs;
475 						iTotalWords++;
476 					} else
477 						iSkippedWords++;
478 				}
479 
480 				strncpy ( sWord, dWords[i], sizeof ( dWords[i] ) );
481 				iDocs = dDocs[i];
482 			}
483 		}
484 
485 		if ( ( iReadWords & 0xffff )==0 || bEnd )
486 			fprintf ( stderr, "read %.1f of %.1f MB, %.1f%% done%c", ( bEnd ? float(iTotalBytes) : float(iReadBytes) )/1000000.0f,
487 			float(iTotalBytes)/1000000.0f, bEnd ? 100.0f : float(iReadBytes)*100.0f/float(iTotalBytes), bEnd ? '\n' : '\r' );
488 
489 		if ( bEnd )
490 			break;
491 	}
492 
493 	fprintf ( stdout, INT64_FMT" documents, " INT64_FMT " words (" INT64_FMT " read, " INT64_FMT " merged, " INT64_FMT " skipped)\n",
494 		iTotalDocuments, iTotalWords, iReadWords, iMergedWords, iSkippedWords );
495 
496 	// write to disk
497 	fprintf ( stdout, "writing %s (%1.fM)...\n", sFilename.cstr(), float(iTotalWords*sizeof(IDFWord_t))/1000000.0f );
498 
499 	dEntries.Sort ( bind ( &IDFWord_t::m_uWordID ) );
500 
501 	CSphWriter tWriter;
502 	if ( !tWriter.OpenFile ( sFilename, sError ) )
503 		return false;
504 
505 	// write file header
506 	tWriter.PutOffset ( iTotalDocuments );
507 
508 	// write data
509 	tWriter.PutBytes ( dEntries.Begin(), dEntries.GetLength()*sizeof(IDFWord_t) );
510 
511 	int tmWallMsec = (int)( ( sphMicroTimer() - tmStart )/1000 );
512 	fprintf ( stdout, "finished in %d.%d sec\n", tmWallMsec/1000, (tmWallMsec/100)%10 );
513 
514 	return true;
515 }
516 
517 
MergeIDF(const CSphString & sFilename,const CSphVector<CSphString> & dFiles,CSphString & sError,bool bSkipUnique)518 bool MergeIDF ( const CSphString & sFilename, const CSphVector<CSphString> & dFiles, CSphString & sError, bool bSkipUnique )
519 {
520 	// binary dictionaries are ordered by 64-bit word id, we can use that for merging.
521 	// read every file, check repeating word ids, merge if found, write to disk if not
522 	// memory requirements are about ~4KB per input file (used for buffered reading)
523 
524 	int64_t iTotalDocuments = 0;
525 	int64_t iTotalWords = 0;
526 	int64_t iReadWords = 0;
527 	int64_t iMergedWords = 0;
528 	int64_t iSkippedWords = 0;
529 	int64_t iReadBytes = 0;
530 	int64_t iTotalBytes = 0;
531 
532 	const int64_t tmStart = sphMicroTimer ();
533 
534 	int iFiles = dFiles.GetLength ();
535 
536 	// internal state
537 	CSphVector<CSphAutoreader> dReaders ( iFiles );
538 	CSphVector<IDFWord_t> dWords ( iFiles );
539 	CSphVector<int64_t> dRead ( iFiles );
540 	CSphVector<int64_t> dSize ( iFiles );
541 	CSphVector<BYTE*> dBuffers ( iFiles );
542 	CSphVector<bool> dFinished ( iFiles );
543 	dFinished.Fill ( false );
544 	bool bPreread = false;
545 
546 	// current entry
547 	IDFWord_t tWord;
548 	tWord.m_uWordID = 0;
549 	tWord.m_iDocs = 0;
550 
551 	// preread buffer
552 	const int iEntrySize = sizeof(int64_t)+sizeof(DWORD);
553 	const int iBufferSize = iEntrySize*256;
554 
555 	// initialize vectors
556 	ARRAY_FOREACH ( i, dFiles )
557 	{
558 		if ( !dReaders[i].Open ( dFiles[i], sError ) )
559 			return false;
560 		iTotalDocuments += dReaders[i].GetOffset ();
561 		dRead[i] = 0;
562 		dSize[i] = dReaders[i].GetFilesize() - sizeof( SphOffset_t );
563 		dBuffers[i] = new BYTE [ iBufferSize ];
564 		iTotalBytes += dSize[i];
565 	}
566 
567 	// open output file
568 	CSphWriter tWriter;
569 	if ( !tWriter.OpenFile ( sFilename, sError ) )
570 		return false;
571 
572 	// write file header
573 	tWriter.PutOffset ( iTotalDocuments );
574 
575 	for ( int i=0;; )
576 	{
577 		if ( dRead[i]<dSize[i] )
578 		{
579 			iReadBytes += iEntrySize;
580 
581 			// This part basically does the following:
582 			// dWords[i].m_uWordID = dReaders[i].GetOffset ();
583 			// dWords[i].m_iDocs = dReaders[i].GetDword ();
584 			// but reading by 12 bytes seems quite slow (SetBuffers doesn't help)
585 			// the only way to speed it up is to buffer up a few entries manually
586 
587 			int iOffset = (int)( dRead[i] % iBufferSize );
588 			if ( iOffset==0 )
589 				dReaders[i].GetBytes ( dBuffers[i], ( dSize[i]-dRead[i] )<iBufferSize ? (int)( dSize[i]-dRead[i] ) : iBufferSize );
590 
591 			dWords[i].m_uWordID = *(uint64_t*)( dBuffers[i]+iOffset );
592 			dWords[i].m_iDocs = *(DWORD*)( dBuffers[i]+iOffset+sizeof(uint64_t) );
593 
594 			dRead[i] += iEntrySize;
595 			iReadWords++;
596 		} else
597 			dFinished[i] = true;
598 
599 		bool bEnd = !dFinished.Contains ( false );
600 
601 		i++;
602 		if ( !bPreread && i==iFiles )
603 			bPreread = true;
604 
605 		if ( bPreread )
606 		{
607 			// find the next smallest input
608 			i = 0;
609 			for ( int j=0; j<iFiles; j++ )
610 				if ( !dFinished[j] && ( dFinished[i] || dWords[i].m_uWordID>dWords[j].m_uWordID ) )
611 					i = j;
612 
613 			// merge if we got the same word
614 			if ( tWord.m_uWordID==dWords[i].m_uWordID && !bEnd )
615 			{
616 				tWord.m_iDocs += dWords[i].m_iDocs;
617 				iMergedWords++;
618 			} else
619 			{
620 				if ( tWord.m_uWordID )
621 				{
622 					if ( !bSkipUnique || tWord.m_iDocs>1 )
623 					{
624 						tWriter.PutOffset ( tWord.m_uWordID );
625 						tWriter.PutDword ( tWord.m_iDocs );
626 						iTotalWords++;
627 					} else
628 						iSkippedWords++;
629 				}
630 
631 				tWord = dWords[i];
632 			}
633 		}
634 
635 		if ( ( iReadWords & 0xffff )==0 || bEnd )
636 			fprintf ( stderr, "read %.1f of %.1f MB, %.1f%% done%c", ( bEnd ? float(iTotalBytes) : float(iReadBytes) )/1000000.0f,
637 			float(iTotalBytes)/1000000.0f, bEnd ? 100.0f : float(iReadBytes)*100.0f/float(iTotalBytes), bEnd ? '\n' : '\r' );
638 
639 		if ( bEnd )
640 			break;
641 	}
642 
643 	ARRAY_FOREACH ( i, dFiles )
644 		SafeDeleteArray ( dBuffers[i] );
645 
646 	fprintf ( stdout, INT64_FMT" documents, " INT64_FMT " words (" INT64_FMT " read, " INT64_FMT " merged, " INT64_FMT " skipped)\n",
647 		iTotalDocuments, iTotalWords, iReadWords, iMergedWords, iSkippedWords );
648 
649 	int tmWallMsec = (int)( ( sphMicroTimer() - tmStart )/1000 );
650 	fprintf ( stdout, "finished in %d.%d sec\n", tmWallMsec/1000, (tmWallMsec/100)%10 );
651 
652 	return true;
653 }
654 
655 
OptimizeRtKlists(const CSphString & sIndex,const CSphConfig & hConf)656 void OptimizeRtKlists ( const CSphString & sIndex, const CSphConfig & hConf )
657 {
658 	const int64_t tmStart = sphMicroTimer();
659 
660 	int iDone = 0;
661 	CSphVector<CSphString> dFiles;
662 
663 	hConf["index"].IterateStart ();
664 	while ( hConf["index"].IterateNext () )
665 	{
666 		CSphString sError;
667 
668 		const CSphConfigSection & hIndex = hConf["index"].IterateGet ();
669 		const char * sIndexName = hConf["index"].IterateGetKey().cstr();
670 
671 		if ( !hIndex("type") || hIndex["type"]!="rt" )
672 			continue;
673 
674 		if ( !sIndex.IsEmpty() && sIndex!=sIndexName )
675 			continue;
676 
677 		if ( !hIndex.Exists ( "path" ) )
678 		{
679 			fprintf ( stdout, "key 'path' not found in index '%s' - skiped\n", sIndexName );
680 			continue;
681 		}
682 
683 		const int64_t tmIndexStart = sphMicroTimer();
684 
685 		CSphSchema tSchema ( sIndexName );
686 		CSphColumnInfo tCol;
687 
688 		// fields
689 		for ( CSphVariant * v=hIndex("rt_field"); v; v=v->m_pNext )
690 		{
691 			tCol.m_sName = v->cstr();
692 			tSchema.m_dFields.Add ( tCol );
693 		}
694 		if ( !tSchema.m_dFields.GetLength() )
695 		{
696 			fprintf ( stdout, "index '%s': no fields configured (use rt_field directive) - skiped\n", sIndexName );
697 			continue;
698 		}
699 
700 		// attrs
701 		const int iNumTypes = 5;
702 		const char * sTypes[iNumTypes] = { "rt_attr_uint", "rt_attr_bigint", "rt_attr_float", "rt_attr_timestamp", "rt_attr_string" };
703 		const ESphAttr iTypes[iNumTypes] = { SPH_ATTR_INTEGER, SPH_ATTR_BIGINT, SPH_ATTR_FLOAT, SPH_ATTR_TIMESTAMP, SPH_ATTR_STRING };
704 
705 		for ( int iType=0; iType<iNumTypes; iType++ )
706 		{
707 			for ( CSphVariant * v = hIndex ( sTypes[iType] ); v; v = v->m_pNext )
708 			{
709 				tCol.m_sName = v->cstr();
710 				tCol.m_eAttrType = iTypes[iType];
711 				tSchema.AddAttr ( tCol, false );
712 			}
713 		}
714 
715 		const char * sPath = hIndex["path"].cstr();
716 
717 		CSphString sMeta;
718 		sMeta.SetSprintf ( "%s.meta", sPath );
719 		CSphAutoreader rdMeta;
720 		if ( !rdMeta.Open ( sMeta.cstr(), sError ) )
721 		{
722 			fprintf ( stdout, "%s\n", sError.cstr() );
723 			continue;
724 		}
725 
726 		rdMeta.SeekTo ( 8, 4 );
727 		const int iDiskCunkCount = rdMeta.GetDword();
728 
729 		if ( !DoKlistsOptimization ( tSchema.GetRowSize(), sPath, iDiskCunkCount, dFiles ) )
730 			sphDie ( "can't cook k-list '%s'", sPath );
731 
732 		const int64_t tmIndexDone = sphMicroTimer();
733 		fprintf ( stdout, "\nindex '%s' done in %.3f sec\n", sIndexName, float(tmIndexDone-tmIndexStart )/1000000.0f );
734 		iDone++;
735 	}
736 
737 	const int64_t tmIndexesDone = sphMicroTimer();
738 	fprintf ( stdout, "\ntotal processed=%d in %.3f sec\n", iDone, float(tmIndexesDone-tmStart )/1000000.0f );
739 
740 	CSphString sError("none");
741 	if ( !FixupFiles ( dFiles, sError ) )
742 		fprintf ( stdout, "error during files fixup: %s\n", sError.cstr() );
743 
744 	const int64_t tmDone = sphMicroTimer();
745 	fprintf ( stdout, "\nfinished in %.3f sec\n", float(tmDone-tmStart )/1000000.0f );
746 }
747 
748 //////////////////////////////////////////////////////////////////////////
749 
750 extern void sphDictBuildInfixes ( const char * sPath );
751 extern void sphDictBuildSkiplists ( const char * sPath );
752 
753 
main(int argc,char ** argv)754 int main ( int argc, char ** argv )
755 {
756 	if ( argc<=1 )
757 	{
758 		fprintf ( stdout, SPHINX_BANNER );
759 		fprintf ( stdout,
760 			"Usage: indextool <COMMAND> [OPTIONS]\n"
761 			"\n"
762 			"Commands are:\n"
763 			"--build-infixes <INDEX>\tbuild infixes for an existing dict=keywords index\n"
764 			"\t\t\t(upgrades .sph, .spi in place)\n"
765 			"--build-skips <INDEX>\tbuild skiplists for an existing index (builds .spe and\n"
766 			"\t\t\tupgrades .sph, .spi in place)\n"
767 			"--check <INDEX>\t\tperform index consistency check\n"
768 			"--checkconfig\t\tperform config consistency check\n"
769 			"--dumpconfig <SPH-FILE>\tdump index header in config format by file name\n"
770 			"--dumpdocids <INDEX>\tdump docids by index name\n"
771 			"--dumpdict <SPI-FILE>\tdump dictionary by file name\n"
772 			"--dumpdict <INDEX>\tdump dictionary\n"
773 			"--dumpheader <SPH-FILE>\tdump index header by file name\n"
774 			"--dumpheader <INDEX>\tdump index header by index name\n"
775 			"--dumphitlist <INDEX> <KEYWORD>\n"
776 			"--dumphitlist <INDEX> --wordid <ID>\n"
777 			"\t\t\tdump hits for a given keyword\n"
778 			"--fold <INDEX> [FILE]\tfold FILE or stdin using INDEX charset_table\n"
779 			"--htmlstrip <INDEX>\tfilter stdin using index HTML stripper settings\n"
780 			"--optimize-rt-klists <INDEX>\n"
781 			"\t\t\toptimize kill list memory use in RT index disk chunks;\n"
782 			"\t\t\teither for a given index or --all\n"
783 			"--buildidf <INDEX1.dict> [INDEX2.dict ...] [--skip-uniq] --out <GLOBAL.idf>\n"
784 			"\t\t\tjoin --stats dictionary dumps into global.idf file\n"
785 			"--mergeidf <NODE1.idf> [NODE2.idf ...] [--skip-uniq] --out <GLOBAL.idf>\n"
786 			"\t\t\tmerge several .idf files into one file\n"
787 			"\n"
788 			"Options are:\n"
789 			"-c, --config <file>\tuse given config file instead of defaults\n"
790 			"-q, --quiet\t\tbe quiet, skip banner etc (useful with --fold etc)\n"
791 			"--strip-path\t\tstrip path from filenames referenced by index\n"
792 			"\t\t\t(eg. stopwords, exceptions, etc)\n"
793 			"--stats\t\t\tshow total statistics in the dictionary dump\n"
794 			"--skip-uniq\t\tskip unique (df=1) words in the .idf files\n"
795 		);
796 		exit ( 0 );
797 	}
798 
799 	//////////////////////
800 	// parse command line
801 	//////////////////////
802 
803 	#define OPT(_a1,_a2)	else if ( !strcmp(argv[i],_a1) || !strcmp(argv[i],_a2) )
804 	#define OPT1(_a1)		else if ( !strcmp(argv[i],_a1) )
805 
806 	const char * sOptConfig = NULL;
807 	CSphString sDumpHeader, sIndex, sKeyword, sFoldFile;
808 	bool bWordid = false;
809 	bool bStripPath = false;
810 	CSphVector<CSphString> dFiles;
811 	CSphString sOut;
812 	bool bStats = false;
813 	bool bSkipUnique = false;
814 	CSphString sDumpDict;
815 	bool bQuiet = false;
816 	bool bRotate = false;
817 
818 	enum
819 	{
820 		CMD_NOTHING,
821 		CMD_DUMPHEADER,
822 		CMD_DUMPCONFIG,
823 		CMD_DUMPDOCIDS,
824 		CMD_DUMPHITLIST,
825 		CMD_DUMPDICT,
826 		CMD_CHECK,
827 		CMD_STRIP,
828 		CMD_OPTIMIZEKLISTS,
829 		CMD_BUILDINFIXES,
830 		CMD_MORPH,
831 		CMD_BUILDSKIPS,
832 		CMD_BUILDIDF,
833 		CMD_MERGEIDF,
834 		CMD_CHECKCONFIG,
835 		CMD_FOLD
836 	} eCommand = CMD_NOTHING;
837 
838 	int i;
839 	for ( i=1; i<argc; i++ )
840 	{
841 		// handle argless options
842 		if ( argv[i][0]!='-' ) break;
843 		OPT ( "-q", "--quiet" )		{ bQuiet = true; continue; }
844 		OPT1 ( "--strip-path" )		{ bStripPath = true; continue; }
845 
846 		// handle options/commands with 1+ args
847 		if ( (i+1)>=argc )			break;
848 		OPT ( "-c", "--config" )	sOptConfig = argv[++i];
849 		OPT1 ( "--dumpheader" )		{ eCommand = CMD_DUMPHEADER; sDumpHeader = argv[++i]; }
850 		OPT1 ( "--dumpconfig" )		{ eCommand = CMD_DUMPCONFIG; sDumpHeader = argv[++i]; }
851 		OPT1 ( "--dumpdocids" )		{ eCommand = CMD_DUMPDOCIDS; sIndex = argv[++i]; }
852 		OPT1 ( "--check" )			{ eCommand = CMD_CHECK; sIndex = argv[++i]; }
853 		OPT1 ( "--rotate" )			{ bRotate = true; }
854 		OPT1 ( "--htmlstrip" )		{ eCommand = CMD_STRIP; sIndex = argv[++i]; }
855 		OPT1 ( "--build-infixes" )	{ eCommand = CMD_BUILDINFIXES; sIndex = argv[++i]; }
856 		OPT1 ( "--build-skips" )	{ eCommand = CMD_BUILDSKIPS; sIndex = argv[++i]; }
857 		OPT1 ( "--morph" )			{ eCommand = CMD_MORPH; sIndex = argv[++i]; }
858 		OPT1 ( "--checkconfig" )	{ eCommand = CMD_CHECKCONFIG; }
859 		OPT1 ( "--optimize-rt-klists" )
860 		{
861 			eCommand = CMD_OPTIMIZEKLISTS;
862 			sIndex = argv[++i];
863 			if ( sIndex=="--all" )
864 				sIndex = "";
865 		}
866 		OPT1 ( "--dumpdict" )
867 		{
868 			eCommand = CMD_DUMPDICT;
869 			sDumpDict = argv[++i];
870 			if ( (i+1)<argc && !strcmp ( argv[i+1], "--stats" ) )
871 			{
872 				bStats = true;
873 				i++;
874 			}
875 		}
876 		OPT1 ( "--fold" )
877 		{
878 			eCommand = CMD_FOLD;
879 			sIndex = argv[++i];
880 			if ( (i+1)<argc && argv[i+1][0]!='-' )
881 				sFoldFile = argv[++i];
882 		}
883 
884 		// options with 2 args
885 		else if ( (i+2)>=argc ) // NOLINT
886 		{
887 			// not enough args
888 			break;
889 
890 		} else if ( !strcmp ( argv[i], "--dumphitlist" ) )
891 		{
892 			eCommand = CMD_DUMPHITLIST;
893 			sIndex = argv[++i];
894 
895 			if ( !strcmp ( argv[i+1], "--wordid" ) )
896 			{
897 				if ( (i+3)<argc )
898 					break; // not enough args
899 				bWordid = true;
900 				i++;
901 			}
902 
903 			sKeyword = argv[++i];
904 
905 		} else if ( !strcmp ( argv[i], "--buildidf" ) || !strcmp ( argv[i], "--mergeidf" ) )
906 		{
907 			eCommand = !strcmp ( argv[i], "--buildidf" ) ? CMD_BUILDIDF : CMD_MERGEIDF;
908 			while ( ++i<argc )
909 			{
910 				if ( !strcmp ( argv[i], "--out" ) )
911 				{
912 					if ( (i+1)>=argc )
913 						break; // too few args
914 					sOut = argv[++i];
915 
916 				} else if ( !strcmp ( argv[i], "--skip-uniq" ) )
917 				{
918 					bSkipUnique = true;
919 
920 				} else if ( argv[i][0]=='-' )
921 				{
922 					break; // unknown switch
923 
924 				} else
925 				{
926 					dFiles.Add ( argv[i] ); // handle everything else as a file name
927 				}
928 			}
929 			break;
930 
931 		} else
932 		{
933 			// unknown option
934 			break;
935 		}
936 	}
937 
938 	if ( !bQuiet )
939 		fprintf ( stdout, SPHINX_BANNER );
940 
941 	if ( i!=argc )
942 	{
943 		fprintf ( stdout, "ERROR: malformed or unknown option near '%s'.\n", argv[i] );
944 		return 1;
945 	}
946 
947 	//////////////////////
948 	// load proper config
949 	//////////////////////
950 
951 	CSphString sError;
952 	if ( !sphInitCharsetAliasTable ( sError ) )
953 		sphDie ( "failed to init charset alias table: %s", sError.cstr() );
954 
955 	CSphConfigParser cp;
956 	CSphConfig & hConf = cp.m_tConf;
957 	for ( ;; )
958 	{
959 		if ( eCommand==CMD_BUILDIDF || eCommand==CMD_MERGEIDF )
960 			break;
961 
962 		if ( eCommand==CMD_DUMPDICT && !sDumpDict.Ends ( ".spi" ) )
963 				sIndex = sDumpDict;
964 
965 		sphLoadConfig ( sOptConfig, bQuiet, cp );
966 		break;
967 	}
968 
969 	///////////
970 	// action!
971 	///////////
972 	int iMvaDefault = 1048576;
973 	if ( hConf.Exists ( "searchd" ) && hConf["searchd"].Exists ( "searchd" ) )
974 	{
975 		const CSphConfigSection & hSearchd = hConf["searchd"]["searchd"];
976 		iMvaDefault = hSearchd.GetSize ( "mva_updates_pool", iMvaDefault );
977 	}
978 	const char * sArenaError = sphArenaInit ( iMvaDefault );
979 	if ( sArenaError )
980 		sphWarning ( "process shared mutex unsupported, persist MVA disabled ( %s )", sArenaError );
981 
982 
983 	if ( eCommand==CMD_CHECKCONFIG )
984 	{
985 		fprintf ( stdout, "config valid\nchecking index(es) ... " );
986 
987 		bool bError = false;
988 		// config parser made sure that index(es) present
989 		const CSphConfigType & hIndexes = hConf ["index"];
990 
991 		hIndexes.IterateStart();
992 		while ( hIndexes.IterateNext() )
993 		{
994 			const CSphConfigSection & tIndex = hIndexes.IterateGet();
995 			const CSphVariant * pPath = tIndex ( "path" );
996 			if ( !pPath )
997 				continue;
998 
999 			const CSphVariant * pType = tIndex ( "type" );
1000 			if ( pType && ( *pType=="rt" || *pType=="distributed" ) )
1001 				continue;
1002 
1003 			// checking index presence by sph file available
1004 			CSphString sHeader, sError;
1005 			sHeader.SetSprintf ( "%s.sph", pPath->cstr() );
1006 			CSphAutoreader rdHeader;
1007 			if ( !rdHeader.Open ( sHeader, sError ) )
1008 			{
1009 				// nice looking output
1010 				if ( !bError )
1011 					fprintf ( stdout, "\nmissed index(es): '%s'", hIndexes.IterateGetKey().cstr() );
1012 				else
1013 					fprintf ( stdout, ", '%s'", hIndexes.IterateGetKey().cstr() );
1014 
1015 				bError = true;
1016 			}
1017 		}
1018 		if ( !bError )
1019 		{
1020 			fprintf ( stdout, "ok\n" );
1021 			exit ( 0 );
1022 		} else
1023 		{
1024 			fprintf ( stdout, "\n" );
1025 			exit ( 1 );
1026 		}
1027 	}
1028 
1029 	// configure common settings (as of time of this writing, AOT and RLP setup)
1030 	sphConfigureCommon ( hConf );
1031 
1032 	// common part for several commands, check and preload index
1033 	CSphIndex * pIndex = NULL;
1034 	while ( !sIndex.IsEmpty() && eCommand!=CMD_OPTIMIZEKLISTS )
1035 	{
1036 		// check config
1037 		if ( !hConf["index"](sIndex) )
1038 			sphDie ( "index '%s': no such index in config\n", sIndex.cstr() );
1039 
1040 		// only need config-level settings for --htmlstrip
1041 		if ( eCommand==CMD_STRIP )
1042 			break;
1043 
1044 		if ( !hConf["index"][sIndex]("path") )
1045 			sphDie ( "index '%s': missing 'path' in config'\n", sIndex.cstr() );
1046 
1047 		// only need path for --build-infixes, it will access the files directly
1048 		if ( eCommand==CMD_BUILDINFIXES )
1049 			break;
1050 
1051 		// preload that index
1052 		CSphString sError;
1053 		bool bDictKeywords = true;
1054 		if ( hConf["index"][sIndex].Exists ( "dict" ) )
1055 			bDictKeywords = ( hConf["index"][sIndex]["dict"]!="crc" );
1056 
1057 		if ( hConf["index"][sIndex]("type") && hConf["index"][sIndex]["type"]=="rt" )
1058 		{
1059 			CSphSchema tSchema;
1060 			if ( sphRTSchemaConfigure ( hConf["index"][sIndex], &tSchema, &sError ) )
1061 				pIndex = sphCreateIndexRT ( tSchema, sIndex.cstr(), 32*1024*1024, hConf["index"][sIndex]["path"].cstr(), bDictKeywords );
1062 		} else
1063 		{
1064 			const char * sPath = hConf["index"][sIndex]["path"].cstr();
1065 			CSphStringBuilder tPath;
1066 			if ( bRotate )
1067 			{
1068 				tPath.Appendf ( "%s.tmp", sPath );
1069 				sPath = tPath.cstr();
1070 			}
1071 			pIndex = sphCreateIndexPhrase ( sIndex.cstr(), sPath );
1072 		}
1073 
1074 		if ( !pIndex )
1075 			sphDie ( "index '%s': failed to create (%s)", sIndex.cstr(), sError.cstr() );
1076 
1077 		if ( eCommand==CMD_CHECK )
1078 			pIndex->SetDebugCheck();
1079 
1080 		CSphString sWarn;
1081 		if ( !pIndex->Prealloc ( false, bStripPath, sWarn ) )
1082 			sphDie ( "index '%s': prealloc failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1083 
1084 		if ( eCommand==CMD_MORPH )
1085 			break;
1086 
1087 		if ( !pIndex->Preread() )
1088 			sphDie ( "index '%s': preread failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1089 
1090 		if ( hConf["index"][sIndex]("hitless_words") )
1091 		{
1092 			CSphIndexSettings tSettings = pIndex->GetSettings();
1093 
1094 			const CSphString & sValue = hConf["index"][sIndex]["hitless_words"].strval();
1095 			if ( sValue=="all" )
1096 			{
1097 				tSettings.m_eHitless = SPH_HITLESS_ALL;
1098 			} else
1099 			{
1100 				tSettings.m_eHitless = SPH_HITLESS_SOME;
1101 				tSettings.m_sHitlessFiles = sValue;
1102 			}
1103 
1104 			pIndex->Setup ( tSettings );
1105 		}
1106 
1107 		break;
1108 	}
1109 
1110 	int iCheckErrno = 0;
1111 	CSphString sNewIndex;
1112 
1113 	// do the dew
1114 	switch ( eCommand )
1115 	{
1116 		case CMD_NOTHING:
1117 			sphDie ( "nothing to do; specify a command (run indextool w/o switches for help)" );
1118 
1119 		case CMD_DUMPHEADER:
1120 		case CMD_DUMPCONFIG:
1121 		{
1122 			CSphString sIndexName = "(none)";
1123 			if ( hConf("index") && hConf["index"](sDumpHeader) )
1124 			{
1125 				fprintf ( stdout, "dumping header for index '%s'...\n", sDumpHeader.cstr() );
1126 
1127 				if ( !hConf["index"][sDumpHeader]("path") )
1128 					sphDie ( "missing 'path' for index '%s'\n", sDumpHeader.cstr() );
1129 
1130 				sIndexName = sDumpHeader;
1131 				sDumpHeader.SetSprintf ( "%s.sph", hConf["index"][sDumpHeader]["path"].cstr() );
1132 			} else
1133 				fprintf ( stdout, "dumping header file '%s'...\n", sDumpHeader.cstr() );
1134 
1135 			pIndex = sphCreateIndexPhrase ( sIndexName.cstr(), "" );
1136 			pIndex->DebugDumpHeader ( stdout, sDumpHeader.cstr(), eCommand==CMD_DUMPCONFIG );
1137 			break;
1138 		}
1139 
1140 		case CMD_DUMPDOCIDS:
1141 			fprintf ( stdout, "dumping docids for index '%s'...\n", sIndex.cstr() );
1142 			pIndex->DebugDumpDocids ( stdout );
1143 			break;
1144 
1145 		case CMD_DUMPHITLIST:
1146 			fprintf ( stdout, "dumping hitlist for index '%s' keyword '%s'...\n", sIndex.cstr(), sKeyword.cstr() );
1147 			pIndex->DebugDumpHitlist ( stdout, sKeyword.cstr(), bWordid );
1148 			break;
1149 
1150 		case CMD_DUMPDICT:
1151 		{
1152 			if ( sDumpDict.Ends ( ".spi" ) )
1153 			{
1154 				fprintf ( stdout, "dumping dictionary file '%s'...\n", sDumpDict.cstr() );
1155 
1156 				sIndex = sDumpDict.SubString ( 0, sDumpDict.Length()-4 );
1157 				pIndex = sphCreateIndexPhrase ( sIndex.cstr(), sIndex.cstr() );
1158 
1159 				CSphString sError;
1160 				if ( !pIndex )
1161 					sphDie ( "index '%s': failed to create (%s)", sIndex.cstr(), sError.cstr() );
1162 
1163 				CSphString sWarn;
1164 				if ( !pIndex->Prealloc ( false, bStripPath, sWarn ) )
1165 					sphDie ( "index '%s': prealloc failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1166 
1167 				if ( !pIndex->Preread() )
1168 					sphDie ( "index '%s': preread failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1169 			} else
1170 				fprintf ( stdout, "dumping dictionary for index '%s'...\n", sIndex.cstr() );
1171 
1172 			if ( bStats )
1173 				fprintf ( stdout, "total-documents: " INT64_FMT "\n", pIndex->GetStats().m_iTotalDocuments );
1174 			pIndex->DebugDumpDict ( stdout );
1175 			break;
1176 		}
1177 
1178 		case CMD_CHECK:
1179 			fprintf ( stdout, "checking index '%s'...\n", sIndex.cstr() );
1180 			iCheckErrno = pIndex->DebugCheck ( stdout );
1181 			if ( iCheckErrno )
1182 				return iCheckErrno;
1183 			if ( bRotate )
1184 			{
1185 				pIndex->Dealloc();
1186 				sNewIndex.SetSprintf ( "%s.new", hConf["index"][sIndex]["path"].cstr() );
1187 				if ( !pIndex->Rename ( sNewIndex.cstr() ) )
1188 					sphDie ( "index '%s': rotate failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1189 			}
1190 			return 0;
1191 
1192 		case CMD_STRIP:
1193 			{
1194 				const CSphConfigSection & hIndex = hConf["index"][sIndex];
1195 				if ( hIndex.GetInt ( "html_strip" )==0 )
1196 					sphDie ( "HTML stripping is not enabled in index '%s'", sIndex.cstr() );
1197 				StripStdin ( hIndex.GetStr ( "html_index_attrs" ), hIndex.GetStr ( "html_remove_elements" ) );
1198 			}
1199 			break;
1200 
1201 		case CMD_OPTIMIZEKLISTS:
1202 			OptimizeRtKlists ( sIndex, hConf );
1203 			break;
1204 
1205 		case CMD_BUILDINFIXES:
1206 			{
1207 				const CSphConfigSection & hIndex = hConf["index"][sIndex];
1208 				if ( hIndex("type") && hIndex["type"]=="rt" )
1209 					sphDie ( "build-infixes requires a disk index" );
1210 				if ( !hIndex("dict") || hIndex["dict"]!="keywords" )
1211 					sphDie ( "build-infixes requires dict=keywords" );
1212 
1213 				fprintf ( stdout, "building infixes for index %s...\n", sIndex.cstr() );
1214 				sphDictBuildInfixes ( hIndex["path"].cstr() );
1215 			}
1216 			break;
1217 
1218 		case CMD_BUILDSKIPS:
1219 			{
1220 				const CSphConfigSection & hIndex = hConf["index"][sIndex];
1221 				if ( hIndex("type") && hIndex["type"]=="rt" )
1222 					sphDie ( "build-infixes requires a disk index" );
1223 
1224 				fprintf ( stdout, "building skiplists for index %s...\n", sIndex.cstr() );
1225 				sphDictBuildSkiplists ( hIndex["path"].cstr() );
1226 			}
1227 			break;
1228 
1229 		case CMD_MORPH:
1230 			ApplyMorphology ( pIndex );
1231 			break;
1232 
1233 		case CMD_BUILDIDF:
1234 		{
1235 			CSphString sError;
1236 			if ( !BuildIDF ( sOut, dFiles, sError, bSkipUnique ) )
1237 				sphDie ( "ERROR: %s\n", sError.cstr() );
1238 			break;
1239 		}
1240 
1241 		case CMD_MERGEIDF:
1242 		{
1243 			CSphString sError;
1244 			if ( !MergeIDF ( sOut, dFiles, sError, bSkipUnique ) )
1245 				sphDie ( "ERROR: %s\n", sError.cstr() );
1246 			break;
1247 		}
1248 
1249 		case CMD_FOLD:
1250 			{
1251 				FILE * fp = stdin;
1252 				if ( !sFoldFile.IsEmpty() )
1253 				{
1254 					fp = fopen ( sFoldFile.cstr(), "rb" );
1255 					if ( !fp )
1256 						sphDie ( "failed to topen %s\n", sFoldFile.cstr() );
1257 				}
1258 				CharsetFold ( pIndex, fp );
1259 				if ( fp!=stdin )
1260 					fclose ( fp );
1261 			}
1262 			break;
1263 
1264 		default:
1265 			sphDie ( "INTERNAL ERROR: unhandled command (id=%d)", (int)eCommand );
1266 	}
1267 
1268 	return 0;
1269 }
1270 
1271 //
1272 // $Id$
1273 //
1274