1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15
16 #include "sphinx.h"
17 #include "sphinxutils.h"
18 #include "sphinxint.h"
19 #include "sphinxrt.h"
20 #include <time.h>
21
22 #if USE_WINDOWS
23 #include <io.h> // for setmode(). open() on windows
24 #define sphSeek _lseeki64
25 #else
26 #define sphSeek lseek
27 #endif
28
29
StripStdin(const char * sIndexAttrs,const char * sRemoveElements)30 void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements )
31 {
32 CSphString sError;
33 CSphHTMLStripper tStripper ( true );
34 if ( !tStripper.SetIndexedAttrs ( sIndexAttrs, sError )
35 || !tStripper.SetRemovedElements ( sRemoveElements, sError ) )
36 sphDie ( "failed to configure stripper: %s", sError.cstr() );
37
38 CSphVector<BYTE> dBuffer;
39 while ( !feof(stdin) )
40 {
41 char sBuffer[1024];
42 int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
43 if ( !iLen )
44 break;
45
46 int iPos = dBuffer.GetLength();
47 dBuffer.Resize ( iPos+iLen );
48 memcpy ( &dBuffer[iPos], sBuffer, iLen );
49 }
50 dBuffer.Add ( 0 );
51
52 tStripper.Strip ( &dBuffer[0] );
53 fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] );
54 }
55
56
ApplyMorphology(CSphIndex * pIndex)57 void ApplyMorphology ( CSphIndex * pIndex )
58 {
59 CSphVector<BYTE> dInBuffer, dOutBuffer;
60 const int READ_BUFFER_SIZE = 1024;
61 dInBuffer.Reserve ( READ_BUFFER_SIZE );
62 char sBuffer[READ_BUFFER_SIZE];
63 while ( !feof(stdin) )
64 {
65 int iLen = fread ( sBuffer, 1, sizeof(sBuffer), stdin );
66 if ( !iLen )
67 break;
68
69 int iPos = dInBuffer.GetLength();
70 dInBuffer.Resize ( iPos+iLen );
71 memcpy ( &dInBuffer[iPos], sBuffer, iLen );
72 }
73 dInBuffer.Add(0);
74 dOutBuffer.Reserve ( dInBuffer.GetLength() );
75
76 CSphScopedPtr<ISphTokenizer> pTokenizer ( pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX ) );
77 CSphDict * pDict = pIndex->GetDictionary();
78 BYTE * sBufferToDump = &dInBuffer[0];
79 if ( pTokenizer.Ptr() )
80 {
81 pTokenizer->SetBuffer ( &dInBuffer[0], dInBuffer.GetLength() );
82 while ( BYTE * sToken = pTokenizer->GetToken() )
83 {
84 if ( pDict )
85 pDict->ApplyStemmers ( sToken );
86
87 int iPos = dOutBuffer.GetLength();
88 int iLen = strlen ( (char *)sToken );
89 sToken[iLen] = ' ';
90 dOutBuffer.Resize ( iPos+iLen+1 );
91 memcpy ( &dOutBuffer[iPos], sToken, iLen+1 );
92 }
93
94 if ( dOutBuffer.GetLength() )
95 dOutBuffer[dOutBuffer.GetLength()-1] = 0;
96 else
97 dOutBuffer.Add(0);
98
99 sBufferToDump = &dOutBuffer[0];
100 }
101
102 fprintf ( stdout, "dumping stemmed results...\n%s\n", sBufferToDump );
103 }
104
105
CharsetFold(CSphIndex * pIndex,FILE * fp)106 void CharsetFold ( CSphIndex * pIndex, FILE * fp )
107 {
108 CSphVector<BYTE> sBuf1 ( 16384 );
109 CSphVector<BYTE> sBuf2 ( 16384 );
110
111 CSphLowercaser tLC = pIndex->GetTokenizer()->GetLowercaser();
112
113 #if USE_WINDOWS
114 setmode ( fileno(stdout), O_BINARY );
115 #endif
116
117 int iBuf1 = 0; // how many leftover bytes from previous iteration
118 while ( !feof(fp) )
119 {
120 int iGot = fread ( sBuf1.Begin()+iBuf1, 1, sBuf1.GetLength()-iBuf1, fp );
121 if ( iGot<0 )
122 sphDie ( "read error: %s", strerror(errno) );
123
124 if ( iGot==0 )
125 if ( feof(fp) )
126 if ( iBuf1==0 )
127 break;
128
129
130 const BYTE * pIn = sBuf1.Begin();
131 const BYTE * pInMax = pIn + iBuf1 + iGot;
132
133 if ( pIn==pInMax && feof(fp) )
134 break;
135
136 // tricky bit
137 // on full buffer, and not an eof, terminate a bit early
138 // to avoid codepoint vs buffer boundary issue
139 if ( ( iBuf1+iGot )==sBuf1.GetLength() && iGot!=0 )
140 pInMax -= 16;
141
142 // do folding
143 BYTE * pOut = sBuf2.Begin();
144 BYTE * pOutMax = pOut + sBuf2.GetLength() - 16;
145 while ( pIn < pInMax )
146 {
147 int iCode = sphUTF8Decode ( pIn );
148 if ( iCode==0 )
149 pIn++; // decoder does not do that!
150 assert ( iCode>=0 );
151
152 if ( iCode!=0x09 && iCode!=0x0A && iCode!=0x0D )
153 {
154 iCode = tLC.ToLower ( iCode ) & 0xffffffUL;
155 if ( !iCode )
156 iCode = 0x20;
157 }
158
159 pOut += sphUTF8Encode ( pOut, iCode );
160 if ( pOut>=pOutMax )
161 {
162 fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );
163 pOut = sBuf2.Begin();
164 }
165 }
166 fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );
167
168 // now move around leftovers
169 BYTE * pRealEnd = sBuf1.Begin() + iBuf1 + iGot;
170 if ( pIn < pRealEnd )
171 {
172 iBuf1 = pRealEnd - pIn;
173 memmove ( sBuf1.Begin(), pIn, iBuf1 );
174 }
175 }
176 }
177
178 //////////////////////////////////////////////////////////////////////////
179
FixupFiles(const CSphVector<CSphString> & dFiles,CSphString & sError)180 bool FixupFiles ( const CSphVector<CSphString> & dFiles, CSphString & sError )
181 {
182 ARRAY_FOREACH ( i, dFiles )
183 {
184 const CSphString & sPath = dFiles[i];
185 CSphString sKlistOld, sKlistNew, sHeader;
186 sKlistOld.SetSprintf ( "%s.spk", sPath.cstr() );
187 sKlistNew.SetSprintf ( "%s.new.spk", sPath.cstr() );
188 sHeader.SetSprintf ( "%s.sph", sPath.cstr() );
189
190 DWORD iCount = 0;
191 {
192 CSphAutoreader rdHeader, rdKlistNew, rdKlistOld;
193 if ( !rdHeader.Open ( sHeader, sError ) || !rdKlistNew.Open ( sKlistNew, sError ) || !rdKlistOld.Open ( sKlistOld, sError ) )
194 return false;
195
196 const SphOffset_t iSize = rdKlistNew.GetFilesize();
197 iCount = (DWORD)( iSize / sizeof(SphAttr_t) );
198 }
199
200 if ( ::unlink ( sKlistOld.cstr() )!=0 )
201 {
202 sError.SetSprintf ( "file: '%s', error: '%s'", sKlistOld.cstr(), strerror(errno) );
203 return false;
204 }
205
206 if ( ::rename ( sKlistNew.cstr(), sKlistOld.cstr() )!=0 )
207 {
208 sError.SetSprintf ( "files: '%s'->'%s', error: '%s'", sKlistNew.cstr(), sKlistOld.cstr(), strerror(errno) );
209 return false;
210 }
211
212 int iFD = ::open ( sHeader.cstr(), SPH_O_BINARY | O_RDWR, 0644 );
213 if ( iFD<0 )
214 {
215 sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
216 return false;
217 }
218
219 if ( sphSeek ( iFD, -4, SEEK_END )==-1L )
220 {
221 sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
222 SafeClose ( iFD );
223 return false;
224 }
225
226 if ( ::write ( iFD, &iCount, 4 )==-1 )
227 {
228 sError.SetSprintf ( "file: '%s', error: '%s'", sHeader.cstr(), strerror(errno) );
229 SafeClose ( iFD );
230 return false;
231 }
232
233 SafeClose ( iFD );
234 }
235
236 return true;
237 }
238
239
DoKlistsOptimization(int iRowSize,const char * sPath,int iChunkCount,CSphVector<CSphString> & dFiles)240 bool DoKlistsOptimization ( int iRowSize, const char * sPath, int iChunkCount, CSphVector<CSphString> & dFiles )
241 {
242 CSphTightVector<SphDocID_t> dLiveID;
243
244 CSphString sError;
245
246 for ( int iChunk=0; iChunk<iChunkCount; iChunk++ )
247 {
248 const int64_t tmStart = sphMicroTimer();
249
250 fprintf ( stdout, "\nprocessing '%s.%d'...", sPath, iChunk );
251
252 CSphString sKlist, sAttr, sNew;
253 sKlist.SetSprintf ( "%s.%d.spk", sPath, iChunk );
254 sAttr.SetSprintf ( "%s.%d.spa", sPath, iChunk );
255 sNew.SetSprintf ( "%s.%d.new.spk", sPath, iChunk );
256
257 CSphAutoreader rdKList, rdAttr;
258 CSphWriter wrNew;
259 if ( !rdKList.Open ( sKlist, sError ) || !rdAttr.Open ( sAttr, sError ) || !wrNew.OpenFile ( sNew, sError ) )
260 {
261 fprintf ( stdout, "\n%s\n", sError.cstr() );
262 return false;
263 }
264
265 CSphTightVector<SphAttr_t> dKlist;
266
267 if ( dLiveID.GetLength()>0 )
268 {
269 assert ( rdKList.GetFilesize()<INT_MAX );
270
271 dKlist.Resize ( (int)( rdKList.GetFilesize()/sizeof(SphAttr_t) ) );
272 rdKList.GetBytes ( dKlist.Begin(), (int)rdKList.GetFilesize() );
273
274 // 1nd step kill all k-list ids not in live ids
275
276 ARRAY_FOREACH ( i, dKlist )
277 {
278 SphDocID_t uid = (SphDocID_t)dKlist[i];
279 SphDocID_t * pInLive = sphBinarySearch ( dLiveID.Begin(), &dLiveID.Last(), uid );
280 if ( !pInLive )
281 dKlist.RemoveFast ( i-- );
282 }
283 dKlist.Sort();
284
285 // 2nd step kill all prev ids by this fresh k-list
286
287 SphDocID_t * pFirstLive = dLiveID.Begin();
288 SphDocID_t * pLastLive = &dLiveID.Last();
289
290 ARRAY_FOREACH ( i, dKlist )
291 {
292 SphDocID_t uID = (SphDocID_t)dKlist[i];
293 SphDocID_t * pKilled = sphBinarySearch ( pFirstLive, pLastLive, uID );
294
295 assert ( pKilled );
296 pFirstLive = pKilled+1;
297 *pKilled = 0;
298 }
299
300 #ifndef NDEBUG
301 const int iWasLive = dLiveID.GetLength();
302 #endif
303
304 if ( dKlist.GetLength()>0 )
305 ARRAY_FOREACH ( i, dLiveID )
306 if ( dLiveID[i]==0 )
307 dLiveID.RemoveFast ( i-- );
308
309 assert ( dLiveID.GetLength()+dKlist.GetLength()==iWasLive );
310
311 dLiveID.Sort();
312 }
313
314 // 3d step write new k-list
315
316 if ( dKlist.GetLength()>0 )
317 wrNew.PutBytes ( dKlist.Begin(), dKlist.GetLength()*sizeof(SphAttr_t) );
318
319 dKlist.Reset();
320 wrNew.CloseFile();
321
322 // 4th step merge ID from this segment into live ids
323 if ( iChunk!=iChunkCount-1 )
324 {
325 const int iWasLive = Max ( dLiveID.GetLength()-1, 0 );
326 const int iRowCount = (int)( rdAttr.GetFilesize() / ( (DOCINFO_IDSIZE+iRowSize)*4 ) );
327
328 for ( int i=0; i<iRowCount; i++ )
329 {
330 SphDocID_t uID = 0;
331 rdAttr.GetBytes ( &uID, DOCINFO_IDSIZE*4 );
332 rdAttr.SkipBytes ( iRowSize*4 );
333
334 if ( sphBinarySearch ( dLiveID.Begin(), dLiveID.Begin()+iWasLive, uID )==NULL )
335 dLiveID.Add ( uID );
336 }
337
338 dLiveID.Sort();
339 }
340
341 CSphString & sFile = dFiles.Add();
342 sFile.SetSprintf ( "%s.%d", sPath, iChunk );
343
344 const int64_t tmEnd = sphMicroTimer();
345 fprintf ( stdout, "\rprocessed '%s.%d' in %.3f sec", sPath, iChunk, float(tmEnd-tmStart )/1000000.0f );
346 }
347
348 return true;
349 }
350
351
352 #pragma pack(push,4)
353 struct IDFWord_t
354 {
355 uint64_t m_uWordID;
356 DWORD m_iDocs;
357 };
358 #pragma pack(pop)
359 STATIC_SIZE_ASSERT ( IDFWord_t, 12 );
360
361
BuildIDF(const CSphString & sFilename,const CSphVector<CSphString> & dFiles,CSphString & sError,bool bSkipUnique)362 bool BuildIDF ( const CSphString & sFilename, const CSphVector<CSphString> & dFiles, CSphString & sError, bool bSkipUnique )
363 {
364 // text dictionaries are ordered alphabetically - we can use that fact while reading
365 // to merge duplicates, calculate total number of occurrences and process bSkipUnique
366 // this method is about 3x faster and consumes ~2x less memory than a hash based one
367
368 typedef char StringBuffer_t [ 3*SPH_MAX_WORD_LEN+16+128 ]; // { dict-keyowrd, 32bit number, 32bit number, 64bit number }
369
370 int64_t iTotalDocuments = 0;
371 int64_t iTotalWords = 0;
372 int64_t iReadWords = 0;
373 int64_t iMergedWords = 0;
374 int64_t iSkippedWords = 0;
375 int64_t iReadBytes = 0;
376 int64_t iTotalBytes = 0;
377
378 const int64_t tmStart = sphMicroTimer ();
379
380 int iFiles = dFiles.GetLength ();
381
382 CSphVector<CSphAutoreader> dReaders ( iFiles );
383
384 ARRAY_FOREACH ( i, dFiles )
385 {
386 if ( !dReaders[i].Open ( dFiles[i], sError ) )
387 return false;
388 iTotalBytes += dReaders[i].GetFilesize ();
389 }
390
391 // internal state
392 CSphFixedVector<StringBuffer_t> dWords ( iFiles );
393 CSphVector<int> dDocs ( iFiles );
394 CSphVector<bool> dFinished ( iFiles );
395 dFinished.Fill ( false );
396 bool bPreread = false;
397
398 // current entry
399 StringBuffer_t sWord = {0};
400 DWORD iDocs = 0;
401
402 // output vector, preallocate 10M
403 CSphTightVector<IDFWord_t> dEntries;
404 dEntries.Reserve ( 1024*1024*10 );
405
406 for ( int i=0;; )
407 {
408 // read next input
409 for ( ;; )
410 {
411 int iLen;
412 char * sBuffer = dWords[i];
413 if ( ( iLen = dReaders[i].GetLine ( sBuffer, sizeof(StringBuffer_t) ) )>=0 )
414 {
415 iReadBytes += iLen;
416
417 // find keyword pattern ( ^<keyword>,<docs>,... )
418 char * p1 = strchr ( sBuffer, ',' );
419 if ( p1 )
420 {
421 char * p2 = strchr ( p1+1, ',' );
422 if ( p2 )
423 {
424 *p1 = *p2 = '\0';
425 int iDocuments = atoi ( p1+1 );
426 if ( iDocuments )
427 {
428 dDocs[i] = iDocuments;
429 iReadWords++;
430 break;
431 }
432 }
433 } else
434 {
435 // keyword pattern not found (rather rare case), try to parse as a header, then
436 char sSearch[] = "total-documents: ";
437 if ( strstr ( sBuffer, sSearch )==sBuffer )
438 iTotalDocuments += atoi ( sBuffer+strlen(sSearch) );
439 }
440 } else
441 {
442 dFinished[i] = true;
443 break;
444 }
445 }
446
447 bool bEnd = !dFinished.Contains ( false );
448
449 i++;
450 if ( !bPreread && i==iFiles )
451 bPreread = true;
452
453 if ( bPreread )
454 {
455 // find the next smallest input
456 i = 0;
457 for ( int j=0; j<iFiles; j++ )
458 if ( !dFinished[j] && ( dFinished[i] || strcmp ( dWords[i], dWords[j] )>0 ) )
459 i = j;
460
461 // merge if we got the same word
462 if ( !strcmp ( sWord, dWords[i] ) && !bEnd )
463 {
464 iDocs += dDocs[i];
465 iMergedWords++;
466 } else
467 {
468 if ( sWord[0]!='\0' )
469 {
470 if ( !bSkipUnique || iDocs>1 )
471 {
472 IDFWord_t & tEntry = dEntries.Add ();
473 tEntry.m_uWordID = sphFNV64 ( sWord );
474 tEntry.m_iDocs = iDocs;
475 iTotalWords++;
476 } else
477 iSkippedWords++;
478 }
479
480 strncpy ( sWord, dWords[i], sizeof ( dWords[i] ) );
481 iDocs = dDocs[i];
482 }
483 }
484
485 if ( ( iReadWords & 0xffff )==0 || bEnd )
486 fprintf ( stderr, "read %.1f of %.1f MB, %.1f%% done%c", ( bEnd ? float(iTotalBytes) : float(iReadBytes) )/1000000.0f,
487 float(iTotalBytes)/1000000.0f, bEnd ? 100.0f : float(iReadBytes)*100.0f/float(iTotalBytes), bEnd ? '\n' : '\r' );
488
489 if ( bEnd )
490 break;
491 }
492
493 fprintf ( stdout, INT64_FMT" documents, " INT64_FMT " words (" INT64_FMT " read, " INT64_FMT " merged, " INT64_FMT " skipped)\n",
494 iTotalDocuments, iTotalWords, iReadWords, iMergedWords, iSkippedWords );
495
496 // write to disk
497 fprintf ( stdout, "writing %s (%1.fM)...\n", sFilename.cstr(), float(iTotalWords*sizeof(IDFWord_t))/1000000.0f );
498
499 dEntries.Sort ( bind ( &IDFWord_t::m_uWordID ) );
500
501 CSphWriter tWriter;
502 if ( !tWriter.OpenFile ( sFilename, sError ) )
503 return false;
504
505 // write file header
506 tWriter.PutOffset ( iTotalDocuments );
507
508 // write data
509 tWriter.PutBytes ( dEntries.Begin(), dEntries.GetLength()*sizeof(IDFWord_t) );
510
511 int tmWallMsec = (int)( ( sphMicroTimer() - tmStart )/1000 );
512 fprintf ( stdout, "finished in %d.%d sec\n", tmWallMsec/1000, (tmWallMsec/100)%10 );
513
514 return true;
515 }
516
517
MergeIDF(const CSphString & sFilename,const CSphVector<CSphString> & dFiles,CSphString & sError,bool bSkipUnique)518 bool MergeIDF ( const CSphString & sFilename, const CSphVector<CSphString> & dFiles, CSphString & sError, bool bSkipUnique )
519 {
520 // binary dictionaries are ordered by 64-bit word id, we can use that for merging.
521 // read every file, check repeating word ids, merge if found, write to disk if not
522 // memory requirements are about ~4KB per input file (used for buffered reading)
523
524 int64_t iTotalDocuments = 0;
525 int64_t iTotalWords = 0;
526 int64_t iReadWords = 0;
527 int64_t iMergedWords = 0;
528 int64_t iSkippedWords = 0;
529 int64_t iReadBytes = 0;
530 int64_t iTotalBytes = 0;
531
532 const int64_t tmStart = sphMicroTimer ();
533
534 int iFiles = dFiles.GetLength ();
535
536 // internal state
537 CSphVector<CSphAutoreader> dReaders ( iFiles );
538 CSphVector<IDFWord_t> dWords ( iFiles );
539 CSphVector<int64_t> dRead ( iFiles );
540 CSphVector<int64_t> dSize ( iFiles );
541 CSphVector<BYTE*> dBuffers ( iFiles );
542 CSphVector<bool> dFinished ( iFiles );
543 dFinished.Fill ( false );
544 bool bPreread = false;
545
546 // current entry
547 IDFWord_t tWord;
548 tWord.m_uWordID = 0;
549 tWord.m_iDocs = 0;
550
551 // preread buffer
552 const int iEntrySize = sizeof(int64_t)+sizeof(DWORD);
553 const int iBufferSize = iEntrySize*256;
554
555 // initialize vectors
556 ARRAY_FOREACH ( i, dFiles )
557 {
558 if ( !dReaders[i].Open ( dFiles[i], sError ) )
559 return false;
560 iTotalDocuments += dReaders[i].GetOffset ();
561 dRead[i] = 0;
562 dSize[i] = dReaders[i].GetFilesize() - sizeof( SphOffset_t );
563 dBuffers[i] = new BYTE [ iBufferSize ];
564 iTotalBytes += dSize[i];
565 }
566
567 // open output file
568 CSphWriter tWriter;
569 if ( !tWriter.OpenFile ( sFilename, sError ) )
570 return false;
571
572 // write file header
573 tWriter.PutOffset ( iTotalDocuments );
574
575 for ( int i=0;; )
576 {
577 if ( dRead[i]<dSize[i] )
578 {
579 iReadBytes += iEntrySize;
580
581 // This part basically does the following:
582 // dWords[i].m_uWordID = dReaders[i].GetOffset ();
583 // dWords[i].m_iDocs = dReaders[i].GetDword ();
584 // but reading by 12 bytes seems quite slow (SetBuffers doesn't help)
585 // the only way to speed it up is to buffer up a few entries manually
586
587 int iOffset = (int)( dRead[i] % iBufferSize );
588 if ( iOffset==0 )
589 dReaders[i].GetBytes ( dBuffers[i], ( dSize[i]-dRead[i] )<iBufferSize ? (int)( dSize[i]-dRead[i] ) : iBufferSize );
590
591 dWords[i].m_uWordID = *(uint64_t*)( dBuffers[i]+iOffset );
592 dWords[i].m_iDocs = *(DWORD*)( dBuffers[i]+iOffset+sizeof(uint64_t) );
593
594 dRead[i] += iEntrySize;
595 iReadWords++;
596 } else
597 dFinished[i] = true;
598
599 bool bEnd = !dFinished.Contains ( false );
600
601 i++;
602 if ( !bPreread && i==iFiles )
603 bPreread = true;
604
605 if ( bPreread )
606 {
607 // find the next smallest input
608 i = 0;
609 for ( int j=0; j<iFiles; j++ )
610 if ( !dFinished[j] && ( dFinished[i] || dWords[i].m_uWordID>dWords[j].m_uWordID ) )
611 i = j;
612
613 // merge if we got the same word
614 if ( tWord.m_uWordID==dWords[i].m_uWordID && !bEnd )
615 {
616 tWord.m_iDocs += dWords[i].m_iDocs;
617 iMergedWords++;
618 } else
619 {
620 if ( tWord.m_uWordID )
621 {
622 if ( !bSkipUnique || tWord.m_iDocs>1 )
623 {
624 tWriter.PutOffset ( tWord.m_uWordID );
625 tWriter.PutDword ( tWord.m_iDocs );
626 iTotalWords++;
627 } else
628 iSkippedWords++;
629 }
630
631 tWord = dWords[i];
632 }
633 }
634
635 if ( ( iReadWords & 0xffff )==0 || bEnd )
636 fprintf ( stderr, "read %.1f of %.1f MB, %.1f%% done%c", ( bEnd ? float(iTotalBytes) : float(iReadBytes) )/1000000.0f,
637 float(iTotalBytes)/1000000.0f, bEnd ? 100.0f : float(iReadBytes)*100.0f/float(iTotalBytes), bEnd ? '\n' : '\r' );
638
639 if ( bEnd )
640 break;
641 }
642
643 ARRAY_FOREACH ( i, dFiles )
644 SafeDeleteArray ( dBuffers[i] );
645
646 fprintf ( stdout, INT64_FMT" documents, " INT64_FMT " words (" INT64_FMT " read, " INT64_FMT " merged, " INT64_FMT " skipped)\n",
647 iTotalDocuments, iTotalWords, iReadWords, iMergedWords, iSkippedWords );
648
649 int tmWallMsec = (int)( ( sphMicroTimer() - tmStart )/1000 );
650 fprintf ( stdout, "finished in %d.%d sec\n", tmWallMsec/1000, (tmWallMsec/100)%10 );
651
652 return true;
653 }
654
655
OptimizeRtKlists(const CSphString & sIndex,const CSphConfig & hConf)656 void OptimizeRtKlists ( const CSphString & sIndex, const CSphConfig & hConf )
657 {
658 const int64_t tmStart = sphMicroTimer();
659
660 int iDone = 0;
661 CSphVector<CSphString> dFiles;
662
663 hConf["index"].IterateStart ();
664 while ( hConf["index"].IterateNext () )
665 {
666 CSphString sError;
667
668 const CSphConfigSection & hIndex = hConf["index"].IterateGet ();
669 const char * sIndexName = hConf["index"].IterateGetKey().cstr();
670
671 if ( !hIndex("type") || hIndex["type"]!="rt" )
672 continue;
673
674 if ( !sIndex.IsEmpty() && sIndex!=sIndexName )
675 continue;
676
677 if ( !hIndex.Exists ( "path" ) )
678 {
679 fprintf ( stdout, "key 'path' not found in index '%s' - skiped\n", sIndexName );
680 continue;
681 }
682
683 const int64_t tmIndexStart = sphMicroTimer();
684
685 CSphSchema tSchema ( sIndexName );
686 CSphColumnInfo tCol;
687
688 // fields
689 for ( CSphVariant * v=hIndex("rt_field"); v; v=v->m_pNext )
690 {
691 tCol.m_sName = v->cstr();
692 tSchema.m_dFields.Add ( tCol );
693 }
694 if ( !tSchema.m_dFields.GetLength() )
695 {
696 fprintf ( stdout, "index '%s': no fields configured (use rt_field directive) - skiped\n", sIndexName );
697 continue;
698 }
699
700 // attrs
701 const int iNumTypes = 5;
702 const char * sTypes[iNumTypes] = { "rt_attr_uint", "rt_attr_bigint", "rt_attr_float", "rt_attr_timestamp", "rt_attr_string" };
703 const ESphAttr iTypes[iNumTypes] = { SPH_ATTR_INTEGER, SPH_ATTR_BIGINT, SPH_ATTR_FLOAT, SPH_ATTR_TIMESTAMP, SPH_ATTR_STRING };
704
705 for ( int iType=0; iType<iNumTypes; iType++ )
706 {
707 for ( CSphVariant * v = hIndex ( sTypes[iType] ); v; v = v->m_pNext )
708 {
709 tCol.m_sName = v->cstr();
710 tCol.m_eAttrType = iTypes[iType];
711 tSchema.AddAttr ( tCol, false );
712 }
713 }
714
715 const char * sPath = hIndex["path"].cstr();
716
717 CSphString sMeta;
718 sMeta.SetSprintf ( "%s.meta", sPath );
719 CSphAutoreader rdMeta;
720 if ( !rdMeta.Open ( sMeta.cstr(), sError ) )
721 {
722 fprintf ( stdout, "%s\n", sError.cstr() );
723 continue;
724 }
725
726 rdMeta.SeekTo ( 8, 4 );
727 const int iDiskCunkCount = rdMeta.GetDword();
728
729 if ( !DoKlistsOptimization ( tSchema.GetRowSize(), sPath, iDiskCunkCount, dFiles ) )
730 sphDie ( "can't cook k-list '%s'", sPath );
731
732 const int64_t tmIndexDone = sphMicroTimer();
733 fprintf ( stdout, "\nindex '%s' done in %.3f sec\n", sIndexName, float(tmIndexDone-tmIndexStart )/1000000.0f );
734 iDone++;
735 }
736
737 const int64_t tmIndexesDone = sphMicroTimer();
738 fprintf ( stdout, "\ntotal processed=%d in %.3f sec\n", iDone, float(tmIndexesDone-tmStart )/1000000.0f );
739
740 CSphString sError("none");
741 if ( !FixupFiles ( dFiles, sError ) )
742 fprintf ( stdout, "error during files fixup: %s\n", sError.cstr() );
743
744 const int64_t tmDone = sphMicroTimer();
745 fprintf ( stdout, "\nfinished in %.3f sec\n", float(tmDone-tmStart )/1000000.0f );
746 }
747
748 //////////////////////////////////////////////////////////////////////////
749
750 extern void sphDictBuildInfixes ( const char * sPath );
751 extern void sphDictBuildSkiplists ( const char * sPath );
752
753
main(int argc,char ** argv)754 int main ( int argc, char ** argv )
755 {
756 if ( argc<=1 )
757 {
758 fprintf ( stdout, SPHINX_BANNER );
759 fprintf ( stdout,
760 "Usage: indextool <COMMAND> [OPTIONS]\n"
761 "\n"
762 "Commands are:\n"
763 "--build-infixes <INDEX>\tbuild infixes for an existing dict=keywords index\n"
764 "\t\t\t(upgrades .sph, .spi in place)\n"
765 "--build-skips <INDEX>\tbuild skiplists for an existing index (builds .spe and\n"
766 "\t\t\tupgrades .sph, .spi in place)\n"
767 "--check <INDEX>\t\tperform index consistency check\n"
768 "--checkconfig\t\tperform config consistency check\n"
769 "--dumpconfig <SPH-FILE>\tdump index header in config format by file name\n"
770 "--dumpdocids <INDEX>\tdump docids by index name\n"
771 "--dumpdict <SPI-FILE>\tdump dictionary by file name\n"
772 "--dumpdict <INDEX>\tdump dictionary\n"
773 "--dumpheader <SPH-FILE>\tdump index header by file name\n"
774 "--dumpheader <INDEX>\tdump index header by index name\n"
775 "--dumphitlist <INDEX> <KEYWORD>\n"
776 "--dumphitlist <INDEX> --wordid <ID>\n"
777 "\t\t\tdump hits for a given keyword\n"
778 "--fold <INDEX> [FILE]\tfold FILE or stdin using INDEX charset_table\n"
779 "--htmlstrip <INDEX>\tfilter stdin using index HTML stripper settings\n"
780 "--optimize-rt-klists <INDEX>\n"
781 "\t\t\toptimize kill list memory use in RT index disk chunks;\n"
782 "\t\t\teither for a given index or --all\n"
783 "--buildidf <INDEX1.dict> [INDEX2.dict ...] [--skip-uniq] --out <GLOBAL.idf>\n"
784 "\t\t\tjoin --stats dictionary dumps into global.idf file\n"
785 "--mergeidf <NODE1.idf> [NODE2.idf ...] [--skip-uniq] --out <GLOBAL.idf>\n"
786 "\t\t\tmerge several .idf files into one file\n"
787 "\n"
788 "Options are:\n"
789 "-c, --config <file>\tuse given config file instead of defaults\n"
790 "-q, --quiet\t\tbe quiet, skip banner etc (useful with --fold etc)\n"
791 "--strip-path\t\tstrip path from filenames referenced by index\n"
792 "\t\t\t(eg. stopwords, exceptions, etc)\n"
793 "--stats\t\t\tshow total statistics in the dictionary dump\n"
794 "--skip-uniq\t\tskip unique (df=1) words in the .idf files\n"
795 );
796 exit ( 0 );
797 }
798
799 //////////////////////
800 // parse command line
801 //////////////////////
802
803 #define OPT(_a1,_a2) else if ( !strcmp(argv[i],_a1) || !strcmp(argv[i],_a2) )
804 #define OPT1(_a1) else if ( !strcmp(argv[i],_a1) )
805
806 const char * sOptConfig = NULL;
807 CSphString sDumpHeader, sIndex, sKeyword, sFoldFile;
808 bool bWordid = false;
809 bool bStripPath = false;
810 CSphVector<CSphString> dFiles;
811 CSphString sOut;
812 bool bStats = false;
813 bool bSkipUnique = false;
814 CSphString sDumpDict;
815 bool bQuiet = false;
816 bool bRotate = false;
817
818 enum
819 {
820 CMD_NOTHING,
821 CMD_DUMPHEADER,
822 CMD_DUMPCONFIG,
823 CMD_DUMPDOCIDS,
824 CMD_DUMPHITLIST,
825 CMD_DUMPDICT,
826 CMD_CHECK,
827 CMD_STRIP,
828 CMD_OPTIMIZEKLISTS,
829 CMD_BUILDINFIXES,
830 CMD_MORPH,
831 CMD_BUILDSKIPS,
832 CMD_BUILDIDF,
833 CMD_MERGEIDF,
834 CMD_CHECKCONFIG,
835 CMD_FOLD
836 } eCommand = CMD_NOTHING;
837
838 int i;
839 for ( i=1; i<argc; i++ )
840 {
841 // handle argless options
842 if ( argv[i][0]!='-' ) break;
843 OPT ( "-q", "--quiet" ) { bQuiet = true; continue; }
844 OPT1 ( "--strip-path" ) { bStripPath = true; continue; }
845
846 // handle options/commands with 1+ args
847 if ( (i+1)>=argc ) break;
848 OPT ( "-c", "--config" ) sOptConfig = argv[++i];
849 OPT1 ( "--dumpheader" ) { eCommand = CMD_DUMPHEADER; sDumpHeader = argv[++i]; }
850 OPT1 ( "--dumpconfig" ) { eCommand = CMD_DUMPCONFIG; sDumpHeader = argv[++i]; }
851 OPT1 ( "--dumpdocids" ) { eCommand = CMD_DUMPDOCIDS; sIndex = argv[++i]; }
852 OPT1 ( "--check" ) { eCommand = CMD_CHECK; sIndex = argv[++i]; }
853 OPT1 ( "--rotate" ) { bRotate = true; }
854 OPT1 ( "--htmlstrip" ) { eCommand = CMD_STRIP; sIndex = argv[++i]; }
855 OPT1 ( "--build-infixes" ) { eCommand = CMD_BUILDINFIXES; sIndex = argv[++i]; }
856 OPT1 ( "--build-skips" ) { eCommand = CMD_BUILDSKIPS; sIndex = argv[++i]; }
857 OPT1 ( "--morph" ) { eCommand = CMD_MORPH; sIndex = argv[++i]; }
858 OPT1 ( "--checkconfig" ) { eCommand = CMD_CHECKCONFIG; }
859 OPT1 ( "--optimize-rt-klists" )
860 {
861 eCommand = CMD_OPTIMIZEKLISTS;
862 sIndex = argv[++i];
863 if ( sIndex=="--all" )
864 sIndex = "";
865 }
866 OPT1 ( "--dumpdict" )
867 {
868 eCommand = CMD_DUMPDICT;
869 sDumpDict = argv[++i];
870 if ( (i+1)<argc && !strcmp ( argv[i+1], "--stats" ) )
871 {
872 bStats = true;
873 i++;
874 }
875 }
876 OPT1 ( "--fold" )
877 {
878 eCommand = CMD_FOLD;
879 sIndex = argv[++i];
880 if ( (i+1)<argc && argv[i+1][0]!='-' )
881 sFoldFile = argv[++i];
882 }
883
884 // options with 2 args
885 else if ( (i+2)>=argc ) // NOLINT
886 {
887 // not enough args
888 break;
889
890 } else if ( !strcmp ( argv[i], "--dumphitlist" ) )
891 {
892 eCommand = CMD_DUMPHITLIST;
893 sIndex = argv[++i];
894
895 if ( !strcmp ( argv[i+1], "--wordid" ) )
896 {
897 if ( (i+3)<argc )
898 break; // not enough args
899 bWordid = true;
900 i++;
901 }
902
903 sKeyword = argv[++i];
904
905 } else if ( !strcmp ( argv[i], "--buildidf" ) || !strcmp ( argv[i], "--mergeidf" ) )
906 {
907 eCommand = !strcmp ( argv[i], "--buildidf" ) ? CMD_BUILDIDF : CMD_MERGEIDF;
908 while ( ++i<argc )
909 {
910 if ( !strcmp ( argv[i], "--out" ) )
911 {
912 if ( (i+1)>=argc )
913 break; // too few args
914 sOut = argv[++i];
915
916 } else if ( !strcmp ( argv[i], "--skip-uniq" ) )
917 {
918 bSkipUnique = true;
919
920 } else if ( argv[i][0]=='-' )
921 {
922 break; // unknown switch
923
924 } else
925 {
926 dFiles.Add ( argv[i] ); // handle everything else as a file name
927 }
928 }
929 break;
930
931 } else
932 {
933 // unknown option
934 break;
935 }
936 }
937
938 if ( !bQuiet )
939 fprintf ( stdout, SPHINX_BANNER );
940
941 if ( i!=argc )
942 {
943 fprintf ( stdout, "ERROR: malformed or unknown option near '%s'.\n", argv[i] );
944 return 1;
945 }
946
947 //////////////////////
948 // load proper config
949 //////////////////////
950
951 CSphString sError;
952 if ( !sphInitCharsetAliasTable ( sError ) )
953 sphDie ( "failed to init charset alias table: %s", sError.cstr() );
954
955 CSphConfigParser cp;
956 CSphConfig & hConf = cp.m_tConf;
957 for ( ;; )
958 {
959 if ( eCommand==CMD_BUILDIDF || eCommand==CMD_MERGEIDF )
960 break;
961
962 if ( eCommand==CMD_DUMPDICT && !sDumpDict.Ends ( ".spi" ) )
963 sIndex = sDumpDict;
964
965 sphLoadConfig ( sOptConfig, bQuiet, cp );
966 break;
967 }
968
969 ///////////
970 // action!
971 ///////////
972 int iMvaDefault = 1048576;
973 if ( hConf.Exists ( "searchd" ) && hConf["searchd"].Exists ( "searchd" ) )
974 {
975 const CSphConfigSection & hSearchd = hConf["searchd"]["searchd"];
976 iMvaDefault = hSearchd.GetSize ( "mva_updates_pool", iMvaDefault );
977 }
978 const char * sArenaError = sphArenaInit ( iMvaDefault );
979 if ( sArenaError )
980 sphWarning ( "process shared mutex unsupported, persist MVA disabled ( %s )", sArenaError );
981
982
983 if ( eCommand==CMD_CHECKCONFIG )
984 {
985 fprintf ( stdout, "config valid\nchecking index(es) ... " );
986
987 bool bError = false;
988 // config parser made sure that index(es) present
989 const CSphConfigType & hIndexes = hConf ["index"];
990
991 hIndexes.IterateStart();
992 while ( hIndexes.IterateNext() )
993 {
994 const CSphConfigSection & tIndex = hIndexes.IterateGet();
995 const CSphVariant * pPath = tIndex ( "path" );
996 if ( !pPath )
997 continue;
998
999 const CSphVariant * pType = tIndex ( "type" );
1000 if ( pType && ( *pType=="rt" || *pType=="distributed" ) )
1001 continue;
1002
1003 // checking index presence by sph file available
1004 CSphString sHeader, sError;
1005 sHeader.SetSprintf ( "%s.sph", pPath->cstr() );
1006 CSphAutoreader rdHeader;
1007 if ( !rdHeader.Open ( sHeader, sError ) )
1008 {
1009 // nice looking output
1010 if ( !bError )
1011 fprintf ( stdout, "\nmissed index(es): '%s'", hIndexes.IterateGetKey().cstr() );
1012 else
1013 fprintf ( stdout, ", '%s'", hIndexes.IterateGetKey().cstr() );
1014
1015 bError = true;
1016 }
1017 }
1018 if ( !bError )
1019 {
1020 fprintf ( stdout, "ok\n" );
1021 exit ( 0 );
1022 } else
1023 {
1024 fprintf ( stdout, "\n" );
1025 exit ( 1 );
1026 }
1027 }
1028
1029 // configure common settings (as of time of this writing, AOT and RLP setup)
1030 sphConfigureCommon ( hConf );
1031
1032 // common part for several commands, check and preload index
1033 CSphIndex * pIndex = NULL;
1034 while ( !sIndex.IsEmpty() && eCommand!=CMD_OPTIMIZEKLISTS )
1035 {
1036 // check config
1037 if ( !hConf["index"](sIndex) )
1038 sphDie ( "index '%s': no such index in config\n", sIndex.cstr() );
1039
1040 // only need config-level settings for --htmlstrip
1041 if ( eCommand==CMD_STRIP )
1042 break;
1043
1044 if ( !hConf["index"][sIndex]("path") )
1045 sphDie ( "index '%s': missing 'path' in config'\n", sIndex.cstr() );
1046
1047 // only need path for --build-infixes, it will access the files directly
1048 if ( eCommand==CMD_BUILDINFIXES )
1049 break;
1050
1051 // preload that index
1052 CSphString sError;
1053 bool bDictKeywords = true;
1054 if ( hConf["index"][sIndex].Exists ( "dict" ) )
1055 bDictKeywords = ( hConf["index"][sIndex]["dict"]!="crc" );
1056
1057 if ( hConf["index"][sIndex]("type") && hConf["index"][sIndex]["type"]=="rt" )
1058 {
1059 CSphSchema tSchema;
1060 if ( sphRTSchemaConfigure ( hConf["index"][sIndex], &tSchema, &sError ) )
1061 pIndex = sphCreateIndexRT ( tSchema, sIndex.cstr(), 32*1024*1024, hConf["index"][sIndex]["path"].cstr(), bDictKeywords );
1062 } else
1063 {
1064 const char * sPath = hConf["index"][sIndex]["path"].cstr();
1065 CSphStringBuilder tPath;
1066 if ( bRotate )
1067 {
1068 tPath.Appendf ( "%s.tmp", sPath );
1069 sPath = tPath.cstr();
1070 }
1071 pIndex = sphCreateIndexPhrase ( sIndex.cstr(), sPath );
1072 }
1073
1074 if ( !pIndex )
1075 sphDie ( "index '%s': failed to create (%s)", sIndex.cstr(), sError.cstr() );
1076
1077 if ( eCommand==CMD_CHECK )
1078 pIndex->SetDebugCheck();
1079
1080 CSphString sWarn;
1081 if ( !pIndex->Prealloc ( false, bStripPath, sWarn ) )
1082 sphDie ( "index '%s': prealloc failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1083
1084 if ( eCommand==CMD_MORPH )
1085 break;
1086
1087 if ( !pIndex->Preread() )
1088 sphDie ( "index '%s': preread failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1089
1090 if ( hConf["index"][sIndex]("hitless_words") )
1091 {
1092 CSphIndexSettings tSettings = pIndex->GetSettings();
1093
1094 const CSphString & sValue = hConf["index"][sIndex]["hitless_words"].strval();
1095 if ( sValue=="all" )
1096 {
1097 tSettings.m_eHitless = SPH_HITLESS_ALL;
1098 } else
1099 {
1100 tSettings.m_eHitless = SPH_HITLESS_SOME;
1101 tSettings.m_sHitlessFiles = sValue;
1102 }
1103
1104 pIndex->Setup ( tSettings );
1105 }
1106
1107 break;
1108 }
1109
1110 int iCheckErrno = 0;
1111 CSphString sNewIndex;
1112
1113 // do the dew
1114 switch ( eCommand )
1115 {
1116 case CMD_NOTHING:
1117 sphDie ( "nothing to do; specify a command (run indextool w/o switches for help)" );
1118
1119 case CMD_DUMPHEADER:
1120 case CMD_DUMPCONFIG:
1121 {
1122 CSphString sIndexName = "(none)";
1123 if ( hConf("index") && hConf["index"](sDumpHeader) )
1124 {
1125 fprintf ( stdout, "dumping header for index '%s'...\n", sDumpHeader.cstr() );
1126
1127 if ( !hConf["index"][sDumpHeader]("path") )
1128 sphDie ( "missing 'path' for index '%s'\n", sDumpHeader.cstr() );
1129
1130 sIndexName = sDumpHeader;
1131 sDumpHeader.SetSprintf ( "%s.sph", hConf["index"][sDumpHeader]["path"].cstr() );
1132 } else
1133 fprintf ( stdout, "dumping header file '%s'...\n", sDumpHeader.cstr() );
1134
1135 pIndex = sphCreateIndexPhrase ( sIndexName.cstr(), "" );
1136 pIndex->DebugDumpHeader ( stdout, sDumpHeader.cstr(), eCommand==CMD_DUMPCONFIG );
1137 break;
1138 }
1139
1140 case CMD_DUMPDOCIDS:
1141 fprintf ( stdout, "dumping docids for index '%s'...\n", sIndex.cstr() );
1142 pIndex->DebugDumpDocids ( stdout );
1143 break;
1144
1145 case CMD_DUMPHITLIST:
1146 fprintf ( stdout, "dumping hitlist for index '%s' keyword '%s'...\n", sIndex.cstr(), sKeyword.cstr() );
1147 pIndex->DebugDumpHitlist ( stdout, sKeyword.cstr(), bWordid );
1148 break;
1149
1150 case CMD_DUMPDICT:
1151 {
1152 if ( sDumpDict.Ends ( ".spi" ) )
1153 {
1154 fprintf ( stdout, "dumping dictionary file '%s'...\n", sDumpDict.cstr() );
1155
1156 sIndex = sDumpDict.SubString ( 0, sDumpDict.Length()-4 );
1157 pIndex = sphCreateIndexPhrase ( sIndex.cstr(), sIndex.cstr() );
1158
1159 CSphString sError;
1160 if ( !pIndex )
1161 sphDie ( "index '%s': failed to create (%s)", sIndex.cstr(), sError.cstr() );
1162
1163 CSphString sWarn;
1164 if ( !pIndex->Prealloc ( false, bStripPath, sWarn ) )
1165 sphDie ( "index '%s': prealloc failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1166
1167 if ( !pIndex->Preread() )
1168 sphDie ( "index '%s': preread failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1169 } else
1170 fprintf ( stdout, "dumping dictionary for index '%s'...\n", sIndex.cstr() );
1171
1172 if ( bStats )
1173 fprintf ( stdout, "total-documents: " INT64_FMT "\n", pIndex->GetStats().m_iTotalDocuments );
1174 pIndex->DebugDumpDict ( stdout );
1175 break;
1176 }
1177
1178 case CMD_CHECK:
1179 fprintf ( stdout, "checking index '%s'...\n", sIndex.cstr() );
1180 iCheckErrno = pIndex->DebugCheck ( stdout );
1181 if ( iCheckErrno )
1182 return iCheckErrno;
1183 if ( bRotate )
1184 {
1185 pIndex->Dealloc();
1186 sNewIndex.SetSprintf ( "%s.new", hConf["index"][sIndex]["path"].cstr() );
1187 if ( !pIndex->Rename ( sNewIndex.cstr() ) )
1188 sphDie ( "index '%s': rotate failed: %s\n", sIndex.cstr(), pIndex->GetLastError().cstr() );
1189 }
1190 return 0;
1191
1192 case CMD_STRIP:
1193 {
1194 const CSphConfigSection & hIndex = hConf["index"][sIndex];
1195 if ( hIndex.GetInt ( "html_strip" )==0 )
1196 sphDie ( "HTML stripping is not enabled in index '%s'", sIndex.cstr() );
1197 StripStdin ( hIndex.GetStr ( "html_index_attrs" ), hIndex.GetStr ( "html_remove_elements" ) );
1198 }
1199 break;
1200
1201 case CMD_OPTIMIZEKLISTS:
1202 OptimizeRtKlists ( sIndex, hConf );
1203 break;
1204
1205 case CMD_BUILDINFIXES:
1206 {
1207 const CSphConfigSection & hIndex = hConf["index"][sIndex];
1208 if ( hIndex("type") && hIndex["type"]=="rt" )
1209 sphDie ( "build-infixes requires a disk index" );
1210 if ( !hIndex("dict") || hIndex["dict"]!="keywords" )
1211 sphDie ( "build-infixes requires dict=keywords" );
1212
1213 fprintf ( stdout, "building infixes for index %s...\n", sIndex.cstr() );
1214 sphDictBuildInfixes ( hIndex["path"].cstr() );
1215 }
1216 break;
1217
1218 case CMD_BUILDSKIPS:
1219 {
1220 const CSphConfigSection & hIndex = hConf["index"][sIndex];
1221 if ( hIndex("type") && hIndex["type"]=="rt" )
1222 sphDie ( "build-infixes requires a disk index" );
1223
1224 fprintf ( stdout, "building skiplists for index %s...\n", sIndex.cstr() );
1225 sphDictBuildSkiplists ( hIndex["path"].cstr() );
1226 }
1227 break;
1228
1229 case CMD_MORPH:
1230 ApplyMorphology ( pIndex );
1231 break;
1232
1233 case CMD_BUILDIDF:
1234 {
1235 CSphString sError;
1236 if ( !BuildIDF ( sOut, dFiles, sError, bSkipUnique ) )
1237 sphDie ( "ERROR: %s\n", sError.cstr() );
1238 break;
1239 }
1240
1241 case CMD_MERGEIDF:
1242 {
1243 CSphString sError;
1244 if ( !MergeIDF ( sOut, dFiles, sError, bSkipUnique ) )
1245 sphDie ( "ERROR: %s\n", sError.cstr() );
1246 break;
1247 }
1248
1249 case CMD_FOLD:
1250 {
1251 FILE * fp = stdin;
1252 if ( !sFoldFile.IsEmpty() )
1253 {
1254 fp = fopen ( sFoldFile.cstr(), "rb" );
1255 if ( !fp )
1256 sphDie ( "failed to topen %s\n", sFoldFile.cstr() );
1257 }
1258 CharsetFold ( pIndex, fp );
1259 if ( fp!=stdin )
1260 fclose ( fp );
1261 }
1262 break;
1263
1264 default:
1265 sphDie ( "INTERNAL ERROR: unhandled command (id=%d)", (int)eCommand );
1266 }
1267
1268 return 0;
1269 }
1270
1271 //
1272 // $Id$
1273 //
1274