1 //
2 // $Id: sphinx.cpp 4113 2013-08-26 07:43:28Z deogar $
3 //
4 
5 //
6 // Copyright (c) 2001-2013, Andrew Aksyonoff
7 // Copyright (c) 2008-2013, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #include "sphinx.h"
17 #include "sphinxstem.h"
18 #include "sphinxquery.h"
19 #include "sphinxutils.h"
20 #include "sphinxexpr.h"
21 #include "sphinxfilter.h"
22 #include "sphinxint.h"
23 #include "sphinxsearch.h"
24 
25 #include <ctype.h>
26 #include <fcntl.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <stdarg.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <limits.h>
33 #include <time.h>
34 #include <math.h>
35 #include <float.h>
36 
37 #define SPH_UNPACK_BUFFER_SIZE	4096
38 #define SPH_READ_PROGRESS_CHUNK (8192*1024)
39 #define SPH_READ_NOPROGRESS_CHUNK (32768*1024)
40 
41 #if USE_LIBSTEMMER
42 #include <libstemmer.h>
43 #endif
44 
45 #if USE_LIBEXPAT
46 #define XMLIMPORT
47 #include "expat.h"
48 
49 // workaround for expat versions prior to 1.95.7
50 #ifndef XMLCALL
51 #define XMLCALL
52 #endif
53 #endif
54 
55 #if USE_LIBXML
56 #include <libxml/xmlreader.h>
57 #endif
58 
59 #if USE_LIBICONV
60 #include "iconv.h"
61 #endif
62 
63 #if USE_ZLIB
64 #include <zlib.h>
65 #endif
66 
67 #if USE_ODBC
68 #include <sql.h>
69 #endif
70 
71 #if USE_WINDOWS
72 	#include <io.h> // for open()
73 
74 	// workaround Windows quirks
75 	#define popen		_popen
76 	#define pclose		_pclose
77 	#define snprintf	_snprintf
78 	#define sphSeek		_lseeki64
79 
80 	#define stat		_stat64
81 	#define fstat		_fstat64
82 	#if _MSC_VER<1400
83 	#define struct_stat	__stat64
84 	#else
85 	#define struct_stat	struct _stat64
86 	#endif
87 
88 	#define ICONV_INBUF_CONST	1
89 #else
90 	#include <unistd.h>
91 	#include <sys/time.h>
92 
93 	#define sphSeek		lseek
94 
95 	#define struct_stat		struct stat
96 #endif
97 
98 #if ( USE_WINDOWS && USE_MYSQL )
99 	#pragma comment(linker, "/defaultlib:libmysql.lib")
100 	#pragma message("Automatically linking with libmysql.lib")
101 #endif
102 
103 #if ( USE_WINDOWS && USE_PGSQL )
104 	#pragma comment(linker, "/defaultlib:libpq.lib")
105 	#pragma message("Automatically linking with libpq.lib")
106 #endif
107 
108 #if ( USE_WINDOWS && USE_LIBSTEMMER )
109 	#pragma comment(linker, "/defaultlib:libstemmer_c.lib")
110 	#pragma message("Automatically linking with libstemmer_c.lib")
111 #endif
112 
113 #if ( USE_WINDOWS && USE_LIBEXPAT )
114 	#pragma comment(linker, "/defaultlib:libexpat.lib")
115 	#pragma message("Automatically linking with libexpat.lib")
116 #endif
117 
118 #if ( USE_WINDOWS && USE_LIBICONV )
119 	#pragma comment(linker, "/defaultlib:iconv.lib")
120 	#pragma message("Automatically linking with iconv.lib")
121 #endif
122 
123 #if ( USE_WINDOWS && USE_LIBXML )
124 	#pragma comment(linker, "/defaultlib:libxml.lib")
125 	#pragma message("Automatically linking with libxml.lib")
126 #endif
127 
128 /////////////////////////////////////////////////////////////////////////////
129 
130 typedef Hitman_c<8> HITMAN;
131 
132 // logf() is not there sometimes (eg. Solaris 9)
133 #if !USE_WINDOWS && !HAVE_LOGF
logf(float v)134 static inline float logf ( float v )
135 {
136 	return (float) log ( v );
137 }
138 #endif
139 
140 #if USE_WINDOWS
localtime_r(const time_t * clock,struct tm * res)141 void localtime_r ( const time_t * clock, struct tm * res )
142 {
143 	*res = *localtime ( clock );
144 }
145 #endif
146 
147 // forward decl
148 void sphWarn ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 1, 2 ) ) );
149 static bool sphTruncate ( int iFD );
150 
151 /////////////////////////////////////////////////////////////////////////////
152 // GLOBALS
153 /////////////////////////////////////////////////////////////////////////////
154 
// default character tables, written as comma-separated ranges and "->" mappings
// (presumably consumed by the tokenizer charset table parser; confirm against callers)
const char *	SPHINX_DEFAULT_SBCS_TABLE	= "0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF";
const char *	SPHINX_DEFAULT_UTF8_TABLE	= "0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F";

// special keywords; both start with the \3 control character
const char *	MAGIC_WORD_SENTENCE			= "\3sentence";		// emitted from source on sentence boundary, stored in dictionary
const char *	MAGIC_WORD_PARAGRAPH		= "\3paragraph";	// emitted from source on paragraph boundary, stored in dictionary
160 
// read buffer sizing: defaults and lower bounds
// (units are presumably bytes; confirm against the code that uses them)
static const int	DEFAULT_READ_BUFFER		= 262144;
static const int	DEFAULT_READ_UNHINTED	= 32768;
static const int	MIN_READ_BUFFER			= 8192;
static const int	MIN_READ_UNHINTED		= 1024;

// file-local mode flags, both off by default
static bool			g_bSphQuiet					= false;
static bool			g_bDebugCheck				= false;

// current read buffer settings, start out at the defaults above
static int					g_iReadBuffer				= DEFAULT_READ_BUFFER;
static int					g_iReadUnhinted				= DEFAULT_READ_UNHINTED;
171 
// quick hack for indexer crash reporting
// one day, these might turn into a callback or something
// (non-static on purpose, so that other translation units can access them)
int64_t		g_iIndexerCurrentDocID		= 0;
int64_t		g_iIndexerCurrentHits		= 0;
int64_t		g_iIndexerCurrentRangeMin	= 0;
int64_t		g_iIndexerCurrentRangeMax	= 0;
int64_t		g_iIndexerPoolStartDocID	= 0;
int64_t		g_iIndexerPoolStartHit		= 0;
180 
181 /////////////////////////////////////////////////////////////////////////////
182 // COMPILE-TIME CHECKS
183 /////////////////////////////////////////////////////////////////////////////
184 
185 STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
186 
187 /////////////////////////////////////////////////////////////////////////////
188 // INTERNAL PROFILER
189 /////////////////////////////////////////////////////////////////////////////
190 
191 #define SPH_INTERNAL_PROFILER 0
192 
193 #if SPH_INTERNAL_PROFILER
194 
/// profiler timer IDs
/// TIMER_root is always 0; the remaining entries are generated by expanding
/// DECLARE_TIMER(x) into TIMER_x for every entry that sphinxtimers.h lists
/// TIMERS_TOTAL comes last (presumably the total timer count, root included)
enum ESphTimer
{
	TIMER_root = 0,

	#define DECLARE_TIMER(_arg) TIMER_##_arg,
	#include "sphinxtimers.h"
	#undef DECLARE_TIMER

	TIMERS_TOTAL
};
205 
206 
/// printable timer names, indexed by ESphTimer
/// built from the same sphinxtimers.h list as the enum, this time stringizing every entry
static const char * const g_dTimerNames [ TIMERS_TOTAL ] =
{
	"root",

	#define DECLARE_TIMER(_arg) #_arg,
	#include "sphinxtimers.h" // NOLINT
	#undef DECLARE_TIMER
};
215 
216 
217 struct CSphTimer
218 {
219 	int64_t			m_iMicroSec;		///< time as clocked raw
220 	int				m_iCalls;			///< number of times this timer was called
221 
222 	int				m_iChildrenCalls;	///< number of times all subtimers (children, grandchildren etc) of this timer were called
223 	int64_t			m_iMicroSecAdj;		///< guessed (!) time after timer costs adjustment, including subtimer costs
224 	int64_t			m_iMicroSecSelf;	///< guessed (!) self time
225 
226 	ESphTimer		m_eTimer;
227 	int				m_iParent;
228 	int				m_iChild;
229 	int				m_iNext;
230 	int				m_iPrev;
231 
CSphTimerCSphTimer232 	CSphTimer ()
233 	{
234 		Alloc ( TIMER_root, -1 );
235 	}
236 
AllocCSphTimer237 	void Alloc ( ESphTimer eTimer, int iParent )
238 	{
239 		m_iParent = iParent;
240 		m_iChild = -1;
241 		m_iNext = -1;
242 		m_iPrev = -1;
243 		m_eTimer = eTimer;
244 		m_iMicroSec = 0;
245 		m_iMicroSecAdj = 0;
246 		m_iCalls = 0;
247 		m_iChildrenCalls = 0;
248 	}
249 
StartCSphTimer250 	void Start ()
251 	{
252 		m_iMicroSec -= sphMicroTimer ();
253 		m_iCalls++;
254 	}
255 
StopCSphTimer256 	void Stop ()
257 	{
258 		m_iMicroSec += sphMicroTimer ();
259 	}
260 };
261 
static const int	SPH_MAX_TIMERS					= 128;		///< size of the timer node pool
static const int	SPH_TIMER_TRIALS				= 16384;	///< sphMicroTimer() calls per calibration run

static int			g_iTimer						= -1;		///< index of the current timer node, -1 when the profiler is not running
static int			g_iTimers						= 0;		///< number of timer nodes allocated from the pool
static CSphTimer	g_dTimers [ SPH_MAX_TIMERS ];				///< the timer node pool; node 0 is the root
static int64_t		g_iTimerTrialsWall				= 0;		///< shortest measured duration of a calibration run
269 
sphProfilerInit()270 void sphProfilerInit ()
271 {
272 	assert ( g_iTimers==0 );
273 	assert ( g_iTimer==-1 );
274 
275 	// start root timer
276 	g_iTimers = 1;
277 	g_iTimer = 0;
278 	g_dTimers[g_iTimer].Alloc ( TIMER_root, -1 );
279 	g_dTimers[g_iTimer].Start ();
280 }
281 
282 
sphProfilerPush(ESphTimer eTimer)283 void sphProfilerPush ( ESphTimer eTimer )
284 {
285 	assert ( g_iTimer>=0 && g_iTimer<SPH_MAX_TIMERS );
286 	assert ( eTimer!=TIMER_root );
287 
288 	// search for match timer in current timer's children list
289 	int iTimer;
290 	for ( iTimer=g_dTimers[g_iTimer].m_iChild; iTimer>0; iTimer=g_dTimers[iTimer].m_iNext )
291 	{
292 		if ( g_dTimers[iTimer].m_eTimer==eTimer )
293 			break;
294 	}
295 
296 	// not found? let's alloc
297 	if ( iTimer<0 )
298 	{
299 		assert ( g_iTimers<SPH_MAX_TIMERS );
300 		iTimer = g_iTimers++;
301 
302 		// create child and make current timer it's parent
303 		g_dTimers[iTimer].Alloc ( eTimer, g_iTimer );
304 
305 		// make it new children list head
306 		g_dTimers[iTimer].m_iNext = g_dTimers[g_iTimer].m_iChild;
307 		if ( g_dTimers[g_iTimer].m_iChild>=0 )
308 			g_dTimers [ g_dTimers[g_iTimer].m_iChild ].m_iPrev = iTimer;
309 		g_dTimers[g_iTimer].m_iChild = iTimer;
310 	}
311 
312 	// make it new current one
313 	assert ( iTimer>0 );
314 	g_dTimers[iTimer].Start ();
315 	g_iTimer = iTimer;
316 }
317 
318 
sphProfilerPop(ESphTimer eTimer)319 void sphProfilerPop ( ESphTimer eTimer )
320 {
321 	assert ( g_iTimer>0 && g_iTimer<SPH_MAX_TIMERS );
322 	assert ( g_dTimers[g_iTimer].m_eTimer==eTimer );
323 
324 	g_dTimers[g_iTimer].Stop ();
325 	g_iTimer = g_dTimers[g_iTimer].m_iParent;
326 	assert ( g_iTimer>=0 && g_iTimer<SPH_MAX_TIMERS );
327 }
328 
329 
sphProfilerAdjust(int iTimer)330 static void sphProfilerAdjust ( int iTimer )
331 {
332 	CSphTimer & tTimer = g_dTimers[iTimer];
333 	tTimer.m_iChildrenCalls = 0;
334 
335 	// adjust all my children first
336 	// count the subtimer call totals along the way, too
337 	for ( int iChild=tTimer.m_iChild; iChild>0; iChild=g_dTimers[iChild].m_iNext )
338 	{
339 		sphProfilerAdjust ( iChild );
340 		tTimer.m_iChildrenCalls += g_dTimers[iChild].m_iCalls + g_dTimers[iChild].m_iChildrenCalls;
341 	}
342 
343 	// adjust my raw time, remove all the timer costs from it
344 	// my own costs are 1x sphMicroTimer() call per start/stop cycle
345 	// subtimer costs are 2x sphMicroTimer() calls per start/stop cycle
346 	tTimer.m_iMicroSecAdj = tTimer.m_iMicroSec - ( ( tTimer.m_iCalls + 2*tTimer.m_iChildrenCalls )*g_iTimerTrialsWall / SPH_TIMER_TRIALS );
347 
348 	// now calculate self time
349 	// as adjusted time (all subtimer costs removed) minus all subtimer self time
350 	tTimer.m_iMicroSecSelf = tTimer.m_iMicroSecAdj;
351 	for ( int iChild=tTimer.m_iChild; iChild>0; iChild=g_dTimers[iChild].m_iNext )
352 		tTimer.m_iMicroSecSelf -= g_dTimers[iChild].m_iMicroSecSelf;
353 }
354 
355 
sphProfilerDone()356 void sphProfilerDone ()
357 {
358 	assert ( g_iTimers>0 );
359 	assert ( g_iTimer==0 );
360 
361 	// stop root timer
362 	g_iTimers = 0;
363 	g_iTimer = -1;
364 	g_dTimers[0].Stop ();
365 
366 	// bench adjustments
367 	for ( int iRun=0; iRun<3; iRun++ )
368 	{
369 		int64_t iTrial = sphMicroTimer();
370 		for ( int i=0; i<SPH_TIMER_TRIALS-1; i++ )
371 			sphMicroTimer();
372 		iTrial = sphMicroTimer()-iTrial;
373 
374 		if ( iRun!=0 )
375 			g_iTimerTrialsWall = Min ( g_iTimerTrialsWall, iTrial );
376 		else
377 			g_iTimerTrialsWall = iTrial;
378 	}
379 
380 	// apply those adjustments
381 	sphProfilerAdjust ( 0 );
382 }
383 
384 
sphProfilerShow(int iTimer=0,int iLevel=0)385 void sphProfilerShow ( int iTimer=0, int iLevel=0 )
386 {
387 	assert ( g_iTimers==0 );
388 	assert ( g_iTimer==-1 );
389 
390 	if ( iTimer==0 )
391 		fprintf ( stdout, "--- PROFILE ---\n" );
392 
393 	// show this timer
394 	CSphTimer & tTimer = g_dTimers[iTimer];
395 	if ( tTimer.m_iMicroSec<50 )
396 		return;
397 
398 	char sName[32];
399 	for ( int i=0; i<iLevel; i++ )
400 		sName[2*i] = sName[2*i+1] = ' ';
401 	sName[2*iLevel] = '\0';
402 	strncat ( sName, g_dTimerNames [ tTimer.m_eTimer ], sizeof(sName) );
403 
404 	fprintf ( stdout, "%-32s | %6d.%02d ms | %6d.%02d ms self | %d calls\n",
405 		sName,
406 		(int)(tTimer.m_iMicroSecAdj/1000), (int)(tTimer.m_iMicroSecAdj%1000)/10,
407 		(int)(tTimer.m_iMicroSecSelf/1000), (int)(tTimer.m_iMicroSecSelf%1000)/10,
408 		tTimer.m_iCalls );
409 
410 	// dump my children
411 	int iChild = tTimer.m_iChild;
412 	while ( iChild>0 && g_dTimers[iChild].m_iNext>0 )
413 		iChild = g_dTimers[iChild].m_iNext;
414 
415 	while ( iChild>0 )
416 	{
417 		sphProfilerShow ( iChild, 1+iLevel );
418 		iChild = g_dTimers[iChild].m_iPrev;
419 	}
420 
421 	if ( iTimer==0 )
422 		fprintf ( stdout, "---------------\n" );
423 }
424 
425 
426 class CSphEasyTimer
427 {
428 public:
CSphEasyTimer(ESphTimer eTimer)429 	explicit CSphEasyTimer ( ESphTimer eTimer )
430 		: m_eTimer ( eTimer )
431 	{
432 		if ( g_iTimer>=0 )
433 			sphProfilerPush ( m_eTimer );
434 	}
435 
~CSphEasyTimer()436 	~CSphEasyTimer ()
437 	{
438 		if ( g_iTimer>=0 )
439 			sphProfilerPop ( m_eTimer );
440 	}
441 
442 protected:
443 	ESphTimer		m_eTimer;
444 };
445 
446 
// profiler hooks, profiler compiled in
// PROFILE_BEGIN/PROFILE_END take the timer name without the TIMER_ prefix
// PROFILE declares a scoped guard variable; note that the expansion already ends with a semicolon
#define PROFILER_INIT() sphProfilerInit()
#define PROFILER_DONE() sphProfilerDone()
#define PROFILE_BEGIN(_arg) sphProfilerPush(TIMER_##_arg)
#define PROFILE_END(_arg) sphProfilerPop(TIMER_##_arg)
#define PROFILE_SHOW() sphProfilerShow()
#define PROFILE(_arg) CSphEasyTimer __t_##_arg ( TIMER_##_arg );
453 
454 #else
455 
// profiler hooks, profiler compiled out; every hook expands to nothing
#define PROFILER_INIT()
#define PROFILER_DONE()
#define PROFILE_BEGIN(_arg)
#define PROFILE_END(_arg)
#define PROFILE_SHOW()
#define PROFILE(_arg)
462 
463 #endif // SPH_INTERNAL_PROFILER
464 
465 /////////////////////////////////////////////////////////////////////////////
466 
467 #if !USE_WINDOWS
468 
/// head process flag; true until told otherwise
bool g_bHeadProcess = true;

/// record whether the current process is the head one
void sphSetProcessInfo ( bool bHead )
{
	g_bHeadProcess = bHead;
}
475 
476 #endif // USE_WINDOWS
477 
// whether to collect IO stats; set by sphInitIOStats(), cleared by sphDoneIOStats()
static bool g_bCollectIOStats = false;
// thread-local storage key that holds the pointer to the current thread's active CSphIOStats
static SphThreadKey_t g_tIOStatsTls;
481 
482 
sphInitIOStats()483 bool sphInitIOStats ()
484 {
485 	if ( !sphThreadKeyCreate ( &g_tIOStatsTls ) )
486 		return false;
487 
488 	g_bCollectIOStats = true;
489 	return true;
490 }
491 
492 
sphDoneIOStats()493 void sphDoneIOStats ()
494 {
495 	sphThreadKeyDelete ( g_tIOStatsTls );
496 	g_bCollectIOStats = false;
497 }
498 
499 
CSphIOStats()500 CSphIOStats::CSphIOStats ()
501 	: m_iReadTime ( 0 )
502 	, m_iReadOps ( 0 )
503 	, m_iReadBytes ( 0 )
504 	, m_iWriteTime ( 0 )
505 	, m_iWriteOps ( 0 )
506 	, m_iWriteBytes ( 0 )
507 	, m_pPrev ( NULL )
508 {}
509 
510 
~CSphIOStats()511 CSphIOStats::~CSphIOStats ()
512 {
513 	Stop();
514 }
515 
516 
Start()517 void CSphIOStats::Start()
518 {
519 	if ( !g_bCollectIOStats )
520 		return;
521 
522 	m_pPrev = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
523 	sphThreadSet ( g_tIOStatsTls, this );
524 	m_bEnabled = true;
525 }
526 
Stop()527 void CSphIOStats::Stop()
528 {
529 	if ( !g_bCollectIOStats )
530 		return;
531 
532 	m_bEnabled = false;
533 	sphThreadSet ( g_tIOStatsTls, m_pPrev );
534 }
535 
536 
Add(const CSphIOStats & b)537 void CSphIOStats::Add ( const CSphIOStats & b )
538 {
539 	m_iReadTime += b.m_iReadTime;
540 	m_iReadOps += b.m_iReadOps;
541 	m_iReadBytes += b.m_iReadBytes;
542 	m_iWriteTime += b.m_iWriteTime;
543 	m_iWriteOps += b.m_iWriteOps;
544 	m_iWriteBytes += b.m_iWriteBytes;
545 }
546 
547 
GetIOStats()548 static CSphIOStats * GetIOStats ()
549 {
550 	if ( !g_bCollectIOStats )
551 		return NULL;
552 
553 	CSphIOStats * pIOStats = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
554 
555 	if ( !pIOStats || !pIOStats->IsEnabled() )
556 		return NULL;
557 	else
558 		return pIOStats;
559 }
560 
561 
sphRead(int iFD,void * pBuf,size_t iCount)562 static size_t sphRead ( int iFD, void * pBuf, size_t iCount )
563 {
564 	CSphIOStats * pIOStats = GetIOStats();
565 	int64_t tmStart = 0;
566 	if ( pIOStats )
567 		tmStart = sphMicroTimer();
568 
569 	size_t uRead = (size_t) ::read ( iFD, pBuf, iCount );
570 
571 	if ( pIOStats )
572 	{
573 		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
574 		pIOStats->m_iReadOps++;
575 		pIOStats->m_iReadBytes += iCount;
576 	}
577 
578 	return uRead;
579 }
580 
581 
582 static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo, CSphString * pError );
583 
584 /////////////////////////////////////////////////////////////////////////////
585 // INTERNAL SPHINX CLASSES DECLARATIONS
586 /////////////////////////////////////////////////////////////////////////////
587 
CSphAutofile()588 CSphAutofile::CSphAutofile ()
589 	: m_iFD ( -1 )
590 	, m_bTemporary ( false )
591 	, m_bWouldTemporary ( false )
592 	, m_pProgress ( NULL )
593 	, m_pStat ( NULL )
594 {
595 }
596 
597 
CSphAutofile(const CSphString & sName,int iMode,CSphString & sError,bool bTemp)598 CSphAutofile::CSphAutofile ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
599 	: m_iFD ( -1 )
600 	, m_bTemporary ( false )
601 	, m_bWouldTemporary ( false )
602 	, m_pProgress ( NULL )
603 	, m_pStat ( NULL )
604 {
605 	Open ( sName, iMode, sError, bTemp );
606 }
607 
608 
~CSphAutofile()609 CSphAutofile::~CSphAutofile ()
610 {
611 	Close ();
612 }
613 
614 
Open(const CSphString & sName,int iMode,CSphString & sError,bool bTemp)615 int CSphAutofile::Open ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
616 {
617 	assert ( m_iFD==-1 && m_sFilename.IsEmpty () );
618 	assert ( !sName.IsEmpty() );
619 
620 #if USE_WINDOWS
621 	if ( iMode==SPH_O_READ )
622 	{
623 		intptr_t tFD = (intptr_t)CreateFile ( sName.cstr(), GENERIC_READ , FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL );
624 		m_iFD = _open_osfhandle ( tFD, 0 );
625 	} else
626 		m_iFD = ::open ( sName.cstr(), iMode, 0644 );
627 #else
628 	m_iFD = ::open ( sName.cstr(), iMode, 0644 );
629 #endif
630 	m_sFilename = sName; // not exactly sure why is this uncoditional. for error reporting later, i suppose
631 
632 	if ( m_iFD<0 )
633 		sError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
634 	else
635 	{
636 		m_bTemporary = bTemp; // only if we managed to actually open it
637 		m_bWouldTemporary = true; // if a shit happen - we could delete the file.
638 	}
639 
640 	return m_iFD;
641 }
642 
643 
Close()644 void CSphAutofile::Close ()
645 {
646 	if ( m_iFD>=0 )
647 	{
648 		::close ( m_iFD );
649 		if ( m_bTemporary )
650 			::unlink ( m_sFilename.cstr() );
651 	}
652 
653 	m_iFD = -1;
654 	m_sFilename = "";
655 	m_bTemporary = false;
656 	m_bWouldTemporary = false;
657 }
658 
SetTemporary()659 void CSphAutofile::SetTemporary()
660 {
661 	m_bTemporary = m_bWouldTemporary;
662 }
663 
664 
GetFilename() const665 const char * CSphAutofile::GetFilename () const
666 {
667 	assert ( m_sFilename.cstr() );
668 	return m_sFilename.cstr();
669 }
670 
671 
GetSize(SphOffset_t iMinSize,bool bCheckSizeT,CSphString & sError)672 SphOffset_t CSphAutofile::GetSize ( SphOffset_t iMinSize, bool bCheckSizeT, CSphString & sError )
673 {
674 	struct_stat st;
675 	if ( stat ( GetFilename(), &st )<0 )
676 	{
677 		sError.SetSprintf ( "failed to stat %s: %s", GetFilename(), strerror(errno) );
678 		return -1;
679 	}
680 	if ( st.st_size<iMinSize )
681 	{
682 		sError.SetSprintf ( "failed to load %s: bad size "INT64_FMT" (at least "INT64_FMT" bytes expected)",
683 			GetFilename(), (int64_t)st.st_size, (int64_t)iMinSize );
684 		return -1;
685 	}
686 	if ( bCheckSizeT )
687 	{
688 		size_t sCheck = (size_t)st.st_size;
689 		if ( st.st_size!=SphOffset_t(sCheck) )
690 		{
691 			sError.SetSprintf ( "failed to load %s: bad size "INT64_FMT" (out of size_t; 4 GB limit on 32-bit machine hit?)",
692 				GetFilename(), (int64_t)st.st_size );
693 			return -1;
694 		}
695 	}
696 	return st.st_size;
697 }
698 
699 
GetSize()700 SphOffset_t CSphAutofile::GetSize ()
701 {
702 	CSphString sTmp;
703 	return GetSize ( 0, false, sTmp );
704 }
705 
706 
Read(void * pBuf,size_t uCount,CSphString & sError)707 bool CSphAutofile::Read ( void * pBuf, size_t uCount, CSphString & sError )
708 {
709 	int64_t iCount = (int64_t) uCount;
710 	int64_t iToRead = iCount;
711 	BYTE * pCur = (BYTE *)pBuf;
712 	while ( iToRead>0 )
713 	{
714 		int64_t iToReadOnce = ( m_pProgress && m_pStat )
715 			? Min ( SPH_READ_PROGRESS_CHUNK, iToRead )
716 			: Min ( SPH_READ_NOPROGRESS_CHUNK, iToRead );
717 		int64_t iGot = (int64_t) sphRead ( GetFD(), pCur, (size_t)iToReadOnce );
718 		if ( iGot<=0 )
719 			break;
720 
721 		iToRead -= iGot;
722 		pCur += iGot;
723 
724 		if ( m_pProgress && m_pStat )
725 		{
726 			m_pStat->m_iBytes += iGot;
727 			m_pProgress ( m_pStat, false );
728 		}
729 	}
730 
731 	if ( iToRead!=0 )
732 	{
733 		sError.SetSprintf ( "read error in %s; "INT64_FMT" of "INT64_FMT" bytes read",
734 			GetFilename(), iCount-iToRead, iCount );
735 		return false;
736 	}
737 	return true;
738 }
739 
740 
SetProgressCallback(CSphIndex::ProgressCallback_t * pfnProgress,CSphIndexProgress * pStat)741 void CSphAutofile::SetProgressCallback ( CSphIndex::ProgressCallback_t * pfnProgress, CSphIndexProgress * pStat )
742 {
743 	m_pProgress = pfnProgress;
744 	m_pStat = pStat;
745 }
746 
747 /////////////////////////////////////////////////////////////////////////////
748 
/// owning array pointer
/// frees the array when going out of scope, or earlier, on Reset()
template < typename T > class CSphAutoArray
{
protected:
	T *		m_pData;		///< the array; NULL if nothing was allocated, or after Reset()
#ifndef NDEBUG
	size_t			m_iLength; // for pretty-printers to work
#endif

public:
	/// allocate iCount elements; a non-positive count allocates nothing
	explicit CSphAutoArray ( int iCount )
		: m_pData ( NULL )
#ifndef NDEBUG
		, m_iLength ( iCount )
#endif
	{
		if ( iCount>0 )
			m_pData = new T [ iCount ];
	}

	/// free the array
	~CSphAutoArray ()
	{
		Reset ();
	}

	/// free the array right now
	void Reset ()
	{
		SafeDeleteArray ( m_pData );
	}

	/// assignment is not supported; asserts when called
	const CSphAutoArray & operator = ( const CSphAutoArray & )
	{
		assert(0);
		return *this;
	}

	/// access the underlying array
	operator T * ()
	{
		return m_pData;
	}
};
771 
772 /////////////////////////////////////////////////////////////////////////////
773 
/// generic fixed-capacity priority queue, implemented as a binary heap
/// entries are ordered by COMP::IsLess(); the least entry is always kept at the root
template < typename T, typename COMP > class CSphQueue
{
protected:
	T *		m_pData;	///< heap storage, m_iSize entries
	int		m_iUsed;	///< number of entries currently stored
	int		m_iSize;	///< capacity

public:
	/// ctor; allocates the storage for iSize entries
	explicit CSphQueue ( int iSize )
		: m_iUsed ( 0 )
		, m_iSize ( iSize )
	{
		assert ( iSize>0 );
		m_pData = new T [ iSize ];
		assert ( m_pData );
	}

	/// dtor; frees the storage
	virtual ~CSphQueue ()
	{
		SafeDeleteArray ( m_pData );
	}

	/// add entry to the queue
	/// when the queue is full, an entry that is less than the root gets dropped,
	/// and any other entry replaces the root; always returns true
	virtual bool Push ( const T & tEntry )
	{
		const bool bFull = ( m_iUsed==m_iSize );
		if ( bFull )
		{
			// worse than the current min? nothing to do; otherwise, make room
			if ( COMP::IsLess ( tEntry, m_pData[0] ) )
				return true;
			Pop ();
		}

		// store it at the tail
		int iPos = m_iUsed;
		m_pData[iPos] = tEntry;
		m_iUsed = iPos+1;

		// sift it up, for as long as it is less than its parent
		while ( iPos>0 )
		{
			const int iUp = ( iPos-1 ) / 2;
			if ( !COMP::IsLess ( m_pData[iPos], m_pData[iUp] ) )
				break;

			Swap ( m_pData[iPos], m_pData[iUp] );
			iPos = iUp;
		}

		return true;
	}

	/// remove root (ie. the least) entry
	virtual void Pop ()
	{
		assert ( m_iUsed );
		m_iUsed--;
		if ( m_iUsed==0 ) // that was the last one, nothing to fix up
			return;

		// move the tail entry to the root
		m_pData[0] = m_pData[m_iUsed];

		// and sift it down
		int iPos = 0;
		int iKid = 1;
		while ( iKid<m_iUsed )
		{
			// out of the two children (if there are two), pick the lesser one
			const int iRight = iKid+1;
			if ( iRight<m_iUsed && COMP::IsLess ( m_pData[iRight], m_pData[iKid] ) )
				iKid = iRight;

			// heap property restored? all done
			if ( !COMP::IsLess ( m_pData[iKid], m_pData[iPos] ) )
				break;

			// the child is less, so it moves up
			Swap ( m_pData[iKid], m_pData[iPos] );
			iPos = iKid;
			iKid = 2*iPos + 1;
		}
	}

	/// get entries count
	inline int GetLength () const
	{
		return m_iUsed;
	}

	/// get current root; the queue must not be empty
	inline const T & Root () const
	{
		assert ( m_iUsed );
		return m_pData[0];
	}
};
879 
880 //////////////////////////////////////////////////////////////////////////
881 
/// possible bin states
/// negative values are the end and error states; non-negative ones tell what kind of delta the bin expects next
enum ESphBinState
{
	BIN_ERR_READ	= -2,	///< bin read error
	BIN_ERR_END		= -1,	///< bin end
	BIN_POS			= 0,	///< bin is in "expects pos delta" state
	BIN_DOC			= 1,	///< bin is in "expects doc delta" state
	BIN_WORD		= 2		///< bin is in "expects word delta" state
};
891 
892 
/// bin operation result codes (the ESphBinRead return type of CSphBin::ReadBytes() and CSphBin::Precache())
enum ESphBinRead
{
	BIN_READ_OK,			///< bin read ok
	BIN_READ_EOF,			///< bin end
	BIN_READ_ERROR,			///< bin read error
	BIN_PRECACHE_OK,		///< precache ok
	BIN_PRECACHE_ERROR		///< precache failed
};
901 
902 
903 /// aggregated hit info
904 struct CSphAggregateHit
905 {
906 	SphDocID_t		m_iDocID;		///< document ID
907 	SphWordID_t		m_iWordID;		///< word ID in current dictionary
908 	BYTE *			m_sKeyword;		///< word itself (in keywords dictionary case only)
909 	Hitpos_t		m_iWordPos;		///< word position in current document, or hit count in case of aggregate hit
910 	CSphSmallBitvec	m_dFieldMask;	///< mask of fields containing this word, 0 for regular hits, non-0 for aggregate hits
911 
CSphAggregateHitCSphAggregateHit912 	CSphAggregateHit()
913 		: m_iDocID ( 0 )
914 		, m_iWordID ( 0 )
915 		, m_sKeyword ( NULL )
916 	{}
917 
GetAggrCountCSphAggregateHit918 	int GetAggrCount () const
919 	{
920 		assert ( !m_dFieldMask.TestAll ( false ) );
921 		return m_iWordPos;
922 	}
923 
SetAggrCountCSphAggregateHit924 	void SetAggrCount ( int iVal )
925 	{
926 		m_iWordPos = iVal;
927 	}
928 };
929 
930 
/// keyword buffer size, in bytes; derived from the max word length
/// (presumably up to 3 bytes per character plus some slack; confirm against SPH_MAX_WORD_LEN users)
static const int MAX_KEYWORD_BYTES = SPH_MAX_WORD_LEN*3+4;
932 
933 
/// bin, block input buffer
/// reads data from a file through an in-memory buffer, and decodes hits from it
/// the methods are defined elsewhere; only the declaration is here
struct CSphBin
{
	static const int	MIN_SIZE	= 8192;		// buffer size bounds (presumably used by CalcBinSize(); confirm there)
	static const int	WARN_SIZE	= 262144;

protected:
	ESphHitless			m_eMode;
	int					m_iSize;

	BYTE *				m_dBuffer;
	BYTE *				m_pCurrent;
	int					m_iLeft;
	int					m_iDone;
	ESphBinState		m_eState;
	bool				m_bWordDict;
	bool				m_bError;	// FIXME? sort of redundant, but states are a mess

	CSphAggregateHit	m_tHit;									///< currently decoded hit
	BYTE				m_sKeyword [ MAX_KEYWORD_BYTES ];	///< currently decoded hit keyword (in keywords dict mode)

#ifndef NDEBUG
	// debug-only copies of the previous word
	SphWordID_t			m_iLastWordID;
	BYTE				m_sLastKeyword [ MAX_KEYWORD_BYTES ];
#endif

	int					m_iFile;		///< my file
	SphOffset_t *		m_pFilePos;		///< shared current offset in file

public:
	SphOffset_t			m_iFilePos;		///< my current offset in file
	int					m_iFileLeft;	///< how much data is still unread from the file

public:
	explicit 			CSphBin ( ESphHitless eMode = SPH_HITLESS_NONE, bool bWordDict = false );
						~CSphBin ();

	static int			CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn = true );
	void				Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize );

	SphWordID_t			ReadVLB ();
	int					ReadByte ();
	ESphBinRead			ReadBytes ( void * pDest, int iBytes );
	int					ReadHit ( CSphAggregateHit * pHit, int iRowitems, CSphRowitem * pRowitems );

	DWORD				UnzipInt ();
	SphOffset_t			UnzipOffset ();

	bool				IsEOF () const;
	bool				IsDone () const;
	bool				IsError () const { return m_bError; }	///< whether the error flag is set
	ESphBinRead			Precache ();
};
987 
988 /////////////////////////////////////////////////////////////////////////////
989 
990 #define READ_NO_SIZE_HINT 0
991 
992 /////////////////////////////////////////////////////////////////////////////
993 
/// search filter attribute types
/// (judging by the names: a regular attribute, the document ID, and the match weight; confirm against users)
enum ESphFilterAttr
{
	SPH_FILTERATTR_ATTR		= 0,
	SPH_FILTERATTR_ID		= 1,
	SPH_FILTERATTR_WEIGHT	= 2
};
1001 
1002 class CSphIndex_VLN;
1003 
1004 /// everything required to setup search term
1005 class DiskIndexQwordSetup_c : public ISphQwordSetup
1006 {
1007 public:
1008 	const CSphAutofile &	m_tDoclist;
1009 	const CSphAutofile &	m_tHitlist;
1010 	const CSphAutofile &	m_tWordlist;
1011 	bool					m_bSetupReaders;
1012 
1013 	BYTE *					m_pDictBuf;
1014 
1015 public:
DiskIndexQwordSetup_c(const CSphAutofile & tDoclist,const CSphAutofile & tHitlist,const CSphAutofile & tWordlist,int iDictBufSize)1016 	DiskIndexQwordSetup_c ( const CSphAutofile & tDoclist, const CSphAutofile & tHitlist, const CSphAutofile & tWordlist, int iDictBufSize )
1017 		: m_tDoclist ( tDoclist )
1018 		, m_tHitlist ( tHitlist )
1019 		, m_tWordlist ( tWordlist )
1020 		, m_bSetupReaders ( false )
1021 		, m_pDictBuf ( NULL )
1022 	{
1023 		if ( iDictBufSize>0 )
1024 			m_pDictBuf = new BYTE [iDictBufSize];
1025 	}
1026 
~DiskIndexQwordSetup_c()1027 	virtual ~DiskIndexQwordSetup_c()
1028 	{
1029 		SafeDeleteArray ( m_pDictBuf );
1030 	}
1031 
1032 	virtual ISphQword *					QwordSpawn ( const XQKeyword_t & tWord ) const;
1033 	virtual bool						QwordSetup ( ISphQword * ) const;
1034 
1035 protected:
1036 	template < class T >	bool		Setup ( ISphQword * ) const;
1037 };
1038 
1039 
1040 #if USE_WINDOWS
1041 #pragma warning(disable:4127) // conditional expr is const for MSVC
1042 #endif
1043 
1044 
1045 /// query word from the searcher's point of view
1046 class DiskIndexQwordTraits_c : public ISphQword
1047 {
1048 	static const int	MINIBUFFER_LEN = 1024;
1049 
1050 public:
1051 	SphOffset_t		m_uHitPosition;
1052 	Hitpos_t		m_uInlinedHit;
1053 	DWORD			m_uHitState;
1054 
1055 	bool			m_bDupe;		///< whether the word occurs only once in current query
1056 
1057 	CSphMatch		m_tDoc;			///< current match (partial)
1058 	Hitpos_t		m_iHitPos;		///< current hit postition, from hitlist
1059 
1060 	BYTE			m_dDoclistBuf [ MINIBUFFER_LEN ];
1061 	BYTE			m_dHitlistBuf [ MINIBUFFER_LEN ];
1062 	CSphReader		m_rdDoclist;	///< my doclist reader
1063 	CSphReader		m_rdHitlist;	///< my hitlist reader
1064 
1065 	SphDocID_t		m_iMinID;		///< min ID to fixup
1066 	int				m_iInlineAttrs;	///< inline attributes count
1067 	CSphRowitem *	m_pInlineFixup;	///< inline attributes fixup (POINTER TO EXTERNAL DATA, NOT MANAGED BY THIS CLASS!)
1068 
1069 #ifndef NDEBUG
1070 	bool			m_bHitlistOver;
1071 #endif
1072 
1073 public:
DiskIndexQwordTraits_c(bool bUseMini,bool bExcluded)1074 	explicit DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
1075 		: m_uHitPosition ( 0 )
1076 		, m_uHitState ( 0 )
1077 		, m_bDupe ( false )
1078 		, m_iHitPos ()
1079 		, m_rdDoclist ( bUseMini ? m_dDoclistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
1080 		, m_rdHitlist ( bUseMini ? m_dHitlistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
1081 		, m_iMinID ( 0 )
1082 		, m_iInlineAttrs ( 0 )
1083 		, m_pInlineFixup ( NULL )
1084 #ifndef NDEBUG
1085 		, m_bHitlistOver ( true )
1086 #endif
1087 	{
1088 		m_iHitPos = EMPTY_HIT;
1089 		m_bExcluded = bExcluded;
1090 	}
1091 };
1092 
1093 
1094 /// query word from the searcher's point of view
1095 template < bool INLINE_HITS, bool INLINE_DOCINFO, bool DISABLE_HITLIST_SEEK >
1096 class DiskIndexQword_c : public DiskIndexQwordTraits_c
1097 {
1098 public:
DiskIndexQword_c(bool bUseMinibuffer,bool bExcluded)1099 	explicit DiskIndexQword_c ( bool bUseMinibuffer, bool bExcluded )
1100 		: DiskIndexQwordTraits_c ( bUseMinibuffer, bExcluded )
1101 	{
1102 	}
1103 
	/// reset the word state: hit position and state, both readers, the base class state,
	/// the current hit, and the inline attributes count
	virtual void Reset ()
	{
		m_uHitPosition = 0;
		m_uHitState = 0;
		m_rdDoclist.Reset ();
		m_rdHitlist.Reset ();
		ISphQword::Reset(); // the base class part
		m_iHitPos = EMPTY_HIT;
		m_iInlineAttrs = 0;
	}
1114 
GetHitlistEntry()1115 	void GetHitlistEntry ()
1116 	{
1117 		assert ( !m_bHitlistOver );
1118 		DWORD iDelta = m_rdHitlist.UnzipInt ();
1119 		if ( iDelta )
1120 		{
1121 			m_iHitPos += iDelta;
1122 		} else
1123 		{
1124 			m_iHitPos = EMPTY_HIT;
1125 #ifndef NDEBUG
1126 			m_bHitlistOver = true;
1127 #endif
1128 		}
1129 	}
1130 
GetNextDoc(DWORD * pDocinfo)1131 	virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )
1132 	{
1133 		SphDocID_t iDelta = m_rdDoclist.UnzipDocid();
1134 		if ( iDelta )
1135 		{
1136 			m_bAllFieldsKnown = false;
1137 			m_tDoc.m_iDocID += iDelta;
1138 			if ( INLINE_DOCINFO )
1139 			{
1140 				assert ( pDocinfo );
1141 				for ( int i=0; i<m_iInlineAttrs; i++ )
1142 					pDocinfo[i] = m_rdDoclist.UnzipInt() + m_pInlineFixup[i];
1143 			}
1144 
1145 			if ( INLINE_HITS )
1146 			{
1147 				m_uMatchHits = m_rdDoclist.UnzipInt();
1148 				const DWORD uFirst = m_rdDoclist.UnzipInt();
1149 				if ( m_uMatchHits==1 && m_bHasHitlist )
1150 				{
1151 					const DWORD uField = m_rdDoclist.UnzipInt(); // field and end marker
1152 					m_iHitlistPos = uFirst | ( uField << 23 ) | ( U64C(1)<<63 );
1153 					m_dQwordFields.Unset();
1154 					m_dQwordFields.Set ( uField >> 1 );
1155 					m_bAllFieldsKnown = true;
1156 				} else
1157 				{
1158 					m_dQwordFields.Assign32 ( uFirst );
1159 					m_uHitPosition += m_rdDoclist.UnzipOffset();
1160 					m_iHitlistPos = m_uHitPosition;
1161 				}
1162 			} else
1163 			{
1164 				SphOffset_t iDeltaPos = m_rdDoclist.UnzipOffset();
1165 				assert ( iDeltaPos>=0 );
1166 
1167 				m_iHitlistPos += iDeltaPos;
1168 
1169 				m_dQwordFields.Assign32 ( m_rdDoclist.UnzipInt() );
1170 				m_uMatchHits = m_rdDoclist.UnzipInt();
1171 			}
1172 		} else
1173 		{
1174 			m_tDoc.m_iDocID = 0;
1175 		}
1176 		return m_tDoc;
1177 	}
1178 
SeekHitlist(SphOffset_t uOff)1179 	virtual void SeekHitlist ( SphOffset_t uOff )
1180 	{
1181 		if ( uOff >> 63 )
1182 		{
1183 			m_uHitState = 1;
1184 			m_uInlinedHit = (DWORD)uOff; // truncate high dword
1185 		} else
1186 		{
1187 			m_uHitState = 0;
1188 			m_iHitPos = EMPTY_HIT;
1189 			if ( DISABLE_HITLIST_SEEK )
1190 				assert ( m_rdHitlist.GetPos()==uOff ); // make sure we're where caller thinks we are.
1191 			else
1192 				m_rdHitlist.SeekTo ( uOff, READ_NO_SIZE_HINT );
1193 		}
1194 #ifndef NDEBUG
1195 		m_bHitlistOver = false;
1196 #endif
1197 	}
1198 
GetNextHit()1199 	virtual Hitpos_t GetNextHit ()
1200 	{
1201 		assert ( m_bHasHitlist );
1202 		switch ( m_uHitState )
1203 		{
1204 			case 0: // read hit from hitlist
1205 				GetHitlistEntry ();
1206 				return m_iHitPos;
1207 
1208 			case 1: // return inlined hit
1209 				m_uHitState = 2;
1210 				return m_uInlinedHit;
1211 
1212 			case 2: // return end-of-hitlist marker after inlined hit
1213 				#ifndef NDEBUG
1214 				m_bHitlistOver = true;
1215 				#endif
1216 				m_uHitState = 0;
1217 				return EMPTY_HIT;
1218 		}
1219 		sphDie ( "INTERNAL ERROR: impossible hit emitter state" );
1220 		return EMPTY_HIT;
1221 	}
1222 };
1223 
1224 #if USE_WINDOWS
1225 #pragma warning(default:4127) // conditional expr is const for MSVC
1226 #endif
1227 
1228 //////////////////////////////////////////////////////////////////////////////
1229 
/// run ACTION with NAME typedef'ed to the DiskIndexQword_c instantiation that matches the index settings
/// INDEX is cast to CSphIndex_VLN*; it also gets token-pasted into local names, so it must be a plain identifier
/// the instantiation is picked by whether m_eHitFormat is SPH_HIT_FORMAT_INLINE (bit 1)
/// and whether m_eDocinfo is SPH_DOCINFO_INLINE (bit 0); NO_SEEK is passed through as DISABLE_HITLIST_SEEK
#define WITH_QWORD(INDEX, NO_SEEK, NAME, ACTION)													\
{																									\
	CSphIndex_VLN * INDEX##pIndex = (CSphIndex_VLN *)INDEX;												\
	DWORD INDEX##uInlineHits = INDEX##pIndex->m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE;					\
	DWORD INDEX##uInlineDocinfo = INDEX##pIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE;						\
																									\
	switch ( ( INDEX##uInlineHits<<1 ) | INDEX##uInlineDocinfo )													\
	{																								\
		case 0: { typedef DiskIndexQword_c < false, false, NO_SEEK > NAME; ACTION; break; }			\
		case 1: { typedef DiskIndexQword_c < false, true, NO_SEEK > NAME; ACTION; break; }			\
		case 2: { typedef DiskIndexQword_c < true, false, NO_SEEK > NAME; ACTION; break; }			\
		case 3: { typedef DiskIndexQword_c < true, true, NO_SEEK > NAME; ACTION; break; }			\
		default:																					\
			sphDie ( "INTERNAL ERROR: impossible qword settings" );									\
	}																								\
}
1246 
1247 /////////////////////////////////////////////////////////////////////////////
1248 
/// wordlist checkpoint; pairs a key with an offset into the wordlist
/// the key is either a word ID or a pointer to a word string, both sharing the same storage;
/// which of the two is in effect is not recorded in the struct itself
struct CSphWordlistCheckpoint
{
	union
	{
		SphWordID_t		m_iWordID;	///< numeric key
		const char *	m_sWord;	///< string key; the struct has no destructor, so the pointee is managed elsewhere
	};
	SphOffset_t			m_iWordlistOffset;	///< offset into the wordlist
};
1258 
// pre-v11 wordlist checkpoint
// unlike CSphWordlistCheckpoint, the key is always a word ID, and the offset is a DWORD
struct CSphWordlistCheckpoint_v10
{
	SphWordID_t			m_iWordID;			///< numeric key
	DWORD				m_iWordlistOffset;	///< offset into the wordlist
};
1265 
1266 /////////////////////////////////////////////////////////////////////////////
1267 
/// ordinals accumulation and sorting
/// a single (document, string value) pair
struct Ordinal_t
{
	SphDocID_t	m_uDocID;	///< doc id
	CSphString	m_sValue;	///< string value
};
1274 
1275 
/// ordinal (doc id and string value) with an integer tag attached
struct OrdinalEntry_t : public Ordinal_t
{
	int	m_iTag;	///< tag; NOTE(review): presumably identifies the source block while merging, confirm against SortOrdinals()
};
1280 
1281 
/// (document, numeric id) pair; the numeric counterpart of Ordinal_t
struct OrdinalId_t
{
	SphDocID_t	m_uDocID;	///< doc id
	DWORD		m_uId;		///< numeric id; NOTE(review): presumably the ordinal assigned to the document's string value, confirm
};
1287 
1288 
/// (document, numeric id) pair with an integer tag attached
struct OrdinalIdEntry_t : public OrdinalId_t
{
	int	m_iTag;	///< tag; NOTE(review): presumably identifies the source block while merging, confirm against SortOrdinalIds()
};
1293 
Swap(Ordinal_t & a,Ordinal_t & b)1294 void Swap ( Ordinal_t & a, Ordinal_t & b )
1295 {
1296 	Swap ( a.m_uDocID, b.m_uDocID );
1297 	Swap ( a.m_sValue, b.m_sValue );
1298 }
1299 
Swap(OrdinalEntry_t & a,OrdinalEntry_t & b)1300 void Swap ( OrdinalEntry_t & a, OrdinalEntry_t & b )
1301 {
1302 	Swap ( a.m_uDocID, b.m_uDocID );
1303 	Swap ( a.m_sValue, b.m_sValue );
1304 	Swap ( a.m_iTag, b.m_iTag );
1305 }
1306 
1307 //////////////////////////////////////////////////////////////////////////
1308 
ReadFileInfo(CSphReader & tReader,const char * szFilename,CSphString & sWarning)1309 static void ReadFileInfo ( CSphReader & tReader, const char * szFilename, CSphString & sWarning )
1310 {
1311 	SphOffset_t uSize = tReader.GetOffset ();
1312 	SphOffset_t uCTime = tReader.GetOffset ();
1313 	SphOffset_t uMTime = tReader.GetOffset ();
1314 	DWORD uCRC32 = tReader.GetDword ();
1315 
1316 	if ( szFilename && *szFilename )
1317 	{
1318 		struct_stat tFileInfo;
1319 		if ( stat ( szFilename, &tFileInfo ) < 0 )
1320 			sWarning.SetSprintf ( "failed to stat %s: %s", szFilename, strerror(errno) );
1321 		else
1322 		{
1323 			DWORD uMyCRC32 = 0;
1324 			if ( !sphCalcFileCRC32 ( szFilename, uMyCRC32 ) )
1325 				sWarning.SetSprintf ( "failed to calculate CRC32 for %s", szFilename );
1326 			else
1327 				if ( uMyCRC32!=uCRC32 || tFileInfo.st_size!=uSize || tFileInfo.st_ctime!=uCTime || tFileInfo.st_mtime!=uMTime )
1328 						sWarning.SetSprintf ( "'%s' differs from the original", szFilename );
1329 		}
1330 	}
1331 }
1332 
1333 
/// write the saved file info to the writer
/// fields go out as size, ctime, mtime (offsets), then CRC32 (dword);
/// that is the very order ReadFileInfo() reads them back in
static void WriteFileInfo ( CSphWriter & tWriter, const CSphSavedFile & tInfo )
{
	tWriter.PutOffset ( tInfo.m_uSize );
	tWriter.PutOffset ( tInfo.m_uCTime );
	tWriter.PutOffset ( tInfo.m_uMTime );
	tWriter.PutDword ( tInfo.m_uCRC32 );
}
1341 
1342 
/// dictionary entry data for a single word; filled by CWordlist::GetWord()
/// NOTE(review): per-field meanings below are read off the member names, confirm against GetWord()
struct WordDictInfo_t
{
	CSphString		m_sWord;		///< the word itself
	SphOffset_t		m_uOff;			///< offset stored with the word (presumably into the doclist)
	int				m_iDocs;		///< documents count
	int				m_iHits;		///< hits count
	int				m_iDoclistHint;	///< doclist size hint

	WordDictInfo_t ();				///< defined out of line
};
1353 
1354 
/// word reading state that CWordlist::GetWord() takes by reference
/// NOTE(review): presumably carries the previously read keyword between calls, confirm against GetWord()
struct WordReaderContext_t
{
	BYTE m_sWord [ MAX_KEYWORD_BYTES ];	///< keyword buffer
	int m_iLen;							///< keyword length

	WordReaderContext_t();				///< defined out of line
};
1362 
1363 
// !COMMIT eliminate this, move it to proper dict impls
/// wordlist (dictionary) accessor; keeps the checkpoints, the dictionary file handle, and an optional cache
/// data members are public and accessed directly; only the dictionary type flag is private
class CWordlist : public ISphWordlist
{
public:
	int64_t								m_iCheckpointsPos;		///< checkpoints offset
	CSphFixedVector<CSphWordlistCheckpoint>	m_dCheckpoints;			///< checkpoint offsets

	CSphAutofile						m_tFile;				///< file
	int64_t								m_iSize;				///< file size
	CSphSharedBuffer<BYTE>				m_pBuf;					///< my cache
	int									m_iMaxChunk;			///< max size of entry between checkpoints

	BYTE *								m_pWords;				///< arena for checkpoint's words

public:
										CWordlist ();
										~CWordlist ();
	void								Reset ();

	/// read the checkpoints from the given file; on failure, returns false and fills sError
	bool								ReadCP ( CSphAutofile & tFile, DWORD uVer, bool bWordDict, CSphString & sError );

	/// look up a checkpoint, by word string or by word ID
	const CSphWordlistCheckpoint *		FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const;

	/// look up a word in a dictionary buffer; by string (first overload) or by ID (second overload)
	const BYTE *						GetWord ( const BYTE * pBuf, const char * pStr, int iLen, WordDictInfo_t & tWord, bool bStarMode, WordReaderContext_t & tCtx ) const;
	bool								GetWord ( const BYTE * pBuf, SphWordID_t iWordID, WordDictInfo_t & tWord ) const;

	/// get the dictionary data for a given checkpoint (pDictBuf and iFD are caller-supplied)
	const BYTE *						AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint, int iFD, BYTE * pDictBuf ) const;
	virtual void						GetPrefixedWords ( const char * sWord, int iWordLen, CSphVector<CSphNamedInt> & dPrefixedWords, BYTE * pDictBuf, int iFD ) const;

private:
	bool								m_bWordDict;			///< dictionary type flag (see the ReadCP() argument of the same name)
};
1396 
1397 
/// this is my actual VLN-compressed phrase index implementation
/// members are grouped into common, indexing-only, and searching-only state (see the section comments below)
class CSphIndex_VLN : public CSphIndex
{
	friend class DiskIndexQwordSetup_c;
	friend class CSphMerger;
	friend class AttrIndexBuilder_t<SphDocID_t>;

public:
	explicit					CSphIndex_VLN ( const char* sIndexName, const char * sFilename );
								~CSphIndex_VLN ();

	// indexing
	virtual int					Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer );

	// header I/O
	virtual bool				LoadHeader ( const char * sHeaderName, bool bStripPath, CSphString & sWarning );
	virtual bool				WriteHeader ( CSphWriter & fdInfo, SphOffset_t iCheckpointsPos, DWORD iCheckpointCount );

	// debugging and consistency checks
	virtual void				DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig );
	virtual void				DebugDumpDocids ( FILE * fp );
	virtual void				DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID );
	virtual int					DebugCheck ( FILE * fp );
	template <class Qword> void	DumpHitlist ( FILE * fp, const char * sKeyword, bool bID );

	// loading and unloading
	virtual bool				Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning );
	virtual bool				Mlock ();
	virtual void				Dealloc ();

	virtual bool				Preread ();
	template<typename T> bool	PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer, const char * sExt, size_t uExpected=0, DWORD uOffset=0 );

	virtual void				SetBase ( const char * sNewBase );
	virtual bool				Rename ( const char * sNewBase );

	virtual bool				Lock ();
	virtual void				Unlock ();
	virtual void				PostSetup() {}	///< intentionally a no-op here

	// searching
	virtual bool				MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const;
	virtual bool				MultiQueryEx ( int iQueries, const CSphQuery * pQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const;
	virtual bool				GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString & sError ) const;
	template <class Qword> bool	DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString & sError ) const;

	// merging
	virtual bool				Merge ( CSphIndex * pSource, CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists );
	template <class QWORDDST, class QWORDSRC> bool MergeWords ( CSphIndex_VLN * pSrcIndex, ISphFilter * pFilter );

	// attribute updates
	virtual int					UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError );
	virtual bool				SaveAttributes ();
	virtual DWORD				GetAttributeStatus () const;

	bool						EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const;

	// kill-list access
	virtual SphAttr_t *			GetKillList () const;
	virtual int					GetKillListSize () const { return m_iKillListSize; }
	virtual bool				HasDocid ( SphDocID_t uDocid ) const;

	virtual const CSphSourceStats &		GetStats () const { return m_tStats; }

private:

	static const int			MIN_WRITE_BUFFER		= 262144;	///< min write buffer size
	static const int			DEFAULT_WRITE_BUFFER	= 1048576;	///< default write buffer size

private:
	// common stuff
	CSphString					m_sFilename;
	int							m_iLockFD;

	CSphMatch *					m_pMin;				///< min attribute values tracker
	CSphSourceStats				m_tStats;			///< my stats
	SphDocID_t					m_iMergeInfinum;	///< minimal docid-1 for merging

private:
	// indexing-only
	BYTE *						m_pWriteBuffer;		///< my write buffer (for temp files)
	int							m_iWriteBuffer;		///< my write buffer size

	bool						m_bWordDict;
	bool						m_bMerging;
	CSphAggregateHit			m_tLastHit;			///< hitlist entry
	BYTE						m_sLastKeyword [ MAX_KEYWORD_BYTES ];

	SphOffset_t					m_iLastHitlistPos;		///< doclist entry
	SphOffset_t					m_iLastHitlistDelta;	///< doclist entry
	CSphSmallBitvec				m_dLastDocFields;		///< doclist entry
	DWORD						m_uLastDocHits;			///< doclist entry

	SphOffset_t					m_iLastWordDoclist;		///< wordlist entry
	int							m_iLastWordDocs;		///< wordlist entry
	int							m_iLastWordHits;		///< wordlist entry

	CSphWriter					m_wrDoclist;	///< wordlist writer (NOTE(review): going by the member name, this is likely the doclist writer; confirm)
	CSphWriter					m_wrHitlist;	///< hitlist writer

	CSphIndexProgress			m_tProgress;

	CSphVector<SphWordID_t>		m_dHitlessWords;

	bool						LoadHitlessWords ();

private:
	// searching-only, per-index
	static const int			DOCINFO_HASH_BITS	= 18;	// FIXME! make this configurable

	CSphSharedBuffer<DWORD>		m_pDocinfo;				///< my docinfo cache
	DWORD						m_uDocinfo;				///< my docinfo cache size
	CSphSharedBuffer<DWORD>		m_pDocinfoHash;			///< hashed ids, to accelerate lookups
	DWORD						m_uDocinfoIndex;		///< docinfo "index" entries count (each entry is 2x docinfo rows, for min/max)
	DWORD *						m_pDocinfoIndex;		///< docinfo "index", to accelerate filtering during full-scan (2x rows for each block, and 2x rows for the whole index, 1+m_uDocinfoIndex entries)

	CSphSharedBuffer<DWORD>		m_pMva;					///< my multi-valued attrs cache
	CSphSharedBuffer<BYTE>		m_pStrings;				///< my in-RAM strings cache

	CWordlist					m_tWordlist;			///< my wordlist

	CSphSharedBuffer<SphAttr_t>	m_pKillList;			///< killlist
	DWORD						m_iKillListSize;		///< killlist size (in elements)

	int64_t						m_uMinMaxIndex;			///< stored min/max cache offset (counted in DWORDs)

	CSphAutofile				m_tDoclistFile;			///< doclist file
	CSphAutofile				m_tHitlistFile;			///< hitlist file

#define SPH_SHARED_VARS_COUNT 2

	// NOTE(review): m_pPreread and m_pAttrsStatus presumably point into m_dShared (SPH_SHARED_VARS_COUNT entries); confirm
	DWORD *						m_pPreread;
	DWORD *						m_pAttrsStatus;
	CSphSharedBuffer<DWORD>		m_dShared;				///< are we ready to search

	bool						m_bPreallocated;		///< are we ready to preread
	DWORD						m_uVersion;				///< data files version
	bool						m_bUse64;				///< whether the header is id64

	int							m_iIndexTag;			///< my ids for MVA updates pool
	static int					m_iIndexTagSeq;			///< static ids sequence

	bool						m_bIsEmpty;				///< do we have actually indexed documents (m_iTotalDocuments is just fetched documents, not indexed!)

private:
	CSphString					GetIndexFileName ( const char * sExt ) const;

	// index data writing helpers
	int							cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int Docinfos, int iStride );
	void						cidxFinishDoclistEntry ( Hitpos_t uLastPos );
	void						cidxHit ( CSphAggregateHit * pHit, CSphRowitem * pDocinfos );
	bool						cidxDone ( const char * sHeaderExtension, int iMemLimit );

	// query execution helpers
	bool						ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict, const CSphVector<CSphFilterSettings> * pExtraFilters, CSphQueryNodeCache * pNodeCache, int iTag ) const;
	bool						MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const;
	bool						MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const;

	const DWORD *				FindDocinfo ( SphDocID_t uDocID ) const;
	void						CopyDocinfo ( CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const;

	bool						BuildMVA ( const CSphVector<CSphSource*> & dSources, CSphAutoArray<CSphWordHit> & dHits, int iArenaSize, int iFieldFD, int nFieldMVAs, int iFieldMVAInPool );

	CSphDict *					SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict, ISphTokenizer & tTokenizer ) const;
	CSphDict *					SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict, ISphTokenizer & tTokenizer ) const;

	bool						RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize, SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset );
	bool						PrecomputeMinMax();

private:
	static const int MAX_ORDINAL_STR_LEN	= 4096;	///< maximum ordinal string length in bytes
	static const int ORDINAL_READ_SIZE		= 262144;	///< sorted ordinal id read buffer size in bytes

	// ordinals handling
	ESphBinRead					ReadOrdinal ( CSphBin & Reader, Ordinal_t & Ordinal );
	SphOffset_t					DumpOrdinals ( CSphWriter & Writer, CSphVector<Ordinal_t> & dOrdinals );
	bool						SortOrdinals ( const char * szToFile, int iFromFD, int iArenaSize, int iOrdinalsInPool, CSphVector< CSphVector<SphOffset_t> > & dOrdBlockSize, bool bWarnOfMem );
	bool						SortOrdinalIds ( const char * szToFile, int iFromFD, int iArenaSize, CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem );

	const DWORD *				GetMVAPool () const { return m_pMva.GetWritePtr(); }	///< raw pointer to the m_pMva cache contents
	bool						LoadPersistentMVA ( CSphString & sError );

	bool						JuggleFile ( const char* szExt, bool bNeedOrigin=true );
	XQNode_t *					ExpandPrefix ( XQNode_t * pNode, CSphString & sError, CSphQueryResultMeta * pResult ) const;
};
1572 
/// storage for the static ids sequence declared in CSphIndex_VLN; starts from zero
int CSphIndex_VLN::m_iIndexTagSeq = 0;
1574 
1575 /////////////////////////////////////////////////////////////////////////////
1576 // UTILITY FUNCTIONS
1577 /////////////////////////////////////////////////////////////////////////////
1578 
/// indexer warning
/// prints "WARNING: ", then the printf-style formatted message, then a newline; all to stdout
void sphWarn ( const char * sTemplate, ... )
{
	fputs ( "WARNING: ", stdout );

	va_list tArgs;
	va_start ( tArgs, sTemplate );
	vfprintf ( stdout, sTemplate, tArgs );
	va_end ( tArgs );

	fputc ( '\n', stdout );
}
1589 
1590 //////////////////////////////////////////////////////////////////////////
1591 
/// microsecond precision timestamp
/// returns microseconds since 01 Jan 1970 UTC
/// on Windows, the UTC base is sampled once (on the first call), and then advanced
/// using the performance counter; elsewhere, gettimeofday() is queried on every call
int64_t sphMicroTimer()
{
#if USE_WINDOWS
	// Windows time query
	static int64_t iBase = 0;
	static int64_t iStart = 0;
	static int64_t iFreq = 0;

	LARGE_INTEGER iLarge;
	if ( !iBase )
	{
		// get start QPC value
		QueryPerformanceFrequency ( &iLarge ); iFreq = iLarge.QuadPart;
		QueryPerformanceCounter ( &iLarge ); iStart = iLarge.QuadPart;

		// get start UTC timestamp
		// assuming it's still approximately the same moment as iStart, give or take a msec or three
		FILETIME ft;
		GetSystemTimeAsFileTime ( &ft );

		iBase = ( int64_t(ft.dwHighDateTime)<<32 ) + int64_t(ft.dwLowDateTime);
		iBase = ( iBase - 116444736000000000ULL ) / 10; // rebase from 01 Jan 1601 to 01 Jan 1970, and rescale to 1 usec from 100 ns
	}

	// convert elapsed ticks to usec
	// a straight ticks*1000000/iFreq overflows 64bit int once ticks exceed ~9.2e12
	// (that's just over 10 days at 10 MHz counter frequency), so whole seconds
	// and the sub-second remainder are converted separately; the result is the same
	QueryPerformanceCounter ( &iLarge );
	const int64_t iTicks = iLarge.QuadPart - iStart;
	return iBase + ( iTicks / iFreq )*1000000 + ( iTicks % iFreq )*1000000/iFreq;

#else
	// UNIX time query
	struct timeval tv;
	gettimeofday ( &tv, NULL );
	return int64_t(tv.tv_sec)*int64_t(1000000) + int64_t(tv.tv_usec);
#endif // USE_WINDOWS
}
1628 
1629 //////////////////////////////////////////////////////////////////////////
1630 
// IO throttling state; limits are set by sphSetThrottling(), and applied by sphThrottleSleep()
static int		g_iMaxIOps		= 0;	///< max IO operations per second; throttling sleeps only happen when this is positive
static int		g_iMaxIOSize	= 0;	///< max size of a single IO operation, in bytes; zero means no cap on reads
static int64_t	g_tmLastIOTime	= 0;	///< timestamp of the last throttled IO slot, in usec (as returned by sphMicroTimer())
1634 
1635 
/// set the global IO throttling limits
/// both values are stored as is, with no validation; they are read by
/// sphThrottleSleep(), sphWriteThrottled(), and sphReadThrottled()
void sphSetThrottling ( int iMaxIOps, int iMaxIOSize )
{
	g_iMaxIOps = iMaxIOps;
	g_iMaxIOSize = iMaxIOSize;
}
1641 
1642 
sphThrottleSleep()1643 static inline void sphThrottleSleep ()
1644 {
1645 	if ( g_iMaxIOps>0 )
1646 	{
1647 		int64_t tmTimer = sphMicroTimer();
1648 		int64_t tmSleep = Max ( 0, g_tmLastIOTime + 1000000/g_iMaxIOps - tmTimer );
1649 		sphSleepMsec ( (int)(tmSleep/1000) );
1650 		g_tmLastIOTime = tmTimer + tmSleep;
1651 	}
1652 }
1653 
1654 
sphWriteThrottled(int iFD,const void * pBuf,int64_t iCount,const char * sName,CSphString & sError)1655 bool sphWriteThrottled ( int iFD, const void * pBuf, int64_t iCount, const char * sName, CSphString & sError )
1656 {
1657 	if ( iCount<=0 )
1658 		return true;
1659 
1660 	// by default, slice ios by at most 1 GB
1661 	int iChunkSize = ( 1UL<<30 );
1662 
1663 	// when there's a sane max_iosize (4K to 1GB), use it
1664 	if ( g_iMaxIOSize>=4096 )
1665 		iChunkSize = Min ( iChunkSize, g_iMaxIOSize );
1666 
1667 	CSphIOStats * pIOStats = GetIOStats();
1668 
1669 	// while there's data, write it chunk by chunk
1670 	const BYTE * p = (const BYTE*) pBuf;
1671 	while ( iCount>0 )
1672 	{
1673 		// wait for a timely occasion
1674 		sphThrottleSleep ();
1675 
1676 		// write (and maybe time)
1677 		int64_t tmTimer = 0;
1678 		if ( pIOStats )
1679 			tmTimer = sphMicroTimer();
1680 
1681 		int iToWrite = iChunkSize;
1682 		if ( iCount<iChunkSize )
1683 			iToWrite = (int)iCount;
1684 
1685 		int iWritten = ::write ( iFD, p, iToWrite );
1686 
1687 		if ( pIOStats )
1688 		{
1689 			pIOStats->m_iWriteTime += sphMicroTimer() - tmTimer;
1690 			pIOStats->m_iWriteOps++;
1691 			pIOStats->m_iWriteBytes += iToWrite;
1692 		}
1693 
1694 		// success? rinse, repeat
1695 		if ( iWritten==iToWrite )
1696 		{
1697 			iCount -= iToWrite;
1698 			p += iToWrite;
1699 			continue;
1700 		}
1701 
1702 		// failure? report, bailout
1703 		if ( iWritten<0 )
1704 			sError.SetSprintf ( "%s: write error: %s", sName, strerror(errno) );
1705 		else
1706 			sError.SetSprintf ( "%s: write error: %d of %d bytes written", sName, iWritten, iToWrite );
1707 		return false;
1708 	}
1709 	return true;
1710 }
1711 
1712 
sphReadThrottled(int iFD,void * pBuf,size_t iCount)1713 size_t sphReadThrottled ( int iFD, void * pBuf, size_t iCount )
1714 {
1715 	if ( g_iMaxIOSize && int(iCount) > g_iMaxIOSize )
1716 	{
1717 		size_t nChunks = iCount / g_iMaxIOSize;
1718 		size_t nBytesLeft = iCount % g_iMaxIOSize;
1719 
1720 		size_t nBytesRead = 0;
1721 		size_t iRead = 0;
1722 
1723 		for ( size_t i=0; i<nChunks; i++ )
1724 		{
1725 			iRead = sphReadThrottled ( iFD, (char *)pBuf + i*g_iMaxIOSize, g_iMaxIOSize );
1726 			nBytesRead += iRead;
1727 			if ( iRead!=(size_t)g_iMaxIOSize )
1728 				return nBytesRead;
1729 		}
1730 
1731 		if ( nBytesLeft > 0 )
1732 		{
1733 			iRead = sphReadThrottled ( iFD, (char *)pBuf + nChunks*g_iMaxIOSize, nBytesLeft );
1734 			nBytesRead += iRead;
1735 			if ( iRead!=nBytesLeft )
1736 				return nBytesRead;
1737 		}
1738 
1739 		return nBytesRead;
1740 	}
1741 
1742 	sphThrottleSleep ();
1743 	return sphRead ( iFD, pBuf, iCount );
1744 }
1745 
/// close the descriptor if it is valid (non-negative), and reset it to -1 either way
void SafeClose ( int & iFD )
{
	const int iOldFD = iFD;
	iFD = -1;

	if ( iOldFD>=0 )
		::close ( iOldFD );
}
1752 
1753 //////////////////////////////////////////////////////////////////////////
1754 
#if !USE_WINDOWS
/// lowercase a zero-terminated string in place, using tolower()
/// returns the start of the string, the way strlwr() does on platforms that have it
/// (this function previously returned a pointer to the terminating zero)
char * strlwr ( char * s )
{
	for ( char * p = s; *p; p++ )
		*p = (char) tolower ( (unsigned char)*p ); // tolower() on a negative char value is undefined
	return s;
}
#endif
1766 
1767 
sphStrMacro(const char * sTemplate,const char * sMacro,SphDocID_t uValue)1768 char * sphStrMacro ( const char * sTemplate, const char * sMacro, SphDocID_t uValue )
1769 {
1770 	// expand macro
1771 	char sExp[32];
1772 	snprintf ( sExp, sizeof(sExp), DOCID_FMT, uValue );
1773 
1774 	// calc lengths
1775 	int iExp = strlen ( sExp );
1776 	int iMacro = strlen ( sMacro );
1777 	int iDelta = iExp-iMacro;
1778 
1779 	// calc result length
1780 	int iRes = strlen ( sTemplate );
1781 	const char * sCur = sTemplate;
1782 	while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
1783 	{
1784 		iRes += iDelta;
1785 		sCur++;
1786 	}
1787 
1788 	// build result
1789 	char * sRes = new char [ iRes+1 ];
1790 	char * sOut = sRes;
1791 	const char * sLast = sTemplate;
1792 	sCur = sTemplate;
1793 
1794 	while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
1795 	{
1796 		strncpy ( sOut, sLast, sCur-sLast ); sOut += sCur-sLast;
1797 		strcpy ( sOut, sExp ); sOut += iExp; // NOLINT
1798 		sCur += iMacro;
1799 		sLast = sCur;
1800 	}
1801 
1802 	if ( *sLast )
1803 		strcpy ( sOut, sLast ); // NOLINT
1804 
1805 	assert ( (int)strlen(sRes)==iRes );
1806 	return sRes;
1807 }
1808 
1809 
/// convert a string to float using strtod(); NULL yields 0
float sphToFloat ( const char * s )
{
	return s
		? (float)strtod ( s, NULL )
		: 0.0f;
}
1815 
1816 
sphToDword(const char * s)1817 DWORD sphToDword ( const char * s )
1818 {
1819 	if ( !s ) return 0;
1820 	return strtoul ( s, NULL, 10 );
1821 }
1822 
1823 
/// convert a decimal string to uint64_t using strtoull(); NULL yields 0
uint64_t sphToUint64 ( const char * s )
{
	return s
		? strtoull ( s, NULL, 10 )
		: 0;
}
1829 
1830 
/// convert a decimal string to int64_t using strtoll(); NULL yields 0
int64_t sphToInt64 ( const char * s )
{
	return s
		? strtoll ( s, NULL, 10 )
		: 0;
}
1836 
1837 
// string to document ID conversion; picks the 64-bit or the 32-bit converter depending on USE_64BIT
#if USE_64BIT
#define sphToDocid sphToUint64
#else
#define sphToDocid sphToDword
#endif
1843 
1844 
1845 #if USE_WINDOWS
1846 
sphLockEx(int iFile,bool bWait)1847 bool sphLockEx ( int iFile, bool bWait )
1848 {
1849 	HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
1850 	if ( hHandle!=INVALID_HANDLE_VALUE )
1851 	{
1852 		OVERLAPPED tOverlapped;
1853 		memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
1854 		return !!LockFileEx ( hHandle, LOCKFILE_EXCLUSIVE_LOCK | ( bWait ? 0 : LOCKFILE_FAIL_IMMEDIATELY ), 0, 1, 0, &tOverlapped );
1855 	}
1856 
1857 	return false;
1858 }
1859 
sphLockUn(int iFile)1860 void sphLockUn ( int iFile )
1861 {
1862 	HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
1863 	if ( hHandle!=INVALID_HANDLE_VALUE )
1864 	{
1865 		OVERLAPPED tOverlapped;
1866 		memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
1867 		UnlockFileEx ( hHandle, 0, 1, 0, &tOverlapped );
1868 	}
1869 }
1870 
1871 #else
1872 
/// take an exclusive (write) fcntl() lock on the whole file
/// when bWait is set, blocks until the lock is available (F_SETLKW); otherwise fails immediately (F_SETLK)
/// returns true if the lock was taken
bool sphLockEx ( int iFile, bool bWait )
{
	// zero everything first, so that no uninitialized fields (l_pid, platform-specific extras)
	// ever get passed to the kernel; the Windows version does the same with its OVERLAPPED
	struct flock tLock;
	memset ( &tLock, 0, sizeof ( tLock ) );
	tLock.l_type = F_WRLCK;
	tLock.l_whence = SEEK_SET;
	tLock.l_start = 0;
	tLock.l_len = 0; // zero length means "until the end of file, however it grows"

	int iCmd = bWait ? F_SETLKW : F_SETLK; // FIXME! check for HAVE_F_SETLKW?
	return ( fcntl ( iFile, iCmd, &tLock )!=-1 );
}
1884 
1885 
/// release the fcntl() lock on the whole file
/// the fcntl() result is ignored, so failures are silent
void sphLockUn ( int iFile )
{
	// zero everything first, so that no uninitialized fields (l_pid, platform-specific extras)
	// ever get passed to the kernel; the Windows version does the same with its OVERLAPPED
	struct flock tLock;
	memset ( &tLock, 0, sizeof ( tLock ) );
	tLock.l_type = F_UNLCK;
	tLock.l_whence = SEEK_SET;
	tLock.l_start = 0;
	tLock.l_len = 0; // zero length means "until the end of file"

	fcntl ( iFile, F_SETLK, &tLock );
}
1896 #endif
1897 
1898 
/// sleep for the given number of milliseconds
/// negative values are ignored; zero still goes through to the OS call
void sphSleepMsec ( int iMsec )
{
	if ( iMsec>=0 )
	{
#if USE_WINDOWS
		Sleep ( iMsec );

#else
		// split into full seconds, and the sub-second rest
		const int iFullSec = iMsec / 1000;
		const int iRestMsec = iMsec % 1000;

		struct timeval tvTimeout;
		tvTimeout.tv_sec = iFullSec;
		tvTimeout.tv_usec = iRestMsec * 1000; // msec to usec

		// select() with no descriptors to watch just waits for the timeout
		select ( 0, NULL, NULL, NULL, &tvTimeout ); // FIXME? could handle EINTR
#endif
	}
}
1915 
1916 
sphIsReadable(const char * sPath,CSphString * pError)1917 bool sphIsReadable ( const char * sPath, CSphString * pError )
1918 {
1919 	int iFD = ::open ( sPath, O_RDONLY );
1920 
1921 	if ( iFD<0 )
1922 	{
1923 		if ( pError )
1924 			pError->SetSprintf ( "%s unreadable: %s", sPath, strerror(errno) );
1925 		return false;
1926 	}
1927 
1928 	close ( iFD );
1929 	return true;
1930 }
1931 
sphSetReadBuffers(int iReadBuffer,int iReadUnhinted)1932 void sphSetReadBuffers ( int iReadBuffer, int iReadUnhinted )
1933 {
1934 	if ( iReadBuffer<=0 )
1935 		iReadBuffer = DEFAULT_READ_BUFFER;
1936 	g_iReadBuffer = Max ( iReadBuffer, MIN_READ_BUFFER );
1937 
1938 	if ( iReadUnhinted<=0 )
1939 		iReadUnhinted = DEFAULT_READ_UNHINTED;
1940 	g_iReadUnhinted = Max ( iReadUnhinted, MIN_READ_UNHINTED );
1941 }
1942 
1943 //////////////////////////////////////////////////////////////////////////
1944 // DOCINFO
1945 //////////////////////////////////////////////////////////////////////////
1946 
1947 
/// global MVA arena base; CSphMatch::GetAttrMVA() resolves arena-flagged MVA indexes against it
static DWORD *				g_pMvaArena = NULL;		///< initialized by sphArenaInit()
1949 
1950 
1951 // OPTIMIZE! try to inline or otherwise simplify maybe
GetAttrMVA(const CSphAttrLocator & tLoc,const DWORD * pPool) const1952 const DWORD * CSphMatch::GetAttrMVA ( const CSphAttrLocator & tLoc, const DWORD * pPool ) const
1953 {
1954 	DWORD uIndex = MVA_DOWNSIZE ( GetAttr ( tLoc ) );
1955 	if ( !uIndex )
1956 		return NULL;
1957 
1958 	if ( uIndex & MVA_ARENA_FLAG )
1959 		return g_pMvaArena + ( uIndex & MVA_OFFSET_MASK );
1960 
1961 	assert ( pPool );
1962 	return pPool + uIndex;
1963 }
1964 
1965 /////////////////////////////////////////////////////////////////////////////
1966 // TOKENIZERS
1967 /////////////////////////////////////////////////////////////////////////////
1968 
1969 #if USE_WINDOWS
1970 #pragma warning(disable:4127) // conditional expr is const for MSVC
1971 #endif
1972 inline int sphUTF8Decode ( BYTE * & pBuf ); // forward ref for GCC
1973 inline int sphUTF8Encode ( BYTE * pBuf, int iCode ); // forward ref for GCC
1974 
1975 
1976 /// synonym list entry
1977 struct CSphSynonym
1978 {
1979 	CSphString	m_sFrom;	///< specially packed list of map-from tokens
1980 	CSphString	m_sTo;		///< map-to string
1981 	int			m_iFromLen;	///< cached m_sFrom length
1982 	int			m_iToLen;	///< cached m_sTo length
1983 
operator <CSphSynonym1984 	inline bool operator < ( const CSphSynonym & rhs ) const
1985 	{
1986 		return strcmp ( m_sFrom.cstr(), rhs.m_sFrom.cstr() ) < 0;
1987 	}
1988 };
1989 
1990 
/// tokenizer implementation traits
/// common base of the SBCS and UTF-8 tokenizers; the IS_UTF8 template flag selects,
/// at compile time, how codepoints are read from the buffer and stored in the accumulator
template < bool IS_UTF8 >
class CSphTokenizerTraits : public ISphTokenizer
{
public:
	CSphTokenizerTraits ();

	virtual bool			SetCaseFolding ( const char * sConfig, CSphString & sError );
	virtual bool			LoadSynonyms ( const char * sFilename, CSphString & sError );
	virtual void			CloneBase ( const CSphTokenizerTraits<IS_UTF8> * pFrom, bool bEscaped );

	// raw buffer position accessors; all of them simply expose the internal BYTE pointers
	virtual const char *	GetTokenStart () const		{ return (const char *) m_pTokenStart; }
	virtual const char *	GetTokenEnd () const		{ return (const char *) m_pTokenEnd; }
	virtual const char *	GetBufferPtr () const		{ return (const char *) m_pCur; }
	virtual const char *	GetBufferEnd () const		{ return (const char *) m_pBufferMax; }
	virtual void			SetBufferPtr ( const char * sNewPtr );
	virtual int				SkipBlended ();

protected:
	BYTE *	GetTokenSyn ();
	bool	BlendAdjust ( BYTE * pPosition );
	BYTE *	GetBlendedVariant ();
	int		CodepointArbitration ( int iCodepoint, bool bWasEscaped, BYTE uNextByte );

protected:
	/// get codepoint
	/// reads the next codepoint at m_pCur and advances the cursor; returns -1 once the buffer is exhausted
	inline int GetCodepoint ()
	{
		if ( IS_UTF8 )
		{
			// keep decoding until a non-negative code comes out;
			// negative results from sphUTF8Decode are skipped over
			while ( m_pCur<m_pBufferMax )
			{
				int iCode = sphUTF8Decode ( m_pCur );
				if ( iCode>=0 )
					return iCode; // successful decode
			}
			return -1; // eof
		} else
		{
			// single-byte mode: every byte is a codepoint
			return m_pCur>=m_pBufferMax
				? -1
				: int ( *m_pCur++ );
		}
	}

	/// accum codepoint
	/// appends a (positive) codepoint to the folded token accumulator;
	/// codepoints that do not fit are silently dropped
	inline void AccumCodepoint ( int iCode )
	{
		assert ( iCode>0 );
		assert ( m_iAccum>=0 );

		// throw away everything which is over the token size
		bool bFit = ( m_iAccum<SPH_MAX_WORD_LEN );
		if ( IS_UTF8 )
			// UTF-8 mode additionally needs room for a worst-case encoded sequence
			bFit &= ( m_pAccum-m_sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(m_sAccum));

		if ( bFit )
		{
			if ( IS_UTF8 )
				m_pAccum += sphUTF8Encode ( m_pAccum, iCode );
			else
				*m_pAccum++ = BYTE(iCode);
			assert ( m_pAccum>=m_sAccum && m_pAccum<m_sAccum+sizeof(m_sAccum) );

			// m_iAccum counts codepoints, not bytes
			m_iAccum++;
		}
	}

protected:
	BYTE *				m_pBuffer;							///< my buffer
	BYTE *				m_pBufferMax;						///< max buffer ptr, exclusive (ie. this ptr is invalid, but every ptr below is ok)
	BYTE *				m_pCur;								///< current position
	BYTE *				m_pTokenStart;						///< last token start point
	BYTE *				m_pTokenEnd;						///< last token end point

	BYTE				m_sAccum [ 3*SPH_MAX_WORD_LEN+3 ];	///< folded token accumulator
	BYTE *				m_pAccum;							///< current accumulator position
	int					m_iAccum;							///< boundary token size

	BYTE				m_sAccumBlend [ 3*SPH_MAX_WORD_LEN+3 ];	///< blend-acc, an accumulator copy for additional blended variants
	int					m_iBlendNormalStart;					///< points to first normal char in the accumulators (might be NULL)
	int					m_iBlendNormalEnd;						///< points just past (!) last normal char in the accumulators (might be NULL)

	CSphVector<CSphSynonym>			m_dSynonyms;				///< active synonyms
	CSphVector<int>					m_dSynStart;				///< map 1st byte to candidate range start
	CSphVector<int>					m_dSynEnd;					///< map 1st byte to candidate range end

	// NOTE(review): presumably the source span of the current blended token; set outside this declaration, confirm against BlendAdjust()
	BYTE *	m_pBlendStart;
	BYTE *	m_pBlendEnd;
};
2081 
2082 
/// single-byte charset tokenizer
/// instantiates the shared traits with IS_UTF8=false, so every input byte is one codepoint
class CSphTokenizer_SBCS : public CSphTokenizerTraits<false>
{
public:
								CSphTokenizer_SBCS ();

	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
	virtual BYTE *				GetToken ();
	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
	virtual bool				IsUtf8 () const { return false; }
	/// every codepoint occupies exactly one byte in a single-byte charset
	virtual int					GetCodepointLength ( int ) const { return 1; }
};
2095 
2096 
/// UTF-8 tokenizer
/// instantiates the shared traits with IS_UTF8=true, so codepoints are decoded from
/// and encoded back as UTF-8 byte sequences
class CSphTokenizer_UTF8 : public CSphTokenizerTraits<true>
{
public:
								CSphTokenizer_UTF8 ();

	virtual void				SetBuffer ( BYTE * sBuffer, int iLength );
	virtual BYTE *				GetToken ();
	virtual ISphTokenizer *		Clone ( bool bEscaped ) const;
	virtual bool				IsUtf8 () const { return true; }
	virtual int					GetCodepointLength ( int iCode ) const;

protected:
	void						FlushAccum ();
};
2112 
2113 
/// UTF-8 tokenizer with n-grams
/// extends the UTF-8 tokenizer with a configurable n-gram charset and length
class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8
{
public:
						/// n-gram length starts out as 1 until SetNgramLen() is called
						CSphTokenizer_UTF8Ngram () : m_iNgramLen ( 1 ) {}

public:
	virtual bool		SetNgramChars ( const char * sConfig, CSphString & sError );
	virtual void		SetNgramLen ( int iLen );
	virtual BYTE *		GetToken ();

protected:
	int					m_iNgramLen;		///< configured n-gram length
	CSphString			m_sNgramCharsStr;	///< NOTE(review): presumably the textual n-gram charset config as passed to SetNgramChars(); confirm
};
2129 
2130 
/// one multi-token wordform: a sequence of tokens and the normal form it maps to
/// (field meanings are inferred from their names; the code that fills them is elsewhere)
struct CSphMultiform
{
	CSphString				m_sNormalForm;		///< normal form string
	int						m_iNormalTokenLen;	///< NOTE(review): presumably the cached length of m_sNormalForm; confirm units (bytes vs codepoints)
	CSphVector<CSphString>	m_dTokens;			///< tokens of the form
};
2137 
2138 
/// a group of multi-token wordforms, stored by pointer
/// NOTE(review): ownership of the pointed-to CSphMultiform objects is not visible here
struct CSphMultiforms
{
	int						m_iMinTokens;	///< NOTE(review): presumably the smallest token count among m_dWordforms; confirm
	int						m_iMaxTokens;	///< NOTE(review): presumably the largest token count among m_dWordforms; confirm
	CSphVector<CSphMultiform*> m_dWordforms;
};
2145 
2146 
/// container of all multi-token wordforms: a string-keyed hash of CSphMultiforms groups
struct CSphMultiformContainer
{
							CSphMultiformContainer () : m_iMaxTokens ( 0 ) {}

	int						m_iMaxTokens;	///< starts at 0; NOTE(review): presumably the longest form over the whole container, confirm
	/// hash from a CSphString key to a group of forms, hashed with CSphStrHashFunc, 131072 is the hash size parameter
	typedef CSphOrderedHash < CSphMultiforms *, CSphString, CSphStrHashFunc, 131072 > CSphMultiformHash;
	CSphMultiformHash	m_Hash;
};
2155 
2156 
/// Token filter
/// wraps another tokenizer and a multiform container; configuration calls are forwarded
/// to the wrapped tokenizer, while per-token state is served from a queue of stored
/// tokens (m_dStoredTokens) whenever m_iStart points at a stored entry
class CSphTokenizer_Filter : public ISphTokenizer
{
public:
									CSphTokenizer_Filter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );
									~CSphTokenizer_Filter ();

	// configuration: plain pass-through to the wrapped tokenizer
	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError )	{ return m_pTokenizer->SetCaseFolding ( sConfig, sError ); }
	virtual void					AddCaseFolding ( CSphRemapRange & tRange )						{ m_pTokenizer->AddCaseFolding ( tRange ); }
	virtual void					AddSpecials ( const char * sSpecials )							{ m_pTokenizer->AddSpecials ( sSpecials ); }
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError )	{ return m_pTokenizer->SetIgnoreChars ( sIgnored, sError ); }
	virtual bool					SetNgramChars ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetNgramChars ( sConfig, sError ); }
	virtual void					SetNgramLen ( int iLen )										{ m_pTokenizer->SetNgramLen ( iLen ); }
	virtual bool					LoadSynonyms ( const char * sFilename, CSphString & sError )	{ return m_pTokenizer->LoadSynonyms ( sFilename, sError ); }
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetBoundary ( sConfig, sError ); }
	virtual void					Setup ( const CSphTokenizerSettings & tSettings )				{ m_pTokenizer->Setup ( tSettings ); }
	virtual const CSphTokenizerSettings &	GetSettings () const									{ return m_pTokenizer->GetSettings (); }
	virtual const CSphSavedFile &	GetSynFileInfo () const											{ return m_pTokenizer->GetSynFileInfo (); }
	virtual bool					EnableSentenceIndexing ( CSphString & sError )					{ return m_pTokenizer->EnableSentenceIndexing ( sError ); }
	virtual bool					EnableZoneIndexing ( CSphString & sError )						{ return m_pTokenizer->EnableZoneIndexing ( sError ); }

public:
	// tokenization; the per-token getters below answer from the stored token at m_iStart
	// when there is one, and fall back to the wrapped tokenizer otherwise
	virtual void					SetBuffer ( BYTE * sBuffer, int iLength );
	virtual BYTE *					GetToken ();
	virtual int						GetCodepointLength ( int iCode ) const		{ return m_pTokenizer->GetCodepointLength ( iCode ); }
	virtual void					EnableQueryParserMode ( bool bEnable )		{ m_pTokenizer->EnableQueryParserMode ( bEnable ); }
	virtual void					EnableTokenizedMultiformTracking ()			{ m_bBuildMultiform = true; }
	virtual int						GetLastTokenLen () const					{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_iTokenLen : m_pTokenizer->GetLastTokenLen(); }
	virtual bool					GetBoundary ()								{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBoundary : m_pTokenizer->GetBoundary(); }
	virtual bool					WasTokenSpecial ()							{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bSpecial : m_pTokenizer->WasTokenSpecial(); }
	virtual int						GetOvershortCount ()						{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_iOvershortCount : m_pTokenizer->GetOvershortCount(); }
	/// returns NULL when the tokenized multiform buffer is empty (first byte is zero)
	virtual BYTE *					GetTokenizedMultiform ()					{ return m_sTokenizedMultiform[0] ? m_sTokenizedMultiform : NULL; }
	virtual bool					TokenIsBlended () const						{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBlended : m_pTokenizer->TokenIsBlended(); }
	virtual bool					TokenIsBlendedPart () const					{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBlendedPart : m_pTokenizer->TokenIsBlendedPart(); }
	virtual int						SkipBlended ();

public:
	// cloning and buffer position accessors; same stored-token-first policy as above,
	// except GetBufferEnd() and IsUtf8() which always ask the wrapped tokenizer
	virtual ISphTokenizer *			Clone ( bool bEscaped ) const;
	virtual bool					IsUtf8 () const				{ return m_pTokenizer->IsUtf8 (); }
	virtual const char *			GetTokenStart () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_szTokenStart : m_pTokenizer->GetTokenStart(); }
	virtual const char *			GetTokenEnd () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_szTokenEnd : m_pTokenizer->GetTokenEnd(); }
	virtual const char *			GetBufferPtr () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_pBufferPtr : m_pTokenizer->GetBufferPtr(); }
	virtual const char *			GetBufferEnd () const		{ return m_pTokenizer->GetBufferEnd (); }
	virtual void					SetBufferPtr ( const char * sNewPtr );

private:
	ISphTokenizer *					m_pTokenizer;		///< wrapped tokenizer; NOTE(review): ownership is decided in the ctor/dtor, not visible here
	const CSphMultiformContainer *	m_pMultiWordforms;	///< multiform container supplied at construction
	int								m_iStart;			///< index of the current entry in m_dStoredTokens

	bool				m_bBuildMultiform;				///< switched on by EnableTokenizedMultiformTracking()
	BYTE				m_sTokenizedMultiform [ 3*SPH_MAX_WORD_LEN+4 ];

	/// a token fetched from the wrapped tokenizer together with a snapshot of its state
	struct StoredToken_t
	{
		BYTE			m_sToken [3*SPH_MAX_WORD_LEN+4];
		// tokenized state
		const char *	m_szTokenStart;
		const char *	m_szTokenEnd;
		const char *	m_pBufferPtr;
		int				m_iTokenLen;
		int				m_iOvershortCount;
		bool			m_bBoundary;
		bool			m_bSpecial;
		bool			m_bBlended;
		bool			m_bBlendedPart;
	};

	CSphVector<StoredToken_t>		m_dStoredTokens;

	void							FillTokenInfo ( StoredToken_t & tToken, const BYTE * sToken );
};
2229 
2230 
// end of the section that branches on compile-time template flags;
// put MSVC warning C4127 back to its default state
#if USE_WINDOWS
#pragma warning(default:4127) // conditional expr is const
#endif
2234 
2235 /////////////////////////////////////////////////////////////////////////////
2236 
sphCreateSBCSTokenizer()2237 ISphTokenizer * sphCreateSBCSTokenizer ()
2238 {
2239 	return new CSphTokenizer_SBCS ();
2240 }
2241 
2242 
sphCreateUTF8Tokenizer()2243 ISphTokenizer * sphCreateUTF8Tokenizer ()
2244 {
2245 	return new CSphTokenizer_UTF8 ();
2246 }
2247 
sphCreateUTF8NgramTokenizer()2248 ISphTokenizer * sphCreateUTF8NgramTokenizer ()
2249 {
2250 	return new CSphTokenizer_UTF8Ngram ();
2251 }
2252 
2253 /////////////////////////////////////////////////////////////////////////////
2254 
/// layout of a lowercaser table entry: the low 24 bits hold the remapped codepoint,
/// the high 8 bits hold the per-codepoint flags below
enum
{
	MASK_CODEPOINT			= 0x00ffffffUL,	// mask off codepoint flags
	MASK_FLAGS				= 0xff000000UL, // mask off codepoint value
	FLAG_CODEPOINT_SPECIAL	= 0x01000000UL,	// this codepoint is special
	FLAG_CODEPOINT_DUAL		= 0x02000000UL,	// this codepoint is special but also a valid word part
	FLAG_CODEPOINT_NGRAM	= 0x04000000UL,	// this codepoint is n-gram indexed
	FLAG_CODEPOINT_SYNONYM	= 0x08000000UL,	// this codepoint is used in synonym tokens only
	FLAG_CODEPOINT_BOUNDARY	= 0x10000000UL,	// this codepoint is phrase boundary
	FLAG_CODEPOINT_IGNORE	= 0x20000000UL,	// this codepoint is ignored
	FLAG_CODEPOINT_BLEND	= 0x40000000UL	// this codepoint is "blended" (indexed both as a character, and as a separator)
};
2267 
2268 
CSphLowercaser()2269 CSphLowercaser::CSphLowercaser ()
2270 	: m_pData ( NULL )
2271 {
2272 }
2273 
2274 
Reset()2275 void CSphLowercaser::Reset()
2276 {
2277 	m_iChunks = 0;
2278 	for ( int i=0; i<CHUNK_COUNT; i++ )
2279 		m_pChunk[i] = NULL;
2280 	SafeDeleteArray ( m_pData );
2281 }
2282 
2283 
~CSphLowercaser()2284 CSphLowercaser::~CSphLowercaser ()
2285 {
2286 	Reset ();
2287 }
2288 
2289 
SetRemap(const CSphLowercaser * pLC)2290 void CSphLowercaser::SetRemap ( const CSphLowercaser * pLC )
2291 {
2292 	if ( !pLC )
2293 		return;
2294 
2295 	Reset ();
2296 
2297 	m_iChunks = pLC->m_iChunks;
2298 	m_pData = new int [ m_iChunks*CHUNK_SIZE ];
2299 	memcpy ( m_pData, pLC->m_pData, sizeof(int)*m_iChunks*CHUNK_SIZE ); // NOLINT sizeof(int)
2300 
2301 	for ( int i=0; i<CHUNK_COUNT; i++ )
2302 		m_pChunk[i] = pLC->m_pChunk[i]
2303 			? pLC->m_pChunk[i] - pLC->m_pData + m_pData
2304 			: NULL;
2305 }
2306 
2307 
/// merge a set of remap ranges into the table
/// @param dRemaps	ranges to add; an empty vector is a no-op
/// @param uFlags	flag bits OR-ed into every touched entry
///
/// works in three passes: (1) find out which chunks the ranges touch, (2) if new chunks
/// are needed, reallocate the data block and re-point all chunk pointers, (3) write entries.
/// range bounds are only validated by asserts, so callers must pass codes in [0, MAX_CODE)
void CSphLowercaser::AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags )
{
	if ( !dRemaps.GetLength() )
		return;

	// build new chunks map
	// 0 means "was unused"
	// 1 means "was used"
	// 2 means "is used now"
	int dUsed [ CHUNK_COUNT ];
	for ( int i=0; i<CHUNK_COUNT; i++ )
		dUsed[i] = m_pChunk[i] ? 1 : 0;

	int iNewChunks = m_iChunks;

	ARRAY_FOREACH ( i, dRemaps )
	{
		const CSphRemapRange & tRemap = dRemaps[i];

		#define LOC_CHECK_RANGE(_a) assert ( (_a)>=0 && (_a)<MAX_CODE );
		LOC_CHECK_RANGE ( tRemap.m_iStart );
		LOC_CHECK_RANGE ( tRemap.m_iEnd );
		LOC_CHECK_RANGE ( tRemap.m_iRemapStart );
		LOC_CHECK_RANGE ( tRemap.m_iRemapStart + tRemap.m_iEnd - tRemap.m_iStart );
		#undef LOC_CHECK_RANGE

		// mark every chunk overlapped by the source range; only the source side needs storage
		for ( int iChunk=( tRemap.m_iStart >> CHUNK_BITS ); iChunk<=( tRemap.m_iEnd >> CHUNK_BITS ); iChunk++ )
			if ( dUsed[iChunk]==0 )
		{
			dUsed[iChunk] = 2;
			iNewChunks++;
		}
	}

	// alloc new tables and copy, if necessary
	if ( iNewChunks>m_iChunks )
	{
		int * pData = new int [ iNewChunks*CHUNK_SIZE ];
		memset ( pData, 0, sizeof(int)*iNewChunks*CHUNK_SIZE ); // NOLINT sizeof(int)

		// chunks are laid out in the new block in ascending chunk index order
		int * pChunk = pData;
		for ( int i=0; i<CHUNK_COUNT; i++ )
		{
			// remember the old location first, m_pChunk[i] is overwritten just below
			int * pOldChunk = m_pChunk[i];

			// build new ptr
			if ( dUsed[i] )
			{
				m_pChunk[i] = pChunk;
				pChunk += CHUNK_SIZE;
			}

			// copy old data
			if ( dUsed[i]==1 )
				memcpy ( m_pChunk[i], pOldChunk, sizeof(int)*CHUNK_SIZE ); // NOLINT sizeof(int)
		}
		assert ( pChunk-pData==iNewChunks*CHUNK_SIZE );

		// old block is released only after everything was copied out of it
		SafeDeleteArray ( m_pData );
		m_pData = pData;
		m_iChunks = iNewChunks;
	}

	// fill new stuff
	ARRAY_FOREACH ( i, dRemaps )
	{
		const CSphRemapRange & tRemap = dRemaps[i];

		// destination codes advance in lockstep with the source codes
		DWORD iRemapped = tRemap.m_iRemapStart;
		for ( int j=tRemap.m_iStart; j<=tRemap.m_iEnd; j++, iRemapped++ )
		{
			assert ( m_pChunk [ j >> CHUNK_BITS ] );
			int & iCodepoint = m_pChunk [ j >> CHUNK_BITS ] [ j & CHUNK_MASK ];
			// entry already maps to a real (non synonym-only) codepoint, ie. it is a word part
			bool bWordPart = ( iCodepoint & MASK_CODEPOINT ) && !( iCodepoint & FLAG_CODEPOINT_SYNONYM );
			// new target and new flags, previously set flags are kept
			int iNew = iRemapped | uFlags | ( iCodepoint & MASK_FLAGS );
			// a word part that also becomes special gets marked as dual
			if ( bWordPart && ( iNew & FLAG_CODEPOINT_SPECIAL ) )
				iCodepoint = ( iNew | FLAG_CODEPOINT_DUAL );
			else
				iCodepoint = iNew;

			// new code-point flag removes SYNONYM
			if ( ( iCodepoint & FLAG_CODEPOINT_SYNONYM ) && uFlags==0 && iRemapped!=0 )
				iCodepoint &= ~FLAG_CODEPOINT_SYNONYM;
		}
	}
}
2394 
2395 
AddSpecials(const char * sSpecials)2396 void CSphLowercaser::AddSpecials ( const char * sSpecials )
2397 {
2398 	assert ( sSpecials );
2399 	int iSpecials = strlen(sSpecials);
2400 
2401 	CSphVector<CSphRemapRange> dRemaps;
2402 	dRemaps.Resize ( iSpecials );
2403 	ARRAY_FOREACH ( i, dRemaps )
2404 		dRemaps[i].m_iStart = dRemaps[i].m_iEnd = dRemaps[i].m_iRemapStart = sSpecials[i];
2405 
2406 	AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );
2407 }
2408 
/// assignment; makes this table a deep copy of rhs
/// self-assignment is a no-op (copying would first reset, and thus lose, our own table)
const CSphLowercaser & CSphLowercaser::operator = ( const CSphLowercaser & rhs )
{
	if ( this!=&rhs )
		SetRemap ( &rhs );
	return * this;
}
2414 
GetFNV() const2415 uint64_t CSphLowercaser::GetFNV () const
2416 {
2417 	int iLen = ( sizeof(int) * m_iChunks * CHUNK_SIZE ) / sizeof(BYTE); // NOLINT
2418 	return sphFNV64 ( (BYTE *)m_pData, iLen );
2419 }
2420 
2421 /////////////////////////////////////////////////////////////////////////////
2422 
/// parser to build lowercaser from textual config
/// turns a charset definition string into a list of remap ranges; on failure
/// the message can be fetched with GetLastError()
class CSphCharsetDefinitionParser
{
public:
						CSphCharsetDefinitionParser () : m_bError ( false ) {}
	/// parse sConfig into dRanges (which is reset first); returns false on a syntax error
	bool				Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges );
	/// last error message, or NULL if no error was recorded
	const char *		GetLastError ();

protected:
	bool				m_bError;			///< whether m_sError holds a message
	char				m_sError [ 1024 ];	///< error message buffer, filled by Error()
	const char *		m_pCurrent;			///< parse cursor; only set by Parse(), not by the ctor

	bool				Error ( const char * sMessage );	///< record an error at the cursor; always returns false
	void				SkipSpaces ();						///< advance the cursor over whitespace
	bool				IsEof ();							///< cursor is at the terminating zero
	bool				CheckEof ();						///< like IsEof(), but also records an error when at the end
	int					HexDigit ( int c );					///< value of a hex digit char, 0 for anything else
	int					ParseCharsetCode ();				///< parse one 'U+xxxx' or single-char code; negative on error
	bool				AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges );	///< validate and append a range
};
2444 
2445 
GetLastError()2446 const char * CSphCharsetDefinitionParser::GetLastError ()
2447 {
2448 	return m_bError ? m_sError : NULL;
2449 }
2450 
2451 
IsEof()2452 bool CSphCharsetDefinitionParser::IsEof ()
2453 {
2454 	return ( *m_pCurrent )==0;
2455 }
2456 
2457 
CheckEof()2458 bool CSphCharsetDefinitionParser::CheckEof ()
2459 {
2460 	if ( IsEof() )
2461 	{
2462 		Error ( "unexpected end of line" );
2463 		return true;
2464 	} else
2465 	{
2466 		return false;
2467 	}
2468 }
2469 
2470 
Error(const char * sMessage)2471 bool CSphCharsetDefinitionParser::Error ( const char * sMessage )
2472 {
2473 	char sErrorBuffer[32];
2474 	strncpy ( sErrorBuffer, m_pCurrent, sizeof(sErrorBuffer) );
2475 	sErrorBuffer [ sizeof(sErrorBuffer)-1 ] = '\0';
2476 
2477 	snprintf ( m_sError, sizeof(m_sError), "%s near '%s'",
2478 		sMessage, sErrorBuffer );
2479 	m_sError [ sizeof(m_sError)-1 ] = '\0';
2480 
2481 	m_bError = true;
2482 	return false;
2483 }
2484 
2485 
HexDigit(int c)2486 int CSphCharsetDefinitionParser::HexDigit ( int c )
2487 {
2488 	if ( c>='0' && c<='9' ) return c-'0';
2489 	if ( c>='a' && c<='f' ) return c-'a'+10;
2490 	if ( c>='A' && c<='F' ) return c-'A'+10;
2491 	return 0;
2492 }
2493 
2494 
SkipSpaces()2495 void CSphCharsetDefinitionParser::SkipSpaces ()
2496 {
2497 	while ( ( *m_pCurrent ) && isspace ( (BYTE)*m_pCurrent ) )
2498 		m_pCurrent++;
2499 }
2500 
2501 
ParseCharsetCode()2502 int CSphCharsetDefinitionParser::ParseCharsetCode ()
2503 {
2504 	const char * p = m_pCurrent;
2505 	int iCode = 0;
2506 
2507 	if ( p[0]=='U' && p[1]=='+' )
2508 	{
2509 		p += 2;
2510 		while ( isxdigit(*p) )
2511 		{
2512 			iCode = iCode*16 + HexDigit ( *p++ );
2513 		}
2514 		while ( isspace(*p) )
2515 			p++;
2516 
2517 	} else
2518 	{
2519 		if ( (*(BYTE*)p)<32 || (*(BYTE*)p)>127 )
2520 		{
2521 			Error ( "non-ASCII characters not allowed, use 'U+00AB' syntax" );
2522 			return -1;
2523 		}
2524 
2525 		iCode = *p++;
2526 		while ( isspace(*p) )
2527 			p++;
2528 	}
2529 
2530 	m_pCurrent = p;
2531 	return iCode;
2532 }
2533 
AddRange(const CSphRemapRange & tRange,CSphVector<CSphRemapRange> & dRanges)2534 bool CSphCharsetDefinitionParser::AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges )
2535 {
2536 	if ( tRange.m_iRemapStart>=0x20 )
2537 	{
2538 		dRanges.Add ( tRange );
2539 		return true;
2540 	}
2541 
2542 	CSphString sError;
2543 	sError.SetSprintf ( "dest range (U+%x) below U+20, not allowed", tRange.m_iRemapStart );
2544 	Error ( sError.cstr() );
2545 	return false;
2546 }
2547 
2548 
/// parse a charset definition string into remap ranges
/// @param sConfig	comma-separated list of items, each one of:
///					'A' (single char), 'A->B' (single remap), 'A..B' (identity range),
///					'A..B/2' (checkerboard range), 'A..B->C..D' (remapped range);
///					chars are either printable ASCII or 'U+xxxx' codes
/// @param dRanges	output; reset first, then filled, sorted, and overlapping ranges merged
/// @return true on success; false on error (message available via GetLastError())
bool CSphCharsetDefinitionParser::Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges )
{
	m_pCurrent = sConfig;
	dRanges.Reset ();

	// do parse
	while ( *m_pCurrent )
	{
		SkipSpaces ();
		if ( IsEof () )
			break;

		// check for stray comma
		if ( *m_pCurrent==',' )
			return Error ( "stray ',' not allowed, use 'U+002C' instead" );

		// parse char code
		const char * pStart = m_pCurrent; // kept to point error messages at the item start
		int iStart = ParseCharsetCode();
		if ( iStart<0 )
			return false;

		// stray char?
		if ( !*m_pCurrent || *m_pCurrent==',' )
		{
			// stray char
			if ( !AddRange ( CSphRemapRange ( iStart, iStart, iStart ), dRanges ) )
				return false;

			if ( IsEof () )
				break;
			m_pCurrent++;
			continue;
		}

		// stray remap?
		if ( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' )
		{
			// parse and add
			m_pCurrent += 2;
			int iDest = ParseCharsetCode ();
			if ( iDest<0 )
				return false;
			if ( !AddRange ( CSphRemapRange ( iStart, iStart, iDest ), dRanges ) )
				return false;

			// it's either end of line now, or must be followed by comma
			if ( *m_pCurrent )
				if ( *m_pCurrent++!=',' )
					return Error ( "syntax error" );
			continue;
		}

		// range start?
		if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
			return Error ( "syntax error" );
		m_pCurrent += 2;

		SkipSpaces ();
		if ( CheckEof () )
			return false;

		// parse range end char code
		int iEnd = ParseCharsetCode ();
		if ( iEnd<0 )
			return false;
		if ( iStart>iEnd )
		{
			// rewind so that the error quotes the whole offending item
			m_pCurrent = pStart;
			return Error ( "range end less than range start" );
		}

		// stray range?
		if ( !*m_pCurrent || *m_pCurrent==',' )
		{
			if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iStart ), dRanges ) )
				return false;

			if ( IsEof () )
				break;
			m_pCurrent++;
			continue;
		}

		// "checkerboard" range?
		if ( m_pCurrent[0]=='/' && m_pCurrent[1]=='2' )
		{
			// codes are taken in pairs: the first of each pair maps to the second,
			// the second maps to itself
			for ( int i=iStart; i<iEnd; i+=2 )
			{
				if ( !AddRange ( CSphRemapRange ( i, i, i+1 ), dRanges ) )
					return false;
				if ( !AddRange ( CSphRemapRange ( i+1, i+1, i+1 ), dRanges ) )
					return false;
			}

			// skip "/2", expect ","
			m_pCurrent += 2;
			SkipSpaces ();
			if ( *m_pCurrent )
				if ( *m_pCurrent++!=',' )
					return Error ( "expected end of line or ','" );
			continue;
		}

		// remapped range?
		if (!( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' ))
			return Error ( "expected end of line, ',' or '-><char>'" );
		m_pCurrent += 2;

		SkipSpaces ();
		if ( CheckEof () )
			return false;

		// parse dest start
		const char * pRemapStart = m_pCurrent;
		int iRemapStart = ParseCharsetCode ();
		if ( iRemapStart<0 )
			return false;

		// expect '..'
		if ( CheckEof () )
			return false;
		if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
			return Error ( "expected '..'" );
		m_pCurrent += 2;

		// parse dest end
		int iRemapEnd = ParseCharsetCode ();
		if ( iRemapEnd<0 )
			return false;

		// check dest range
		if ( iRemapStart>iRemapEnd )
		{
			m_pCurrent = pRemapStart;
			return Error ( "dest range end less than dest range start" );
		}

		// check for length mismatch
		if ( ( iRemapEnd-iRemapStart )!=( iEnd-iStart ) )
		{
			m_pCurrent = pStart;
			return Error ( "dest range length must match src range length" );
		}

		// remapped ok
		if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iRemapStart ), dRanges ) )
			return false;

		if ( IsEof () )
			break;
		if ( *m_pCurrent!=',' )
			return Error ( "expected ','" );
		m_pCurrent++;
	}

	// merge neighbouring ranges whose source intervals overlap;
	// the merged range keeps the first range's start and remap start
	dRanges.Sort ();
	for ( int i=0; i<dRanges.GetLength()-1; i++ )
	{
		if ( dRanges[i].m_iEnd>=dRanges[i+1].m_iStart )
		{
			// FIXME! add an ambiguity check
			dRanges[i].m_iEnd = Max ( dRanges[i].m_iEnd, dRanges[i+1].m_iEnd );
			dRanges.Remove ( i+1 );
			i--; // re-check the merged range against its new neighbour
		}
	}

	return true;
}
2719 
2720 //////////////////////////////////////////////////////////////////////////
2721 
sphParseCharset(const char * sCharset,CSphVector<CSphRemapRange> & dRemaps)2722 bool sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps )
2723 {
2724 	CSphCharsetDefinitionParser tParser;
2725 	return tParser.Parse ( sCharset, dRemaps );
2726 }
2727 
2728 /////////////////////////////////////////////////////////////////////////////
2729 
/// construct saved file info with size, both timestamps and CRC32 all zeroed
CSphSavedFile::CSphSavedFile ()
	: m_uSize	( 0 )
	, m_uCTime	( 0 )
	, m_uMTime	( 0 )
	, m_uCRC32	( 0 )
{
}
2737 
2738 
/// default tokenizer settings: SBCS tokenizer type, minimum word length 1, n-gram length 0
CSphTokenizerSettings::CSphTokenizerSettings ()
	: m_iType				( TOKENIZER_SBCS )
	, m_iMinWordLen			( 1 )
	, m_iNgramLen			( 0 )
{
}
2745 
2746 
/// read tokenizer settings from a reader
/// @param tReader		source, positioned at the start of the tokenizer settings
/// @param tSettings	receives the loaded values; left untouched for versions below 9
/// @param uVersion		format version of the data being read
/// @param sWarning		passed through to ReadFileInfo()
///
/// the read order below is the serialized layout and must mirror SaveTokenizerSettings()
void LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings, DWORD uVersion, CSphString & sWarning )
{
	// versions below 9 carry no tokenizer settings
	if ( uVersion<9 )
		return;

	tSettings.m_iType = tReader.GetByte ();
	tSettings.m_sCaseFolding = tReader.GetString ();
	tSettings.m_iMinWordLen = tReader.GetDword ();
	tSettings.m_sSynonymsFile = tReader.GetString ();
	// file info record of the synonyms file follows its name
	ReadFileInfo ( tReader, tSettings.m_sSynonymsFile.cstr (), sWarning );
	tSettings.m_sBoundary = tReader.GetString ();
	tSettings.m_sIgnoreChars = tReader.GetString ();
	tSettings.m_iNgramLen = tReader.GetDword ();
	tSettings.m_sNgramChars = tReader.GetString ();
	// fields added by later format versions
	if ( uVersion>=15 )
		tSettings.m_sBlendChars = tReader.GetString ();
	if ( uVersion>=24 )
		tSettings.m_sBlendMode = tReader.GetString();
}
2766 
2767 
/// write tokenizer settings to a writer
/// @param tWriter		destination
/// @param pTokenizer	tokenizer to take the settings and synonyms file info from, must not be NULL
///
/// always writes the complete (latest) field set; the write order is the
/// serialized layout that LoadTokenizerSettings() reads back
void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer )
{
	assert ( pTokenizer );

	const CSphTokenizerSettings & tSettings = pTokenizer->GetSettings ();
	tWriter.PutByte ( tSettings.m_iType );
	tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
	tWriter.PutDword ( tSettings.m_iMinWordLen );
	tWriter.PutString ( tSettings.m_sSynonymsFile.cstr () );
	// file info record of the synonyms file follows its name
	WriteFileInfo ( tWriter, pTokenizer->GetSynFileInfo () );
	tWriter.PutString ( tSettings.m_sBoundary.cstr () );
	tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
	tWriter.PutDword ( tSettings.m_iNgramLen );
	tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
	tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
	tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
2785 
2786 
/// read dictionary settings from a reader
/// @param tReader		source, positioned at the start of the dictionary settings
/// @param tSettings	receives the loaded values; left untouched for versions below 9
/// @param uVersion		format version of the data being read
/// @param sWarning		passed through to ReadFileInfo()
///
/// the read order below is the serialized layout and must mirror SaveDictionarySettings()
void LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings, DWORD uVersion, CSphString & sWarning )
{
	// versions below 9 carry no dictionary settings
	if ( uVersion<9 )
		return;

	tSettings.m_sMorphology = tReader.GetString ();
	tSettings.m_sStopwords = tReader.GetString ();
	int nFiles = tReader.GetDword ();

	// per-file records of the stopword files: a name followed by its file info;
	// the names are only handed to ReadFileInfo(), they are not stored in tSettings
	CSphString sFile;
	for ( int i = 0; i < nFiles; i++ )
	{
		sFile = tReader.GetString ();
		ReadFileInfo ( tReader, sFile.cstr (), sWarning );
	}

	tSettings.m_sWordforms = tReader.GetString ();
	ReadFileInfo ( tReader, tSettings.m_sWordforms.cstr (), sWarning );

	// for older versions m_iMinStemmingLen keeps whatever value tSettings came in with
	if ( uVersion>=13 )
		tSettings.m_iMinStemmingLen = tReader.GetDword ();

	tSettings.m_bWordDict = false; // default to crc for old indexes
	if ( uVersion>=21 )
		tSettings.m_bWordDict = ( tReader.GetByte()!=0 );
}
2813 
2814 
/// write dictionary settings to a writer
/// @param tWriter			destination
/// @param pDict			dictionary to take the settings and file infos from, must not be NULL
/// @param bForceWordDict	when true, the word-dict flag is written as set regardless of the settings
///
/// always writes the complete (latest) field set; the write order is the
/// serialized layout that LoadDictionarySettings() reads back
void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict )
{
	assert ( pDict );
	const CSphDictSettings & tSettings = pDict->GetSettings ();

	tWriter.PutString ( tSettings.m_sMorphology.cstr () );
	tWriter.PutString ( tSettings.m_sStopwords.cstr () );
	// stopword files: a count, then a name and a file info record per file
	const CSphVector <CSphSavedFile> & dSWFileInfos = pDict->GetStopwordsFileInfos ();
	tWriter.PutDword ( dSWFileInfos.GetLength () );
	ARRAY_FOREACH ( i, dSWFileInfos )
	{
		tWriter.PutString ( dSWFileInfos[i].m_sFilename.cstr () );
		WriteFileInfo ( tWriter, dSWFileInfos[i] );
	}

	const CSphSavedFile & tWFFileInfo = pDict->GetWordformsFileInfo ();

	tWriter.PutString ( tSettings.m_sWordforms.cstr () );
	WriteFileInfo ( tWriter, tWFFileInfo );

	tWriter.PutDword ( tSettings.m_iMinStemmingLen );
	tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
}
2838 
2839 
ShortTokenFilter(BYTE * pToken,int iLen)2840 static inline bool ShortTokenFilter ( BYTE * pToken, int iLen )
2841 {
2842 	return pToken[0]=='*' || ( iLen > 0 && pToken[iLen-1]=='*' );
2843 }
2844 
2845 /////////////////////////////////////////////////////////////////////////////
2846 
/// construct the tokenizer base with every state flag cleared, except for
/// m_bNonBlended (true) and m_uBlendVariants (BLEND_TRIM_NONE)
ISphTokenizer::ISphTokenizer ()
	: m_iLastTokenLen ( 0 )
	, m_bTokenBoundary ( false )
	, m_bBoundary ( false )
	, m_bWasSpecial ( false )
	, m_bWasSynonym ( false )
	, m_bEscaped ( false )
	, m_iOvershortCount ( 0 )
	, m_bBlended ( false )
	, m_bNonBlended ( true )
	, m_bBlendedPart ( false )
	, m_bBlendAdd ( false )
	, m_uBlendVariants ( BLEND_TRIM_NONE )
	, m_uBlendVariantsPending ( 0 )
	, m_bBlendSkipPure ( false )
	, m_bShortTokenFilter ( false )
	, m_bQueryMode ( false )
	, m_bDetectSentences ( false )
	, m_bPhrase ( false )
{}
2867 
2868 
SetCaseFolding(const char * sConfig,CSphString & sError)2869 bool ISphTokenizer::SetCaseFolding ( const char * sConfig, CSphString & sError )
2870 {
2871 	CSphVector<CSphRemapRange> dRemaps;
2872 	CSphCharsetDefinitionParser tParser;
2873 	if ( !tParser.Parse ( sConfig, dRemaps ) )
2874 	{
2875 		sError = tParser.GetLastError();
2876 		return false;
2877 	}
2878 
2879 	const int MIN_CODE = 0x21;
2880 	ARRAY_FOREACH ( i, dRemaps )
2881 	{
2882 		CSphRemapRange & tMap = dRemaps[i];
2883 
2884 		if ( tMap.m_iStart<MIN_CODE || tMap.m_iStart>=m_tLC.MAX_CODE )
2885 		{
2886 			sphWarning ( "wrong character mapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED", tMap.m_iStart, MIN_CODE, m_tLC.MAX_CODE-1 );
2887 			tMap.m_iStart = Min ( Max ( tMap.m_iStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
2888 		}
2889 
2890 		if ( tMap.m_iEnd<MIN_CODE || tMap.m_iEnd>=m_tLC.MAX_CODE )
2891 		{
2892 			sphWarning ( "wrong character mapping end specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED", tMap.m_iEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
2893 			tMap.m_iEnd = Min ( Max ( tMap.m_iEnd, MIN_CODE ), m_tLC.MAX_CODE-1 );
2894 		}
2895 
2896 		if ( tMap.m_iRemapStart<MIN_CODE || tMap.m_iRemapStart>=m_tLC.MAX_CODE )
2897 		{
2898 			sphWarning ( "wrong character remapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED", tMap.m_iRemapStart, MIN_CODE, m_tLC.MAX_CODE-1 );
2899 			tMap.m_iRemapStart = Min ( Max ( tMap.m_iRemapStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
2900 		}
2901 
2902 		int iRemapEnd = tMap.m_iRemapStart+tMap.m_iEnd-tMap.m_iStart;
2903 		if ( iRemapEnd<MIN_CODE || iRemapEnd>=m_tLC.MAX_CODE )
2904 		{
2905 			sphWarning ( "wrong character remapping end specified: U+%x, should be between U+%x and U+%x (inclusive); IGNORED", iRemapEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
2906 			dRemaps.Remove(i);
2907 			i--;
2908 		}
2909 	}
2910 
2911 	m_tLC.Reset ();
2912 	m_tLC.AddRemaps ( dRemaps, 0 );
2913 	return true;
2914 }
2915 
2916 
AddCaseFolding(CSphRemapRange & tRange)2917 void ISphTokenizer::AddCaseFolding ( CSphRemapRange & tRange )
2918 {
2919 	CSphVector<CSphRemapRange> dTmp;
2920 	dTmp.Add ( tRange );
2921 	m_tLC.AddRemaps ( dTmp, 0 );
2922 }
2923 
2924 
/// Forwards the given NUL-terminated list of special characters
/// to the lowercaser table (m_tLC).
void ISphTokenizer::AddSpecials ( const char * sSpecials )
{
	m_tLC.AddSpecials ( sSpecials );
}
2929 
2930 
TokenizeOnWhitespace(CSphVector<CSphString> & dTokens,BYTE * sFrom,bool bUtf8)2931 static int TokenizeOnWhitespace ( CSphVector<CSphString> & dTokens, BYTE * sFrom, bool bUtf8 )
2932 {
2933 	BYTE sAccum [ 3*SPH_MAX_WORD_LEN+16 ];
2934 	BYTE * pAccum = sAccum;
2935 	int iAccum = 0;
2936 
2937 	for ( ;; )
2938 	{
2939 		int iCode = bUtf8 ? sphUTF8Decode(sFrom) : *sFrom++;
2940 
2941 		// eof or whitespace?
2942 		if ( !iCode || sphIsSpace(iCode) )
2943 		{
2944 			// flush accum
2945 			if ( iAccum )
2946 			{
2947 				*pAccum = '\0';
2948 				dTokens.Add ( (char*)sAccum );
2949 
2950 				pAccum = sAccum;
2951 				iAccum = 0;
2952 			}
2953 
2954 			// break on eof
2955 			if ( !iCode )
2956 				break;
2957 		} else
2958 		{
2959 			// accumulate everything else
2960 			if ( iAccum<SPH_MAX_WORD_LEN )
2961 			{
2962 				if ( bUtf8 && ( pAccum-sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(sAccum) ) )
2963 				{
2964 					pAccum += sphUTF8Encode ( pAccum, iCode );
2965 					iAccum++;
2966 				} else
2967 				{
2968 					*pAccum++ = BYTE(iCode);
2969 					iAccum++;
2970 				}
2971 			}
2972 		}
2973 	}
2974 
2975 	return dTokens.GetLength();
2976 }
2977 
2978 
sphTrim(BYTE * s)2979 static BYTE * sphTrim ( BYTE * s )
2980 {
2981 	// skip to first non-whitespace from start
2982 	while ( *s && sphIsSpace(*s) )
2983 		s++;
2984 	if ( !*s )
2985 		return s;
2986 
2987 	// find the end
2988 	BYTE * sEnd = s;
2989 	while ( *sEnd )
2990 		sEnd++;
2991 	sEnd--;
2992 
2993 	// skip to first non-whitespace from end
2994 	while ( sEnd>s && sphIsSpace(*sEnd) )
2995 		sEnd--;
2996 
2997 	*++sEnd = '\0';
2998 	return s;
2999 }
3000 
3001 
Setup(const CSphTokenizerSettings & tSettings)3002 void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings )
3003 {
3004 	m_tSettings = tSettings;
3005 	m_tSettings.m_iMinWordLen = Max ( tSettings.m_iMinWordLen, 1 );
3006 }
3007 
3008 
Create(const CSphTokenizerSettings & tSettings,CSphString & sError)3009 ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, CSphString & sError )
3010 {
3011 	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );
3012 
3013 	switch ( tSettings.m_iType )
3014 	{
3015 		case TOKENIZER_SBCS:	pTokenizer = sphCreateSBCSTokenizer (); break;
3016 		case TOKENIZER_UTF8:	pTokenizer = sphCreateUTF8Tokenizer (); break;
3017 		case TOKENIZER_NGRAM:	pTokenizer = sphCreateUTF8NgramTokenizer (); break;
3018 		default:
3019 			sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
3020 			return NULL;
3021 	}
3022 
3023 	pTokenizer->Setup ( tSettings );
3024 
3025 	if ( !tSettings.m_sCaseFolding.IsEmpty () && !pTokenizer->SetCaseFolding ( tSettings.m_sCaseFolding.cstr (), sError ) )
3026 	{
3027 		sError.SetSprintf ( "'charset_table': %s", sError.cstr() );
3028 		return NULL;
3029 	}
3030 
3031 	if ( !tSettings.m_sSynonymsFile.IsEmpty () && !pTokenizer->LoadSynonyms ( tSettings.m_sSynonymsFile.cstr (), sError ) )
3032 	{
3033 		sError.SetSprintf ( "'synonyms': %s", sError.cstr() );
3034 		return NULL;
3035 	}
3036 
3037 	if ( !tSettings.m_sBoundary.IsEmpty () && !pTokenizer->SetBoundary ( tSettings.m_sBoundary.cstr (), sError ) )
3038 	{
3039 		sError.SetSprintf ( "'phrase_boundary': %s", sError.cstr() );
3040 		return NULL;
3041 	}
3042 
3043 	if ( !tSettings.m_sIgnoreChars.IsEmpty () && !pTokenizer->SetIgnoreChars ( tSettings.m_sIgnoreChars.cstr (), sError ) )
3044 	{
3045 		sError.SetSprintf ( "'ignore_chars': %s", sError.cstr() );
3046 		return NULL;
3047 	}
3048 
3049 	if ( !tSettings.m_sBlendChars.IsEmpty () && !pTokenizer->SetBlendChars ( tSettings.m_sBlendChars.cstr (), sError ) )
3050 	{
3051 		sError.SetSprintf ( "'blend_chars': %s", sError.cstr() );
3052 		return NULL;
3053 	}
3054 
3055 	if ( !pTokenizer->SetBlendMode ( tSettings.m_sBlendMode.cstr (), sError ) )
3056 	{
3057 		sError.SetSprintf ( "'blend_mode': %s", sError.cstr() );
3058 		return NULL;
3059 	}
3060 
3061 	pTokenizer->SetNgramLen ( tSettings.m_iNgramLen );
3062 
3063 	if ( !tSettings.m_sNgramChars.IsEmpty () && !pTokenizer->SetNgramChars ( tSettings.m_sNgramChars.cstr (), sError ) )
3064 	{
3065 		sError.SetSprintf ( "'ngram_chars': %s", sError.cstr() );
3066 		return NULL;
3067 	}
3068 
3069 	return pTokenizer.LeakPtr ();
3070 }
3071 
3072 
CreateTokenFilter(ISphTokenizer * pTokenizer,const CSphMultiformContainer * pContainer)3073 ISphTokenizer * ISphTokenizer::CreateTokenFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
3074 {
3075 	if ( !pContainer )
3076 		return NULL;
3077 
3078 	return new CSphTokenizer_Filter ( pTokenizer, pContainer );
3079 }
3080 
3081 
AddSpecialsSPZ(const char * sSpecials,const char * sDirective,CSphString & sError)3082 bool ISphTokenizer::AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError )
3083 {
3084 	for ( int i=0; sSpecials[i]; i++ )
3085 	{
3086 		int iCode = m_tLC.ToLower ( sSpecials[i] );
3087 		if ( iCode & ( FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_BOUNDARY | FLAG_CODEPOINT_IGNORE ) )
3088 		{
3089 			sError.SetSprintf ( "%s requires that character '%c' is not in ngram_chars, phrase_boundary, or ignore_chars",
3090 				sDirective, sSpecials[i] );
3091 			return false;
3092 		}
3093 	}
3094 
3095 	AddSpecials ( sSpecials );
3096 	return true;
3097 }
3098 
3099 
EnableSentenceIndexing(CSphString & sError)3100 bool ISphTokenizer::EnableSentenceIndexing ( CSphString & sError )
3101 {
3102 	const char sSpecials[] = { '.', '?', '!', MAGIC_CODE_PARAGRAPH, 0 };
3103 
3104 	if ( !AddSpecialsSPZ ( sSpecials, "index_sp", sError ) )
3105 		return false;
3106 
3107 	m_bDetectSentences = true;
3108 	return true;
3109 }
3110 
3111 
EnableZoneIndexing(CSphString & sError)3112 bool ISphTokenizer::EnableZoneIndexing ( CSphString & sError )
3113 {
3114 	static const char sSpecials[] = { MAGIC_CODE_ZONE, 0 };
3115 	return AddSpecialsSPZ ( sSpecials, "index_zones", sError );
3116 }
3117 
3118 //////////////////////////////////////////////////////////////////////////
3119 
/// Starts with no input buffer attached (all buffer, token, and blend pointers
/// are NULL) and with an empty accumulator.
template < bool IS_UTF8 >
CSphTokenizerTraits<IS_UTF8>::CSphTokenizerTraits ()
	: m_pBuffer		( NULL )
	, m_pBufferMax	( NULL )
	, m_pCur		( NULL )
	, m_pTokenStart ( NULL )
	, m_pTokenEnd	( NULL )
	, m_iAccum		( 0 )
	, m_pBlendStart		( NULL )
	, m_pBlendEnd		( NULL )
{
	// accumulator write pointer starts at the beginning of the accumulator buffer
	m_pAccum = m_sAccum;
}
3133 
3134 
3135 template < bool IS_UTF8 >
SetCaseFolding(const char * sConfig,CSphString & sError)3136 bool CSphTokenizerTraits<IS_UTF8>::SetCaseFolding ( const char * sConfig, CSphString & sError )
3137 {
3138 	if ( m_dSynonyms.GetLength() )
3139 	{
3140 		sError = "SetCaseFolding() must not be called after LoadSynonyms()";
3141 		return false;
3142 	}
3143 	return ISphTokenizer::SetCaseFolding ( sConfig, sError );
3144 }
3145 
3146 
3147 template < bool IS_UTF8 >
LoadSynonyms(const char * sFilename,CSphString & sError)3148 bool CSphTokenizerTraits<IS_UTF8>::LoadSynonyms ( const char * sFilename, CSphString & sError )
3149 {
3150 	m_dSynonyms.Reset ();
3151 
3152 	if ( !sFilename || !*sFilename )
3153 		return true;
3154 
3155 	GetFileStats ( sFilename, m_tSynFileInfo, NULL );
3156 
3157 	FILE * fp = fopen ( sFilename, "r" );
3158 	if ( !fp )
3159 	{
3160 		sError.SetSprintf ( "failed to open '%s'", sFilename );
3161 		return false;
3162 	}
3163 
3164 	int iLine = 0;
3165 	char sBuffer[1024];
3166 
3167 	CSphOrderedHash < int, int, IdentityHash_fn, 4096 > hSynonymOnly;
3168 	CSphVector<CSphString> dFrom;
3169 
3170 	bool bOK = false;
3171 	for ( ;; )
3172 	{
3173 		char * sGot = fgets ( sBuffer, sizeof(sBuffer), fp );
3174 		if ( !sGot )
3175 		{
3176 			if ( feof(fp) )
3177 				bOK = true;
3178 			break;
3179 		}
3180 
3181 		iLine++;
3182 		dFrom.Resize ( 0 );
3183 
3184 		// extract map-from and map-to parts
3185 		char * sSplit = strstr ( sBuffer, "=>" );
3186 		if ( !sSplit )
3187 		{
3188 			sError.SetSprintf ( "%s line %d: mapping token (=>) not found", sFilename, iLine );
3189 			break;
3190 		}
3191 
3192 		BYTE * sFrom = (BYTE *) sBuffer;
3193 		BYTE * sTo = (BYTE *)( sSplit + strlen ( "=>" ) );
3194 		*sSplit = '\0';
3195 
3196 		// tokenize map-from
3197 		if ( !TokenizeOnWhitespace ( dFrom, sFrom, IsUtf8() ) )
3198 		{
3199 			sError.SetSprintf ( "%s line %d: empty map-from part", sFilename, iLine );
3200 			break;
3201 		}
3202 
3203 		// trim map-to
3204 		sTo = sphTrim ( sTo );
3205 		if ( !*sTo )
3206 		{
3207 			sError.SetSprintf ( "%s line %d: empty map-to part", sFilename, iLine );
3208 			break;
3209 		}
3210 
3211 		// check lengths
3212 		ARRAY_FOREACH ( i, dFrom )
3213 		{
3214 			int iFromLen = IsUtf8() ? sphUTF8Len ( dFrom[i].cstr() ) : strlen ( dFrom[i].cstr() );
3215 			if ( iFromLen>SPH_MAX_WORD_LEN )
3216 			{
3217 				sError.SetSprintf ( "%s line %d: map-from token too long (over %d bytes)", sFilename, iLine, SPH_MAX_WORD_LEN );
3218 				break;
3219 			}
3220 		}
3221 
3222 		int iToLen = IsUtf8() ? sphUTF8Len ( (const char*)sTo ) : strlen ( (const char*)sTo );
3223 		if ( iToLen>SPH_MAX_WORD_LEN )
3224 		{
3225 			sError.SetSprintf ( "%s line %d: map-to token too long (over %d bytes)", sFilename, iLine, SPH_MAX_WORD_LEN );
3226 			break;
3227 		}
3228 
3229 		// pack and store it
3230 		int iFromLen = 1;
3231 		ARRAY_FOREACH ( i, dFrom )
3232 			iFromLen += strlen ( dFrom[i].cstr() ) + 1;
3233 
3234 		if ( iFromLen>MAX_SYNONYM_LEN )
3235 		{
3236 			sError.SetSprintf ( "%s line %d: map-from part too long (over %d bytes)", sFilename, iLine, MAX_SYNONYM_LEN );
3237 			break;
3238 		}
3239 
3240 		CSphSynonym & tSyn = m_dSynonyms.Add ();
3241 		tSyn.m_sFrom.Reserve ( iFromLen );
3242 		tSyn.m_iFromLen = iFromLen;
3243 		tSyn.m_sTo = (char*)sTo;
3244 		tSyn.m_iToLen = iToLen;
3245 
3246 		char * sCur = const_cast<char*> ( tSyn.m_sFrom.cstr() );
3247 		ARRAY_FOREACH ( i, dFrom )
3248 		{
3249 			int iLen = strlen ( dFrom[i].cstr() );
3250 			memcpy ( sCur, dFrom[i].cstr(), iLen );
3251 
3252 			sCur[iLen] = MAGIC_SYNONYM_WHITESPACE;
3253 			sCur += iLen+1;
3254 		}
3255 		*sCur++ = '\0';
3256 		assert ( sCur-tSyn.m_sFrom.cstr()==iFromLen );
3257 
3258 		// track synonym-only codepoints in map-from
3259 		for ( ;; )
3260 		{
3261 			int iCode = IsUtf8() ? sphUTF8Decode(sFrom) : *sFrom++;
3262 			if ( !iCode )
3263 				break;
3264 			if ( iCode>0 && !sphIsSpace(iCode) && !m_tLC.ToLower(iCode) )
3265 				hSynonymOnly.Add ( 1, iCode );
3266 		}
3267 	}
3268 	fclose ( fp );
3269 
3270 	if ( !bOK )
3271 	{
3272 		m_dSynonyms.Reset ();
3273 		return false;
3274 	}
3275 
3276 	// sort the list
3277 	m_dSynonyms.Sort ();
3278 
3279 	// build simple lookup table
3280 	m_dSynStart.Resize ( 256 );
3281 	m_dSynEnd.Resize ( 256 );
3282 	for ( int i=0; i<256; i++ )
3283 	{
3284 		m_dSynStart[i] = INT_MAX;
3285 		m_dSynEnd[i] = -INT_MAX;
3286 	}
3287 	ARRAY_FOREACH ( i, m_dSynonyms )
3288 	{
3289 		int iCh = *(BYTE*)( m_dSynonyms[i].m_sFrom.cstr() );
3290 		m_dSynStart[iCh] = Min ( m_dSynStart[iCh], i );
3291 		m_dSynEnd[iCh] = Max ( m_dSynEnd[iCh], i );
3292 	}
3293 
3294 	// add synonym-only remaps
3295 	CSphVector<CSphRemapRange> dRemaps;
3296 	dRemaps.Reserve ( hSynonymOnly.GetLength() );
3297 
3298 	hSynonymOnly.IterateStart ();
3299 	while ( hSynonymOnly.IterateNext() )
3300 	{
3301 		CSphRemapRange & tRange = dRemaps.Add ();
3302 		tRange.m_iStart = tRange.m_iEnd = tRange.m_iRemapStart = hSynonymOnly.IterateGetKey();
3303 	}
3304 
3305 	m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_SYNONYM );
3306 	return true;
3307 }
3308 
3309 
3310 template < bool IS_UTF8 >
CloneBase(const CSphTokenizerTraits<IS_UTF8> * pFrom,bool bEscaped)3311 void CSphTokenizerTraits<IS_UTF8>::CloneBase ( const CSphTokenizerTraits<IS_UTF8> * pFrom, bool bEscaped )
3312 {
3313 	m_tLC = pFrom->m_tLC;
3314 	m_dSynonyms = pFrom->m_dSynonyms;
3315 	m_dSynStart = pFrom->m_dSynStart;
3316 	m_dSynEnd = pFrom->m_dSynEnd;
3317 	m_tSettings = pFrom->m_tSettings;
3318 	m_bEscaped = bEscaped;
3319 	m_uBlendVariants = pFrom->m_uBlendVariants;
3320 	m_bBlendSkipPure = pFrom->m_bBlendSkipPure;
3321 
3322 	if ( bEscaped )
3323 	{
3324 		CSphVector<CSphRemapRange> dRemaps;
3325 		CSphRemapRange Range;
3326 		Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\';
3327 		dRemaps.Add ( Range );
3328 		m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );
3329 	}
3330 }
3331 
3332 
3333 template < bool IS_UTF8 >
SetBufferPtr(const char * sNewPtr)3334 void CSphTokenizerTraits<IS_UTF8>::SetBufferPtr ( const char * sNewPtr )
3335 {
3336 	assert ( (BYTE*)sNewPtr>=m_pBuffer && (BYTE*)sNewPtr<=m_pBufferMax );
3337 	m_pCur = Min ( m_pBufferMax, Max ( m_pBuffer, (BYTE*)sNewPtr ) );
3338 	m_iAccum = 0;
3339 	m_pAccum = m_sAccum;
3340 	m_pTokenStart = m_pTokenEnd = NULL;
3341 	m_pBlendStart = m_pBlendEnd = NULL;
3342 }
3343 
3344 
3345 template < bool IS_UTF8 >
SkipBlended()3346 int CSphTokenizerTraits<IS_UTF8>::SkipBlended()
3347 {
3348 	if ( !m_pBlendEnd )
3349 		return 0;
3350 
3351 	bool bQuery = m_bQueryMode;
3352 	BYTE * pMax = m_pBufferMax;
3353 
3354 	m_bQueryMode = false;
3355 	m_pBufferMax = m_pBlendEnd;
3356 
3357 	int iBlended = 0;
3358 	while ( GetToken() )
3359 		iBlended++;
3360 
3361 	m_bQueryMode = bQuery;
3362 	m_pBufferMax = pMax;
3363 
3364 	return iBlended;
3365 }
3366 
3367 
/// adjusts blending magic when we're about to return a token (any token)
/// returns false if current token should be skipped, true otherwise
///
/// pCur is compared against, and may get stored as, the blended token end
/// (m_pBlendEnd). This updates m_bBlended, m_bNonBlended, m_bBlendedPart,
/// m_pBlendStart, m_pBlendEnd, and possibly rewinds m_pCur to m_pBlendStart.
template < bool IS_UTF8 >
bool CSphTokenizerTraits<IS_UTF8>::BlendAdjust ( BYTE * pCur )
{
	// check if all we got is a bunch of blended characters (pure-blended case)
	if ( m_bBlended && !m_bNonBlended )
	{
		// we either skip this token, or pretend it was normal
		// in both cases, clear the flag
		m_bBlended = false;

		// do we need to skip it?
		if ( m_bBlendSkipPure )
		{
			m_pBlendStart = NULL;
			return false;
		}
	}
	// reset for the next token (cleared unconditionally once we get here)
	m_bNonBlended = false;

	// adjust buffer pointers
	if ( m_bBlended && m_pBlendStart )
	{
		// called once per blended token, on processing start
		// at this point, full blended token is in the accumulator
		// and we're about to return it
		// rewind to the blended token start, and remember where it ends
		m_pCur = m_pBlendStart;
		m_pBlendEnd = pCur;
		m_pBlendStart = NULL;
		m_bBlendedPart = true;
	} else if ( pCur>=m_pBlendEnd )
	{
		// tricky bit, as at this point, token we're about to return
		// can either be a blended subtoken, or the next one
		m_bBlendedPart = ( m_pTokenStart!=NULL ) && ( m_pTokenStart<m_pBlendEnd );
		m_pBlendEnd = NULL;
		m_pBlendStart = NULL;
	} else if ( !m_pBlendEnd )
	{
		// we aren't re-parsing blended; so clear the "blended subtoken" flag
		// NOTE(review): with m_pBlendEnd being NULL, the pCur>=m_pBlendEnd check above
		// looks like it matches first, so this branch might be unreachable; confirm
		m_bBlendedPart = false;
	}
	return true;
}
3413 
3414 
CopySubstring(BYTE * pDst,const BYTE * pSrc,int iLen)3415 static inline void CopySubstring ( BYTE * pDst, const BYTE * pSrc, int iLen )
3416 {
3417 	while ( iLen-->0 && *pSrc )
3418 		*pDst++ = *pSrc++;
3419 	*pDst++ = '\0';
3420 }
3421 
3422 
/// Returns the next variant of the current blended token, or NULL if there are none (left).
///
/// Gets called in two situations. Right after a new blended token was accumulated
/// (m_bBlended is set, m_bBlendAdd is not), it computes the set of pending variants
/// from m_uBlendVariants, and stashes the original token in m_sAccumBlend.
/// After that, it gets called repeatedly to fetch the pending variants one by one.
/// Every variant is returned in m_sAccum; m_bBlended is set again for each of them.
template < bool IS_UTF8 >
BYTE * CSphTokenizerTraits<IS_UTF8>::GetBlendedVariant ()
{
	// we can get called on several occasions
	// case 1, a new blended token was just accumulated
	if ( m_bBlended && !m_bBlendAdd )
	{
		// fast path for the default case (trim_none)
		if ( m_uBlendVariants==BLEND_TRIM_NONE )
			return m_sAccum;

		// analyze the full token, find non-blended bounds
		// start is the byte offset of the first non-blended character
		// end is the byte offset right after the last non-blended character
		// both stay negative if all the characters are blended
		m_iBlendNormalStart = -1;
		m_iBlendNormalEnd = -1;

		// OPTIMIZE? we can skip this based on non-blended flag from adjust
		BYTE * p = m_sAccum;
		while ( *p )
		{
			int iLast = (int)( p-m_sAccum );
			int iCode = IS_UTF8
				? sphUTF8Decode ( p )
				: *p++;
			if (!( m_tLC.ToLower ( iCode ) & FLAG_CODEPOINT_BLEND ))
			{
				m_iBlendNormalEnd = (int)( p-m_sAccum );
				if ( m_iBlendNormalStart<0 )
					m_iBlendNormalStart = iLast;
			}
		}

		// build todo mask
		// check and revert a few degenerate cases
		m_uBlendVariantsPending = m_uBlendVariants;
		if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
		{
			if ( m_iBlendNormalStart<0 )
			{
				// no heading blended; revert BOTH to TAIL
				m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
				m_uBlendVariantsPending |= BLEND_TRIM_TAIL;
			} else if ( m_iBlendNormalEnd<0 )
			{
				// no trailing blended; revert BOTH to HEAD
				m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
				m_uBlendVariantsPending |= BLEND_TRIM_HEAD;
			}
		}
		if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
		{
			// either no heading blended, or pure blended; revert HEAD to NONE
			if ( m_iBlendNormalStart<=0 )
			{
				m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
				m_uBlendVariantsPending |= BLEND_TRIM_NONE;
			}
		}
		if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
		{
			// either no trailing blended, or pure blended; revert TAIL to NONE
			if ( m_iBlendNormalEnd<=0 || m_sAccum[m_iBlendNormalEnd]==0 )
			{
				m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
				m_uBlendVariantsPending |= BLEND_TRIM_NONE;
			}
		}

		// ok, we are going to return a few variants after all, flag that
		// OPTIMIZE? add fast path for "single" variants?
		m_bBlendAdd = true;
		assert ( m_uBlendVariantsPending );

		// we also have to stash the original blended token
		// because accumulator contents may get trashed by caller (say, when stemming)
		strncpy ( (char*)m_sAccumBlend, (char*)m_sAccum, sizeof(m_sAccumBlend) );
	}

	// case 2, caller is checking for pending variants, have we even got any?
	if ( !m_bBlendAdd )
		return NULL;

	// handle trim_none
	// this MUST be the first handler, so that we could avoid copying below, and just return the original accumulator
	if ( m_uBlendVariantsPending & BLEND_TRIM_NONE )
	{
		m_uBlendVariantsPending &= ~BLEND_TRIM_NONE;
		m_bBlended = true;
		return m_sAccum;
	}

	// handle trim_both
	// that is the stashed token from the first to the last non-blended character
	if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
	{
		m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
		if ( m_iBlendNormalStart<0 )
			m_uBlendVariantsPending |= BLEND_TRIM_TAIL; // no heading blended; revert BOTH to TAIL
		else if ( m_iBlendNormalEnd<0 )
			m_uBlendVariantsPending |= BLEND_TRIM_HEAD; // no trailing blended; revert BOTH to HEAD
		else
		{
			assert ( m_iBlendNormalStart<m_iBlendNormalEnd );
			CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, m_iBlendNormalEnd-m_iBlendNormalStart );
			m_bBlended = true;
			return m_sAccum;
		}
	}

	// handle TRIM_HEAD
	// that is the stashed token from the first non-blended character on
	if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
	{
		m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
		if ( m_iBlendNormalStart>=0 )
		{
			// FIXME! need we check for overshorts?
			CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, sizeof(m_sAccum) );
			m_bBlended = true;
			return m_sAccum;
		}
	}

	// handle TRIM_TAIL
	// that is the stashed token up to and including the last non-blended character
	if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
	{
		m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
		if ( m_iBlendNormalEnd>0 )
		{
			// FIXME! need we check for overshorts?
			CopySubstring ( m_sAccum, m_sAccumBlend, m_iBlendNormalEnd );
			m_bBlended = true;
			return m_sAccum;
		}
	}

	// all clear, no more variants to go
	m_bBlendAdd = false;
	return NULL;
}
3560 
3561 
/// Tells whether the given code is an uppercase ASCII Latin letter ('A' to 'Z').
static inline bool IsCapital ( int iCh )
{
	if ( iCh<'A' )
		return false;
	return iCh<='Z';
}
3566 
3567 
IsWhitespace(BYTE c)3568 static inline bool IsWhitespace ( BYTE c )
3569 {
3570 	return ( c=='\0' || c==' ' || c=='\t' || c=='\r' || c=='\n' );
3571 }
3572 
3573 
/// Tells whether the given code is a zero, space, tab, CR, or LF.
/// Note that zero counts as whitespace here.
static inline bool IsWhitespace ( int c )
{
	switch ( c )
	{
		case '\0':
		case ' ':
		case '\t':
		case '\r':
		case '\n':
			return true;

		default:
			return false;
	}
}
3578 
3579 
IsBoundary(BYTE c,bool bPhrase)3580 static inline bool IsBoundary ( BYTE c, bool bPhrase )
3581 {
3582 	// FIXME? sorta intersects with specials
3583 	// then again, a shortened-down list (more strict syntax) is reasonble here too
3584 	return IsWhitespace(c) || c=='"' || ( !bPhrase && ( c=='(' || c==')' || c=='|' ) );
3585 }
3586 
3587 
/// Decides how an already folded codepoint should be treated in its current context,
/// and returns the adjusted code (which can be 0).
///
/// iCode is the folded code along with its flag bits. bWasEscaped makes a special
/// act as non-special (and a blended-and-special one act as blended) in query mode.
/// uNextByte is only used by the in-word dollar check; presumably it is the byte
/// that follows the codepoint in the input (verify against the callers).
///
/// When not in query mode, only sentence boundary detection is performed (and only
/// if m_bDetectSentences is set); otherwise the code is passed through as is.
/// In query mode, conflicts between the blended, special, and dual flags get resolved.
template < bool IS_UTF8 >
int CSphTokenizerTraits<IS_UTF8>::CodepointArbitration ( int iCode, bool bWasEscaped, BYTE uNextByte )
{
	/////////////////////////////
	// indexing time arbitration
	/////////////////////////////

	if ( !m_bQueryMode )
	{
		int iSymbol = iCode & MASK_CODEPOINT;

		// detect sentence boundaries
		// FIXME! should use charset_table (or add a new directive) and support languages other than English
		if ( m_bDetectSentences )
		{
			if ( iSymbol=='?' || iSymbol=='!' )
			{
				// definitely a sentence boundary
				return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
			}

			if ( iSymbol=='.' )
			{
				// the checks below peek at raw buffer bytes around m_pCur
				// NOTE(review): looks like m_pCur points right after the dot here; confirm

				// inline dot ("in the U.K and"), not a boundary
				bool bInwordDot = ( sphIsAlpha ( m_pCur[0] ) || m_pCur[0]==',' );

				// followed by a small letter or an opening paren, not a boundary
				// FIXME? might want to scan for more than one space
				// Yoyodine Inc. exists ...
				// Yoyodine Inc. (the company) ..
				bool bInphraseDot = ( sphIsSpace ( m_pCur[0] )
					&& ( ( 'a'<=m_pCur[1] && m_pCur[1]<='z' )
						|| ( m_pCur[1]=='(' && 'a'<=m_pCur[2] && m_pCur[2]<='z' ) ) );

				// preceded by something that looks like a middle name, opening first name, salutation
				// the number of characters accumulated so far tells how long the preceding token is
				bool bMiddleName = false;
				switch ( m_iAccum )
				{
					case 1:
						// 1-char capital letter
						// example: J. R. R. Tolkien, who wrote Hobbit ...
						// example: John D. Doe ...
						bMiddleName = IsCapital ( m_pCur[-2] );
						break;
					case 2:
						// 2-char token starting with a capital
						if ( IsCapital ( m_pCur[-3] ) )
						{
							// capital+small
							// example: Known as Mr. Doe ...
							if ( !IsCapital ( m_pCur[-2] ) )
								bMiddleName = true;

							// known capital+capital (MR, DR, MS)
							if (
								( m_pCur[-3]=='M' && m_pCur[-2]=='R' ) ||
								( m_pCur[-3]=='M' && m_pCur[-2]=='S' ) ||
								( m_pCur[-3]=='D' && m_pCur[-2]=='R' ) )
									bMiddleName = true;
						}
						break;
					case 3:
						// preceded by a known 3-byte token (MRS, DRS)
						// example: Survived by Mrs. Doe ...
						// this one checks the accumulator, not the raw buffer, hence the lowercase
						if ( ( m_sAccum[0]=='m' || m_sAccum[0]=='d' ) && m_sAccum[1]=='r' && m_sAccum[2]=='s' )
							bMiddleName = true;
						break;
				}

				if ( !bInwordDot && !bInphraseDot && !bMiddleName )
				{
					// sentence boundary
					return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
				} else
				{
					// just a character
					if ( ( iCode & MASK_FLAGS )==FLAG_CODEPOINT_SPECIAL )
						return 0; // special only, not dual? then in this context, it is a separator
					else
						return iCode & ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL ); // perhaps it was blended, so return the original code
				}
			}
		}

		// pass-through
		return iCode;
	}

	//////////////////////////
	// query time arbitration
	//////////////////////////

	if ( iCode & FLAG_CODEPOINT_NGRAM )
		return iCode; // ngrams are handled elsewhere

	int iSymbol = iCode & MASK_CODEPOINT;

	// codepoints can't be blended and special at the same time
	if ( ( iCode & FLAG_CODEPOINT_BLEND ) && ( iCode & FLAG_CODEPOINT_SPECIAL ) )
	{
		bool bBlend =
			bWasEscaped || // escaped characters should always act as blended
			( m_bPhrase && !sphIsModifier ( iSymbol ) && iSymbol!='"' ) || // non-modifier special inside phrase
			( m_iAccum && ( iSymbol=='@' || iSymbol=='/' || iSymbol=='-' ) ); // some specials in the middle of a token

		// clear special or blend flags
		iCode &= bBlend
			? ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_SPECIAL )
			: ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_BLEND );
	}

	// escaped specials are not special
	// dash and dollar inside the word are not special (however, single opening modifier is not a word!)
	// non-modifier specials within phrase are not special
	bool bDashInside = ( m_iAccum && iSymbol=='-' && !( m_iAccum==1 && sphIsModifier ( m_sAccum[0] ) ));
	if ( iCode & FLAG_CODEPOINT_SPECIAL )
		if ( bWasEscaped
			|| bDashInside
			|| ( m_iAccum && iSymbol=='$' && !IsBoundary ( uNextByte, m_bPhrase ) )
			|| ( m_bPhrase && iSymbol!='"' && !sphIsModifier ( iSymbol ) ) )
	{
		// the special gets demoted here; what it becomes depends on its other flags
		if ( iCode & FLAG_CODEPOINT_DUAL )
			iCode &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
		else if ( bDashInside && ( iCode & FLAG_CODEPOINT_SYNONYM ) )
			// if we return zero here, we will break the tokens like 'Ms-Dos'
			iCode &= ~( FLAG_CODEPOINT_SPECIAL );
		else
			iCode = 0;
	}

	// if we didn't remove special by now, it must win
	if ( iCode & FLAG_CODEPOINT_DUAL )
	{
		assert ( iCode & FLAG_CODEPOINT_SPECIAL );
		iCode = iSymbol | FLAG_CODEPOINT_SPECIAL;
	}

	// ideally, all conflicts must be resolved here
	// well, at least most
	assert ( sphBitCount ( iCode & MASK_FLAGS )<=1
		|| ( iCode & FLAG_CODEPOINT_SYNONYM ) );
	return iCode;
}
3731 
3732 
/// Outcome of comparing input bytes against a synonym candidate, see SynCheckPrefix()
enum SynCheck_e
{
	SYNCHECK_LESS,		///< mismatch; candidate byte is less than the input byte
	SYNCHECK_PARTIAL,	///< all the compared bytes match, but the candidate does not end here
	SYNCHECK_EXACT,		///< candidate matched up to its end
	SYNCHECK_GREATER	///< mismatch; candidate byte is greater than the input byte
};
3740 
3741 
SynCheckPrefix(const CSphSynonym & tCandidate,int iOff,const BYTE * sCur,int iBytes,bool bMaybeSeparator)3742 static inline SynCheck_e SynCheckPrefix ( const CSphSynonym & tCandidate, int iOff, const BYTE * sCur, int iBytes, bool bMaybeSeparator )
3743 {
3744 	const BYTE * sCand = ( (const BYTE*)tCandidate.m_sFrom.cstr() ) + iOff;
3745 
3746 	while ( iBytes-->0 )
3747 	{
3748 		if ( *sCand!=*sCur )
3749 		{
3750 			// incoming synonym-only char vs. ending sequence (eg. 2nd slash in "OS/2/3"); we actually have a match
3751 			if ( bMaybeSeparator && sCand[0]==MAGIC_SYNONYM_WHITESPACE && sCand[1]=='\0' )
3752 				return SYNCHECK_EXACT;
3753 
3754 			// otherwise, it is a mismatch
3755 			return ( *sCand<*sCur ) ? SYNCHECK_LESS : SYNCHECK_GREATER;
3756 		}
3757 		sCand++;
3758 		sCur++;
3759 	}
3760 
3761 	// full match after a full separator
3762 	if ( sCand[0]=='\0' )
3763 		return SYNCHECK_EXACT;
3764 
3765 	// full match after my last synonym-only char
3766 	if ( bMaybeSeparator && sCand[0]==MAGIC_SYNONYM_WHITESPACE && sCand[1]=='\0' )
3767 		return SYNCHECK_EXACT;
3768 
3769 	// otherwise, partial match so far
3770 	return SYNCHECK_PARTIAL;
3771 }
3772 
3773 
IsSeparator(int iFolded,bool bFirst)3774 static inline bool IsSeparator ( int iFolded, bool bFirst )
3775 {
3776 	// eternal separator
3777 	if ( iFolded<0 || ( iFolded & MASK_CODEPOINT )==0 )
3778 		return true;
3779 
3780 	// just a codepoint
3781 	if (!( iFolded & MASK_FLAGS ))
3782 		return false;
3783 
3784 	// any magic flag, besides dual
3785 	if (!( iFolded & FLAG_CODEPOINT_DUAL ))
3786 		return true;
3787 
3788 	// FIXME? n-grams currently also set dual
3789 	if ( iFolded & FLAG_CODEPOINT_NGRAM )
3790 		return true;
3791 
3792 	// dual depends on position
3793 	return bFirst;
3794 }
3795 
3796 // handles escaped specials that are not in the character set
3797 // returns true if the codepoint should be processed as a simple codepoint,
3798 // returns false if it should be processed as a whitespace
3799 // for example: aaa\!bbb => aaa bbb
Special2Simple(int & iCodepoint)3800 static inline bool Special2Simple ( int & iCodepoint )
3801 {
3802 	if ( ( iCodepoint & FLAG_CODEPOINT_DUAL ) || !( iCodepoint & FLAG_CODEPOINT_SPECIAL ) )
3803 	{
3804 		iCodepoint &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
3805 		return true;
3806 	}
3807 
3808 	return false;
3809 }
3810 
3811 template < bool IS_UTF8 >
GetTokenSyn()3812 BYTE * CSphTokenizerTraits<IS_UTF8>::GetTokenSyn ()
3813 {
3814 	assert ( m_dSynonyms.GetLength() );
3815 
3816 	bool bEscaped = m_bEscaped;
3817 	m_bWasSynonym = false;
3818 	BYTE * pCur;
3819 
3820 	m_bTokenBoundary = false;
3821 	for ( ;; )
3822 	{
3823 		// initialize accumulators and range
3824 		BYTE * pFirstSeparator = NULL;
3825 
3826 		m_iAccum = 0;
3827 		m_pAccum = m_sAccum;
3828 
3829 		int iSynStart = 0;
3830 		int iSynEnd = m_dSynonyms.GetLength()-1;
3831 		int iSynOff = 0;
3832 
3833 		int iLastCodepoint = 0;
3834 		int iLastFolded = 0;
3835 		BYTE * pRescan = NULL;
3836 
3837 		int iExact = -1;
3838 		BYTE * pExact = NULL;
3839 
3840 		// main refinement loop
3841 		for ( ;; )
3842 		{
3843 			// store current position (to be able to restart from it on folded boundary)
3844 			pCur = m_pCur;
3845 
3846 			// get next codepoint
3847 			int iCode = GetCodepoint();
3848 
3849 			// handle early-out
3850 			if ( iCode<0 )
3851 			{
3852 				// eof at token start? we're done
3853 				if ( iSynOff==0 )
3854 					return NULL;
3855 
3856 				// eof after whitespace? we already checked the candidate last time, so break
3857 				if ( iLastFolded==0 )
3858 					break;
3859 			}
3860 
3861 			// fold codepoint (and lookup flags!)
3862 			int iFolded = m_tLC.ToLower ( iCode );
3863 
3864 			// handle boundaries
3865 			if ( m_bBoundary && ( iFolded==0 ) ) m_bTokenBoundary = true;
3866 			m_bBoundary = ( iFolded & FLAG_CODEPOINT_BOUNDARY )!=0;
3867 
3868 			// skip continuous whitespace
3869 			if ( iLastFolded==0 && iFolded==0 )
3870 				continue;
3871 
3872 			if ( bEscaped )
3873 			{
3874 				if ( iCode=='\\' && iLastCodepoint!='\\' )
3875 				{
3876 					iLastCodepoint = iCode;
3877 					continue;
3878 				} else if ( iLastCodepoint=='\\' && ( iFolded & FLAG_CODEPOINT_SYNONYM ) && ( iFolded & FLAG_CODEPOINT_SPECIAL ) )
3879 				{
3880 					iFolded &= ~FLAG_CODEPOINT_SPECIAL;
3881 
3882 				} else if ( iLastCodepoint=='\\' && !Special2Simple ( iFolded ) )
3883 				{
3884 					iLastCodepoint = 0;
3885 					continue;
3886 				}
3887 
3888 				iLastCodepoint = iCode;
3889 			}
3890 
3891 			iFolded = CodepointArbitration ( iFolded, false, *m_pCur );
3892 
3893 			iLastFolded = iFolded;
3894 
3895 			if ( m_iAccum==0 )
3896 				m_pTokenStart = pCur;
3897 
3898 			// handle specials at the very word start
3899 			if ( ( iFolded & FLAG_CODEPOINT_SPECIAL ) && m_iAccum==0 )
3900 			{
3901 				m_bWasSpecial = !( iFolded & FLAG_CODEPOINT_NGRAM );
3902 
3903 				AccumCodepoint ( iFolded & MASK_CODEPOINT );
3904 				*m_pAccum = '\0';
3905 
3906 				m_iLastTokenLen = 1;
3907 				m_pTokenStart = pCur;
3908 				m_pTokenEnd = m_pCur;
3909 				return m_sAccum;
3910 			}
3911 
3912 			// handle specials
3913 			bool bJustSpecial = ( iFolded & FLAG_CODEPOINT_SPECIAL )
3914 				&& !( iFolded & FLAG_CODEPOINT_DUAL ) // OPTIMIZE?
3915 				&& !( iFolded & FLAG_CODEPOINT_SYNONYM ); // OPTIMIZE?
3916 
3917 			// if candidate starts with something special, and turns out to be not a synonym,
3918 			// we will need to rescan from current position later
3919 			if ( iSynOff==0 )
3920 				pRescan = IsSeparator ( iFolded, true ) ? m_pCur : NULL;
3921 
3922 			// accumulate folded token
3923 			if ( !pFirstSeparator )
3924 			{
3925 				if ( IsSeparator ( iFolded, m_iAccum==0 ) )
3926 				{
3927 					if ( m_iAccum )
3928 						pFirstSeparator = pCur;
3929 				} else
3930 				{
3931 					if ( m_iAccum==0 )
3932 						m_pTokenStart = pCur;
3933 
3934 					AccumCodepoint ( iFolded & MASK_CODEPOINT );
3935 				}
3936 			}
3937 
3938 			// accumulate next raw synonym symbol to refine
3939 			// note that we need a special check for whitespace here, to avoid "MS*DOS" being treated as "MS DOS" synonym
3940 			BYTE sTest[4];
3941 			int iTest;
3942 
3943 			int iMasked = ( iCode & MASK_CODEPOINT );
3944 			if ( iFolded<=0 || bJustSpecial )
3945 			{
3946 				sTest[0] = MAGIC_SYNONYM_WHITESPACE;
3947 				iTest = 1;
3948 
3949 				if (!( iMasked==' ' || iMasked=='\t' ))
3950 				{
3951 					sTest[1] = '\0';
3952 					iTest = 2;
3953 				}
3954 			} else
3955 			{
3956 				if ( IsUtf8() )
3957 				{
3958 					iTest = sphUTF8Encode ( sTest, iMasked );
3959 				} else
3960 				{
3961 					iTest = 1;
3962 					sTest[0] = BYTE(iMasked);
3963 				}
3964 			}
3965 
3966 			// refine synonyms range
3967 			#define LOC_RETURN_SYNONYM(_idx) \
3968 			{ \
3969 				m_pTokenEnd = m_iAccum ? pCur : m_pCur; \
3970 				if ( bJustSpecial || ( iFolded & FLAG_CODEPOINT_SPECIAL )!=0 ) m_pCur = pCur; \
3971 				strncpy ( (char*)m_sAccum, m_dSynonyms[_idx].m_sTo.cstr(), sizeof(m_sAccum) ); \
3972 				m_iLastTokenLen = m_dSynonyms[_idx].m_iToLen; \
3973 				m_bWasSynonym = true; \
3974 				return m_sAccum; \
3975 			}
3976 
3977 			#define LOC_REFINE_BREAK() \
3978 			{ \
3979 				if ( iExact>=0 ) { m_pCur = pCur = pExact; LOC_RETURN_SYNONYM ( iExact ); } \
3980 				break; \
3981 			}
3982 
3983 			// if this is the first symbol, use prebuilt lookup table to speedup initial range search
3984 			if ( iSynOff==0 )
3985 			{
3986 				iSynStart = m_dSynStart[sTest[0]];
3987 				iSynEnd = m_dSynEnd[sTest[0]];
3988 				if ( iSynStart>iSynEnd )
3989 					break;
3990 			}
3991 
3992 			// this is to catch intermediate separators (eg. "OS/2/3")
3993 			bool bMaybeSeparator = ( iFolded & FLAG_CODEPOINT_SYNONYM )!=0 || ( iFolded<0 );
3994 
3995 			SynCheck_e eStart = SynCheckPrefix ( m_dSynonyms[iSynStart], iSynOff, sTest, iTest, bMaybeSeparator );
3996 			if ( eStart==SYNCHECK_EXACT )
3997 			{
3998 				if ( iSynStart==iSynEnd ) LOC_RETURN_SYNONYM ( iSynStart );
3999 				iExact = iSynStart;
4000 				pExact = pCur;
4001 			}
4002 			if ( eStart==SYNCHECK_GREATER || ( iSynStart==iSynEnd && eStart!=SYNCHECK_PARTIAL ) )
4003 				LOC_REFINE_BREAK();
4004 
4005 			SynCheck_e eEnd = SynCheckPrefix ( m_dSynonyms[iSynEnd], iSynOff, sTest, iTest, bMaybeSeparator );
4006 			if ( eEnd==SYNCHECK_LESS )
4007 				LOC_REFINE_BREAK();
4008 			if ( eEnd==SYNCHECK_EXACT )
4009 			{
4010 				iExact = iSynEnd;
4011 				pExact = pCur;
4012 			}
4013 
4014 
4015 			// refine left boundary
4016 			if ( eStart!=SYNCHECK_PARTIAL && eStart!=SYNCHECK_EXACT )
4017 			{
4018 				assert ( eStart==SYNCHECK_LESS );
4019 
4020 				int iL = iSynStart;
4021 				int iR = iSynEnd;
4022 				SynCheck_e eL = eStart;
4023 				SynCheck_e eR = eEnd;
4024 
4025 				while ( iR-iL>1 )
4026 				{
4027 					int iM = iL + (iR-iL)/2;
4028 					SynCheck_e eMid = SynCheckPrefix ( m_dSynonyms[iM], iSynOff, sTest, iTest, bMaybeSeparator );
4029 
4030 					if ( eMid==SYNCHECK_LESS )
4031 					{
4032 						iL = iM;
4033 						eL = eMid;
4034 					} else
4035 					{
4036 						iR = iM;
4037 						eR = eMid;
4038 					}
4039 				}
4040 
4041 				assert ( eL==SYNCHECK_LESS );
4042 				assert ( eR!=SYNCHECK_LESS );
4043 				assert ( iR-iL==1 );
4044 
4045 				if ( eR==SYNCHECK_GREATER )					LOC_REFINE_BREAK();
4046 				if ( eR==SYNCHECK_EXACT && iR==iSynEnd )	LOC_RETURN_SYNONYM ( iR );
4047 
4048 				assert ( eR==SYNCHECK_PARTIAL || eR==SYNCHECK_EXACT );
4049 				iSynStart = iR;
4050 				eStart = eR;
4051 			}
4052 
4053 			// refine right boundary
4054 			if ( eEnd!=SYNCHECK_PARTIAL && eEnd!=SYNCHECK_EXACT )
4055 			{
4056 				assert ( eEnd==SYNCHECK_GREATER );
4057 
4058 				int iL = iSynStart;
4059 				int iR = iSynEnd;
4060 				SynCheck_e eL = eStart;
4061 				SynCheck_e eR = eEnd;
4062 
4063 				while ( iR-iL>1 )
4064 				{
4065 					int iM = iL + (iR-iL)/2;
4066 					SynCheck_e eMid = SynCheckPrefix ( m_dSynonyms[iM], iSynOff, sTest, iTest, bMaybeSeparator );
4067 
4068 					if ( eMid==SYNCHECK_GREATER )
4069 					{
4070 						iR = iM;
4071 						eR = eMid;
4072 					} else
4073 					{
4074 						iL = iM;
4075 						eL = eMid;
4076 					}
4077 				}
4078 
4079 				assert ( eR==SYNCHECK_GREATER );
4080 				assert ( eL!=SYNCHECK_GREATER );
4081 				assert ( iR-iL==1 );
4082 
4083 				if ( eL==SYNCHECK_LESS )					LOC_REFINE_BREAK();
4084 				if ( eL==SYNCHECK_EXACT && iL==iSynStart )	LOC_RETURN_SYNONYM ( iL );
4085 
4086 				assert ( eL==SYNCHECK_PARTIAL || eL==SYNCHECK_EXACT );
4087 				iSynEnd = iL;
4088 				eEnd = eL;
4089 			}
4090 
4091 			// handle eof
4092 			if ( iCode<0 )
4093 				break;
4094 
4095 			// we still have a partial synonym match, continue;
4096 			iSynOff += iTest;
4097 		}
4098 
4099 		// at this point, that was not a synonym
4100 		if ( pRescan )
4101 		{
4102 			m_pCur = pRescan;
4103 			continue;
4104 		}
4105 
4106 		// at this point, it also started with a valid char
4107 		assert ( m_iAccum>0 );
4108 
4109 		// find the proper separator
4110 		if ( !pFirstSeparator )
4111 		{
4112 			int iLast = 0;
4113 
4114 			// if there was none, scan until found
4115 			for ( ;; )
4116 			{
4117 				pCur = m_pCur;
4118 				int iCode = *pCur;
4119 				int iFolded = m_tLC.ToLower ( GetCodepoint() );
4120 				if ( iFolded<0 )
4121 					break; // eof
4122 
4123 				if ( bEscaped )
4124 				{
4125 					if ( iCode=='\\' && iLast!='\\' )
4126 					{
4127 						iLast = iCode;
4128 						continue;
4129 					}
4130 
4131 					if ( iLast=='\\' && !Special2Simple ( iFolded ) )
4132 						break;
4133 
4134 					iLast = iCode;
4135 				}
4136 
4137 				iFolded = CodepointArbitration ( iFolded, false, *m_pCur );
4138 
4139 				if ( IsSeparator ( iFolded, false ) )
4140 				{
4141 					if ( iFolded!=0 )
4142 						m_pCur = pCur; // force rescan
4143 					break;
4144 				}
4145 
4146 				AccumCodepoint ( iFolded & MASK_CODEPOINT );
4147 			}
4148 		} else
4149 		{
4150 			// if there was, token is ready but we should restart from that separator
4151 			m_pCur = pFirstSeparator;
4152 			pCur = m_pCur;
4153 		}
4154 
4155 		// return accumulated token
4156 		if ( m_iAccum<m_tSettings.m_iMinWordLen )
4157 		{
4158 			if ( m_bShortTokenFilter )
4159 			{
4160 				*m_pAccum = '\0';
4161 
4162 				if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
4163 				{
4164 					m_iLastTokenLen = m_iAccum;
4165 					m_pTokenEnd = pCur;
4166 					m_iAccum = 0;
4167 					return m_sAccum;
4168 				}
4169 			}
4170 
4171 			if ( m_iAccum )
4172 				m_iOvershortCount++;
4173 
4174 			m_iAccum = 0;
4175 			continue;
4176 		}
4177 
4178 		*m_pAccum = '\0';
4179 		m_iLastTokenLen = m_iAccum;
4180 		m_pTokenEnd = pCur;
4181 		return m_sAccum;
4182 	}
4183 }
4184 
RemapCharacters(const char * sConfig,DWORD uFlags,const char * sSource,bool bCanRemap,CSphString & sError)4185 bool ISphTokenizer::RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError )
4186 {
4187 	// parse
4188 	CSphVector<CSphRemapRange> dRemaps;
4189 	CSphCharsetDefinitionParser tParser;
4190 	if ( !tParser.Parse ( sConfig, dRemaps ) )
4191 	{
4192 		sError = tParser.GetLastError();
4193 		return false;
4194 	}
4195 
4196 	// check
4197 	ARRAY_FOREACH ( i, dRemaps )
4198 	{
4199 		const CSphRemapRange & r = dRemaps[i];
4200 
4201 		if ( !bCanRemap && r.m_iStart!=r.m_iRemapStart )
4202 		{
4203 			sError.SetSprintf ( "%s characters must not be remapped (map-from=U+%x, map-to=U+%x)",
4204 				sSource, r.m_iStart, r.m_iRemapStart );
4205 			return false;
4206 		}
4207 
4208 		for ( int j=r.m_iStart; j<=r.m_iEnd; j++ )
4209 			if ( m_tLC.ToLower(j) )
4210 		{
4211 			sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
4212 			return false;
4213 		}
4214 
4215 		if ( bCanRemap )
4216 			for ( int j=r.m_iRemapStart; j<=r.m_iRemapStart + r.m_iEnd - r.m_iStart; j++ )
4217 				if ( m_tLC.ToLower(j) )
4218 		{
4219 			sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
4220 			return false;
4221 		}
4222 	}
4223 
4224 	// add mapping
4225 	m_tLC.AddRemaps ( dRemaps, uFlags );
4226 	return true;
4227 }
4228 
SetBoundary(const char * sConfig,CSphString & sError)4229 bool ISphTokenizer::SetBoundary ( const char * sConfig, CSphString & sError )
4230 {
4231 	return RemapCharacters ( sConfig, FLAG_CODEPOINT_BOUNDARY, "phrase boundary", false, sError );
4232 }
4233 
SetIgnoreChars(const char * sConfig,CSphString & sError)4234 bool ISphTokenizer::SetIgnoreChars ( const char * sConfig, CSphString & sError )
4235 {
4236 	return RemapCharacters ( sConfig, FLAG_CODEPOINT_IGNORE, "ignored", false, sError );
4237 }
4238 
SetBlendChars(const char * sConfig,CSphString & sError)4239 bool ISphTokenizer::SetBlendChars ( const char * sConfig, CSphString & sError )
4240 {
4241 	return RemapCharacters ( sConfig, FLAG_CODEPOINT_BLEND, "blend", true, sError );
4242 }
4243 
4244 
// Checks whether the first iCheck bytes of sCheck are exactly the string sRef.
// Note that, despite the name, this returns true on a match (unlike strncmp).
// sCheck does not have to be zero-terminated at iCheck; sRef must be.
static bool sphStrncmp ( const char * sCheck, int iCheck, const char * sRef )
{
	// lengths must agree before the bytes are worth comparing
	const int iRefLen = (int)strlen ( sRef );
	if ( iCheck!=iRefLen )
		return false;

	return memcmp ( sCheck, sRef, iCheck )==0;
}
4249 
4250 
SetBlendMode(const char * sMode,CSphString & sError)4251 bool ISphTokenizer::SetBlendMode ( const char * sMode, CSphString & sError )
4252 {
4253 	if ( !sMode || !*sMode )
4254 	{
4255 		m_uBlendVariants = BLEND_TRIM_NONE;
4256 		m_bBlendSkipPure = false;
4257 		return true;
4258 	}
4259 
4260 	m_uBlendVariants = 0;
4261 	const char * p = sMode;
4262 	while ( *p )
4263 	{
4264 		while ( !sphIsAlpha(*p) )
4265 			p++;
4266 		if ( !*p )
4267 			break;
4268 
4269 		const char * sTok = p;
4270 		while ( sphIsAlpha(*p) )
4271 			p++;
4272 		if ( sphStrncmp ( sTok, p-sTok, "trim_none" ) )
4273 			m_uBlendVariants |= BLEND_TRIM_NONE;
4274 		else if ( sphStrncmp ( sTok, p-sTok, "trim_head" ) )
4275 			m_uBlendVariants |= BLEND_TRIM_HEAD;
4276 		else if ( sphStrncmp ( sTok, p-sTok, "trim_tail" ) )
4277 			m_uBlendVariants |= BLEND_TRIM_TAIL;
4278 		else if ( sphStrncmp ( sTok, p-sTok, "trim_both" ) )
4279 			m_uBlendVariants |= BLEND_TRIM_BOTH;
4280 		else if ( sphStrncmp ( sTok, p-sTok, "skip_pure" ) )
4281 			m_bBlendSkipPure = true;
4282 		else
4283 		{
4284 			sError.SetSprintf ( "unknown blend_mode option near '%s'", sTok );
4285 			return false;
4286 		}
4287 	}
4288 
4289 	if ( !m_uBlendVariants )
4290 	{
4291 		sError.SetSprintf ( "blend_mode must define at least one variant to index" );
4292 		m_uBlendVariants = BLEND_TRIM_NONE;
4293 		m_bBlendSkipPure = false;
4294 		return false;
4295 	}
4296 	return true;
4297 }
4298 
4299 /////////////////////////////////////////////////////////////////////////////
4300 
CSphTokenizer_SBCS()4301 CSphTokenizer_SBCS::CSphTokenizer_SBCS ()
4302 {
4303 	CSphString sTmp;
4304 	SetCaseFolding ( SPHINX_DEFAULT_SBCS_TABLE, sTmp );
4305 }
4306 
4307 
SetBuffer(BYTE * sBuffer,int iLength)4308 void CSphTokenizer_SBCS::SetBuffer ( BYTE * sBuffer, int iLength )
4309 {
4310 	// check that old one is over and that new length is sane
4311 	assert ( iLength>=0 );
4312 
4313 	// set buffer
4314 	m_pBuffer = sBuffer;
4315 	m_pBufferMax = sBuffer + iLength;
4316 	m_pCur = sBuffer;
4317 	m_pTokenStart = m_pTokenEnd = NULL;
4318 	m_pBlendStart = m_pBlendEnd = NULL;
4319 
4320 	m_iOvershortCount = 0;
4321 	m_bBoundary = m_bTokenBoundary = false;
4322 }
4323 
4324 
GetToken()4325 BYTE * CSphTokenizer_SBCS::GetToken ()
4326 {
4327 	m_bWasSpecial = false;
4328 	m_bBlended = false;
4329 	m_iOvershortCount = 0;
4330 	m_bTokenBoundary = false;
4331 
4332 	if ( m_dSynonyms.GetLength() )
4333 		return GetTokenSyn ();
4334 
4335 	// return pending blending variants
4336 	BYTE * pVar = GetBlendedVariant ();
4337 	if ( pVar )
4338 		return pVar;
4339 	m_bBlendedPart = ( m_pBlendEnd!=NULL );
4340 
4341 	const bool bUseEscape = m_bEscaped;
4342 
4343 	for ( ;; )
4344 	{
4345 		// memorize buffer start
4346 		BYTE * pCur = m_pCur;
4347 
4348 		// get next codepoint, real or virtual
4349 		int iCodepoint = 0;
4350 		int iCode = 0;
4351 
4352 		bool bWasEscaped = false; // whether current char was escaped
4353 		if ( m_pCur<m_pBufferMax )
4354 		{
4355 			// get next codepoint
4356 			iCodepoint = *m_pCur++;
4357 			iCode = m_tLC.ToLower ( iCodepoint );
4358 
4359 			// handle escaping
4360 			if ( bUseEscape && iCodepoint=='\\' )
4361 			{
4362 				if ( m_pCur<m_pBufferMax )
4363 				{
4364 					// fetch, fold, and then forcibly demote special
4365 					iCodepoint = *m_pCur++;
4366 					iCode = m_tLC.ToLower ( iCodepoint );
4367 					if ( !Special2Simple ( iCode ) )
4368 						iCode = 0;
4369 					bWasEscaped = true;
4370 
4371 				} else
4372 				{
4373 					// stray slash on a buffer end
4374 					// handle it as a separator
4375 					iCode = 0;
4376 				}
4377 			}
4378 
4379 		} else
4380 		{
4381 			// out of buffer
4382 			// but still need to handle short tokens
4383 			if ( m_iAccum<m_tSettings.m_iMinWordLen )
4384 			{
4385 				bool bShortToken = false;
4386 				if ( m_bShortTokenFilter )
4387 				{
4388 					m_sAccum[m_iAccum] = '\0';
4389 					if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
4390 						bShortToken = true;
4391 				}
4392 
4393 				if ( !bShortToken )
4394 				{
4395 					if ( m_iAccum )
4396 						m_iOvershortCount++;
4397 					m_iAccum = 0;
4398 					m_iLastTokenLen = 0;
4399 					BlendAdjust ( pCur );
4400 					return NULL;
4401 				}
4402 			}
4403 		}
4404 
4405 		iCode = CodepointArbitration ( iCode, bWasEscaped, *m_pCur );
4406 
4407 		// handle ignored chars
4408 		if ( iCode & FLAG_CODEPOINT_IGNORE )
4409 			continue;
4410 
4411 		// handle blended characters
4412 		if ( iCode & FLAG_CODEPOINT_BLEND )
4413 		{
4414 			if ( m_pBlendEnd )
4415 				iCode = 0;
4416 			else
4417 			{
4418 				m_bBlended = true;
4419 				m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
4420 			}
4421 		}
4422 
4423 		// handle whitespace and boundary
4424 		if ( m_bBoundary && ( iCode==0 ) )
4425 		{
4426 			m_bTokenBoundary = true;
4427 			m_iBoundaryOffset = pCur - m_pBuffer - 1;
4428 		}
4429 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
4430 		if ( iCode==0 || m_bBoundary )
4431 		{
4432 			if ( m_iAccum<m_tSettings.m_iMinWordLen )
4433 			{
4434 				bool bShortToken = false;
4435 				if ( m_bShortTokenFilter )
4436 				{
4437 					m_sAccum[m_iAccum] = '\0';
4438 					if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
4439 						bShortToken = true;
4440 				}
4441 
4442 				if ( !bShortToken )
4443 				{
4444 					if ( m_iAccum )
4445 						m_iOvershortCount++;
4446 
4447 					m_iAccum = 0;
4448 					BlendAdjust ( pCur );
4449 					continue;
4450 				}
4451 			}
4452 
4453 			m_iLastTokenLen = m_iAccum;
4454 			m_sAccum[m_iAccum] = '\0';
4455 			m_iAccum = 0;
4456 			m_pTokenEnd = pCur>=m_pBufferMax ? m_pCur : pCur;
4457 			if ( !BlendAdjust ( pCur ) )
4458 				continue;
4459 			if ( m_bBlended )
4460 				return GetBlendedVariant();
4461 			return m_sAccum;
4462 		}
4463 
4464 		// handle specials
4465 		bool bSpecial = ( iCode & FLAG_CODEPOINT_SPECIAL )!=0;
4466 		bool bNoBlend = !( iCode & FLAG_CODEPOINT_BLEND );
4467 		iCode &= MASK_CODEPOINT;
4468 		if ( bSpecial )
4469 		{
4470 			// skip short words
4471 			if ( m_iAccum<m_tSettings.m_iMinWordLen )
4472 			{
4473 				if ( m_iAccum )
4474 					m_iOvershortCount++;
4475 
4476 				bool bShortToken = false;
4477 				if ( m_bShortTokenFilter )
4478 				{
4479 					m_sAccum[m_iAccum] = '\0';
4480 					if ( ShortTokenFilter ( m_sAccum, m_iAccum ) )
4481 						bShortToken = true;
4482 				}
4483 
4484 				if ( !bShortToken )
4485 				{
4486 					if ( m_iAccum )
4487 						m_iOvershortCount++;
4488 
4489 					m_iAccum = 0;
4490 				}
4491 			}
4492 
4493 			m_pTokenEnd = m_pCur;
4494 
4495 			if ( m_iAccum==0 )
4496 			{
4497 				// nice standalone special
4498 				m_iLastTokenLen = 1;
4499 				m_sAccum[0] = (BYTE)iCode;
4500 				m_sAccum[1] = '\0';
4501 				m_pTokenStart = pCur;
4502 				m_bWasSpecial = true;
4503 
4504 			} else
4505 			{
4506 				// flush prev accum and redo this special
4507 				m_iLastTokenLen = m_iAccum;
4508 				m_sAccum[m_iAccum] = '\0';
4509 				m_pCur--;
4510 				m_pTokenEnd--;
4511 			}
4512 
4513 			m_iAccum = 0;
4514 			if ( !BlendAdjust ( pCur ) )
4515 				continue;
4516 			if ( m_bBlended )
4517 				return GetBlendedVariant();
4518 			return m_sAccum;
4519 		}
4520 
4521 		// just accumulate
4522 		assert ( iCode>0 );
4523 		if ( m_iAccum<SPH_MAX_WORD_LEN )
4524 		{
4525 			if ( m_iAccum==0 )
4526 				m_pTokenStart = pCur;
4527 
4528 			// tricky bit
4529 			// heading modifiers must not (!) affected blended status
4530 			// eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
4531 			if (!( m_bQueryMode && !m_iAccum && sphIsModifier(iCode) ) )
4532 				m_bNonBlended = m_bNonBlended || bNoBlend;
4533 			m_sAccum[m_iAccum++] = (BYTE)iCode;
4534 		}
4535 	}
4536 }
4537 
4538 
Clone(bool bEscaped) const4539 ISphTokenizer * CSphTokenizer_SBCS::Clone ( bool bEscaped ) const
4540 {
4541 	CSphTokenizer_SBCS * pClone = new CSphTokenizer_SBCS ();
4542 	pClone->CloneBase ( this, bEscaped );
4543 	return pClone;
4544 }
4545 
4546 /////////////////////////////////////////////////////////////////////////////
4547 
CSphTokenizer_UTF8()4548 CSphTokenizer_UTF8::CSphTokenizer_UTF8 ()
4549 {
4550 	CSphString sTmp;
4551 	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
4552 }
4553 
4554 
SetBuffer(BYTE * sBuffer,int iLength)4555 void CSphTokenizer_UTF8::SetBuffer ( BYTE * sBuffer, int iLength )
4556 {
4557 	// check that old one is over and that new length is sane
4558 	assert ( iLength>=0 );
4559 
4560 	// set buffer
4561 	m_pBuffer = sBuffer;
4562 	m_pBufferMax = sBuffer + iLength;
4563 	m_pCur = sBuffer;
4564 	m_pTokenStart = m_pTokenEnd = NULL;
4565 	m_pBlendStart = m_pBlendEnd = NULL;
4566 
4567 	// fixup embedded zeroes with spaces
4568 	for ( BYTE * p = m_pBuffer; p < m_pBufferMax; p++ )
4569 		if ( !*p )
4570 			*p = ' ';
4571 
4572 	m_iOvershortCount = 0;
4573 	m_bBoundary = m_bTokenBoundary = false;
4574 }
4575 
4576 
GetToken()4577 BYTE * CSphTokenizer_UTF8::GetToken ()
4578 {
4579 	m_bWasSpecial = false;
4580 	m_bBlended = false;
4581 	m_iOvershortCount = 0;
4582 	m_bTokenBoundary = false;
4583 
4584 	if ( m_dSynonyms.GetLength() )
4585 		return GetTokenSyn ();
4586 
4587 	// return pending blending variants
4588 	BYTE * pVar = GetBlendedVariant ();
4589 	if ( pVar )
4590 		return pVar;
4591 	m_bBlendedPart = ( m_pBlendEnd!=NULL );
4592 
4593 	// whether this tokenizer supports escaping
4594 	const bool bUseEscape = m_bEscaped;
4595 
4596 	// in query mode, lets capture (soft-whitespace hard-whitespace) sequences and adjust overshort counter
4597 	// sample queries would be (one NEAR $$$) or (one | $$$ two) where $ is not a valid character
4598 	bool bGotNonToken = ( !m_bQueryMode || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
4599 	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh
4600 
4601 	for ( ;; )
4602 	{
4603 		// get next codepoint
4604 		BYTE * pCur = m_pCur; // to redo special char, if there's a token already
4605 		int iCodePoint = GetCodepoint(); // advances m_pCur
4606 		int iCode = m_tLC.ToLower ( iCodePoint );
4607 
4608 		// handle escaping
4609 		bool bWasEscaped = ( bUseEscape && iCodePoint=='\\' ); // whether current codepoint was escaped
4610 		if ( bWasEscaped )
4611 		{
4612 			iCodePoint = GetCodepoint();
4613 			iCode = m_tLC.ToLower ( iCodePoint );
4614 			if ( !Special2Simple ( iCode ) )
4615 				iCode = 0;
4616 		}
4617 
4618 		// handle eof
4619 		if ( iCode<0 )
4620 		{
4621 			// skip trailing short word
4622 			FlushAccum ();
4623 			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen )
4624 			{
4625 				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) )
4626 				{
4627 					if ( m_iLastTokenLen )
4628 						m_iOvershortCount++;
4629 					m_iLastTokenLen = 0;
4630 					BlendAdjust ( pCur );
4631 					return NULL;
4632 				}
4633 			}
4634 
4635 			// return trailing word
4636 			if ( !BlendAdjust ( pCur ) )
4637 				return NULL;
4638 			m_pTokenEnd = m_pCur;
4639 			if ( m_bBlended )
4640 				return GetBlendedVariant();
4641 			return m_sAccum;
4642 		}
4643 
4644 		// handle all the flags..
4645 		iCode = CodepointArbitration ( iCode, bWasEscaped, *m_pCur );
4646 
4647 		// handle ignored chars
4648 		if ( iCode & FLAG_CODEPOINT_IGNORE )
4649 			continue;
4650 
4651 		// handle blended characters
4652 		if ( iCode & FLAG_CODEPOINT_BLEND )
4653 		{
4654 			if ( m_pBlendEnd )
4655 				iCode = 0;
4656 			else
4657 			{
4658 				m_bBlended = true;
4659 				m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
4660 			}
4661 		}
4662 
4663 		// handle soft-whitespace-only tokens
4664 		if ( !bGotNonToken && !m_iAccum )
4665 		{
4666 			if ( !bGotSoft )
4667 			{
4668 				// detect opening soft whitespace
4669 				if ( ( iCode==0 && !( iCode & MASK_FLAGS ) && !IsWhitespace ( iCodePoint ) )
4670 					|| ( ( iCode & FLAG_CODEPOINT_BLEND ) && !m_iAccum ) )
4671 				{
4672 					bGotSoft = true;
4673 				}
4674 			} else
4675 			{
4676 				// detect closing hard whitespace or special
4677 				// (if there was anything meaningful in the meantime, we must never get past the outer if!)
4678 				if ( IsWhitespace ( iCodePoint ) || ( iCode & FLAG_CODEPOINT_SPECIAL ) )
4679 				{
4680 					m_iOvershortCount++;
4681 					bGotNonToken = true;
4682 				}
4683 			}
4684 		}
4685 
4686 		// handle whitespace and boundary
4687 		if ( m_bBoundary && ( iCode==0 ) )
4688 		{
4689 			m_bTokenBoundary = true;
4690 			m_iBoundaryOffset = pCur - m_pBuffer - 1;
4691 		}
4692 		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;
4693 
4694 		if ( iCode==0 || m_bBoundary )
4695 		{
4696 			FlushAccum ();
4697 			if ( !BlendAdjust ( pCur ) )
4698 				continue;
4699 
4700 			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen
4701 				&& !( m_bShortTokenFilter && ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) ) )
4702 			{
4703 				if ( m_iLastTokenLen )
4704 					m_iOvershortCount++;
4705 				continue;
4706 			} else
4707 			{
4708 				m_pTokenEnd = pCur;
4709 				if ( m_bBlended )
4710 					return GetBlendedVariant();
4711 				return m_sAccum;
4712 			}
4713 		}
4714 
4715 		// handle specials
4716 		if ( iCode & FLAG_CODEPOINT_SPECIAL )
4717 		{
4718 			// skip short words preceding specials
4719 			if ( m_iAccum<m_tSettings.m_iMinWordLen )
4720 			{
4721 				m_sAccum[m_iAccum] = '\0';
4722 
4723 				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iAccum ) )
4724 				{
4725 					if ( m_iAccum )
4726 						m_iOvershortCount++;
4727 
4728 					FlushAccum ();
4729 				}
4730 			}
4731 
4732 			if ( m_iAccum==0 )
4733 			{
4734 				m_bNonBlended = m_bNonBlended || ( !( iCode & FLAG_CODEPOINT_BLEND ) && !( iCode & FLAG_CODEPOINT_SPECIAL ) );
4735 				m_bWasSpecial = !( iCode & FLAG_CODEPOINT_NGRAM );
4736 				m_pTokenStart = pCur;
4737 				m_pTokenEnd = m_pCur;
4738 				AccumCodepoint ( iCode & MASK_CODEPOINT ); // handle special as a standalone token
4739 			} else
4740 			{
4741 				m_pCur = pCur; // we need to flush current accum and then redo special char again
4742 				m_pTokenEnd = pCur;
4743 			}
4744 
4745 			FlushAccum ();
4746 			if ( !BlendAdjust ( pCur ) )
4747 				continue;
4748 			if ( m_bBlended )
4749 				return GetBlendedVariant();
4750 			return m_sAccum;
4751 		}
4752 
4753 		if ( m_iAccum==0 )
4754 			m_pTokenStart = pCur;
4755 
4756 		// tricky bit
4757 		// heading modifiers must not (!) affected blended status
4758 		// eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
4759 		if (!( m_bQueryMode && !m_iAccum && sphIsModifier ( iCode & MASK_CODEPOINT ) ) )
4760 			m_bNonBlended = m_bNonBlended || !( iCode & FLAG_CODEPOINT_BLEND );
4761 
4762 		// just accumulate
4763 		AccumCodepoint ( iCode & MASK_CODEPOINT );
4764 	}
4765 }
4766 
4767 
FlushAccum()4768 void CSphTokenizer_UTF8::FlushAccum ()
4769 {
4770 	assert ( m_pAccum-m_sAccum < (int)sizeof(m_sAccum) );
4771 	m_iLastTokenLen = m_iAccum;
4772 	*m_pAccum = 0;
4773 	m_iAccum = 0;
4774 	m_pAccum = m_sAccum;
4775 }
4776 
4777 
Clone(bool bEscaped) const4778 ISphTokenizer * CSphTokenizer_UTF8::Clone ( bool bEscaped ) const
4779 {
4780 	CSphTokenizer_UTF8 * pClone = new CSphTokenizer_UTF8 ();
4781 	pClone->CloneBase ( this, bEscaped );
4782 	return pClone;
4783 }
4784 
4785 
GetCodepointLength(int iCode) const4786 int CSphTokenizer_UTF8::GetCodepointLength ( int iCode ) const
4787 {
4788 	if ( iCode<128 )
4789 		return 1;
4790 
4791 	int iBytes = 0;
4792 	while ( iCode & 0x80 )
4793 	{
4794 		iBytes++;
4795 		iCode <<= 1;
4796 	}
4797 
4798 	assert ( iBytes>=2 && iBytes<=4 );
4799 	return iBytes;
4800 }
4801 
4802 
4803 /////////////////////////////////////////////////////////////////////////////
4804 
SetNgramChars(const char * sConfig,CSphString & sError)4805 bool CSphTokenizer_UTF8Ngram::SetNgramChars ( const char * sConfig, CSphString & sError )
4806 {
4807 	CSphVector<CSphRemapRange> dRemaps;
4808 	CSphCharsetDefinitionParser tParser;
4809 	if ( !tParser.Parse ( sConfig, dRemaps ) )
4810 	{
4811 		sError = tParser.GetLastError();
4812 		return false;
4813 	}
4814 
4815 	m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL ); // !COMMIT support other n-gram lengths than 1
4816 	m_sNgramCharsStr = sConfig;
4817 	return true;
4818 }
4819 
4820 
SetNgramLen(int iLen)4821 void CSphTokenizer_UTF8Ngram::SetNgramLen ( int iLen )
4822 {
4823 	assert ( iLen>0 );
4824 	m_iNgramLen = iLen;
4825 }
4826 
4827 
GetToken()4828 BYTE * CSphTokenizer_UTF8Ngram::GetToken ()
4829 {
4830 	// !COMMIT support other n-gram lengths than 1
4831 	assert ( m_iNgramLen==1 );
4832 	return CSphTokenizer_UTF8::GetToken ();
4833 }
4834 
4835 //////////////////////////////////////////////////////////////////////////
4836 
CSphTokenizer_Filter(ISphTokenizer * pTokenizer,const CSphMultiformContainer * pContainer)4837 CSphTokenizer_Filter::CSphTokenizer_Filter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
4838 	: m_pTokenizer		( pTokenizer )
4839 	, m_pMultiWordforms ( pContainer )
4840 	, m_iStart	( 0 )
4841 	, m_bBuildMultiform	( false )
4842 {
4843 	assert ( pTokenizer && pContainer );
4844 	m_dStoredTokens.Reserve ( pContainer->m_iMaxTokens + 6 ); // max form tokens + some blended tokens
4845 	m_sTokenizedMultiform[0] = '\0';
4846 }
4847 
4848 
~CSphTokenizer_Filter()4849 CSphTokenizer_Filter::~CSphTokenizer_Filter ()
4850 {
4851 	SafeDelete ( m_pTokenizer );
4852 }
4853 
4854 
FillTokenInfo(StoredToken_t & tToken,const BYTE * sToken)4855 void CSphTokenizer_Filter::FillTokenInfo ( StoredToken_t & tToken, const BYTE * sToken )
4856 {
4857 	assert ( sToken );
4858 	strncpy ( (char *)tToken.m_sToken, (const char *)sToken, sizeof(tToken.m_sToken) );
4859 	tToken.m_szTokenStart = m_pTokenizer->GetTokenStart ();
4860 	tToken.m_szTokenEnd = m_pTokenizer->GetTokenEnd ();
4861 	tToken.m_iOvershortCount = m_pTokenizer->GetOvershortCount ();
4862 	tToken.m_iTokenLen = m_pTokenizer->GetLastTokenLen ();
4863 	tToken.m_pBufferPtr = m_pTokenizer->GetBufferPtr ();
4864 	tToken.m_bBoundary = m_pTokenizer->GetBoundary ();
4865 	tToken.m_bSpecial = m_pTokenizer->WasTokenSpecial ();
4866 	tToken.m_bBlended = m_pTokenizer->TokenIsBlended();
4867 	tToken.m_bBlendedPart = m_pTokenizer->TokenIsBlendedPart();
4868 }
4869 
4870 
GetToken()4871 BYTE * CSphTokenizer_Filter::GetToken ()
4872 {
4873 	m_sTokenizedMultiform[0] = '\0';
4874 	m_iStart++;
4875 
4876 	if ( m_iStart>=m_dStoredTokens.GetLength() )
4877 	{
4878 		m_iStart = 0;
4879 		m_dStoredTokens.Resize ( 0 );
4880 		const BYTE * pToken = m_pTokenizer->GetToken();
4881 		if ( !pToken )
4882 			return NULL;
4883 
4884 		FillTokenInfo ( m_dStoredTokens.Add(), pToken );
4885 		while ( m_dStoredTokens.Last().m_bBlended || m_dStoredTokens.Last().m_bBlendedPart )
4886 		{
4887 			pToken = m_pTokenizer->GetToken ();
4888 			if ( !pToken )
4889 				break;
4890 
4891 			FillTokenInfo ( m_dStoredTokens.Add(), pToken );
4892 		}
4893 	}
4894 
4895 	CSphMultiforms ** pWordforms = NULL;
4896 	int iTokensGot = 1;
4897 	bool bBlended = false;
4898 
4899 	// check multi-form
4900 	// only blended parts checked for multi-form with blended
4901 	// in case ALL blended parts got transformed primary blended got replaced by normal form
4902 	// otherwise blended tokens provided as is
4903 	if ( m_dStoredTokens[m_iStart].m_bBlended || m_dStoredTokens[m_iStart].m_bBlendedPart )
4904 	{
4905 		if ( m_dStoredTokens[m_iStart].m_bBlended && m_iStart+1<m_dStoredTokens.GetLength() && m_dStoredTokens[m_iStart+1].m_bBlendedPart )
4906 		{
4907 			pWordforms = m_pMultiWordforms->m_Hash ( (const char *)m_dStoredTokens[m_iStart+1].m_sToken );
4908 			if ( pWordforms )
4909 			{
4910 				bBlended = true;
4911 				for ( int i=m_iStart+2; i<m_dStoredTokens.GetLength(); i++ )
4912 				{
4913 					// break out on blended over or got completely different blended
4914 					if ( m_dStoredTokens[i].m_bBlended || !m_dStoredTokens[i].m_bBlendedPart )
4915 						break;
4916 
4917 					iTokensGot++;
4918 				}
4919 			}
4920 		}
4921 	} else
4922 	{
4923 		pWordforms = m_pMultiWordforms->m_Hash ( (const char *)m_dStoredTokens[m_iStart].m_sToken );
4924 		if ( pWordforms )
4925 		{
4926 			int iTokensNeed = (*pWordforms)->m_iMaxTokens + 1;
4927 			int iCur = m_iStart;
4928 			bool bGotBlended = false;
4929 
4930 			// collect up ahead to multi-form tokens or all blended tokens
4931 			while ( iTokensGot<iTokensNeed || bGotBlended )
4932 			{
4933 				iCur++;
4934 				if ( iCur>=m_dStoredTokens.GetLength() )
4935 				{
4936 					// fetch next token
4937 					const BYTE* pToken = m_pTokenizer->GetToken ();
4938 					if ( !pToken )
4939 						break;
4940 
4941 					FillTokenInfo ( m_dStoredTokens.Add(), pToken );
4942 				}
4943 
4944 				bool bCurBleneded = ( m_dStoredTokens[iCur].m_bBlended || m_dStoredTokens[iCur].m_bBlendedPart );
4945 				if ( bGotBlended && !bCurBleneded )
4946 					break;
4947 
4948 				bGotBlended = bCurBleneded;
4949 				// count only regular tokens; can not fold mixed (regular+blended) tokens to form
4950 				iTokensGot += ( bGotBlended ? 0 : 1 );
4951 			}
4952 		}
4953 	}
4954 
4955 	if ( !pWordforms || iTokensGot<(*pWordforms)->m_iMinTokens+1 )
4956 		return m_dStoredTokens[m_iStart].m_sToken;
4957 
4958 	int iStartToken = m_iStart + ( bBlended ? 1 : 0 );
4959 	ARRAY_FOREACH ( i, (*pWordforms)->m_dWordforms )
4960 	{
4961 		const CSphMultiform * pCurForm = (*pWordforms)->m_dWordforms[i];
4962 		int iFormTokCount = pCurForm->m_dTokens.GetLength();
4963 
4964 		if ( iTokensGot<iFormTokCount+1 || ( bBlended && iTokensGot!=iFormTokCount+1 ) )
4965 			continue;
4966 
4967 		int iForm = 0;
4968 		for ( ; iForm<iFormTokCount; iForm++ )
4969 		{
4970 			const StoredToken_t & tTok = m_dStoredTokens[iStartToken + 1 + iForm];
4971 			const char * szStored = (const char*)tTok.m_sToken;
4972 			const char * szNormal = pCurForm->m_dTokens[iForm].cstr ();
4973 
4974 			if ( *szNormal!=*szStored || strcasecmp ( szNormal, szStored ) )
4975 				break;
4976 		}
4977 
4978 		// early out - no destination form detected
4979 		if ( iForm!=iFormTokCount )
4980 			continue;
4981 
4982 		// tokens after folded form are valid tail that should be processed next time
4983 		if ( m_bBuildMultiform )
4984 		{
4985 			BYTE * pOut = m_sTokenizedMultiform;
4986 			BYTE * pMax = pOut + sizeof(m_sTokenizedMultiform);
4987 			for ( int j=0; j<iFormTokCount+1 && pOut<pMax; j++ )
4988 			{
4989 				const StoredToken_t & tTok = m_dStoredTokens[iStartToken+j];
4990 				const BYTE * sTok = tTok.m_sToken;
4991 				if ( j && pOut<pMax )
4992 					*pOut++ = ' ';
4993 				while ( *sTok && pOut<pMax )
4994 					*pOut++ = *sTok++;
4995 			}
4996 			*pOut = '\0';
4997 			*(pMax-1) = '\0';
4998 		}
4999 
5000 		if ( !bBlended )
5001 		{
5002 			// fold regular tokens to form
5003 			StoredToken_t & tStart = m_dStoredTokens[m_iStart];
5004 			StoredToken_t & tEnd = m_dStoredTokens[m_iStart+iFormTokCount];
5005 			m_iStart += iFormTokCount;
5006 
5007 			strncpy ( (char *)tEnd.m_sToken, pCurForm->m_sNormalForm.cstr(), sizeof(tEnd.m_sToken) );
5008 			tEnd.m_szTokenStart = tStart.m_szTokenStart;
5009 			tEnd.m_pBufferPtr = tStart.m_pBufferPtr;
5010 			tEnd.m_iTokenLen = pCurForm->m_iNormalTokenLen;
5011 
5012 			tEnd.m_bBoundary = false;
5013 			tEnd.m_bSpecial = false;
5014 			tEnd.m_bBlended = false;
5015 			tEnd.m_bBlendedPart = false;
5016 		} else
5017 		{
5018 			// replace blended by form
5019 			StoredToken_t & tDst = m_dStoredTokens[m_iStart];
5020 			strncpy ( (char *)tDst.m_sToken, pCurForm->m_sNormalForm.cstr(), sizeof(tDst.m_sToken) );
5021 			tDst.m_iTokenLen = pCurForm->m_iNormalTokenLen;
5022 		}
5023 	}
5024 
5025 	return m_dStoredTokens[m_iStart].m_sToken;
5026 }
5027 
5028 
Clone(bool bEscaped) const5029 ISphTokenizer * CSphTokenizer_Filter::Clone ( bool bEscaped ) const
5030 {
5031 	ISphTokenizer * pClone = m_pTokenizer->Clone ( bEscaped );
5032 	return CreateTokenFilter ( pClone, m_pMultiWordforms );
5033 }
5034 
5035 
SetBufferPtr(const char * sNewPtr)5036 void CSphTokenizer_Filter::SetBufferPtr ( const char * sNewPtr )
5037 {
5038 	m_iStart = 0;
5039 	m_dStoredTokens.Resize ( 0 );
5040 	m_pTokenizer->SetBufferPtr ( sNewPtr );
5041 }
5042 
SetBuffer(BYTE * sBuffer,int iLength)5043 void CSphTokenizer_Filter::SetBuffer ( BYTE * sBuffer, int iLength )
5044 {
5045 	m_pTokenizer->SetBuffer ( sBuffer, iLength );
5046 	SetBufferPtr ( (const char *)sBuffer );
5047 }
5048 
SkipBlended()5049 int CSphTokenizer_Filter::SkipBlended ()
5050 {
5051 	bool bGotBlended = ( m_iStart<m_dStoredTokens.GetLength() &&
5052 		( m_dStoredTokens[m_iStart].m_bBlended || m_dStoredTokens[m_iStart].m_bBlendedPart ) );
5053 	if ( !bGotBlended )
5054 		return 0;
5055 
5056 	int iWasStart = m_iStart;
5057 	for ( int iTok=m_iStart+1; iTok<m_dStoredTokens.GetLength() && m_dStoredTokens[iTok].m_bBlendedPart && !m_dStoredTokens[iTok].m_bBlended; iTok++ )
5058 		m_iStart = iTok;
5059 
5060 	return ( m_iStart-iWasStart+1 );
5061 }
5062 
5063 
5064 /////////////////////////////////////////////////////////////////////////////
5065 // FILTER
5066 /////////////////////////////////////////////////////////////////////////////
5067 
/// default filter settings: empty attribute name, non-excluding,
/// widest possible integer range, no external values attached
/// NOTE(review): m_eType and the float range members are not set in this list
CSphFilterSettings::CSphFilterSettings ()
	: m_sAttrName	( "" )
	, m_bExclude	( false )
	, m_iMinValue	( LLONG_MIN )
	, m_iMaxValue	( LLONG_MAX )
	, m_pValues		( NULL )
	, m_nValues		( 0 )
{}
5076 
5077 
/// copy constructor; trips an assertion in debug builds,
/// and falls back to plain assignment when asserts are compiled out
CSphFilterSettings::CSphFilterSettings ( const CSphFilterSettings & rhs )
{
	// copying filter settings this way is not supposed to happen
	assert ( 0 );

	// release builds: delegate to operator=
	*this = rhs;
}
5083 
5084 
SetExternalValues(const SphAttr_t * pValues,int nValues)5085 void CSphFilterSettings::SetExternalValues ( const SphAttr_t * pValues, int nValues )
5086 {
5087 	m_pValues = pValues;
5088 	m_nValues = nValues;
5089 }
5090 
5091 
operator ==(const CSphFilterSettings & rhs) const5092 bool CSphFilterSettings::operator == ( const CSphFilterSettings & rhs ) const
5093 {
5094 	// check name, mode, type
5095 	if ( m_sAttrName!=rhs.m_sAttrName || m_bExclude!=rhs.m_bExclude || m_eType!=rhs.m_eType )
5096 		return false;
5097 
5098 	switch ( m_eType )
5099 	{
5100 		case SPH_FILTER_RANGE:
5101 			return m_iMinValue==rhs.m_iMinValue && m_iMaxValue==rhs.m_iMaxValue;
5102 
5103 		case SPH_FILTER_FLOATRANGE:
5104 			return m_fMinValue==rhs.m_fMinValue && m_fMaxValue==rhs.m_fMaxValue;
5105 
5106 		case SPH_FILTER_VALUES:
5107 			if ( m_dValues.GetLength()!=rhs.m_dValues.GetLength() )
5108 				return false;
5109 
5110 			ARRAY_FOREACH ( i, m_dValues )
5111 				if ( m_dValues[i]!=rhs.m_dValues[i] )
5112 					return false;
5113 
5114 			return true;
5115 
5116 		default:
5117 			assert ( 0 && "internal error: unhandled filter type in comparison" );
5118 			return false;
5119 	}
5120 }
5121 
5122 /////////////////////////////////////////////////////////////////////////////
5123 // QUERY
5124 /////////////////////////////////////////////////////////////////////////////
5125 
/// default query: all indexes ("*"), empty query text, first 20 matches out of
/// at most 1000, match-all mode, default ranker, relevance sorting,
/// grouping sorted by "@groupby desc", no cutoff, retries, or time limit
CSphQuery::CSphQuery ()
	: m_sIndexes	( "*" )
	, m_sQuery		( "" )
	, m_sRawQuery	( "" )
	, m_iOffset		( 0 )
	, m_iLimit		( 20 )
	, m_pWeights	( NULL )
	, m_iWeights	( 0 )
	, m_eMode		( SPH_MATCH_ALL )
	, m_eRanker		( SPH_RANK_DEFAULT )
	, m_eSort		( SPH_SORT_RELEVANCE )
	, m_iMaxMatches	( 1000 )
	, m_eGroupFunc		( SPH_GROUPBY_ATTR )
	, m_sGroupSortBy	( "@groupby desc" )
	, m_sGroupDistinct	( "" )
	, m_iCutoff			( 0 )
	, m_iRetryCount		( 0 )
	, m_iRetryDelay		( 0 )
	, m_bGeoAnchor		( false )
	, m_fGeoLatitude	( 0.0f )
	, m_fGeoLongitude	( 0.0f )
	, m_uMaxQueryMsec	( 0 )
	, m_sComment		( "" )
	, m_sSelect			( "" )
	, m_bReverseScan	( false )
	, m_iSQLSelectStart	( -1 )
	, m_iSQLSelectEnd	( -1 )

	// "old" members; presumably kept for legacy client protocol versions - TODO confirm
	, m_iOldVersion		( 0 )
	, m_iOldGroups		( 0 )
	, m_pOldGroups		( NULL )
	, m_iOldMinTS		( 0 )
	, m_iOldMaxTS		( UINT_MAX )
	, m_iOldMinGID		( 0 )
	, m_iOldMaxGID		( UINT_MAX )

	, m_eCollation		( SPH_COLLATION_DEFAULT )
	, m_bAgent			( false )
{}
5165 
5166 
~CSphQuery()5167 CSphQuery::~CSphQuery ()
5168 {
5169 }
5170 
5171 
GetIndexWeight(const char * sName) const5172 int CSphQuery::GetIndexWeight ( const char * sName ) const
5173 {
5174 	ARRAY_FOREACH ( i, m_dIndexWeights )
5175 		if ( m_dIndexWeights[i].m_sName==sName )
5176 			return m_dIndexWeights[i].m_iValue;
5177 	return 1;
5178 }
5179 
5180 //////////////////////////////////////////////////////////////////////////
5181 
/// semantic value of the select list parser (used as YYSTYPE)
/// a pair of offsets into the select list text being parsed
struct SelectBounds_t
{
	int		m_iStart;	///< offset of the first char, relative to the parser's m_pStart
	int		m_iEnd;		///< offset just past the last char, relative to the parser's m_pStart
};
5187 #define YYSTYPE SelectBounds_t
5188 #include "yysphinxselect.h"
5189 
5190 
/// select list parser state, shared between the lexer (GetToken) and the
/// generated grammar code, which calls back into AddItem/AliasLastItem
/// NOTE(review): there is no constructor; all the pointers must be set by the caller
class SelectParser_t
{
public:
	int				GetToken ( YYSTYPE * lvalp );
	void			AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc=SPH_AGGR_NONE, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
	void			AddItem ( const char * pToken, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
	void			AliasLastItem ( YYSTYPE * pAlias );
private:
	void			AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd );

public:
	CSphString		m_sParserError;		///< error message, filled by yyerror()
	const char *	m_pLastTokenStart;	///< start of the last token fetched by GetToken(); printed by yyerror()

	const char *	m_pStart;			///< start of the select list text; all token offsets are relative to it
	const char *	m_pCur;				///< current lexer position

	CSphQuery *		m_pQuery;			///< query that receives the parsed items
};
5210 
yylex(YYSTYPE * lvalp,SelectParser_t * pParser)5211 int yylex ( YYSTYPE * lvalp, SelectParser_t * pParser )				{ return pParser->GetToken ( lvalp );}
yyerror(SelectParser_t * pParser,const char * sMessage)5212 void yyerror ( SelectParser_t * pParser, const char * sMessage )	{ pParser->m_sParserError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastTokenStart ); }
5213 #include "yysphinxselect.c"
5214 
5215 
GetToken(YYSTYPE * lvalp)5216 int SelectParser_t::GetToken ( YYSTYPE * lvalp )
5217 {
5218 	// skip whitespace, check eof
5219 	while ( isspace ( *m_pCur ) ) m_pCur++;
5220 	if ( !*m_pCur ) return 0;
5221 
5222 	m_pLastTokenStart = m_pCur;
5223 	lvalp->m_iStart = m_pCur-m_pStart;
5224 
5225 	// check for constant
5226 	if ( isdigit ( *m_pCur ) )
5227 	{
5228 		char * pEnd = NULL;
5229 		double fDummy; // to avoid gcc unused result warning
5230 		fDummy = strtod ( m_pCur, &pEnd );
5231 		fDummy *= 2; // to avoid gcc unused variable warning
5232 
5233 		m_pCur = pEnd;
5234 		lvalp->m_iEnd = m_pCur-m_pStart;
5235 		return SEL_TOKEN;
5236 	}
5237 
5238 	// check for token
5239 	if ( sphIsAttr ( m_pCur[0] ) || ( m_pCur[0]=='@' && sphIsAttr ( m_pCur[1] ) && !isdigit ( m_pCur[1] ) ) )
5240 	{
5241 		m_pCur++;
5242 		while ( sphIsAttr ( *m_pCur ) ) m_pCur++;
5243 		lvalp->m_iEnd = m_pCur-m_pStart;
5244 
5245 		#define LOC_CHECK(_str,_len,_ret) \
5246 			if ( lvalp->m_iEnd==_len+lvalp->m_iStart && strncasecmp ( m_pStart+lvalp->m_iStart, _str, _len )==0 ) return _ret;
5247 
5248 		LOC_CHECK ( "ID", 2, SEL_ID );
5249 		LOC_CHECK ( "AS", 2, SEL_AS );
5250 		LOC_CHECK ( "OR", 2, TOK_OR );
5251 		LOC_CHECK ( "AND", 3, TOK_AND );
5252 		LOC_CHECK ( "NOT", 3, TOK_NOT );
5253 		LOC_CHECK ( "DIV", 3, TOK_DIV );
5254 		LOC_CHECK ( "MOD", 3, TOK_MOD );
5255 		LOC_CHECK ( "AVG", 3, SEL_AVG );
5256 		LOC_CHECK ( "MIN", 3, SEL_MIN );
5257 		LOC_CHECK ( "MAX", 3, SEL_MAX );
5258 		LOC_CHECK ( "SUM", 3, SEL_SUM );
5259 		LOC_CHECK ( "COUNT", 5, SEL_COUNT );
5260 		LOC_CHECK ( "DISTINCT", 8, SEL_DISTINCT );
5261 		LOC_CHECK ( "WEIGHT", 6, SEL_WEIGHT );
5262 
5263 		#undef LOC_CHECK
5264 
5265 		return SEL_TOKEN;
5266 	}
5267 
5268 	// check for equality checks
5269 	lvalp->m_iEnd = 1+lvalp->m_iStart;
5270 	switch ( *m_pCur )
5271 	{
5272 		case '<':
5273 			m_pCur++;
5274 			if ( *m_pCur=='>' ) { m_pCur++; lvalp->m_iEnd++; return TOK_NE; }
5275 			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_LTE; }
5276 			return '<';
5277 
5278 		case '>':
5279 			m_pCur++;
5280 			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_GTE; }
5281 			return '>';
5282 
5283 		case '=':
5284 			m_pCur++;
5285 			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; }
5286 			return TOK_EQ;
5287 
5288 		case '\'':
5289 		{
5290 			const char cEnd = *m_pCur;
5291 			for ( const char * s = m_pCur+1; *s; s++ )
5292 			{
5293 				if ( *s==cEnd )
5294 				{
5295 					m_pCur = s+1;
5296 					return TOK_CONST_STRING;
5297 				}
5298 			}
5299 			return -1;
5300 		}
5301 	}
5302 
5303 	// return char as a token
5304 	return *m_pCur++;
5305 }
5306 
AutoAlias(CSphQueryItem & tItem,YYSTYPE * pStart,YYSTYPE * pEnd)5307 void SelectParser_t::AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd )
5308 {
5309 	if ( pStart && pEnd )
5310 	{
5311 		tItem.m_sAlias.SetBinary ( m_pStart + pStart->m_iStart, pEnd->m_iEnd - pStart->m_iStart );
5312 		tItem.m_sAlias.ToLower();
5313 	} else
5314 		tItem.m_sAlias = tItem.m_sExpr;
5315 }
5316 
AddItem(YYSTYPE * pExpr,ESphAggrFunc eAggrFunc,YYSTYPE * pStart,YYSTYPE * pEnd)5317 void SelectParser_t::AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc, YYSTYPE * pStart, YYSTYPE * pEnd )
5318 {
5319 	CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
5320 	tItem.m_sExpr.SetBinary ( m_pStart + pExpr->m_iStart, pExpr->m_iEnd - pExpr->m_iStart );
5321 	tItem.m_sExpr.ToLower();
5322 	tItem.m_eAggrFunc = eAggrFunc;
5323 	AutoAlias ( tItem, pStart, pEnd );
5324 }
5325 
AddItem(const char * pToken,YYSTYPE * pStart,YYSTYPE * pEnd)5326 void SelectParser_t::AddItem ( const char * pToken, YYSTYPE * pStart, YYSTYPE * pEnd )
5327 {
5328 	CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
5329 	tItem.m_sExpr = pToken;
5330 	tItem.m_eAggrFunc = SPH_AGGR_NONE;
5331 	tItem.m_sExpr.ToLower();
5332 	AutoAlias ( tItem, pStart, pEnd );
5333 }
5334 
AliasLastItem(YYSTYPE * pAlias)5335 void SelectParser_t::AliasLastItem ( YYSTYPE * pAlias )
5336 {
5337 	if ( pAlias )
5338 	{
5339 		CSphQueryItem & tItem = m_pQuery->m_dItems.Last();
5340 		tItem.m_sAlias.SetBinary ( m_pStart + pAlias->m_iStart, pAlias->m_iEnd - pAlias->m_iStart );
5341 		tItem.m_sAlias.ToLower();
5342 	}
5343 }
5344 
5345 
ParseSelectList(CSphString & sError)5346 bool CSphQuery::ParseSelectList ( CSphString & sError )
5347 {
5348 	m_dItems.Reset ();
5349 	if ( m_sSelect.IsEmpty() )
5350 		return true; // empty is ok; will just return everything
5351 
5352 	SelectParser_t tParser;
5353 	tParser.m_pStart = m_sSelect.cstr();
5354 	tParser.m_pCur = m_sSelect.cstr();
5355 	tParser.m_pQuery = this;
5356 
5357 	yyparse ( &tParser );
5358 
5359 	sError = tParser.m_sParserError;
5360 	return sError.IsEmpty ();
5361 }
5362 
5363 /////////////////////////////////////////////////////////////////////////////
5364 // SCHEMA
5365 /////////////////////////////////////////////////////////////////////////////
5366 
sphDumpAttr(const CSphColumnInfo & tAttr)5367 static CSphString sphDumpAttr ( const CSphColumnInfo & tAttr )
5368 {
5369 	CSphString sRes;
5370 	sRes.SetSprintf ( "%s %s:%d@%d", sphTypeName ( tAttr.m_eAttrType ), tAttr.m_sName.cstr(), tAttr.m_tLocator.m_iBitCount, tAttr.m_tLocator.m_iBitOffset );
5371 	return sRes;
5372 }
5373 
5374 
CompareTo(const CSphSchema & rhs,CSphString & sError) const5375 bool CSphSchema::CompareTo ( const CSphSchema & rhs, CSphString & sError ) const
5376 {
5377 	// check attr count
5378 	if ( GetAttrsCount()!=rhs.GetAttrsCount() )
5379 	{
5380 		sError.SetSprintf ( "attribute count mismatch (me=%s, in=%s, myattrs=%d, inattrs=%d)",
5381 			m_sName.cstr(), rhs.m_sName.cstr(),
5382 			GetAttrsCount(), rhs.GetAttrsCount() );
5383 		return false;
5384 	}
5385 
5386 	// check attrs
5387 	ARRAY_FOREACH ( i, m_dAttrs )
5388 		if (!( rhs.m_dAttrs[i]==m_dAttrs[i] ))
5389 	{
5390 		sError.SetSprintf ( "attribute mismatch (me=%s, in=%s, idx=%d, myattr=%s, inattr=%s)",
5391 			m_sName.cstr(), rhs.m_sName.cstr(),
5392 			i, sphDumpAttr ( m_dAttrs[i] ).cstr(), sphDumpAttr ( rhs.m_dAttrs[i] ).cstr() );
5393 		return false;
5394 	}
5395 
5396 	// check field count
5397 	if ( rhs.m_dFields.GetLength()!=m_dFields.GetLength() )
5398 	{
5399 		sError.SetSprintf ( "fulltext fields count mismatch (me=%s, in=%s, myfields=%d, infields=%d)",
5400 			m_sName.cstr(), rhs.m_sName.cstr(),
5401 			m_dFields.GetLength(), rhs.m_dFields.GetLength() );
5402 		return false;
5403 	}
5404 
5405 	// check fulltext field names
5406 	ARRAY_FOREACH ( i, rhs.m_dFields )
5407 		if ( rhs.m_dFields[i].m_sName!=m_dFields[i].m_sName )
5408 	{
5409 		sError.SetSprintf ( "fulltext field mismatch (me=%s, myfield=%s, idx=%d, in=%s, infield=%s)",
5410 			m_sName.cstr(), rhs.m_sName.cstr(),
5411 			i, m_dFields[i].m_sName.cstr(), rhs.m_dFields[i].m_sName.cstr() );
5412 		return false;
5413 	}
5414 
5415 	return true;
5416 }
5417 
5418 
GetFieldIndex(const char * sName) const5419 int CSphSchema::GetFieldIndex ( const char * sName ) const
5420 {
5421 	if ( !sName )
5422 		return -1;
5423 	ARRAY_FOREACH ( i, m_dFields )
5424 		if ( strcasecmp ( m_dFields[i].m_sName.cstr(), sName )==0 )
5425 			return i;
5426 	return -1;
5427 }
5428 
5429 
GetAttrIndex(const char * sName) const5430 int CSphSchema::GetAttrIndex ( const char * sName ) const
5431 {
5432 	if ( !sName )
5433 		return -1;
5434 	ARRAY_FOREACH ( i, m_dAttrs )
5435 		if ( m_dAttrs[i].m_sName==sName )
5436 			return i;
5437 	return -1;
5438 }
5439 
5440 
GetAttr(const char * sName) const5441 const CSphColumnInfo * CSphSchema::GetAttr ( const char * sName ) const
5442 {
5443 	int iIndex = GetAttrIndex ( sName );
5444 	if ( iIndex>=0 )
5445 		return &m_dAttrs[iIndex];
5446 	return NULL;
5447 }
5448 
5449 
Reset()5450 void CSphSchema::Reset ()
5451 {
5452 	m_dFields.Reset();
5453 	ResetAttrs ();
5454 }
5455 
5456 
ResetAttrs()5457 void CSphSchema::ResetAttrs ()
5458 {
5459 	m_dAttrs.Reset();
5460 	m_dStaticUsed.Reset();
5461 	m_dDynamicUsed.Reset();
5462 	m_iStaticSize = 0;
5463 }
5464 
5465 
AddAttr(const CSphColumnInfo & tCol,bool bDynamic)5466 void CSphSchema::AddAttr ( const CSphColumnInfo & tCol, bool bDynamic )
5467 {
5468 	assert ( tCol.m_eAttrType!=SPH_ATTR_NONE );
5469 	if ( tCol.m_eAttrType==SPH_ATTR_NONE )
5470 		return;
5471 
5472 	m_dAttrs.Add ( tCol );
5473 	CSphAttrLocator & tLoc = m_dAttrs.Last().m_tLocator;
5474 
5475 	if ( tLoc.IsID() )
5476 		return;
5477 
5478 	int iBits = ROWITEM_BITS;
5479 	if ( tCol.m_tLocator.m_iBitCount>0 )		iBits = tCol.m_tLocator.m_iBitCount;
5480 	if ( tCol.m_eAttrType==SPH_ATTR_BOOL )		iBits = 1;
5481 	if ( tCol.m_eAttrType==SPH_ATTR_BIGINT )	iBits = 64;
5482 	tLoc.m_iBitCount = iBits;
5483 	tLoc.m_bDynamic = bDynamic;
5484 
5485 	CSphVector<int> & dUsed = bDynamic ? m_dDynamicUsed : m_dStaticUsed;
5486 	if ( iBits>=ROWITEM_BITS )
5487 	{
5488 		tLoc.m_iBitOffset = dUsed.GetLength()*ROWITEM_BITS;
5489 
5490 		int iItems = (iBits+ROWITEM_BITS-1) / ROWITEM_BITS;
5491 		for ( int i=0; i<iItems; i++ )
5492 		{
5493 			dUsed.Add ( ROWITEM_BITS );
5494 			if ( !bDynamic )
5495 				m_iStaticSize++;
5496 		}
5497 
5498 	} else
5499 	{
5500 		int iItem;
5501 		for ( iItem=0; iItem<dUsed.GetLength(); iItem++ )
5502 			if ( dUsed[iItem]+iBits<=ROWITEM_BITS )
5503 				break;
5504 		if ( iItem==dUsed.GetLength() )
5505 		{
5506 			dUsed.Add ( 0 );
5507 			if ( !bDynamic )
5508 				m_iStaticSize++;
5509 		}
5510 
5511 		tLoc.m_iBitOffset = iItem*ROWITEM_BITS + dUsed[iItem];
5512 		dUsed[iItem] += iBits;
5513 	}
5514 }
5515 
5516 
RemoveAttr(int iIndex)5517 void CSphSchema::RemoveAttr ( int iIndex )
5518 {
5519 	// adjust size
5520 	CSphAttrLocator & tLoc = m_dAttrs[iIndex].m_tLocator;
5521 	assert ( !tLoc.m_bDynamic );
5522 
5523 	int iItem = tLoc.m_iBitOffset / ROWITEM_BITS;
5524 	if ( tLoc.m_iBitCount>=ROWITEM_BITS )
5525 	{
5526 		for ( int i=0; i<tLoc.m_iBitCount/ROWITEM_BITS; i++ )
5527 		{
5528 			m_dStaticUsed[i+iItem] = 0;
5529 			m_iStaticSize--;
5530 		}
5531 	} else
5532 	{
5533 		m_dStaticUsed[iItem] -= tLoc.m_iBitCount;
5534 		assert ( m_dStaticUsed[iItem]>=0 );
5535 
5536 		if ( m_dStaticUsed[iItem]<=0 )
5537 			m_iStaticSize--;
5538 	}
5539 
5540 	// do remove
5541 	m_dAttrs.Remove ( iIndex );
5542 }
5543 
5544 ///////////////////////////////////////////////////////////////////////////////
5545 // BIT-ENCODED FILE OUTPUT
5546 ///////////////////////////////////////////////////////////////////////////////
5547 
/// create a closed writer: no file attached (m_iFD is -1), no buffer
/// allocated yet, and a default buffer size of 262144 bytes
CSphWriter::CSphWriter ()
	: m_sName ( "" )
	, m_iPos ( -1 )
	, m_iWritten ( 0 )

	, m_iFD ( -1 )
	, m_iPoolUsed ( 0 )
	, m_pBuffer ( NULL )
	, m_pPool ( NULL )
	, m_bOwnFile ( false )
	, m_pSharedOffset ( NULL )
	, m_iBufferSize	( 262144 )

	, m_bError ( false )
	, m_pError ( NULL )
{
}
5565 
5566 
SetBufferSize(int iBufferSize)5567 void CSphWriter::SetBufferSize ( int iBufferSize )
5568 {
5569 	if ( iBufferSize!=m_iBufferSize )
5570 	{
5571 		m_iBufferSize = Max ( iBufferSize, 262144 );
5572 		if ( m_pBuffer )
5573 			SafeDeleteArray ( m_pBuffer );
5574 	}
5575 }
5576 
5577 
OpenFile(const CSphString & sName,CSphString & sErrorBuffer)5578 bool CSphWriter::OpenFile ( const CSphString & sName, CSphString & sErrorBuffer )
5579 {
5580 	assert ( !sName.IsEmpty() );
5581 	assert ( m_iFD<0 && "already open" );
5582 
5583 	m_bOwnFile = true;
5584 	m_sName = sName;
5585 	m_pError = &sErrorBuffer;
5586 
5587 	if ( !m_pBuffer )
5588 		m_pBuffer = new BYTE [ m_iBufferSize ];
5589 
5590 	m_iFD = ::open ( m_sName.cstr(), SPH_O_NEW, 0644 );
5591 	m_pPool = m_pBuffer;
5592 	m_iPoolUsed = 0;
5593 	m_iPos = 0;
5594 	m_iWritten = 0;
5595 	m_bError = ( m_iFD<0 );
5596 
5597 	if ( m_bError )
5598 		m_pError->SetSprintf ( "failed to create %s: %s" , sName.cstr(), strerror(errno) );
5599 
5600 	return !m_bError;
5601 }
5602 
5603 
SetFile(CSphAutofile & tAuto,SphOffset_t * pSharedOffset,CSphString & sError)5604 void CSphWriter::SetFile ( CSphAutofile & tAuto, SphOffset_t * pSharedOffset, CSphString & sError )
5605 {
5606 	assert ( m_iFD<0 && "already open" );
5607 	m_bOwnFile = false;
5608 
5609 	if ( !m_pBuffer )
5610 		m_pBuffer = new BYTE [ m_iBufferSize ];
5611 
5612 	m_iFD = tAuto.GetFD();
5613 	m_sName = tAuto.GetFilename();
5614 	m_pPool = m_pBuffer;
5615 	m_iPoolUsed = 0;
5616 	m_iPos = 0;
5617 	m_iWritten = 0;
5618 	m_pSharedOffset = pSharedOffset;
5619 	m_pError = &sError;
5620 	assert ( m_pError );
5621 }
5622 
5623 
~CSphWriter()5624 CSphWriter::~CSphWriter ()
5625 {
5626 	CloseFile ();
5627 	SafeDeleteArray ( m_pBuffer );
5628 }
5629 
5630 
CloseFile(bool bTruncate)5631 void CSphWriter::CloseFile ( bool bTruncate )
5632 {
5633 	if ( m_iFD>=0 )
5634 	{
5635 		Flush ();
5636 		if ( bTruncate )
5637 			sphTruncate ( m_iFD );
5638 		if ( m_bOwnFile )
5639 			::close ( m_iFD );
5640 		m_iFD = -1;
5641 	}
5642 }
5643 
UnlinkFile()5644 void CSphWriter::UnlinkFile()
5645 {
5646 	if ( m_bOwnFile )
5647 	{
5648 		if ( m_iFD>=0 )
5649 			::close ( m_iFD );
5650 
5651 		m_iFD = -1;
5652 		::unlink ( m_sName.cstr() );
5653 		m_sName = "";
5654 	}
5655 	SafeDeleteArray ( m_pBuffer );
5656 }
5657 
5658 
PutByte(int data)5659 void CSphWriter::PutByte ( int data )
5660 {
5661 	if ( m_iPoolUsed==m_iBufferSize )
5662 		Flush ();
5663 	*m_pPool++ = BYTE ( data & 0xff );
5664 	m_iPoolUsed++;
5665 	m_iPos++;
5666 }
5667 
5668 
PutBytes(const void * pData,int iSize)5669 void CSphWriter::PutBytes ( const void * pData, int iSize )
5670 {
5671 	const BYTE * pBuf = (const BYTE *) pData;
5672 	while ( iSize>0 )
5673 	{
5674 		int iPut = Min ( iSize, m_iBufferSize );
5675 		if ( m_iPoolUsed+iPut>m_iBufferSize )
5676 			Flush ();
5677 		assert ( m_iPoolUsed+iPut<=m_iBufferSize );
5678 
5679 		memcpy ( m_pPool, pBuf, iPut );
5680 		m_pPool += iPut;
5681 		m_iPoolUsed += iPut;
5682 		m_iPos += iPut;
5683 
5684 		pBuf += iPut;
5685 		iSize -= iPut;
5686 	}
5687 }
5688 
5689 
ZipInt(DWORD uValue)5690 void CSphWriter::ZipInt ( DWORD uValue )
5691 {
5692 	int iBytes = 1;
5693 
5694 	DWORD u = ( uValue>>7 );
5695 	while ( u )
5696 	{
5697 		u >>= 7;
5698 		iBytes++;
5699 	}
5700 
5701 	while ( iBytes-- )
5702 		PutByte (
5703 			( 0x7f & ( uValue >> (7*iBytes) ) )
5704 			| ( iBytes ? 0x80 : 0 ) );
5705 }
5706 
5707 
ZipOffset(SphOffset_t uValue)5708 void CSphWriter::ZipOffset ( SphOffset_t uValue )
5709 {
5710 	int iBytes = 1;
5711 
5712 	uint64_t u = ((uint64_t)uValue)>>7;
5713 	while ( u )
5714 	{
5715 		u >>= 7;
5716 		iBytes++;
5717 	}
5718 
5719 	while ( iBytes-- )
5720 		PutByte (
5721 			( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
5722 			| ( iBytes ? 0x80 : 0 ) );
5723 }
5724 
5725 
ZipOffsets(CSphVector<SphOffset_t> * pData)5726 void CSphWriter::ZipOffsets ( CSphVector<SphOffset_t> * pData )
5727 {
5728 	assert ( pData );
5729 
5730 	SphOffset_t * pValue = &((*pData)[0]);
5731 	int n = pData->GetLength ();
5732 
5733 	while ( n-->0 )
5734 	{
5735 		SphOffset_t uValue = *pValue++;
5736 
5737 		int iBytes = 1;
5738 
5739 		uint64_t u = ((uint64_t)uValue)>>7;
5740 		while ( u )
5741 		{
5742 			u >>= 7;
5743 			iBytes++;
5744 		}
5745 
5746 		while ( iBytes-- )
5747 			PutByte (
5748 				( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
5749 				| ( iBytes ? 0x80 : 0 ) );
5750 	}
5751 }
5752 
5753 
Flush()5754 void CSphWriter::Flush ()
5755 {
5756 	PROFILE ( write_hits );
5757 
5758 	if ( m_pSharedOffset && *m_pSharedOffset!=m_iWritten )
5759 		sphSeek ( m_iFD, m_iWritten, SEEK_SET );
5760 
5761 	if ( !sphWriteThrottled ( m_iFD, m_pBuffer, m_iPoolUsed, m_sName.cstr(), *m_pError ) )
5762 		m_bError = true;
5763 
5764 	m_iWritten += m_iPoolUsed;
5765 	m_iPoolUsed = 0;
5766 	m_pPool = m_pBuffer;
5767 
5768 	if ( m_pSharedOffset )
5769 		*m_pSharedOffset = m_iWritten;
5770 }
5771 
5772 
PutString(const char * szString)5773 void CSphWriter::PutString ( const char * szString )
5774 {
5775 	int iLen = szString ? strlen ( szString ) : 0;
5776 	PutDword ( iLen );
5777 	if ( iLen )
5778 		PutBytes ( szString, iLen );
5779 }
5780 
PutString(const CSphString & sString)5781 void CSphWriter::PutString ( const CSphString & sString )
5782 {
5783 	int iLen = sString.Length();
5784 	PutDword ( iLen );
5785 	if ( iLen )
5786 		PutBytes ( sString.cstr(), iLen );
5787 }
5788 
5789 
SeekTo(SphOffset_t iPos)5790 void CSphWriter::SeekTo ( SphOffset_t iPos )
5791 {
5792 	assert ( iPos>=0 );
5793 
5794 	if ( iPos>=m_iWritten && iPos<=( m_iWritten + m_iPoolUsed ) )
5795 	{
5796 		// seeking inside the buffer
5797 		m_iPoolUsed = (int)( iPos - m_iWritten );
5798 		m_pPool = m_pBuffer + m_iPoolUsed;
5799 	} else
5800 	{
5801 		assert ( iPos<m_iWritten ); // seeking forward in a writer, we don't support it
5802 		sphSeek ( m_iFD, iPos, SEEK_SET );
5803 
5804 		// seeking outside the buffer; so the buffer must be discarded
5805 		// also, current write position must be adjusted
5806 		m_pPool = m_pBuffer;
5807 		m_iPoolUsed = 0;
5808 		m_iWritten = iPos;
5809 	}
5810 	m_iPos = iPos;
5811 }
5812 
5813 ///////////////////////////////////////////////////////////////////////////////
5814 // BIT-ENCODED FILE INPUT
5815 ///////////////////////////////////////////////////////////////////////////////
5816 
/// create a reader with no file attached (m_iFD is -1)
/// pBuf, when not NULL, is a caller-provided buffer of iSize bytes, which
/// must be positive then; it is not owned by the reader (m_bBufOwned is false)
CSphReader::CSphReader ( BYTE * pBuf, int iSize )
	: m_iFD ( -1 )
	, m_iPos ( 0 )
	, m_iBuffPos ( 0 )
	, m_iBuffUsed ( 0 )
	, m_pBuff ( pBuf )
	, m_iSizeHint ( 0 )
	, m_iBufSize ( iSize )
	, m_bBufOwned ( false )
	, m_iReadUnhinted ( DEFAULT_READ_UNHINTED )
	, m_bError ( false )
{
	assert ( pBuf==NULL || iSize>0 );
}
5831 
5832 
~CSphReader()5833 CSphReader::~CSphReader ()
5834 {
5835 	if ( m_bBufOwned )
5836 		SafeDeleteArray ( m_pBuff );
5837 }
5838 
5839 
SetBuffers(int iReadBuffer,int iReadUnhinted)5840 void CSphReader::SetBuffers ( int iReadBuffer, int iReadUnhinted )
5841 {
5842 	if ( !m_pBuff )
5843 		m_iBufSize = iReadBuffer;
5844 	m_iReadUnhinted = iReadUnhinted;
5845 }
5846 
5847 
SetFile(int iFD,const char * sFilename)5848 void CSphReader::SetFile ( int iFD, const char * sFilename )
5849 {
5850 	m_iFD = iFD;
5851 	m_iPos = 0;
5852 	m_iBuffPos = 0;
5853 	m_iBuffUsed = 0;
5854 	m_sFilename = sFilename;
5855 }
5856 
5857 
SetFile(const CSphAutofile & tFile)5858 void CSphReader::SetFile ( const CSphAutofile & tFile )
5859 {
5860 	SetFile ( tFile.GetFD(), tFile.GetFilename() );
5861 }
5862 
5863 
Reset()5864 void CSphReader::Reset ()
5865 {
5866 	SetFile ( -1, "" );
5867 }
5868 
5869 
SeekTo(SphOffset_t iPos,int iSizeHint)5870 void CSphReader::SeekTo ( SphOffset_t iPos, int iSizeHint )
5871 {
5872 	assert ( iPos>=0 );
5873 
5874 #ifndef NDEBUG
5875 #if PARANOID
5876 	struct_stat tStat;
5877 	fstat ( m_iFD, &tStat );
5878 	if ( iPos > tStat.st_size )
5879 		sphDie ( "INTERNAL ERROR: seeking past the end of file" );
5880 #endif
5881 #endif
5882 
5883 	if ( iPos>=m_iPos && iPos<m_iPos+m_iBuffUsed )
5884 	{
5885 		m_iBuffPos = (int)( iPos-m_iPos ); // reposition to proper byte
5886 		m_iSizeHint = iSizeHint - ( m_iBuffUsed - m_iBuffPos ); // we already have some bytes cached, so let's adjust size hint
5887 		assert ( m_iBuffPos<m_iBuffUsed );
5888 	} else
5889 	{
5890 		m_iPos = iPos;
5891 		m_iBuffPos = 0; // for GetPos() to work properly, aaaargh
5892 		m_iBuffUsed = 0;
5893 		m_iSizeHint = iSizeHint;
5894 	}
5895 }
5896 
5897 
SkipBytes(int iCount)5898 void CSphReader::SkipBytes ( int iCount )
5899 {
5900 	SeekTo ( m_iPos+m_iBuffPos+iCount, m_iSizeHint-m_iBuffPos-iCount );
5901 }
5902 
5903 
5904 #if USE_WINDOWS
5905 
5906 // atomic seek+read for Windows
sphPread(int iFD,void * pBuf,int iBytes,SphOffset_t iOffset)5907 int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
5908 {
5909 	if ( iBytes==0 )
5910 		return 0;
5911 
5912 	CSphIOStats * pIOStats = GetIOStats();
5913 	int64_t tmStart = 0;
5914 	if ( pIOStats )
5915 		tmStart = sphMicroTimer();
5916 
5917 	HANDLE hFile;
5918 	hFile = (HANDLE) _get_osfhandle ( iFD );
5919 	if ( hFile==INVALID_HANDLE_VALUE )
5920 		return -1;
5921 
5922 	STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
5923 	OVERLAPPED tOverlapped = { 0 };
5924 	tOverlapped.Offset = (DWORD)( iOffset & I64C(0xffffffff) );
5925 	tOverlapped.OffsetHigh = (DWORD)( iOffset>>32 );
5926 
5927 	DWORD uRes;
5928 	if ( !ReadFile ( hFile, pBuf, iBytes, &uRes, &tOverlapped ) )
5929 	{
5930 		DWORD uErr = GetLastError();
5931 		if ( uErr==ERROR_HANDLE_EOF )
5932 			return 0;
5933 
5934 		errno = uErr; // FIXME! should remap from Win to POSIX
5935 		return -1;
5936 	}
5937 
5938 	if ( pIOStats )
5939 	{
5940 		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
5941 		pIOStats->m_iReadOps++;
5942 		pIOStats->m_iReadBytes += iBytes;
5943 	}
5944 
5945 	return uRes;
5946 }
5947 
5948 #else
5949 #if HAVE_PREAD
5950 
5951 // atomic seek+read for non-Windows systems with pread() call
sphPread(int iFD,void * pBuf,int iBytes,SphOffset_t iOffset)5952 int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
5953 {
5954 	CSphIOStats * pIOStats = GetIOStats();
5955 	if ( !pIOStats )
5956 		return ::pread ( iFD, pBuf, iBytes, iOffset );
5957 
5958 	int64_t tmStart = sphMicroTimer();
5959 	int iRes = (int) ::pread ( iFD, pBuf, iBytes, iOffset );
5960 	if ( pIOStats )
5961 	{
5962 		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
5963 		pIOStats->m_iReadOps++;
5964 		pIOStats->m_iReadBytes += iBytes;
5965 	}
5966 	return iRes;
5967 }
5968 
5969 #else
5970 
5971 // generic fallback; prone to races between seek and read
sphPread(int iFD,void * pBuf,int iBytes,SphOffset_t iOffset)5972 int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
5973 {
5974 	if ( sphSeek ( iFD, iOffset, SEEK_SET )==-1 )
5975 		return -1;
5976 
5977 	return sphReadThrottled ( iFD, pBuf, iBytes );
5978 }
5979 
5980 #endif // HAVE_PREAD
5981 #endif // USE_WINDOWS
5982 
5983 
UpdateCache()5984 void CSphReader::UpdateCache ()
5985 {
5986 	PROFILE ( read_hits );
5987 	assert ( m_iFD>=0 );
5988 
5989 	// alloc buf on first actual read
5990 	if ( !m_pBuff )
5991 	{
5992 		if ( m_iBufSize<=0 )
5993 			m_iBufSize = DEFAULT_READ_BUFFER;
5994 
5995 		m_bBufOwned = true;
5996 		m_pBuff = new BYTE [ m_iBufSize ];
5997 	}
5998 
5999 	// stream position could be changed externally
6000 	// so let's just hope that the OS optimizes redundant seeks
6001 	SphOffset_t iNewPos = m_iPos + Min ( m_iBuffPos, m_iBuffUsed );
6002 
6003 	if ( m_iSizeHint<=0 )
6004 		m_iSizeHint = ( m_iReadUnhinted>0 ) ? m_iReadUnhinted : DEFAULT_READ_UNHINTED;
6005 	int iReadLen = Min ( m_iSizeHint, m_iBufSize );
6006 
6007 	m_iBuffPos = 0;
6008 	m_iBuffUsed = sphPread ( m_iFD, m_pBuff, iReadLen, iNewPos ); // FIXME! what about throttling?
6009 
6010 	if ( m_iBuffUsed<0 )
6011 	{
6012 		m_iBuffUsed = m_iBuffPos = 0;
6013 		m_bError = true;
6014 		m_sError.SetSprintf ( "pread error in %s: pos="INT64_FMT", len=%d, code=%d, msg=%s",
6015 			m_sFilename.cstr(), (int64_t)iNewPos, iReadLen, errno, strerror(errno) );
6016 		return;
6017 	}
6018 
6019 	// all fine, adjust offset and hint
6020 	m_iSizeHint -= m_iBuffUsed;
6021 	m_iPos = iNewPos;
6022 }
6023 
6024 
GetByte()6025 int CSphReader::GetByte ()
6026 {
6027 	if ( m_iBuffPos>=m_iBuffUsed )
6028 	{
6029 		UpdateCache ();
6030 		if ( m_iBuffPos>=m_iBuffUsed )
6031 			return 0; // unexpected io failure
6032 	}
6033 
6034 	assert ( m_iBuffPos<m_iBuffUsed );
6035 	return m_pBuff [ m_iBuffPos++ ];
6036 }
6037 
6038 
GetBytes(void * pData,int iSize)6039 void CSphReader::GetBytes ( void * pData, int iSize )
6040 {
6041 	BYTE * pOut = (BYTE*) pData;
6042 
6043 	while ( iSize>m_iBufSize )
6044 	{
6045 		int iLen = m_iBuffUsed - m_iBuffPos;
6046 		assert ( iLen<=m_iBufSize );
6047 
6048 		memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
6049 		m_iBuffPos += iLen;
6050 		pOut += iLen;
6051 		iSize -= iLen;
6052 		m_iSizeHint = iSize; // FIXME!
6053 
6054 		if ( iSize>0 )
6055 		{
6056 			UpdateCache ();
6057 			if ( !m_iBuffUsed )
6058 			{
6059 				memset ( pData, 0, iSize );
6060 				return; // unexpected io failure
6061 			}
6062 		}
6063 	}
6064 
6065 	if ( m_iBuffPos+iSize>m_iBuffUsed )
6066 	{
6067 		// move old buffer tail to buffer head to avoid losing the data
6068 		const int iLen = m_iBuffUsed - m_iBuffPos;
6069 		if ( iLen>0 )
6070 		{
6071 			memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
6072 			m_iBuffPos += iLen;
6073 			pOut += iLen;
6074 			iSize -= iLen;
6075 		}
6076 
6077 		m_iSizeHint = iSize - m_iBuffUsed + m_iBuffPos; // FIXME!
6078 		UpdateCache ();
6079 		if ( m_iBuffPos+iSize>m_iBuffUsed )
6080 		{
6081 			memset ( pData, 0, iSize ); // unexpected io failure
6082 			return;
6083 		}
6084 	}
6085 
6086 	assert ( (m_iBuffPos+iSize)<=m_iBuffUsed );
6087 	memcpy ( pOut, m_pBuff+m_iBuffPos, iSize );
6088 	m_iBuffPos += iSize;
6089 }
6090 
6091 
GetBytesZerocopy(const BYTE ** ppData,int iMax)6092 int CSphReader::GetBytesZerocopy ( const BYTE ** ppData, int iMax )
6093 {
6094 	if ( m_iBuffPos>=m_iBuffUsed )
6095 	{
6096 		UpdateCache ();
6097 		if ( m_iBuffPos>=m_iBuffUsed )
6098 			return 0; // unexpected io failure
6099 	}
6100 
6101 	int iChunk = Min ( m_iBuffUsed-m_iBuffPos, iMax );
6102 	*ppData = m_pBuff + m_iBuffPos;
6103 	m_iBuffPos += iChunk;
6104 	return iChunk;
6105 }
6106 
6107 
GetLine(char * sBuffer,int iMaxLen)6108 int CSphReader::GetLine ( char * sBuffer, int iMaxLen )
6109 {
6110 	int iOutPos = 0;
6111 	iMaxLen--; // reserve space for trailing '\0'
6112 
6113 	// grab as many chars as we can
6114 	while ( iOutPos<iMaxLen )
6115 	{
6116 		// read next chunk if necessary
6117 		if ( m_iBuffPos>=m_iBuffUsed )
6118 		{
6119 			UpdateCache ();
6120 			if ( m_iBuffPos>=m_iBuffUsed )
6121 			{
6122 				if ( iOutPos==0 ) return -1; // current line is empty; indicate eof
6123 				break; // return current line; will return eof next time
6124 			}
6125 		}
6126 
6127 		// break on CR or LF
6128 		if ( m_pBuff[m_iBuffPos]=='\r' || m_pBuff[m_iBuffPos]=='\n' )
6129 			break;
6130 
6131 		// one more valid char
6132 		sBuffer[iOutPos++] = m_pBuff[m_iBuffPos++];
6133 	}
6134 
6135 	// skip everything until the newline or eof
6136 	for ( ;; )
6137 	{
6138 		// read next chunk if necessary
6139 		if ( m_iBuffPos>=m_iBuffUsed )
6140 			UpdateCache ();
6141 
6142 		// eof?
6143 		if ( m_iBuffPos>=m_iBuffUsed )
6144 			break;
6145 
6146 		// newline?
6147 		if ( m_pBuff[m_iBuffPos++]=='\n' )
6148 			break;
6149 	}
6150 
6151 	// finalize
6152 	sBuffer[iOutPos] = '\0';
6153 	return iOutPos;
6154 }
6155 
6156 /////////////////////////////////////////////////////////////////////////////
6157 
// SPH_UNZIP_IMPL expands to the complete body of a variable-length integer decoder
// _type is the type of the accumulated result
// _getexpr is an expression that yields the next input byte every time it is evaluated
//
// every input byte contributes its low 7 bits, with the earlier bytes ending up
// in the higher bits of the result; a byte with bit 7 set means that more bytes follow
//
// the PARANOID flavor also counts the consumed bytes, and asserts on that count
#if PARANOID

#define SPH_UNZIP_IMPL(_type,_getexpr) \
	register DWORD b = 0; \
	register _type v = 0; \
	int it = 0; \
	do { b = _getexpr; v = ( v<<7 ) + ( b&0x7f ); it++; } while ( b&0x80 ); \
	assert ( (it-1)*7<=sizeof(_type)*8 ); \
	return v;

#else

#define SPH_UNZIP_IMPL(_type,_getexpr) \
	register DWORD b = 0; \
	register _type v = 0; \
	do { b = _getexpr; v = ( v<<7 ) + ( b&0x7f ); } while ( b&0x80 ); \
	return v;

#endif // PARANOID
6177 
/// decode a variable-length DWORD from a memory buffer
/// pBuf is taken by reference, and is advanced past the bytes consumed
DWORD sphUnzipInt ( const BYTE * & pBuf )			{ SPH_UNZIP_IMPL ( DWORD, *pBuf++ ); }
/// decode a variable-length SphOffset_t from a memory buffer
/// pBuf is taken by reference, and is advanced past the bytes consumed
SphOffset_t sphUnzipOffset ( const BYTE * & pBuf )	{ SPH_UNZIP_IMPL ( SphOffset_t, *pBuf++ ); }
6180 
/// decode a variable-length DWORD, fetching the input bytes with GetByte()
DWORD CSphReader::UnzipInt ()			{ SPH_UNZIP_IMPL ( DWORD, GetByte() ); }
/// decode a variable-length offset, fetching the input bytes with GetByte()
/// the value is accumulated as uint64_t, and converted to SphOffset_t on return
SphOffset_t CSphReader::UnzipOffset ()	{ SPH_UNZIP_IMPL ( uint64_t, GetByte() ); }
6183 
6184 
// sphUnzipWordid is an alias for one of the in-memory decoders
// the offset flavor is picked when USE_64BIT is on, and the DWORD flavor otherwise
#if USE_64BIT
#define sphUnzipWordid sphUnzipOffset
#else
#define sphUnzipWordid sphUnzipInt
#endif
6190 
6191 /////////////////////////////////////////////////////////////////////////////
6192 
/// make this reader use the same file as rhs, positioned where rhs currently reads
/// done through SetFile() and SeekTo(), using rhs' descriptor, filename, position, and size hint
const CSphReader & CSphReader::operator = ( const CSphReader & rhs )
{
	// take over the descriptor and the filename
	SetFile ( rhs.m_iFD, rhs.m_sFilename.cstr() );

	// rhs' logical position is its buffer start offset plus the in-buffer position
	const SphOffset_t iLogicalPos = rhs.m_iPos + rhs.m_iBuffPos;
	SeekTo ( iLogicalPos, rhs.m_iSizeHint );

	return *this;
}
6199 
6200 
GetDword()6201 DWORD CSphReader::GetDword ()
6202 {
6203 	DWORD uRes = 0;
6204 	GetBytes ( &uRes, sizeof(DWORD) );
6205 	return uRes;
6206 }
6207 
6208 
GetOffset()6209 SphOffset_t CSphReader::GetOffset ()
6210 {
6211 	SphOffset_t uRes = 0;
6212 	GetBytes ( &uRes, sizeof(SphOffset_t) );
6213 	return uRes;
6214 }
6215 
6216 
GetString()6217 CSphString CSphReader::GetString ()
6218 {
6219 	CSphString sRes;
6220 
6221 	DWORD iLen = GetDword ();
6222 	if ( iLen )
6223 	{
6224 		char * sBuf = new char [ iLen ];
6225 		GetBytes ( sBuf, iLen );
6226 		sRes.SetBinary ( sBuf, iLen );
6227 		SafeDeleteArray ( sBuf );
6228 	}
6229 
6230 	return sRes;
6231 }
6232 
6233 //////////////////////////////////////////////////////////////////////////
6234 
/// close the file descriptor on destruction (see Close())
CSphAutoreader::~CSphAutoreader ()
{
	Close ();
}
6239 
6240 
Open(const CSphString & sFilename,CSphString & sError)6241 bool CSphAutoreader::Open ( const CSphString & sFilename, CSphString & sError )
6242 {
6243 	assert ( m_iFD<0 );
6244 	assert ( !sFilename.IsEmpty() );
6245 
6246 	m_iFD = ::open ( sFilename.cstr(), SPH_O_READ, 0644 );
6247 	m_iPos = 0;
6248 	m_iBuffPos = 0;
6249 	m_iBuffUsed = 0;
6250 	m_sFilename = sFilename;
6251 
6252 	if ( m_iFD<0 )
6253 		sError.SetSprintf ( "failed to open %s: %s", sFilename.cstr(), strerror(errno) );
6254 	return ( m_iFD>=0 );
6255 }
6256 
6257 
Close()6258 void CSphAutoreader::Close ()
6259 {
6260 	if ( m_iFD>=0 )
6261 		::close ( m_iFD	);
6262 	m_iFD = -1;
6263 }
6264 
6265 
GetFilesize()6266 SphOffset_t CSphAutoreader::GetFilesize ()
6267 {
6268 	assert ( m_iFD>=0 );
6269 
6270 	struct_stat st;
6271 	if ( m_iFD<0 || fstat ( m_iFD, &st )<0 )
6272 		return -1;
6273 
6274 	return st.st_size;
6275 }
6276 
6277 /////////////////////////////////////////////////////////////////////////////
6278 // QUERY RESULT
6279 /////////////////////////////////////////////////////////////////////////////
6280 
CSphQueryResult()6281 CSphQueryResult::CSphQueryResult ()
6282 	: m_tSchema ( "query_result" )
6283 {
6284 	m_iQueryTime = 0;
6285 	m_iCpuTime = 0;
6286 	m_iMultiplier = 1;
6287 	m_iTotalMatches = 0;
6288 	m_pMva = NULL;
6289 	m_pStrings = NULL;
6290 	m_iOffset = 0;
6291 	m_iCount = 0;
6292 	m_iSuccesses = 0;
6293 }
6294 
6295 
~CSphQueryResult()6296 CSphQueryResult::~CSphQueryResult ()
6297 {
6298 	ARRAY_FOREACH ( i, m_dStorage2Free )
6299 	{
6300 		SafeDeleteArray ( m_dStorage2Free[i] );
6301 	}
6302 }
6303 
LeakStorages(CSphQueryResult & tDst)6304 void CSphQueryResult::LeakStorages ( CSphQueryResult & tDst )
6305 {
6306 	ARRAY_FOREACH ( i, m_dStorage2Free )
6307 		tDst.m_dStorage2Free.Add ( m_dStorage2Free[i] );
6308 
6309 	m_dStorage2Free.Reset();
6310 }
6311 
6312 
6313 /////////////////////////////////////////////////////////////////////////////
6314 // CHUNK READER
6315 /////////////////////////////////////////////////////////////////////////////
6316 
CSphBin(ESphHitless eMode,bool bWordDict)6317 CSphBin::CSphBin ( ESphHitless eMode, bool bWordDict )
6318 	: m_eMode ( eMode )
6319 	, m_dBuffer ( NULL )
6320 	, m_pCurrent ( NULL )
6321 	, m_iLeft ( 0 )
6322 	, m_iDone ( 0 )
6323 	, m_eState ( BIN_POS )
6324 	, m_bWordDict ( bWordDict )
6325 	, m_bError ( false )
6326 	, m_iFile ( -1 )
6327 	, m_pFilePos ( NULL )
6328 	, m_iFilePos ( 0 )
6329 	, m_iFileLeft ( 0 )
6330 {
6331 	m_tHit.m_sKeyword = bWordDict ? m_sKeyword : NULL;
6332 	m_sKeyword[0] = '\0';
6333 
6334 #ifndef NDEBUG
6335 	m_iLastWordID = 0;
6336 	m_sLastKeyword[0] = '\0';
6337 #endif
6338 }
6339 
6340 
CalcBinSize(int iMemoryLimit,int iBlocks,const char * sPhase,bool bWarn)6341 int CSphBin::CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn )
6342 {
6343 	if ( iBlocks<=0 )
6344 		return CSphBin::MIN_SIZE;
6345 
6346 	int iBinSize = ( ( iMemoryLimit/iBlocks + 2048 ) >> 12 ) << 12; // round to 4k
6347 
6348 	if ( iBinSize<CSphBin::MIN_SIZE )
6349 	{
6350 		iBinSize = CSphBin::MIN_SIZE;
6351 		sphWarn ( "%s: mem_limit=%d kb extremely low, increasing to %d kb",
6352 			sPhase, iMemoryLimit/1024, iBinSize*iBlocks/1024 );
6353 	}
6354 
6355 	if ( iBinSize<CSphBin::WARN_SIZE && bWarn )
6356 	{
6357 		sphWarn ( "%s: merge_block_size=%d kb too low, increasing mem_limit may improve performance",
6358 			sPhase, iBinSize/1024 );
6359 	}
6360 
6361 	return iBinSize;
6362 }
6363 
6364 
Init(int iFD,SphOffset_t * pSharedOffset,const int iBinSize)6365 void CSphBin::Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize )
6366 {
6367 	assert ( !m_dBuffer );
6368 	assert ( iBinSize>=MIN_SIZE );
6369 	assert ( pSharedOffset );
6370 
6371 	m_iFile = iFD;
6372 	m_pFilePos = pSharedOffset;
6373 
6374 	m_iSize = iBinSize;
6375 	m_dBuffer = new BYTE [ iBinSize ];
6376 	m_pCurrent = m_dBuffer;
6377 
6378 	m_tHit.m_iDocID = 0;
6379 	m_tHit.m_iWordID = 0;
6380 	m_tHit.m_iWordPos = EMPTY_HIT;
6381 	m_tHit.m_dFieldMask.Unset();
6382 
6383 	m_bError = false;
6384 }
6385 
6386 
/// free the read buffer allocated in Init(), if any
CSphBin::~CSphBin ()
{
	SafeDeleteArray ( m_dBuffer );
}
6391 
6392 
ReadByte()6393 int CSphBin::ReadByte ()
6394 {
6395 	BYTE r;
6396 
6397 	if ( !m_iLeft )
6398 	{
6399 		PROFILE ( read_hits );
6400 		if ( *m_pFilePos!=m_iFilePos )
6401 		{
6402 			sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
6403 			*m_pFilePos = m_iFilePos;
6404 		}
6405 
6406 		int n = m_iFileLeft > m_iSize
6407 			? m_iSize
6408 			: (int)m_iFileLeft;
6409 		if ( n==0 )
6410 		{
6411 			m_iDone = 1;
6412 			m_iLeft = 1;
6413 		} else
6414 		{
6415 			assert ( m_dBuffer );
6416 
6417 			if ( sphReadThrottled ( m_iFile, m_dBuffer, n )!=(size_t)n )
6418 			{
6419 				m_bError = true;
6420 				return -2;
6421 			}
6422 			m_iLeft = n;
6423 
6424 			m_iFilePos += n;
6425 			m_iFileLeft -= n;
6426 			m_pCurrent = m_dBuffer;
6427 			*m_pFilePos += n;
6428 		}
6429 	}
6430 	if ( m_iDone )
6431 	{
6432 		m_bError = true; // unexpected (!) eof
6433 		return -1;
6434 	}
6435 
6436 	m_iLeft--;
6437 	r = *(m_pCurrent);
6438 	m_pCurrent++;
6439 	return r;
6440 }
6441 
6442 
ReadBytes(void * pDest,int iBytes)6443 ESphBinRead CSphBin::ReadBytes ( void * pDest, int iBytes )
6444 {
6445 	assert ( iBytes>0 );
6446 	assert ( iBytes<=m_iSize );
6447 
6448 	if ( m_iDone )
6449 		return BIN_READ_EOF;
6450 
6451 	if ( m_iLeft<iBytes )
6452 	{
6453 		if ( *m_pFilePos!=m_iFilePos )
6454 		{
6455 			sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
6456 			*m_pFilePos = m_iFilePos;
6457 		}
6458 
6459 		int n = Min ( m_iFileLeft, m_iSize - m_iLeft );
6460 		if ( n==0 )
6461 		{
6462 			m_iDone = 1;
6463 			m_bError = true; // unexpected (!) eof
6464 			return BIN_READ_EOF;
6465 		}
6466 
6467 		assert ( m_dBuffer );
6468 		memmove ( m_dBuffer, m_pCurrent, m_iLeft );
6469 
6470 		if ( sphReadThrottled ( m_iFile, m_dBuffer + m_iLeft, n )!=(size_t)n )
6471 		{
6472 			m_bError = true;
6473 			return BIN_READ_ERROR;
6474 		}
6475 
6476 		m_iLeft += n;
6477 		m_iFilePos += n;
6478 		m_iFileLeft -= n;
6479 		m_pCurrent = m_dBuffer;
6480 		*m_pFilePos += n;
6481 	}
6482 
6483 	assert ( m_iLeft>=iBytes );
6484 	m_iLeft -= iBytes;
6485 
6486 	memcpy ( pDest, m_pCurrent, iBytes );
6487 	m_pCurrent += iBytes;
6488 
6489 	return BIN_READ_OK;
6490 }
6491 
6492 
ReadVLB()6493 SphWordID_t CSphBin::ReadVLB ()
6494 {
6495 	SphWordID_t uValue = 0;
6496 	int iByte, iOffset = 0;
6497 	do
6498 	{
6499 		if ( ( iByte = ReadByte() )<0 )
6500 			return 0;
6501 		uValue += ( ( SphWordID_t ( iByte & 0x7f ) ) << iOffset );
6502 		iOffset += 7;
6503 	}
6504 	while ( iByte & 0x80 );
6505 	return uValue;
6506 }
6507 
UnzipInt()6508 DWORD CSphBin::UnzipInt ()
6509 {
6510 	register int b = 0;
6511 	register DWORD v = 0;
6512 	do
6513 	{
6514 		b = ReadByte();
6515 		if ( b<0 )
6516 			b = 0;
6517 		v = ( v<<7 ) + ( b & 0x7f );
6518 	} while ( b & 0x80 );
6519 	return v;
6520 }
6521 
UnzipOffset()6522 SphOffset_t CSphBin::UnzipOffset ()
6523 {
6524 	register int b = 0;
6525 	register SphOffset_t v = 0;
6526 	do
6527 	{
6528 		b = ReadByte();
6529 		if ( b<0 )
6530 			b = 0;
6531 		v = ( v<<7 ) + ( b & 0x7f );
6532 	} while ( b & 0x80 );
6533 	return v;
6534 }
6535 
/// decode the next hit from the bin into pOut
///
/// the stream is a sequence of variable-length deltas, and the meaning of every delta
/// depends on the current decoder state (m_eState): keyword, document, or position
/// a zero delta closes the current level (position list, then document list, then keyword list)
///
/// iRowitems values are read into pRowitems every time a new document starts
/// end of bin is reported by setting pOut->m_iWordID to 0
/// every return path visible here returns 1
int CSphBin::ReadHit ( CSphAggregateHit * pOut, int iRowitems, CSphRowitem * pRowitems )
{
	// expected EOB
	if ( m_iDone )
	{
		pOut->m_iWordID = 0;
		return 1;
	}

	CSphAggregateHit & tHit = m_tHit; // shortcut
	for ( ;; )
	{
		// SPH_MAX_WORD_LEN is now 42 only to keep ReadVLB() below
		// technically, we can just use different functions on different paths, if ever needed
		STATIC_ASSERT ( SPH_MAX_WORD_LEN*3<=127, KEYWORD_TOO_LONG );
		SphWordID_t uDelta = ReadVLB();

		if ( uDelta )
		{
			// non-zero delta, apply it to the current level
			switch ( m_eState )
			{
				case BIN_WORD:
					if ( m_bWordDict )
					{
						// keywords mode, the delta is the keyword length, and the keyword bytes follow
#ifdef NDEBUG
						// FIXME?! move this under PARANOID or something?
						// or just introduce an assert() checked release build?
						if ( uDelta>=sizeof(m_sKeyword) )
							sphDie ( "INTERNAL ERROR: corrupted keyword length (len="UINT64_FMT", deltapos="UINT64_FMT")",
								(uint64_t)uDelta, (uint64_t)(m_iFilePos-m_iLeft) );
#else
						assert ( uDelta>0 && uDelta<sizeof(m_sKeyword)-1 );
#endif

						ReadBytes ( m_sKeyword, (int)uDelta );
						m_sKeyword[uDelta] = '\0';
						tHit.m_iWordID = sphCRC32 ( m_sKeyword ); // must be in sync with dict!

#ifndef NDEBUG
						// debug builds check that entries come ordered by wordid, then by keyword
						assert ( ( m_iLastWordID<tHit.m_iWordID )
							|| ( m_iLastWordID==tHit.m_iWordID && strcmp ( (char*)m_sLastKeyword, (char*)m_sKeyword )<0 ) );
						strncpy ( (char*)m_sLastKeyword, (char*)m_sKeyword, sizeof(m_sLastKeyword) );
#endif

					} else
					{
						// wordids mode, the delta is added to the previous wordid
						tHit.m_iWordID += uDelta;
					}
					// new word, restart the document and position counters
					tHit.m_iDocID = 0;
					tHit.m_iWordPos = EMPTY_HIT;
					tHit.m_dFieldMask.Unset();
					m_eState = BIN_DOC;
					break;

				case BIN_DOC:
					// doc id
					m_eState = BIN_POS;
					tHit.m_iDocID += uDelta;
					tHit.m_iWordPos = EMPTY_HIT;
					// per-document rowitems follow the docid delta
					for ( int i=0; i<iRowitems; i++, pRowitems++ )
						*pRowitems = (DWORD)ReadVLB(); // FIXME? check range?
					break;

				case BIN_POS:
					if ( m_eMode==SPH_HITLESS_ALL )
					{
						// a field mask follows, and then decoding goes back to the document level
						tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
						m_eState = BIN_DOC;

					} else if ( m_eMode==SPH_HITLESS_SOME )
					{
						// the lowest delta bit flags that a field mask follows
						// the remaining bits are the actual delta
						if ( uDelta & 1 )
						{
							tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
							m_eState = BIN_DOC;
						}
						uDelta >>= 1;
					}
					// the hit is complete now, emit it
					tHit.m_iWordPos += (DWORD)uDelta;
					*pOut = tHit;
					return 1;

				default:
					sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
			}
		} else
		{
			// zero delta, the current level is over; go one level up
			switch ( m_eState )
			{
				case BIN_POS:	m_eState = BIN_DOC; break;
				case BIN_DOC:	m_eState = BIN_WORD; break;
				case BIN_WORD:	m_iDone = 1; pOut->m_iWordID = 0; return 1;
				default:		sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
			}
		}
	}
}
6633 
6634 
IsEOF() const6635 bool CSphBin::IsEOF () const
6636 {
6637 	return m_iDone!=0 || m_iFileLeft<=0;
6638 }
6639 
6640 
IsDone() const6641 bool CSphBin::IsDone () const
6642 {
6643 	return m_iDone!=0 || ( m_iFileLeft<=0 && m_iLeft<=0 );
6644 }
6645 
6646 
Precache()6647 ESphBinRead CSphBin::Precache ()
6648 {
6649 	if ( m_iFileLeft > m_iSize-m_iLeft )
6650 	{
6651 		m_bError = true;
6652 		return BIN_PRECACHE_ERROR;
6653 	}
6654 
6655 	if ( !m_iFileLeft )
6656 		return BIN_PRECACHE_OK;
6657 
6658 	if ( *m_pFilePos!=m_iFilePos )
6659 	{
6660 		sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
6661 		*m_pFilePos = m_iFilePos;
6662 	}
6663 
6664 	assert ( m_dBuffer );
6665 	memmove ( m_dBuffer, m_pCurrent, m_iLeft );
6666 
6667 	if ( sphReadThrottled ( m_iFile, m_dBuffer+m_iLeft, m_iFileLeft )!=(size_t)m_iFileLeft )
6668 	{
6669 		m_bError = true;
6670 		return BIN_READ_ERROR;
6671 	}
6672 
6673 	m_iLeft += m_iFileLeft;
6674 	m_iFilePos += m_iFileLeft;
6675 	m_iFileLeft -= m_iFileLeft;
6676 	m_pCurrent = m_dBuffer;
6677 	*m_pFilePos += m_iFileLeft;
6678 
6679 	return BIN_PRECACHE_OK;
6680 }
6681 
6682 
6683 //////////////////////////////////////////////////////////////////////////
6684 // INDEX SETTINGS
6685 //////////////////////////////////////////////////////////////////////////
6686 
/// set up the defaults: no docinfo, plain hit format, no HTML stripping,
/// no hitless words, non-verbose
CSphIndexSettings::CSphIndexSettings ()
	: m_eDocinfo			( SPH_DOCINFO_NONE )
	, m_eHitFormat			( SPH_HIT_FORMAT_PLAIN )
	, m_bHtmlStrip			( false )
	, m_eHitless			( SPH_HITLESS_NONE )
	, m_bVerbose			( false )
{
}
6695 
6696 //////////////////////////////////////////////////////////////////////////
6697 // GLOBAL MVA STORAGE ARENA
6698 //////////////////////////////////////////////////////////////////////////
6699 
/// abstract visitor interface; it is what CSphArena::ExamineTag() takes
/// the implementations receive int data values through TestData()
class tTester : public ISphNoncopyable
{
public:
	/// drop whatever state the tester has accumulated
	virtual void Reset() = 0;

	/// process one data value
	virtual void TestData ( int iData ) = 0;

	virtual ~tTester() {}
};
6707 
/// shared-memory arena allocator
/// manages small tagged dword strings, upto 4096 bytes in size
///
/// allocations are identified by int indexes rather than by pointers
/// every allocation is attributed to an int tag, so that it can be freed
/// either individually by its index, or along with the rest of its tag
class CSphArena
{
public:
							CSphArena ();
							~CSphArena ();

	/// drop the current arena, if any, and allocate a new one
	DWORD *					ReInit ( int uMaxBytes );
	const char *			GetError () const { return m_sError.cstr(); }

	int						TaggedAlloc ( int iTag, int iBytes );
	void					TaggedFreeIndex ( int iTag, int iIndex );
	void					TaggedFreeTag ( int iTag );

	void					ExamineTag ( tTester* pTest, int iTag );

protected:
	static const int		MIN_BITS	= 4;	///< the smallest alloc size, in bits (ie. 1<<4 bytes)
	static const int		MAX_BITS	= 12;	///< the biggest alloc size, in bits; also the page size
	static const int		NUM_SIZES	= MAX_BITS-MIN_BITS+2;	///< one for 0 (empty pages), and one for each size from min to max

	static const int		PAGE_SIZE	= 1<<MAX_BITS;
	static const int		PAGE_ALLOCS	= 1<<( MAX_BITS-MIN_BITS);	///< max allocs per page (with the smallest alloc size)
	static const int		PAGE_BITMAP	= ( PAGE_ALLOCS+8*sizeof(DWORD)-1 )/( 8*sizeof(DWORD) );	///< usage bitmap size, in dwords

	static const int		MAX_TAGS		= 1024;
	static const int		MAX_LOGENTRIES	= 29;

	/// page descriptor
	struct PageDesc_t
	{
		int					m_iSizeBits;			///< alloc size
		int					m_iPrev;				///< prev free page of this size
		int					m_iNext;				///< next free page of this size
		int					m_iUsed;				///< usage count
		DWORD				m_uBitmap[PAGE_BITMAP];	///< usage bitmap
	};

	/// tag descriptor
	struct TagDesc_t
	{
		int					m_iTag;					///< tag value
		int					m_iAllocs;				///< active allocs
		int					m_iLogHead;				///< pointer to head allocs log entry
	};

	/// allocs log entry
	struct AllocsLogEntry_t
	{
		int					m_iUsed;					///< number of slots taken in m_dEntries
		int					m_iNext;					///< index of the next log entry, -1 if none
		int					m_dEntries[MAX_LOGENTRIES];	///< logged alloc indexes
	};
	STATIC_SIZE_ASSERT ( AllocsLogEntry_t, 124 );

protected:
	DWORD *					Init ( int uMaxBytes );
	int						RawAlloc ( int iBytes );
	void					RawFree ( int iIndex );
	void					RemoveTag ( TagDesc_t * pTag );

protected:
	CSphProcessSharedMutex	m_tProcMutex;
	CSphMutex				m_tThdMutex;

	int						m_iPages;			///< max pages count
	CSphSharedBuffer<DWORD>	m_pArena;			///< arena that stores everything (all other pointers point here)

	PageDesc_t *			m_pPages;			///< page descriptors
	int *					m_pFreelistHeads;	///< free-list heads
	int *					m_pTagCount;		///< number of tags currently stored in m_pTags
	TagDesc_t *				m_pTags;			///< tag descriptors

	DWORD *					m_pBasePtr;			///< base data storage pointer
	CSphString				m_sError;

#if ARENADEBUG
protected:
	int *					m_pTotalAllocs;
	int *					m_pTotalBytes;

public:
	void					CheckFreelists ();
#else
	inline void				CheckFreelists () {}
#endif // ARENADEBUG
};
6796 
6797 class tDocCollector : public tTester
6798 {
6799 	CSphVector<SphDocID_t> * m_dCollection;
6800 public:
tDocCollector(CSphVector<SphDocID_t> & dCollection)6801 	explicit tDocCollector ( CSphVector<SphDocID_t> & dCollection )
6802 		: m_dCollection ( &dCollection )
6803 	{}
Reset()6804 	virtual void Reset()
6805 	{
6806 		m_dCollection->Reset();
6807 	}
TestData(int iData)6808 	virtual void TestData ( int iData )
6809 	{
6810 		if ( !g_pMvaArena )
6811 			return;
6812 
6813 		m_dCollection->Add ( *(SphDocID_t*)(g_pMvaArena + iData) );
6814 	}
6815 };
6816 
6817 //////////////////////////////////////////////////////////////////////////
/// create an uninitialized arena
/// zero pages count marks it as such, and the tagged calls bail out on it; memory gets allocated in Init()
CSphArena::CSphArena ()
	: m_iPages ( 0 )
{
	m_tThdMutex.Init();
}
6823 
6824 
/// reset the global arena pointer, and shut the thread mutex down
CSphArena::~CSphArena ()
{
	// notify callers that arena no longer exists
	g_pMvaArena = NULL;
	m_tThdMutex.Done();
}
6831 
ReInit(int uMaxBytes)6832 DWORD * CSphArena::ReInit ( int uMaxBytes )
6833 {
6834 	if ( m_iPages!=0 )
6835 	{
6836 		m_pArena.Reset();
6837 		m_iPages = 0;
6838 	}
6839 	return Init ( uMaxBytes );
6840 }
6841 
/// allocate the arena, and set up its internal structures
///
/// uMaxBytes is the requested data size; it gets rounded up to a whole number of pages
/// the internal structures are placed in front of the data, in this order:
/// page descriptors, free-list heads, tag count, tag descriptors, (debug counters, if enabled)
///
/// returns the base data pointer on success
/// returns NULL on failure; pages count is reset to 0, and the error message is set
DWORD * CSphArena::Init ( int uMaxBytes )
{
	m_iPages = ( uMaxBytes+PAGE_SIZE-1 ) / PAGE_SIZE;

	int iData = m_iPages*PAGE_SIZE; // data size, bytes
	int iMyTaglist = sizeof(int) + MAX_TAGS*sizeof(TagDesc_t); // int length, TagDesc_t[] tags; NOLINT
	int iMy = m_iPages*sizeof(PageDesc_t) + NUM_SIZES*sizeof(int) + iMyTaglist; // my internal structures size, bytes; NOLINT
#if ARENADEBUG
	iMy += 2*sizeof(int); // debugging counters; NOLINT
#endif

	// the arena is allocated and addressed in dwords
	assert ( iData%sizeof(DWORD)==0 );
	assert ( iMy%sizeof(DWORD)==0 );

	CSphString sError, sWarning;
	if ( m_tProcMutex.GetError() || !m_pArena.Alloc ( (iData+iMy)/sizeof(DWORD), sError, sWarning ) )
	{
		// either the process mutex is broken, or the alloc failed; report whichever it was
		m_iPages = 0;
		if ( m_tProcMutex.GetError() )
			m_sError = m_tProcMutex.GetError();
		else
			m_sError.SetSprintf ( "alloc, error='%s', warning='%s'", sError.cstr(), sWarning.cstr() );
		return NULL;
	}

	// setup internal pointers
	DWORD * pCur = m_pArena.GetWritePtr();

	m_pPages = (PageDesc_t*) pCur;
	pCur += sizeof(PageDesc_t)*m_iPages/sizeof(DWORD);

	m_pFreelistHeads = (int*) pCur;
	pCur += NUM_SIZES; // one for each size, and one extra for zero

	m_pTagCount = (int*) pCur++;
	m_pTags = (TagDesc_t*) pCur;
	pCur += sizeof(TagDesc_t)*MAX_TAGS/sizeof(DWORD);

#if ARENADEBUG
	m_pTotalAllocs = (int*) pCur++;
	m_pTotalBytes = (int*) pCur++;
	*m_pTotalAllocs = 0;
	*m_pTotalBytes = 0;
#endif

	// data starts right after the internal structures
	m_pBasePtr = m_pArena.GetWritePtr() + iMy/sizeof(DWORD);
	assert ( m_pBasePtr==pCur );

	// setup initial state
	// all the pages are empty, and chained into a single list in index order
	for ( int i=0; i<m_iPages; i++ )
	{
		m_pPages[i].m_iSizeBits = 0; // fully empty
		m_pPages[i].m_iPrev = ( i>0 ) ? i-1 : -1;
		m_pPages[i].m_iNext = ( i<m_iPages-1 ) ? i+1 : -1;
	}

	// slot 0 heads the list of empty pages; the per-size lists are empty
	m_pFreelistHeads[0] = 0;
	for ( int i=1; i<NUM_SIZES; i++ )
		m_pFreelistHeads[i] = -1;

	*m_pTagCount = 0;

	return m_pBasePtr;
}
6906 
6907 
RawAlloc(int iBytes)6908 int CSphArena::RawAlloc ( int iBytes )
6909 {
6910 	CheckFreelists ();
6911 
6912 	if ( iBytes<=0 || iBytes>( ( 1 << MAX_BITS ) - (int)sizeof(int) ) )
6913 		return -1;
6914 
6915 	int iSizeBits = sphLog2 ( iBytes+2*sizeof(int)-1 ); // always reserve sizeof(int) for the tag and AllocsLogEntry_t backtrack; NOLINT
6916 	iSizeBits = Max ( iSizeBits, MIN_BITS );
6917 	assert ( iSizeBits>=MIN_BITS && iSizeBits<=MAX_BITS );
6918 
6919 	int iSizeSlot = iSizeBits-MIN_BITS+1;
6920 	assert ( iSizeSlot>=1 && iSizeSlot<NUM_SIZES );
6921 
6922 	// get semi-free page for this size
6923 	PageDesc_t * pPage = NULL;
6924 	if ( m_pFreelistHeads[iSizeSlot]>=0 )
6925 	{
6926 		// got something in the free-list
6927 		pPage = m_pPages + m_pFreelistHeads[iSizeSlot];
6928 
6929 	} else
6930 	{
6931 		// nothing in free-list, alloc next empty one
6932 		if ( m_pFreelistHeads[0]<0 )
6933 			return -1; // out of memory
6934 
6935 		// update the page
6936 		pPage = m_pPages + m_pFreelistHeads[0];
6937 		assert ( pPage->m_iPrev==-1 );
6938 
6939 		m_pFreelistHeads[iSizeSlot] = m_pFreelistHeads[0];
6940 		m_pFreelistHeads[0] = pPage->m_iNext;
6941 		if ( pPage->m_iNext>=0 )
6942 			m_pPages[pPage->m_iNext].m_iPrev = -1;
6943 
6944 		pPage->m_iSizeBits = iSizeBits;
6945 		pPage->m_iUsed = 0;
6946 		pPage->m_iNext = -1;
6947 
6948 		CheckFreelists ();
6949 
6950 		// setup bitmap
6951 		int iUsedBits = ( 1<<(MAX_BITS-iSizeBits) ); // max-used-bits = page-size/alloc-size = ( 1<<page-bitsize )/( 1<<alloc-bitsize )
6952 		assert ( iUsedBits>0 && iUsedBits<=(PAGE_BITMAP<<5) );
6953 
6954 		for ( int i=0; i<PAGE_BITMAP; i++ )
6955 			pPage->m_uBitmap[i] = ( ( i<<5 )>=iUsedBits ) ? 0xffffffffUL : 0;
6956 
6957 		if ( iUsedBits<32 )
6958 			pPage->m_uBitmap[0] = ( 0xffffffffUL<<iUsedBits );
6959 	}
6960 
6961 	// get free alloc slot and use it
6962 	assert ( pPage );
6963 	assert ( pPage->m_iSizeBits==iSizeBits );
6964 
6965 	for ( int i=0; i<PAGE_BITMAP; i++ ) // FIXME! optimize, can scan less
6966 	{
6967 		if ( pPage->m_uBitmap[i]==0xffffffffUL )
6968 			continue;
6969 
6970 		int iFree = FindBit ( pPage->m_uBitmap[i] );
6971 		pPage->m_uBitmap[i] |= ( 1<<iFree );
6972 
6973 		pPage->m_iUsed++;
6974 		if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits ) )
6975 		{
6976 			// this page is full now, unchain from the free-list
6977 			assert ( m_pFreelistHeads[iSizeSlot]==pPage-m_pPages );
6978 			m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
6979 			if ( pPage->m_iNext>=0 )
6980 			{
6981 				assert ( m_pPages[pPage->m_iNext].m_iPrev==pPage-m_pPages );
6982 				m_pPages[pPage->m_iNext].m_iPrev = -1;
6983 			}
6984 			pPage->m_iNext = -1;
6985 		}
6986 
6987 #if ARENADEBUG
6988 		(*m_pTotalAllocs)++;
6989 		(*m_pTotalBytes) += ( 1<<iSizeBits );
6990 #endif
6991 
6992 		CheckFreelists ();
6993 
6994 		int iOffset = ( pPage-m_pPages )*PAGE_SIZE + ( i*32+iFree )*( 1<<iSizeBits ); // raw internal byte offset (FIXME! optimize with shifts?)
6995 		int iIndex = 2 + ( iOffset/sizeof(DWORD) ); // dword index with tag and backtrack fixup
6996 
6997 		m_pBasePtr[iIndex-1] = DWORD(-1); // untagged by default
6998 		m_pBasePtr[iIndex-2] = DWORD(-1); // backtrack nothere
6999 		return iIndex;
7000 	}
7001 
7002 	assert ( 0 && "internal error, no free slots in free page" );
7003 	return -1;
7004 }
7005 
7006 
RawFree(int iIndex)7007 void CSphArena::RawFree ( int iIndex )
7008 {
7009 	CheckFreelists ();
7010 
7011 	int iOffset = (iIndex-2)*sizeof(DWORD); // remove tag fixup, and go to raw internal byte offset
7012 	int iPage = iOffset / PAGE_SIZE;
7013 
7014 	if ( iPage<0 || iPage>m_iPages )
7015 	{
7016 		assert ( 0 && "internal error, freed index out of arena" );
7017 		return;
7018 	}
7019 
7020 	PageDesc_t * pPage = m_pPages + iPage;
7021 	int iBit = ( iOffset % PAGE_SIZE ) >> pPage->m_iSizeBits;
7022 	assert ( ( iOffset % PAGE_SIZE )==( iBit << pPage->m_iSizeBits ) && "internal error, freed offset is unaligned" );
7023 
7024 	if (!( pPage->m_uBitmap[iBit>>5] & ( 1UL<<(iBit & 31) ) ))
7025 	{
7026 		assert ( 0 && "internal error, freed index already freed" );
7027 		return;
7028 	}
7029 
7030 	pPage->m_uBitmap[iBit>>5] &= ~( 1UL << ( iBit & 31 ) );
7031 	pPage->m_iUsed--;
7032 
7033 #if ARENADEBUG
7034 	(*m_pTotalAllocs)--;
7035 	(*m_pTotalBytes) -= ( 1<<pPage->m_iSizeBits );
7036 #endif
7037 
7038 	CheckFreelists ();
7039 
7040 	int iSizeSlot = pPage->m_iSizeBits-MIN_BITS+1;
7041 
7042 	if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits )-1 )
7043 	{
7044 		// this page was full, but it's semi-free now
7045 		// chain to free-list
7046 		assert ( pPage->m_iPrev==-1 ); // full pages must not be in any list
7047 		assert ( pPage->m_iNext==-1 );
7048 
7049 		pPage->m_iNext = m_pFreelistHeads[iSizeSlot];
7050 		if ( pPage->m_iNext>=0 )
7051 		{
7052 			assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
7053 			assert ( m_pPages[pPage->m_iNext].m_iSizeBits==pPage->m_iSizeBits );
7054 			m_pPages[pPage->m_iNext].m_iPrev = iPage;
7055 		}
7056 		m_pFreelistHeads[iSizeSlot] = iPage;
7057 	}
7058 
7059 	if ( pPage->m_iUsed==0 )
7060 	{
7061 		// this page is empty now
7062 		// unchain from free-list
7063 		if ( pPage->m_iPrev>=0 )
7064 		{
7065 			// non-head page
7066 			assert ( m_pPages[pPage->m_iPrev].m_iNext==iPage );
7067 			m_pPages[pPage->m_iPrev].m_iNext = pPage->m_iNext;
7068 
7069 			if ( pPage->m_iNext>=0 )
7070 			{
7071 				assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
7072 				m_pPages[pPage->m_iNext].m_iPrev = pPage->m_iPrev;
7073 			}
7074 
7075 		} else
7076 		{
7077 			// head page
7078 			assert ( m_pFreelistHeads[iSizeSlot]==iPage );
7079 			assert ( pPage->m_iPrev==-1 );
7080 
7081 			if ( pPage->m_iNext>=0 )
7082 			{
7083 				assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
7084 				m_pPages[pPage->m_iNext].m_iPrev = -1;
7085 			}
7086 			m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
7087 		}
7088 
7089 		pPage->m_iSizeBits = 0;
7090 		pPage->m_iPrev = -1;
7091 		pPage->m_iNext = m_pFreelistHeads[0];
7092 		if ( pPage->m_iNext>=0 )
7093 		{
7094 			assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
7095 			assert ( m_pPages[pPage->m_iNext].m_iSizeBits==0 );
7096 			m_pPages[pPage->m_iNext].m_iPrev = iPage;
7097 		}
7098 		m_pFreelistHeads[0] = iPage;
7099 	}
7100 
7101 	CheckFreelists ();
7102 }
7103 
7104 
/// allocate iBytes bytes, and attribute the allocation to iTag
///
/// an unknown tag gets registered on the fly, along with its first allocs log entry
/// every allocation is recorded in the head log entry of its tag,
/// and a new head entry is allocated and chained in once the current one is full
///
/// returns the dword index of the allocation (as RawAlloc() does)
/// returns -1 if the arena is uninitialized, out of tags, or out of memory
int CSphArena::TaggedAlloc ( int iTag, int iBytes )
{
	if ( !m_iPages )
		return -1; // uninitialized

	assert ( iTag>=0 );
	// both locks are held until return
	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );

	// find that tag first
	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
	if ( !pTag )
	{
		if ( *m_pTagCount==MAX_TAGS )
			return -1; // out of tags

		// allocate the log entry first, so that the new tag is only added if that succeeds
		int iLogHead = RawAlloc ( sizeof(AllocsLogEntry_t) );
		if ( iLogHead<0 )
			return -1; // out of memory

		assert ( iLogHead>=2 );
		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLogHead );
		pLog->m_iUsed = 0;
		pLog->m_iNext = -1;

		// add new tag
		pTag = m_pTags + (*m_pTagCount)++;
		pTag->m_iTag = iTag;
		pTag->m_iAllocs = 0;
		pTag->m_iLogHead = iLogHead;

		// re-sort
		// OPTIMIZE! full-blown sort is overkill here
		sphSort ( m_pTags, *m_pTagCount, sphMemberLess ( &TagDesc_t::m_iTag ) );

		// we must be able to find it now
		// (the sort could have moved the descriptor, so the pointer has to be refreshed anyway)
		pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
		assert ( pTag && "internal error, fresh tag not found in TaggedAlloc()" );

		if ( !pTag )
			return -1; // internal error
	}

	// grow the log if needed
	int iLogEntry = pTag->m_iLogHead;
	AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + pTag->m_iLogHead );
	if ( pLog->m_iUsed==MAX_LOGENTRIES )
	{
		int iNewEntry = RawAlloc ( sizeof(AllocsLogEntry_t) );
		if ( iNewEntry<0 )
			return -1; // out of memory

		// the new entry becomes the head, and points to the old head
		assert ( iNewEntry>=2 );
		iLogEntry = iNewEntry;
		AllocsLogEntry_t * pNew = (AllocsLogEntry_t*) ( m_pBasePtr + iNewEntry );
		pNew->m_iUsed = 0;
		pNew->m_iNext = pTag->m_iLogHead;
		pTag->m_iLogHead = iNewEntry;
		pLog = pNew;
	}

	// do the alloc itself
	// NOTE, if this fails, a tag or a log entry that was just created above stays in place
	int iIndex = RawAlloc ( iBytes );
	if ( iIndex<0 )
		return -1; // out of memory

	assert ( iIndex>=2 );
	// tag it
	m_pBasePtr[iIndex-1] = iTag;
	// set data->AllocsLogEntry_t backtrack
	m_pBasePtr[iIndex-2] = iLogEntry;

	// log it
	assert ( pLog->m_iUsed<MAX_LOGENTRIES );
	pLog->m_dEntries [ pLog->m_iUsed++ ] = iIndex;
	pTag->m_iAllocs++;

	// and we're done
	return iIndex;
}
7185 
7186 
TaggedFreeIndex(int iTag,int iIndex)7187 void CSphArena::TaggedFreeIndex ( int iTag, int iIndex )
7188 {
7189 	if ( !m_iPages )
7190 		return; // uninitialized
7191 
7192 	assert ( iTag>=0 );
7193 	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
7194 	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
7195 
7196 	// find that tag
7197 	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
7198 	assert ( pTag && "internal error, unknown tag in TaggedFreeIndex()" );
7199 	assert ( m_pBasePtr[iIndex-1]==DWORD(iTag) && "internal error, tag mismatch in TaggedFreeIndex()" );
7200 
7201 	// defence against internal errors
7202 	if ( !pTag )
7203 		return;
7204 
7205 	// untag it
7206 	m_pBasePtr[iIndex-1] = DWORD(-1);
7207 
7208 	// free it
7209 	RawFree ( iIndex );
7210 
7211 	// update AllocsLogEntry_t
7212 	int iLogEntry = m_pBasePtr[iIndex-2];
7213 	assert ( iLogEntry>=2 );
7214 	m_pBasePtr[iIndex-2] = DWORD(-1);
7215 	AllocsLogEntry_t * pLogEntry = (AllocsLogEntry_t*) ( m_pBasePtr + iLogEntry );
7216 	for ( int i = 0; i<MAX_LOGENTRIES; i++ )
7217 	{
7218 		if ( pLogEntry->m_dEntries[i]!=iIndex )
7219 			continue;
7220 
7221 		pLogEntry->m_dEntries[i] = pLogEntry->m_dEntries[pLogEntry->m_iUsed-1]; // RemoveFast
7222 		pLogEntry->m_iUsed--;
7223 		break;
7224 	}
7225 	assert ( pLogEntry->m_iUsed>=0 );
7226 
7227 	// remove from tag entries list
7228 	if ( pLogEntry->m_iUsed==0 )
7229 	{
7230 		if ( pTag->m_iLogHead==iLogEntry )
7231 		{
7232 			pTag->m_iLogHead = pLogEntry->m_iNext;
7233 		} else
7234 		{
7235 			int iLog = pTag->m_iLogHead;
7236 			while ( iLog>=0 )
7237 			{
7238 				AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
7239 				if ( iLogEntry!=pLog->m_iNext )
7240 				{
7241 					iLog = pLog->m_iNext;
7242 					continue;
7243 				} else
7244 				{
7245 					pLog->m_iNext = pLogEntry->m_iNext;
7246 					break;
7247 				}
7248 			}
7249 		}
7250 		RawFree ( iLogEntry );
7251 	}
7252 
7253 	// update the tag descriptor
7254 	pTag->m_iAllocs--;
7255 	assert ( pTag->m_iAllocs>=0 );
7256 
7257 	// remove the descriptor if its empty now
7258 	if ( pTag->m_iAllocs==0 )
7259 		RemoveTag ( pTag );
7260 }
7261 
7262 
TaggedFreeTag(int iTag)7263 void CSphArena::TaggedFreeTag ( int iTag )
7264 {
7265 	if ( !m_iPages )
7266 		return; // uninitialized
7267 
7268 	assert ( iTag>=0 );
7269 	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
7270 	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
7271 
7272 	// find that tag
7273 	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
7274 	if ( !pTag )
7275 		return;
7276 
7277 	// walk the log and free it
7278 	int iLog = pTag->m_iLogHead;
7279 	while ( iLog>=0 )
7280 	{
7281 		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
7282 		iLog = pLog->m_iNext;
7283 
7284 		// free each alloc if tag still matches
7285 		for ( int i=0; i<pLog->m_iUsed; i++ )
7286 		{
7287 			int iIndex = pLog->m_dEntries[i];
7288 			if ( m_pBasePtr[iIndex-1]==DWORD(iTag) )
7289 			{
7290 				m_pBasePtr[iIndex-1] = DWORD(-1); // avoid double free
7291 				RawFree ( iIndex );
7292 				pTag->m_iAllocs--;
7293 			}
7294 		}
7295 	}
7296 
7297 	// check for mismatches
7298 	assert ( pTag->m_iAllocs==0 );
7299 
7300 	// remove the descriptor
7301 	RemoveTag ( pTag );
7302 }
7303 
ExamineTag(tTester * pTest,int iTag)7304 void CSphArena::ExamineTag ( tTester* pTest, int iTag )
7305 {
7306 	if ( !pTest )
7307 		return;
7308 
7309 	pTest->Reset();
7310 
7311 	if ( !m_iPages )
7312 		return; // uninitialized
7313 
7314 	assert ( iTag>=0 );
7315 	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );
7316 	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
7317 
7318 	// find that tag
7319 	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
7320 	if ( !pTag )
7321 		return;
7322 
7323 	// walk the log and tick it's chunks
7324 	int iLog = pTag->m_iLogHead;
7325 	while ( iLog>=0 )
7326 	{
7327 		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
7328 		iLog = pLog->m_iNext;
7329 
7330 		// tick each alloc
7331 		for ( int i=0; i<pLog->m_iUsed; i++ )
7332 			pTest->TestData ( pLog->m_dEntries[i] );
7333 	}
7334 }
7335 
RemoveTag(TagDesc_t * pTag)7336 void CSphArena::RemoveTag ( TagDesc_t * pTag )
7337 {
7338 	assert ( pTag );
7339 	assert ( pTag->m_iAllocs==0 );
7340 
7341 	// dealloc log chain
7342 	int iLog = pTag->m_iLogHead;
7343 	while ( iLog>=0 )
7344 	{
7345 		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
7346 		int iNext = pLog->m_iNext;
7347 
7348 		RawFree ( iLog );
7349 		iLog = iNext;
7350 	}
7351 
7352 	// remove tag from the list
7353 	int iTail = m_pTags + (*m_pTagCount) - pTag - 1;
7354 	memmove ( pTag, pTag+1, iTail*sizeof(TagDesc_t) );
7355 	(*m_pTagCount)--;
7356 }
7357 
7358 
#if ARENADEBUG
// Debug-only sanity check: the page at the head of every freelist (if any)
// must have the size bits that match the slot it is listed under.
void CSphArena::CheckFreelists ()
{
	// slot 0 holds pages with zero size bits
	assert ( m_pFreelistHeads[0]==-1 || m_pPages[m_pFreelistHeads[0]].m_iSizeBits==0 );

	// slot N (N>=1) holds pages with size bits MIN_BITS+N-1
	int iSlot = 1;
	while ( iSlot<NUM_SIZES )
	{
		assert ( m_pFreelistHeads[iSlot]==-1 || m_pPages[m_pFreelistHeads[iSlot]].m_iSizeBits-MIN_BITS+1==iSlot );
		iSlot++;
	}
}
#endif // ARENADEBUG
7367 
7368 //////////////////////////////////////////////////////////////////////////
7369 
7370 static CSphArena g_MvaArena; // global mega-arena
7371 
sphArenaInit(int iMaxBytes)7372 const char * sphArenaInit ( int iMaxBytes )
7373 {
7374 	if ( !g_pMvaArena )
7375 		g_pMvaArena = g_MvaArena.ReInit ( iMaxBytes );
7376 
7377 	const char * sError = g_MvaArena.GetError();
7378 	return sError;
7379 }
7380 
7381 /////////////////////////////////////////////////////////////////////////////
7382 // INDEX
7383 /////////////////////////////////////////////////////////////////////////////
7384 
// Construct an index object with default settings.
// sIndexName is stored as the index name; sFilename is passed on to the
// schema constructor. Tokenizer and dictionary start out unset (NULL).
CSphIndex::CSphIndex ( const char * sIndexName, const char * sFilename )
	: m_iTID ( 0 )
	, m_bExpandKeywords ( false )
	, m_iExpansionLimit ( 0 )
	, m_pProgress ( NULL )
	, m_tSchema ( sFilename )
	, m_bInplaceSettings ( false ) // flipped by SetInplaceSettings()
	, m_iHitGap ( 0 )
	, m_iDocinfoGap ( 0 )
	, m_fRelocFactor ( 0.0f )
	, m_fWriteFactor ( 0.0f )
	, m_bKeepFilesOpen ( false )
	, m_bPreloadWordlist ( true )
	, m_bBinlog ( true )
	, m_bStripperInited ( true )
	, m_bEnableStar ( false )
	, m_bId32to64 ( false )
	, m_pTokenizer ( NULL )
	, m_pDict ( NULL )
	, m_iMaxCachedDocs ( 0 )
	, m_iMaxCachedHits ( 0 )
	, m_sIndexName ( sIndexName )
{
}
7409 
7410 
// Delete the tokenizer and the dictionary currently held by the index.
// Callers that want to keep either object must take it out beforehand
// with LeakTokenizer() / LeakDictionary().
CSphIndex::~CSphIndex ()
{
	SafeDelete ( m_pTokenizer );
	SafeDelete ( m_pDict );
}
7416 
7417 
SetInplaceSettings(int iHitGap,int iDocinfoGap,float fRelocFactor,float fWriteFactor)7418 void CSphIndex::SetInplaceSettings ( int iHitGap, int iDocinfoGap, float fRelocFactor, float fWriteFactor )
7419 {
7420 	m_iHitGap = iHitGap;
7421 	m_iDocinfoGap = iDocinfoGap;
7422 	m_fRelocFactor = fRelocFactor;
7423 	m_fWriteFactor = fWriteFactor;
7424 	m_bInplaceSettings = true;
7425 }
7426 
7427 
SetTokenizer(ISphTokenizer * pTokenizer)7428 void CSphIndex::SetTokenizer ( ISphTokenizer * pTokenizer )
7429 {
7430 	if ( m_pTokenizer!=pTokenizer )
7431 		SafeDelete ( m_pTokenizer );
7432 	m_pTokenizer = pTokenizer;
7433 }
7434 
7435 
LeakTokenizer()7436 ISphTokenizer *	CSphIndex::LeakTokenizer ()
7437 {
7438 	ISphTokenizer * pTokenizer = m_pTokenizer;
7439 	m_pTokenizer = NULL;
7440 	return pTokenizer;
7441 }
7442 
7443 
SetDictionary(CSphDict * pDict)7444 void CSphIndex::SetDictionary ( CSphDict * pDict )
7445 {
7446 	if ( m_pDict!=pDict )
7447 		SafeDelete ( m_pDict );
7448 
7449 	m_pDict = pDict;
7450 }
7451 
7452 
LeakDictionary()7453 CSphDict * CSphIndex::LeakDictionary ()
7454 {
7455 	CSphDict * pDict = m_pDict;
7456 	m_pDict = NULL;
7457 	return pDict;
7458 }
7459 
7460 
Setup(const CSphIndexSettings & tSettings)7461 void CSphIndex::Setup ( const CSphIndexSettings & tSettings )
7462 {
7463 	m_bStripperInited = true;
7464 	m_tSettings = tSettings;
7465 }
7466 
SetCacheSize(int iMaxCachedDocs,int iMaxCachedHits)7467 void CSphIndex::SetCacheSize ( int iMaxCachedDocs, int iMaxCachedHits )
7468 {
7469 	m_iMaxCachedDocs = iMaxCachedDocs;
7470 	m_iMaxCachedHits = iMaxCachedHits;
7471 }
7472 
7473 /////////////////////////////////////////////////////////////////////////////
7474 
sphCreateIndexPhrase(const char * szIndexName,const char * sFilename)7475 CSphIndex * sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename )
7476 {
7477 	return new CSphIndex_VLN ( szIndexName, sFilename );
7478 }
7479 
7480 
CSphIndex_VLN(const char * sIndexName,const char * sFilename)7481 CSphIndex_VLN::CSphIndex_VLN ( const char* sIndexName, const char * sFilename )
7482 	: CSphIndex ( sIndexName, sFilename )
7483 	, m_iLockFD ( -1 )
7484 {
7485 	m_sFilename = sFilename;
7486 
7487 	m_pWriteBuffer = NULL;
7488 
7489 	m_tLastHit.m_iDocID = 0;
7490 	m_tLastHit.m_iWordID = 0;
7491 	m_tLastHit.m_iWordPos = EMPTY_HIT;
7492 	m_tLastHit.m_sKeyword = m_sLastKeyword;
7493 	m_iLastHitlistPos = 0;
7494 	m_dLastDocFields.Unset();
7495 	m_uLastDocHits = 0;
7496 	m_iLastWordDocs = 0;
7497 	m_iLastWordHits = 0;
7498 
7499 	m_uDocinfo = 0;
7500 	m_uDocinfoIndex = 0;
7501 	m_pDocinfoIndex = NULL;
7502 
7503 	m_bPreallocated = false;
7504 	m_uVersion = INDEX_FORMAT_VERSION;
7505 
7506 	m_iKillListSize = 0;
7507 	m_uMinMaxIndex = 0;
7508 
7509 	m_iIndexTag = -1;
7510 	m_iMergeInfinum = 0;
7511 	m_bWordDict = false;
7512 	m_bIsEmpty = true;
7513 	m_bMerging = false;
7514 	m_tLastHit.m_sKeyword[0] = '\0';
7515 
7516 	m_pPreread = NULL;
7517 	m_pAttrsStatus = NULL;
7518 
7519 	m_pMin = new CSphMatch();
7520 }
7521 
7522 
~CSphIndex_VLN()7523 CSphIndex_VLN::~CSphIndex_VLN ()
7524 {
7525 	SafeDeleteArray ( m_pWriteBuffer );
7526 	SafeDelete ( m_pMin );
7527 
7528 #if USE_WINDOWS
7529 	if ( m_iIndexTag>=0 && g_pMvaArena )
7530 #else
7531 	if ( m_iIndexTag>=0 && g_bHeadProcess && g_pMvaArena )
7532 #endif
7533 		g_MvaArena.TaggedFreeTag ( m_iIndexTag );
7534 
7535 #if !USE_WINDOWS
7536 	if ( g_bHeadProcess )
7537 #endif
7538 	Unlock();
7539 }
7540 
7541 
7542 /////////////////////////////////////////////////////////////////////////////
7543 
7544 
// Apply an in-memory attribute update to the docinfo rows of this index.
//
// tUpd carries the attributes to update (m_dAttrs), the target rows given either
// as docids (m_dDocids) or as raw row pointers (m_dRows), per-row offsets into
// the values pool (m_dRowOffset), and the values pool itself (m_dPool).
// iIndex selects a single row of the update; a negative iIndex processes all rows.
//
// Pool layout per row, as consumed below: a plain value takes one dword
// (two if the update attribute is bigint); an MVA takes a count dword followed
// by that many dwords of values.
//
// Returns -1 and sets sError on failure, 0 when there is nothing to do,
// otherwise the number of rows that were found and processed.
int CSphIndex_VLN::UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError )
{
	// check if we can
	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
	{
		sError.SetSprintf ( "docinfo=extern required for updates" );
		return -1;
	}

	// rows come either as docids or as raw pointers, never both
	assert ( tUpd.m_dDocids.GetLength()==0 || tUpd.m_dRows.GetLength()==0 );
	DWORD uRows = Max ( tUpd.m_dDocids.GetLength(), tUpd.m_dRows.GetLength() );
	bool bRaw = tUpd.m_dDocids.GetLength()==0;

	// check if we have to
	assert ( (int)uRows==tUpd.m_dRowOffset.GetLength() );
	if ( !m_uDocinfo || !uRows )
		return 0;

	// NOTE(review): the update is binlogged before the schema checks below,
	// so an update that is then rejected has already been logged
	if ( m_bBinlog && g_pBinlog )
		g_pBinlog->BinlogUpdateAttributes ( &m_iTID, m_sIndexName.cstr(), tUpd );

	// remap update schema to index schema
	CSphVector<CSphAttrLocator> dLocators;
	CSphVector<int> dIndexes;
	CSphVector<bool> dFloats;
	CSphVector<bool> dBigints;
	dLocators.Reserve ( tUpd.m_dAttrs.GetLength() );
	dIndexes.Reserve ( tUpd.m_dAttrs.GetLength() );
	dFloats.Reserve ( tUpd.m_dAttrs.GetLength() );
	dBigints.Reserve ( tUpd.m_dAttrs.GetLength() ); // bigint flags for *source* schema.
	uint64_t uDst64 = 0; // bit i is set when update attribute i maps to a 64-bit MVA in the index
	ARRAY_FOREACH ( i, tUpd.m_dAttrs )
	{
		int iIdx = m_tSchema.GetAttrIndex ( tUpd.m_dAttrs[i].m_sName.cstr() );
		if ( iIdx<0 )
		{
			sError.SetSprintf ( "attribute '%s' not found", tUpd.m_dAttrs[i].m_sName.cstr() );
			return -1;
		}

		dBigints.Add ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_BIGINT );

		// forbid updates on non-int columns
		const CSphColumnInfo & tCol = m_tSchema.GetAttr(iIdx);
		if (!( tCol.m_eAttrType==SPH_ATTR_BOOL || tCol.m_eAttrType==SPH_ATTR_INTEGER || tCol.m_eAttrType==SPH_ATTR_TIMESTAMP
			|| tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET
			|| tCol.m_eAttrType==SPH_ATTR_BIGINT || tCol.m_eAttrType==SPH_ATTR_FLOAT ))
		{
			sError.SetSprintf ( "attribute '%s' can not be updated (must be boolean, integer, bigint, float, timestamp, or MVA)", tUpd.m_dAttrs[i].m_sName.cstr() );
			return -1;
		}

		// forbid updates on MVA columns if there's no arena
		if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && !g_pMvaArena )
		{
			sError.SetSprintf ( "MVA attribute '%s' can not be updated (MVA arena not initialized)", tCol.m_sName.cstr() );
			return -1;
		}

		// NOTE: despite the names, bSrcMva is computed from the index column
		// and bDstMva from the update attribute
		bool bSrcMva = ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET );
		bool bDstMva = ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_UINT32SET || tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET );
		if ( bSrcMva!=bDstMva )
		{
			sError.SetSprintf ( "attribute '%s' MVA flag mismatch", tUpd.m_dAttrs[i].m_sName.cstr() );
			return -1;
		}

		// a 64-bit MVA update can not go into a 32-bit MVA column
		if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET && tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET )
		{
			sError.SetSprintf ( "attribute '%s' MVA bits (dst=%d, src=%d) mismatch", tUpd.m_dAttrs[i].m_sName.cstr(),
				tCol.m_eAttrType, tUpd.m_dAttrs[i].m_eAttrType );
			return -1;
		}

		if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
			uDst64 |= ( U64C(1)<<i );

		dFloats.Add ( tCol.m_eAttrType==SPH_ATTR_FLOAT );
		dLocators.Add ( tCol.m_tLocator );

		// find dupes to optimize
		// (an earlier occurrence of the same column gets disabled with -1, so only the last one is applied)
		ARRAY_FOREACH ( j, dIndexes )
			if ( dIndexes[j]==iIdx )
			{
				dIndexes[j] = -1;
				break;
			}
		dIndexes.Add ( iIdx );
	}
	assert ( dLocators.GetLength()==tUpd.m_dAttrs.GetLength() );

	// FIXME! FIXME! FIXME! overwriting just-freed blocks might hurt concurrent searchers;
	// should implement a simplistic MVCC-style delayed-free to avoid that

	// do the update
	const int iFirst = ( iIndex<0 ) ? 0 : iIndex;
	const int iLast = ( iIndex<0 ) ? uRows : iIndex+1;

	// row update must leave it in consistent state; so let's preallocate all the needed MVA
	// storage upfront to avoid suddenly having to rollback if allocation fails later
	int iNumMVA = 0;
	ARRAY_FOREACH ( i, tUpd.m_dAttrs )
		if ( dIndexes[i]>=0 && ( tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_UINT32SET || tUpd.m_dAttrs[i].m_eAttrType==SPH_ATTR_INT64SET ) )
			iNumMVA++;

	// OPTIMIZE! execute the code below conditionally
	CSphVector<DWORD*> dRowPtrs;
	CSphVector<int> dMvaPtrs;

	// dMvaPtrs holds iNumMVA arena indexes per row; -1 means "nothing allocated"
	dRowPtrs.Resize ( uRows );
	dMvaPtrs.Resize ( uRows*iNumMVA );
	dMvaPtrs.Fill ( -1 );

	// preallocate
	bool bFailed = false;
	for ( int iUpd=iFirst; iUpd<iLast && !bFailed; iUpd++ )
	{
		dRowPtrs[iUpd] = const_cast < DWORD * > ( bRaw ? tUpd.m_dRows[iUpd] : FindDocinfo ( tUpd.m_dDocids[iUpd] ) );
		if ( !dRowPtrs[iUpd] )
			continue; // no such id

		int iPoolPos = tUpd.m_dRowOffset[iUpd];
		int iMvaPtr = iUpd*iNumMVA;
		ARRAY_FOREACH_COND ( iCol, tUpd.m_dAttrs, !bFailed )
		{
			bool bSrcMva32 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_UINT32SET );
			bool bSrcMva64 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_INT64SET );
			if (!( bSrcMva32 || bSrcMva64 )) // FIXME! optimize using a prebuilt dword mask?
			{
				// plain value; just skip over it in the pool
				iPoolPos++;
				if ( dBigints[iCol] )
					iPoolPos++;
				continue;
			}

			// get the requested new count
			int iNewCount = (int)tUpd.m_dPool[iPoolPos++];
			iPoolPos += iNewCount;

			// try to alloc
			if ( dIndexes[iCol]>=0 )
			{
				int iAlloc = -1;
				if ( iNewCount )
				{
					// pool values come in dword pairs; a 32-bit destination keeps one dword per pair
					bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
					assert ( (iNewCount%2)==0 );
					int iLen = ( bDst64 ? iNewCount : iNewCount/2 );
					// block layout: docid, then the count dword, then iLen dwords of values
					iAlloc = g_MvaArena.TaggedAlloc ( m_iIndexTag, (1+iLen)*sizeof(DWORD)+sizeof(SphDocID_t) );
					if ( iAlloc<0 )
						bFailed = true;
				}

				// whatever the outcome, move the pointer
				dMvaPtrs[iMvaPtr++] = iAlloc;
			}
		}
	}

	// if there were any allocation failures, rollback everything
	if ( bFailed )
	{
		ARRAY_FOREACH ( i, dMvaPtrs )
			if ( dMvaPtrs[i]>=0 )
				g_MvaArena.TaggedFreeIndex ( m_iIndexTag, dMvaPtrs[i] );

		sError.SetSprintf ( "out of pool memory on MVA update" );
		return -1;
	}

	// preallocation went OK; do the actual update
	int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	int iUpdated = 0;
	DWORD uUpdateMask = 0;

	for ( int iUpd=iFirst; iUpd<iLast; iUpd++ )
	{
		DWORD * pEntry = dRowPtrs[iUpd];
		if ( !pEntry )
			continue; // no such id

		// locate the min/max rows of the block this row belongs to, and the index-wide
		// min/max rows that follow the last block in m_pDocinfoIndex
		int iBlock = ( pEntry-m_pDocinfo.GetWritePtr() ) / ( iRowStride*DOCINFO_INDEX_FREQ );
		DWORD * pBlockRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[2*iBlock*iRowStride] );
		DWORD * pIndexRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[2*m_uDocinfoIndex*iRowStride] );
		assert ( iBlock>=0 && iBlock<(int)m_uDocinfoIndex );

		assert ( bRaw || ( DOCINFO2ID(pEntry)==tUpd.m_dDocids[iUpd] ) );
		pEntry = DOCINFO2ATTRS(pEntry);

		int iPos = tUpd.m_dRowOffset[iUpd];
		int iMvaPtr = iUpd*iNumMVA;
		ARRAY_FOREACH ( iCol, tUpd.m_dAttrs )
		{
			bool bSrcMva32 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_UINT32SET );
			bool bSrcMva64 = ( tUpd.m_dAttrs[iCol].m_eAttrType==SPH_ATTR_INT64SET );
			if (!( bSrcMva32 || bSrcMva64 )) // FIXME! optimize using a prebuilt dword mask?
			{
				// plain update
				if ( dIndexes[iCol]>=0 )
				{
					SphAttr_t uValue = dBigints[iCol] ? MVA_UPSIZE ( &tUpd.m_dPool[iPos] ) : tUpd.m_dPool[iPos];
					sphSetRowAttr ( pEntry, dLocators[iCol], uValue );

					// update block and index ranges
					// (ranges are only ever widened here, never shrunk)
					for ( int i=0; i<2; i++ )
					{
						DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
						SphAttr_t uMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
						SphAttr_t uMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ) , dLocators[iCol] );
						if ( dFloats[iCol] ) // update float's indexes assumes float comparison
						{
							float fValue = sphDW2F ( (DWORD) uValue );
							float fMin = sphDW2F ( (DWORD) uMin );
							float fMax = sphDW2F ( (DWORD) uMax );
							if ( fValue<fMin )
								sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], sphF2DW ( fValue ) );
							if ( fValue>fMax )
								sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], sphF2DW ( fValue ) );
						} else // update usual integers
						{
							if ( uValue<uMin )
								sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], uValue );
							if ( uValue>uMax )
								sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], uValue );
						}
					}
					uUpdateMask |= ATTRS_UPDATED;
				}
				// skip the value in the pool even when this column is a disabled dupe
				iPos += dBigints[iCol]?2:1;
				continue;
			}

			// MVA update
			DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pEntry, dLocators[iCol] ) );

			// get new count, store new data if needed
			DWORD uNew = tUpd.m_dPool[iPos++];
			const DWORD * pSrc = tUpd.m_dPool.Begin() + iPos;
			iPos += uNew;
			if ( dIndexes[iCol]>=0 )
			{
				int64_t iNewMin = LLONG_MAX, iNewMax = LLONG_MIN;
				int iNewIndex = dMvaPtrs[iMvaPtr++];
				if ( uNew )
				{
					// the preallocated block starts with the docid; the MVA data follows it
					assert ( iNewIndex>=0 );
					SphDocID_t* pDocid = (SphDocID_t *)(g_pMvaArena + iNewIndex);
					*pDocid++ = ( bRaw ? DOCINFO2ID ( tUpd.m_dRows[iUpd] ) : tUpd.m_dDocids[iUpd] );
					iNewIndex = (DWORD *)pDocid - g_pMvaArena;

					assert ( iNewIndex>=0 );
					DWORD * pDst = g_pMvaArena + iNewIndex;

					bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
					assert ( ( uNew%2 )==0 );
					int iLen = ( bDst64 ? uNew : uNew/2 );
					// setup new value (flagged index) to store within row
					uNew = DWORD(iNewIndex) | MVA_ARENA_FLAG;

					// MVA values counter first
					*pDst++ = iLen;
					if ( bDst64 )
					{
						// 64-bit destination: copy both dwords of every pair
						while ( iLen )
						{
							int64_t uValue = MVA_UPSIZE ( pSrc );
							iNewMin = Min ( iNewMin, uValue );
							iNewMax = Max ( iNewMax, uValue );
							*pDst++ = *pSrc++;
							*pDst++ = *pSrc++;
							iLen -= 2;
						}
					} else
					{
						// 32-bit destination: keep the first dword of every pair
						while ( iLen-- )
						{
							DWORD uValue = *pSrc;
							pSrc += 2;
							*pDst++ = uValue;
							iNewMin = Min ( iNewMin, uValue );
							iNewMax = Max ( iNewMax, uValue );
						}
					}
				}

				// store new value
				// (zero when the new MVA is empty, the flagged arena index otherwise)
				sphSetRowAttr ( pEntry, dLocators[iCol], uNew );

				// update block and index ranges
				if ( uNew )
					for ( int i=0; i<2; i++ )
				{
					DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
					int64_t iMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
					int64_t iMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol] );
					if ( iNewMin<iMin || iNewMax>iMax )
					{
						sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], Min ( iMin, iNewMin ) );
						sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], Max ( iMax, iNewMax ) );
					}
				}

				// free old storage if needed
				if ( uOldIndex & MVA_ARENA_FLAG )
				{
					// the row stores the index of the MVA data; step back over the docid to get the block start
					uOldIndex = ((DWORD*)((SphDocID_t*)(g_pMvaArena + (uOldIndex & MVA_OFFSET_MASK))-1))-g_pMvaArena;
					g_MvaArena.TaggedFreeIndex ( m_iIndexTag, uOldIndex );
				}

				uUpdateMask |= ATTRS_MVA_UPDATED;
			}
		}

		iUpdated++;
	}

	*m_pAttrsStatus |= uUpdateMask; // FIXME! add lock/atomic?
	return iUpdated;
}
7864 
LoadPersistentMVA(CSphString & sError)7865 bool CSphIndex_VLN::LoadPersistentMVA ( CSphString & sError )
7866 {
7867 	// prepare the file to load
7868 	CSphAutoreader fdReader;
7869 	if ( !fdReader.Open ( GetIndexFileName("mvp"), m_sLastError ) )
7870 	{
7871 		// no mvp means no saved attributes.
7872 		m_sLastError = "";
7873 		return true;
7874 	}
7875 
7876 	// check if we can
7877 	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
7878 	{
7879 		sError.SetSprintf ( "docinfo=extern required for updates" );
7880 		return false;
7881 	}
7882 
7883 	DWORD uDocs = fdReader.GetDword();
7884 
7885 	// if we have docs to update
7886 	if ( !uDocs )
7887 		return false;
7888 
7889 	CSphVector<SphDocID_t> dAffected ( uDocs );
7890 	fdReader.GetBytes ( &dAffected[0], uDocs*sizeof(SphDocID_t) );
7891 
7892 	// collect the indexes of MVA schema attributes
7893 	CSphVector<CSphAttrLocator> dMvaLocators;
7894 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
7895 	{
7896 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
7897 		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
7898 			dMvaLocators.Add ( tAttr.m_tLocator );
7899 	}
7900 #ifndef NDEBUG
7901 	int iMva64 = dMvaLocators.GetLength();
7902 #endif
7903 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
7904 	{
7905 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
7906 		if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
7907 			dMvaLocators.Add ( tAttr.m_tLocator );
7908 	}
7909 	assert ( dMvaLocators.GetLength()!=0 );
7910 
7911 	if ( g_MvaArena.GetError() ) // have to reset affected MVA in case of ( persistent MVA + no MVA arena )
7912 	{
7913 		ARRAY_FOREACH ( iDoc, dAffected )
7914 		{
7915 			DWORD * pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[iDoc] ) );
7916 			assert ( pDocinfo );
7917 			DWORD * pAttrs = DOCINFO2ATTRS ( pDocinfo );
7918 			ARRAY_FOREACH ( iMva, dMvaLocators )
7919 			{
7920 				// reset MVA from arena
7921 				if ( MVA_DOWNSIZE ( sphGetRowAttr ( pAttrs, dMvaLocators[iMva] ) ) & MVA_ARENA_FLAG )
7922 					sphSetRowAttr ( pAttrs, dMvaLocators[iMva], 0 );
7923 			}
7924 		}
7925 
7926 		sphWarning ( "index '%s' forced to reset persistent MVAs ( %s )", m_sIndexName.cstr(), g_MvaArena.GetError() );
7927 		fdReader.Close();
7928 		return true;
7929 	}
7930 
7931 	CSphVector<DWORD*> dRowPtrs ( uDocs );
7932 	CSphVector<int> dAllocs;
7933 	dAllocs.Reserve ( uDocs );
7934 
7935 	// prealloc values (and also preload)
7936 	bool bFailed = false;
7937 	ARRAY_FOREACH ( i, dAffected )
7938 	{
7939 		DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
7940 		assert ( pDocinfo );
7941 		pDocinfo = DOCINFO2ATTRS ( pDocinfo );
7942 		ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
7943 		{
7944 			// if this MVA was updated
7945 			if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
7946 			{
7947 				DWORD uCount = fdReader.GetDword();
7948 				if ( uCount )
7949 				{
7950 					assert ( j<iMva64 || ( uCount%2 )==0 );
7951 					int iAlloc = g_MvaArena.TaggedAlloc ( m_iIndexTag, (1+uCount)*sizeof(DWORD)+sizeof(SphDocID_t) );
7952 					if ( iAlloc<0 )
7953 						bFailed = true;
7954 					else
7955 					{
7956 						SphDocID_t *pDocid = (SphDocID_t*)(g_pMvaArena + iAlloc);
7957 						*pDocid++ = dAffected[i];
7958 						DWORD * pData = (DWORD*)pDocid;
7959 						*pData++ = uCount;
7960 						fdReader.GetBytes ( pData, uCount*sizeof(DWORD) );
7961 						dAllocs.Add ( iAlloc );
7962 					}
7963 				}
7964 			}
7965 		}
7966 		if ( bFailed )
7967 			break;
7968 		dRowPtrs[i] = pDocinfo;
7969 	}
7970 	fdReader.Close();
7971 
7972 	if ( bFailed )
7973 	{
7974 		ARRAY_FOREACH ( i, dAllocs )
7975 			g_MvaArena.TaggedFreeIndex ( m_iIndexTag, dAllocs[i] );
7976 
7977 		sError.SetSprintf ( "out of pool memory on loading persistent MVA values" );
7978 		return false;
7979 	}
7980 
7981 	// prealloc && load ok, fix the attributes now
7982 	int iAllocIndex = 0;
7983 	ARRAY_FOREACH ( i, dAffected )
7984 	{
7985 		DWORD* pDocinfo = dRowPtrs[i];
7986 		assert ( pDocinfo );
7987 		ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
7988 			// if this MVA was updated
7989 			if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
7990 				sphSetRowAttr ( pDocinfo, dMvaLocators[j],
7991 					((DWORD*)(((SphDocID_t*)(g_pMvaArena + dAllocs[iAllocIndex++]))+1) - g_pMvaArena) | MVA_ARENA_FLAG );
7992 	}
7993 	return true;
7994 }
7995 
7996 //////////////////////////////////////////////////////////////////////////
7997 
PrecomputeMinMax()7998 bool CSphIndex_VLN::PrecomputeMinMax()
7999 {
8000 	if ( !m_uDocinfo )
8001 		return true;
8002 
8003 	AttrIndexBuilder_c tBuilder ( m_tSchema );
8004 	tBuilder.Prepare ( m_pDocinfoIndex, m_pDocinfoIndex + 2*( 1+m_uDocinfoIndex )*( DOCINFO_IDSIZE + m_tSchema.GetRowSize() ) );
8005 
8006 	DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
8007 	DWORD uProgressEntry = 0;
8008 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PRECOMPUTE;
8009 	m_tProgress.m_iDone = 0;
8010 	m_uMinMaxIndex = 0;
8011 
8012 	for ( DWORD uIndexEntry=0; uIndexEntry<m_uDocinfo; uIndexEntry++ )
8013 	{
8014 		if ( !tBuilder.Collect ( &m_pDocinfo[(int64_t(uIndexEntry))*uStride], m_pMva.GetWritePtr(), (int64_t)m_pMva.GetNumEntries(), m_sLastError, true ) )
8015 			return false;
8016 		m_uMinMaxIndex += uStride;
8017 
8018 		// show progress
8019 		if ( uIndexEntry==uProgressEntry )
8020 		{
8021 			uProgressEntry = Min ( uIndexEntry+1000, m_uDocinfoIndex-1 );
8022 			if ( m_pProgress )
8023 			{
8024 				m_tProgress.m_iDone = (uIndexEntry+1)*1000/m_uDocinfoIndex;
8025 				m_pProgress ( &m_tProgress, m_tProgress.m_iDone==1000 );
8026 			}
8027 		}
8028 	}
8029 
8030 	tBuilder.FinishCollect();
8031 	return true;
8032 }
8033 
8034 // safely rename an index file
JuggleFile(const char * szExt,bool bNeedOrigin)8035 bool CSphIndex_VLN::JuggleFile ( const char* szExt, bool bNeedOrigin )
8036 {
8037 	CSphString sExt = GetIndexFileName ( szExt );
8038 	CSphString sExtNew, sExtOld;
8039 	sExtNew.SetSprintf ( "%s.tmpnew", sExt.cstr() );
8040 	sExtOld.SetSprintf ( "%s.tmpold", sExt.cstr() );
8041 
8042 	if ( ::rename ( sExt.cstr(), sExtOld.cstr() ) )
8043 	{
8044 		if ( bNeedOrigin )
8045 		{
8046 			m_sLastError.SetSprintf ( "rename '%s' to '%s' failed: %s",
8047 				sExt.cstr(), sExtOld.cstr(), strerror(errno) );
8048 			return false;
8049 		}
8050 	}
8051 
8052 	if ( ::rename ( sExtNew.cstr(), sExt.cstr() ) )
8053 	{
8054 		if ( bNeedOrigin && !::rename ( sExtOld.cstr(), sExt.cstr() ) )
8055 		{
8056 			// rollback failed too!
8057 			m_sLastError.SetSprintf ( "rollback rename to '%s' failed: %s; INDEX UNUSABLE; FIX FILE NAMES MANUALLY",
8058 				sExt.cstr(), strerror(errno) );
8059 		} else
8060 		{
8061 			// rollback went ok
8062 			m_sLastError.SetSprintf ( "rename '%s' to '%s' failed: %s",
8063 				sExtNew.cstr(), sExt.cstr(), strerror(errno) );
8064 		}
8065 		return false;
8066 	}
8067 
8068 	// all done
8069 	::unlink ( sExtOld.cstr() );
8070 	return true;
8071 }
8072 
/// Flush pending in-memory attribute updates of this index to disk.
///
/// Returns true immediately when there is no status pointer, no status flags
/// are set, or there are no docinfo rows. Otherwise:
///  - if ATTRS_MVA_UPDATED is set, the updated MVA values are written to
///    "mvp.tmpnew" and juggled into "mvp";
///  - the docinfo buffer is written to "spa.tmpnew" and juggled into "spa";
///  - the binlog (if enabled) is notified, and the status flags are reset.
///
/// Returns false on any failure; m_sLastError is either passed to the failing
/// helper or set here (the id32-on-id64 case).
bool CSphIndex_VLN::SaveAttributes ()
{
	if ( !m_pAttrsStatus || !*m_pAttrsStatus || !m_uDocinfo )
		return true;

	// snapshot of the status flags; compared against the live value at the very end
	DWORD uAttrStatus = *m_pAttrsStatus;

	sphLogDebugvv ( "index '%s' attrs (%d) saving...", m_sIndexName.cstr(), uAttrStatus );

	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_uDocinfo && m_pDocinfo.GetWritePtr() );

	// not a real loop: runs at most once (every path ends in break or return),
	// and is only written as for() so that break can leave the MVA section early
	for ( ; uAttrStatus & ATTRS_MVA_UPDATED ; )
	{
		// collect the indexes of MVA schema attributes
		CSphVector<CSphAttrLocator> dMvaLocators;
		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
		{
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
				dMvaLocators.Add ( tAttr.m_tLocator );
		}
#ifndef NDEBUG
		// number of UINT32SET locators; entries from this index on are INT64SET ones
		// (only used by the assert in the save loop below)
		int iMva64 = dMvaLocators.GetLength();
#endif
		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
		{
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
			if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
				dMvaLocators.Add ( tAttr.m_tLocator );
		}
		assert ( dMvaLocators.GetLength()!=0 );

		// collect the list of all docids with changed MVA attributes
		CSphVector<SphDocID_t> dAffected;
		{
			tDocCollector dCollect ( dAffected );
			g_MvaArena.ExamineTag ( &dCollect, m_iIndexTag );
		}
		dAffected.Uniq();

		// nothing collected for this index tag; skip the MVA file altogether
		if ( !dAffected.GetLength() )
			break;

		// prepare the file to save into;
		CSphWriter fdFlushMVA;
		fdFlushMVA.OpenFile ( GetIndexFileName("mvp.tmpnew"), m_sLastError );
		if ( fdFlushMVA.IsError() )
			return false;

		// save the vector of affected docids
		// (count first, then the raw docid array; dAffected is non-empty here)
		DWORD uPos = dAffected.GetLength();
		fdFlushMVA.PutDword ( uPos );
		fdFlushMVA.PutBytes ( &dAffected[0], uPos*sizeof(SphDocID_t) );

		// save the updated MVA vectors
		ARRAY_FOREACH ( i, dAffected )
		{
			DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
			assert ( pDocinfo );

			pDocinfo = DOCINFO2ATTRS ( pDocinfo );
			ARRAY_FOREACH ( j, dMvaLocators )
			{
				DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) );
				// if this MVA was updated
				if ( uOldIndex & MVA_ARENA_FLAG )
				{
					// updated values live in the arena: a DWORD count followed by that many DWORDs
					DWORD * pMva = g_pMvaArena + ( uOldIndex & MVA_OFFSET_MASK );
					DWORD uCount = *pMva;
					// INT64SET values must occupy an even number of DWORDs
					assert ( j<iMva64 || ( uCount%2 )==0 );
					fdFlushMVA.PutDword ( uCount );
					fdFlushMVA.PutBytes ( pMva+1, uCount*sizeof(DWORD) );
				}
			}
		}
		fdFlushMVA.CloseFile();
		// NOTE(review): the writer error state is not checked after CloseFile() - confirm that is intended
		if ( !JuggleFile ( "mvp", false ) )
			return false;
		break;
	}

	if ( m_bId32to64 )
	{
		m_sLastError.SetSprintf ( "id32 index loaded by id64 binary; saving is not (yet) possible" );
		return false;
	}

	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_uDocinfo && m_pDocinfo.GetWritePtr() );

	// save current state
	CSphAutofile fdTmpnew ( GetIndexFileName("spa.tmpnew"), SPH_O_NEW, m_sLastError );
	if ( fdTmpnew.GetFD()<0 )
		return false;

	// row stride in DWORDs (docid part plus attributes), and total size in bytes
	size_t uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	size_t uSize = uStride*size_t(m_uDocinfo)*sizeof(DWORD);
	// newer formats carry 2*(1+m_uDocinfoIndex) extra rows after the docinfo rows
	// NOTE(review): presumably the min/max index entries - confirm against the loader
	if ( m_uVersion>=20 )
		uSize += 2*(1+m_uDocinfoIndex)*uStride*sizeof(CSphRowitem);

	if ( !sphWriteThrottled ( fdTmpnew.GetFD(), m_pDocinfo.GetWritePtr(), uSize, "docinfo", m_sLastError ) )
		return false;

	// close before the rename
	fdTmpnew.Close ();

	if ( !JuggleFile("spa") )
		return false;

	if ( m_bBinlog && g_pBinlog )
		g_pBinlog->NotifyIndexFlush ( m_sIndexName.cstr(), m_iTID, false );

	// only reset the flags if they still equal the snapshot taken at entry
	if ( *m_pAttrsStatus==uAttrStatus )
		*m_pAttrsStatus = 0;

	sphLogDebugvv ( "index '%s' attrs (%d) saved", m_sIndexName.cstr(), *m_pAttrsStatus );

	return true;
}
8190 
GetAttributeStatus() const8191 DWORD CSphIndex_VLN::GetAttributeStatus () const
8192 {
8193 	assert ( m_pAttrsStatus );
8194 	return *m_pAttrsStatus;
8195 }
8196 
8197 
8198 /////////////////////////////////////////////////////////////////////////////
8199 
/// strict ordering on hits: by word id first, then by document id, then by word position
/// both arguments are parenthesized in the expansion, so that arbitrary
/// expressions (eg. a dereferenced pointer, *pHit) can be passed safely
#define SPH_CMPHIT_LESS(a,b) \
	( (a).m_iWordID<(b).m_iWordID || \
	( (a).m_iWordID==(b).m_iWordID && (a).m_iDocID<(b).m_iDocID ) || \
	( (a).m_iWordID==(b).m_iWordID && (a).m_iDocID==(b).m_iDocID && (a).m_iWordPos<(b).m_iWordPos ) )
8204 
8205 
8206 struct CmpHit_fn
8207 {
IsLessCmpHit_fn8208 	inline bool IsLess ( const CSphWordHit & a, const CSphWordHit & b ) const
8209 	{
8210 		return SPH_CMPHIT_LESS ( a, b );
8211 	}
8212 };
8213 
8214 
8215 /// sort baked docinfos by document ID
8216 struct DocinfoSort_fn
8217 {
8218 	typedef SphDocID_t MEDIAN_TYPE;
8219 
8220 	int m_iStride;
8221 
DocinfoSort_fnDocinfoSort_fn8222 	explicit DocinfoSort_fn ( int iStride )
8223 		: m_iStride ( iStride )
8224 	{}
8225 
KeyDocinfoSort_fn8226 	SphDocID_t Key ( DWORD * pData ) const
8227 	{
8228 		return DOCINFO2ID(pData);
8229 	}
8230 
CopyKeyDocinfoSort_fn8231 	void CopyKey ( SphDocID_t * pMed, DWORD * pVal ) const
8232 	{
8233 		*pMed = Key(pVal);
8234 	}
8235 
IsLessDocinfoSort_fn8236 	bool IsLess ( SphDocID_t a, SphDocID_t b ) const
8237 	{
8238 		return a < b;
8239 	}
8240 
SwapDocinfoSort_fn8241 	void Swap ( DWORD * a, DWORD * b ) const
8242 	{
8243 		for ( int i=0; i<m_iStride; i++ )
8244 			::Swap ( a[i], b[i] );
8245 	}
8246 
AddDocinfoSort_fn8247 	DWORD * Add ( DWORD * p, int i ) const
8248 	{
8249 		return p+i*m_iStride;
8250 	}
8251 
SubDocinfoSort_fn8252 	int Sub ( DWORD * b, DWORD * a ) const
8253 	{
8254 		return (int)((b-a)/m_iStride);
8255 	}
8256 };
8257 
8258 
sphSortDocinfos(DWORD * pBuf,int iCount,int iStride)8259 void sphSortDocinfos ( DWORD * pBuf, int iCount, int iStride )
8260 {
8261 	DocinfoSort_fn fnSort ( iStride );
8262 	sphSort ( pBuf, iCount, fnSort, fnSort );
8263 }
8264 
8265 
GetIndexFileName(const char * sExt) const8266 CSphString CSphIndex_VLN::GetIndexFileName ( const char * sExt ) const
8267 {
8268 	CSphString sRes;
8269 	sRes.SetSprintf ( "%s.%s", m_sFilename.cstr(), sExt );
8270 	return sRes;
8271 }
8272 
8273 
cidxFinishDoclistEntry(Hitpos_t uLastPos)8274 void CSphIndex_VLN::cidxFinishDoclistEntry ( Hitpos_t uLastPos )
8275 {
8276 	if ( m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE )
8277 	{
8278 		bool bIgnoreHits =
8279 			( m_tSettings.m_eHitless==SPH_HITLESS_ALL ) ||
8280 			( m_tSettings.m_eHitless==SPH_HITLESS_SOME && ( m_iLastWordDocs & 0x80000000 ) );
8281 
8282 		// inline the only hit into doclist (unless it is completely discarded)
8283 		// and finish doclist entry
8284 		m_wrDoclist.ZipInt ( m_uLastDocHits );
8285 		if ( m_uLastDocHits==1 && !bIgnoreHits )
8286 		{
8287 			m_wrHitlist.SeekTo ( m_iLastHitlistPos );
8288 			m_wrDoclist.ZipInt ( uLastPos & 0x7FFFFF );
8289 			m_wrDoclist.ZipInt ( uLastPos >> 23 );
8290 			m_iLastHitlistPos -= m_iLastHitlistDelta;
8291 			assert ( m_iLastHitlistPos>=0 );
8292 
8293 		} else
8294 		{
8295 			m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
8296 			m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
8297 		}
8298 
8299 	} else // plain format - finish doclist entry
8300 	{
8301 		assert ( m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_PLAIN );
8302 		m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
8303 		m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
8304 		m_wrDoclist.ZipInt ( m_uLastDocHits );
8305 	}
8306 	m_dLastDocFields.Unset();
8307 	m_uLastDocHits = 0;
8308 }
8309 
8310 
/// Feed the next hit (in sorted order) into the doclist, hitlist, and dictionary writers.
///
/// The function keeps its state in m_tLastHit and the m_iLast* / m_uLast* members,
/// and detects word and document boundaries by comparing the incoming hit with it.
/// A special "flush-hit" (zero word ID, EMPTY_HIT position) finishes the pending
/// word and makes the dictionary end its entries.
///
/// @param hit		either a regular hit, or the flush-hit (see the assert below)
/// @param pAttrs	if not NULL, row attributes; emitted into the doclist as deltas
///					against m_pMin->m_pDynamic when a new document entry begins
void CSphIndex_VLN::cidxHit ( CSphAggregateHit * hit, CSphRowitem * pAttrs )
{
	assert (
		( hit->m_iWordID!=0 && hit->m_iWordPos!=EMPTY_HIT && hit->m_iDocID!=0 ) || // it's either ok hit
		( hit->m_iWordID==0 && hit->m_iWordPos==EMPTY_HIT ) ); // or "flush-hit"

	/////////////
	// next word
	/////////////

	// a word changes when the ID differs, or (keywords dictionary only) when the keyword text differs
	bool bNextWord = ( m_tLastHit.m_iWordID!=hit->m_iWordID || ( m_bWordDict && strcmp ( (char*)m_tLastHit.m_sKeyword, (char*)hit->m_sKeyword ) ) ); // OPTIMIZE?
	// a new word always begins a new document entry as well
	bool bNextDoc = bNextWord || ( m_tLastHit.m_iDocID!=hit->m_iDocID );

	if ( bNextDoc )
	{
		// finish hitlist, if any
		// (the last position is saved first, because it gets reset to EMPTY_HIT just below)
		Hitpos_t uLastPos = m_tLastHit.m_iWordPos;
		if ( m_tLastHit.m_iWordPos!=EMPTY_HIT )
		{
			m_wrHitlist.ZipInt ( 0 );
			m_tLastHit.m_iWordPos = EMPTY_HIT;
		}

		// finish doclist entry, if any
		if ( m_tLastHit.m_iDocID )
			cidxFinishDoclistEntry ( uLastPos );
	}

	if ( bNextWord )
	{
		// finish doclist, if any
		if ( m_tLastHit.m_iDocID )
		{
			// emit end-of-doclist marker
			m_wrDoclist.ZipInt ( 0 );

			// emit dict entry
			// (last two arguments are the doclist start offset and its length)
			m_pDict->DictEntry ( m_tLastHit.m_iWordID, m_tLastHit.m_sKeyword, m_iLastWordDocs, m_iLastWordHits, m_iLastWordDoclist, m_wrDoclist.GetPos()-m_iLastWordDoclist );

			// reset trackers
			m_iLastWordDocs = 0;
			m_iLastWordHits = 0;

			m_tLastHit.m_iDocID = 0;
			m_iLastHitlistPos = 0;
		}

		// flush wordlist, if this is the end
		if ( hit->m_iWordPos==EMPTY_HIT )
		{
			m_pDict->DictEndEntries ( m_wrDoclist.GetPos() );
			return;
		}

		// words must arrive in ascending order (by ID, or by keyword text on equal IDs), unless merging
		assert ( hit->m_iWordID > m_tLastHit.m_iWordID
			|| ( m_bWordDict && hit->m_iWordID==m_tLastHit.m_iWordID && strcmp ( (char*)hit->m_sKeyword, (char*)m_tLastHit.m_sKeyword )>0 )
			|| m_bMerging );
		// remember where the doclist of the new word starts
		m_iLastWordDoclist = m_wrDoclist.GetPos();
		m_tLastHit.m_iWordID = hit->m_iWordID;
		if ( m_bWordDict )
		{
			// NOTE(review): the copy is sized by m_sLastKeyword; presumably m_tLastHit.m_sKeyword points at that buffer - confirm
			assert ( strlen ( (char *)hit->m_sKeyword )<sizeof(m_sLastKeyword)-1 );
			strncpy ( (char*)m_tLastHit.m_sKeyword, (char*)hit->m_sKeyword, sizeof(m_sLastKeyword) ); // OPTIMIZE?
		}
	}

	if ( bNextDoc )
	{
		// begin new doclist entry for new doc id
		assert ( hit->m_iDocID>m_tLastHit.m_iDocID );
		assert ( m_wrHitlist.GetPos()>=m_iLastHitlistPos );

		// document ID is stored as a delta against the previous one within this word
		m_wrDoclist.ZipOffset ( hit->m_iDocID - m_tLastHit.m_iDocID );
		if ( pAttrs )
		{
			for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
				m_wrDoclist.ZipInt ( pAttrs[i] - m_pMin->m_pDynamic[i] );
		}
		// hitlist offset delta; written out later, when this doclist entry gets finished
		m_iLastHitlistDelta = m_wrHitlist.GetPos() - m_iLastHitlistPos;

		m_tLastHit.m_iDocID = hit->m_iDocID;
		m_iLastHitlistPos = m_wrHitlist.GetPos();

		// update per-word stats
		m_iLastWordDocs++;
	}

	///////////
	// the hit
	///////////

	// a hit with any bit set in its field mask is an aggregate (count plus mask) rather than a position
	if ( !hit->m_dFieldMask.TestAll(false) ) // merge aggregate hits into the current hit
	{
		int iHitCount = hit->GetAggrCount();
		assert ( m_tSettings.m_eHitless );
		assert ( iHitCount );
		assert ( !hit->m_dFieldMask.TestAll(false) );

		m_uLastDocHits += iHitCount;
		m_dLastDocFields |= hit->m_dFieldMask;
		m_iLastWordHits += iHitCount;

		// the high bit of the per-word docs counter marks this word as hitless
		// (it is tested in cidxFinishDoclistEntry)
		if ( m_tSettings.m_eHitless==SPH_HITLESS_SOME )
			m_iLastWordDocs |= 0x80000000;

	} else // handle normal hits
	{
		// add hit delta
		// (an exact duplicate of the previous position is silently dropped)
		if ( hit->m_iWordPos==m_tLastHit.m_iWordPos )
			return;

		assert ( m_tLastHit.m_iWordPos < hit->m_iWordPos );
		m_wrHitlist.ZipInt ( hit->m_iWordPos - m_tLastHit.m_iWordPos );
		m_tLastHit.m_iWordPos = hit->m_iWordPos;
		m_iLastWordHits++;

		// update matched fields mask
		m_dLastDocFields.Set ( HITMAN::GetField ( hit->m_iWordPos ) );
		m_uLastDocHits++;
	}
}
8432 
8433 
ReadSchemaColumn(CSphReader & rdInfo,CSphColumnInfo & tCol,DWORD uVersion)8434 static void ReadSchemaColumn ( CSphReader & rdInfo, CSphColumnInfo & tCol, DWORD uVersion )
8435 {
8436 	tCol.m_sName = rdInfo.GetString ();
8437 	if ( tCol.m_sName.IsEmpty () )
8438 		tCol.m_sName = "@emptyname";
8439 
8440 	tCol.m_sName.ToLower ();
8441 	tCol.m_eAttrType = (ESphAttr) rdInfo.GetDword (); // FIXME? check/fixup?
8442 
8443 	if ( uVersion>=5 ) // m_uVersion for searching
8444 	{
8445 		rdInfo.GetDword (); // ignore rowitem
8446 		tCol.m_tLocator.m_iBitOffset = rdInfo.GetDword ();
8447 		tCol.m_tLocator.m_iBitCount = rdInfo.GetDword ();
8448 	} else
8449 	{
8450 		tCol.m_tLocator.m_iBitOffset = -1;
8451 		tCol.m_tLocator.m_iBitCount = -1;
8452 	}
8453 
8454 	if ( uVersion>=16 ) // m_uVersion for searching
8455 		tCol.m_bPayload = ( rdInfo.GetByte()!=0 );
8456 
8457 	// WARNING! max version used here must be in sync with RtIndex_t::Prealloc
8458 }
8459 
8460 
ReadSchema(CSphReader & rdInfo,CSphSchema & m_tSchema,DWORD uVersion,bool bDynamic)8461 void ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion, bool bDynamic )
8462 {
8463 	m_tSchema.Reset ();
8464 
8465 	m_tSchema.m_dFields.Resize ( rdInfo.GetDword() );
8466 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
8467 		ReadSchemaColumn ( rdInfo, m_tSchema.m_dFields[i], uVersion );
8468 
8469 	int iNumAttrs = rdInfo.GetDword();
8470 
8471 	for ( int i=0; i<iNumAttrs; i++ )
8472 	{
8473 		CSphColumnInfo tCol;
8474 		ReadSchemaColumn ( rdInfo, tCol, uVersion );
8475 		m_tSchema.AddAttr ( tCol, bDynamic );
8476 	}
8477 }
8478 
8479 
WriteSchemaColumn(CSphWriter & fdInfo,const CSphColumnInfo & tCol)8480 static void WriteSchemaColumn ( CSphWriter & fdInfo, const CSphColumnInfo & tCol )
8481 {
8482 	int iLen = strlen ( tCol.m_sName.cstr() );
8483 	fdInfo.PutDword ( iLen );
8484 	fdInfo.PutBytes ( tCol.m_sName.cstr(), iLen );
8485 
8486 	ESphAttr eAttrType = tCol.m_eAttrType;
8487 	if ( eAttrType==SPH_ATTR_WORDCOUNT )
8488 		eAttrType = SPH_ATTR_INTEGER;
8489 	fdInfo.PutDword ( eAttrType );
8490 
8491 	fdInfo.PutDword ( tCol.m_tLocator.CalcRowitem() ); // for backwards compatibility
8492 	fdInfo.PutDword ( tCol.m_tLocator.m_iBitOffset );
8493 	fdInfo.PutDword ( tCol.m_tLocator.m_iBitCount );
8494 
8495 	fdInfo.PutByte ( tCol.m_bPayload );
8496 }
8497 
8498 
WriteSchema(CSphWriter & fdInfo,const CSphSchema & tSchema)8499 void WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema )
8500 {
8501 	// schema
8502 	fdInfo.PutDword ( tSchema.m_dFields.GetLength() );
8503 	ARRAY_FOREACH ( i, tSchema.m_dFields )
8504 		WriteSchemaColumn ( fdInfo, tSchema.m_dFields[i] );
8505 
8506 	fdInfo.PutDword ( tSchema.GetAttrsCount() );
8507 	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
8508 		WriteSchemaColumn ( fdInfo, tSchema.GetAttr(i) );
8509 }
8510 
8511 
/// Serialize index settings into the writer, as a fixed sequence of values.
/// NOTE(review): the order and widths written here presumably have to match
/// the corresponding settings loader - confirm before reordering anything.
void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings )
{
	// prefix/infix lengths
	tWriter.PutDword ( tSettings.m_iMinPrefixLen );
	tWriter.PutDword ( tSettings.m_iMinInfixLen );
	// HTML stripper flag and its two configuration strings
	tWriter.PutByte ( tSettings.m_bHtmlStrip ? 1 : 0 );
	tWriter.PutString ( tSettings.m_sHtmlIndexAttrs.cstr () );
	tWriter.PutString ( tSettings.m_sHtmlRemoveElements.cstr () );
	// exact words flag, hitless mode, hit format
	tWriter.PutByte ( tSettings.m_bIndexExactWords ? 1 : 0 );
	tWriter.PutDword ( tSettings.m_eHitless );
	tWriter.PutDword ( tSettings.m_eHitFormat );
	// index_sp flag and zones
	tWriter.PutByte ( tSettings.m_bIndexSP );
	tWriter.PutString ( tSettings.m_sZones );
	// position steps
	tWriter.PutDword ( tSettings.m_iBoundaryStep );
	tWriter.PutDword ( tSettings.m_iStopwordStep );
}
8527 
8528 
/// Write the index header into the given writer.
///
/// Emits, in this order: magic and format version, the USE_64BIT value, docinfo
/// storage mode, schema, minimal document (ID, plus its row for inline docinfo),
/// wordlist checkpoints offset and count, index stats, index settings, tokenizer
/// settings, dictionary settings, kill-list size, and the min-max index value.
///
/// @param fdInfo			header writer (already opened by the caller)
/// @param iCheckpointsPos	wordlist checkpoints offset to store
/// @param iCheckpointCount	wordlist checkpoints count to store
/// @return always true; writer errors are not checked here
bool CSphIndex_VLN::WriteHeader ( CSphWriter & fdInfo, SphOffset_t iCheckpointsPos, DWORD iCheckpointCount )
{
	// version
	fdInfo.PutDword ( INDEX_MAGIC_HEADER );
	fdInfo.PutDword ( INDEX_FORMAT_VERSION );

	// bits
	fdInfo.PutDword ( USE_64BIT );

	// docinfo
	fdInfo.PutDword ( m_tSettings.m_eDocinfo );

	// schema
	WriteSchema ( fdInfo, m_tSchema );

	// min doc
	fdInfo.PutOffset ( m_pMin->m_iDocID ); // was dword in v.1
	// minimal row values are only stored for inline docinfo
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
		fdInfo.PutBytes ( m_pMin->m_pDynamic, m_tSchema.GetRowSize()*sizeof(CSphRowitem) );

	// wordlist checkpoints
	fdInfo.PutOffset ( iCheckpointsPos );
	fdInfo.PutDword ( iCheckpointCount );

	// index stats
	fdInfo.PutDword ( m_tStats.m_iTotalDocuments );
	fdInfo.PutOffset ( m_tStats.m_iTotalBytes );

	// index settings
	SaveIndexSettings ( fdInfo, m_tSettings );

	// tokenizer info
	assert ( m_pTokenizer );
	SaveTokenizerSettings ( fdInfo, m_pTokenizer );

	// dictionary info
	assert ( m_pDict );
	SaveDictionarySettings ( fdInfo, m_pDict, false );

	// kill-list size, and min-max index value (explicitly narrowed to DWORD)
	fdInfo.PutDword ( m_iKillListSize );
	fdInfo.PutDword ( (DWORD)m_uMinMaxIndex );

	return true;
}
8573 
8574 
cidxDone(const char * sHeaderExtension,int iMemLimit)8575 bool CSphIndex_VLN::cidxDone ( const char * sHeaderExtension, int iMemLimit )
8576 {
8577 	// flush wordlist checkpoints
8578 	SphOffset_t iCheckpointsPos;
8579 	int iCheckpointsCount;
8580 
8581 	if ( !m_pDict->DictEnd ( &iCheckpointsPos, &iCheckpointsCount, iMemLimit, m_sLastError ) )
8582 		return false;
8583 
8584 	/////////////////
8585 	// create header
8586 	/////////////////
8587 
8588 	CSphWriter fdInfo;
8589 	fdInfo.OpenFile ( GetIndexFileName ( sHeaderExtension ), m_sLastError );
8590 	if ( fdInfo.IsError() )
8591 		return false;
8592 
8593 	if ( !WriteHeader ( fdInfo, iCheckpointsPos, iCheckpointsCount ) )
8594 		return false;
8595 
8596 	////////////////////////
8597 	// close all data files
8598 	////////////////////////
8599 
8600 	fdInfo.CloseFile ();
8601 	m_wrDoclist.CloseFile ();
8602 	m_wrHitlist.CloseFile ( true );
8603 
8604 	if ( fdInfo.IsError() || m_pDict->DictIsError() || m_wrDoclist.IsError() || m_wrHitlist.IsError() )
8605 		return false;
8606 
8607 	return true;
8608 }
8609 
8610 
encodeVLB(BYTE * buf,DWORD v)8611 inline int encodeVLB ( BYTE * buf, DWORD v )
8612 {
8613 	register BYTE b;
8614 	register int n = 0;
8615 
8616 	do
8617 	{
8618 		b = (BYTE)(v & 0x7f);
8619 		v >>= 7;
8620 		if ( v )
8621 			b |= 0x80;
8622 		*buf++ = b;
8623 		n++;
8624 	} while ( v );
8625 	return n;
8626 }
8627 
8628 
encodeVLB8(BYTE * buf,uint64_t v)8629 inline int encodeVLB8 ( BYTE * buf, uint64_t v )
8630 {
8631 	register BYTE b;
8632 	register int n = 0;
8633 
8634 	do {
8635 		b = (BYTE)(v & 0x7f);
8636 		v >>= 7;
8637 		if ( v )
8638 			b |= 0x80;
8639 		*buf++ = b;
8640 		n++;
8641 	} while ( v );
8642 	return n;
8643 }
8644 
8645 
encodeKeyword(BYTE * pBuf,const char * pKeyword)8646 inline int encodeKeyword ( BYTE * pBuf, const char * pKeyword )
8647 {
8648 	int iLen = strlen ( pKeyword ); // OPTIMIZE! remove this and memcpy and check if thats faster
8649 	assert ( iLen>0 && iLen<128 ); // so that ReadVLB()
8650 
8651 	*pBuf = (BYTE) iLen;
8652 	memcpy ( pBuf+1, pKeyword, iLen );
8653 	return 1+iLen;
8654 }
8655 
8656 
/// Encode a block of hits into the raw VLB format and write it to a file.
///
/// Hits are delta-encoded against the previous hit (word, then document, then
/// position), with zero bytes emitted as restart markers whenever the word or
/// the document changes. In hitless modes, hits of a word/document pair are
/// aggregated into a count and a fields mask instead of individual positions.
/// When docinfos are given, the attributes of every new document are emitted
/// right after its docid delta.
///
/// @param fd			output file descriptor
/// @param pHit			hits to encode (must not be NULL)
/// @param iHits		hits count (must be positive)
/// @param pDocinfo		optional docinfo rows to look attributes up in; may be NULL
/// @param iDocinfos	number of docinfo rows
/// @param iStride		docinfo row size, in DWORDs
/// @return				total number of bytes written, or -1 on write failure
int CSphIndex_VLN::cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int iDocinfos, int iStride )
{
	PROFILE ( write_hits );

	assert ( pHit );
	assert ( iHits>0 );

	/////////////////////////////
	// do simple bitwise hashing
	/////////////////////////////

	static const int HBITS = 11;
	static const int HSIZE = ( 1 << HBITS );

	// dHash[h] and dHash[h+1] bound the range of docinfo row indexes to search
	// for any docid whose shifted offset from iStartID equals h
	SphDocID_t iStartID = 0;
	int dHash [ HSIZE+1 ];
	int iShift = 0;

	if ( pDocinfo )
	{
		// pick a shift that maps the whole docid span of this block onto the hash table
		// NOTE(review): this relies on the rows being ordered by docid - confirm against the caller
		iStartID = DOCINFO2ID ( pDocinfo );
		int iBits = sphLog2 ( DOCINFO2ID ( pDocinfo + (iDocinfos-1)*iStride ) - iStartID );
		iShift = ( iBits<HBITS ) ? 0 : ( iBits-HBITS );

		#ifndef NDEBUG
		for ( int i=0; i<=HSIZE; i++ )
			dHash[i] = -1;
		#endif

		dHash[0] = 0;
		int iHashed = 0;
		for ( int i=0; i<iDocinfos; i++ )
		{
			int iHash = (int)( ( DOCINFO2ID ( pDocinfo+i*iStride ) - iStartID ) >> iShift );
			assert ( iHash>=0 && iHash<HSIZE );

			if ( iHash>iHashed )
			{
				dHash [ iHashed+1 ] = i-1; // right boundary for prev hash value
				dHash [ iHash ] = i; // left boundary for next hash value
				iHashed = iHash;
			}
		}
		dHash [ iHashed+1 ] = iDocinfos-1; // right boundary for last hash value
	}

	///////////////////////////////////////
	// encode through a small write buffer
	///////////////////////////////////////

	// d1/d2/d3 are the current word/doc/position deltas, l1/l2/l3 the previous values;
	// n is the total bytes written so far, w the size of the chunk being written
	BYTE *pBuf, *maxP;
	int n = 0, w;
	SphWordID_t d1, l1 = 0;
	SphDocID_t d2, l2 = 0;
	DWORD d3, l3 = 0; // !COMMIT must be wide enough
	bool bWordDict = m_pDict->GetSettings().m_bWordDict;

	// the buffer gets flushed once the write pointer passes maxP; iGap is the headroom
	// left for encoding one more hit (markers, deltas, a docinfo row, and a keyword)
	int iGap = Max ( 128, 16*sizeof(DWORD) + iStride*sizeof(DWORD) + ( bWordDict ? MAX_KEYWORD_BYTES : 0 ) );
	pBuf = m_pWriteBuffer;
	maxP = m_pWriteBuffer + m_iWriteBuffer - iGap;

	SphDocID_t iAttrID = 0; // current doc id
	DWORD * pAttrs = NULL; // current doc attrs

	// hit aggregation state
	DWORD uHitCount = 0;
	DWORD uHitFieldMask = 0;

	// in SPH_HITLESS_SOME mode, position deltas get shifted left by one bit, while
	// aggregate counts get their low bit set (see the flush code below)
	const int iPositionShift = m_tSettings.m_eHitless==SPH_HITLESS_SOME ? 1 : 0;

	while ( iHits-- )
	{
		// find attributes by id
		if ( pDocinfo && iAttrID!=pHit->m_iDocID )
		{
			int iHash = (int)( ( pHit->m_iDocID - iStartID ) >> iShift );
			assert ( iHash>=0 && iHash<HSIZE );

			int iStart = dHash[iHash];
			int iEnd = dHash[iHash+1];

			// check both range boundaries first, then bisect in between
			if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iStart*iStride ) )
			{
				pAttrs = DOCINFO2ATTRS ( pDocinfo + iStart*iStride );

			} else if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
			{
				pAttrs = DOCINFO2ATTRS ( pDocinfo + iEnd*iStride );

			} else
			{
				pAttrs = NULL;
				while ( iEnd-iStart>1 )
				{
					// check if nothing found
					if (
						pHit->m_iDocID < DOCINFO2ID ( pDocinfo + iStart*iStride ) ||
						pHit->m_iDocID > DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
							break;
					assert ( pHit->m_iDocID > DOCINFO2ID ( pDocinfo + iStart*iStride ) );
					assert ( pHit->m_iDocID < DOCINFO2ID ( pDocinfo + iEnd*iStride ) );

					int iMid = iStart + (iEnd-iStart)/2;
					if ( pHit->m_iDocID==DOCINFO2ID ( pDocinfo + iMid*iStride ) )
					{
						pAttrs = DOCINFO2ATTRS ( pDocinfo + iMid*iStride );
						break;
					}
					if ( pHit->m_iDocID<DOCINFO2ID ( pDocinfo + iMid*iStride ) )
						iEnd = iMid;
					else
						iStart = iMid;
				}
			}

			// every hit is expected to have a matching docinfo row
			// (only checked in debug builds)
			assert ( pAttrs );
			assert ( DOCINFO2ID ( pAttrs - DOCINFO_IDSIZE )==pHit->m_iDocID );
			iAttrID = pHit->m_iDocID;
		}

		// calc deltas
		d1 = pHit->m_iWordID - l1;
		d2 = pHit->m_iDocID - l2;
		d3 = pHit->m_iWordPos - l3;

		// ignore duplicate hits
		if ( d1==0 && d2==0 && d3==0 ) // OPTIMIZE? check if ( 0==(d1|d2|d3) ) is faster
		{
			pHit++;
			continue;
		}

		// non-zero delta restarts all the fields after it
		// because their deltas might now be negative
		if ( d1 ) d2 = pHit->m_iDocID;
		if ( d2 ) d3 = pHit->m_iWordPos;

		// when we moved to the next word or document
		bool bFlushed = false;
		if ( d1 || d2 )
		{
			// flush previous aggregate hit
			if ( uHitCount )
			{
				// we either skip all hits or the high bit must be available for marking
				// failing that, we can't produce a consistent index
				assert ( m_tSettings.m_eHitless!=SPH_HITLESS_NONE );
				assert ( m_tSettings.m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );

				if ( m_tSettings.m_eHitless!=SPH_HITLESS_ALL )
					uHitCount = ( uHitCount << 1 ) | 1;
				pBuf += encodeVLB ( pBuf, uHitCount );
				pBuf += encodeVLB ( pBuf, uHitFieldMask );
				assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );

				uHitCount = 0;
				uHitFieldMask = 0;

				bFlushed = true;
			}

			// start aggregating if we're skipping all hits or this word is in a list of ignored words
			if ( ( m_tSettings.m_eHitless==SPH_HITLESS_ALL ) ||
				( m_tSettings.m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.BinarySearch ( pHit->m_iWordID ) ) )
			{
				uHitCount = 1;
				// NOTE(review): the literal 1 is a signed int; confirm GetField() can never return 31 here, or consider 1U
				uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_iWordPos );
			}

		} else if ( uHitCount ) // next hit for the same word/doc pair, update state if we need it
		{
			uHitCount++;
			uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_iWordPos );
		}

		// encode enough restart markers
		// (the document-level marker is skipped when an aggregate was just flushed in its place)
		if ( d1 ) pBuf += encodeVLB ( pBuf, 0 );
		if ( d2 && !bFlushed ) pBuf += encodeVLB ( pBuf, 0 );

		assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );

		// encode deltas
#if USE_64BIT
#define LOC_ENCODE encodeVLB8
#else
#define LOC_ENCODE encodeVLB
#endif

		// encode keyword
		if ( d1 )
		{
			if ( bWordDict )
				pBuf += encodeKeyword ( pBuf, m_pDict->HitblockGetKeyword ( pHit->m_iWordID ) ); // keyword itself in case of keywords dict
			else
				pBuf += LOC_ENCODE ( pBuf, d1 ); // delta in case of CRC dict

			assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
		}

		// encode docid delta
		if ( d2 )
		{
			pBuf += LOC_ENCODE ( pBuf, d2 );
			assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
		}

#undef LOC_ENCODE

		// encode attrs
		// (the row without its docid part, one VLB-encoded value per DWORD)
		if ( d2 && pAttrs )
		{
			for ( int i=0; i<iStride-DOCINFO_IDSIZE; i++ )
			{
				pBuf += encodeVLB ( pBuf, pAttrs[i] );
				assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
			}
		}

		assert ( d3 );
		if ( !uHitCount ) // encode position delta, unless accumulating hits
		{
			pBuf += encodeVLB ( pBuf, d3 << iPositionShift );
			assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
		}

		// update current state
		l1 = pHit->m_iWordID;
		l2 = pHit->m_iDocID;
		l3 = pHit->m_iWordPos;

		pHit++;

		// flush the buffer once the headroom is used up
		if ( pBuf>maxP )
		{
			w = (int)(pBuf - m_pWriteBuffer);
			assert ( w<m_iWriteBuffer );
			if ( !sphWriteThrottled ( fd, m_pWriteBuffer, w, "raw_hits", m_sLastError ) )
				return -1;
			n += w;
			pBuf = m_pWriteBuffer;
		}
	}

	// flush last aggregate
	if ( uHitCount )
	{
		assert ( m_tSettings.m_eHitless!=SPH_HITLESS_NONE );
		assert ( m_tSettings.m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );

		if ( m_tSettings.m_eHitless!=SPH_HITLESS_ALL )
			uHitCount = ( uHitCount << 1 ) | 1;
		pBuf += encodeVLB ( pBuf, uHitCount );
		pBuf += encodeVLB ( pBuf, uHitFieldMask );

		assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
	}

	// terminate the block with three zero markers, and write out whatever is left
	pBuf += encodeVLB ( pBuf, 0 );
	pBuf += encodeVLB ( pBuf, 0 );
	pBuf += encodeVLB ( pBuf, 0 );
	assert ( pBuf<m_pWriteBuffer + m_iWriteBuffer );
	w = (int)(pBuf - m_pWriteBuffer);
	assert ( w<m_iWriteBuffer );
	if ( !sphWriteThrottled ( fd, m_pWriteBuffer, w, "raw_hits", m_sLastError ) )
		return -1;
	n += w;

	return n;
}
8926 
8927 /////////////////////////////////////////////////////////////////////////////
8928 
8929 // OPTIMIZE?
SPH_CMPAGGRHIT_LESS(const CSphAggregateHit & a,const CSphAggregateHit & b)8930 inline bool SPH_CMPAGGRHIT_LESS ( const CSphAggregateHit & a, const CSphAggregateHit & b )
8931 {
8932 	if ( a.m_iWordID < b.m_iWordID )
8933 		return true;
8934 
8935 	if ( a.m_iWordID > b.m_iWordID )
8936 		return false;
8937 
8938 	if ( a.m_sKeyword )
8939 	{
8940 		int iCmp = strcmp ( (char*)a.m_sKeyword, (char*)b.m_sKeyword ); // OPTIMIZE?
8941 		if ( iCmp!=0 )
8942 			return ( iCmp<0 );
8943 	}
8944 
8945 	return
8946 		( a.m_iDocID < b.m_iDocID ) ||
8947 		( a.m_iDocID==b.m_iDocID && a.m_iWordPos<b.m_iWordPos );
8948 }
8949 
8950 
/// hit priority queue entry
/// an aggregate hit plus the bin number that CSphHitQueue::Push() was given for it
struct CSphHitQueueEntry : public CSphAggregateHit
{
	int m_iBin;	///< bin number, stored as passed to CSphHitQueue::Push()
};
8956 
8957 
8958 /// hit priority queue
8959 struct CSphHitQueue
8960 {
8961 public:
8962 	CSphHitQueueEntry *		m_pData;
8963 	int						m_iSize;
8964 	int						m_iUsed;
8965 
8966 public:
8967 	/// create queue
CSphHitQueueCSphHitQueue8968 	explicit CSphHitQueue ( int iSize )
8969 	{
8970 		assert ( iSize>0 );
8971 		m_iSize = iSize;
8972 		m_iUsed = 0;
8973 		m_pData = new CSphHitQueueEntry [ iSize ];
8974 	}
8975 
8976 	/// destroy queue
~CSphHitQueueCSphHitQueue8977 	~CSphHitQueue ()
8978 	{
8979 		SafeDeleteArray ( m_pData );
8980 	}
8981 
8982 	/// add entry to the queue
PushCSphHitQueue8983 	void Push ( CSphAggregateHit & tHit, int iBin )
8984 	{
8985 		// check for overflow and do add
8986 		assert ( m_iUsed<m_iSize );
8987 		m_pData [ m_iUsed ].m_iDocID = tHit.m_iDocID;
8988 		m_pData [ m_iUsed ].m_iWordID = tHit.m_iWordID;
8989 		m_pData [ m_iUsed ].m_sKeyword = tHit.m_sKeyword; // bin must hold the actual data for the queue
8990 		m_pData [ m_iUsed ].m_iWordPos = tHit.m_iWordPos;
8991 		m_pData [ m_iUsed ].m_dFieldMask = tHit.m_dFieldMask;
8992 		m_pData [ m_iUsed ].m_iBin = iBin;
8993 
8994 		int iEntry = m_iUsed++;
8995 
8996 		// sift up if needed
8997 		while ( iEntry )
8998 		{
8999 			int iParent = ( iEntry-1 ) >> 1;
9000 			if ( SPH_CMPAGGRHIT_LESS ( m_pData[iEntry], m_pData[iParent] ) )
9001 			{
9002 				// entry is less than parent, should float to the top
9003 				Swap ( m_pData[iEntry], m_pData[iParent] );
9004 				iEntry = iParent;
9005 			} else
9006 			{
9007 				break;
9008 			}
9009 		}
9010 	}
9011 
9012 	/// remove root (ie. top priority) entry
PopCSphHitQueue9013 	void Pop ()
9014 	{
9015 		assert ( m_iUsed );
9016 		if ( !(--m_iUsed) ) // empty queue? just return
9017 			return;
9018 
9019 		// make the last entry my new root
9020 		m_pData[0] = m_pData[m_iUsed];
9021 
9022 		// sift down if needed
9023 		int iEntry = 0;
9024 		for ( ;; )
9025 		{
9026 			// select child
9027 			int iChild = (iEntry<<1) + 1;
9028 			if ( iChild>=m_iUsed )
9029 				break;
9030 
9031 			// select smallest child
9032 			if ( iChild+1<m_iUsed )
9033 				if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild+1], m_pData[iChild] ) )
9034 					iChild++;
9035 
9036 			// if smallest child is less than entry, do float it to the top
9037 			if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild], m_pData[iEntry] ) )
9038 			{
9039 				Swap ( m_pData[iChild], m_pData[iEntry] );
9040 				iEntry = iChild;
9041 				continue;
9042 			}
9043 
9044 			break;
9045 		}
9046 	}
9047 };
9048 
9049 
9050 struct CmpQueuedDocinfo_fn
9051 {
9052 	static DWORD *	m_pStorage;
9053 	static int		m_iStride;
9054 
IsLessCmpQueuedDocinfo_fn9055 	static inline bool IsLess ( const int a, const int b )
9056 	{
9057 		return DOCINFO2ID ( m_pStorage + a*m_iStride ) < DOCINFO2ID ( m_pStorage + b*m_iStride );
9058 	}
9059 };
9060 DWORD *		CmpQueuedDocinfo_fn::m_pStorage		= NULL;
9061 int			CmpQueuedDocinfo_fn::m_iStride		= 1;
9062 
9063 
// NOTE(review): neither constant is used in the nearby code; judging by the names
// only, the first one presumably caps a per-source hits buffer, and the second one
// is presumably a minimum size (4*1048576 = 4194304) for the keywords dictionary - confirm
#define MAX_SOURCE_HITS	32768
static const int MIN_KEYWORDS_DICT	= 4*1048576;	// FIXME! ideally must be in sync with impl (ENTRY_CHUNKS, KEYWORD_CHUNKS)
9066 
9067 /////////////////////////////////////////////////////////////////////////////
9068 
9069 struct MvaEntry_t
9070 {
9071 	SphDocID_t	m_uDocID;
9072 	int			m_iAttr;
9073 	int64_t		m_iValue;
9074 
operator <MvaEntry_t9075 	inline bool operator < ( const MvaEntry_t & rhs ) const
9076 	{
9077 		if ( m_uDocID!=rhs.m_uDocID ) return m_uDocID<rhs.m_uDocID;
9078 		if ( m_iAttr!=rhs.m_iAttr ) return m_iAttr<rhs.m_iAttr;
9079 		return m_iValue<rhs.m_iValue;
9080 	}
9081 };
9082 
9083 
/// MVA entry extended with an integer tag
/// NOTE(review): the tag meaning is not visible here; presumably it identifies
/// where the entry came from (eg. a source block) - confirm against the users
struct MvaEntryTag_t : public MvaEntry_t
{
	int			m_iTag;
};
9088 
9089 
9090 struct MvaEntryCmp_fn
9091 {
IsLessMvaEntryCmp_fn9092 	static inline bool IsLess ( const MvaEntry_t & a, const MvaEntry_t & b )
9093 	{
9094 		return a<b;
9095 	}
9096 };
9097 
9098 
BuildMVA(const CSphVector<CSphSource * > & dSources,CSphAutoArray<CSphWordHit> & dHits,int iArenaSize,int iFieldFD,int nFieldMVAs,int iFieldMVAInPool)9099 bool CSphIndex_VLN::BuildMVA ( const CSphVector<CSphSource*> & dSources,
9100 		CSphAutoArray<CSphWordHit> & dHits, int iArenaSize, int iFieldFD,
9101 		int nFieldMVAs, int iFieldMVAInPool )
9102 {
9103 	// initialize writer (data file must always exist)
9104 	CSphWriter wrMva;
9105 	if ( !wrMva.OpenFile ( GetIndexFileName("spm"), m_sLastError ) )
9106 		return false;
9107 
9108 	// calcs and checks
9109 	bool bOnlyFieldMVAs = true;
9110 	CSphVector<int> dMvaIndexes;
9111 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
9112 	{
9113 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
9114 		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
9115 		{
9116 			dMvaIndexes.Add ( i );
9117 			if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
9118 				bOnlyFieldMVAs = false;
9119 		}
9120 	}
9121 	int iMva64 = dMvaIndexes.GetLength();
9122 	// mva32 first
9123 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
9124 	{
9125 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
9126 		if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
9127 		{
9128 			dMvaIndexes.Add ( i );
9129 			if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
9130 				bOnlyFieldMVAs = false;
9131 		}
9132 	}
9133 
9134 	if ( dMvaIndexes.GetLength()<=0 )
9135 		return true;
9136 
9137 	// reuse hits pool
9138 	CSphWordHit * pArena = dHits;
9139 	MvaEntry_t * pMvaPool = (MvaEntry_t*) pArena;
9140 	MvaEntry_t * pMvaMax = pMvaPool + ( iArenaSize/sizeof(MvaEntry_t) );
9141 	MvaEntry_t * pMva = pMvaPool;
9142 
9143 	// create temp file
9144 	CSphAutofile fdTmpMva ( GetIndexFileName("tmp3"), SPH_O_NEW, m_sLastError, true );
9145 	if ( fdTmpMva.GetFD()<0 )
9146 		return false;
9147 
9148 	//////////////////////////////
9149 	// collect and partially sort
9150 	//////////////////////////////
9151 
9152 	CSphVector<int> dBlockLens;
9153 	dBlockLens.Reserve ( 1024 );
9154 
9155 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT_MVA;
9156 
9157 	if ( !bOnlyFieldMVAs )
9158 	{
9159 		ARRAY_FOREACH ( iSource, dSources )
9160 		{
9161 			CSphSource * pSource = dSources[iSource];
9162 			if ( !pSource->Connect ( m_sLastError ) )
9163 				return false;
9164 
9165 			ARRAY_FOREACH ( i, dMvaIndexes )
9166 			{
9167 				int iAttr = dMvaIndexes[i];
9168 				const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);
9169 
9170 				if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
9171 					continue;
9172 
9173 				if ( !pSource->IterateMultivaluedStart ( iAttr, m_sLastError ) )
9174 					return false;
9175 
9176 				while ( pSource->IterateMultivaluedNext () )
9177 				{
9178 					pMva->m_uDocID = pSource->m_tDocInfo.m_iDocID;
9179 					pMva->m_iAttr = i;
9180 					if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
9181 					{
9182 						pMva->m_iValue = pSource->m_dMva[0];
9183 					} else
9184 					{
9185 						pMva->m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() );
9186 					}
9187 
9188 					if ( ++pMva>=pMvaMax )
9189 					{
9190 						sphSort ( pMvaPool, pMva-pMvaPool );
9191 						if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError ) )
9192 							return false;
9193 
9194 						dBlockLens.Add ( pMva-pMvaPool );
9195 						m_tProgress.m_iAttrs += pMva-pMvaPool;
9196 						pMva = pMvaPool;
9197 
9198 						if ( m_pProgress )
9199 							m_pProgress ( &m_tProgress, false );
9200 					}
9201 				}
9202 			}
9203 
9204 			pSource->Disconnect ();
9205 		}
9206 
9207 		if ( pMva>pMvaPool )
9208 		{
9209 			sphSort ( pMvaPool, pMva-pMvaPool );
9210 			if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError ) )
9211 				return false;
9212 
9213 			dBlockLens.Add ( pMva-pMvaPool );
9214 			m_tProgress.m_iAttrs += pMva-pMvaPool;
9215 		}
9216 	}
9217 
9218 	if ( m_pProgress )
9219 		m_pProgress ( &m_tProgress, true );
9220 
9221 	///////////////////////////
9222 	// free memory for sorting
9223 	///////////////////////////
9224 
9225 	dHits.Reset ();
9226 
9227 	//////////////
9228 	// fully sort
9229 	//////////////
9230 
9231 	if ( m_pProgress )
9232 	{
9233 		m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT_MVA;
9234 		m_tProgress.m_iAttrs = m_tProgress.m_iAttrs + nFieldMVAs;
9235 		m_tProgress.m_iAttrsTotal = m_tProgress.m_iAttrs;
9236 		m_pProgress ( &m_tProgress, false );
9237 	}
9238 
9239 	int	nLastBlockFieldMVAs = iFieldMVAInPool ? ( nFieldMVAs % iFieldMVAInPool ) : 0;
9240 	int nFieldBlocks = iFieldMVAInPool ? ( nFieldMVAs / iFieldMVAInPool + ( nLastBlockFieldMVAs ? 1 : 0 ) ) : 0;
9241 
9242 	// initialize readers
9243 	CSphVector<CSphBin*> dBins;
9244 	dBins.Reserve ( dBlockLens.GetLength() + nFieldBlocks );
9245 
9246 	int iBinSize = CSphBin::CalcBinSize ( iArenaSize, dBlockLens.GetLength() + nFieldBlocks, "sort_mva" );
9247 	SphOffset_t iSharedOffset = -1;
9248 
9249 	ARRAY_FOREACH ( i, dBlockLens )
9250 	{
9251 		dBins.Add ( new CSphBin() );
9252 		dBins[i]->m_iFileLeft = dBlockLens[i]*sizeof(MvaEntry_t);
9253 		dBins[i]->m_iFilePos = ( i==0 ) ? 0 : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
9254 		dBins[i]->Init ( fdTmpMva.GetFD(), &iSharedOffset, iBinSize );
9255 	}
9256 
9257 	SphOffset_t iSharedFieldOffset = -1;
9258 	SphOffset_t uStart = 0;
9259 	for ( int i = 0; i < nFieldBlocks; i++ )
9260 	{
9261 		dBins.Add ( new CSphBin() );
9262 		int iBin = dBins.GetLength () - 1;
9263 
9264 		dBins[iBin]->m_iFileLeft = ( i==nFieldBlocks-1 ? ( nLastBlockFieldMVAs ? nLastBlockFieldMVAs : iFieldMVAInPool ): iFieldMVAInPool ) * sizeof(MvaEntry_t);
9265 		dBins[iBin]->m_iFilePos = uStart;
9266 		dBins[iBin]->Init ( iFieldFD, &iSharedFieldOffset, iBinSize );
9267 
9268 		uStart += dBins [iBin]->m_iFileLeft;
9269 	}
9270 
9271 	// do the sort
9272 	CSphQueue < MvaEntryTag_t, MvaEntryCmp_fn > qMva ( Max ( 1, dBins.GetLength() ) );
9273 	ARRAY_FOREACH ( i, dBins )
9274 	{
9275 		MvaEntryTag_t tEntry;
9276 		if ( dBins[i]->ReadBytes ( (MvaEntry_t*) &tEntry, sizeof(MvaEntry_t) )!=BIN_READ_OK )
9277 		{
9278 			m_sLastError.SetSprintf ( "sort_mva: warmup failed (io error?)" );
9279 			return false;
9280 		}
9281 
9282 		tEntry.m_iTag = i;
9283 		qMva.Push ( tEntry );
9284 	}
9285 
9286 	// spm-file := info-list [ 0+ ]
9287 	// info-list := docid, values-list [ index.schema.mva-count ]
9288 	// values-list := values-count, value [ values-count ]
9289 	// note that mva32 come first then mva64
9290 	SphDocID_t uCurID = 0;
9291 	CSphVector < CSphVector<int64_t> > dCurInfo;
9292 	dCurInfo.Resize ( dMvaIndexes.GetLength() );
9293 
9294 	for ( ;; )
9295 	{
9296 		// flush previous per-document info-list
9297 		if ( !qMva.GetLength() || qMva.Root().m_uDocID!=uCurID )
9298 		{
9299 			if ( uCurID )
9300 			{
9301 				wrMva.PutDocid ( uCurID );
9302 				ARRAY_FOREACH ( i, dCurInfo )
9303 				{
9304 					int iLen = dCurInfo[i].GetLength();
9305 					if ( i>=iMva64 )
9306 					{
9307 						wrMva.PutDword ( iLen*2 );
9308 						wrMva.PutBytes ( dCurInfo[i].Begin(), sizeof(int64_t)*iLen );
9309 					} else
9310 					{
9311 						wrMva.PutDword ( iLen );
9312 						ARRAY_FOREACH ( iVal, dCurInfo[i] )
9313 						{
9314 							wrMva.PutDword ( (DWORD)dCurInfo[i][iVal] );
9315 						}
9316 					}
9317 				}
9318 			}
9319 
9320 			if ( !qMva.GetLength() )
9321 				break;
9322 
9323 			uCurID = qMva.Root().m_uDocID;
9324 			ARRAY_FOREACH ( i, dCurInfo )
9325 				dCurInfo[i].Resize ( 0 );
9326 		}
9327 
9328 		// accumulate this entry
9329 #if PARANOID
9330 		assert ( dCurInfo [ qMva.Root().m_iAttr ].GetLength()==0
9331 			|| dCurInfo [ qMva.Root().m_iAttr ].Last()<=qMva.Root().m_iValue );
9332 #endif
9333 		dCurInfo [ qMva.Root().m_iAttr ].AddUnique ( qMva.Root().m_iValue );
9334 
9335 		// get next entry
9336 		int iBin = qMva.Root().m_iTag;
9337 		qMva.Pop ();
9338 
9339 		MvaEntryTag_t tEntry;
9340 		ESphBinRead iRes = dBins[iBin]->ReadBytes ( (MvaEntry_t*)&tEntry, sizeof(MvaEntry_t) );
9341 		tEntry.m_iTag = iBin;
9342 
9343 		if ( iRes==BIN_READ_OK )
9344 			qMva.Push ( tEntry );
9345 
9346 		if ( iRes==BIN_READ_ERROR )
9347 		{
9348 			m_sLastError.SetSprintf ( "sort_mva: read error" );
9349 			return false;
9350 		}
9351 	}
9352 
9353 	// clean up readers
9354 	ARRAY_FOREACH ( i, dBins )
9355 		SafeDelete ( dBins[i] );
9356 
9357 	wrMva.CloseFile ();
9358 	if ( wrMva.IsError() )
9359 		return false;
9360 
9361 	if ( m_pProgress )
9362 		m_pProgress ( &m_tProgress, true );
9363 
9364 	return true;
9365 }
9366 
9367 
9368 struct CmpOrdinalsValue_fn
9369 {
IsLessCmpOrdinalsValue_fn9370 	inline bool IsLess ( const Ordinal_t & a, const Ordinal_t & b ) const
9371 	{
9372 		return strcmp ( a.m_sValue.cstr(), b.m_sValue.cstr() )<0;
9373 	}
9374 };
9375 
9376 struct CmpOrdinalsEntry_fn
9377 {
IsLessCmpOrdinalsEntry_fn9378 	static inline bool IsLess ( const OrdinalEntry_t & a, const OrdinalEntry_t & b )
9379 	{
9380 		return strcmp ( a.m_sValue.cstr(), b.m_sValue.cstr() )<0;
9381 	}
9382 };
9383 
9384 struct CmpOrdinalsDocid_fn
9385 {
IsLessCmpOrdinalsDocid_fn9386 	inline bool IsLess ( const OrdinalId_t & a, const OrdinalId_t & b ) const
9387 	{
9388 		return a.m_uDocID < b.m_uDocID;
9389 	}
9390 };
9391 
9392 
9393 struct CmpMvaEntries_fn
9394 {
IsLessCmpMvaEntries_fn9395 	inline bool IsLess ( const MvaEntry_t & a, const MvaEntry_t & b ) const
9396 	{
9397 		return a<b;
9398 	}
9399 };
9400 
9401 struct CmpOrdinalIdEntry_fn
9402 {
IsLessCmpOrdinalIdEntry_fn9403 	static inline bool IsLess ( const OrdinalIdEntry_t & a, const OrdinalIdEntry_t & b )
9404 	{
9405 		return a.m_uDocID < b.m_uDocID;
9406 	}
9407 };
9408 
9409 
DumpOrdinals(CSphWriter & Writer,CSphVector<Ordinal_t> & dOrdinals)9410 SphOffset_t CSphIndex_VLN::DumpOrdinals ( CSphWriter & Writer, CSphVector<Ordinal_t> & dOrdinals )
9411 {
9412 	SphOffset_t uSize = ( sizeof ( SphDocID_t ) + sizeof ( DWORD ) ) * dOrdinals.GetLength ();
9413 
9414 	ARRAY_FOREACH ( i, dOrdinals )
9415 	{
9416 		Ordinal_t & Ord = dOrdinals[i];
9417 
9418 		DWORD uValueLen = Ord.m_sValue.cstr () ? strlen ( Ord.m_sValue.cstr () ) : 0;
9419 		Writer.PutBytes ( &(Ord.m_uDocID), sizeof ( Ord.m_uDocID ) );
9420 		Writer.PutBytes ( &uValueLen, sizeof ( uValueLen ) );
9421 		Writer.PutBytes ( Ord.m_sValue.cstr (), uValueLen );
9422 		uSize += uValueLen;
9423 
9424 		if ( Writer.IsError () )
9425 			return 0;
9426 	}
9427 
9428 	return uSize;
9429 }
9430 
9431 
ReadOrdinal(CSphBin & Reader,Ordinal_t & Ordinal)9432 ESphBinRead CSphIndex_VLN::ReadOrdinal ( CSphBin & Reader, Ordinal_t & Ordinal )
9433 {
9434 	ESphBinRead eRes = Reader.ReadBytes ( &Ordinal.m_uDocID, sizeof ( Ordinal.m_uDocID ) );
9435 	if ( eRes!=BIN_READ_OK )
9436 		return eRes;
9437 
9438 	DWORD uStrLen;
9439 	eRes = Reader.ReadBytes ( &uStrLen, sizeof ( DWORD ) );
9440 	if ( eRes!=BIN_READ_OK )
9441 		return eRes;
9442 
9443 	if ( uStrLen>=(DWORD)MAX_ORDINAL_STR_LEN )
9444 		return BIN_READ_ERROR;
9445 
9446 	char dBuffer [MAX_ORDINAL_STR_LEN];
9447 
9448 	if ( uStrLen > 0 )
9449 	{
9450 		eRes = Reader.ReadBytes ( dBuffer, uStrLen );
9451 		if ( eRes!=BIN_READ_OK )
9452 			return eRes;
9453 	}
9454 
9455 	dBuffer [uStrLen] = '\0';
9456 	Ordinal.m_sValue = dBuffer;
9457 
9458 	return BIN_READ_OK;
9459 }
9460 
9461 
// Merges the raw ordinal blocks of every attribute and assigns ordinal ids.
//
// For each attribute, the string records of all its blocks are merged through
// a queue ordered by string value (CmpOrdinalsEntry_fn). Every document gets
// the current ordinal id, and the id is bumped whenever the next value differs
// from the previous one. The resulting ( docid, id ) pairs are written to
// szToFile in pools of up to iOrdinalsInPool entries, each sorted by docid.
//
// szToFile			output file name
// iFromFD			file with raw ordinal records; for every block it holds the
//					data of all attributes one after another (see dStarts below)
// iArenaSize		memory budget used to compute the per-bin buffer size
// iOrdinalsInPool	max entries per output pool
// dOrdBlockSize	in: byte size of every raw block, per attribute;
//					out: byte size of every written pool, per attribute
// bWarnOfMem		passed through to CSphBin::CalcBinSize()
//
// Returns false on failure.
bool CSphIndex_VLN::SortOrdinals ( const char * szToFile, int iFromFD, int iArenaSize, int iOrdinalsInPool, CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem )
{
	int nAttrs = dOrdBlockSize.GetLength ();
	// NOTE(review): reads element 0 unconditionally, so at least one attribute is expected
	int nBlocks = dOrdBlockSize[0].GetLength ();

	CSphWriter Writer;
	if ( !Writer.OpenFile ( szToFile, m_sLastError ) )
		return false;

	int iBinSize = CSphBin::CalcBinSize ( iArenaSize, nBlocks, "ordinals", bWarnOfMem );
	SphOffset_t iSharedOffset = -1;

	CSphQueue < OrdinalEntry_t, CmpOrdinalsEntry_fn > qOrdinals ( Max ( 1, nBlocks ) );
	OrdinalEntry_t tOrdinalEntry;
	// declared outside of the attribute loop, so ids keep growing across attributes
	DWORD uOrdinalId = 0;

	CSphVector < OrdinalId_t > dOrdinalIdPool;
	dOrdinalIdPool.Reserve ( nBlocks );

	// file offset of every ( attribute, block ) chunk in the input file
	CSphVector < CSphVector < SphOffset_t > > dStarts;
	dStarts.Resize ( nAttrs );
	ARRAY_FOREACH ( i, dStarts )
		dStarts[i].Resize ( nBlocks );

	// chunks are laid out block by block, attributes in order within a block
	SphOffset_t uStart = 0;
	for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
		for ( int iAttr = 0; iAttr < nAttrs; iAttr++ )
		{
			dStarts [iAttr][iBlock] = uStart;
			uStart += dOrdBlockSize [iAttr][iBlock];
		}

	for ( int iAttr = 0; iAttr < nAttrs; iAttr++ )
	{
		// one reader per block of this attribute
		CSphVector < CSphBin > dBins;
		dBins.Resize ( nBlocks );

		ARRAY_FOREACH ( i, dBins )
		{
			dBins[i].m_iFileLeft = (int)dOrdBlockSize[iAttr][i];
			dBins[i].m_iFilePos = dStarts[iAttr][i];
			dBins[i].Init ( iFromFD, &iSharedOffset, iBinSize );
		}

		// input sizes are consumed; from now on this collects output pool sizes
		dOrdBlockSize [iAttr].Resize ( 0 );

		// prime the queue with the first record of every block
		for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
		{
			if ( ReadOrdinal ( dBins [iBlock], tOrdinalEntry )!=BIN_READ_OK )
			{
				m_sLastError = "sort_ordinals: warmup failed (io error?)";
				return false;
			}

			tOrdinalEntry.m_iTag = iBlock;
			qOrdinals.Push ( tOrdinalEntry );
		}

		SphDocID_t uCurID = 0;

		CSphString sLastOrdValue;
		int iMyBlock = 0;

		for ( ;; )
		{
			// queue is drained, or the root belongs to another document
			if ( !qOrdinals.GetLength () || qOrdinals.Root ().m_uDocID!=uCurID )
			{
				// emit the id for the document handled last (docid 0 means none yet)
				if ( uCurID )
				{
					OrdinalId_t tId;
					tId.m_uDocID = uCurID;
					tId.m_uId = uOrdinalId;
					dOrdinalIdPool.Add ( tId );

					// bump the id if the upcoming value differs from the last one;
					// the first character is checked first as a shortcut before strcmp()
					if ( qOrdinals.GetLength () > 0 )
					{
						if ( sLastOrdValue.cstr()[0]!=qOrdinals.Root ().m_sValue.cstr()[0] )
							uOrdinalId++;
						else
							if ( strcmp ( sLastOrdValue.cstr (), qOrdinals.Root ().m_sValue.cstr () ) )
								uOrdinalId++;
					}

					// pool is full; sort it by docid and flush it as one block
					if ( dOrdinalIdPool.GetLength()==iOrdinalsInPool )
					{
						dOrdinalIdPool.Sort ( CmpOrdinalsDocid_fn () );
						Writer.PutBytes ( &dOrdinalIdPool[0], sizeof(OrdinalId_t)*dOrdinalIdPool.GetLength() );
						if ( Writer.IsError () )
						{
							m_sLastError = "sort_ordinals: io error";
							return false;
						}

						dOrdBlockSize [iAttr].Add ( dOrdinalIdPool.GetLength () * sizeof ( OrdinalId_t ) );
						dOrdinalIdPool.Resize ( 0 );
					}
				}

				if ( !qOrdinals.GetLength () )
					break;

				// take over the root's string instead of copying it;
				// the root itself gets popped right below
				uCurID = qOrdinals.Root().m_uDocID;
				const_cast < CSphString & > ( qOrdinals.Root ().m_sValue ).Swap ( sLastOrdValue );
			}

			// get next entry
			iMyBlock = qOrdinals.Root().m_iTag;
			qOrdinals.Pop ();

			// refill from the very block the popped entry came from
			ESphBinRead eRes = ReadOrdinal ( dBins [iMyBlock], tOrdinalEntry );
			tOrdinalEntry.m_iTag = iMyBlock;
			if ( eRes==BIN_READ_OK )
				qOrdinals.Push ( tOrdinalEntry );

			if ( eRes==BIN_READ_ERROR )
			{
				m_sLastError = "sort_ordinals: read error";
				return false;
			}
		}

		// flush last ordinal ids
		if ( dOrdinalIdPool.GetLength () )
		{
			dOrdinalIdPool.Sort ( CmpOrdinalsDocid_fn () );
			Writer.PutBytes ( &dOrdinalIdPool[0], sizeof(OrdinalId_t)*dOrdinalIdPool.GetLength () );
			if ( Writer.IsError () )
			{
				m_sLastError = "sort_ordinals: io error";
				return false;
			}

			dOrdBlockSize [iAttr].Add ( dOrdinalIdPool.GetLength()*sizeof(OrdinalId_t) );
			dOrdinalIdPool.Resize ( 0 );
		}
	}

	Writer.CloseFile ();
	if ( Writer.IsError () )
		return false;

	return true;
}
9605 
9606 
SortOrdinalIds(const char * szToFile,int iFromFD,int iArenaSize,CSphVector<CSphVector<SphOffset_t>> & dOrdBlockSize,bool bWarnOfMem)9607 bool CSphIndex_VLN::SortOrdinalIds ( const char * szToFile, int iFromFD, int iArenaSize, CSphVector < CSphVector < SphOffset_t > > & dOrdBlockSize, bool bWarnOfMem )
9608 {
9609 	int nAttrs = dOrdBlockSize.GetLength ();
9610 	int nMaxBlocks = 0;
9611 	ARRAY_FOREACH ( i, dOrdBlockSize )
9612 		if ( dOrdBlockSize[i].GetLength () > nMaxBlocks )
9613 			nMaxBlocks = dOrdBlockSize[i].GetLength ();
9614 
9615 	CSphWriter Writer;
9616 	if ( !Writer.OpenFile ( szToFile, m_sLastError ) )
9617 		return false;
9618 
9619 	int iBinSize = CSphBin::CalcBinSize ( iArenaSize, nMaxBlocks, "ordinals", bWarnOfMem );
9620 
9621 	SphOffset_t uStart = 0;
9622 	OrdinalIdEntry_t tOrdinalIdEntry;
9623 	OrdinalId_t tOrdinalId;
9624 
9625 	for ( int iAttr = 0; iAttr < nAttrs; ++iAttr )
9626 	{
9627 		int nBlocks = dOrdBlockSize [iAttr].GetLength ();
9628 		CSphQueue < OrdinalIdEntry_t, CmpOrdinalIdEntry_fn > qOrdinalIds ( Max ( 1, nBlocks ) );
9629 		CSphVector < CSphBin > dBins;
9630 		dBins.Resize ( nBlocks );
9631 
9632 		SphOffset_t iSharedOffset = -1;
9633 
9634 		ARRAY_FOREACH ( i, dBins )
9635 		{
9636 			dBins[i].m_iFileLeft = (int)dOrdBlockSize [iAttr][i];
9637 			dBins[i].m_iFilePos = uStart;
9638 			dBins[i].Init ( iFromFD, &iSharedOffset, iBinSize );
9639 
9640 			uStart += dBins[i].m_iFileLeft;
9641 		}
9642 
9643 		for ( int iBlock = 0; iBlock < nBlocks; iBlock++ )
9644 		{
9645 			if ( dBins[iBlock].ReadBytes ( &tOrdinalId, sizeof ( tOrdinalId ) )!=BIN_READ_OK )
9646 			{
9647 				m_sLastError = "sort_ordinals: warmup failed (io error?)";
9648 				return false;
9649 			}
9650 
9651 			tOrdinalIdEntry.m_uDocID = tOrdinalId.m_uDocID;
9652 			tOrdinalIdEntry.m_uId = tOrdinalId.m_uId;
9653 			tOrdinalIdEntry.m_iTag = iBlock;
9654 			qOrdinalIds.Push ( tOrdinalIdEntry );
9655 		}
9656 
9657 		OrdinalId_t tCachedId;
9658 		tCachedId.m_uDocID = 0;
9659 
9660 		SphOffset_t uResultSize = 0;
9661 
9662 		for ( ;; )
9663 		{
9664 			if ( !qOrdinalIds.GetLength () || qOrdinalIds.Root ().m_uDocID!=tCachedId.m_uDocID )
9665 			{
9666 				if ( tCachedId.m_uDocID )
9667 				{
9668 					uResultSize += sizeof ( OrdinalId_t );
9669 					Writer.PutBytes ( &tCachedId, sizeof ( OrdinalId_t ) );
9670 					if ( Writer.IsError () )
9671 					{
9672 						m_sLastError = "sort_ordinals: io error";
9673 						return false;
9674 					}
9675 				}
9676 
9677 				if ( !qOrdinalIds.GetLength () )
9678 					break;
9679 
9680 				tCachedId.m_uDocID = qOrdinalIds.Root().m_uDocID;
9681 				tCachedId.m_uId = qOrdinalIds.Root ().m_uId;
9682 			}
9683 
9684 			// get next entry
9685 			int iBlock = qOrdinalIds.Root().m_iTag;
9686 			qOrdinalIds.Pop ();
9687 
9688 			ESphBinRead eRes = dBins [iBlock].ReadBytes ( &tOrdinalId, sizeof ( tOrdinalId ) );
9689 			tOrdinalIdEntry.m_uDocID = tOrdinalId.m_uDocID;
9690 			tOrdinalIdEntry.m_uId = tOrdinalId.m_uId;
9691 			tOrdinalIdEntry.m_iTag = iBlock;
9692 			if ( eRes==BIN_READ_OK )
9693 				qOrdinalIds.Push ( tOrdinalIdEntry );
9694 
9695 			if ( eRes==BIN_READ_ERROR )
9696 			{
9697 				m_sLastError = "sort_ordinals: read error";
9698 				return false;
9699 			}
9700 		}
9701 
9702 		dOrdBlockSize [iAttr].Resize ( 0 );
9703 		dOrdBlockSize [iAttr].Add ( uResultSize );
9704 	}
9705 
9706 	return true;
9707 }
9708 
/// per-attribute record for field MVAs; Build() keeps these in dFieldMvaIndexes
/// NOTE(review): the code that fills and reads the fields is not visible here;
/// the descriptions below are derived from the names and should be confirmed
struct FieldMVARedirect_t
{
	CSphAttrLocator		m_tLocator;		// presumably the attribute's locator in the docinfo row
	int					m_iAttr;		// presumably the attribute index in the schema
	int					m_iMVAAttr;		// presumably the attribute index among MVAs
	bool				m_bMva64;		// presumably set for 64-bit MVAs
};
9716 
9717 
RelocateBlock(int iFile,BYTE * pBuffer,int iRelocationSize,SphOffset_t * pFileSize,CSphBin * pMinBin,SphOffset_t * pSharedOffset)9718 bool CSphIndex_VLN::RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize, SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset )
9719 {
9720 	assert ( pBuffer && pFileSize && pMinBin && pSharedOffset );
9721 
9722 	SphOffset_t iBlockStart = pMinBin->m_iFilePos;
9723 	SphOffset_t iBlockLeft = pMinBin->m_iFileLeft;
9724 
9725 	ESphBinRead eRes = pMinBin->Precache ();
9726 	switch ( eRes )
9727 	{
9728 	case BIN_PRECACHE_OK:
9729 		return true;
9730 	case BIN_READ_ERROR:
9731 		m_sLastError = "block relocation: preread error";
9732 		return false;
9733 	default:
9734 		break;
9735 	}
9736 
9737 	int nTransfers = (int)( ( iBlockLeft+iRelocationSize-1) / iRelocationSize );
9738 
9739 	SphOffset_t uTotalRead = 0;
9740 	SphOffset_t uNewBlockStart = *pFileSize;
9741 
9742 	for ( int i = 0; i < nTransfers; i++ )
9743 	{
9744 		sphSeek ( iFile, iBlockStart + uTotalRead, SEEK_SET );
9745 
9746 		int iToRead = i==nTransfers-1 ? (int)( iBlockLeft % iRelocationSize ) : iRelocationSize;
9747 		size_t iRead = sphReadThrottled ( iFile, pBuffer, iToRead );
9748 		if ( iRead!=size_t(iToRead) )
9749 		{
9750 			m_sLastError.SetSprintf ( "block relocation: read error (%d of %d bytes read): %s", (int)iRead, iToRead, strerror(errno) );
9751 			return false;
9752 		}
9753 
9754 		sphSeek ( iFile, *pFileSize, SEEK_SET );
9755 		uTotalRead += iToRead;
9756 
9757 		if ( !sphWriteThrottled ( iFile, pBuffer, iToRead, "block relocation", m_sLastError ) )
9758 			return false;
9759 
9760 		*pFileSize += iToRead;
9761 	}
9762 
9763 	assert ( uTotalRead==iBlockLeft );
9764 
9765 	// update block pointers
9766 	pMinBin->m_iFilePos = uNewBlockStart;
9767 	*pSharedOffset = *pFileSize;
9768 
9769 	return true;
9770 }
9771 
9772 
CountWords(const CSphString & sData,ISphTokenizer * pTokenizer)9773 static int CountWords ( const CSphString & sData, ISphTokenizer * pTokenizer )
9774 {
9775 	BYTE * sField = (BYTE*) sData.cstr();
9776 	if ( !sField )
9777 		return 0;
9778 
9779 	int iCount = 0;
9780 	pTokenizer->SetBuffer ( sField, (int)strlen ( (char*)sField ) );
9781 	while ( pTokenizer->GetToken() )
9782 		iCount++;
9783 	return iCount;
9784 }
9785 
LoadHitlessWords()9786 bool CSphIndex_VLN::LoadHitlessWords ()
9787 {
9788 	assert ( m_dHitlessWords.GetLength()==0 );
9789 
9790 	if ( m_tSettings.m_sHitlessFiles.IsEmpty() )
9791 		return true;
9792 
9793 	const char * szStart = m_tSettings.m_sHitlessFiles.cstr();
9794 
9795 	while ( *szStart )
9796 	{
9797 		while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
9798 			++szStart;
9799 
9800 		if ( !*szStart )
9801 			break;
9802 
9803 		const char * szWordStart = szStart;
9804 
9805 		while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
9806 			++szStart;
9807 
9808 		if ( szStart - szWordStart > 0 )
9809 		{
9810 			CSphString sFilename;
9811 			sFilename.SetBinary ( szWordStart, szStart-szWordStart );
9812 
9813 			CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, m_sLastError );
9814 			if ( tFile.GetFD()==-1 )
9815 				return false;
9816 
9817 			CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
9818 			if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), m_sLastError ) )
9819 				return false;
9820 
9821 			// FIXME!!! dict=keywords + hitless_words=some
9822 			m_pTokenizer->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
9823 			while ( BYTE * sToken = m_pTokenizer->GetToken() )
9824 				m_dHitlessWords.Add ( m_pDict->GetWordID ( sToken ) );
9825 		}
9826 	}
9827 
9828 	m_dHitlessWords.Uniq();
9829 	return true;
9830 }
9831 
9832 
// Truncates the file at its current position; returns true on success.
static bool sphTruncate ( int iFD )
{
#if USE_WINDOWS
	const HANDLE hFile = (HANDLE) _get_osfhandle(iFD);
	return SetEndOfFile ( hFile )!=0;
#else
	// the current position becomes the new file length
	const off_t iPos = ::lseek ( iFD, 0, SEEK_CUR );
	const int iRes = ::ftruncate ( iFD, iPos );
	return iRes==0;
#endif
}
9841 
9842 class DeleteOnFail : public ISphNoncopyable
9843 {
9844 public:
DeleteOnFail()9845 	DeleteOnFail() : m_bShitHappened ( true )
9846 	{}
~DeleteOnFail()9847 	inline ~DeleteOnFail()
9848 	{
9849 		if ( m_bShitHappened )
9850 		{
9851 			ARRAY_FOREACH ( i, m_dWriters )
9852 				m_dWriters[i]->UnlinkFile();
9853 
9854 			ARRAY_FOREACH ( i, m_dAutofiles )
9855 				m_dAutofiles[i]->SetTemporary();
9856 		}
9857 	}
AddWriter(CSphWriter * pWr)9858 	inline void AddWriter ( CSphWriter* pWr )
9859 	{
9860 		if ( pWr )
9861 			m_dWriters.Add ( pWr );
9862 	}
AddAutofile(CSphAutofile * pAf)9863 	inline void AddAutofile ( CSphAutofile* pAf )
9864 	{
9865 		if ( pAf )
9866 			m_dAutofiles.Add ( pAf );
9867 	}
AllIsDone()9868 	inline void AllIsDone()
9869 	{
9870 		m_bShitHappened = false;
9871 	}
9872 private:
9873 	bool	m_bShitHappened;
9874 	CSphVector<CSphWriter*> m_dWriters;
9875 	CSphVector<CSphAutofile*> m_dAutofiles;
9876 };
9877 
9878 
Build(const CSphVector<CSphSource * > & dSources,int iMemoryLimit,int iWriteBuffer)9879 int CSphIndex_VLN::Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer )
9880 {
9881 	PROFILER_INIT ();
9882 
9883 	assert ( dSources.GetLength() );
9884 
9885 	if ( !LoadHitlessWords() )
9886 		return 0;
9887 
9888 	m_iWriteBuffer = ( iWriteBuffer>0 )
9889 		? Max ( iWriteBuffer, MIN_WRITE_BUFFER )
9890 		: DEFAULT_WRITE_BUFFER;
9891 
9892 	if ( !m_pWriteBuffer )
9893 		m_pWriteBuffer = new BYTE [ m_iWriteBuffer ];
9894 
9895 	m_bWordDict = m_pDict->GetSettings().m_bWordDict;
9896 
9897 	// vars shared between phases
9898 	CSphVector<CSphBin*> dBins;
9899 	SphOffset_t iSharedOffset = -1;
9900 
9901 	m_pDict->HitblockBegin();
9902 
9903 	// setup sources
9904 	ARRAY_FOREACH ( iSource, dSources )
9905 	{
9906 		CSphSource * pSource = dSources[iSource];
9907 		assert ( pSource );
9908 
9909 		pSource->SetDict ( m_pDict );
9910 		pSource->Setup ( m_tSettings );
9911 	}
9912 
9913 	// connect 1st source and fetch its schema
9914 	if ( !dSources[0]->Connect ( m_sLastError )
9915 		|| !dSources[0]->IterateStart ( m_sLastError )
9916 		|| !dSources[0]->UpdateSchema ( &m_tSchema, m_sLastError ) )
9917 	{
9918 		return 0;
9919 	}
9920 
9921 	if ( m_tSchema.m_dFields.GetLength()==0 )
9922 	{
9923 		m_sLastError.SetSprintf ( "No fields in schema - will not index" );
9924 		return 0;
9925 	}
9926 
9927 	// check docinfo
9928 	if ( m_tSchema.GetAttrsCount()==0 && m_tSettings.m_eDocinfo!=SPH_DOCINFO_NONE )
9929 	{
9930 		sphWarning ( "Attribute count is 0: switching to none docinfo" );
9931 		m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;
9932 	}
9933 
9934 	if ( dSources[0]->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
9935 	{
9936 		m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
9937 		return 0;
9938 	}
9939 
9940 	if ( m_tSchema.GetAttrsCount()>0 && m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
9941 	{
9942 		m_sLastError.SetSprintf ( "got attributes, but docinfo is 'none' (fix your config file)" );
9943 		return 0;
9944 	}
9945 
9946 	bool bHaveFieldMVAs = false;
9947 	CSphVector<int> dMvaIndexes;
9948 	CSphVector<CSphAttrLocator> dMvaLocators;
9949 
9950 	// ordinals and strings storage
9951 	CSphVector<int> dOrdinalAttrs;
9952 	CSphVector<int> dStringAttrs;
9953 	CSphVector<int> dWordcountAttrs;
9954 
9955 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
9956 	{
9957 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
9958 		ESphAttr eAttrType = tCol.m_eAttrType;
9959 
9960 		if ( eAttrType==SPH_ATTR_UINT32SET )
9961 		{
9962 			if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
9963 				bHaveFieldMVAs = true;
9964 
9965 			dMvaIndexes.Add ( i );
9966 			dMvaLocators.Add ( tCol.m_tLocator );
9967 		}
9968 		switch ( eAttrType )
9969 		{
9970 		case SPH_ATTR_ORDINAL:
9971 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
9972 				dOrdinalAttrs.Add ( i );
9973 			break;
9974 		case SPH_ATTR_STRING:
9975 			dStringAttrs.Add ( i );
9976 			break;
9977 		case SPH_ATTR_WORDCOUNT:
9978 			dWordcountAttrs.Add ( i );
9979 			break;
9980 		default:
9981 			break;
9982 		}
9983 	}
9984 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
9985 	{
9986 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
9987 		ESphAttr eAttrType = tCol.m_eAttrType;
9988 		if ( eAttrType==SPH_ATTR_INT64SET )
9989 		{
9990 			if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
9991 				bHaveFieldMVAs = true;
9992 
9993 			dMvaIndexes.Add ( i );
9994 			dMvaLocators.Add ( tCol.m_tLocator );
9995 		}
9996 	}
9997 
9998 	bool bGotMVA = ( dMvaIndexes.GetLength()!=0 );
9999 	if ( bGotMVA && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
10000 	{
10001 		m_sLastError.SetSprintf ( "multi-valued attributes require docinfo=extern (fix your config file)" );
10002 		return 0;
10003 	}
10004 
10005 	bool bHaveOrdinals = ( dOrdinalAttrs.GetLength() > 0 );
10006 	if ( bHaveOrdinals && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
10007 	{
10008 		m_sLastError.SetSprintf ( "ordinal string attributes require docinfo=extern (fix your config file)" );
10009 		return 0;
10010 	}
10011 
10012 	if ( dStringAttrs.GetLength() && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
10013 	{
10014 		m_sLastError.SetSprintf ( "string attributes require docinfo=extern (fix your config file)" );
10015 		return 0;
10016 	}
10017 
10018 	////////////////////////////////////////////////
10019 	// collect and partially sort hits and docinfos
10020 	////////////////////////////////////////////////
10021 
10022 	// killlist storage
10023 	CSphVector <SphAttr_t> dKillList;
10024 
10025 	// adjust memory requirements
10026 	int iOldLimit = iMemoryLimit;
10027 
10028 	// book memory to store at least 64K attribute rows
10029 	const int iDocinfoStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
10030 	int iDocinfoMax = Max ( 65536, iMemoryLimit/16/iDocinfoStride/sizeof(DWORD) );
10031 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
10032 		iDocinfoMax = 1;
10033 
10034 	// book at least 32 KB for ordinals, if needed
10035 	int iOrdinalPoolSize = Max ( 32768, iMemoryLimit/8 );
10036 	if ( !bHaveOrdinals )
10037 		iOrdinalPoolSize = 0;
10038 
10039 	// book at least 32 KB for field MVAs, if needed
10040 	int iFieldMVAPoolSize = Max ( 32768, iMemoryLimit/16 );
10041 	if ( bHaveFieldMVAs==0 )
10042 		iFieldMVAPoolSize = 0;
10043 
	// book at least 4 MB (MIN_KEYWORDS_DICT) for keywords dict, if needed
10045 	int iDictSize = 0;
10046 	if ( m_bWordDict )
10047 		iDictSize = Max ( MIN_KEYWORDS_DICT, iMemoryLimit/8 );
10048 
10049 	// do we have enough left for hits?
10050 	int iHitsMax = 1048576;
10051 
10052 	iMemoryLimit -= iDocinfoMax*iDocinfoStride*sizeof(DWORD) + iOrdinalPoolSize + iFieldMVAPoolSize + iDictSize;
10053 	if ( iMemoryLimit < iHitsMax*(int)sizeof(CSphWordHit) )
10054 	{
10055 		iMemoryLimit = iOldLimit + iHitsMax*sizeof(CSphWordHit) - iMemoryLimit;
10056 		sphWarn ( "collect_hits: mem_limit=%d kb too low, increasing to %d kb",
10057 			iOldLimit/1024, iMemoryLimit/1024 );
10058 	} else
10059 	{
10060 		iHitsMax = iMemoryLimit / sizeof(CSphWordHit);
10061 	}
10062 
10063 	// allocate raw hits block
10064 	CSphAutoArray<CSphWordHit> dHits ( iHitsMax + MAX_SOURCE_HITS );
10065 	CSphWordHit * pHits = dHits;
10066 	CSphWordHit * pHitsMax = dHits + iHitsMax;
10067 
10068 	// allocate docinfos buffer
10069 	CSphAutoArray<DWORD> dDocinfos ( iDocinfoMax*iDocinfoStride );
10070 	DWORD * pDocinfo = dDocinfos;
10071 	const DWORD * pDocinfoMax = dDocinfos + iDocinfoMax*iDocinfoStride;
10072 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
10073 	{
10074 		pDocinfo = NULL;
10075 		pDocinfoMax = NULL;
10076 	}
10077 
10078 	int nOrdinals = 0;
10079 	SphOffset_t uMaxOrdinalAttrBlockSize = 0;
10080 	int iCurrentBlockSize = 0;
10081 
10082 	CSphVector < CSphVector < Ordinal_t > > dOrdinals;
10083 	dOrdinals.Resize ( dOrdinalAttrs.GetLength() );
10084 	ARRAY_FOREACH ( i, dOrdinals )
10085 		dOrdinals[i].Reserve ( 65536 );
10086 
10087 	CSphVector < CSphVector<SphOffset_t> > dOrdBlockSize;
10088 	dOrdBlockSize.Resize ( dOrdinalAttrs.GetLength () );
10089 	ARRAY_FOREACH ( i, dOrdBlockSize )
10090 		dOrdBlockSize[i].Reserve ( 8192 );
10091 
10092 	int iMaxOrdLen = 0;
10093 
10094 	CSphVector < MvaEntry_t > dFieldMVAs;
10095 	dFieldMVAs.Reserve ( 16384 );
10096 
10097 	CSphVector < SphOffset_t > dFieldMVABlocks;
10098 	dFieldMVABlocks.Reserve ( 4096 );
10099 
10100 	CSphVector < FieldMVARedirect_t > dFieldMvaIndexes;
10101 
10102 	if ( bHaveFieldMVAs )
10103 		dFieldMvaIndexes.Reserve ( 8 );
10104 
10105 	int iMaxPoolFieldMVAs = iFieldMVAPoolSize / sizeof ( MvaEntry_t );
10106 	int nFieldMVAs = 0;
10107 
10108 	// create temp files
10109 	CSphAutofile fdLock ( GetIndexFileName("tmp0"), SPH_O_NEW, m_sLastError, true );
10110 	CSphAutofile fdHits ( GetIndexFileName ( m_bInplaceSettings ? "spp" : "tmp1" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
10111 	CSphAutofile fdDocinfos ( GetIndexFileName ( m_bInplaceSettings ? "spa" : "tmp2" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
10112 	CSphAutofile fdTmpFieldMVAs ( GetIndexFileName("tmp7"), SPH_O_NEW, m_sLastError, true );
10113 	CSphWriter tOrdWriter;
10114 	CSphWriter tStrWriter;
10115 
10116 	CSphString sRawOrdinalsFile = GetIndexFileName("tmp4");
10117 	if ( bHaveOrdinals && !tOrdWriter.OpenFile ( sRawOrdinalsFile.cstr (), m_sLastError ) )
10118 		return 0;
10119 
10120 	if ( !tStrWriter.OpenFile ( GetIndexFileName("sps"), m_sLastError ) )
10121 		return 0;
10122 	tStrWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
10123 
10124 	DeleteOnFail dFileWatchdog;
10125 
10126 	if ( m_bInplaceSettings )
10127 	{
10128 		dFileWatchdog.AddAutofile ( &fdHits );
10129 		dFileWatchdog.AddAutofile ( &fdDocinfos );
10130 	}
10131 
10132 	dFileWatchdog.AddWriter ( &tStrWriter );
10133 
10134 	if ( fdLock.GetFD()<0 || fdHits.GetFD()<0 || fdDocinfos.GetFD()<0 || fdTmpFieldMVAs.GetFD ()<0 )
10135 		return 0;
10136 
10137 	SphOffset_t iHitsGap = 0;
10138 	SphOffset_t iDocinfosGap = 0;
10139 
10140 	if ( m_bInplaceSettings )
10141 	{
10142 		const int HIT_SIZE_AVG = 4;
10143 		const float HIT_BLOCK_FACTOR = 1.0f;
10144 		const float DOCINFO_BLOCK_FACTOR = 1.0f;
10145 
10146 		if ( m_iHitGap )
10147 			iHitsGap = (SphOffset_t) m_iHitGap;
10148 		else
10149 			iHitsGap = (SphOffset_t)( iHitsMax*HIT_BLOCK_FACTOR*HIT_SIZE_AVG );
10150 
10151 		iHitsGap = Max ( iHitsGap, 1 );
10152 		sphSeek ( fdHits.GetFD (), iHitsGap, SEEK_SET );
10153 
10154 		if ( m_iDocinfoGap )
10155 			iDocinfosGap = (SphOffset_t) m_iDocinfoGap;
10156 		else
10157 			iDocinfosGap = (SphOffset_t)( iDocinfoMax*DOCINFO_BLOCK_FACTOR*iDocinfoStride*sizeof(DWORD) );
10158 
10159 		iDocinfosGap = Max ( iDocinfosGap, 1 );
10160 		sphSeek ( fdDocinfos.GetFD (), iDocinfosGap, SEEK_SET );
10161 	}
10162 
10163 	if ( !sphLockEx ( fdLock.GetFD(), false ) )
10164 	{
10165 		m_sLastError.SetSprintf ( "failed to lock '%s': another indexer running?", fdLock.GetFilename() );
10166 		return 0;
10167 	}
10168 
10169 	// setup accumulating docinfo IDs range
10170 	m_pMin->Reset ( m_tSchema.GetRowSize() );
10171 
10172 	for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
10173 		m_pMin->m_pDynamic[i] = ROWITEM_MAX;
10174 	m_pMin->m_iDocID = DOCID_MAX;
10175 
10176 	// build raw log
10177 	PROFILE_BEGIN ( collect_hits );
10178 
10179 	m_tStats.Reset ();
10180 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT;
10181 	m_tProgress.m_iAttrs = 0;
10182 
10183 	CSphVector<int> dHitBlocks;
10184 	dHitBlocks.Reserve ( 1024 );
10185 
10186 	int iDocinfoBlocks = 0;
10187 
10188 	ARRAY_FOREACH ( iSource, dSources )
10189 	{
10190 		// connect and check schema, if it's not the first one
10191 		CSphSource * pSource = dSources[iSource];
10192 
10193 		if ( iSource )
10194 		{
10195 			if ( !pSource->Connect ( m_sLastError )
10196 				|| !pSource->IterateStart ( m_sLastError )
10197 				|| !pSource->UpdateSchema ( &m_tSchema, m_sLastError ) )
10198 			{
10199 				return 0;
10200 			}
10201 
10202 			if ( pSource->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
10203 			{
10204 				m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
10205 				return 0;
10206 			}
10207 		}
10208 
10209 		dFieldMvaIndexes.Resize ( 0 );
10210 
10211 		ARRAY_FOREACH ( i, dMvaIndexes )
10212 		{
10213 			int iAttr = dMvaIndexes[i];
10214 			const CSphColumnInfo & tCol = m_tSchema.GetAttr ( iAttr );
10215 			if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
10216 			{
10217 				FieldMVARedirect_t & tRedirect = dFieldMvaIndexes.Add();
10218 				tRedirect.m_tLocator = tCol.m_tLocator;
10219 				tRedirect.m_iAttr = iAttr;
10220 				tRedirect.m_iMVAAttr = i;
10221 				tRedirect.m_bMva64 = ( tCol.m_eAttrType==SPH_ATTR_INT64SET );
10222 			}
10223 		}
10224 
10225 		// joined filter
10226 		bool bGotJoined = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE ) && pSource->HasJoinedFields();
10227 		CSphVector<SphDocID_t> dAllIds; // FIXME! unlimited RAM use..
10228 
10229 		// fetch documents
10230 		for ( ;; )
10231 		{
10232 			// get next doc, and handle errors
10233 			bool bGotDoc = pSource->IterateDocument ( m_sLastError );
10234 			if ( !bGotDoc )
10235 				return 0;
10236 
10237 			// ensure docid is sane
10238 			if ( pSource->m_tDocInfo.m_iDocID==DOCID_MAX )
10239 			{
10240 				m_sLastError.SetSprintf ( "docid==DOCID_MAX (source broken?)" );
10241 				return 0;
10242 			}
10243 
10244 			// check for eof
10245 			if ( !pSource->m_tDocInfo.m_iDocID )
10246 				break;
10247 
10248 			if ( bGotJoined )
10249 				dAllIds.Add ( pSource->m_tDocInfo.m_iDocID );
10250 
10251 			// show progress bar
10252 			if ( m_pProgress
10253 				&& ( ( pSource->GetStats().m_iTotalDocuments % 1000 )==0 ) )
10254 			{
10255 				m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
10256 				m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
10257 				m_pProgress ( &m_tProgress, false );
10258 			}
10259 
10260 			// update crashdump
10261 			g_iIndexerCurrentDocID = pSource->m_tDocInfo.m_iDocID;
10262 			g_iIndexerCurrentHits = pHits-dHits;
10263 
10264 			// store field MVAs
10265 			if ( bHaveFieldMVAs )
10266 			{
10267 				ARRAY_FOREACH ( i, dFieldMvaIndexes )
10268 				{
10269 					int iAttr = dFieldMvaIndexes[i].m_iAttr;
10270 					int iMVA = dFieldMvaIndexes[i].m_iMVAAttr;
10271 					bool bMva64 = dFieldMvaIndexes[i].m_bMva64;
10272 					int iStep = ( bMva64 ? 2 : 1 );
10273 
10274 					// store per-document MVAs
10275 					SphRange_t tFieldMva = pSource->IterateFieldMVAStart ( iAttr );
10276 					m_tProgress.m_iAttrs += ( tFieldMva.m_iLength / iStep );
10277 
10278 					assert ( ( tFieldMva.m_iStart + tFieldMva.m_iLength )<=pSource->m_dMva.GetLength() );
10279 					for ( int j=tFieldMva.m_iStart; j<( tFieldMva.m_iStart+tFieldMva.m_iLength); j+=iStep )
10280 					{
10281 						MvaEntry_t & tMva = dFieldMVAs.Add();
10282 						tMva.m_uDocID = pSource->m_tDocInfo.m_iDocID;
10283 						tMva.m_iAttr = iMVA;
10284 						if ( bMva64 )
10285 						{
10286 							tMva.m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() + j );
10287 						} else
10288 						{
10289 							tMva.m_iValue = pSource->m_dMva[j];
10290 						}
10291 
10292 						int iLength = dFieldMVAs.GetLength ();
10293 						if ( iLength==iMaxPoolFieldMVAs )
10294 						{
10295 							dFieldMVAs.Sort ( CmpMvaEntries_fn () );
10296 							if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0], iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError ) )
10297 								return 0;
10298 
10299 							dFieldMVAs.Resize ( 0 );
10300 
10301 							nFieldMVAs += iMaxPoolFieldMVAs;
10302 						}
10303 					}
10304 				}
10305 			}
10306 
10307 			// store ordinals
10308 			iCurrentBlockSize += ( sizeof ( SphOffset_t ) + sizeof ( DWORD ) ) * dOrdinalAttrs.GetLength ();
10309 
10310 			ARRAY_FOREACH ( i, dOrdinalAttrs )
10311 			{
10312 				CSphVector<Ordinal_t> & dCol = dOrdinals[i];
10313 				dCol.Add();
10314 
10315 				Ordinal_t & tLastOrd = dCol.Last();
10316 				tLastOrd.m_uDocID = pSource->m_tDocInfo.m_iDocID;
10317 				Swap ( tLastOrd.m_sValue, pSource->m_dStrAttrs[dOrdinalAttrs[i]] );
10318 				int iOrdStrLen = strlen ( tLastOrd.m_sValue.cstr () );
10319 				if ( iOrdStrLen > MAX_ORDINAL_STR_LEN )
10320 				{
10321 					iMaxOrdLen = iOrdStrLen;
10322 
10323 					// truncate
10324 					iOrdStrLen = MAX_ORDINAL_STR_LEN;
10325 					tLastOrd.m_sValue = tLastOrd.m_sValue.SubString ( 0, iOrdStrLen - 1 );
10326 				}
10327 
10328 				iCurrentBlockSize += iOrdStrLen;
10329 			}
10330 
10331 			if ( bHaveOrdinals )
10332 			{
10333 				if ( iCurrentBlockSize>=iOrdinalPoolSize )
10334 				{
10335 					iCurrentBlockSize = 0;
10336 
10337 					nOrdinals += dOrdinals[0].GetLength ();
10338 
10339 					ARRAY_FOREACH ( i, dOrdinalAttrs )
10340 					{
10341 						CSphVector<Ordinal_t> & dCol = dOrdinals[i];
10342 						dCol.Sort ( CmpOrdinalsValue_fn() );
10343 						SphOffset_t uSize = DumpOrdinals ( tOrdWriter, dCol );
10344 						if ( !uSize )
10345 						{
10346 							m_sLastError = "dump ordinals: io error";
10347 							return 0;
10348 						}
10349 
10350 						if ( uSize > uMaxOrdinalAttrBlockSize )
10351 							uMaxOrdinalAttrBlockSize = uSize;
10352 
10353 						dOrdBlockSize[i].Add ( uSize );
10354 						dCol.Resize ( 0 );
10355 					}
10356 				}
10357 			}
10358 
10359 			// store strings
10360 			ARRAY_FOREACH ( i, dStringAttrs )
10361 			{
10362 				// FIXME! optimize locators etc?
10363 				// FIXME! support binary strings w/embedded zeroes?
10364 				// get data, calc length
10365 				const char * sData = pSource->m_dStrAttrs[dStringAttrs[i]].cstr();
10366 				int iLen = sData ? strlen ( sData ) : 0;
10367 
10368 				if ( iLen )
10369 				{
10370 					// calc offset, do sanity checks
10371 					SphOffset_t uOff = tStrWriter.GetPos();
10372 					if ( uint64_t(uOff)>>32 )
10373 					{
10374 						m_sLastError.SetSprintf ( "too many string attributes (current index format allows up to 4 GB)" );
10375 						return 0;
10376 					}
10377 					pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, DWORD(uOff) );
10378 
10379 					// pack length, emit it, emit data
10380 					BYTE dPackedLen[4];
10381 					int iLenLen = sphPackStrlen ( dPackedLen, iLen );
10382 					tStrWriter.PutBytes ( &dPackedLen, iLenLen );
10383 					tStrWriter.PutBytes ( sData, iLen );
10384 				} else
10385 				{
10386 					// no data
10387 					pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
10388 				}
10389 			}
10390 
10391 			// count words
10392 			ARRAY_FOREACH ( i, dWordcountAttrs )
10393 			{
10394 				int iAttr = dWordcountAttrs[i];
10395 				int iNumWords = CountWords ( pSource->m_dStrAttrs[iAttr], m_pTokenizer );
10396 				pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr(iAttr).m_tLocator, iNumWords );
10397 			}
10398 
10399 			// update min docinfo
10400 			assert ( pSource->m_tDocInfo.m_iDocID );
10401 			m_pMin->m_iDocID = Min ( m_pMin->m_iDocID, pSource->m_tDocInfo.m_iDocID );
10402 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
10403 				for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
10404 					m_pMin->m_pDynamic[i] = Min ( m_pMin->m_pDynamic[i], pSource->m_tDocInfo.m_pDynamic[i] );
10405 
10406 			// store docinfo
10407 			if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_NONE )
10408 			{
10409 				// store next entry
10410 				DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_iDocID );
10411 				memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
10412 				pDocinfo += iDocinfoStride;
10413 
10414 				// if not inlining, flush buffer if it's full
10415 				// (if inlining, it will be flushed later, along with the hits)
10416 				if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pDocinfo>=pDocinfoMax )
10417 				{
10418 					assert ( pDocinfo==pDocinfoMax );
10419 					int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
10420 
10421 					sphSortDocinfos ( dDocinfos, iDocinfoMax, iDocinfoStride );
10422 					if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos, iLen, "raw_docinfos", m_sLastError ) )
10423 						return 0;
10424 
10425 					pDocinfo = dDocinfos;
10426 					iDocinfoBlocks++;
10427 				}
10428 			}
10429 
10430 			// store hits
10431 			while ( const ISphHits * pDocHits = pSource->IterateHits ( m_sLastWarning ) )
10432 			{
10433 				int iDocHits = pDocHits->Length();
10434 #if PARANOID
10435 				for ( int i=0; i<iDocHits; i++ )
10436 				{
10437 					assert ( pDocHits->m_dData[i].m_iDocID==pSource->m_tDocInfo.m_iDocID );
10438 					assert ( pDocHits->m_dData[i].m_iWordID );
10439 					assert ( pDocHits->m_dData[i].m_iWordPos );
10440 				}
10441 #endif
10442 
10443 				assert ( ( pHits+iDocHits )<=( pHitsMax+MAX_SOURCE_HITS ) );
10444 
10445 				memcpy ( pHits, pDocHits->First(), iDocHits*sizeof(CSphWordHit) );
10446 				pHits += iDocHits;
10447 
10448 				// check if we need to flush
10449 				if ( pHits<pHitsMax
10450 					&& !( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE && pDocinfo>=pDocinfoMax )
10451 					&& !( iDictSize && m_pDict->HitblockGetMemUse() > iDictSize ) )
10452 				{
10453 					continue;
10454 				}
10455 
10456 				// update crashdump
10457 				g_iIndexerPoolStartDocID = pSource->m_tDocInfo.m_iDocID;
10458 				g_iIndexerPoolStartHit = pHits-dHits;
10459 
10460 				// sort hits
10461 				int iHits = pHits - dHits;
10462 				{
10463 					PROFILE ( sort_hits );
10464 					sphSort ( &dHits[0], iHits, CmpHit_fn() );
10465 					m_pDict->HitblockPatch ( &dHits[0], iHits );
10466 				}
10467 				pHits = dHits;
10468 
10469 				if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
10470 				{
10471 					// we're inlining, so let's flush both hits and docs
10472 					int iDocs = ( pDocinfo - dDocinfos ) / iDocinfoStride;
10473 					pDocinfo = dDocinfos;
10474 
10475 					sphSortDocinfos ( pDocinfo, iDocs, iDocinfoStride );
10476 
10477 					dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits,
10478 						dDocinfos, iDocs, iDocinfoStride ) );
10479 
10480 					// we are inlining, so if there are more hits in this document,
10481 					// we'll need to know its info on the next flush
10482 					if ( iDocHits )
10483 					{
10484 						DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_iDocID );
10485 						memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
10486 						pDocinfo += iDocinfoStride;
10487 					}
10488 				} else
10489 				{
10490 					// we're not inlining, so only flush hits, docs are flushed independently
10491 					dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits,
10492 						NULL, 0, 0 ) );
10493 				}
10494 				m_pDict->HitblockReset ();
10495 
10496 				if ( dHitBlocks.Last()<0 )
10497 					return 0;
10498 
10499 				// progress bar
10500 				m_tProgress.m_iHitsTotal += iHits;
10501 				if ( m_pProgress )
10502 				{
10503 					m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
10504 					m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
10505 					m_pProgress ( &m_tProgress, false );
10506 				}
10507 			}
10508 		}
10509 
10510 		// FIXME! uncontrolled memory usage; add checks and/or diskbased sort in the future?
10511 		if ( pSource->IterateKillListStart ( m_sLastError ) )
10512 		{
10513 			SphDocID_t tDocId;
10514 			while ( pSource->IterateKillListNext ( tDocId ) )
10515 				dKillList.Add ( tDocId );
10516 		}
10517 
10518 		// fetch joined fields
10519 		if ( bGotJoined )
10520 		{
10521 			dAllIds.Uniq();
10522 
10523 			SphDocID_t uLastID = 0;
10524 			bool bLastFound = 0;
10525 
10526 			for ( ;; )
10527 			{
10528 				// get next doc, and handle errors
10529 				ISphHits * pJoinedHits = pSource->IterateJoinedHits ( m_sLastError );
10530 				if ( !pJoinedHits )
10531 					return 0;
10532 
10533 				// ensure docid is sane
10534 				if ( pSource->m_tDocInfo.m_iDocID==DOCID_MAX )
10535 				{
10536 					m_sLastError.SetSprintf ( "joined_docid==DOCID_MAX (source broken?)" );
10537 					return 0;
10538 				}
10539 
10540 				// check for eof
10541 				if ( !pSource->m_tDocInfo.m_iDocID )
10542 					break;
10543 
10544 				// filter and store hits
10545 				for ( const CSphWordHit * pHit = pJoinedHits->First(); pHit<=pJoinedHits->Last(); pHit++ )
10546 				{
10547 					// flush if needed
10548 					if ( pHits>=pHitsMax )
10549 					{
10550 						// sort hits
10551 						int iHits = pHits - dHits;
10552 						{
10553 							PROFILE ( sort_hits );
10554 							sphSort ( &dHits[0], iHits, CmpHit_fn() );
10555 							m_pDict->HitblockPatch ( &dHits[0], iHits );
10556 						}
10557 						pHits = dHits;
10558 						m_tProgress.m_iHitsTotal += iHits;
10559 
10560 						// we're not inlining, so only flush hits, docs are flushed independently
10561 						dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits,
10562 							NULL, 0, 0 ) );
10563 
10564 						if ( dHitBlocks.Last()<0 )
10565 							return 0;
10566 					}
10567 
10568 					// filter
10569 					SphDocID_t uHitID = pHit->m_iDocID;
10570 					if ( uHitID!=uLastID )
10571 					{
10572 						uLastID = uHitID;
10573 						bLastFound = ( dAllIds.BinarySearch ( uHitID )!=NULL );
10574 					}
10575 
10576 					// copy next hit
10577 					if ( bLastFound )
10578 						*pHits++ = *pHit;
10579 				}
10580 
10581 				// reset keywords only after all collected hits processed
10582 				if ( iDictSize && m_pDict->HitblockGetMemUse()>iDictSize )
10583 				{
10584 					int iHits = pHits - dHits;
10585 					{
10586 						PROFILE ( sort_hits );
10587 						sphSort ( &dHits[0], iHits, CmpHit_fn() );
10588 						m_pDict->HitblockPatch ( &dHits[0], iHits );
10589 					}
10590 					pHits = dHits;
10591 					m_tProgress.m_iHitsTotal += iHits;
10592 					if ( iHits )
10593 					{
10594 						dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits, NULL, 0, 0 ) );
10595 						if ( dHitBlocks.Last()<0 )
10596 							return 0;
10597 					}
10598 
10599 					m_pDict->HitblockReset ();
10600 				}
10601 			}
10602 		}
10603 
10604 		// this source is over, disconnect and update stats
10605 		pSource->Disconnect ();
10606 
10607 		m_tStats.m_iTotalDocuments += pSource->GetStats().m_iTotalDocuments;
10608 		m_tStats.m_iTotalBytes += pSource->GetStats().m_iTotalBytes;
10609 	}
10610 
10611 	// flush last docinfo block
10612 	int iDocinfoLastBlockSize = 0;
10613 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pDocinfo>dDocinfos )
10614 	{
10615 		iDocinfoLastBlockSize = ( pDocinfo - dDocinfos ) / iDocinfoStride;
10616 		assert ( pDocinfo==( dDocinfos + iDocinfoLastBlockSize*iDocinfoStride ) );
10617 
10618 		int iLen = iDocinfoLastBlockSize*iDocinfoStride*sizeof(DWORD);
10619 		sphSortDocinfos ( dDocinfos, iDocinfoLastBlockSize, iDocinfoStride );
10620 		if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos, iLen, "raw_docinfos", m_sLastError ) )
10621 			return 0;
10622 
10623 		iDocinfoBlocks++;
10624 	}
10625 
10626 	// flush last hit block
10627 	if ( pHits>dHits )
10628 	{
10629 		int iHits = pHits - dHits;
10630 		{
10631 			PROFILE ( sort_hits );
10632 			sphSort ( &dHits[0], iHits, CmpHit_fn() );
10633 			m_pDict->HitblockPatch ( &dHits[0], iHits );
10634 		}
10635 		m_tProgress.m_iHitsTotal += iHits;
10636 
10637 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
10638 		{
10639 			int iDocs = ( pDocinfo - dDocinfos ) / iDocinfoStride;
10640 			sphSortDocinfos ( dDocinfos, iDocs, iDocinfoStride );
10641 			dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits,
10642 				dDocinfos, iDocs, iDocinfoStride ) );
10643 		} else
10644 		{
10645 			dHitBlocks.Add ( cidxWriteRawVLB ( fdHits.GetFD(), dHits, iHits, NULL, 0, 0 ) );
10646 		}
10647 		m_pDict->HitblockReset ();
10648 
10649 		if ( dHitBlocks.Last()<0 )
10650 			return 0;
10651 	}
10652 
10653 	// flush last field MVA block
10654 	if ( bHaveFieldMVAs && dFieldMVAs.GetLength () )
10655 	{
10656 		int iLength = dFieldMVAs.GetLength ();
10657 		nFieldMVAs += iLength;
10658 
10659 		dFieldMVAs.Sort ( CmpMvaEntries_fn () );
10660 		if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0], iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError ) )
10661 			return 0;
10662 
10663 		dFieldMVAs.Reset ();
10664 	}
10665 
10666 	// flush last ordinals block
10667 	if ( bHaveOrdinals && dOrdinals[0].GetLength () )
10668 	{
10669 		nOrdinals += dOrdinals[0].GetLength ();
10670 
10671 		ARRAY_FOREACH ( i, dOrdinalAttrs )
10672 		{
10673 			CSphVector<Ordinal_t> & dCol = dOrdinals[i];
10674 			dCol.Sort ( CmpOrdinalsValue_fn() );
10675 
10676 			SphOffset_t uSize = DumpOrdinals ( tOrdWriter, dCol );
10677 			if ( !uSize )
10678 			{
10679 				m_sLastError = "dump ordinals: io error";
10680 				return 0;
10681 			}
10682 
10683 			if ( uSize > uMaxOrdinalAttrBlockSize )
10684 				uMaxOrdinalAttrBlockSize = uSize;
10685 
10686 			dOrdBlockSize[i].Add ( uSize );
10687 			dCol.Reset ();
10688 		}
10689 	}
10690 
10691 	if ( m_pProgress )
10692 	{
10693 		m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments;
10694 		m_tProgress.m_iBytes = m_tStats.m_iTotalBytes;
10695 		m_pProgress ( &m_tProgress, true );
10696 	}
10697 
10698 	PROFILE_END ( collect_hits );
10699 
10700 	///////////////////////////////////////
10701 	// collect and sort multi-valued attrs
10702 	///////////////////////////////////////
10703 
10704 	if ( !BuildMVA ( dSources, dHits, iHitsMax*sizeof(CSphWordHit),
10705 		fdTmpFieldMVAs.GetFD (), nFieldMVAs, iMaxPoolFieldMVAs ) )
10706 		return 0;
10707 
10708 	// reset persistent mva update pool
10709 	::unlink ( GetIndexFileName("mvp").cstr() );
10710 
10711 	// reset hits pool
10712 	dHits.Reset ();
10713 
10714 	CSphString sFieldMVAFile = fdTmpFieldMVAs.GetFilename ();
10715 	fdTmpFieldMVAs.Close ();
10716 	::unlink ( sFieldMVAFile.cstr () );
10717 
10718 	/////////////////
10719 	// sort docinfos
10720 	/////////////////
10721 
10722 	tOrdWriter.CloseFile ();
10723 	if ( tOrdWriter.IsError () )
10724 		return 0;
10725 
10726 	CSphString sSortedOrdinalIdFile = GetIndexFileName("tmp6");
10727 
10728 	// sort ordinals
10729 	if ( bHaveOrdinals && !dOrdBlockSize[0].GetLength () )
10730 	{
10731 		bHaveOrdinals = false;
10732 		::unlink ( sRawOrdinalsFile.cstr () );
10733 	}
10734 
10735 	if ( bHaveOrdinals )
10736 	{
10737 		if ( iMaxOrdLen > MAX_ORDINAL_STR_LEN )
10738 			sphWarn ( "some ordinal attributes are too long (len=%d,max=%d)", iMaxOrdLen, MAX_ORDINAL_STR_LEN );
10739 
10740 		CSphString sUnsortedIdFile = GetIndexFileName("tmp5");
10741 
10742 		CSphAutofile fdRawOrdinals ( sRawOrdinalsFile.cstr (), SPH_O_READ, m_sLastError, true );
10743 		if ( fdRawOrdinals.GetFD () < 0 )
10744 			return 0;
10745 
10746 		const float ARENA_PERCENT = 0.5f;
10747 		int nBlocks = dOrdBlockSize[0].GetLength ();
10748 
10749 		SphOffset_t uMemNeededForReaders = SphOffset_t ( nBlocks ) * uMaxOrdinalAttrBlockSize;
10750 		SphOffset_t uMemNeededForSorting = sizeof ( OrdinalId_t ) * nOrdinals;
10751 
10752 		int iArenaSize = (int) Min ( SphOffset_t ( iMemoryLimit * ARENA_PERCENT ), uMemNeededForReaders );
10753 		iArenaSize = Max ( CSphBin::MIN_SIZE * nBlocks, iArenaSize );
10754 
10755 		int iOrdinalsInPool = (int) Min ( SphOffset_t ( iMemoryLimit * ( 1.0f - ARENA_PERCENT ) ), uMemNeededForSorting ) / sizeof ( OrdinalId_t );
10756 
10757 		if ( !SortOrdinals ( sUnsortedIdFile.cstr (), fdRawOrdinals.GetFD (), iArenaSize, iOrdinalsInPool, dOrdBlockSize, iArenaSize < uMemNeededForReaders ) )
10758 			return 0;
10759 
10760 		CSphAutofile fdUnsortedId ( sUnsortedIdFile.cstr (), SPH_O_READ, m_sLastError, true );
10761 		if ( fdUnsortedId.GetFD () < 0 )
10762 			return 0;
10763 
10764 		iArenaSize = Min ( iMemoryLimit, (int)uMemNeededForSorting );
10765 		iArenaSize = Max ( CSphBin::MIN_SIZE * ( nOrdinals / iOrdinalsInPool + 1 ), iArenaSize );
10766 
10767 		if ( !SortOrdinalIds ( sSortedOrdinalIdFile.cstr (), fdUnsortedId.GetFD (), iArenaSize, dOrdBlockSize, iArenaSize < uMemNeededForSorting ) )
10768 			return 0;
10769 	}
10770 
10771 	// initialize MVA reader
10772 	CSphAutoreader rdMva;
10773 	if ( !rdMva.Open ( GetIndexFileName("spm"), m_sLastError ) )
10774 		return 0;
10775 
10776 	SphDocID_t uMvaID = rdMva.GetDocid();
10777 
10778 	// initialize writer
10779 	int iDocinfoFD = -1;
10780 	SphOffset_t iDocinfoWritePos = 0;
10781 	CSphScopedPtr<CSphAutofile> pfdDocinfoFinal ( NULL );
10782 
10783 	if ( m_bInplaceSettings )
10784 		iDocinfoFD = fdDocinfos.GetFD ();
10785 	else
10786 	{
10787 		pfdDocinfoFinal = new CSphAutofile ( GetIndexFileName("spa"), SPH_O_NEW, m_sLastError );
10788 		iDocinfoFD = pfdDocinfoFinal->GetFD();
10789 		if ( iDocinfoFD < 0 )
10790 			return 0;
10791 	}
10792 
10793 	int iDupes = 0;
10794 	int iMinBlock = -1;
10795 
10796 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && dHitBlocks.GetLength() )
10797 	{
10798 		// initialize readers
10799 		assert ( dBins.GetLength()==0 );
10800 		dBins.Reserve ( iDocinfoBlocks );
10801 
10802 		float fReadFactor = 1.0f;
10803 		float fRelocFactor = 0.0f;
10804 		if ( m_bInplaceSettings )
10805 		{
10806 			assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
10807 			fRelocFactor = m_fRelocFactor;
10808 			fReadFactor -= fRelocFactor;
10809 		}
10810 
10811 		int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ), iDocinfoBlocks, "sort_docinfos" );
10812 		int iRelocationSize = m_bInplaceSettings ? int ( iMemoryLimit * fRelocFactor ) : 0;
10813 		CSphAutoArray <BYTE> pRelocationBuffer ( iRelocationSize );
10814 		iSharedOffset = -1;
10815 
10816 		for ( int i=0; i<iDocinfoBlocks; i++ )
10817 		{
10818 			dBins.Add ( new CSphBin() );
10819 			dBins[i]->m_iFileLeft = ( ( i==iDocinfoBlocks-1 ) ? iDocinfoLastBlockSize : iDocinfoMax )*iDocinfoStride*sizeof(DWORD);
10820 			dBins[i]->m_iFilePos = ( i==0 ) ? iDocinfosGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
10821 			dBins[i]->Init ( fdDocinfos.GetFD(), &iSharedOffset, iBinSize );
10822 		}
10823 
10824 		SphOffset_t iDocinfoFileSize = 0;
10825 		if ( iDocinfoBlocks )
10826 			iDocinfoFileSize = dBins [iDocinfoBlocks-1]->m_iFilePos + dBins [iDocinfoBlocks-1]->m_iFileLeft;
10827 
10828 		// docinfo queue
10829 		CSphAutoArray<DWORD> dDocinfoQueue ( iDocinfoBlocks*iDocinfoStride );
10830 		CSphQueue < int, CmpQueuedDocinfo_fn > qDocinfo ( iDocinfoBlocks );
10831 
10832 		CmpQueuedDocinfo_fn::m_pStorage = dDocinfoQueue;
10833 		CmpQueuedDocinfo_fn::m_iStride = iDocinfoStride;
10834 
10835 		pDocinfo = dDocinfoQueue;
10836 		for ( int i=0; i<iDocinfoBlocks; i++ )
10837 		{
10838 			if ( dBins[i]->ReadBytes ( pDocinfo, iDocinfoStride*sizeof(DWORD) )!=BIN_READ_OK )
10839 			{
10840 				m_sLastError.SetSprintf ( "sort_docinfos: warmup failed (io error?)" );
10841 				return 0;
10842 			}
10843 			pDocinfo += iDocinfoStride;
10844 			qDocinfo.Push ( i );
10845 		}
10846 
10847 		CSphVector < CSphBin > dOrdReaders;
10848 		SphOffset_t iSharedOrdOffset = -1;
10849 
10850 		CSphAutofile fdTmpSortedIds ( sSortedOrdinalIdFile.cstr (), SPH_O_READ, m_sLastError, true );
10851 
10852 		if ( bHaveOrdinals )
10853 		{
10854 			if ( fdTmpSortedIds.GetFD () < 0 )
10855 				return 0;
10856 
10857 			dOrdReaders.Resize ( dOrdinalAttrs.GetLength () );
10858 			SphOffset_t uStart = 0;
10859 			ARRAY_FOREACH ( i, dOrdReaders )
10860 			{
10861 				dOrdReaders[i].m_iFileLeft = (int)dOrdBlockSize [i][0];
10862 				dOrdReaders[i].m_iFilePos = uStart;
10863 				dOrdReaders[i].Init ( fdTmpSortedIds.GetFD(), &iSharedOrdOffset, ORDINAL_READ_SIZE );
10864 				uStart += dOrdReaders[i].m_iFileLeft;
10865 			}
10866 		}
10867 
10868 		// while the queue has data for us
10869 		int iOrd = 0;
10870 		pDocinfo = dDocinfos;
10871 		SphDocID_t uLastId = 0;
10872 		m_uMinMaxIndex = 0;
10873 
10874 		// prepare the collector for min/max of attributes
10875 		AttrIndexBuilder_c tMinMax ( m_tSchema );
10876 		CSphVector<DWORD> dMinMaxBuffer ( tMinMax.GetExpectedSize ( m_tStats.m_iTotalDocuments ) );
10877 		CSphDocMVA tCurInfo ( dMvaIndexes.GetLength() );
10878 		tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() );
10879 
10880 		SphDocID_t uLastDupe = 0;
10881 		while ( qDocinfo.GetLength() )
10882 		{
10883 			// obtain bin index and next entry
10884 			int iBin = qDocinfo.Root();
10885 			DWORD * pEntry = dDocinfoQueue + iBin*iDocinfoStride;
10886 
10887 			if ( DOCINFO2ID ( pEntry )<uLastId )
10888 			{
10889 				m_sLastError.SetSprintf ( "descending document prev id="DOCID_FMT", curr="DOCID_FMT" bin=%d", uLastId, DOCINFO2ID ( pEntry ), iBin );
10890 				return 0;
10891 			}
10892 
10893 			// skip duplicates
10894 			if ( DOCINFO2ID ( pEntry )==uLastId )
10895 			{
10896 				// dupe, report it
10897 				if ( m_tSettings.m_bVerbose && uLastDupe!=uLastId )
10898 					sphWarn ( "duplicated document id="DOCID_FMT, uLastId );
10899 
10900 				uLastDupe = uLastId;
10901 				iDupes++;
10902 
10903 			} else
10904 			{
10905 				// new unique document, handle it
10906 				// update ordinals
10907 				ARRAY_FOREACH ( i, dOrdinalAttrs )
10908 				{
10909 					OrdinalId_t Id;
10910 					if ( dOrdReaders[i].ReadBytes ( &Id, sizeof(Id) )!=BIN_READ_OK )
10911 					{
10912 						m_sLastError = "update ordinals: io error";
10913 						return 0;
10914 					}
10915 
10916 					assert ( Id.m_uDocID==DOCINFO2ID(pEntry) );
10917 					sphSetRowAttr ( DOCINFO2ATTRS(pEntry), m_tSchema.GetAttr(dOrdinalAttrs[i]).m_tLocator, Id.m_uId );
10918 				}
10919 				iOrd++;
10920 				m_uMinMaxIndex += iDocinfoStride;
10921 
10922 				// update MVA
10923 				if ( bGotMVA )
10924 				{
10925 					// go to next id
10926 					while ( uMvaID<DOCINFO2ID(pEntry) )
10927 					{
10928 						ARRAY_FOREACH ( i, dMvaIndexes )
10929 						{
10930 							int iCount = rdMva.GetDword();
10931 							rdMva.SkipBytes ( iCount*sizeof(DWORD) );
10932 						}
10933 
10934 						uMvaID = rdMva.GetDocid();
10935 						if ( !uMvaID )
10936 							uMvaID = DOCID_MAX;
10937 					}
10938 
10939 					assert ( uMvaID>=DOCINFO2ID(pEntry) );
10940 					if ( uMvaID==DOCINFO2ID(pEntry) )
10941 					{
10942 						ARRAY_FOREACH ( i, dMvaIndexes )
10943 						{
10944 							sphSetRowAttr ( DOCINFO2ATTRS(pEntry), dMvaLocators[i], SphAttr_t(rdMva.GetPos()/sizeof(DWORD)) ); // intentional clamp; we'll check for 32bit overflow later
10945 
10946 							DWORD iMvaCount = rdMva.GetDword();
10947 							tCurInfo.m_dMVA[i].Reserve ( iMvaCount );
10948 							for ( ; iMvaCount; iMvaCount-- )
10949 							{
10950 								tCurInfo.m_dMVA[i].Add ( rdMva.GetDword() );
10951 							}
10952 						}
10953 
10954 						uMvaID = rdMva.GetDocid();
10955 						if ( !uMvaID )
10956 							uMvaID = DOCID_MAX;
10957 					}
10958 				}
10959 
10960 				tMinMax.Collect ( pEntry, tCurInfo );
10961 
10962 				ARRAY_FOREACH ( i, tCurInfo.m_dMVA )
10963 					tCurInfo.m_dMVA[i].Resize ( 0 );
10964 
10965 				// emit it
10966 				memcpy ( pDocinfo, pEntry, iDocinfoStride*sizeof(DWORD) );
10967 				pDocinfo += iDocinfoStride;
10968 
10969 				uLastId = DOCINFO2ID(pEntry);
10970 
10971 				if ( pDocinfo>=pDocinfoMax )
10972 				{
10973 					int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
10974 
10975 					if ( m_bInplaceSettings )
10976 					{
10977 						if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () )
10978 						{
10979 							iMinBlock = -1;
10980 							ARRAY_FOREACH ( i, dBins )
10981 								if ( !dBins[i]->IsEOF () && ( iMinBlock==-1 || dBins [i]->m_iFilePos<dBins [iMinBlock]->m_iFilePos ) )
10982 									iMinBlock = i;
10983 						}
10984 
10985 						if ( iMinBlock!=-1 && ( iDocinfoWritePos + iLen ) > dBins[iMinBlock]->m_iFilePos )
10986 						{
10987 							if ( !RelocateBlock ( iDocinfoFD, (BYTE*)pRelocationBuffer, iRelocationSize, &iDocinfoFileSize, dBins[iMinBlock], &iSharedOffset ) )
10988 								return 0;
10989 
10990 							iMinBlock = (iMinBlock+1) % dBins.GetLength ();
10991 						}
10992 
10993 						sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
10994 						iSharedOffset = iDocinfoWritePos;
10995 					}
10996 
10997 					if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos, iLen, "sort_docinfo", m_sLastError ) )
10998 						return 0;
10999 
11000 					iDocinfoWritePos += iLen;
11001 					pDocinfo = dDocinfos;
11002 				}
11003 			}
11004 
11005 			// pop its index, update it, push its index again
11006 			qDocinfo.Pop ();
11007 			ESphBinRead eRes = dBins[iBin]->ReadBytes ( pEntry, iDocinfoStride*sizeof(DWORD) );
11008 			if ( eRes==BIN_READ_ERROR )
11009 			{
11010 				m_sLastError.SetSprintf ( "sort_docinfo: failed to read entry" );
11011 				return 0;
11012 			}
11013 			if ( eRes==BIN_READ_OK )
11014 				qDocinfo.Push ( iBin );
11015 		}
11016 
11017 		if ( pDocinfo>dDocinfos )
11018 		{
11019 			assert ( 0==( pDocinfo-dDocinfos ) % iDocinfoStride );
11020 			int iLen = ( pDocinfo - dDocinfos )*sizeof(DWORD);
11021 
11022 			if ( m_bInplaceSettings )
11023 				sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
11024 
11025 			if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos, iLen, "sort_docinfo", m_sLastError ) )
11026 				return 0;
11027 
11028 			if ( m_bInplaceSettings )
11029 				if ( !sphTruncate ( iDocinfoFD ) )
11030 					sphWarn ( "failed to truncate %s", fdDocinfos.GetFilename() );
11031 		}
11032 		tMinMax.FinishCollect();
11033 		if ( !sphWriteThrottled ( iDocinfoFD, &dMinMaxBuffer[0], sizeof(DWORD)*tMinMax.GetActualSize(), "minmax_docinfo", m_sLastError ) )
11034 			return 0;
11035 
11036 		// clean up readers
11037 		ARRAY_FOREACH ( i, dBins )
11038 			SafeDelete ( dBins[i] );
11039 
11040 		dBins.Reset ();
11041 	}
11042 
11043 	dDocinfos.Reset ();
11044 	pDocinfo = NULL;
11045 
11046 	// it might be zero-length, but it must exist
11047 	if ( m_bInplaceSettings )
11048 		fdDocinfos.Close ();
11049 	else
11050 	{
11051 		assert ( pfdDocinfoFinal.Ptr () );
11052 		pfdDocinfoFinal->Close ();
11053 	}
11054 
11055 	// dump killlist
11056 	CSphAutofile fdKillList ( GetIndexFileName("spk"), SPH_O_NEW, m_sLastError );
11057 	if ( fdKillList.GetFD()<0 )
11058 		return 0;
11059 
11060 	if ( dKillList.GetLength () )
11061 	{
11062 		dKillList.Uniq ();
11063 
11064 		m_iKillListSize = dKillList.GetLength ();
11065 
11066 		if ( !sphWriteThrottled ( fdKillList.GetFD (), &dKillList[0], m_iKillListSize*sizeof(SphAttr_t), "kill list", m_sLastError ) )
11067 			return 0;
11068 	}
11069 
11070 	fdKillList.Close ();
11071 
11072 	///////////////////////////////////
11073 	// sort and write compressed index
11074 	///////////////////////////////////
11075 
11076 	PROFILE_BEGIN ( invert_hits );
11077 
11078 	// initialize readers
11079 	assert ( dBins.GetLength()==0 );
11080 	dBins.Reserve ( dHitBlocks.GetLength() );
11081 
11082 	iSharedOffset = -1;
11083 
11084 	float fReadFactor = 1.0f;
11085 	int iRelocationSize = 0;
11086 	iWriteBuffer = m_iWriteBuffer;
11087 
11088 	if ( m_bInplaceSettings )
11089 	{
11090 		assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
11091 		assert ( m_fWriteFactor > 0.005f && m_fWriteFactor < 0.95f );
11092 		assert ( m_fWriteFactor+m_fRelocFactor < 1.0f );
11093 
11094 		fReadFactor -= m_fRelocFactor + m_fWriteFactor;
11095 
11096 		iRelocationSize = int ( iMemoryLimit * m_fRelocFactor );
11097 		iWriteBuffer = int ( iMemoryLimit * m_fWriteFactor );
11098 	}
11099 
11100 	int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ), dHitBlocks.GetLength() + m_bWordDict, "sort_hits" );
11101 
11102 	CSphAutoArray <BYTE> pRelocationBuffer ( iRelocationSize );
11103 	iSharedOffset = -1;
11104 
11105 	ARRAY_FOREACH ( i, dHitBlocks )
11106 	{
11107 		dBins.Add ( new CSphBin ( m_tSettings.m_eHitless, m_pDict->GetSettings().m_bWordDict ) );
11108 		dBins[i]->m_iFileLeft = dHitBlocks[i];
11109 		dBins[i]->m_iFilePos = ( i==0 ) ? iHitsGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
11110 		dBins[i]->Init ( fdHits.GetFD(), &iSharedOffset, iBinSize );
11111 	}
11112 
11113 	// if there were no hits, create zero-length index files
11114 	int iRawBlocks = dBins.GetLength();
11115 
11116 	//////////////////////////////
11117 	// create new index files set
11118 	//////////////////////////////
11119 
11120 	// doclist and hitlist files
11121 	m_wrDoclist.CloseFile ();
11122 	m_wrHitlist.CloseFile ();
11123 
11124 	m_wrDoclist.SetBufferSize ( m_iWriteBuffer );
11125 	m_wrHitlist.SetBufferSize ( m_bInplaceSettings ? iWriteBuffer : m_iWriteBuffer );
11126 
11127 	if ( !m_wrDoclist.OpenFile ( GetIndexFileName("spd"), m_sLastError ) )
11128 		return 0;
11129 
11130 	if ( m_bInplaceSettings )
11131 	{
11132 		sphSeek ( fdHits.GetFD(), 0, SEEK_SET );
11133 		m_wrHitlist.SetFile ( fdHits, &iSharedOffset, m_sLastError );
11134 	} else
11135 		if ( !m_wrHitlist.OpenFile ( GetIndexFileName("spp"), m_sLastError ) )
11136 			return 0;
11137 
11138 	// put dummy byte (otherwise offset would start from 0, first delta would be 0
11139 	// and VLB encoding of offsets would fuckup)
11140 	BYTE bDummy = 1;
11141 	m_wrDoclist.PutBytes ( &bDummy, 1 );
11142 	m_wrHitlist.PutBytes ( &bDummy, 1 );
11143 
11144 	// dict files
11145 	CSphAutofile fdTmpDict ( GetIndexFileName("tmp8"), SPH_O_NEW, m_sLastError, true );
11146 	CSphAutofile fdDict ( GetIndexFileName("spi"), SPH_O_NEW, m_sLastError, false );
11147 	if ( fdTmpDict.GetFD()<0 || fdDict.GetFD()<0 )
11148 		return 0;
11149 	m_pDict->DictBegin ( fdTmpDict, fdDict, iBinSize );
11150 
11151 	// adjust min IDs, and fill header
11152 	assert ( m_pMin->m_iDocID>0 );
11153 	m_pMin->m_iDocID--;
11154 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
11155 		for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
11156 			m_pMin->m_pDynamic[i]--;
11157 
11158 	//////////////
11159 	// final sort
11160 	//////////////
11161 
11162 	if ( iRawBlocks )
11163 	{
11164 		int iLastBin = dBins.GetLength () - 1;
11165 		SphOffset_t iHitFileSize = dBins[iLastBin]->m_iFilePos + dBins [iLastBin]->m_iFileLeft;
11166 
11167 		CSphHitQueue tQueue ( iRawBlocks );
11168 		CSphAggregateHit tHit;
11169 
11170 		// initialize hitlist encoder state
11171 		m_tLastHit.m_iDocID = 0;
11172 		m_tLastHit.m_iWordID = 0;
11173 		m_tLastHit.m_iWordPos = EMPTY_HIT;
11174 		m_tLastHit.m_sKeyword = m_sLastKeyword;
11175 
11176 		// initial fill
11177 		int iRowitems = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ) ? m_tSchema.GetRowSize() : 0;
11178 		CSphAutoArray<CSphRowitem> dInlineAttrs ( iRawBlocks*iRowitems );
11179 
11180 		int * bActive = new int [ iRawBlocks ];
11181 		for ( int i=0; i<iRawBlocks; i++ )
11182 		{
11183 			if ( !dBins[i]->ReadHit ( &tHit, iRowitems, dInlineAttrs+i*iRowitems ) )
11184 			{
11185 				m_sLastError.SetSprintf ( "sort_hits: warmup failed (io error?)" );
11186 				return 0;
11187 			}
11188 			bActive[i] = ( tHit.m_iWordID!=0 );
11189 			if ( bActive[i] )
11190 				tQueue.Push ( tHit, i );
11191 		}
11192 
11193 		// init progress meter
11194 		m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT;
11195 		m_tProgress.m_iHits = 0;
11196 
11197 		// while the queue has data for us
11198 		// FIXME! analyze binsRead return code
11199 		int iHitsSorted = 0;
11200 		iMinBlock = -1;
11201 		while ( tQueue.m_iUsed )
11202 		{
11203 			int iBin = tQueue.m_pData->m_iBin;
11204 
11205 			// pack and emit queue root
11206 			tQueue.m_pData->m_iDocID -= m_pMin->m_iDocID;
11207 
11208 			if ( m_bInplaceSettings )
11209 			{
11210 				if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () || !bActive[iMinBlock] )
11211 				{
11212 					iMinBlock = -1;
11213 					ARRAY_FOREACH ( i, dBins )
11214 						if ( !dBins[i]->IsEOF () && bActive[i] && ( iMinBlock==-1 || dBins[i]->m_iFilePos < dBins[iMinBlock]->m_iFilePos ) )
11215 							iMinBlock = i;
11216 				}
11217 
11218 				int iToWriteMax = 3*sizeof(DWORD);
11219 				if ( iMinBlock!=-1 && ( m_wrHitlist.GetPos () + iToWriteMax ) > dBins[iMinBlock]->m_iFilePos )
11220 				{
11221 					if ( !RelocateBlock ( fdHits.GetFD (), (BYTE*)pRelocationBuffer, iRelocationSize, &iHitFileSize, dBins[iMinBlock], &iSharedOffset ) )
11222 						return 0;
11223 
11224 					iMinBlock = (iMinBlock+1) % dBins.GetLength ();
11225 				}
11226 			}
11227 
11228 			cidxHit ( tQueue.m_pData, iRowitems ? dInlineAttrs+iBin*iRowitems : NULL );
11229 			if ( m_pDict->DictIsError() || m_wrDoclist.IsError() || m_wrHitlist.IsError() )
11230 				return 0;
11231 
11232 			// pop queue root and push next hit from popped bin
11233 			tQueue.Pop ();
11234 			if ( bActive[iBin] )
11235 			{
11236 				dBins[iBin]->ReadHit ( &tHit, iRowitems, dInlineAttrs+iBin*iRowitems );
11237 				bActive[iBin] = ( tHit.m_iWordID!=0 );
11238 				if ( bActive[iBin] )
11239 					tQueue.Push ( tHit, iBin );
11240 			}
11241 
11242 			// progress
11243 			if ( m_pProgress && ++iHitsSorted==1000000 )
11244 			{
11245 				m_tProgress.m_iHits += iHitsSorted;
11246 				m_pProgress ( &m_tProgress, false );
11247 				iHitsSorted = 0;
11248 			}
11249 		}
11250 
11251 		if ( m_pProgress )
11252 		{
11253 			m_tProgress.m_iHits = m_tProgress.m_iHitsTotal; // sum might be less than total because of dupes!
11254 			m_pProgress ( &m_tProgress, true );
11255 		}
11256 
11257 		// cleanup
11258 		SafeDeleteArray ( bActive );
11259 
11260 		ARRAY_FOREACH ( i, dBins )
11261 			SafeDelete ( dBins[i] );
11262 		dBins.Reset ();
11263 
11264 		CSphAggregateHit tFlush;
11265 		tFlush.m_iDocID = 0;
11266 		tFlush.m_iWordID = 0;
11267 		tFlush.m_sKeyword = NULL;
11268 		tFlush.m_iWordPos = EMPTY_HIT;
11269 		tFlush.m_dFieldMask.Unset();
11270 		cidxHit ( &tFlush, NULL );
11271 
11272 		if ( m_bInplaceSettings )
11273 		{
11274 			m_wrHitlist.CloseFile ();
11275 			if ( !sphTruncate ( fdHits.GetFD () ) )
11276 				sphWarn ( "failed to truncate %s", fdHits.GetFilename() );
11277 		}
11278 	}
11279 
11280 	if ( iDupes )
11281 		sphWarn ( "%d duplicate document id pairs found", iDupes );
11282 
11283 	PROFILE_END ( invert_hits );
11284 
11285 	// we're done
11286 	if ( !cidxDone ( "sph", iMemoryLimit ) )
11287 		return 0;
11288 
11289 	// when the party's over..
11290 	ARRAY_FOREACH ( i, dSources )
11291 		dSources[i]->PostIndex ();
11292 
11293 	PROFILER_DONE ();
11294 	PROFILE_SHOW ();
11295 	dFileWatchdog.AllIsDone();
11296 	return 1;
11297 } // NOLINT function length
11298 
11299 
CopyFile(const char * sSrc,const char * sDst,CSphString & sErrStr)11300 static bool CopyFile ( const char * sSrc, const char * sDst, CSphString & sErrStr )
11301 {
11302 	assert ( sSrc );
11303 	assert ( sDst );
11304 
11305 	const DWORD iMaxBufSize = 1024 * 1024;
11306 
11307 	CSphAutofile tSrcFile ( sSrc, SPH_O_READ, sErrStr );
11308 	CSphAutofile tDstFile ( sDst, SPH_O_NEW, sErrStr );
11309 
11310 	if ( tSrcFile.GetFD()<0 || tDstFile.GetFD()<0 )
11311 		return false;
11312 
11313 	SphOffset_t iFileSize = tSrcFile.GetSize();
11314 	DWORD iBufSize = (DWORD) Min ( iFileSize, (SphOffset_t)iMaxBufSize );
11315 
11316 	if ( iFileSize )
11317 	{
11318 		BYTE * pData = new BYTE[iBufSize];
11319 
11320 		if ( !pData )
11321 		{
11322 			sErrStr.SetSprintf ( "memory allocation error" );
11323 			return false;
11324 		}
11325 
11326 		bool bError = true;
11327 
11328 		while ( iFileSize > 0 )
11329 		{
11330 			DWORD iSize = (DWORD) Min ( iFileSize, (SphOffset_t)iBufSize );
11331 
11332 			if ( !tSrcFile.Read ( pData, iSize, sErrStr ) )
11333 				break;
11334 
11335 			if ( !sphWriteThrottled ( tDstFile.GetFD(), pData, iSize, "CopyFile", sErrStr ) )
11336 				break;
11337 
11338 			iFileSize -= iSize;
11339 
11340 			if ( !iFileSize )
11341 				bError = false;
11342 		}
11343 
11344 		SafeDeleteArray ( pData );
11345 		return ( bError==false );
11346 	}
11347 
11348 	return true;
11349 }
11350 
11351 
CopyStringAttr(CSphWriter & wrTo,CSphReader & rdFrom,SphAttr_t uOffset)11352 SphAttr_t CopyStringAttr ( CSphWriter & wrTo, CSphReader & rdFrom, SphAttr_t uOffset )
11353 {
11354 	// magic offset? do nothing
11355 	if ( !uOffset )
11356 		return 0;
11357 
11358 	// aim
11359 	rdFrom.SeekTo ( uOffset, 0 );
11360 
11361 	// read and decode length
11362 	// MUST be in sync with sphUnpackStr
11363 	int iLen = rdFrom.GetByte ();
11364 	if ( iLen & 0x80 )
11365 	{
11366 		if ( iLen & 0x40 )
11367 		{
11368 			iLen = ( (int)( iLen & 0x3f )<<16 ) + ( rdFrom.GetByte()<<8 );
11369 			iLen += rdFrom.GetByte(); // MUST be separate statement; cf. sequence point
11370 		} else
11371 		{
11372 			iLen = ( (int)( iLen & 0x3f )<<8 ) + rdFrom.GetByte();
11373 		}
11374 	}
11375 
11376 	// no data? do nothing
11377 	if ( !iLen )
11378 		return 0;
11379 
11380 	// copy bytes
11381 	uOffset = (SphAttr_t) wrTo.GetPos(); // FIXME! check bounds?
11382 
11383 	BYTE dLen[4];
11384 	wrTo.PutBytes ( dLen, sphPackStrlen ( dLen, iLen ) );
11385 
11386 	while ( iLen>0 )
11387 	{
11388 		const BYTE * pBuf = NULL;
11389 		int iChunk = rdFrom.GetBytesZerocopy ( &pBuf, iLen );
11390 		wrTo.PutBytes ( pBuf, iChunk );
11391 		iLen -= iChunk;
11392 	}
11393 
11394 	return uOffset;
11395 }
11396 
11397 
11398 static const int DOCLIST_HINT_THRESH = 256;
11399 
DoclistHintUnpack(int iDocs,BYTE uHint)11400 static int DoclistHintUnpack ( int iDocs, BYTE uHint )
11401 {
11402 	if ( iDocs<DOCLIST_HINT_THRESH )
11403 		return 8*iDocs;
11404 	else
11405 		return 4*iDocs + (int)( int64_t(iDocs)*uHint/64 );
11406 }
11407 
sphDoclistHintPack(SphOffset_t iDocs,SphOffset_t iLen)11408 BYTE sphDoclistHintPack ( SphOffset_t iDocs, SphOffset_t iLen )
11409 {
11410 	// we won't really store a hint for small lists
11411 	if ( iDocs<DOCLIST_HINT_THRESH )
11412 		return 0;
11413 
11414 	// for bigger lists len/docs varies 4x-6x on test indexes
11415 	// so lets assume that 4x-8x should be enough for everybody
11416 	SphOffset_t iDelta = Min ( Max ( iLen-4*iDocs, 0 ), 4*iDocs-1 ); // len delta over 4x, clamped to [0x..4x) range
11417 	BYTE uHint = (BYTE)( 64*iDelta/iDocs ); // hint now must be in [0..256) range
11418 	while ( uHint<255 && ( iDocs*uHint/64 )<iDelta ) // roundoff (suddenly, my guru math skillz failed me)
11419 		uHint++;
11420 
11421 	return uHint;
11422 }
11423 
11424 // !COMMIT eliminate this, move to dict (or at least couple with CWordlist)
11425 class CSphDictReader
11426 {
11427 public:
11428 	// current word
11429 	SphWordID_t		m_iWordID;
11430 	SphOffset_t		m_iDoclistOffset;
11431 	int				m_iDocs;
11432 	int				m_iHits;
11433 	bool			m_bHasHitlist;
11434 	int				m_iHint;
11435 
11436 private:
11437 	ESphHitless		m_eHitless;
11438 	CSphReader	m_tReader;
11439 	CSphAutofile	m_tFile;
11440 	SphOffset_t		m_iMaxPos;
11441 
11442 	CSphDict *		m_pDict;	///< only used in dict=keywords case, NULL in dict=crc case
11443 	char			m_sWord[MAX_KEYWORD_BYTES];
11444 
11445 public:
CSphDictReader()11446 	CSphDictReader()
11447 		: m_iWordID ( 0 )
11448 		, m_iDoclistOffset ( 0 )
11449 		, m_iHint ( 0 )
11450 		, m_iMaxPos ( 0 )
11451 		, m_pDict ( NULL )
11452 	{
11453 		m_sWord[0] = '\0';
11454 	}
11455 
Setup(const CSphString & sFilename,SphOffset_t iMaxPos,ESphHitless eHitless,CSphString & sError,CSphDict * pDict)11456 	void Setup ( const CSphString & sFilename, SphOffset_t iMaxPos, ESphHitless eHitless, CSphString & sError, CSphDict * pDict )
11457 	{
11458 		m_iMaxPos = iMaxPos;
11459 		m_tFile.Open ( sFilename, SPH_O_READ, sError );
11460 		m_tReader.SetFile ( m_tFile );
11461 		m_tReader.SeekTo ( 1, READ_NO_SIZE_HINT );
11462 		m_eHitless = eHitless;
11463 		m_pDict = pDict;
11464 
11465 		m_sWord[0] = '\0';
11466 	}
11467 
Read()11468 	bool Read()
11469 	{
11470 		if ( m_tReader.GetPos()>=m_iMaxPos )
11471 			return false;
11472 
11473 		// get leading value
11474 		SphWordID_t iWord0 = m_pDict ? m_tReader.GetByte() : m_tReader.UnzipWordid();
11475 		if ( !iWord0 )
11476 		{
11477 			// handle checkpoint
11478 			m_tReader.UnzipOffset();
11479 
11480 			m_iWordID = 0;
11481 			m_iDoclistOffset = 0;
11482 			m_sWord[0] = '\0';
11483 
11484 			if ( m_tReader.GetPos()>=m_iMaxPos )
11485 				return false;
11486 
11487 			iWord0 = m_pDict ? m_tReader.GetByte() : m_tReader.UnzipWordid(); // get next word
11488 		}
11489 		if ( !iWord0 )
11490 			return false; // some failure
11491 
11492 		// get word entry
11493 		if ( m_pDict )
11494 		{
11495 			// unpack next word
11496 			// must be in sync with DictEnd()!
11497 			assert ( iWord0<=255 );
11498 			BYTE uPack = (BYTE) iWord0;
11499 
11500 			int iMatch, iDelta;
11501 			if ( uPack & 0x80 )
11502 			{
11503 				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
11504 				iMatch = uPack & 15;
11505 			} else
11506 			{
11507 				iDelta = uPack & 127;
11508 				iMatch = m_tReader.GetByte();
11509 			}
11510 			assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
11511 			assert ( iMatch<=(int)strlen(m_sWord) );
11512 
11513 			m_tReader.GetBytes ( m_sWord + iMatch, iDelta );
11514 			m_sWord [ iMatch+iDelta ] = '\0';
11515 
11516 			m_iDoclistOffset = m_tReader.UnzipOffset();
11517 			m_iDocs = m_tReader.UnzipInt();
11518 			m_iHits = m_tReader.UnzipInt();
11519 			m_iHint = 0;
11520 			if ( m_iDocs>=DOCLIST_HINT_THRESH )
11521 				m_iHint = m_tReader.GetByte();
11522 			DoclistHintUnpack ( m_iDocs, (BYTE) m_iHint );
11523 
11524 			m_iWordID = (SphWordID_t) sphCRC32 ( GetWord() ); // set wordID for indexing
11525 
11526 		} else
11527 		{
11528 			m_iWordID += iWord0;
11529 			m_iDoclistOffset += m_tReader.UnzipOffset();
11530 			m_iDocs = m_tReader.UnzipInt();
11531 			m_iHits = m_tReader.UnzipInt();
11532 		}
11533 
11534 			m_bHasHitlist =
11535 				( m_eHitless==SPH_HITLESS_NONE ) ||
11536 				( m_eHitless==SPH_HITLESS_SOME && !( m_iDocs & 0x80000000 ) );
11537 			m_iDocs = m_eHitless==SPH_HITLESS_SOME ? ( m_iDocs & 0x7FFFFFFF ) : m_iDocs;
11538 
11539 		return true; // FIXME? errorflag?
11540 	}
11541 
CmpWord(const CSphDictReader & tOther) const11542 	int CmpWord ( const CSphDictReader & tOther ) const
11543 	{
11544 		if ( m_pDict )
11545 			return strcmp ( m_sWord, tOther.m_sWord );
11546 
11547 		int iRes = 0;
11548 		iRes = m_iWordID<tOther.m_iWordID ? -1 : iRes;
11549 		iRes = m_iWordID>tOther.m_iWordID ? 1 : iRes;
11550 		return iRes;
11551 	}
11552 
GetWord() const11553 	BYTE * GetWord () const { return (BYTE *)m_sWord; }
11554 };
11555 
CreateMergeFilters(CSphVector<CSphFilterSettings> & dSettings,const CSphSchema & tSchema,const DWORD * pMvaPool)11556 static ISphFilter * CreateMergeFilters ( CSphVector<CSphFilterSettings> & dSettings, const CSphSchema & tSchema, const DWORD * pMvaPool )
11557 {
11558 	CSphString sError;
11559 	ISphFilter * pResult = NULL;
11560 	ARRAY_FOREACH ( i, dSettings )
11561 	{
11562 		ISphFilter * pFilter = sphCreateFilter ( dSettings[i], tSchema, pMvaPool, sError );
11563 		if ( pFilter )
11564 			pResult = sphJoinFilters ( pResult, pFilter );
11565 	}
11566 	return pResult;
11567 }
11568 
11569 class CSphMerger
11570 {
11571 private:
11572 	CSphIndex_VLN * m_pOutputIndex;
11573 
11574 public:
CSphMerger(CSphIndex_VLN * pOutputIndex)11575 	explicit CSphMerger ( CSphIndex_VLN * pOutputIndex )
11576 		: m_pOutputIndex ( pOutputIndex )
11577 	{}
11578 	template < typename QWORD > static inline
PrepareQword(QWORD & tQword,const CSphDictReader & tReader,int iDynamic,SphDocID_t iMinID,bool bWordDict)11579 	void PrepareQword ( QWORD & tQword, const CSphDictReader & tReader, int iDynamic, SphDocID_t iMinID, bool bWordDict ) //NOLINT
11580 	{
11581 		tQword.m_tDoc.Reset ( iDynamic );
11582 		tQword.m_iMinID = iMinID;
11583 		tQword.m_tDoc.m_iDocID = iMinID;
11584 
11585 		tQword.m_iDocs = tReader.m_iDocs;
11586 		tQword.m_iHits = tReader.m_iHits;
11587 		tQword.m_bHasHitlist = tReader.m_bHasHitlist;
11588 
11589 		tQword.m_uHitPosition = 0;
11590 		tQword.m_iHitlistPos = 0;
11591 
11592 		if ( bWordDict )
11593 			tQword.m_rdDoclist.SeekTo ( tReader.m_iDoclistOffset, tReader.m_iHint );
11594 	}
11595 
11596 	template < typename QWORD >
NextDocument(QWORD & tQword,CSphIndex_VLN * pSourceIndex,CSphRowitem * pInline,ISphFilter * pFilter)11597 	static inline bool NextDocument ( QWORD & tQword, CSphIndex_VLN * pSourceIndex, CSphRowitem * pInline, ISphFilter * pFilter )
11598 	{
11599 		for ( ;; )
11600 		{
11601 			tQword.GetNextDoc ( pInline );
11602 			if ( tQword.m_tDoc.m_iDocID )
11603 			{
11604 				tQword.SeekHitlist ( tQword.m_iHitlistPos );
11605 				if ( pFilter )
11606 				{
11607 					CSphMatch tMatch;
11608 					tMatch.m_iDocID = tQword.m_tDoc.m_iDocID;
11609 					if ( pFilter->UsesAttrs() )
11610 					{
11611 						if ( pInline )
11612 							tMatch.m_pDynamic = pInline;
11613 						else
11614 						{
11615 							const DWORD * pInfo = pSourceIndex->FindDocinfo ( tQword.m_tDoc.m_iDocID );
11616 							tMatch.m_pStatic = pInfo?DOCINFO2ATTRS ( pInfo ):NULL;
11617 						}
11618 					}
11619 					bool bResult = pFilter->Eval ( tMatch );
11620 					tMatch.m_pDynamic = NULL;
11621 					if ( !bResult )
11622 					{
11623 						while ( tQword.m_bHasHitlist && tQword.GetNextHit()!=EMPTY_HIT );
11624 						continue;
11625 					}
11626 				}
11627 				return true;
11628 			} else
11629 				return false;
11630 		}
11631 	}
11632 
11633 	template < typename QWORD >
TransferData(QWORD & tQword,SphWordID_t iWordID,BYTE * sWord,CSphIndex_VLN * pSourceIndex,CSphRowitem * pInline,ISphFilter * pFilter)11634 	inline void TransferData ( QWORD & tQword, SphWordID_t iWordID, BYTE * sWord, CSphIndex_VLN * pSourceIndex, CSphRowitem * pInline, ISphFilter * pFilter )
11635 	{
11636 		CSphAggregateHit tHit;
11637 		tHit.m_iWordID = iWordID;
11638 		tHit.m_sKeyword = sWord;
11639 		tHit.m_dFieldMask.Unset();
11640 
11641 		while ( CSphMerger::NextDocument ( tQword, pSourceIndex, pInline, pFilter ) )
11642 		{
11643 			if ( tQword.m_bHasHitlist )
11644 				TransferHits ( tQword, pInline, tHit );
11645 			else
11646 			{
11647 				// convert to aggregate if there is no hit-list
11648 				tHit.m_iDocID = tQword.m_tDoc.m_iDocID - m_pOutputIndex->m_pMin->m_iDocID;
11649 				tHit.m_dFieldMask = tQword.m_dQwordFields;
11650 				tHit.SetAggrCount ( tQword.m_uMatchHits );
11651 				m_pOutputIndex->cidxHit ( &tHit, pInline );
11652 			}
11653 		}
11654 	}
11655 
11656 	template < typename QWORD >
TransferHits(QWORD & tQword,CSphRowitem * pInline,CSphAggregateHit & tHit)11657 	inline void TransferHits ( QWORD & tQword, CSphRowitem * pInline, CSphAggregateHit & tHit )
11658 	{
11659 		assert ( tQword.m_bHasHitlist );
11660 		tHit.m_iDocID = tQword.m_tDoc.m_iDocID - m_pOutputIndex->m_pMin->m_iDocID;
11661 		for ( Hitpos_t uHit = tQword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tQword.GetNextHit() )
11662 		{
11663 			tHit.m_iWordPos = uHit;
11664 			m_pOutputIndex->cidxHit ( &tHit, pInline );
11665 		}
11666 	}
11667 
11668 	template < typename QWORD >
ConfigureQword(QWORD & tQword,CSphAutofile & tHits,CSphAutofile & tDocs,CSphIndex_VLN * pIndex)11669 	static inline void ConfigureQword ( QWORD & tQword, CSphAutofile & tHits, CSphAutofile & tDocs, CSphIndex_VLN * pIndex )
11670 	{
11671 		bool bInline = pIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE;
11672 
11673 		tQword.m_iInlineAttrs = bInline ? pIndex->m_tSchema.GetDynamicSize() : 0;
11674 		tQword.m_pInlineFixup = bInline ? pIndex->m_pMin->m_pDynamic : NULL;
11675 
11676 		tQword.m_rdHitlist.SetFile ( tHits );
11677 		tQword.m_rdHitlist.GetByte();
11678 
11679 		tQword.m_rdDoclist.SetFile ( tDocs );
11680 		tQword.m_rdDoclist.GetByte();
11681 	}
11682 };
11683 
11684 
11685 template < typename QWORDDST, typename QWORDSRC >
MergeWords(CSphIndex_VLN * pSrcIndex,ISphFilter * pFilter)11686 bool CSphIndex_VLN::MergeWords ( CSphIndex_VLN * pSrcIndex, ISphFilter * pFilter )
11687 {
11688 	assert ( m_pDict->GetSettings().m_bWordDict==pSrcIndex->m_pDict->GetSettings().m_bWordDict );
11689 
11690 	// setup writers
11691 	m_wrDoclist.OpenFile ( GetIndexFileName("spd.tmp"), m_sLastError );
11692 	m_wrHitlist.OpenFile ( GetIndexFileName("spp.tmp"), m_sLastError );
11693 
11694 	BYTE bDummy = 1;
11695 	m_wrDoclist.PutBytes ( &bDummy, 1 );
11696 	m_wrHitlist.PutBytes ( &bDummy, 1 );
11697 
11698 	m_pDict->HitblockBegin();
11699 
11700 	CSphDictReader tDstReader;
11701 	CSphDictReader tSrcReader;
11702 
11703 	const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
11704 
11705 	tDstReader.Setup ( GetIndexFileName("spi"), m_tWordlist.m_iCheckpointsPos,
11706 		m_tSettings.m_eHitless, m_sLastError, ( bWordDict ? m_pDict : NULL ) );
11707 	tSrcReader.Setup ( pSrcIndex->GetIndexFileName("spi"), pSrcIndex->m_tWordlist.m_iCheckpointsPos,
11708 		pSrcIndex->m_tSettings.m_eHitless, m_sLastError, ( bWordDict ? m_pDict : NULL ) );
11709 
11710 	if ( !m_sLastError.IsEmpty() )
11711 		return false;
11712 
11713 	/// prepare for indexing
11714 	m_tLastHit.m_iDocID = 0;
11715 	m_tLastHit.m_iWordID = 0;
11716 	m_tLastHit.m_sKeyword = m_sLastKeyword;
11717 	m_tLastHit.m_iWordPos = EMPTY_HIT;
11718 
11719 	const SphDocID_t iDstMinID = m_pMin->m_iDocID;
11720 	const SphDocID_t iSrcMinID = pSrcIndex->m_pMin->m_iDocID;
11721 
11722 	// correct infinum might be already set during spa merging.
11723 	if ( !m_iMergeInfinum )
11724 		m_pMin->m_iDocID = Min ( iDstMinID, iSrcMinID );
11725 	else
11726 		m_pMin->m_iDocID = m_iMergeInfinum;
11727 
11728 	m_tWordlist.m_dCheckpoints.Reset ( 0 );
11729 
11730 	const int iDstDynamic = m_tSchema.GetDynamicSize();
11731 	const int iSrcDynamic = pSrcIndex->m_tSchema.GetDynamicSize();
11732 
11733 	/// setup qwords
11734 
11735 	QWORDDST tDstQword ( false, false );
11736 	QWORDSRC tSrcQword ( false, false );
11737 
11738 	CSphAutofile fSrcDocs, fSrcHits;
11739 	fSrcDocs.Open ( pSrcIndex->GetIndexFileName("spd"), SPH_O_READ, m_sLastError );
11740 	fSrcHits.Open ( pSrcIndex->GetIndexFileName("spp"), SPH_O_READ, m_sLastError );
11741 
11742 	CSphAutofile fDstDocs, fDstHits;
11743 	fDstDocs.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError );
11744 	fDstHits.Open ( GetIndexFileName("spp"), SPH_O_READ, m_sLastError );
11745 
11746 	if ( !m_sLastError.IsEmpty() )
11747 		return false;
11748 
11749 	CSphMerger::ConfigureQword<QWORDDST> ( tDstQword, fDstHits, fDstDocs, this );
11750 	CSphMerger::ConfigureQword<QWORDSRC> ( tSrcQword, fSrcHits, fSrcDocs, pSrcIndex );
11751 
11752 	int iDstInlineSize = m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? m_tSchema.GetRowSize() : 0;
11753 	int iSrcInlineSize = pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? pSrcIndex->m_tSchema.GetRowSize() : 0;
11754 
11755 	CSphAutoArray<CSphRowitem> dDstInline ( iDstInlineSize );
11756 	CSphAutoArray<CSphRowitem> dSrcInline ( iSrcInlineSize );
11757 
11758 	/// merge
11759 
11760 	CSphMerger tMerge(this);
11761 
11762 	bool bDstWord = tDstReader.Read();
11763 	bool bSrcWord = tSrcReader.Read();
11764 
11765 	if ( m_pProgress )
11766 	{
11767 		m_tProgress.m_ePhase = CSphIndexProgress::PHASE_MERGE;
11768 		m_pProgress ( &m_tProgress, false );
11769 	}
11770 
11771 	int iWords = 0;
11772 	int iHitlistsDiscarded = 0;
11773 	for ( ; bDstWord || bSrcWord; iWords++ )
11774 	{
11775 		if ( m_pProgress && iWords==1000 )
11776 		{
11777 			m_tProgress.m_iWords += 1000;
11778 			iWords = 0;
11779 			m_pProgress ( &m_tProgress, false );
11780 		}
11781 
11782 		const int iCmp = tDstReader.CmpWord ( tSrcReader );
11783 
11784 		if ( !bSrcWord || ( bDstWord && iCmp<0 ) )
11785 		{
11786 			// transfer documents and hits from destination
11787 			CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, iDstDynamic, iDstMinID, bWordDict );
11788 			tMerge.TransferData<QWORDDST> ( tDstQword, tDstReader.m_iWordID, tDstReader.GetWord(), this, dDstInline, pFilter );
11789 			bDstWord = tDstReader.Read();
11790 
11791 		} else if ( !bDstWord || ( bSrcWord && iCmp>0 ) )
11792 		{
11793 			// transfer documents and hits from source
11794 			CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, iSrcDynamic, iSrcMinID, bWordDict );
11795 			tMerge.TransferData<QWORDSRC> ( tSrcQword, tSrcReader.m_iWordID, tSrcReader.GetWord(), pSrcIndex, dSrcInline, NULL );
11796 			bSrcWord = tSrcReader.Read();
11797 
11798 		} else // merge documents and hits inside the word
11799 		{
11800 			assert ( iCmp==0 );
11801 
11802 			bool bHitless = !tDstReader.m_bHasHitlist;
11803 			if ( tDstReader.m_bHasHitlist!=tSrcReader.m_bHasHitlist )
11804 			{
11805 				iHitlistsDiscarded++;
11806 				bHitless = true;
11807 			}
11808 
11809 			CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, iDstDynamic, iDstMinID, bWordDict );
11810 			CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, iSrcDynamic, iSrcMinID, bWordDict );
11811 
11812 			CSphAggregateHit tHit;
11813 			tHit.m_iWordID = tDstReader.m_iWordID; // !COMMIT m_sKeyword anyone?
11814 			tHit.m_sKeyword = tDstReader.GetWord();
11815 			tHit.m_dFieldMask.Unset();
11816 
11817 			bool bDstDocs = CSphMerger::NextDocument ( tDstQword, this, dDstInline, pFilter );
11818 			bool bSrcDocs = true;
11819 
11820 			tSrcQword.GetNextDoc ( dSrcInline );
11821 			tSrcQword.SeekHitlist ( tSrcQword.m_iHitlistPos );
11822 
11823 			while ( bDstDocs || bSrcDocs )
11824 			{
11825 				if ( !bSrcDocs || ( bDstDocs && tDstQword.m_tDoc.m_iDocID < tSrcQword.m_tDoc.m_iDocID ) )
11826 				{
11827 					// transfer hits from destination
11828 					if ( bHitless )
11829 					{
11830 						while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
11831 
11832 						tHit.m_iDocID = tDstQword.m_tDoc.m_iDocID - m_pMin->m_iDocID;
11833 						tHit.m_dFieldMask = tDstQword.m_dQwordFields;
11834 						tHit.SetAggrCount ( tDstQword.m_uMatchHits );
11835 						cidxHit ( &tHit, dSrcInline );
11836 					} else
11837 						tMerge.TransferHits ( tDstQword, dDstInline, tHit );
11838 					bDstDocs = CSphMerger::NextDocument ( tDstQword, this, dDstInline, pFilter );
11839 
11840 				} else if ( !bDstDocs || ( bSrcDocs && tDstQword.m_tDoc.m_iDocID > tSrcQword.m_tDoc.m_iDocID ) )
11841 				{
11842 					// transfer hits from source
11843 					if ( bHitless )
11844 					{
11845 						while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
11846 
11847 						tHit.m_iDocID = tSrcQword.m_tDoc.m_iDocID - m_pMin->m_iDocID;
11848 						tHit.m_dFieldMask = tSrcQword.m_dQwordFields;
11849 						tHit.SetAggrCount ( tSrcQword.m_uMatchHits );
11850 						cidxHit ( &tHit, dSrcInline );
11851 					} else
11852 						tMerge.TransferHits ( tSrcQword, dSrcInline, tHit );
11853 					bSrcDocs = CSphMerger::NextDocument ( tSrcQword, pSrcIndex, dSrcInline, NULL );
11854 
11855 				} else
11856 				{
11857 					// merge hits inside the document
11858 					assert ( bDstDocs );
11859 					assert ( bSrcDocs );
11860 					assert ( tDstQword.m_tDoc.m_iDocID==tSrcQword.m_tDoc.m_iDocID );
11861 
11862 					tHit.m_iDocID = tDstQword.m_tDoc.m_iDocID - m_pMin->m_iDocID;
11863 
11864 					if ( bHitless )
11865 					{
11866 						while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
11867 						while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
11868 
11869 						tHit.m_dFieldMask = tDstQword.m_dQwordFields | tSrcQword.m_dQwordFields;
11870 						tHit.SetAggrCount ( tDstQword.m_uMatchHits + tSrcQword.m_uMatchHits );
11871 						cidxHit ( &tHit, dSrcInline );
11872 
11873 					} else
11874 					{
11875 						Hitpos_t uDstHit = tDstQword.GetNextHit();
11876 						Hitpos_t uSrcHit = tSrcQword.GetNextHit();
11877 
11878 						while ( uDstHit!=EMPTY_HIT || uSrcHit!=EMPTY_HIT )
11879 						{
11880 							if ( uSrcHit==EMPTY_HIT || ( uDstHit!=EMPTY_HIT && uDstHit<uSrcHit ) )
11881 							{
11882 								tHit.m_iWordPos = uDstHit;
11883 								cidxHit ( &tHit, dSrcInline );
11884 								uDstHit = tDstQword.GetNextHit();
11885 
11886 							} else if ( uDstHit==EMPTY_HIT || ( uSrcHit!=EMPTY_HIT && uSrcHit<uDstHit ) )
11887 							{
11888 								tHit.m_iWordPos = uSrcHit;
11889 								cidxHit ( &tHit, dSrcInline );
11890 								uSrcHit = tSrcQword.GetNextHit();
11891 
11892 							} else
11893 							{
11894 								assert ( uDstHit==uSrcHit );
11895 
11896 								tHit.m_iWordPos = uDstHit;
11897 								cidxHit ( &tHit, dSrcInline );
11898 
11899 								uDstHit = tDstQword.GetNextHit();
11900 								uSrcHit = tSrcQword.GetNextHit();
11901 							}
11902 						}
11903 					}
11904 
11905 					// next document
11906 					bDstDocs = CSphMerger::NextDocument ( tDstQword, this, dDstInline, pFilter );
11907 					bSrcDocs = CSphMerger::NextDocument ( tSrcQword, pSrcIndex, dSrcInline, NULL );
11908 				}
11909 			}
11910 			// next word
11911 			bDstWord = tDstReader.Read();
11912 			bSrcWord = tSrcReader.Read();
11913 		}
11914 	}
11915 
11916 	m_tStats.m_iTotalDocuments += pSrcIndex->m_tStats.m_iTotalDocuments;
11917 	m_tStats.m_iTotalBytes += pSrcIndex->m_tStats.m_iTotalBytes;
11918 
11919 	if ( m_pProgress )
11920 	{
11921 		m_tProgress.m_iWords += iWords;
11922 		m_pProgress ( &m_tProgress, false );
11923 	}
11924 
11925 	if ( iHitlistsDiscarded )
11926 		m_sLastWarning.SetSprintf ( "discarded hitlists for %u words", iHitlistsDiscarded );
11927 
11928 	return true;
11929 }
11930 
Merge(CSphIndex * pSource,CSphVector<CSphFilterSettings> & dFilters,bool bMergeKillLists)11931 bool CSphIndex_VLN::Merge ( CSphIndex * pSource, CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists )
11932 {
11933 	assert ( pSource );
11934 
11935 	CSphIndex_VLN * pSrcIndex = dynamic_cast< CSphIndex_VLN * >( pSource );
11936 	assert ( pSrcIndex );
11937 
11938 	CSphString sWarning;
11939 	if ( !Prealloc ( false, false, sWarning ) || !Preread() )
11940 		return false;
11941 	if ( !pSrcIndex->Prealloc ( false, false, sWarning ) || !pSrcIndex->Preread() )
11942 	{
11943 		m_sLastError.SetSprintf ( "source index preload failed: %s", pSrcIndex->GetLastError().cstr() );
11944 		return false;
11945 	}
11946 
11947 	const CSphSchema & tDstSchema = m_tSchema;
11948 	const CSphSchema & tSrcSchema = pSrcIndex->m_tSchema;
11949 	if ( !tDstSchema.CompareTo ( tSrcSchema, m_sLastError ) )
11950 		return false;
11951 
11952 	if ( m_tSettings.m_eHitless!=pSrcIndex->m_tSettings.m_eHitless )
11953 	{
11954 		m_sLastError = "hitless settings must be the same on merged indices";
11955 		return false;
11956 	}
11957 
11958 	// FIXME!
11959 	if ( m_tSettings.m_eDocinfo!=pSrcIndex->m_tSettings.m_eDocinfo && !( m_bIsEmpty || pSrcIndex->m_bIsEmpty ) )
11960 	{
11961 		m_sLastError.SetSprintf ( "docinfo storage on non-empty indexes must be the same (dst docinfo %d, empty %d, src docinfo %d, empty %d",
11962 			m_tSettings.m_eDocinfo, m_bIsEmpty, pSrcIndex->m_tSettings.m_eDocinfo, pSrcIndex->m_bIsEmpty );
11963 		return false;
11964 	}
11965 
11966 	if ( m_pDict->GetSettings().m_bWordDict!=pSrcIndex->m_pDict->GetSettings().m_bWordDict )
11967 	{
11968 		m_sLastError.SetSprintf ( "dictionary types must be the same (dst dict=%s, src dict=%s )",
11969 			m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc",
11970 			pSrcIndex->m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc" );
11971 		return false;
11972 	}
11973 	m_bWordDict = m_pDict->GetSettings().m_bWordDict;
11974 	m_bMerging = true;
11975 
11976 	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
11977 
11978 	// create filters
11979 	ISphFilter * pFilter = CreateMergeFilters ( dFilters, m_tSchema, GetMVAPool() );
11980 	DWORD nKillListSize = pSrcIndex->GetKillListSize ();
11981 	if ( nKillListSize )
11982 	{
11983 		CSphFilterSettings tKillListFilter;
11984 		SphAttr_t * pKillList = pSrcIndex->GetKillList ();
11985 
11986 		tKillListFilter.m_bExclude = true;
11987 		tKillListFilter.m_eType = SPH_FILTER_VALUES;
11988 		tKillListFilter.m_iMinValue = pKillList[0];
11989 		tKillListFilter.m_iMaxValue = pKillList[nKillListSize -1];
11990 		tKillListFilter.m_sAttrName = "@id";
11991 		tKillListFilter.SetExternalValues ( pKillList, nKillListSize );
11992 
11993 		ISphFilter * pKillListFilter =
11994 			sphCreateFilter ( tKillListFilter, m_tSchema, GetMVAPool(), m_sLastError );
11995 		pFilter = sphJoinFilters ( pFilter, pKillListFilter );
11996 	}
11997 
11998 	/////////////////////////////////////////
11999 	// merging attributes (.spa, .spm, .sps)
12000 	/////////////////////////////////////////
12001 
12002 	CSphAutoreader tDstSPM, tSrcSPM, tDstSPS, tSrcSPS;
12003 	if ( !tDstSPM.Open ( GetIndexFileName("spm"), m_sLastError )
12004 		|| !tSrcSPM.Open ( pSrcIndex->GetIndexFileName("spm"), m_sLastError )
12005 		|| !tDstSPS.Open ( GetIndexFileName("sps"), m_sLastError )
12006 		|| !tSrcSPS.Open ( pSrcIndex->GetIndexFileName("sps"), m_sLastError ) )
12007 	{
12008 		return false;
12009 	}
12010 
12011 	CSphWriter tSPMWriter, tSPSWriter;
12012 	if ( !tSPMWriter.OpenFile ( GetIndexFileName("spm.tmp"), m_sLastError )
12013 		|| !tSPSWriter.OpenFile ( GetIndexFileName("sps.tmp"), m_sLastError ) )
12014 	{
12015 		return false;
12016 	}
12017 	tSPSWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
12018 
12019 	/// merging
12020 	CSphVector<CSphAttrLocator> dMvaLocators;
12021 	CSphVector<CSphAttrLocator> dStringLocators;
12022 	for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
12023 	{
12024 		const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
12025 		if ( tInfo.m_eAttrType==SPH_ATTR_UINT32SET )
12026 			dMvaLocators.Add ( tInfo.m_tLocator );
12027 		if ( tInfo.m_eAttrType==SPH_ATTR_STRING )
12028 			dStringLocators.Add ( tInfo.m_tLocator );
12029 	}
12030 	for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
12031 	{
12032 		const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
12033 		if ( tInfo.m_eAttrType==SPH_ATTR_INT64SET )
12034 			dMvaLocators.Add ( tInfo.m_tLocator );
12035 	}
12036 
12037 	CSphDocMVA	tDstMVA ( dMvaLocators.GetLength() ), tSrcMVA ( dMvaLocators.GetLength() );
12038 	CSphVector<SphAttr_t> dPhantomKiller;
12039 
12040 	int iTotalDocuments = 0;
12041 	bool bNeedInfinum = true;
12042 	m_iMergeInfinum = 0;
12043 
12044 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
12045 	{
12046 		CSphWriter wrRows;
12047 		if ( !wrRows.OpenFile ( GetIndexFileName("spa.tmp"), m_sLastError ) )
12048 			return false;
12049 
12050 		AttrIndexBuilder_c tMinMax ( m_tSchema );
12051 		CSphVector<DWORD> dMinMaxBuffer ( tMinMax.GetExpectedSize (
12052 			m_tStats.m_iTotalDocuments + pSrcIndex->GetStats().m_iTotalDocuments ) );
12053 		tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() );
12054 		m_uMinMaxIndex = 0;
12055 
12056 		DWORD * pSrcRow = pSrcIndex->m_pDocinfo.GetWritePtr(); // they *can* be null if the respective index is empty
12057 		DWORD * pDstRow = m_pDocinfo.GetWritePtr();
12058 
12059 		DWORD iSrcCount = 0;
12060 		DWORD iDstCount = 0;
12061 
12062 		tDstMVA.Read ( tDstSPM );
12063 		tSrcMVA.Read ( tSrcSPM );
12064 
12065 		CSphMatch tMatch;
12066 		while ( iSrcCount < pSrcIndex->m_uDocinfo || iDstCount < m_uDocinfo )
12067 		{
12068 			SphDocID_t iDstDocID, iSrcDocID;
12069 
12070 			if ( iDstCount < m_uDocinfo )
12071 			{
12072 				iDstDocID = DOCINFO2ID ( pDstRow );
12073 				if ( pFilter )
12074 				{
12075 					tMatch.m_iDocID = iDstDocID;
12076 					tMatch.m_pStatic = reinterpret_cast<CSphRowitem *> ( DOCINFO2ATTRS ( pDstRow ) );
12077 					tMatch.m_pDynamic = NULL;
12078 					if ( !pFilter->Eval ( tMatch ) )
12079 					{
12080 						pDstRow += iStride;
12081 						iDstCount++;
12082 						continue;
12083 					}
12084 				}
12085 			} else
12086 				iDstDocID = 0;
12087 
12088 			if ( iSrcCount < pSrcIndex->m_uDocinfo )
12089 				iSrcDocID = DOCINFO2ID ( pSrcRow );
12090 			else
12091 				iSrcDocID = 0;
12092 
12093 			if ( ( iDstDocID && iDstDocID < iSrcDocID ) || ( iDstDocID && !iSrcDocID ) )
12094 			{
12095 				while ( tDstMVA.m_iDocID && tDstMVA.m_iDocID<iDstDocID )
12096 					tDstMVA.Read ( tDstSPM );
12097 
12098 				if ( tDstMVA.m_iDocID==iDstDocID )
12099 				{
12100 					tDstMVA.Write ( tSPMWriter );
12101 					ARRAY_FOREACH ( i, tDstMVA.m_dMVA )
12102 						sphSetRowAttr ( DOCINFO2ATTRS ( pDstRow ), dMvaLocators[i], tDstMVA.m_dOffsets[i] );
12103 				}
12104 
12105 				ARRAY_FOREACH ( i, dStringLocators )
12106 					sphSetRowAttr ( DOCINFO2ATTRS ( pDstRow ), dStringLocators[i],
12107 						CopyStringAttr ( tSPSWriter, tDstSPS, sphGetRowAttr ( DOCINFO2ATTRS ( pDstRow ), dStringLocators[i] ) ) );
12108 
12109 				wrRows.PutBytes ( pDstRow, sizeof(DWORD)*iStride );
12110 				tMinMax.Collect ( pDstRow, tDstMVA );
12111 				m_uMinMaxIndex += iStride;
12112 				pDstRow += iStride;
12113 				iDstCount++;
12114 				iTotalDocuments++;
12115 				if ( bNeedInfinum )
12116 				{
12117 					bNeedInfinum = false;
12118 					m_iMergeInfinum = iDstDocID - 1;
12119 				}
12120 
12121 			} else if ( iSrcDocID )
12122 			{
12123 				// iSrcDocID<=iDstDocID; in both cases, its src attr values that must win
12124 				while ( tSrcMVA.m_iDocID && tSrcMVA.m_iDocID<iSrcDocID )
12125 					tSrcMVA.Read ( tSrcSPM );
12126 
12127 				if ( tSrcMVA.m_iDocID==iSrcDocID )
12128 				{
12129 					tSrcMVA.Write ( tSPMWriter );
12130 					ARRAY_FOREACH ( i, tSrcMVA.m_dMVA )
12131 						sphSetRowAttr ( DOCINFO2ATTRS ( pSrcRow ), dMvaLocators[i], tSrcMVA.m_dOffsets[i] );
12132 				}
12133 
12134 				ARRAY_FOREACH ( i, dStringLocators )
12135 					sphSetRowAttr ( DOCINFO2ATTRS ( pSrcRow ), dStringLocators[i],
12136 						CopyStringAttr ( tSPSWriter, tSrcSPS, sphGetRowAttr ( DOCINFO2ATTRS ( pSrcRow ), dStringLocators[i] ) ) );
12137 
12138 				wrRows.PutBytes ( pSrcRow, sizeof(DWORD)*iStride );
12139 				tMinMax.Collect ( pSrcRow, tSrcMVA );
12140 				m_uMinMaxIndex += iStride;
12141 				pSrcRow += iStride;
12142 				iSrcCount++;
12143 				iTotalDocuments++;
12144 				if ( bNeedInfinum )
12145 				{
12146 					bNeedInfinum = false;
12147 					m_iMergeInfinum = iSrcDocID - 1;
12148 				}
12149 
12150 				if ( iDstDocID==iSrcDocID )
12151 				{
12152 					dPhantomKiller.Add ( iSrcDocID );
12153 					pDstRow += iStride;
12154 					iDstCount++;
12155 				}
12156 			}
12157 		}
12158 
12159 		if ( iTotalDocuments )
12160 		{
12161 			tMinMax.FinishCollect();
12162 			wrRows.PutBytes ( &dMinMaxBuffer[0], sizeof(DWORD) * tMinMax.GetActualSize() );
12163 		}
12164 		wrRows.CloseFile();
12165 		if ( wrRows.IsError() )
12166 			return false;
12167 
12168 	} else if ( m_bIsEmpty || pSrcIndex->m_bIsEmpty )
12169 	{
12170 		// one of the indexes has no documents; copy the .spa file from the other one
12171 		CSphString sSrc = !m_bIsEmpty ? GetIndexFileName("spa") : pSrcIndex->GetIndexFileName("spa");
12172 		CSphString sDst = GetIndexFileName("spa.tmp");
12173 
12174 		if ( !CopyFile ( sSrc.cstr(), sDst.cstr(), m_sLastError ) )
12175 			return false;
12176 
12177 	} else
12178 	{
12179 		// storage is not extern; create dummy .spa file
12180 		CSphAutofile fdSpa ( GetIndexFileName("spa.tmp"), SPH_O_NEW, m_sLastError );
12181 		fdSpa.Close();
12182 	}
12183 
12184 	// create phantom killlist filter
12185 	if ( dPhantomKiller.GetLength() )
12186 	{
12187 		CSphFilterSettings tKLF;
12188 		tKLF.m_bExclude = true;
12189 		tKLF.m_eType = SPH_FILTER_VALUES;
12190 		tKLF.m_iMinValue = dPhantomKiller[0];
12191 		tKLF.m_iMaxValue = dPhantomKiller.Last();
12192 		tKLF.m_sAttrName = "@id";
12193 		tKLF.SetExternalValues ( &dPhantomKiller[0], dPhantomKiller.GetLength() );
12194 		ISphFilter * pSpaFilter = sphCreateFilter ( tKLF, m_tSchema, GetMVAPool(), m_sLastError );
12195 		pFilter = sphJoinFilters ( pFilter, pSpaFilter );
12196 	}
12197 	CSphScopedPtr<ISphFilter> pScopedFilter ( pFilter );
12198 
12199 	CSphAutofile fdTmpDict ( GetIndexFileName("spi.tmp8"), SPH_O_NEW, m_sLastError, true );
12200 	CSphAutofile fdDict ( GetIndexFileName("spi.tmp"), SPH_O_NEW, m_sLastError );
12201 
12202 	if ( !m_sLastError.IsEmpty() || fdTmpDict.GetFD()<0 || fdDict.GetFD()<0 )
12203 		return false;
12204 
12205 	m_pDict->DictBegin ( fdTmpDict, fdDict, 8*1024*1024 ); // FIXME? is this magic dict block constant any good?..
12206 
12207 	// merge dictionaries, doclists and hitlists
12208 	if ( m_pDict->GetSettings().m_bWordDict )
12209 	{
12210 		WITH_QWORD ( this, false, QwordDst,
12211 			WITH_QWORD ( pSrcIndex, false, QwordSrc,
12212 		{
12213 			if ( !MergeWords < QwordDst, QwordSrc > ( pSrcIndex, pFilter ) )
12214 				return false;
12215 		} ) );
12216 	} else
12217 	{
12218 		WITH_QWORD ( this, true, QwordDst,
12219 			WITH_QWORD ( pSrcIndex, true, QwordSrc,
12220 		{
12221 			if ( !MergeWords < QwordDst, QwordSrc > ( pSrcIndex, pFilter ) )
12222 				return false;
12223 		} ) );
12224 	}
12225 
12226 	if ( iTotalDocuments )
12227 		m_tStats.m_iTotalDocuments = iTotalDocuments;
12228 	// merge kill-lists
12229 	CSphAutofile fdKillList ( GetIndexFileName("spk.tmp"), SPH_O_NEW, m_sLastError );
12230 	if ( fdKillList.GetFD () < 0 )
12231 		return false;
12232 
12233 	if ( bMergeKillLists )
12234 	{
12235 		// merge spk
12236 		CSphVector<SphAttr_t> dKillList;
12237 		dKillList.Reserve ( GetKillListSize() + pSrcIndex->GetKillListSize() );
12238 		for ( int i = 0; i < pSrcIndex->GetKillListSize (); i++ )
12239 			dKillList.Add ( pSrcIndex->GetKillList () [i] );
12240 
12241 		for ( int i = 0; i < GetKillListSize (); i++ )
12242 			dKillList.Add ( GetKillList () [i] );
12243 
12244 		dKillList.Uniq ();
12245 
12246 		m_iKillListSize = dKillList.GetLength ();
12247 
12248 		if ( dKillList.GetLength() )
12249 		{
12250 			if ( !sphWriteThrottled ( fdKillList.GetFD(), &dKillList[0], dKillList.GetLength()*sizeof(SphAttr_t), "kill_list", m_sLastError ) )
12251 				return false;
12252 		}
12253 	} else
12254 	{
12255 		m_iKillListSize = 0;
12256 	}
12257 
12258 	fdKillList.Close ();
12259 
12260 	// finalize
12261 	CSphAggregateHit tFlush;
12262 	tFlush.m_iDocID = 0;
12263 	tFlush.m_iWordID = 0;
12264 	tFlush.m_sKeyword = (BYTE*)""; // tricky: assertion in cidxHit calls strcmp on this in case of empty index!
12265 	tFlush.m_iWordPos = EMPTY_HIT;
12266 	tFlush.m_dFieldMask.Unset();
12267 	cidxHit ( &tFlush, NULL );
12268 	cidxDone ( "sph.tmp", 8*1024*1024 ); // FIXME? is this magic dict block constant any good?..
12269 
12270 	// we're done
12271 	if ( m_pProgress )
12272 		m_pProgress ( &m_tProgress, true );
12273 
12274 	return true;
12275 }
12276 
12277 
12278 /////////////////////////////////////////////////////////////////////////////
12279 // THE SEARCHER
12280 /////////////////////////////////////////////////////////////////////////////
12281 
GetWordID(BYTE * pWord)12282 SphWordID_t CSphDictStar::GetWordID ( BYTE * pWord )
12283 {
12284 	char sBuf [ 16+3*SPH_MAX_WORD_LEN ];
12285 	assert ( strlen ( (const char*)pWord ) < 16+3*SPH_MAX_WORD_LEN );
12286 
12287 	m_pDict->ApplyStemmers ( pWord );
12288 
12289 	int iLen = strlen ( (const char*)pWord );
12290 	assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 1 );
12291 
12292 	memcpy ( sBuf, pWord, iLen+1 );
12293 
12294 	if ( iLen )
12295 	{
12296 		if ( sBuf[iLen-1]=='*' )
12297 		{
12298 			sBuf[iLen-1] = '\0';
12299 		} else
12300 		{
12301 			sBuf[iLen] = MAGIC_WORD_TAIL;
12302 			sBuf[iLen+1] = '\0';
12303 		}
12304 	}
12305 
12306 	return m_pDict->GetWordID ( (BYTE*)sBuf );
12307 }
12308 
12309 
GetWordIDNonStemmed(BYTE * pWord)12310 SphWordID_t	CSphDictStar::GetWordIDNonStemmed ( BYTE * pWord )
12311 {
12312 	return m_pDict->GetWordIDNonStemmed ( pWord );
12313 }
12314 
12315 
12316 //////////////////////////////////////////////////////////////////////////
12317 
CSphDictStarV8(CSphDict * pDict,bool bPrefixes,bool bInfixes)12318 CSphDictStarV8::CSphDictStarV8 ( CSphDict * pDict, bool bPrefixes, bool bInfixes )
12319 	: CSphDictStar	( pDict )
12320 	, m_bPrefixes	( bPrefixes )
12321 	, m_bInfixes	( bInfixes )
12322 {
12323 }
12324 
12325 
GetWordID(BYTE * pWord)12326 SphWordID_t	CSphDictStarV8::GetWordID ( BYTE * pWord )
12327 {
12328 	char sBuf [ 16+3*SPH_MAX_WORD_LEN ];
12329 
12330 	int iLen = strlen ( (const char*)pWord );
12331 	iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 );
12332 
12333 	if ( !iLen )
12334 		return 0;
12335 
12336 	bool bHeadStar = ( pWord[0]=='*' );
12337 	bool bTailStar = ( pWord[iLen-1]=='*' ) && ( iLen>1 );
12338 
12339 	if ( !bHeadStar && !bTailStar )
12340 	{
12341 		m_pDict->ApplyStemmers ( pWord );
12342 		if ( IsStopWord ( pWord ) )
12343 			return 0;
12344 	}
12345 
12346 	iLen = strlen ( (const char*)pWord );
12347 	assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 2 );
12348 
12349 	if ( !iLen || ( bHeadStar && iLen==1 ) )
12350 		return 0;
12351 
12352 	if ( m_bInfixes )
12353 	{
12354 		////////////////////////////////////
12355 		// infix or mixed infix+prefix mode
12356 		////////////////////////////////////
12357 
12358 		// handle head star
12359 		if ( bHeadStar )
12360 		{
12361 			memcpy ( sBuf, pWord+1, iLen-- ); // chops star, copies trailing zero, updates iLen
12362 		} else
12363 		{
12364 			sBuf[0] = MAGIC_WORD_HEAD;
12365 			memcpy ( sBuf+1, pWord, ++iLen ); // copies everything incl trailing zero, updates iLen
12366 		}
12367 
12368 		// handle tail star
12369 		if ( bTailStar )
12370 		{
12371 			sBuf[--iLen] = '\0'; // got star, just chop it away
12372 		} else
12373 		{
12374 			sBuf[iLen] = MAGIC_WORD_TAIL; // no star, add tail marker
12375 			sBuf[++iLen] = '\0';
12376 		}
12377 
12378 	} else
12379 	{
12380 		////////////////////
12381 		// prefix-only mode
12382 		////////////////////
12383 
12384 		assert ( m_bPrefixes );
12385 
12386 		// always ignore head star in prefix mode
12387 		if ( bHeadStar )
12388 		{
12389 			pWord++;
12390 			iLen--;
12391 		}
12392 
12393 		// handle tail star
12394 		if ( !bTailStar )
12395 		{
12396 			// exact word search request, always (ie. both in infix/prefix mode) mangles to "\1word\1" in v.8+
12397 			sBuf[0] = MAGIC_WORD_HEAD;
12398 			memcpy ( sBuf+1, pWord, iLen );
12399 			sBuf[iLen+1] = MAGIC_WORD_TAIL;
12400 			sBuf[iLen+2] = '\0';
12401 			iLen += 2;
12402 
12403 		} else
12404 		{
12405 			// prefix search request, mangles to word itself (just chop away the star)
12406 			memcpy ( sBuf, pWord, iLen );
12407 			sBuf[--iLen] = '\0';
12408 		}
12409 	}
12410 
12411 	// calc id for mangled word
12412 	return m_pDict->GetWordID ( (BYTE*)sBuf, iLen, !bHeadStar && !bTailStar );
12413 }
12414 
12415 //////////////////////////////////////////////////////////////////////////
12416 
GetWordID(BYTE * pWord)12417 SphWordID_t CSphDictExact::GetWordID ( BYTE * pWord )
12418 {
12419 	int iLen = strlen ( (const char*)pWord );
12420 	iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 );
12421 
12422 	if ( !iLen )
12423 		return 0;
12424 
12425 	if ( pWord[0]=='=' )
12426 		pWord[0] = MAGIC_WORD_HEAD_NONSTEMMED;
12427 
12428 	if ( pWord[0]<' ' )
12429 		return m_pDict->GetWordIDNonStemmed ( pWord );
12430 
12431 	return m_pDict->GetWordID ( pWord );
12432 }
12433 
12434 
12435 /////////////////////////////////////////////////////////////////////////////
12436 
/// check whether iGroup is present in a sorted (ascending) array of values
///
/// @param iGroup	value to look for
/// @param pGroups	sorted array to search; NULL means "no restriction" and always matches
/// @param iGroups	number of entries in pGroups
/// @return true if pGroups is NULL or contains iGroup; false otherwise (including an empty array)
inline bool sphGroupMatch ( SphAttr_t iGroup, const SphAttr_t * pGroups, int iGroups )
{
	if ( !pGroups ) return true;

	// an empty list can not match anything; also keeps pLast below within the array
	if ( iGroups<=0 ) return false;

	const SphAttr_t * pFirst = pGroups;
	const SphAttr_t * pLast = pGroups+iGroups-1;
	if ( iGroup==*pFirst || iGroup==*pLast ) return true;
	if ( iGroup<(*pFirst) || iGroup>(*pLast) ) return false;

	// binary search; the value, if present, is strictly between pFirst and pLast
	while ( pLast-pFirst>1 )
	{
		const SphAttr_t * pMid = pFirst + ((pLast-pFirst)/2);
		if ( iGroup==(*pMid) )
			return true;
		if ( iGroup<(*pMid) )
			pLast = pMid;
		else
			pFirst = pMid;
	}
	return false;
}
12457 
12458 
EarlyReject(CSphQueryContext * pCtx,CSphMatch & tMatch) const12459 bool CSphIndex_VLN::EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const
12460 {
12461 	// might be needed even when we do not have a filter
12462 	if ( pCtx->m_bLookupFilter )
12463 		CopyDocinfo ( pCtx, tMatch, FindDocinfo ( tMatch.m_iDocID ) );
12464 	pCtx->CalcFilter ( tMatch );
12465 
12466 	return pCtx->m_pFilter ? !pCtx->m_pFilter->Eval ( tMatch ) : false;
12467 }
12468 
12469 
GetKillList() const12470 SphAttr_t * CSphIndex_VLN::GetKillList () const
12471 {
12472 	return m_pKillList.GetWritePtr ();
12473 }
12474 
12475 
HasDocid(SphDocID_t uDocid) const12476 bool CSphIndex_VLN::HasDocid ( SphDocID_t uDocid ) const
12477 {
12478 	return FindDocinfo ( uDocid )!=NULL;
12479 }
12480 
12481 
FindDocinfo(SphDocID_t uDocID) const12482 const DWORD * CSphIndex_VLN::FindDocinfo ( SphDocID_t uDocID ) const
12483 {
12484 	if ( m_uDocinfo<=0 )
12485 		return NULL;
12486 
12487 	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN );
12488 	assert ( !m_pDocinfo.IsEmpty() );
12489 	assert ( m_tSchema.GetAttrsCount() );
12490 
12491 	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
12492 	int iStart = 0;
12493 	int iEnd = m_uDocinfo-1;
12494 
12495 	if ( m_pDocinfoHash.GetLength() )
12496 	{
12497 		SphDocID_t uFirst = DOCINFO2ID ( &m_pDocinfo[0] );
12498 		SphDocID_t uLast = DOCINFO2ID ( &m_pDocinfo[( int64_t ( m_uDocinfo-1 ) )*iStride] );
12499 		if ( uDocID<uFirst || uDocID>uLast )
12500 			return NULL;
12501 
12502 		DWORD uHash = (DWORD)( ( uDocID - uFirst ) >> m_pDocinfoHash[0] );
12503 		if ( uHash > ( 1 << DOCINFO_HASH_BITS ) ) // possible in case of broken data, for instance
12504 			return NULL;
12505 
12506 		iStart = m_pDocinfoHash [ uHash+1 ];
12507 		iEnd = m_pDocinfoHash [ uHash+2 ] - 1;
12508 	}
12509 
12510 	const DWORD * pFound = NULL;
12511 	if ( uDocID==DOCINFO2ID ( &m_pDocinfo [ (int64_t(iStart))*iStride ] ) )
12512 	{
12513 		pFound = &m_pDocinfo [ (int64_t(iStart))*iStride ];
12514 
12515 	} else if ( uDocID==DOCINFO2ID ( &m_pDocinfo [ (int64_t(iEnd))*iStride ] ) )
12516 	{
12517 		pFound = &m_pDocinfo [ (int64_t(iEnd))*iStride ];
12518 
12519 	} else
12520 	{
12521 		while ( iEnd-iStart>1 )
12522 		{
12523 			// check if nothing found
12524 			if (
12525 				uDocID < DOCINFO2ID ( &m_pDocinfo [ (int64_t(iStart))*iStride ] ) ||
12526 				uDocID > DOCINFO2ID ( &m_pDocinfo [ (int64_t(iEnd))*iStride ] ) )
12527 					break;
12528 			assert ( uDocID > DOCINFO2ID ( &m_pDocinfo [ (int64_t(iStart))*iStride ] ) );
12529 			assert ( uDocID < DOCINFO2ID ( &m_pDocinfo [ (int64_t(iEnd))*iStride ] ) );
12530 
12531 			int iMid = iStart + (iEnd-iStart)/2;
12532 			if ( uDocID==DOCINFO2ID ( &m_pDocinfo [ (int64_t(iMid))*iStride ] ) )
12533 			{
12534 				pFound = &m_pDocinfo [ (int64_t(iMid))*iStride ];
12535 				break;
12536 			}
12537 			if ( uDocID<DOCINFO2ID ( &m_pDocinfo [ (int64_t(iMid))*iStride ] ) )
12538 				iEnd = iMid;
12539 			else
12540 				iStart = iMid;
12541 		}
12542 	}
12543 
12544 	return pFound;
12545 }
12546 
CopyDocinfo(CSphQueryContext * pCtx,CSphMatch & tMatch,const DWORD * pFound) const12547 void CSphIndex_VLN::CopyDocinfo ( CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const
12548 {
12549 	if ( !pFound )
12550 		return;
12551 
12552 	// setup static pointer
12553 	assert ( DOCINFO2ID(pFound)==tMatch.m_iDocID );
12554 	tMatch.m_pStatic = DOCINFO2ATTRS(pFound);
12555 
12556 	// patch if necessary
12557 	if ( pCtx->m_pOverrides )
12558 		ARRAY_FOREACH ( i, (*pCtx->m_pOverrides) )
12559 	{
12560 		const CSphAttrOverride & tOverride = (*pCtx->m_pOverrides)[i]; // shortcut
12561 		const CSphAttrOverride::IdValuePair_t * pEntry = tOverride.m_dValues.BinarySearch ( bind ( &CSphAttrOverride::IdValuePair_t::m_uDocID ), tMatch.m_iDocID );
12562 		tMatch.SetAttr ( pCtx->m_dOverrideOut[i], pEntry
12563 			? pEntry->m_uValue
12564 			: sphGetRowAttr ( tMatch.m_pStatic, pCtx->m_dOverrideIn[i] ) );
12565 	}
12566 }
12567 
12568 
CalcContextItems(CSphMatch & tMatch,const CSphVector<CSphQueryContext::CalcItem_t> & dItems)12569 static inline void CalcContextItems ( CSphMatch & tMatch, const CSphVector<CSphQueryContext::CalcItem_t> & dItems )
12570 {
12571 	ARRAY_FOREACH ( i, dItems )
12572 	{
12573 		const CSphQueryContext::CalcItem_t & tCalc = dItems[i];
12574 		if ( tCalc.m_eType==SPH_ATTR_INTEGER )
12575 			tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->IntEval(tMatch) );
12576 		else if ( tCalc.m_eType==SPH_ATTR_BIGINT )
12577 			tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->Int64Eval(tMatch) );
12578 		else
12579 			tMatch.SetAttrFloat ( tCalc.m_tLoc, tCalc.m_pExpr->Eval(tMatch) );
12580 	}
12581 }
12582 
12583 
CalcFilter(CSphMatch & tMatch) const12584 void CSphQueryContext::CalcFilter ( CSphMatch & tMatch ) const
12585 {
12586 	CalcContextItems ( tMatch, m_dCalcFilter );
12587 }
12588 
12589 
CalcSort(CSphMatch & tMatch) const12590 void CSphQueryContext::CalcSort ( CSphMatch & tMatch ) const
12591 {
12592 	CalcContextItems ( tMatch, m_dCalcSort );
12593 }
12594 
12595 
CalcFinal(CSphMatch & tMatch) const12596 void CSphQueryContext::CalcFinal ( CSphMatch & tMatch ) const
12597 {
12598 	CalcContextItems ( tMatch, m_dCalcFinal );
12599 }
12600 
12601 
SetStringPool(const BYTE * pStrings)12602 void CSphQueryContext::SetStringPool ( const BYTE * pStrings )
12603 {
12604 	ARRAY_FOREACH ( i, m_dCalcFilter )
12605 		m_dCalcFilter[i].m_pExpr->SetStringPool ( pStrings );
12606 
12607 	ARRAY_FOREACH ( i, m_dCalcSort )
12608 		m_dCalcSort[i].m_pExpr->SetStringPool ( pStrings );
12609 
12610 	ARRAY_FOREACH ( i, m_dCalcFinal )
12611 		m_dCalcFinal[i].m_pExpr->SetStringPool ( pStrings );
12612 }
12613 
12614 
SetMVAPool(const DWORD * pMva)12615 void CSphQueryContext::SetMVAPool ( const DWORD * pMva )
12616 {
12617 	ARRAY_FOREACH ( i, m_dCalcFilter )
12618 		m_dCalcFilter[i].m_pExpr->SetMVAPool ( pMva );
12619 
12620 	ARRAY_FOREACH ( i, m_dCalcSort )
12621 		m_dCalcSort[i].m_pExpr->SetMVAPool ( pMva );
12622 
12623 	ARRAY_FOREACH ( i, m_dCalcFinal )
12624 		m_dCalcFinal[i].m_pExpr->SetMVAPool ( pMva );
12625 
12626 	if ( m_pFilter )
12627 		m_pFilter->SetMVAStorage ( pMva );
12628 
12629 	if ( m_pWeightFilter )
12630 		m_pWeightFilter->SetMVAStorage ( pMva );
12631 }
12632 
12633 
MatchExtended(CSphQueryContext * pCtx,const CSphQuery * pQuery,int iSorters,ISphMatchSorter ** ppSorters,ISphRanker * pRanker,int iTag) const12634 bool CSphIndex_VLN::MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag ) const
12635 {
12636 	int iCutoff = pQuery->m_iCutoff;
12637 	if ( iCutoff<=0 )
12638 		iCutoff = -1;
12639 
12640 	// do searching
12641 	CSphMatch * pMatch = pRanker->GetMatchesBuffer();
12642 	for ( ;; )
12643 	{
12644 		int iMatches = pRanker->GetMatches();
12645 		if ( iMatches<=0 )
12646 			break;
12647 
12648 		for ( int i=0; i<iMatches; i++ )
12649 		{
12650 			if ( pCtx->m_bLookupSort )
12651 				CopyDocinfo ( pCtx, pMatch[i], FindDocinfo ( pMatch[i].m_iDocID ) );
12652 			pCtx->CalcSort ( pMatch[i] );
12653 
12654 			if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
12655 				continue;
12656 
12657 			pMatch[i].m_iTag = iTag;
12658 
12659 			bool bRand = false;
12660 			bool bNewMatch = false;
12661 			for ( int iSorter=0; iSorter<iSorters; iSorter++ )
12662 			{
12663 				// all non-random sorters are in the beginning,
12664 				// so we can avoid the simple 'first-element' assertion
12665 				if ( !bRand && ppSorters[iSorter]->m_bRandomize )
12666 				{
12667 					bRand = true;
12668 					pMatch[i].m_iWeight = ( sphRand() & 0xffff );
12669 
12670 					if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
12671 						break;
12672 				}
12673 				bNewMatch |= ppSorters[iSorter]->Push ( pMatch[i] );
12674 			}
12675 
12676 			if ( bNewMatch )
12677 				if ( --iCutoff==0 )
12678 					break;
12679 		}
12680 
12681 		if ( iCutoff==0 )
12682 			break;
12683 	}
12684 	return true;
12685 }
12686 
12687 //////////////////////////////////////////////////////////////////////////
12688 
MultiScan(const CSphQuery * pQuery,CSphQueryResult * pResult,int iSorters,ISphMatchSorter ** ppSorters,const CSphVector<CSphFilterSettings> * pExtraFilters,int iTag) const12689 bool CSphIndex_VLN::MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const
12690 {
12691 	assert ( pQuery->m_sQuery.IsEmpty() );
12692 	assert ( iTag>=0 );
12693 
12694 	// check if index is ready
12695 	if ( !m_pPreread || !*m_pPreread )
12696 	{
12697 		pResult->m_sError = "index not preread";
12698 		return false;
12699 	}
12700 
12701 	// check if index supports scans
12702 	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_tSchema.GetAttrsCount() )
12703 	{
12704 		pResult->m_sError = "fullscan requires extern docinfo";
12705 		return false;
12706 	}
12707 
12708 	// check if index has data
12709 	if ( m_bIsEmpty || m_uDocinfo<=0 || m_pDocinfo.IsEmpty() )
12710 		return true;
12711 
12712 	// start counting
12713 	int64_t tmQueryStart = sphMicroTimer();
12714 
12715 	// select the sorter with max schema
12716 	int iMaxSchemaSize = -1;
12717 	int iMaxSchemaIndex = -1;
12718 	for ( int i=0; i<iSorters; i++ )
12719 		if ( ppSorters[i]->GetSchema().GetRowSize() > iMaxSchemaSize )
12720 		{
12721 			iMaxSchemaSize = ppSorters[i]->GetSchema().GetRowSize();
12722 			iMaxSchemaIndex = i;
12723 		}
12724 
12725 	// setup calculations and result schema
12726 	CSphQueryContext tCtx;
12727 	if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, GetMVAPool() ) )
12728 		return false;
12729 
12730 	// set string pool for string on_sort expression fix up
12731 	tCtx.SetStringPool ( m_pStrings.GetWritePtr() );
12732 
12733 	// setup filters
12734 	if ( !tCtx.CreateFilters ( true, &pQuery->m_dFilters, pResult->m_tSchema, GetMVAPool(), pResult->m_sError ) )
12735 		return false;
12736 	if ( !tCtx.CreateFilters ( true, pExtraFilters, pResult->m_tSchema, GetMVAPool(), pResult->m_sError ) )
12737 		return false;
12738 
12739 	// check if we can early reject the whole index
12740 	if ( tCtx.m_pFilter && m_uDocinfoIndex )
12741 	{
12742 		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
12743 		DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ 2*m_uDocinfoIndex*uStride ] );
12744 		DWORD * pMaxEntry = pMinEntry + uStride;
12745 
12746 		if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
12747 		{
12748 			pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
12749 			return true;
12750 		}
12751 	}
12752 
12753 	// setup lookup
12754 	tCtx.m_bLookupFilter = false;
12755 	tCtx.m_bLookupSort = true;
12756 
12757 	// setup sorters vs. MVA
12758 	for ( int i=0; i<iSorters; i++ )
12759 	{
12760 		(ppSorters[i])->SetMVAPool ( m_pMva.GetWritePtr() );
12761 		(ppSorters[i])->SetStringPool ( m_pStrings.GetWritePtr() );
12762 	}
12763 
12764 	// setup overrides
12765 	if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema ) )
12766 		return false;
12767 
12768 	// prepare to work them rows
12769 	bool bRandomize = ppSorters[0]->m_bRandomize;
12770 
12771 	CSphMatch tMatch;
12772 	tMatch.Reset ( pResult->m_tSchema.GetDynamicSize() );
12773 	tMatch.m_iWeight = pQuery->GetIndexWeight ( m_sIndexName.cstr() );
12774 	tMatch.m_iTag = tCtx.m_dCalcFinal.GetLength() ? -1 : iTag;
12775 
12776 	// optimize direct lookups by id
12777 	// run full scan with block and row filtering for everything else
12778 	if ( pQuery->m_dFilters.GetLength()==1
12779 		&& pQuery->m_dFilters[0].m_eType==SPH_FILTER_VALUES
12780 		&& pQuery->m_dFilters[0].m_bExclude==false
12781 		&& pQuery->m_dFilters[0].m_sAttrName=="@id"
12782 		&& !pExtraFilters )
12783 	{
12784 		// run id lookups
12785 		for ( int i=0; i<pQuery->m_dFilters[0].GetNumValues(); i++ )
12786 		{
12787 			SphDocID_t uDocid = (SphDocID_t) pQuery->m_dFilters[0].GetValue(i);
12788 			const DWORD * pRow = FindDocinfo ( uDocid );
12789 			if ( !pRow )
12790 				continue;
12791 
12792 			assert ( uDocid==DOCINFO2ID(pRow) );
12793 			tMatch.m_iDocID = uDocid;
12794 			CopyDocinfo ( &tCtx, tMatch, pRow );
12795 
12796 			// submit match to sorters
12797 			tCtx.CalcSort ( tMatch );
12798 			if ( bRandomize )
12799 				tMatch.m_iWeight = ( sphRand() & 0xffff );
12800 
12801 			for ( int iSorter=0; iSorter<iSorters; iSorter++ )
12802 				ppSorters[iSorter]->Push ( tMatch );
12803 		}
12804 	} else
12805 	{
12806 		// do scan
12807 		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
12808 		DWORD uStart = pQuery->m_bReverseScan ? ( m_uDocinfoIndex-1 ) : 0;
12809 		int iStep = pQuery->m_bReverseScan ? -1 : 1;
12810 
12811 		int iCutoff = pQuery->m_iCutoff;
12812 		if ( iCutoff<=0 )
12813 			iCutoff = -1;
12814 
12815 		for ( DWORD uIndexEntry=uStart; uIndexEntry<m_uDocinfoIndex; uIndexEntry+=iStep )
12816 		{
12817 			// block-level filtering
12818 			const DWORD * pMin = &m_pDocinfoIndex[2*uIndexEntry*uStride];
12819 			const DWORD * pMax = pMin + uStride;
12820 
12821 			// check applicable filters
12822 			if ( tCtx.m_pFilter && !tCtx.m_pFilter->EvalBlock ( pMin, pMax ) )
12823 				continue;
12824 
12825 			// row-level filtering
12826 			const DWORD * pBlockStart = &m_pDocinfo [ ( int64_t ( uIndexEntry ) )*uStride*DOCINFO_INDEX_FREQ ];
12827 			const DWORD * pBlockEnd = &m_pDocinfo [ ( int64_t ( Min ( ( uIndexEntry+1 )*DOCINFO_INDEX_FREQ, m_uDocinfo ) - 1 ) )*uStride ];
12828 
12829 			for ( const DWORD * pDocinfo=pBlockStart; pDocinfo<=pBlockEnd; pDocinfo+=uStride )
12830 			{
12831 				tMatch.m_iDocID = DOCINFO2ID ( pDocinfo );
12832 				CopyDocinfo ( &tCtx, tMatch, pDocinfo );
12833 
12834 				// early filter only (no late filters in full-scan because of no @weight)
12835 				tCtx.CalcFilter ( tMatch );
12836 				if ( tCtx.m_pFilter && !tCtx.m_pFilter->Eval ( tMatch ) )
12837 					continue;
12838 
12839 				// submit match to sorters
12840 				tCtx.CalcSort ( tMatch );
12841 				if ( bRandomize )
12842 					tMatch.m_iWeight = ( sphRand() & 0xffff );
12843 
12844 				bool bNewMatch = false;
12845 				for ( int iSorter=0; iSorter<iSorters; iSorter++ )
12846 					bNewMatch |= ppSorters[iSorter]->Push ( tMatch );
12847 
12848 				// handle cutoff
12849 				if ( bNewMatch && --iCutoff==0 )
12850 				{
12851 					uIndexEntry = m_uDocinfoIndex; // outer break
12852 					break;
12853 				}
12854 			}
12855 		}
12856 	}
12857 
12858 	// do final expression calculations
12859 	if ( tCtx.m_dCalcFinal.GetLength() )
12860 		for ( int iSorter=0; iSorter<iSorters; iSorter++ )
12861 	{
12862 		ISphMatchSorter * pTop = ppSorters[iSorter];
12863 		CSphMatch * const pHead = pTop->Finalize();
12864 		const int iCount = pTop->GetLength ();
12865 		if ( !iCount )
12866 			continue;
12867 
12868 		CSphMatch * const pTail = pHead + iCount;
12869 		for ( CSphMatch * pCur=pHead; pCur<pTail; pCur++ )
12870 		{
12871 			if ( pCur->m_iTag<0 )
12872 			{
12873 				tCtx.CalcFinal ( *pCur );
12874 				pCur->m_iTag = iTag;
12875 			}
12876 		}
12877 	}
12878 
12879 	// done
12880 	pResult->m_pMva = m_pMva.GetWritePtr();
12881 	pResult->m_pStrings = m_pStrings.GetWritePtr();
12882 	pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
12883 	return true;
12884 }
12885 
12886 //////////////////////////////////////////////////////////////////////////////
12887 
/// Create a new query word object for the given keyword.
/// The concrete Qword class is selected by the WITH_QWORD macro from m_pIndex
/// (NOTE(review): the macro is defined elsewhere; presumably it dispatches on
/// index settings - confirm). The object is allocated with new and built from
/// the keyword's m_bExpanded and m_bExcluded flags.
/// Returns NULL if the macro falls through without returning.
ISphQword * DiskIndexQwordSetup_c::QwordSpawn ( const XQKeyword_t & tWord ) const
{
	WITH_QWORD ( m_pIndex, false, Qword, return new Qword ( tWord.m_bExpanded, tWord.m_bExcluded ) );
	return NULL;
}
12893 
12894 
/// Set up a previously spawned query word against this index.
/// Forwards to Setup<Qword>() with the Qword class picked by the WITH_QWORD
/// macro for m_pIndex (NOTE(review): macro is defined elsewhere - confirm how
/// the class is chosen).
/// Returns the result of Setup<Qword>(), or false if the macro falls through.
bool DiskIndexQwordSetup_c::QwordSetup ( ISphQword * pWord ) const
{
	WITH_QWORD ( m_pIndex, false, Qword, return Setup<Qword> ( pWord ) );
	return false;
}
12900 
12901 
12902 template < class Qword >
Setup(ISphQword * pWord) const12903 bool DiskIndexQwordSetup_c::Setup ( ISphQword * pWord ) const
12904 {
12905 	Qword * pMyWord = dynamic_cast<Qword*> ( pWord );
12906 
12907 	if ( !pMyWord )
12908 		return false;
12909 
12910 	Qword & tWord = *pMyWord;
12911 
12912 	// setup attrs
12913 	tWord.m_tDoc.Reset ( m_iDynamicRowitems );
12914 	tWord.m_iMinID = m_tMin.m_iDocID;
12915 	tWord.m_tDoc.m_iDocID = m_tMin.m_iDocID;
12916 
12917 	if ( m_eDocinfo==SPH_DOCINFO_INLINE )
12918 	{
12919 		tWord.m_iInlineAttrs = m_iInlineRowitems;
12920 		tWord.m_pInlineFixup = m_tMin.m_pDynamic;
12921 	} else
12922 	{
12923 		tWord.m_iInlineAttrs = 0;
12924 		tWord.m_pInlineFixup = NULL;
12925 	}
12926 
12927 	// setup stats
12928 	tWord.m_iDocs = 0;
12929 	tWord.m_iHits = 0;
12930 
12931 	CSphIndex_VLN * pIndex = (CSphIndex_VLN *)m_pIndex;
12932 
12933 	// binary search through checkpoints for a one whose range matches word ID
12934 	assert ( pIndex->m_pPreread && *pIndex->m_pPreread );
12935 	assert ( !pIndex->m_bPreloadWordlist || !pIndex->m_tWordlist.m_pBuf.IsEmpty() );
12936 
12937 	// empty index?
12938 	if ( !pIndex->m_tWordlist.m_dCheckpoints.GetLength() )
12939 		return false;
12940 
12941 	const char * sWord = tWord.m_sDictWord.cstr();
12942 	const bool bWordDict = pIndex->m_pDict->GetSettings().m_bWordDict;
12943 	int iWordLen = sWord ? strlen ( sWord ) : 0;
12944 	if ( pIndex->m_bEnableStar && bWordDict && tWord.m_sWord.Ends("*") )
12945 		iWordLen = Max ( iWordLen-1, 0 );
12946 
12947 	// leading special symbols trimming
12948 	if ( tWord.m_sDictWord.Begins("*") )
12949 	{
12950 		sWord++;
12951 		iWordLen = Max ( iWordLen-1, 0 );
12952 	}
12953 
12954 	const CSphWordlistCheckpoint * pCheckpoint = pIndex->m_tWordlist.FindCheckpoint ( sWord, iWordLen, tWord.m_iWordID, false );
12955 	if ( !pCheckpoint )
12956 		return false;
12957 
12958 	// decode wordlist chunk
12959 	const BYTE * pBuf = pIndex->m_tWordlist.AcquireDict ( pCheckpoint, m_tWordlist.GetFD(), m_pDictBuf );
12960 	assert ( pBuf );
12961 
12962 	WordDictInfo_t tResWord;
12963 	WordReaderContext_t tReaderCtx;
12964 
12965 	const bool bWordFound = bWordDict
12966 		? pIndex->m_tWordlist.GetWord ( pBuf, sWord, iWordLen, tResWord, false, tReaderCtx )!=NULL
12967 		: pIndex->m_tWordlist.GetWord ( pBuf, tWord.m_iWordID, tResWord );
12968 
12969 	if ( bWordFound )
12970 	{
12971 		const ESphHitless eMode = pIndex->m_tSettings.m_eHitless;
12972 		tWord.m_iDocs = eMode==SPH_HITLESS_SOME ? ( tResWord.m_iDocs & 0x7FFFFFFF ) : tResWord.m_iDocs;
12973 		tWord.m_iHits = tResWord.m_iHits;
12974 		tWord.m_bHasHitlist =
12975 			( eMode==SPH_HITLESS_NONE ) ||
12976 			( eMode==SPH_HITLESS_SOME && !( tResWord.m_iDocs & 0x80000000 ) );
12977 
12978 		if ( m_bSetupReaders )
12979 		{
12980 			tWord.m_rdDoclist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
12981 			tWord.m_rdDoclist.SetFile ( m_tDoclist );
12982 			tWord.m_rdDoclist.SeekTo ( tResWord.m_uOff, tResWord.m_iDoclistHint );
12983 
12984 			tWord.m_rdHitlist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
12985 			tWord.m_rdHitlist.SetFile ( m_tHitlist );
12986 		}
12987 	}
12988 
12989 	return bWordFound;
12990 }
12991 
12992 //////////////////////////////////////////////////////////////////////////////
12993 
Lock()12994 bool CSphIndex_VLN::Lock ()
12995 {
12996 	CSphString sName = GetIndexFileName("spl");
12997 	sphLogDebug ( "Locking the index via file %s", sName.cstr() );
12998 
12999 	if ( m_iLockFD<0 )
13000 	{
13001 		m_iLockFD = ::open ( sName.cstr(), SPH_O_NEW, 0644 );
13002 		if ( m_iLockFD<0 )
13003 		{
13004 			m_sLastError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
13005 			sphLogDebug ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
13006 			return false;
13007 		}
13008 	}
13009 
13010 	if ( !sphLockEx ( m_iLockFD, false ) )
13011 	{
13012 		m_sLastError.SetSprintf ( "failed to lock %s: %s", sName.cstr(), strerror(errno) );
13013 		::close ( m_iLockFD );
13014 		m_iLockFD = -1;
13015 		return false;
13016 	}
13017 	sphLogDebug ( "lock %s success", sName.cstr() );
13018 	return true;
13019 }
13020 
13021 
Unlock()13022 void CSphIndex_VLN::Unlock()
13023 {
13024 	CSphString sName = GetIndexFileName("spl");
13025 	sphLogDebug ( "Unlocking the index (lock %s)", sName.cstr() );
13026 	if ( m_iLockFD>=0 )
13027 	{
13028 		sphLogDebug ( "File ID ok, closing lock FD %d, unlinking %s", m_iLockFD, sName.cstr() );
13029 		::close ( m_iLockFD );
13030 		::unlink ( sName.cstr() );
13031 		m_iLockFD = -1;
13032 	}
13033 }
13034 
13035 
Mlock()13036 bool CSphIndex_VLN::Mlock ()
13037 {
13038 	bool bRes = true;
13039 	bRes &= m_pDocinfo.Mlock ( "docinfo", m_sLastError );
13040 
13041 	if ( m_bPreloadWordlist )
13042 		bRes &= m_tWordlist.m_pBuf.Mlock ( "wordlist", m_sLastError );
13043 
13044 	bRes &= m_pMva.Mlock ( "mva", m_sLastError );
13045 	bRes &= m_pStrings.Mlock ( "strings", m_sLastError );
13046 	return bRes;
13047 }
13048 
13049 
Dealloc()13050 void CSphIndex_VLN::Dealloc ()
13051 {
13052 	if ( !m_bPreallocated )
13053 		return;
13054 
13055 	m_tDoclistFile.Close ();
13056 	m_tHitlistFile.Close ();
13057 	m_pDocinfo.Reset ();
13058 	m_pDocinfoHash.Reset ();
13059 	m_pMva.Reset ();
13060 	m_pStrings.Reset ();
13061 	m_pKillList.Reset ();
13062 	m_tWordlist.Reset ();
13063 
13064 	m_uDocinfo = 0;
13065 	m_uMinMaxIndex = 0;
13066 	m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;
13067 
13068 	m_bPreallocated = false;
13069 	SafeDelete ( m_pTokenizer );
13070 	SafeDelete ( m_pDict );
13071 
13072 	if ( m_iIndexTag>=0 && g_pMvaArena )
13073 		g_MvaArena.TaggedFreeTag ( m_iIndexTag );
13074 	m_iIndexTag = -1;
13075 
13076 	m_pPreread = NULL;
13077 	m_pAttrsStatus = NULL;
13078 
13079 #ifndef NDEBUG
13080 	m_dShared.Reset ();
13081 #endif
13082 }
13083 
13084 
LoadIndexSettings(CSphIndexSettings & tSettings,CSphReader & tReader,DWORD uVersion)13085 void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion )
13086 {
13087 	if ( uVersion>=8 )
13088 	{
13089 		tSettings.m_iMinPrefixLen = tReader.GetDword ();
13090 		tSettings.m_iMinInfixLen = tReader.GetDword ();
13091 
13092 	} else if ( uVersion>=6 )
13093 	{
13094 		bool bPrefixesOnly = ( tReader.GetByte ()!=0 );
13095 		tSettings.m_iMinPrefixLen = tReader.GetDword ();
13096 		tSettings.m_iMinInfixLen = 0;
13097 		if ( !bPrefixesOnly )
13098 			Swap ( tSettings.m_iMinPrefixLen, tSettings.m_iMinInfixLen );
13099 	}
13100 
13101 	if ( uVersion>=9 )
13102 	{
13103 		tSettings.m_bHtmlStrip = !!tReader.GetByte ();
13104 		tSettings.m_sHtmlIndexAttrs = tReader.GetString ();
13105 		tSettings.m_sHtmlRemoveElements = tReader.GetString ();
13106 	}
13107 
13108 	if ( uVersion>=12 )
13109 		tSettings.m_bIndexExactWords = !!tReader.GetByte ();
13110 
13111 	if ( uVersion>=18 )
13112 		tSettings.m_eHitless = (ESphHitless)tReader.GetDword();
13113 
13114 	if ( uVersion>=19 )
13115 		tSettings.m_eHitFormat = (ESphHitFormat)tReader.GetDword();
13116 	else // force plain format for old indices
13117 		tSettings.m_eHitFormat = SPH_HIT_FORMAT_PLAIN;
13118 
13119 	if ( uVersion>=21 )
13120 		tSettings.m_bIndexSP = !!tReader.GetByte();
13121 
13122 	if ( uVersion>=22 )
13123 	{
13124 		tSettings.m_sZones = tReader.GetString();
13125 		if ( uVersion<25 && !tSettings.m_sZones.IsEmpty() )
13126 			tSettings.m_sZones.SetSprintf ( "%s*", tSettings.m_sZones.cstr() );
13127 	}
13128 
13129 	if ( uVersion>=23 )
13130 	{
13131 		tSettings.m_iBoundaryStep = (int)tReader.GetDword();
13132 		tSettings.m_iStopwordStep = (int)tReader.GetDword();
13133 	}
13134 }
13135 
13136 
LoadHeader(const char * sHeaderName,bool bStripPath,CSphString & sWarning)13137 bool CSphIndex_VLN::LoadHeader ( const char * sHeaderName, bool bStripPath, CSphString & sWarning )
13138 {
13139 	const int MAX_HEADER_SIZE = 32768;
13140 	CSphAutoArray<BYTE> dCacheInfo ( MAX_HEADER_SIZE );
13141 
13142 	CSphAutoreader rdInfo ( dCacheInfo, MAX_HEADER_SIZE ); // to avoid mallocs
13143 	if ( !rdInfo.Open ( sHeaderName, m_sLastError ) )
13144 		return false;
13145 
13146 	// version
13147 	DWORD uHeader = rdInfo.GetDword ();
13148 	if ( uHeader!=INDEX_MAGIC_HEADER )
13149 	{
13150 		m_sLastError.SetSprintf ( "%s is invalid header file (too old index version?)", sHeaderName );
13151 		return false;
13152 	}
13153 
13154 	m_uVersion = rdInfo.GetDword();
13155 	if ( m_uVersion==0 || m_uVersion>INDEX_FORMAT_VERSION )
13156 	{
13157 		m_sLastError.SetSprintf ( "%s is v.%d, binary is v.%d", sHeaderName, m_uVersion, INDEX_FORMAT_VERSION );
13158 		return false;
13159 	}
13160 
13161 	// bits
13162 	m_bUse64 = false;
13163 	if ( m_uVersion>=2 )
13164 		m_bUse64 = ( rdInfo.GetDword ()!=0 );
13165 
13166 	if ( m_bUse64!=USE_64BIT )
13167 	{
13168 #if USE_64BIT
13169 		// TODO: may be do this param conditional and push it into the config?
13170 		m_bId32to64 = true;
13171 #else
13172 		m_sLastError.SetSprintf ( "'%s' is id%d, and this binary is id%d",
13173 			GetIndexFileName("sph").cstr(),
13174 			m_bUse64 ? 64 : 32, USE_64BIT ? 64 : 32 );
13175 		return false;
13176 #endif
13177 	}
13178 
13179 	// docinfo
13180 	m_tSettings.m_eDocinfo = (ESphDocinfo) rdInfo.GetDword();
13181 
13182 	// schema
13183 	// 4th arg means that inline attributes need be dynamic in searching time too
13184 	ReadSchema ( rdInfo, m_tSchema, m_uVersion, m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE );
13185 
13186 	// check schema for dupes
13187 	for ( int iAttr=1; iAttr<m_tSchema.GetAttrsCount(); iAttr++ )
13188 	{
13189 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(iAttr);
13190 		for ( int i=0; i<iAttr; i++ )
13191 			if ( m_tSchema.GetAttr(i).m_sName==tCol.m_sName )
13192 				sWarning.SetSprintf ( "duplicate attribute name: %s", tCol.m_sName.cstr() );
13193 	}
13194 
13195 	// in case of *fork rotation we reuse min match from 1st rotated index ( it could be less than my size and inline ( m_pDynamic ) )
13196 	SafeDelete ( m_pMin );
13197 	m_pMin = new CSphMatch();
13198 
13199 	// min doc
13200 	m_pMin->Reset ( m_tSchema.GetRowSize() );
13201 	if ( m_uVersion>=2 )
13202 		m_pMin->m_iDocID = (SphDocID_t) rdInfo.GetOffset (); // v2+; losing high bits when !USE_64 is intentional, check is performed on bUse64 above
13203 	else
13204 		m_pMin->m_iDocID = rdInfo.GetDword(); // v1
13205 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13206 		rdInfo.GetBytes ( m_pMin->m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
13207 
13208 	// wordlist checkpoints
13209 	m_tWordlist.m_iCheckpointsPos = rdInfo.GetOffset();
13210 	m_tWordlist.m_dCheckpoints.Reset ( rdInfo.GetDword() );
13211 
13212 	// index stats
13213 	m_tStats.m_iTotalDocuments = rdInfo.GetDword ();
13214 	m_tStats.m_iTotalBytes = rdInfo.GetOffset ();
13215 
13216 	LoadIndexSettings ( m_tSettings, rdInfo, m_uVersion );
13217 	if ( m_uVersion<9 )
13218 		m_bStripperInited = false;
13219 
13220 	if ( m_uVersion>=9 )
13221 	{
13222 		// tokenizer stuff
13223 		CSphTokenizerSettings tSettings;
13224 		LoadTokenizerSettings ( rdInfo, tSettings, m_uVersion, sWarning );
13225 
13226 		if ( bStripPath )
13227 			StripPath ( tSettings.m_sSynonymsFile );
13228 
13229 		ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tSettings, m_sLastError );
13230 		if ( !pTokenizer )
13231 			return false;
13232 
13233 		// dictionary stuff
13234 		CSphDictSettings tDictSettings;
13235 		LoadDictionarySettings ( rdInfo, tDictSettings, m_uVersion, sWarning );
13236 		if ( m_bId32to64 )
13237 			tDictSettings.m_bCrc32 = true;
13238 
13239 		if ( bStripPath )
13240 		{
13241 			StripPath ( tDictSettings.m_sStopwords );
13242 			StripPath ( tDictSettings.m_sWordforms );
13243 		}
13244 
13245 		CSphDict * pDict = tDictSettings.m_bWordDict
13246 			? sphCreateDictionaryKeywords ( tDictSettings, pTokenizer, m_sLastError, m_sIndexName.cstr() )
13247 			: sphCreateDictionaryCRC ( tDictSettings, pTokenizer, m_sLastError, m_sIndexName.cstr() );
13248 
13249 		if ( !pDict )
13250 			return false;
13251 
13252 		SetDictionary ( pDict );
13253 
13254 		ISphTokenizer * pTokenFilter = ISphTokenizer::CreateTokenFilter ( pTokenizer, pDict->GetMultiWordforms () );
13255 		SetTokenizer ( pTokenFilter ? pTokenFilter : pTokenizer );
13256 	} else
13257 	{
13258 		if ( m_bId32to64 )
13259 		{
13260 			m_sLastError.SetSprintf ( "too old id32 index; can not be loaded by this id64 binary" );
13261 			return false;
13262 		}
13263 	}
13264 
13265 	if ( m_uVersion>=10 )
13266 		m_iKillListSize = rdInfo.GetDword ();
13267 
13268 	if ( m_uVersion>=20 )
13269 		m_uMinMaxIndex = rdInfo.GetDword ();
13270 
13271 	if ( rdInfo.GetErrorFlag() )
13272 		m_sLastError.SetSprintf ( "%s: failed to parse header (unexpected eof)", sHeaderName );
13273 
13274 	return !rdInfo.GetErrorFlag();
13275 }
13276 
13277 
DebugDumpHeader(FILE * fp,const char * sHeaderName,bool bConfig)13278 void CSphIndex_VLN::DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig )
13279 {
13280 	CSphString sWarning;
13281 	if ( !LoadHeader ( sHeaderName, false, sWarning ) )
13282 	{
13283 		fprintf ( fp, "FATAL: failed to load header: %s.\n", m_sLastError.cstr() );
13284 		return;
13285 	}
13286 
13287 	if ( !sWarning.IsEmpty () )
13288 		fprintf ( fp, "WARNING: %s\n", sWarning.cstr () );
13289 
13290 	///////////////////////////////////////////////
13291 	// print header in index config section format
13292 	///////////////////////////////////////////////
13293 
13294 	if ( bConfig )
13295 	{
13296 		fprintf ( fp, "\nsource $dump\n{\n" );
13297 
13298 		fprintf ( fp, "\tsql_query = SELECT id \\\n" );
13299 		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
13300 			fprintf ( fp, "\t, %s \\\n", m_tSchema.m_dFields[i].m_sName.cstr() );
13301 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
13302 		{
13303 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
13304 			fprintf ( fp, "\t, %s \\\n", tAttr.m_sName.cstr() );
13305 		}
13306 		fprintf ( fp, "\tFROM documents\n" );
13307 
13308 		if ( m_tSchema.GetAttrsCount() )
13309 			fprintf ( fp, "\n" );
13310 
13311 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
13312 		{
13313 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
13314 			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
13315 				fprintf ( fp, "\tsql_attr_multi = uint %s from field\n", tAttr.m_sName.cstr() );
13316 			else if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
13317 				fprintf ( fp, "\tsql_attr_multi = bigint %s from field\n", tAttr.m_sName.cstr() );
13318 			else if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.IsBitfield() )
13319 				fprintf ( fp, "\tsql_attr_uint = %s:%d\n", tAttr.m_sName.cstr(), tAttr.m_tLocator.m_iBitCount );
13320 			else
13321 				fprintf ( fp, "\t%s = %s\n", sphTypeDirective ( tAttr.m_eAttrType ), tAttr.m_sName.cstr() );
13322 		}
13323 
13324 		fprintf ( fp, "}\n\nindex $dump\n{\n\tsource = $dump\n\tpath = $dump\n" );
13325 
13326 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13327 			fprintf ( fp, "\tdocinfo = inline\n" );
13328 		if ( m_tSettings.m_iMinPrefixLen )
13329 			fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinPrefixLen );
13330 		if ( m_tSettings.m_iMinInfixLen )
13331 			fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinInfixLen );
13332 		if ( m_tSettings.m_bIndexExactWords )
13333 			fprintf ( fp, "\tindex_exact_words = %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
13334 		if ( m_tSettings.m_bHtmlStrip )
13335 			fprintf ( fp, "\thtml_strip = 1\n" );
13336 		if ( !m_tSettings.m_sHtmlIndexAttrs.IsEmpty() )
13337 			fprintf ( fp, "\thtml_index_attrs = %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
13338 		if ( !m_tSettings.m_sHtmlRemoveElements.IsEmpty() )
13339 			fprintf ( fp, "\thtml_remove_elements = %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
13340 		if ( m_tSettings.m_sZones.cstr() )
13341 			fprintf ( fp, "\tindex_zones = %s\n", m_tSettings.m_sZones.cstr() );
13342 
13343 		if ( m_pTokenizer )
13344 		{
13345 			const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
13346 			fprintf ( fp, "\tcharset_type = %s\n", tSettings.m_iType==TOKENIZER_SBCS ? "sbcs" : "utf-8" );
13347 			fprintf ( fp, "\tcharset_table = %s\n", tSettings.m_sCaseFolding.cstr () );
13348 			if ( tSettings.m_iMinWordLen>1 )
13349 				fprintf ( fp, "\tmin_word_len = %d\n", tSettings.m_iMinWordLen );
13350 			if ( tSettings.m_iNgramLen && !tSettings.m_sNgramChars.IsEmpty() )
13351 				fprintf ( fp, "\tngram_len = %d\nngram_chars = %s\n",
13352 					tSettings.m_iNgramLen, tSettings.m_sNgramChars.cstr () );
13353 			if ( !tSettings.m_sSynonymsFile.IsEmpty() )
13354 				fprintf ( fp, "\texceptions = %s\n", tSettings.m_sSynonymsFile.cstr () );
13355 			if ( !tSettings.m_sBoundary.IsEmpty() )
13356 				fprintf ( fp, "\tphrase_boundary = %s\n", tSettings.m_sBoundary.cstr () );
13357 			if ( !tSettings.m_sIgnoreChars.IsEmpty() )
13358 				fprintf ( fp, "\tignore_chars = %s\n", tSettings.m_sIgnoreChars.cstr () );
13359 			if ( !tSettings.m_sBlendChars.IsEmpty() )
13360 				fprintf ( fp, "\tblend_chars = %s\n", tSettings.m_sBlendChars.cstr () );
13361 			if ( !tSettings.m_sBlendMode.IsEmpty() )
13362 				fprintf ( fp, "\tblend_mode = %s\n", tSettings.m_sBlendMode.cstr () );
13363 		}
13364 
13365 		if ( m_pDict )
13366 		{
13367 			const CSphDictSettings & tSettings = m_pDict->GetSettings ();
13368 			if ( tSettings.m_bWordDict )
13369 				fprintf ( fp, "\tdict = keywords\n" );
13370 			if ( !tSettings.m_sMorphology.IsEmpty() )
13371 				fprintf ( fp, "\tmorphology = %s\n", tSettings.m_sMorphology.cstr () );
13372 			if ( !tSettings.m_sStopwords.IsEmpty() )
13373 				fprintf ( fp, "\tstopwords = %s\n", tSettings.m_sStopwords.cstr () );
13374 			if ( !tSettings.m_sWordforms.IsEmpty() )
13375 				fprintf ( fp, "\twordforms: %s\n", tSettings.m_sWordforms.cstr () );
13376 			if ( tSettings.m_iMinStemmingLen>1 )
13377 				fprintf ( fp, "\tmin_stemming_len = %d\n", tSettings.m_iMinStemmingLen );
13378 		}
13379 
13380 		fprintf ( fp, "}\n" );
13381 		return;
13382 	}
13383 
13384 	///////////////////////////////////////////////
13385 	// print header and stats in "readable" format
13386 	///////////////////////////////////////////////
13387 
13388 	fprintf ( fp, "version: %d\n",			m_uVersion );
13389 	fprintf ( fp, "idbits: %d\n",			m_bUse64 ? 64 : 32 );
13390 	fprintf ( fp, "docinfo: " );
13391 	switch ( m_tSettings.m_eDocinfo )
13392 	{
13393 		case SPH_DOCINFO_NONE:		fprintf ( fp, "none\n" ); break;
13394 		case SPH_DOCINFO_INLINE:	fprintf ( fp, "inline\n" ); break;
13395 		case SPH_DOCINFO_EXTERN:	fprintf ( fp, "extern\n" ); break;
13396 		default:					fprintf ( fp, "unknown (value=%d)\n", m_tSettings.m_eDocinfo ); break;
13397 	}
13398 
13399 	fprintf ( fp, "fields: %d\n", m_tSchema.m_dFields.GetLength() );
13400 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
13401 		fprintf ( fp, "  field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
13402 
13403 	fprintf ( fp, "attrs: %d\n", m_tSchema.GetAttrsCount() );
13404 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
13405 	{
13406 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
13407 		fprintf ( fp, "  attr %d: %s, %s", i, tAttr.m_sName.cstr(), sphTypeName ( tAttr.m_eAttrType ) );
13408 		if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.m_iBitCount!=32 )
13409 			fprintf ( fp, ", bits %d", tAttr.m_tLocator.m_iBitCount );
13410 		fprintf ( fp, ", bitoff %d\n", tAttr.m_tLocator.m_iBitOffset );
13411 	}
13412 
13413 	// skipped min doc, wordlist checkpoints
13414 	fprintf ( fp, "total-documents: "INT64_FMT"\n", m_tStats.m_iTotalDocuments );
13415 	fprintf ( fp, "total-bytes: "INT64_FMT"\n", int64_t(m_tStats.m_iTotalBytes) );
13416 
13417 	fprintf ( fp, "min-prefix-len: %d\n", m_tSettings.m_iMinPrefixLen );
13418 	fprintf ( fp, "min-infix-len: %d\n", m_tSettings.m_iMinInfixLen );
13419 	fprintf ( fp, "exact-words: %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
13420 	fprintf ( fp, "html-strip: %d\n", m_tSettings.m_bHtmlStrip ? 1 : 0 );
13421 	fprintf ( fp, "html-index-attrs: %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
13422 	fprintf ( fp, "html-remove-elements: %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
13423 	fprintf ( fp, "index-zones: %s\n", m_tSettings.m_sZones.cstr() );
13424 
13425 	if ( m_pTokenizer )
13426 	{
13427 		const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
13428 		fprintf ( fp, "tokenizer-type: %d\n", tSettings.m_iType );
13429 		fprintf ( fp, "tokenizer-case-folding: %s\n", tSettings.m_sCaseFolding.cstr () );
13430 		fprintf ( fp, "tokenizer-min-word-len: %d\n", tSettings.m_iMinWordLen );
13431 		fprintf ( fp, "tokenizer-ngram-chars: %s\n", tSettings.m_sNgramChars.cstr () );
13432 		fprintf ( fp, "tokenizer-ngram-len: %d\n", tSettings.m_iNgramLen );
13433 		fprintf ( fp, "tokenizer-exceptions: %s\n", tSettings.m_sSynonymsFile.cstr () );
13434 		fprintf ( fp, "tokenizer-phrase-boundary: %s\n", tSettings.m_sBoundary.cstr () );
13435 		fprintf ( fp, "tokenizer-ignore-chars: %s\n", tSettings.m_sIgnoreChars.cstr () );
13436 		fprintf ( fp, "tokenizer-blend-chars: %s\n", tSettings.m_sBlendChars.cstr () );
13437 		fprintf ( fp, "tokenizer-blend-mode: %s\n", tSettings.m_sBlendMode.cstr () );
13438 	}
13439 
13440 	if ( m_pDict )
13441 	{
13442 		const CSphDictSettings & tSettings = m_pDict->GetSettings ();
13443 		fprintf ( fp, "dictionary-morphology: %s\n", tSettings.m_sMorphology.cstr () );
13444 		fprintf ( fp, "dictionary-stopwords: %s\n", tSettings.m_sStopwords.cstr () );
13445 		fprintf ( fp, "dictionary-wordforms: %s\n", tSettings.m_sWordforms.cstr () );
13446 		fprintf ( fp, "min-stemming-len: %d\n", tSettings.m_iMinStemmingLen );
13447 	}
13448 
13449 	fprintf ( fp, "killlist-size: %d\n", m_iKillListSize );
13450 }
13451 
13452 
DebugDumpDocids(FILE * fp)13453 void CSphIndex_VLN::DebugDumpDocids ( FILE * fp )
13454 {
13455 	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
13456 	{
13457 		fprintf ( fp, "FATAL: docids dump only supported for docinfo=extern\n" );
13458 		return;
13459 	}
13460 
13461 	const int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
13462 
13463 	const DWORD uNumMinMaxRow = ( m_uVersion>=20 ) ? ( 2*(1+m_uDocinfoIndex)*iRowStride ) : 0;
13464 	const int64_t uNumRows = (m_pDocinfo.GetNumEntries()-uNumMinMaxRow) / iRowStride; // all 32bit, as we don't expect 2 billion documents per single physical index
13465 
13466 	const uint64_t uDocinfoSize = iRowStride*size_t(m_uDocinfo)*sizeof(DWORD);
13467 	const uint64_t uMinmaxSize = uNumMinMaxRow*sizeof(CSphRowitem);
13468 
13469 	fprintf ( fp, "docinfo-bytes: docinfo="UINT64_FMT", min-max="UINT64_FMT", total="UINT64_FMT"\n"
13470 		, uDocinfoSize, uMinmaxSize, (uint64_t)m_pDocinfo.GetLength() );
13471 	fprintf ( fp, "docinfo-stride: %d\n", (int)(iRowStride*sizeof(DWORD)) );
13472 	fprintf ( fp, "docinfo-rows: "INT64_FMT"\n", uNumRows );
13473 
13474 	if ( !m_pDocinfo.GetNumEntries() )
13475 		return;
13476 
13477 	DWORD * pDocinfo = m_pDocinfo.GetWritePtr();
13478 	for ( DWORD uRow=0; uRow<uNumRows; uRow++, pDocinfo+=iRowStride )
13479 		printf ( "%u. id=" DOCID_FMT "\n", uRow+1, DOCINFO2ID ( pDocinfo ) );
13480 	printf ( "--- min-max=%d ---\n", uNumMinMaxRow );
13481 	for ( DWORD uRow=0; uRow<2*(1+m_uDocinfoIndex); uRow++, pDocinfo+=iRowStride )
13482 		printf ( "id=" DOCID_FMT "\n", DOCINFO2ID ( pDocinfo ) );
13483 }
13484 
13485 
/// Dump the document and hit list of one keyword to fp.
/// Forwards to DumpHitlist<Qword>() with the Qword class picked by the
/// WITH_QWORD macro for this index (NOTE(review): macro is defined elsewhere -
/// confirm how the class is chosen).
/// @param fp		output stream
/// @param sKeyword	keyword text, or a decimal word ID when bID is set
/// @param bID		treat sKeyword as a numeric word ID instead of text
void CSphIndex_VLN::DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
{
	WITH_QWORD ( this, false, Qword, DumpHitlist<Qword> ( fp, sKeyword, bID ) );
}
13490 
13491 
13492 template < class Qword >
DumpHitlist(FILE * fp,const char * sKeyword,bool bID)13493 void CSphIndex_VLN::DumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
13494 {
13495 	// get keyword id
13496 	SphWordID_t uWordID = 0;
13497 	BYTE * sTok = NULL;
13498 	if ( !bID )
13499 	{
13500 		CSphString sBuf ( sKeyword );
13501 
13502 		m_pTokenizer->SetBuffer ( (BYTE*)sBuf.cstr(), strlen ( sBuf.cstr() ) );
13503 		sTok = m_pTokenizer->GetToken();
13504 
13505 		if ( !sTok )
13506 			sphDie ( "keyword=%s, no token (too short?)", sKeyword );
13507 
13508 		uWordID = m_pDict->GetWordID ( sTok );
13509 		if ( !uWordID )
13510 			sphDie ( "keyword=%s, tok=%s, no wordid (stopped?)", sKeyword, sTok );
13511 
13512 		fprintf ( fp, "keyword=%s, tok=%s, wordid="UINT64_FMT"\n", sKeyword, sTok, uint64_t(uWordID) );
13513 
13514 	} else
13515 	{
13516 		uWordID = (SphWordID_t) strtoull ( sKeyword, NULL, 10 );
13517 		if ( !uWordID )
13518 			sphDie ( "failed to convert keyword=%s to id (must be integer)", sKeyword );
13519 
13520 		fprintf ( fp, "wordid="UINT64_FMT"\n", uint64_t(uWordID) );
13521 	}
13522 
13523 	// open files
13524 	CSphAutofile tDoclist, tHitlist, tWordlist;
13525 	if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
13526 		sphDie ( "failed to open doclist: %s", m_sLastError.cstr() );
13527 
13528 	if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
13529 		sphDie ( "failed to open hitlist: %s", m_sLastError.cstr() );
13530 
13531 	if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, m_sLastError ) < 0 )
13532 		sphDie ( "failed to open wordlist: %s", m_sLastError.cstr() );
13533 
13534 	// aim
13535 	DiskIndexQwordSetup_c tTermSetup ( tDoclist, tHitlist, tWordlist, m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk );
13536 	tTermSetup.m_pDict = m_pDict;
13537 	tTermSetup.m_pIndex = this;
13538 	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
13539 	tTermSetup.m_tMin.Clone ( *m_pMin, m_tSchema.GetRowSize() );
13540 	tTermSetup.m_bSetupReaders = true;
13541 
13542 	Qword tKeyword ( false, false );
13543 	tKeyword.m_tDoc.m_iDocID = m_pMin->m_iDocID;
13544 	tKeyword.m_iWordID = uWordID;
13545 	tKeyword.m_sWord = sKeyword;
13546 	tKeyword.m_sDictWord = (const char *)sTok;
13547 	if ( !tTermSetup.QwordSetup ( &tKeyword ) )
13548 		sphDie ( "failed to setup keyword" );
13549 
13550 	int iSize = m_tSchema.GetRowSize();
13551 	CSphVector<CSphRowitem> dAttrs ( iSize );
13552 
13553 	// press play on tape
13554 	for ( ;; )
13555 	{
13556 		tKeyword.GetNextDoc ( iSize ? &dAttrs[0] : NULL );
13557 		if ( !tKeyword.m_tDoc.m_iDocID )
13558 			break;
13559 		tKeyword.SeekHitlist ( tKeyword.m_iHitlistPos );
13560 
13561 		int iHits = 0;
13562 		if ( tKeyword.m_bHasHitlist )
13563 			for ( Hitpos_t uHit = tKeyword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tKeyword.GetNextHit() )
13564 			{
13565 				fprintf ( fp, "doc="DOCID_FMT", hit=0x%08x\n", tKeyword.m_tDoc.m_iDocID, uHit ); // FIXME?
13566 				iHits++;
13567 			}
13568 
13569 		if ( !iHits )
13570 		{
13571 			uint64_t uOff = tKeyword.m_iHitlistPos;
13572 			fprintf ( fp, "doc="DOCID_FMT", NO HITS, inline=%d, off="UINT64_FMT"\n",
13573 				tKeyword.m_tDoc.m_iDocID, (int)(uOff>>63), (uOff<<1)>>1 );
13574 		}
13575 	}
13576 }
13577 
13578 
Prealloc(bool bMlock,bool bStripPath,CSphString & sWarning)13579 bool CSphIndex_VLN::Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning )
13580 {
13581 	MEMORY ( SPH_MEM_IDX_DISK );
13582 
13583 	// reset
13584 	Dealloc ();
13585 
13586 	// always keep shared variables flag
13587 	if ( m_dShared.IsEmpty() )
13588 	{
13589 		if ( !m_dShared.Alloc ( SPH_SHARED_VARS_COUNT, m_sLastError, sWarning ) )
13590 			return false;
13591 	}
13592 	memset ( m_dShared.GetWritePtr(), 0, m_dShared.GetLength() );
13593 	m_pPreread = m_dShared.GetWritePtr()+0;
13594 	m_pAttrsStatus = m_dShared.GetWritePtr()+1;
13595 
13596 	// set new locking flag
13597 	m_pDocinfo.SetMlock ( bMlock );
13598 	m_tWordlist.m_pBuf.SetMlock ( bMlock );
13599 	m_pMva.SetMlock ( bMlock );
13600 	m_pStrings.SetMlock ( bMlock );
13601 	m_pKillList.SetMlock ( bMlock );
13602 
13603 	// preload schema
13604 	if ( !LoadHeader ( GetIndexFileName("sph").cstr(), bStripPath, sWarning ) )
13605 		return false;
13606 
13607 	// verify that data files are readable
13608 	if ( !sphIsReadable ( GetIndexFileName("spd").cstr(), &m_sLastError ) )
13609 		return false;
13610 
13611 	if ( m_uVersion>=3 && !sphIsReadable ( GetIndexFileName("spp").cstr(), &m_sLastError ) )
13612 		return false;
13613 
13614 	/////////////////////
13615 	// prealloc wordlist
13616 	/////////////////////
13617 
13618 	// try to open wordlist file in all cases
13619 	CSphAutofile tWordlist ( GetIndexFileName("spi"), SPH_O_READ, m_sLastError );
13620 	if ( tWordlist.GetFD()<0 )
13621 		return false;
13622 
13623 	m_tWordlist.m_iSize = tWordlist.GetSize ( 1, true, m_sLastError );
13624 	if ( m_tWordlist.m_iSize<0 )
13625 		return false;
13626 
13627 	m_bIsEmpty = ( m_tWordlist.m_iSize<=1 );
13628 	if ( m_bIsEmpty!=( m_tWordlist.m_dCheckpoints.GetLength()==0 ) )
13629 		sphWarning ( "wordlist size mismatch (size="INT64_FMT", checkpoints=%d)", m_tWordlist.m_iSize, m_tWordlist.m_dCheckpoints.GetLength() );
13630 
13631 	// make sure checkpoints are loadable
13632 	// pre-11 indices use different offset type (this is fixed up later during the loading)
13633 	assert ( m_tWordlist.m_iCheckpointsPos>0 );
13634 
13635 	// prealloc wordlist only !!! no need to load checkpoints here to
13636 	if ( m_bPreloadWordlist )
13637 		if ( !m_tWordlist.m_pBuf.Alloc ( m_tWordlist.m_iCheckpointsPos, m_sLastError, sWarning ) )
13638 			return false;
13639 
13640 	// preopen
13641 	if ( m_bKeepFilesOpen )
13642 	{
13643 		if ( m_tDoclistFile.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
13644 			return false;
13645 
13646 		if ( m_tHitlistFile.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
13647 			return false;
13648 
13649 		if ( !m_bPreloadWordlist && m_tWordlist.m_tFile.Open ( GetIndexFileName("spi"), SPH_O_READ, m_sLastError ) < 0 )
13650 			return false;
13651 	}
13652 
13653 	/////////////////////
13654 	// prealloc docinfos
13655 	/////////////////////
13656 
13657 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_bIsEmpty )
13658 	{
13659 		/////////////
13660 		// attr data
13661 		/////////////
13662 
13663 		int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
13664 		int iStride2 = iStride-1; // id64 - 1 DWORD = id32
13665 		int iEntrySize = sizeof(DWORD)*iStride;
13666 
13667 		CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
13668 		if ( tDocinfo.GetFD()<0 )
13669 			return false;
13670 
13671 		int64_t iDocinfoSize = tDocinfo.GetSize ( iEntrySize, true, m_sLastError );
13672 		if ( iDocinfoSize<0 )
13673 			return false;
13674 
13675 		iDocinfoSize = iDocinfoSize / sizeof(DWORD);
13676 
13677 		// min-max index 32 bit overflow fix-up
13678 		if ( m_uMinMaxIndex && iDocinfoSize/sizeof(DWORD)>UINT_MAX )
13679 		{
13680 			int64_t uFixedMinMax = m_uMinMaxIndex + ( U64C(1)<<32 );
13681 			if ( uFixedMinMax<iDocinfoSize )
13682 			{
13683 				sphWarning ( "clamped min-max offset fixed (offset="INT64_FMT", fixed="UINT64_FMT")", m_uMinMaxIndex, uFixedMinMax );
13684 				m_uMinMaxIndex = uFixedMinMax;
13685 			} else
13686 			{
13687 				m_sLastError.SetSprintf ( "can't fix clamped min-max offset (offset="INT64_FMT", file size="UINT64_FMT")", m_uMinMaxIndex, iDocinfoSize );
13688 				return false;
13689 			}
13690 		}
13691 
13692 		int64_t iRealDocinfoSize = m_uMinMaxIndex ? m_uMinMaxIndex : iDocinfoSize;
13693 
13694 		// intentionally losing data; we don't support more than 4B documents per instance yet
13695 		m_uDocinfo = (DWORD)( iRealDocinfoSize / iStride );
13696 		if ( iRealDocinfoSize!=(int64_t)m_uDocinfo*iStride && !m_bId32to64 )
13697 		{
13698 			m_sLastError.SetSprintf ( "docinfo size check mismatch (4B document limit hit?)" );
13699 			return false;
13700 		}
13701 
13702 		if ( m_bId32to64 )
13703 		{
13704 			// check also the case of id32 here, and correct m_uDocinfo for it
13705 			m_uDocinfo = (DWORD)( iRealDocinfoSize / iStride2 );
13706 			if ( iRealDocinfoSize!=m_uDocinfo*iStride2 )
13707 			{
13708 				m_sLastError.SetSprintf ( "docinfo size check mismatch (4B document limit hit?)" );
13709 				return false;
13710 			}
13711 			m_uMinMaxIndex = m_uMinMaxIndex / iStride2 * iStride;
13712 		}
13713 
13714 
13715 		if ( m_uVersion < 20 )
13716 		{
13717 			if ( m_bId32to64 )
13718 				iDocinfoSize = iDocinfoSize / iStride2 * iStride;
13719 			m_uDocinfoIndex = (DWORD)( ( m_uDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ );
13720 
13721 			// prealloc docinfo
13722 			if ( !m_pDocinfo.Alloc ( iDocinfoSize + 2*(1+m_uDocinfoIndex)*iStride + ( m_bId32to64 ? m_uDocinfo : 0 ), m_sLastError, sWarning ) )
13723 				return false;
13724 
13725 			m_pDocinfoIndex = m_pDocinfo.GetWritePtr()+iDocinfoSize;
13726 		} else
13727 		{
13728 			if ( iDocinfoSize < iRealDocinfoSize )
13729 			{
13730 				m_sLastError.SetSprintf ( "precomputed chunk size check mismatch" );
13731 				sphLogDebug ( "precomputed chunk size check mismatch (size="INT64_FMT", real="INT64_FMT", min-max="INT64_FMT", count="INT64_FMT")",
13732 					iDocinfoSize, iRealDocinfoSize, m_uMinMaxIndex, int64_t ( m_uDocinfo ) );
13733 				return false;
13734 			}
13735 
13736 			m_uDocinfoIndex = (DWORD)( ( ( iDocinfoSize - iRealDocinfoSize ) / (m_bId32to64?iStride2:iStride) / 2 ) - 1 );
13737 
13738 			// prealloc docinfo
13739 			if ( !m_pDocinfo.Alloc ( iDocinfoSize + ( m_bId32to64 ? ( 2 + m_uDocinfo + 2*m_uDocinfoIndex ) : 0 ), m_sLastError, sWarning ) )
13740 				return false;
13741 
13742 #if PARANOID
13743 			DWORD uDocinfoIndex = ( m_uDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
13744 			assert ( uDocinfoIndex==m_uDocinfoIndex );
13745 #endif
13746 
13747 			m_pDocinfoIndex = m_pDocinfo.GetWritePtr()+m_uMinMaxIndex;
13748 		}
13749 
13750 		// prealloc docinfo hash but only if docinfo is big enough (in other words if hash is 8x+ less in size)
13751 		if ( m_pDocinfoHash.IsEmpty() && m_pDocinfo.GetLength() > ( 32 << DOCINFO_HASH_BITS ) && !g_bDebugCheck )
13752 			if ( !m_pDocinfoHash.Alloc ( ( 1 << DOCINFO_HASH_BITS )+4, m_sLastError, sWarning ) )
13753 				return false;
13754 
13755 		////////////
13756 		// MVA data
13757 		////////////
13758 
13759 		if ( m_uVersion>=4 )
13760 		{
13761 			// if index is v4, .spm must always exist, even though length could be 0
13762 			CSphAutofile fdMva ( GetIndexFileName("spm"), SPH_O_READ, m_sLastError );
13763 			if ( fdMva.GetFD()<0 )
13764 				return false;
13765 
13766 			SphOffset_t iMvaSize = fdMva.GetSize ( 0, true, m_sLastError );
13767 			if ( iMvaSize<0 )
13768 				return false;
13769 
13770 			// prealloc
13771 			if ( iMvaSize>0 )
13772 				if ( !m_pMva.Alloc ( DWORD(iMvaSize/sizeof(DWORD)), m_sLastError, sWarning ) )
13773 					return false;
13774 		}
13775 
13776 		///////////////
13777 		// string data
13778 		///////////////
13779 
13780 		if ( m_uVersion>=17 )
13781 		{
13782 			CSphAutofile fdStrings ( GetIndexFileName("sps"), SPH_O_READ, m_sLastError );
13783 			if ( fdStrings.GetFD()<0 )
13784 				return false;
13785 
13786 			SphOffset_t iStringsSize = fdStrings.GetSize ( 0, true, m_sLastError );
13787 			if ( iStringsSize<0 )
13788 				return false;
13789 
13790 			// prealloc
13791 			if ( iStringsSize>0 )
13792 				if ( !m_pStrings.Alloc ( DWORD(iStringsSize), m_sLastError, sWarning ) )
13793 					return false;
13794 		}
13795 	} else if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_bIsEmpty )
13796 		{
13797 			CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
13798 			if ( tDocinfo.GetFD()>0 )
13799 			{
13800 				SphOffset_t iDocinfoSize = tDocinfo.GetSize ( 0, false, m_sLastError );
13801 				if ( iDocinfoSize )
13802 					sphWarning ( "IsEmpty != attribute size ("INT64_FMT")", iDocinfoSize );
13803 			}
13804 	}
13805 
13806 
13807 	// prealloc killlist
13808 	if ( m_uVersion>=10 )
13809 	{
13810 		CSphAutofile fdKillList ( GetIndexFileName("spk"), SPH_O_READ, m_sLastError );
13811 		if ( fdKillList.GetFD()<0 )
13812 			return false;
13813 
13814 		SphOffset_t iSize = fdKillList.GetSize ( 0, true, m_sLastError );
13815 		if ( iSize<0 )
13816 			return false;
13817 
13818 		if ( iSize!=(SphOffset_t)( m_iKillListSize*sizeof(SphAttr_t) ) )
13819 		{
13820 			m_sLastError.SetSprintf ( "header k-list size does not match .spk size (klist=" INT64_FMT ", spk=" INT64_FMT ")",
13821 				(int64_t)( m_iKillListSize*sizeof(SphAttr_t) ),
13822 				(int64_t) iSize );
13823 			return false;
13824 		}
13825 
13826 		// prealloc
13827 		if ( iSize>0 && !m_pKillList.Alloc ( m_iKillListSize, m_sLastError, sWarning ) )
13828 			return false;
13829 	}
13830 
13831 	bool bWordDict = false;
13832 	if ( m_pDict )
13833 		bWordDict = m_pDict->GetSettings().m_bWordDict;
13834 
13835 	// preload checkpoints (must be done here as they are not shared)
13836 	if 	( !m_tWordlist.ReadCP ( tWordlist, m_uVersion, bWordDict, m_sLastError ) )
13837 	{
13838 		m_sLastError.SetSprintf ( "failed to read %s: %s", GetIndexFileName("spi").cstr(), m_sLastError.cstr () );
13839 		return false;
13840 	}
13841 
13842 	// all done
13843 	m_bPreallocated = true;
13844 	m_iIndexTag = ++m_iIndexTagSeq;
13845 	return true;
13846 }
13847 
13848 
PrereadSharedBuffer(CSphSharedBuffer<T> & pBuffer,const char * sExt,size_t uExpected,DWORD uOffset)13849 template < typename T > bool CSphIndex_VLN::PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer, const char * sExt, size_t uExpected, DWORD uOffset )
13850 {
13851 	if ( !pBuffer.GetLength() )
13852 		return true;
13853 
13854 	CSphAutofile fdBuf ( GetIndexFileName(sExt), SPH_O_READ, m_sLastError );
13855 	if ( fdBuf.GetFD()<0 )
13856 		return false;
13857 
13858 	fdBuf.SetProgressCallback ( m_pProgress, &m_tProgress );
13859 	if ( uExpected==0 )
13860 		uExpected = size_t ( pBuffer.GetLength() ) - uOffset*sizeof(T);
13861 	return fdBuf.Read ( pBuffer.GetWritePtr() + uOffset, uExpected, m_sLastError );
13862 }
13863 
13864 
Preread()13865 bool CSphIndex_VLN::Preread ()
13866 {
13867 	MEMORY ( SPH_MEM_IDX_DISK );
13868 
13869 	sphLogDebug ( "CSphIndex_VLN::Preread invoked" );
13870 	if ( !m_bPreallocated )
13871 	{
13872 		m_sLastError = "INTERNAL ERROR: not preallocated";
13873 		return false;
13874 	}
13875 	if ( !m_pPreread || *m_pPreread )
13876 	{
13877 		m_sLastError = "INTERNAL ERROR: already preread";
13878 		return false;
13879 	}
13880 
13881 	///////////////////
13882 	// read everything
13883 	///////////////////
13884 
13885 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PREREAD;
13886 	m_tProgress.m_iBytes = 0;
13887 	m_tProgress.m_iBytesTotal = m_pDocinfo.GetLength() + m_pMva.GetLength() + m_pStrings.GetLength() + m_pKillList.GetLength();
13888 	if ( m_bPreloadWordlist )
13889 		m_tProgress.m_iBytesTotal += m_tWordlist.m_pBuf.GetLength();
13890 
13891 	sphLogDebug ( "Prereading .spa" );
13892 	if ( !PrereadSharedBuffer ( m_pDocinfo, "spa",
13893 		( m_uVersion<20 )? m_uDocinfo * ( ( m_bId32to64 ? 1 : DOCINFO_IDSIZE ) + m_tSchema.GetRowSize() ) * sizeof(DWORD) : 0 , m_bId32to64 ? ( 2 + m_uDocinfo + 2 * m_uDocinfoIndex ) : 0 ) )
13894 		return false;
13895 
13896 	sphLogDebug ( "Prereading .spm" );
13897 	if ( !PrereadSharedBuffer ( m_pMva, "spm" ) )
13898 		return false;
13899 
13900 	sphLogDebug ( "Prereading .sps" );
13901 	if ( !PrereadSharedBuffer ( m_pStrings, "sps" ) )
13902 		return false;
13903 
13904 	sphLogDebug ( "Prereading .spk" );
13905 	if ( !PrereadSharedBuffer ( m_pKillList, "spk" ) )
13906 		return false;
13907 
13908 #if PARANOID
13909 	for ( int i = 1; i < (int)m_iKillListSize; i++ )
13910 		assert ( m_pKillList[i-1] < m_pKillList[i] );
13911 #endif
13912 
13913 	// preload wordlist
13914 	// FIXME! OPTIMIZE! can skip checkpoints
13915 	if ( m_bPreloadWordlist )
13916 	{
13917 		sphLogDebug ( "Prereading .spi" );
13918 		if ( !PrereadSharedBuffer ( m_tWordlist.m_pBuf, "spi" ) )
13919 
13920 			return false;
13921 	}
13922 
13923 	if ( m_pProgress )
13924 		m_pProgress ( &m_tProgress, true );
13925 
13926 	//////////////////////
13927 	// precalc everything
13928 	//////////////////////
13929 
13930 	// convert id32 to id64
13931 	if ( m_pDocinfo.GetLength() && m_bId32to64 )
13932 	{
13933 		DWORD *pTarget = m_pDocinfo.GetWritePtr();
13934 		DWORD *pSource = pTarget + 2 + m_uDocinfo + 2 * m_uDocinfoIndex;
13935 		int iStride = m_tSchema.GetRowSize();
13936 		SphDocID_t uDoc;
13937 		DWORD uLimit = m_uDocinfo + ( ( m_uVersion < 20 ) ? 0 : 2 + 2 * m_uDocinfoIndex );
13938 		for ( DWORD u=0; u<uLimit; u++ )
13939 		{
13940 			uDoc = *pSource; ///< wide id32 to id64
13941 			DOCINFOSETID ( pTarget, uDoc );
13942 			memcpy ( pTarget + DOCINFO_IDSIZE, pSource + 1, iStride * sizeof(DWORD) );
13943 			pSource += iStride+1;
13944 			pTarget += iStride+DOCINFO_IDSIZE;
13945 		}
13946 		sphWarning ( "id32 index loaded by id64 binary; attributes converted" );
13947 	}
13948 
13949 	// build attributes hash
13950 	if ( m_pDocinfo.GetLength() && m_pDocinfoHash.GetLength() && !g_bDebugCheck )
13951 	{
13952 		sphLogDebug ( "Hashing docinfo" );
13953 		int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
13954 		SphDocID_t uFirst = DOCINFO2ID ( &m_pDocinfo[0] );
13955 		SphDocID_t uRange = DOCINFO2ID ( &m_pDocinfo[( int64_t ( m_uDocinfo-1 ) )*iStride] ) - uFirst;
13956 		DWORD iShift = 0;
13957 		while ( uRange>=( 1 << DOCINFO_HASH_BITS ) )
13958 		{
13959 			iShift++;
13960 			uRange >>= 1;
13961 		}
13962 
13963 		DWORD * pHash = m_pDocinfoHash.GetWritePtr();
13964 		*pHash++ = iShift;
13965 		*pHash = 0;
13966 		DWORD uLastHash = 0;
13967 
13968 		for ( DWORD i=1; i<m_uDocinfo; i++ )
13969 		{
13970 			assert ( DOCINFO2ID ( &m_pDocinfo[( int64_t ( i ) )*iStride] )>uFirst
13971 				&& DOCINFO2ID ( &m_pDocinfo[( int64_t ( i-1 ) )*iStride] ) < DOCINFO2ID ( &m_pDocinfo[( int64_t ( i ) )*iStride] )
13972 				&& "descending document ID found" );
13973 			DWORD uHash = (DWORD)( ( DOCINFO2ID ( &m_pDocinfo[( int64_t ( i ) )*iStride] ) - uFirst ) >> iShift );
13974 			if ( uHash==uLastHash )
13975 				continue;
13976 
13977 			while ( uLastHash<uHash )
13978 				pHash [ ++uLastHash ] = i;
13979 
13980 			uLastHash = uHash;
13981 		}
13982 		pHash [ ++uLastHash ] = m_uDocinfo;
13983 	}
13984 
13985 	// persist MVA needs valid DocinfoHash
13986 	sphLogDebug ( "Prereading .mvp" );
13987 	if ( !LoadPersistentMVA ( m_sLastError ) )
13988 		return false;
13989 
13990 	// build "indexes" for full-scan
13991 	if ( m_uVersion < 20 && !PrecomputeMinMax() )
13992 		return false;
13993 
13994 	// paranoid MVA verification
13995 #if PARANOID
13996 	// find out what attrs are MVA
13997 	CSphVector<int> dMvaRowitem;
13998 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
13999 	{
14000 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
14001 		if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET )
14002 			dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
14003 	}
14004 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
14005 	{
14006 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
14007 		if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
14008 			dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
14009 	}
14010 
14011 	// for each docinfo entry, verify that MVA attrs point to right storage location
14012 	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
14013 	for ( DWORD iDoc=0; iDoc<m_uDocinfo && dMvaRowitem.GetLength(); iDoc++ )
14014 	{
14015 		CSphRowitem * pRow = m_pDocinfo.GetWritePtr() + ( iDoc*iStride );
14016 		CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
14017 		SphDocID_t uDocID = DOCINFO2ID(pRow);
14018 
14019 		DWORD uOff = pAttrs[ dMvaRowitem[0] ];
14020 		if ( !uOff )
14021 		{
14022 			// its either all or nothing
14023 			ARRAY_FOREACH ( i, dMvaRowitem )
14024 				assert ( pAttrs[ dMvaRowitem[i] ]==0 );
14025 		} else if ( !( uOff & MVA_ARENA_FLAG ) )
14026 		{
14027 			assert ( uDocID==DOCINFO2ID ( m_pMva.GetWritePtr() + uOff - DOCINFO_IDSIZE ) );
14028 
14029 			// walk the trail
14030 			ARRAY_FOREACH ( i, dMvaRowitem )
14031 			{
14032 				assert ( pAttrs[ dMvaRowitem[i] ]==uOff );
14033 				int iCount = m_pMva[uOff];
14034 				uOff += 1+iCount;
14035 			}
14036 		}
14037 	}
14038 #endif // PARANOID
14039 
14040 	*m_pPreread = 1;
14041 	sphLogDebug ( "Preread successfully finished" );
14042 	return true;
14043 }
14044 
14045 
/// replace the stored base path (m_sFilename) with sNewBase
/// only the member changes here; no files are touched
void CSphIndex_VLN::SetBase ( const char * sNewBase )
{
	m_sFilename = sNewBase;
}
14050 
14051 
Rename(const char * sNewBase)14052 bool CSphIndex_VLN::Rename ( const char * sNewBase )
14053 {
14054 	if ( m_sFilename==sNewBase )
14055 		return true;
14056 
14057 	// try to rename everything
14058 	char sFrom [ SPH_MAX_FILENAME_LEN ];
14059 	char sTo [ SPH_MAX_FILENAME_LEN ];
14060 
14061 	const int EXT_COUNT = 9;
14062 	const char * sExts[EXT_COUNT] = { "spa", "spd", "sph", "spi", "spl", "spm", "spp", "spk", "sps" };
14063 	DWORD uMask = 0;
14064 
14065 	int iExt;
14066 	for ( iExt=0; iExt<EXT_COUNT; iExt++ )
14067 	{
14068 		const char * sExt = sExts[iExt];
14069 		if ( !strcmp ( sExt, "spp" ) && m_uVersion<3 ) // .spp files are v3+
14070 			continue;
14071 		if ( !strcmp ( sExt, "spm" ) && m_uVersion<4 ) // .spm files are v4+
14072 			continue;
14073 		if ( !strcmp ( sExt, "spk" ) && m_uVersion<10 ) // .spk files are v10+
14074 			continue;
14075 		if ( !strcmp ( sExt, "sps" ) && m_uVersion<17 ) // .spk files are v17+
14076 			continue;
14077 
14078 #if !USE_WINDOWS
14079 		if ( !strcmp ( sExt, "spl" ) && m_iLockFD<0 ) // .spl files are locks
14080 			continue;
14081 #else
14082 		if ( !strcmp ( sExt, "spl" ) )
14083 		{
14084 			if ( m_iLockFD>=0 )
14085 			{
14086 				::close ( m_iLockFD );
14087 				::unlink ( GetIndexFileName("spl").cstr() );
14088 				sphLogDebug ( "lock %s unlinked, file with ID %d closed", GetIndexFileName("spl").cstr(), m_iLockFD );
14089 				m_iLockFD = -1;
14090 			}
14091 			continue;
14092 		}
14093 #endif
14094 
14095 		snprintf ( sFrom, sizeof(sFrom), "%s.%s", m_sFilename.cstr(), sExt );
14096 		snprintf ( sTo, sizeof(sTo), "%s.%s", sNewBase, sExt );
14097 
14098 #if USE_WINDOWS
14099 		::unlink ( sTo );
14100 		sphLogDebug ( "%s unlinked", sTo );
14101 #endif
14102 
14103 		if ( ::rename ( sFrom, sTo ) )
14104 		{
14105 			m_sLastError.SetSprintf ( "rename %s to %s failed: %s", sFrom, sTo, strerror(errno) );
14106 			// this is no reason to fail if spl is missing, since it is only lock and no data.
14107 			if ( strcmp ( sExt, "spl" ) )
14108 				break;
14109 		}
14110 		uMask |= ( 1UL << iExt );
14111 	}
14112 
14113 	// are we good?
14114 	if ( iExt==EXT_COUNT )
14115 	{
14116 		SetBase ( sNewBase );
14117 		sphLogDebug ( "Base set to %s", sNewBase );
14118 		return true;
14119 	}
14120 
14121 	// if there were errors, rollback
14122 	for ( iExt=0; iExt<EXT_COUNT; iExt++ )
14123 	{
14124 		if (!( uMask & ( 1UL << iExt ) ))
14125 			continue;
14126 
14127 		const char * sExt = sExts[iExt];
14128 		snprintf ( sFrom, sizeof(sFrom), "%s.%s", sNewBase, sExt );
14129 		snprintf ( sTo, sizeof(sTo), "%s.%s", m_sFilename.cstr(), sExt );
14130 		if ( ::rename ( sFrom, sTo ) )
14131 		{
14132 			sphLogDebug ( "Rollback failure when renaming %s to %s", sFrom, sTo );
14133 			// !COMMIT should handle rollback failures somehow
14134 		}
14135 	}
14136 	return false;
14137 }
14138 
14139 //////////////////////////////////////////////////////////////////////////
14140 
CSphQueryContext()14141 CSphQueryContext::CSphQueryContext ()
14142 {
14143 	m_iWeights = 0;
14144 	m_bLookupFilter = false;
14145 	m_bLookupSort = false;
14146 	m_pFilter = NULL;
14147 	m_pWeightFilter = NULL;
14148 	m_pIndexData = NULL;
14149 }
14150 
/// dispose of both filter chains that CreateFilters() stores in this context
CSphQueryContext::~CSphQueryContext ()
{
	SafeDelete ( m_pFilter ); // filters on everything but the weight
	SafeDelete ( m_pWeightFilter ); // filters on the weight column
}
14156 
BindWeights(const CSphQuery * pQuery,const CSphSchema & tSchema,int iIndexWeight)14157 void CSphQueryContext::BindWeights ( const CSphQuery * pQuery, const CSphSchema & tSchema, int iIndexWeight )
14158 {
14159 	const int MIN_WEIGHT = 1;
14160 	// const int HEAVY_FIELDS = 32;
14161 	const int HEAVY_FIELDS = SPH_MAX_FIELDS;
14162 
14163 	// defaults
14164 	m_iWeights = Min ( tSchema.m_dFields.GetLength(), HEAVY_FIELDS );
14165 	for ( int i=0; i<m_iWeights; i++ )
14166 		m_dWeights[i] = MIN_WEIGHT * iIndexWeight;
14167 
14168 	// name-bound weights
14169 	if ( pQuery->m_dFieldWeights.GetLength() )
14170 	{
14171 		ARRAY_FOREACH ( i, pQuery->m_dFieldWeights )
14172 		{
14173 			int j = tSchema.GetFieldIndex ( pQuery->m_dFieldWeights[i].m_sName.cstr() );
14174 			if ( j>=0 && j<HEAVY_FIELDS )
14175 				m_dWeights[j] = Max ( MIN_WEIGHT, pQuery->m_dFieldWeights[i].m_iValue ) * iIndexWeight;
14176 		}
14177 		return;
14178 	}
14179 
14180 	// order-bound weights
14181 	if ( pQuery->m_pWeights )
14182 	{
14183 		for ( int i=0; i<Min ( m_iWeights, pQuery->m_iWeights ); i++ )
14184 			m_dWeights[i] = Max ( MIN_WEIGHT, (int)pQuery->m_pWeights[i] ) * iIndexWeight;
14185 	}
14186 }
14187 
14188 
SetupCalc(CSphQueryResult * pResult,const CSphSchema & tInSchema,const CSphSchema & tSchema,const DWORD * pMvaPool)14189 bool CSphQueryContext::SetupCalc ( CSphQueryResult * pResult, const CSphSchema & tInSchema, const CSphSchema & tSchema, const DWORD * pMvaPool )
14190 {
14191 	m_dCalcFilter.Resize ( 0 );
14192 	m_dCalcSort.Resize ( 0 );
14193 	m_dCalcFinal.Resize ( 0 );
14194 
14195 	// quickly verify that all my real attributes can be stashed there
14196 	if ( tInSchema.GetAttrsCount() < tSchema.GetAttrsCount() )
14197 	{
14198 		pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (incount=%d, mycount=%d)", tInSchema.GetAttrsCount(), tSchema.GetAttrsCount() );
14199 		return false;
14200 	}
14201 
14202 	// now match everyone
14203 	for ( int iIn=0; iIn<tInSchema.GetAttrsCount(); iIn++ )
14204 	{
14205 		const CSphColumnInfo & tIn = tInSchema.GetAttr(iIn);
14206 		switch ( tIn.m_eStage )
14207 		{
14208 			case SPH_EVAL_STATIC:
14209 			case SPH_EVAL_OVERRIDE:
14210 			{
14211 				const CSphColumnInfo * pMy = tSchema.GetAttr ( tIn.m_sName.cstr() );
14212 				if ( !pMy )
14213 				{
14214 					pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema attr missing from index-schema (in=%s)",
14215 						sphDumpAttr(tIn).cstr() );
14216 					return false;
14217 				}
14218 
14219 				if ( tIn.m_eStage==SPH_EVAL_OVERRIDE )
14220 				{
14221 					// override; check for type/size match and dynamic part
14222 					if ( tIn.m_eAttrType!=pMy->m_eAttrType
14223 						|| tIn.m_tLocator.m_iBitCount!=pMy->m_tLocator.m_iBitCount
14224 						|| !tIn.m_tLocator.m_bDynamic )
14225 					{
14226 						pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema override mismatch (in=%s, my=%s)",
14227 							sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
14228 						return false;
14229 					}
14230 				} else
14231 				{
14232 					// static; check for full match
14233 					if (!( tIn==*pMy ))
14234 					{
14235 						pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (in=%s, my=%s)",
14236 							sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
14237 						return false;
14238 					}
14239 				}
14240 				break;
14241 			}
14242 
14243 			case SPH_EVAL_PREFILTER:
14244 			case SPH_EVAL_PRESORT:
14245 			case SPH_EVAL_FINAL:
14246 			{
14247 				ISphExpr * pExpr = tIn.m_pExpr.Ptr();
14248 				if ( !pExpr )
14249 					pExpr = sphSortSetupExpr ( tIn.m_sName, tSchema );
14250 				if ( !pExpr )
14251 				{
14252 					pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema expression missing evaluator (stage=%d, in=%s)",
14253 						(int)tIn.m_eStage, sphDumpAttr(tIn).cstr() );
14254 					return false;
14255 				}
14256 
14257 				// an expression that index/searcher should compute
14258 				CalcItem_t tCalc;
14259 				tCalc.m_eType = tIn.m_eAttrType;
14260 				tCalc.m_tLoc = tIn.m_tLocator;
14261 				tCalc.m_pExpr = pExpr;
14262 				tCalc.m_pExpr->SetMVAPool ( pMvaPool );
14263 
14264 				switch ( tIn.m_eStage )
14265 				{
14266 					case SPH_EVAL_PREFILTER:	m_dCalcFilter.Add ( tCalc ); break;
14267 					case SPH_EVAL_PRESORT:		m_dCalcSort.Add ( tCalc ); break;
14268 					case SPH_EVAL_FINAL:		m_dCalcFinal.Add ( tCalc ); break;
14269 					default:					break;
14270 				}
14271 				break;
14272 			}
14273 
14274 			case SPH_EVAL_SORTER:
14275 				// sorter tells it will compute itself; so just skip it
14276 				break;
14277 
14278 			default:
14279 				pResult->m_sError.SetSprintf ( "INTERNAL ERROR: unhandled eval stage=%d", (int)tIn.m_eStage );
14280 				return false;
14281 		}
14282 	}
14283 
14284 	// ok, we can emit matches in this schema (incoming for sorter, outgoing for index/searcher)
14285 	pResult->m_tSchema = tInSchema;
14286 	return true;
14287 }
14288 
14289 
SetupStarDict(CSphScopedPtr<CSphDict> & tContainer,CSphDict * pPrevDict,ISphTokenizer & tTokenizer) const14290 CSphDict * CSphIndex_VLN::SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict, ISphTokenizer & tTokenizer ) const
14291 {
14292 	// setup proper dict
14293 	bool bUseStarDict = false;
14294 	if (
14295 		( m_uVersion>=7 && ( m_tSettings.m_iMinPrefixLen>0 || m_tSettings.m_iMinInfixLen>0 ) && m_bEnableStar ) || // v.7 added mangling to infixes
14296 		( m_uVersion==6 && ( m_tSettings.m_iMinPrefixLen>0 ) && m_bEnableStar ) ) // v.6 added mangling to prefixes
14297 	{
14298 		bUseStarDict = true;
14299 	}
14300 
14301 	// no star? just return the original one
14302 	if ( !bUseStarDict )
14303 		return pPrevDict;
14304 
14305 	// spawn wrapper, and put it in the box
14306 	// wrapper type depends on version; v.8 introduced new mangling rules
14307 	if ( m_uVersion>=8 )
14308 		tContainer = new CSphDictStarV8 ( pPrevDict, m_tSettings.m_iMinPrefixLen>0, m_tSettings.m_iMinInfixLen>0 );
14309 	else
14310 		tContainer = new CSphDictStar ( pPrevDict );
14311 
14312 	CSphRemapRange tStar ( '*', '*', '*' ); // FIXME? check and warn if star was already there
14313 	tTokenizer.AddCaseFolding ( tStar );
14314 
14315 	return tContainer.Ptr();
14316 }
14317 
14318 
SetupExactDict(CSphScopedPtr<CSphDict> & tContainer,CSphDict * pPrevDict,ISphTokenizer & tTokenizer) const14319 CSphDict * CSphIndex_VLN::SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict, ISphTokenizer & tTokenizer ) const
14320 {
14321 	if ( m_uVersion<12 || !m_tSettings.m_bIndexExactWords )
14322 		return pPrevDict;
14323 
14324 	tContainer = new CSphDictExact ( pPrevDict );
14325 
14326 	CSphRemapRange tStar ( '=', '=', '=' ); // FIXME? check and warn if star was already there
14327 	tTokenizer.AddCaseFolding ( tStar );
14328 
14329 	return tContainer.Ptr();
14330 }
14331 
14332 
/// split the query text into keywords, optionally with per-keyword statistics
/// the actual work is done by DoGetKeywords<>; this is just a dispatcher
bool CSphIndex_VLN::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString & sError ) const
{
	// NOTE(review): WITH_QWORD presumably picks the Qword type that matches this index
	// and runs the statement given as its last argument; confirm against the macro
	WITH_QWORD ( this, false, Qword, return DoGetKeywords<Qword> ( dKeywords, szQuery, bGetStats, sError ) );
	return false; // NOTE(review): seemingly only reached if the macro did not return
}
14338 
14339 
14340 template < class Qword >
DoGetKeywords(CSphVector<CSphKeywordInfo> & dKeywords,const char * szQuery,bool bGetStats,CSphString & sError) const14341 bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString & sError ) const
14342 {
14343 	if ( !m_pPreread || !*m_pPreread )
14344 	{
14345 		sError = "index not preread";
14346 		return false;
14347 	}
14348 
14349 	CSphScopedPtr <CSphAutofile> pDoclist ( NULL );
14350 	CSphScopedPtr <CSphAutofile> pHitlist ( NULL );
14351 
14352 	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( false ) ); // avoid race
14353 	pTokenizer->EnableTokenizedMultiformTracking ();
14354 
14355 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
14356 	CSphDict * pDictBase = m_pDict;
14357 	if ( pDictBase->HasState() )
14358 	{
14359 		tDictCloned = pDictBase = pDictBase->Clone();
14360 	}
14361 
14362 	CSphScopedPtr<CSphDict> tDict ( NULL );
14363 	CSphDict * pDict = SetupStarDict ( tDict, pDictBase, *pTokenizer.Ptr() );
14364 
14365 	CSphScopedPtr<CSphDict> tDict2 ( NULL );
14366 	pDict = SetupExactDict ( tDict2, pDict, *pTokenizer.Ptr() );
14367 
14368 	// prepare for setup
14369 	CSphAutofile tDummy1, tDummy2, tDummy3, tWordlist;
14370 
14371 	if ( !m_bKeepFilesOpen )
14372 		if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, sError ) < 0 )
14373 			return false;
14374 
14375 	DiskIndexQwordSetup_c tTermSetup ( tDummy1, tDummy2
14376 		, m_bPreloadWordlist ? tDummy3 : ( m_bKeepFilesOpen ? m_tWordlist.m_tFile : tWordlist )
14377 		, m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk );
14378 	tTermSetup.m_pDict = pDict;
14379 	tTermSetup.m_pIndex = this;
14380 	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
14381 	dKeywords.Resize ( 0 );
14382 
14383 	Qword QueryWord ( false, false );
14384 	CSphString sTokenized;
14385 	BYTE * sWord;
14386 	int nWords = 0;
14387 
14388 	CSphString sQbuf ( szQuery );
14389 	pTokenizer->SetBuffer ( (BYTE*)sQbuf.cstr(), strlen(szQuery) );
14390 
14391 	while ( ( sWord = pTokenizer->GetToken() )!=NULL )
14392 	{
14393 		BYTE * sMultiform = pTokenizer->GetTokenizedMultiform();
14394 		if ( sMultiform )
14395 			sTokenized = (const char*)sMultiform;
14396 		else
14397 			sTokenized = (const char*)sWord;
14398 
14399 		SphWordID_t iWord = pDict->GetWordID ( sWord );
14400 		if ( iWord )
14401 		{
14402 			if ( bGetStats )
14403 			{
14404 				QueryWord.Reset ();
14405 				QueryWord.m_sWord = (const char*)sWord;
14406 				QueryWord.m_sDictWord = (const char*)sWord;
14407 				QueryWord.m_iWordID = iWord;
14408 				tTermSetup.QwordSetup ( &QueryWord );
14409 			}
14410 
14411 			CSphKeywordInfo & tInfo = dKeywords.Add();
14412 			Swap ( tInfo.m_sTokenized, sTokenized );
14413 			tInfo.m_sNormalized = (const char*)sWord;
14414 			tInfo.m_iDocs = bGetStats ? QueryWord.m_iDocs : 0;
14415 			tInfo.m_iHits = bGetStats ? QueryWord.m_iHits : 0;
14416 			++nWords;
14417 
14418 			if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
14419 				*(char *)tInfo.m_sNormalized.cstr() = '=';
14420 		}
14421 	}
14422 
14423 	return true;
14424 }
14425 
14426 // fix MSVC 2005 fuckup, template DoGetKeywords() just above somehow resets forScope
14427 #if USE_WINDOWS
14428 #pragma conform(forScope,on)
14429 #endif
14430 
14431 
IsWeightColumn(const CSphString & sAttr,const CSphSchema & tSchema)14432 static bool IsWeightColumn ( const CSphString & sAttr, const CSphSchema & tSchema )
14433 {
14434 	if ( sAttr=="@weight" )
14435 		return true;
14436 
14437 	const CSphColumnInfo * pCol = tSchema.GetAttr ( sAttr.cstr() );
14438 	return ( pCol && pCol->m_bWeight );
14439 }
14440 
14441 
CreateFilters(bool bFullscan,const CSphVector<CSphFilterSettings> * pdFilters,const CSphSchema & tSchema,const DWORD * pMvaPool,CSphString & sError)14442 bool CSphQueryContext::CreateFilters ( bool bFullscan, const CSphVector<CSphFilterSettings> * pdFilters, const CSphSchema & tSchema, const DWORD * pMvaPool, CSphString & sError )
14443 {
14444 	if ( !pdFilters )
14445 		return true;
14446 	ARRAY_FOREACH ( i, (*pdFilters) )
14447 	{
14448 		const CSphFilterSettings & tFilter = (*pdFilters)[i];
14449 		if ( tFilter.m_sAttrName.IsEmpty() )
14450 			continue;
14451 
14452 		bool bWeight = IsWeightColumn ( tFilter.m_sAttrName, tSchema );
14453 
14454 		if ( bFullscan && bWeight )
14455 			continue; // @weight is not avaiable in fullscan mode
14456 
14457 		ISphFilter * pFilter = sphCreateFilter ( tFilter, tSchema, pMvaPool, sError );
14458 		if ( !pFilter )
14459 			return false;
14460 
14461 		ISphFilter ** pGroup = bWeight ? &m_pWeightFilter : &m_pFilter;
14462 		*pGroup = sphJoinFilters ( *pGroup, pFilter );
14463 	}
14464 	return true;
14465 }
14466 
14467 
SetupOverrides(const CSphQuery * pQuery,CSphQueryResult * pResult,const CSphSchema & tIndexSchema)14468 bool CSphQueryContext::SetupOverrides ( const CSphQuery * pQuery, CSphQueryResult * pResult, const CSphSchema & tIndexSchema )
14469 {
14470 	m_pOverrides = NULL;
14471 	m_dOverrideIn.Resize ( pQuery->m_dOverrides.GetLength() );
14472 	m_dOverrideOut.Resize ( pQuery->m_dOverrides.GetLength() );
14473 
14474 	ARRAY_FOREACH ( i, pQuery->m_dOverrides )
14475 	{
14476 		const char * sAttr = pQuery->m_dOverrides[i].m_sAttr.cstr(); // shortcut
14477 		const CSphColumnInfo * pCol = tIndexSchema.GetAttr ( sAttr );
14478 		if ( !pCol )
14479 		{
14480 			pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s'", sAttr );
14481 			return false;
14482 		}
14483 
14484 		if ( pCol->m_eAttrType!=pQuery->m_dOverrides[i].m_eAttrType )
14485 		{
14486 			pResult->m_sError.SetSprintf ( "attribute override: attribute '%s' type mismatch (index=%d, query=%d)",
14487 				sAttr, pCol->m_eAttrType, pQuery->m_dOverrides[i].m_eAttrType );
14488 			return false;
14489 		}
14490 
14491 		const CSphColumnInfo * pOutCol = pResult->m_tSchema.GetAttr ( pQuery->m_dOverrides[i].m_sAttr.cstr() );
14492 		if ( !pOutCol )
14493 		{
14494 			pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s' in outgoing schema", sAttr );
14495 			return false;
14496 		}
14497 
14498 		m_dOverrideIn[i] = pCol->m_tLocator;
14499 		m_dOverrideOut[i] = pOutCol->m_tLocator;
14500 
14501 #ifndef NDEBUG
14502 		// check that the values are actually sorted
14503 		const CSphVector<CSphAttrOverride::IdValuePair_t> & dValues = pQuery->m_dOverrides[i].m_dValues;
14504 		for ( int j=1; j<dValues.GetLength(); j++ )
14505 			assert ( dValues[j-1] < dValues[j] );
14506 #endif
14507 	}
14508 
14509 	if ( pQuery->m_dOverrides.GetLength() )
14510 		m_pOverrides = &pQuery->m_dOverrides;
14511 	return true;
14512 }
14513 
sphQueryHeightCalc(const XQNode_t * pNode)14514 static int sphQueryHeightCalc ( const XQNode_t * pNode )
14515 {
14516 	if ( !pNode->m_dChildren.GetLength() )
14517 		return pNode->m_dWords.GetLength();
14518 
14519 	if ( pNode->GetOp()==SPH_QUERY_BEFORE )
14520 		return 1;
14521 
14522 	int iMaxChild = 0;
14523 	int iHeight = 0;
14524 	ARRAY_FOREACH ( i, pNode->m_dChildren )
14525 	{
14526 		int iBottom = sphQueryHeightCalc ( pNode->m_dChildren[i] );
14527 		int iTop = pNode->m_dChildren.GetLength()-i-1;
14528 		if ( iBottom+iTop>=iMaxChild+iHeight )
14529 		{
14530 			iMaxChild = iBottom;
14531 			iHeight = iTop;
14532 		}
14533 	}
14534 
14535 	return iMaxChild+iHeight;
14536 }
14537 
14538 #define SPH_EXTNODE_STACK_SIZE 160
14539 
sphCheckQueryHeight(const XQNode_t * pRoot,CSphString & sError)14540 bool sphCheckQueryHeight ( const XQNode_t * pRoot, CSphString & sError )
14541 {
14542 	int iHeight = 0;
14543 	if ( pRoot )
14544 		iHeight = sphQueryHeightCalc ( pRoot );
14545 
14546 	int64_t iQueryStack = sphGetStackUsed() + iHeight*SPH_EXTNODE_STACK_SIZE;
14547 	bool bValid = ( g_iThreadStackSize>=iQueryStack );
14548 	if ( !bValid )
14549 		sError.SetSprintf ( "query too complex, not enough stack (thread_stack_size=%dK or higher required)",
14550 			(int)( ( iQueryStack + 1024 - ( iQueryStack%1024 ) ) / 1024 ) );
14551 
14552 	return bValid;
14553 }
14554 
/// make a new node that carries the same spec and the same keywords as the given one
/// children are not copied; the caller owns the returned node
static XQNode_t * CloneKeyword ( const XQNode_t * pNode )
{
	assert ( pNode );

	XQNode_t * pClone = new XQNode_t ( pNode->m_dSpec );
	pClone->m_dWords = pNode->m_dWords;

	return pClone;
}
14563 
14564 
ExpandKeyword(XQNode_t * pNode,const CSphIndexSettings & tSettings)14565 static XQNode_t * ExpandKeyword ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
14566 {
14567 	assert ( pNode );
14568 
14569 	XQNode_t * pExpand = new XQNode_t ( pNode->m_dSpec );
14570 	pExpand->SetOp ( SPH_QUERY_OR, pNode );
14571 
14572 	if ( tSettings.m_iMinInfixLen>0 )
14573 	{
14574 		assert ( pNode->m_dChildren.GetLength()==0 );
14575 		assert ( pNode->m_dWords.GetLength()==1 );
14576 		XQNode_t * pInfix = CloneKeyword ( pNode );
14577 		pInfix->m_dWords[0].m_sWord.SetSprintf ( "*%s*", pNode->m_dWords[0].m_sWord.cstr() );
14578 		pInfix->m_dWords[0].m_uStarPosition = STAR_BOTH;
14579 		pExpand->m_dChildren.Add ( pInfix );
14580 	} else if ( tSettings.m_iMinPrefixLen>0 )
14581 	{
14582 		assert ( pNode->m_dChildren.GetLength()==0 );
14583 		assert ( pNode->m_dWords.GetLength()==1 );
14584 		XQNode_t * pPrefix = CloneKeyword ( pNode );
14585 		pPrefix->m_dWords[0].m_sWord.SetSprintf ( "%s*", pNode->m_dWords[0].m_sWord.cstr() );
14586 		pPrefix->m_dWords[0].m_uStarPosition = STAR_FRONT;
14587 		pExpand->m_dChildren.Add ( pPrefix );
14588 	}
14589 
14590 	if ( tSettings.m_bIndexExactWords )
14591 	{
14592 		assert ( pNode->m_dChildren.GetLength()==0 );
14593 		assert ( pNode->m_dWords.GetLength()==1 );
14594 		XQNode_t * pExact = CloneKeyword ( pNode );
14595 		pExact->m_dWords[0].m_sWord.SetSprintf ( "=%s", pNode->m_dWords[0].m_sWord.cstr() );
14596 		pExpand->m_dChildren.Add ( pExact );
14597 	}
14598 
14599 	return pExpand;
14600 }
14601 
ExpandKeywords(XQNode_t * pNode,const CSphIndexSettings & tSettings)14602 static XQNode_t * ExpandKeywords ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
14603 {
14604 	// only if expansion makes sense at all
14605 	if ( tSettings.m_iMinInfixLen<=0 && tSettings.m_iMinPrefixLen<=0 && !tSettings.m_bIndexExactWords )
14606 		return pNode;
14607 
14608 	// process children for composite nodes
14609 	if ( pNode->m_dChildren.GetLength() )
14610 	{
14611 		ARRAY_FOREACH ( i, pNode->m_dChildren )
14612 			pNode->m_dChildren[i] = ExpandKeywords ( pNode->m_dChildren[i], tSettings );
14613 		return pNode;
14614 	}
14615 
14616 	// if that's a phrase/proximity node, create a very special, magic phrase/proximity node
14617 	if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
14618 	{
14619 		assert ( pNode->m_dWords.GetLength()>1 );
14620 		ARRAY_FOREACH ( i, pNode->m_dWords )
14621 		{
14622 			XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
14623 			pWord->m_dWords.Add ( pNode->m_dWords[i] );
14624 			pNode->m_dChildren.Add ( ExpandKeyword ( pWord, tSettings ) );
14625 			pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
14626 		}
14627 		pNode->m_dWords.Reset();
14628 		pNode->m_bVirtuallyPlain = true;
14629 		return pNode;
14630 	}
14631 
14632 	// skip empty plain nodes
14633 	if ( pNode->m_dWords.GetLength()<=0 )
14634 		return pNode;
14635 
14636 	// process keywords for plain nodes
14637 	assert ( pNode->m_dWords.GetLength()==1 );
14638 
14639 	XQKeyword_t & tKeyword = pNode->m_dWords[0];
14640 	if ( tKeyword.m_uStarPosition!=STAR_NONE
14641 		|| tKeyword.m_sWord.Begins("=")
14642 		|| tKeyword.m_sWord.Begins("*")
14643 		|| tKeyword.m_sWord.Ends("*") )
14644 	{
14645 		return pNode;
14646 	}
14647 
14648 	// do the expansion
14649 	return ExpandKeyword ( pNode, tSettings );
14650 }
14651 
14652 // transform the "one two three"/1 quorum into one|two|three (~40% faster)
TransformQuorum(XQNode_t ** ppNode)14653 static void TransformQuorum ( XQNode_t ** ppNode )
14654 {
14655 	XQNode_t *& pNode = *ppNode;
14656 	if ( pNode->GetOp()!=SPH_QUERY_QUORUM || pNode->m_iOpArg!=1 )
14657 		return;
14658 
14659 	assert ( pNode->m_dChildren.GetLength()==0 );
14660 	CSphVector<XQNode_t*> dArgs;
14661 	ARRAY_FOREACH ( i, pNode->m_dWords )
14662 	{
14663 		XQNode_t * pAnd = new XQNode_t ( pNode->m_dSpec );
14664 		pAnd->m_dWords.Add ( pNode->m_dWords[i] );
14665 		dArgs.Add ( pAnd );
14666 	}
14667 	pNode->m_dWords.Reset();
14668 	pNode->SetOp ( SPH_QUERY_OR, dArgs );
14669 }
14670 
14671 
14672 struct BinaryNode_t
14673 {
14674 	int m_iLo;
14675 	int m_iHi;
14676 };
14677 
BuildExpandedTree(const XQKeyword_t & tRootWord,CSphVector<CSphNamedInt> & dWordSrc,XQNode_t * pRoot)14678 static void BuildExpandedTree ( const XQKeyword_t & tRootWord, CSphVector<CSphNamedInt> & dWordSrc, XQNode_t * pRoot )
14679 {
14680 	assert ( dWordSrc.GetLength() );
14681 	pRoot->m_dWords.Reset();
14682 
14683 	CSphVector<BinaryNode_t> dNodes;
14684 	dNodes.Reserve ( dWordSrc.GetLength() );
14685 
14686 	XQNode_t * pCur = pRoot;
14687 
14688 	dNodes.Add();
14689 	dNodes.Last().m_iLo = 0;
14690 	dNodes.Last().m_iHi = ( dWordSrc.GetLength()-1 );
14691 
14692 	while ( dNodes.GetLength() )
14693 	{
14694 		BinaryNode_t tNode = dNodes.Pop();
14695 		if ( tNode.m_iHi<tNode.m_iLo )
14696 		{
14697 			pCur = pCur->m_pParent;
14698 			continue;
14699 		}
14700 
14701 		int iMid = ( tNode.m_iLo+tNode.m_iHi ) / 2;
14702 		dNodes.Add ();
14703 		dNodes.Last().m_iLo = tNode.m_iLo;
14704 		dNodes.Last().m_iHi = iMid-1;
14705 		dNodes.Add ();
14706 		dNodes.Last().m_iLo = iMid+1;
14707 		dNodes.Last().m_iHi = tNode.m_iHi;
14708 
14709 		if ( pCur->m_dWords.GetLength() )
14710 		{
14711 			assert ( pCur->m_dWords.GetLength()==1 );
14712 			XQNode_t * pTerm = CloneKeyword ( pRoot );
14713 			Swap ( pTerm->m_dWords, pCur->m_dWords );
14714 			pCur->m_dChildren.Add ( pTerm );
14715 		}
14716 
14717 		XQNode_t * pChild = CloneKeyword ( pRoot );
14718 		pChild->m_dWords.Add ( tRootWord );
14719 		pChild->m_dWords.Last().m_sWord.Swap ( dWordSrc[iMid].m_sName );
14720 		pChild->m_dWords.Last().m_bExpanded = true;
14721 		pChild->m_bNotWeighted = ( dWordSrc[iMid].m_iValue==0 );
14722 
14723 		pChild->m_pParent = pCur;
14724 		pCur->m_dChildren.Add ( pChild );
14725 		pCur->SetOp ( SPH_QUERY_OR );
14726 
14727 		pCur = pChild;
14728 	}
14729 }
14730 
Swap(CSphNamedInt & a,CSphNamedInt & b)14731 void Swap ( CSphNamedInt & a, CSphNamedInt & b )
14732 {
14733 	a.m_sName.Swap ( b.m_sName );
14734 	Swap ( a.m_iValue, b.m_iValue );
14735 }
14736 
14737 struct WordDocsGreaterOp_t
14738 {
IsLessWordDocsGreaterOp_t14739 	inline bool IsLess ( const CSphNamedInt & a, const CSphNamedInt & b )
14740 	{
14741 		return a.m_iValue > b.m_iValue;
14742 	}
14743 };
14744 
14745 
sphExpandXQNode(XQNode_t * pNode,ExpansionContext_t & tCtx)14746 XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx )
14747 {
14748 	assert ( pNode );
14749 	assert ( tCtx.m_pResult );
14750 
14751 	// process children for composite nodes
14752 	if ( pNode->m_dChildren.GetLength() )
14753 	{
14754 		ARRAY_FOREACH ( i, pNode->m_dChildren )
14755 		{
14756 			pNode->m_dChildren[i] = sphExpandXQNode ( pNode->m_dChildren[i], tCtx );
14757 		}
14758 		return pNode;
14759 	}
14760 
14761 	// if that's a phrase/proximity node, create a very special, magic phrase/proximity node
14762 	if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
14763 	{
14764 		assert ( pNode->m_dWords.GetLength()>1 );
14765 		ARRAY_FOREACH ( i, pNode->m_dWords )
14766 		{
14767 			XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
14768 			pWord->m_dWords.Add ( pNode->m_dWords[i] );
14769 			pNode->m_dChildren.Add ( sphExpandXQNode ( pWord, tCtx ) );
14770 			pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
14771 
14772 			// tricky part
14773 			// current node may have field/zone limits attached
14774 			// normally those get pushed down during query parsing
14775 			// but here we create nodes manually and have to push down limits too
14776 			pWord->CopySpecs ( pNode );
14777 		}
14778 		pNode->m_dWords.Reset();
14779 		pNode->m_bVirtuallyPlain = true;
14780 		return pNode;
14781 	}
14782 
14783 	// skip empty plain nodes
14784 	if ( pNode->m_dWords.GetLength()<=0 )
14785 		return pNode;
14786 
14787 	// process keywords for plain nodes
14788 	assert ( pNode->m_dChildren.GetLength()==0 );
14789 	assert ( pNode->m_dWords.GetLength()==1 );
14790 
14791 	if ( ( !tCtx.m_bStarEnabled || !pNode->m_dWords[0].m_sWord.Ends("*") ) )
14792 		return pNode;
14793 
14794 	const CSphString & sFullWord = pNode->m_dWords[0].m_sWord;
14795 	const char * sAdjustedWord = sFullWord.cstr();
14796 	int iWordLen = sFullWord.Length();
14797 	if ( tCtx.m_bStarEnabled )
14798 		iWordLen = Max ( iWordLen-1, 0 );
14799 
14800 	// leading special symbols trimming
14801 	if ( sFullWord.Begins("=") || sFullWord.Begins("*") )
14802 	{
14803 		sAdjustedWord++;
14804 		iWordLen = Max ( iWordLen-1, 0 );
14805 	}
14806 
14807 	// we refuse to search query less then min-prefix-len
14808 	if ( iWordLen<tCtx.m_iMinPrefixLen )
14809 		return pNode;
14810 
14811 	// prefix expansion looking only into non stemmed words
14812 	CSphString sFixed;
14813 	if ( tCtx.m_bHasMorphology )
14814 	{
14815 		sFixed = pNode->m_dWords[0].m_sWord.SubString ( sAdjustedWord-sFullWord.cstr(), iWordLen );
14816 		sFixed.SetSprintf ( "%c%s", MAGIC_WORD_HEAD_NONSTEMMED, sFixed.cstr() );
14817 		sAdjustedWord = sFixed.cstr();
14818 		iWordLen++;
14819 	}
14820 
14821 	CSphVector<CSphNamedInt> dPrefixedWords;
14822 	tCtx.m_pWordlist->GetPrefixedWords ( sAdjustedWord, iWordLen, dPrefixedWords, tCtx.m_pBuf, tCtx.m_iFD );
14823 
14824 	if ( !dPrefixedWords.GetLength() )
14825 	{
14826 		// mark source word as expanded to prevent warning on terms mismatch in statistics
14827 		pNode->m_dWords.Begin()->m_bExpanded = true;
14828 		return pNode;
14829 	}
14830 
14831 	// sort word's to leftmost max documents, rightmost least documents
14832 	dPrefixedWords.Sort ( WordDocsGreaterOp_t() );
14833 
14834 	// clip words with the lowest doc frequency as rare words are misspelling
14835 	if ( tCtx.m_iExpansionLimit && tCtx.m_iExpansionLimit<dPrefixedWords.GetLength() )
14836 	{
14837 		dPrefixedWords.Resize ( tCtx.m_iExpansionLimit );
14838 	}
14839 
14840 	// mark new words as expanded to skip theirs check on merge ( expanded words differs across different indexes )
14841 	ARRAY_FOREACH ( i, dPrefixedWords )
14842 	{
14843 		tCtx.m_pResult->AddStat ( dPrefixedWords[i].m_sName, 0, 0, true );
14844 	}
14845 
14846 	// replace MAGIC_WORD_HEAD_NONSTEMMED symbol to '='
14847 	if ( tCtx.m_bHasMorphology )
14848 	{
14849 		ARRAY_FOREACH ( i, dPrefixedWords )
14850 		{
14851 			( (char *)dPrefixedWords[i].m_sName.cstr() )[0] = '=';
14852 		}
14853 	}
14854 
14855 	const XQKeyword_t tPrefixingWord = pNode->m_dWords[0];
14856 	BuildExpandedTree ( tPrefixingWord, dPrefixedWords, pNode );
14857 
14858 	return pNode;
14859 }
14860 
ExpandPrefix(XQNode_t * pNode,CSphString & sError,CSphQueryResultMeta * pResult) const14861 XQNode_t * CSphIndex_VLN::ExpandPrefix ( XQNode_t * pNode, CSphString & sError, CSphQueryResultMeta * pResult ) const
14862 {
14863 	if ( !pNode || !( m_pDict->GetSettings().m_bWordDict && m_tSettings.m_iMinPrefixLen>0 ) )
14864 		return pNode;
14865 
14866 	// thread safe outer storage for dictionaries chunks and file
14867 	BYTE * pBuf = NULL;
14868 	int iFD = -1;
14869 	CSphAutofile rdWordlist;
14870 	if ( !m_bPreloadWordlist )
14871 	{
14872 		if ( m_bKeepFilesOpen )
14873 			iFD = m_tWordlist.m_tFile.GetFD();
14874 		else
14875 		{
14876 			iFD = rdWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, sError );
14877 			if ( iFD<0 )
14878 				return NULL;
14879 		}
14880 
14881 		if ( m_tWordlist.m_iMaxChunk>0 )
14882 			pBuf = new BYTE [ m_tWordlist.m_iMaxChunk ];
14883 	}
14884 
14885 	assert ( m_pPreread && *m_pPreread );
14886 	assert ( !m_bPreloadWordlist || !m_tWordlist.m_pBuf.IsEmpty() );
14887 
14888 	ExpansionContext_t tCtx;
14889 	tCtx.m_pWordlist = &m_tWordlist;
14890 	tCtx.m_pBuf = pBuf;
14891 	tCtx.m_pResult = pResult;
14892 	tCtx.m_iFD = iFD;
14893 	tCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
14894 	tCtx.m_iExpansionLimit = m_iExpansionLimit;
14895 	tCtx.m_bStarEnabled = m_bEnableStar;
14896 	tCtx.m_bHasMorphology = m_pDict->HasMorphology();
14897 
14898 	pNode = sphExpandXQNode ( pNode, tCtx );
14899 
14900 	SafeDeleteArray ( pBuf );
14901 
14902 	return pNode;
14903 }
14904 
14905 
14906 // transform the (A B) NEAR C into A NEAR B NEAR C
TransformNear(XQNode_t ** ppNode)14907 static void TransformNear ( XQNode_t ** ppNode )
14908 {
14909 	XQNode_t *& pNode = *ppNode;
14910 	if ( pNode->GetOp()==SPH_QUERY_NEAR )
14911 	{
14912 		assert ( pNode->m_dWords.GetLength()==0 );
14913 		CSphVector<XQNode_t*> dArgs;
14914 		int iStartFrom;
14915 
14916 		// transform all (A B C) NEAR D into A NEAR B NEAR C NEAR D
14917 		do
14918 		{
14919 			dArgs.Reset();
14920 			iStartFrom = 0;
14921 			ARRAY_FOREACH ( i, pNode->m_dChildren )
14922 			{
14923 				XQNode_t * pChild = pNode->m_dChildren[i]; ///< shortcut
14924 				if ( pChild->GetOp()==SPH_QUERY_AND && pChild->m_dChildren.GetLength()>0 )
14925 				{
14926 					ARRAY_FOREACH ( j, pChild->m_dChildren )
14927 						if ( j==0 && iStartFrom==0 )
14928 						{
14929 							// we will remove the node anyway, so just replace it with 1-st child instead
14930 							pNode->m_dChildren[i] = pChild->m_dChildren[j];
14931 							iStartFrom = i+1;
14932 						} else
14933 							dArgs.Add ( pChild->m_dChildren[j] );
14934 					pChild->m_dChildren.Reset();
14935 					SafeDelete ( pChild );
14936 				} else if ( iStartFrom!=0 )
14937 					dArgs.Add ( pChild );
14938 			}
14939 
14940 			if ( iStartFrom!=0 )
14941 			{
14942 				pNode->m_dChildren.Resize ( iStartFrom + dArgs.GetLength() );
14943 				ARRAY_FOREACH ( i, dArgs )
14944 					pNode->m_dChildren [ i + iStartFrom ] = dArgs[i];
14945 			}
14946 		} while ( iStartFrom!=0 );
14947 	}
14948 
14949 	ARRAY_FOREACH ( i, pNode->m_dChildren )
14950 		TransformNear ( &pNode->m_dChildren[i] );
14951 }
14952 
14953 
14954 /// tag excluded keywords (rvals to operator NOT)
TagExcluded(XQNode_t * pNode,bool bNot)14955 static void TagExcluded ( XQNode_t * pNode, bool bNot )
14956 {
14957 	if ( pNode->GetOp()==SPH_QUERY_ANDNOT )
14958 	{
14959 		assert ( pNode->m_dChildren.GetLength()==2 );
14960 		assert ( pNode->m_dWords.GetLength()==0 );
14961 		TagExcluded ( pNode->m_dChildren[0], bNot );
14962 		TagExcluded ( pNode->m_dChildren[1], !bNot );
14963 
14964 	} else if ( pNode->m_dChildren.GetLength() )
14965 	{
14966 		// FIXME? check if this works okay with "virtually plain" stuff?
14967 		ARRAY_FOREACH ( i, pNode->m_dChildren )
14968 			TagExcluded ( pNode->m_dChildren[i], bNot );
14969 	} else
14970 	{
14971 		// tricky bit
14972 		// no assert on length here and that is intended
14973 		// we have fully empty nodes (0 children, 0 words) sometimes!
14974 		ARRAY_FOREACH ( i, pNode->m_dWords )
14975 			pNode->m_dWords[i].m_bExcluded = bNot;
14976 	}
14977 }
14978 
14979 
sphTransformExtendedQuery(XQNode_t ** ppNode)14980 void sphTransformExtendedQuery ( XQNode_t ** ppNode )
14981 {
14982 	TransformQuorum ( ppNode );
14983 	TransformNear ( ppNode );
14984 	TagExcluded ( *ppNode, false );
14985 }
14986 
14987 
14988 struct CmpPSortersByRandom_fn
14989 {
IsLessCmpPSortersByRandom_fn14990 	inline bool IsLess ( const ISphMatchSorter * a, const ISphMatchSorter * b ) const
14991 	{
14992 		assert ( a );
14993 		assert ( b );
14994 		return a->m_bRandomize < b->m_bRandomize;
14995 	}
14996 };
14997 
14998 
14999 /// one regular query vs many sorters
MultiQuery(const CSphQuery * pQuery,CSphQueryResult * pResult,int iSorters,ISphMatchSorter ** ppSorters,const CSphVector<CSphFilterSettings> * pExtraFilters,int iTag) const15000 bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const
15001 {
15002 	assert ( pQuery );
15003 
15004 	MEMORY ( SPH_MEM_IDX_DISK_MULTY_QUERY );
15005 
15006 	// to avoid the checking of a ppSorters's element for NULL on every next step, just filter out all nulls right here
15007 	CSphVector<ISphMatchSorter*> dSorters;
15008 	dSorters.Reserve ( iSorters );
15009 	for ( int i=0; i<iSorters; i++ )
15010 		if ( ppSorters[i] )
15011 			dSorters.Add ( ppSorters[i] );
15012 
15013 	iSorters = dSorters.GetLength();
15014 
15015 	// if we have anything to work with
15016 	if ( iSorters==0 )
15017 		return false;
15018 
15019 	// non-random at the start, random at the end
15020 	dSorters.Sort ( CmpPSortersByRandom_fn() );
15021 
15022 	// fast path for scans
15023 	if ( pQuery->m_sQuery.IsEmpty() )
15024 		return MultiScan ( pQuery, pResult, iSorters, &dSorters[0], pExtraFilters, iTag );
15025 
15026 	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( false ) );
15027 
15028 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
15029 	CSphDict * pDictBase = m_pDict;
15030 	if ( pDictBase->HasState() )
15031 	{
15032 		tDictCloned = pDictBase = pDictBase->Clone();
15033 	}
15034 
15035 	CSphScopedPtr<CSphDict> tDict ( NULL );
15036 	CSphDict * pDict = SetupStarDict ( tDict, pDictBase, *pTokenizer.Ptr() );
15037 
15038 	CSphScopedPtr<CSphDict> tDict2 ( NULL );
15039 	pDict = SetupExactDict ( tDict2, pDict, *pTokenizer.Ptr() );
15040 
15041 	// parse query
15042 	XQQuery_t tParsed;
15043 	if ( !sphParseExtendedQuery ( tParsed, pQuery->m_sQuery.cstr(), pTokenizer.Ptr(), &m_tSchema, pDict, m_tSettings.m_iStopwordStep ) )
15044 	{
15045 		pResult->m_sError = tParsed.m_sParseError;
15046 		return false;
15047 	}
15048 
15049 	// transform query if needed (quorum transform, keyword expansion, etc.)
15050 	sphTransformExtendedQuery ( &tParsed.m_pRoot );
15051 
15052 	// expanding prefix in word dictionary case
15053 	XQNode_t * pPrefixed = ExpandPrefix ( tParsed.m_pRoot, pResult->m_sError, pResult );
15054 	if ( !pPrefixed )
15055 		return false;
15056 	tParsed.m_pRoot = pPrefixed;
15057 
15058 	if ( m_bExpandKeywords )
15059 		tParsed.m_pRoot = ExpandKeywords ( tParsed.m_pRoot, m_tSettings );
15060 
15061 	if ( !sphCheckQueryHeight ( tParsed.m_pRoot, pResult->m_sError ) )
15062 		return false;
15063 
15064 	// flag common subtrees
15065 	int iCommonSubtrees = 0;
15066 	if ( m_iMaxCachedDocs && m_iMaxCachedHits )
15067 		iCommonSubtrees = sphMarkCommonSubtrees ( 1, &tParsed );
15068 
15069 	CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
15070 	bool bResult = ParsedMultiQuery ( pQuery, pResult, iSorters, &dSorters[0], tParsed, pDict, pExtraFilters, &tNodeCache, iTag );
15071 
15072 	return bResult;
15073 }
15074 
15075 
15076 /// many regular queries with one sorter attached to each query.
15077 /// returns true if at least one query succeeded. The failed queries indicated with pResult->m_iMultiplier==-1
MultiQueryEx(int iQueries,const CSphQuery * pQueries,CSphQueryResult ** ppResults,ISphMatchSorter ** ppSorters,const CSphVector<CSphFilterSettings> * pExtraFilters,int iTag) const15078 bool CSphIndex_VLN::MultiQueryEx ( int iQueries, const CSphQuery * pQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphVector<CSphFilterSettings> * pExtraFilters, int iTag ) const
15079 {
15080 	// ensure we have multiple queries
15081 	if ( iQueries==1 )
15082 		return MultiQuery ( pQueries, ppResults[0], 1, ppSorters, pExtraFilters, iTag );
15083 
15084 	MEMORY ( SPH_MEM_IDX_DISK_MULTY_QUERY_EX );
15085 
15086 	assert ( pQueries );
15087 	assert ( ppResults );
15088 	assert ( ppSorters );
15089 
15090 	ISphTokenizer * pTokenizer = m_pTokenizer->Clone ( false );
15091 
15092 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
15093 	CSphDict * pDictBase = m_pDict;
15094 	if ( pDictBase->HasState() )
15095 	{
15096 		tDictCloned = pDictBase = pDictBase->Clone();
15097 	}
15098 
15099 	CSphScopedPtr<CSphDict> tDict ( NULL );
15100 	CSphDict * pDict = SetupStarDict ( tDict, pDictBase, *pTokenizer );
15101 
15102 	CSphScopedPtr<CSphDict> tDict2 ( NULL );
15103 	pDict = SetupExactDict ( tDict2, pDict, *pTokenizer );
15104 
15105 	CSphFixedVector<XQQuery_t> dXQ ( iQueries );
15106 	bool bResult = false;
15107 	bool bResultScan = false;
15108 	for ( int i=0; i<iQueries; i++ )
15109 	{
15110 		// nothing to do without a sorter
15111 		if ( !ppSorters[i] )
15112 		{
15113 			ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
15114 			continue;
15115 		}
15116 
15117 		// fast path for scans
15118 		if ( pQueries[i].m_sQuery.IsEmpty() )
15119 		{
15120 			if ( MultiScan ( pQueries + i, ppResults[i], 1, &ppSorters[i], pExtraFilters, iTag ) )
15121 				bResultScan = true;
15122 			else
15123 				ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
15124 			continue;
15125 		}
15126 
15127 		ppResults[i]->m_tIOStats.Start();
15128 
15129 		// parse query
15130 		if ( sphParseExtendedQuery ( dXQ[i], pQueries[i].m_sQuery.cstr(), pTokenizer, &m_tSchema, pDict, m_tSettings.m_iStopwordStep ) )
15131 		{
15132 			// transform query if needed (quorum transform, keyword expansion, etc.)
15133 			sphTransformExtendedQuery ( &dXQ[i].m_pRoot );
15134 
15135 			// expanding prefix in word dictionary case
15136 			XQNode_t * pPrefixed = ExpandPrefix ( dXQ[i].m_pRoot, ppResults[i]->m_sError, ppResults[i] );
15137 			if ( pPrefixed )
15138 			{
15139 				dXQ[i].m_pRoot = pPrefixed;
15140 
15141 				if ( m_bExpandKeywords )
15142 					dXQ[i].m_pRoot = ExpandKeywords ( dXQ[i].m_pRoot, m_tSettings );
15143 
15144 				if ( sphCheckQueryHeight ( dXQ[i].m_pRoot, ppResults[i]->m_sError ) )
15145 				{
15146 					bResult = true;
15147 				} else
15148 				{
15149 					ppResults[i]->m_iMultiplier = -1;
15150 					SafeDelete ( dXQ[i].m_pRoot );
15151 				}
15152 			} else
15153 			{
15154 				ppResults[i]->m_iMultiplier = -1;
15155 				SafeDelete ( dXQ[i].m_pRoot );
15156 			}
15157 		} else
15158 		{
15159 			ppResults[i]->m_sError = dXQ[i].m_sParseError;
15160 			ppResults[i]->m_iMultiplier = -1;
15161 		}
15162 
15163 		ppResults[i]->m_tIOStats.Stop();
15164 	}
15165 
15166 	// continue only if we have at least one non-failed
15167 	if ( bResult )
15168 	{
15169 		int iCommonSubtrees = 0;
15170 		if ( m_iMaxCachedDocs && m_iMaxCachedHits )
15171 			iCommonSubtrees = sphMarkCommonSubtrees ( iQueries, &dXQ[0] );
15172 
15173 		CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
15174 		bResult = false;
15175 		for ( int j=0; j<iQueries; j++ )
15176 		{
15177 			// fullscan case
15178 			if ( pQueries[j].m_sQuery.IsEmpty() )
15179 				continue;
15180 
15181 			ppResults[j]->m_tIOStats.Start();
15182 
15183 			if ( dXQ[j].m_pRoot && ppSorters[j]
15184 					&& ParsedMultiQuery ( &pQueries[j], ppResults[j], 1, &ppSorters[j], dXQ[j], pDict, pExtraFilters, &tNodeCache, iTag ) )
15185 			{
15186 				bResult = true;
15187 				ppResults[j]->m_iMultiplier = iCommonSubtrees ? iQueries : 1;
15188 			} else
15189 			{
15190 				ppResults[j]->m_iMultiplier = -1;
15191 			}
15192 
15193 			ppResults[j]->m_tIOStats.Stop();
15194 		}
15195 	}
15196 
15197 	SafeDelete ( pTokenizer );
15198 	return bResult | bResultScan;
15199 }
15200 
ParsedMultiQuery(const CSphQuery * pQuery,CSphQueryResult * pResult,int iSorters,ISphMatchSorter ** ppSorters,const XQQuery_t & tXQ,CSphDict * pDict,const CSphVector<CSphFilterSettings> * pExtraFilters,CSphQueryNodeCache * pNodeCache,int iTag) const15201 bool CSphIndex_VLN::ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict, const CSphVector<CSphFilterSettings> * pExtraFilters, CSphQueryNodeCache * pNodeCache, int iTag ) const
15202 {
15203 	assert ( pQuery );
15204 	assert ( pResult );
15205 	assert ( ppSorters );
15206 	assert ( !pQuery->m_sQuery.IsEmpty() && pQuery->m_eMode!=SPH_MATCH_FULLSCAN ); // scans must go through MultiScan()
15207 	assert ( iTag>=0 );
15208 
15209 	// start counting
15210 	int64_t tmQueryStart = sphMicroTimer();
15211 
15212 	///////////////////
15213 	// setup searching
15214 	///////////////////
15215 
15216 	PROFILER_INIT ();
15217 	PROFILE_BEGIN ( query_init );
15218 
15219 	// non-ready index, empty response!
15220 	if ( !m_pPreread || !*m_pPreread )
15221 	{
15222 		pResult->m_sError = "index not preread";
15223 		return false;
15224 	}
15225 
15226 	// select the sorter with max schema
15227 	int iMaxSchemaSize = -1;
15228 	int iMaxSchemaIndex = -1;
15229 	for ( int i=0; i<iSorters; i++ )
15230 		if ( ppSorters[i]->GetSchema().GetRowSize() > iMaxSchemaSize )
15231 		{
15232 			iMaxSchemaSize = ppSorters[i]->GetSchema().GetRowSize();
15233 			iMaxSchemaIndex = i;
15234 		}
15235 
15236 	// setup calculations and result schema
15237 	CSphQueryContext tCtx;
15238 	if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, GetMVAPool() ) )
15239 		return false;
15240 
15241 	// set string pool for string on_sort expression fix up
15242 	tCtx.SetStringPool ( m_pStrings.GetWritePtr() );
15243 
15244 	// open files
15245 	CSphAutofile tDoclist, tHitlist, tWordlist, tDummy;
15246 	if ( !m_bKeepFilesOpen )
15247 	{
15248 		if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, pResult->m_sError ) < 0 )
15249 			return false;
15250 
15251 		if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, pResult->m_sError ) < 0 )
15252 			return false;
15253 
15254 		if ( tWordlist.Open ( GetIndexFileName ( "spi" ), SPH_O_READ, pResult->m_sError ) < 0 )
15255 			return false;
15256 	}
15257 
15258 	// setup search terms
15259 	DiskIndexQwordSetup_c tTermSetup ( m_bKeepFilesOpen ? m_tDoclistFile : tDoclist,
15260 		m_bKeepFilesOpen ? m_tHitlistFile : tHitlist,
15261 		m_bPreloadWordlist ? tDummy : ( m_bKeepFilesOpen ? m_tWordlist.m_tFile : tWordlist ),
15262 		m_bPreloadWordlist ? 0 : m_tWordlist.m_iMaxChunk );
15263 
15264 	tTermSetup.m_pDict = pDict;
15265 	tTermSetup.m_pIndex = this;
15266 	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
15267 	tTermSetup.m_tMin.m_iDocID = m_pMin->m_iDocID;
15268 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
15269 	{
15270 		tTermSetup.m_tMin.Clone ( *m_pMin, m_tSchema.GetRowSize() );
15271 		tTermSetup.m_iInlineRowitems = m_tSchema.GetRowSize();
15272 	}
15273 	tTermSetup.m_iDynamicRowitems = pResult->m_tSchema.GetDynamicSize();
15274 
15275 	if ( pQuery->m_uMaxQueryMsec>0 )
15276 		tTermSetup.m_iMaxTimer = sphMicroTimer() + pQuery->m_uMaxQueryMsec*1000; // max_query_time
15277 	tTermSetup.m_pWarning = &pResult->m_sWarning;
15278 	tTermSetup.m_bSetupReaders = true;
15279 	tTermSetup.m_pCtx = &tCtx;
15280 	tTermSetup.m_pNodeCache = pNodeCache;
15281 
15282 	int iIndexWeight = pQuery->GetIndexWeight ( m_sIndexName.cstr() );
15283 
15284 	// bind weights
15285 	tCtx.BindWeights ( pQuery, m_tSchema, iIndexWeight );
15286 
15287 	SphWordStatChecker_t tStatDiff;
15288 	tStatDiff.Set ( pResult->m_hWordStats );
15289 
15290 	// setup query
15291 	// must happen before index-level reject, in order to build proper keyword stats
15292 	CSphScopedPtr<ISphRanker> pRanker ( sphCreateRanker ( tXQ, pQuery, pResult, tTermSetup, tCtx ) );
15293 	if ( !pRanker.Ptr() )
15294 		return false;
15295 
15296 	tStatDiff.DumpDiffer ( pResult->m_hWordStats, m_sIndexName.cstr(), pResult->m_sWarning );
15297 
15298 	// empty index, empty response!
15299 	if ( m_bIsEmpty )
15300 		return true;
15301 	assert ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_pDocinfo.IsEmpty() ); // check that docinfo is preloaded
15302 
15303 	// setup filters
15304 	if ( !tCtx.CreateFilters ( pQuery->m_sQuery.IsEmpty(), &pQuery->m_dFilters, pResult->m_tSchema, GetMVAPool(), pResult->m_sError ) )
15305 		return false;
15306 	if ( !tCtx.CreateFilters ( pQuery->m_sQuery.IsEmpty(), pExtraFilters, pResult->m_tSchema, GetMVAPool(), pResult->m_sError ) )
15307 		return false;
15308 
15309 	// check if we can early reject the whole index
15310 	if ( tCtx.m_pFilter && m_uDocinfoIndex )
15311 	{
15312 		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
15313 		DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ 2*m_uDocinfoIndex*uStride ] );
15314 		DWORD * pMaxEntry = pMinEntry + uStride;
15315 
15316 		if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
15317 			return true;
15318 	}
15319 
15320 	// setup lookup
15321 	tCtx.m_bLookupFilter = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN ) && pQuery->m_dFilters.GetLength();
15322 	if ( tCtx.m_dCalcFilter.GetLength() || pQuery->m_eRanker==SPH_RANK_EXPR )
15323 		tCtx.m_bLookupFilter = true; // suboptimal in case of attr-independent expressions, but we don't care
15324 
15325 	tCtx.m_bLookupSort = false;
15326 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !tCtx.m_bLookupFilter )
15327 		for ( int iSorter=0; iSorter<iSorters && !tCtx.m_bLookupSort; iSorter++ )
15328 			if ( ppSorters[iSorter]->UsesAttrs() )
15329 				tCtx.m_bLookupSort = true;
15330 	if ( tCtx.m_dCalcSort.GetLength() )
15331 		tCtx.m_bLookupSort = true; // suboptimal in case of attr-independent expressions, but we don't care
15332 
15333 	// setup sorters vs. MVA
15334 	for ( int i=0; i<iSorters; i++ )
15335 	{
15336 		(ppSorters[i])->SetMVAPool ( m_pMva.GetWritePtr() );
15337 		(ppSorters[i])->SetStringPool ( m_pStrings.GetWritePtr() );
15338 	}
15339 
15340 	// setup overrides
15341 	if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema ) )
15342 		return false;
15343 
15344 	PROFILE_END ( query_init );
15345 
15346 	//////////////////////////////////////
15347 	// find and weight matching documents
15348 	//////////////////////////////////////
15349 
15350 	bool bFinalLookup = !tCtx.m_bLookupFilter && !tCtx.m_bLookupSort;
15351 	bool bFinalPass = bFinalLookup || tCtx.m_dCalcFinal.GetLength();
15352 	int iMyTag = bFinalPass ? -1 : iTag;
15353 
15354 	PROFILE_BEGIN ( query_match );
15355 	switch ( pQuery->m_eMode )
15356 	{
15357 		case SPH_MATCH_ALL:
15358 		case SPH_MATCH_PHRASE:
15359 		case SPH_MATCH_ANY:
15360 		case SPH_MATCH_EXTENDED:
15361 		case SPH_MATCH_EXTENDED2:
15362 		case SPH_MATCH_BOOLEAN:
15363 			if ( !MatchExtended ( &tCtx, pQuery, iSorters, ppSorters, pRanker.Ptr(), iMyTag ) )
15364 				return false;
15365 			break;
15366 
15367 		default:
15368 			sphDie ( "INTERNAL ERROR: unknown matching mode (mode=%d)", pQuery->m_eMode );
15369 	}
15370 	PROFILE_END ( query_match );
15371 
15372 	////////////////////
15373 	// cook result sets
15374 	////////////////////
15375 
15376 	// adjust result sets
15377 	for ( int iSorter=0; iSorter<iSorters; iSorter++ )
15378 	{
15379 		ISphMatchSorter * pTop = ppSorters[iSorter];
15380 		if ( pTop->GetLength() && bFinalPass )
15381 		{
15382 			CSphMatch * const pHead = pTop->Finalize();
15383 			const int iCount = pTop->GetLength ();
15384 			CSphMatch * const pTail = pHead + iCount;
15385 
15386 			for ( CSphMatch * pCur=pHead; pCur<pTail; pCur++ )
15387 				if ( pCur->m_iTag<0 )
15388 			{
15389 				if ( bFinalLookup )
15390 					CopyDocinfo ( &tCtx, *pCur, FindDocinfo ( pCur->m_iDocID ) );
15391 				tCtx.CalcFinal ( *pCur );
15392 				pCur->m_iTag = iTag;
15393 			}
15394 		}
15395 
15396 		// mva and string pools ptrs
15397 		pResult->m_pMva = m_pMva.GetWritePtr();
15398 		pResult->m_pStrings = m_pStrings.GetWritePtr();
15399 	}
15400 
15401 	PROFILER_DONE ();
15402 	PROFILE_SHOW ();
15403 
15404 	// query timer
15405 	pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
15406 	return true;
15407 }
15408 
15409 //////////////////////////////////////////////////////////////////////////
15410 // INDEX CHECKING
15411 //////////////////////////////////////////////////////////////////////////
15412 
// report a single check failure
// _args is a parenthesized fprintf() argument list, stream first, eg. LOC_FAIL(( fp, "bad value %d", iValue ))
// expects fp, iFails, iFailsPrinted, and FAILS_THRESH to be visible at the expansion site
// every use bumps iFails; the message itself is only printed (and iFailsPrinted bumped)
// while iFails stays within FAILS_THRESH, and a one-off notice is printed when the threshold is hit
// NOTE, this expands to a bare if-statement; put it in braces when an else branch follows
#define LOC_FAIL(_args) \
	if ( ++iFails<=FAILS_THRESH ) \
	{ \
		fputs ( "FAILED, ", fp ); \
		fprintf _args; \
		fputs ( "\n", fp ); \
		++iFailsPrinted; \
		if ( iFails==FAILS_THRESH ) \
			fputs ( "(threshold reached; suppressing further output)\n", fp ); \
	}
15424 
DebugCheck(FILE * fp)15425 int CSphIndex_VLN::DebugCheck ( FILE * fp )
15426 {
15427 	int64_t tmCheck = sphMicroTimer();
15428 	int iFails = 0;
15429 	int iFailsPrinted = 0;
15430 	const int FAILS_THRESH = 100;
15431 
15432 	// check if index is ready
15433 	if ( m_dShared.GetNumEntries()!=SPH_SHARED_VARS_COUNT || !m_pPreread || !*m_pPreread )
15434 		LOC_FAIL(( fp, "index not preread" ));
15435 
15436 	bool bProgress = isatty ( fileno ( fp ) )!=0;
15437 
15438 	//////////////
15439 	// open files
15440 	//////////////
15441 
15442 	if ( !LoadHitlessWords () )
15443 		LOC_FAIL(( fp, "unable to load hitless words: %s", m_sLastError.cstr() ));
15444 
15445 	CSphString sError;
15446 	CSphAutoreader rdDict, rdDocs, rdHits;
15447 
15448 	if ( !rdDict.Open ( GetIndexFileName("spi"), sError ) )
15449 		LOC_FAIL(( fp, "unable to open dictionary: %s", sError.cstr() ));
15450 
15451 	if ( !rdDocs.Open ( GetIndexFileName("spd"), sError ) )
15452 		LOC_FAIL(( fp, "unable to open doclist: %s", sError.cstr() ));
15453 
15454 	if ( !rdHits.Open ( GetIndexFileName("spp"), sError ) )
15455 		LOC_FAIL(( fp, "unable to open hitlist: %s", sError.cstr() ));
15456 
15457 	CSphSavedFile tStat;
15458 	const CSphTokenizerSettings & tTokenizerSettings = m_pTokenizer->GetSettings ();
15459 	if ( !tTokenizerSettings.m_sSynonymsFile.IsEmpty() && !GetFileStats ( tTokenizerSettings.m_sSynonymsFile.cstr(), tStat, &sError ) )
15460 		LOC_FAIL(( fp, "unable to open exceptions '%s': %s", tTokenizerSettings.m_sSynonymsFile.cstr(), sError.cstr() ));
15461 
15462 	const CSphDictSettings & tDictSettings = m_pDict->GetSettings ();
15463 	if ( !tDictSettings.m_sStopwords.IsEmpty() && !GetFileStats ( tDictSettings.m_sStopwords.cstr(), tStat, &sError ) )
15464 		LOC_FAIL(( fp, "unable to open stopwords '%s': %s", tDictSettings.m_sStopwords.cstr(), sError.cstr() ));
15465 	if ( !tDictSettings.m_sWordforms.IsEmpty() && !GetFileStats ( tDictSettings.m_sWordforms.cstr(), tStat, &sError ) )
15466 		LOC_FAIL(( fp, "unable to open wordforms '%s': %s", tDictSettings.m_sWordforms.cstr(), sError.cstr() ));
15467 
15468 	////////////////////
15469 	// check dictionary
15470 	////////////////////
15471 
15472 	fprintf ( fp, "checking dictionary...\n" );
15473 
15474 	SphWordID_t uWordid = 0;
15475 	int64_t iDoclistOffset = 0;
15476 	int iWordsTotal = 0;
15477 
15478 	char sWord[MAX_KEYWORD_BYTES], sLastWord[MAX_KEYWORD_BYTES];
15479 	memset ( sWord, 0, sizeof(sWord) );
15480 	memset ( sLastWord, 0, sizeof(sLastWord) );
15481 
15482 	const int iWordPerCP = m_uVersion>=21 ? SPH_WORDLIST_CHECKPOINT : 1024;
15483 	const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
15484 
15485 	CSphVector<CSphWordlistCheckpoint> dCheckpoints;
15486 
15487 	if ( bWordDict && m_uVersion<21 )
15488 		LOC_FAIL(( fp, "dictionary needed index version not less then 21 (readed=%d)"
15489 			, m_uVersion ));
15490 
15491 	rdDict.SeekTo ( 1, READ_NO_SIZE_HINT );
15492 	for ( ; rdDict.GetPos()!=m_tWordlist.m_iCheckpointsPos && !m_bIsEmpty; )
15493 	{
15494 		// sanity checks
15495 		if ( rdDict.GetPos()>=m_tWordlist.m_iCheckpointsPos )
15496 		{
15497 			LOC_FAIL(( fp, "reading past checkpoints" ));
15498 			break;
15499 		}
15500 
15501 		// store current entry pos (for checkpointing later), read next delta
15502 		const int64_t iDictPos = rdDict.GetPos();
15503 		const SphWordID_t iDeltaWord = bWordDict ? rdDict.GetByte() : rdDict.UnzipWordid();
15504 
15505 		// checkpoint encountered, handle it
15506 		if ( !iDeltaWord )
15507 		{
15508 			rdDict.UnzipOffset();
15509 
15510 			if ( ( iWordsTotal%iWordPerCP )!=0 && rdDict.GetPos()!=m_tWordlist.m_iCheckpointsPos )
15511 				LOC_FAIL(( fp, "unexpected checkpoint (pos="INT64_FMT", word=%d, words=%d, expected=%d)",
15512 					iDictPos, iWordsTotal, ( iWordsTotal%iWordPerCP ), iWordPerCP ));
15513 
15514 			uWordid = 0;
15515 			iDoclistOffset = 0;
15516 			continue;
15517 		}
15518 
15519 		SphWordID_t uNewWordid = 0;
15520 		SphOffset_t iNewDoclistOffset = 0;
15521 		int iDocs = 0;
15522 		int iHits = 0;
15523 
15524 		if ( bWordDict )
15525 		{
15526 			// unpack next word
15527 			// must be in sync with DictEnd()!
15528 			BYTE uPack = (BYTE)iDeltaWord;
15529 			int iMatch, iDelta;
15530 			if ( uPack & 0x80 )
15531 			{
15532 				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
15533 				iMatch = uPack & 15;
15534 			} else
15535 			{
15536 				iDelta = uPack & 127;
15537 				iMatch = rdDict.GetByte();
15538 			}
15539 			const int iLastWordLen = strlen(sLastWord);
15540 			if ( iMatch+iDelta>=(int)sizeof(sLastWord)-1 || iMatch>iLastWordLen )
15541 			{
15542 				LOC_FAIL(( fp, "wrong word-delta (pos="INT64_FMT", word=%s, len=%d, begin=%d, delta=%d)",
15543 					iDictPos, sLastWord, iLastWordLen, iMatch, iDelta ));
15544 				rdDict.SkipBytes ( iDelta );
15545 			} else
15546 			{
15547 				rdDict.GetBytes ( sWord + iMatch, iDelta );
15548 				sWord [ iMatch+iDelta ] = '\0';
15549 			}
15550 
15551 			iNewDoclistOffset = rdDict.UnzipOffset();
15552 			iDocs = rdDict.UnzipInt();
15553 			iHits = rdDict.UnzipInt();
15554 			int iHint = ( iDocs>=DOCLIST_HINT_THRESH ) ? rdDict.GetByte() : 0;
15555 			iHint = DoclistHintUnpack ( iDocs, (BYTE)iHint );
15556 
15557 			const int iNewWordLen = strlen(sWord);
15558 
15559 			if ( iNewWordLen==0 )
15560 				LOC_FAIL(( fp, "empty word in dictionary (pos="INT64_FMT")",
15561 					iDictPos ));
15562 
15563 			if ( iLastWordLen && iNewWordLen )
15564 				if ( sphDictCmpStrictly ( sWord, iNewWordLen, sLastWord, iLastWordLen )<=0 )
15565 					LOC_FAIL(( fp, "word order decreased (pos="INT64_FMT", word=%s, prev=%s)",
15566 						iDictPos, sLastWord, sWord ));
15567 
15568 			if ( iHint<0 )
15569 				LOC_FAIL(( fp, "invalid word hint (pos="INT64_FMT", word=%s, hint=%d)",
15570 					iDictPos, sWord, iHint ));
15571 
15572 			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
15573 				LOC_FAIL(( fp, "invalid docs/hits (pos="INT64_FMT", word=%s, docs="INT64_FMT", hits="INT64_FMT")",
15574 					(int64_t)iDictPos, sWord, (int64_t)iDocs, (int64_t)iHits ));
15575 
15576 			memcpy ( sLastWord, sWord, sizeof(sLastWord) );
15577 
15578 		} else
15579 		{
15580 			// finish reading the entire entry
15581 			uNewWordid = uWordid + iDeltaWord;
15582 			iNewDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
15583 			iDocs = rdDict.UnzipInt();
15584 			iHits = rdDict.UnzipInt();
15585 			bool bHitless = m_dHitlessWords.BinarySearch ( uNewWordid );
15586 			if ( bHitless )
15587 				iDocs &= 0x7fffffff;
15588 
15589 			if ( uNewWordid<=uWordid )
15590 				LOC_FAIL(( fp, "wordid decreased (pos="INT64_FMT", wordid="UINT64_FMT", previd="UINT64_FMT")",
15591 					(int64_t)iDictPos, (uint64_t)uNewWordid, (uint64_t)uWordid ));
15592 
15593 			if ( iNewDoclistOffset<=iDoclistOffset )
15594 				LOC_FAIL(( fp, "doclist offset decreased (pos="INT64_FMT", wordid="UINT64_FMT")",
15595 					(int64_t)iDictPos, (uint64_t)uNewWordid ));
15596 
15597 			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
15598 				LOC_FAIL(( fp, "invalid docs/hits (pos="INT64_FMT", wordid="UINT64_FMT", docs="INT64_FMT", hits="INT64_FMT", hitless=%s)",
15599 					(int64_t)iDictPos, (uint64_t)uNewWordid, (int64_t)iDocs, (int64_t)iHits, ( bHitless?"true":"false" ) ));
15600 		}
15601 
15602 		// update stats, add checkpoint
15603 		if ( ( iWordsTotal%iWordPerCP )==0 )
15604 		{
15605 			CSphWordlistCheckpoint & tCP = dCheckpoints.Add();
15606 			tCP.m_iWordlistOffset = iDictPos;
15607 			if ( bWordDict )
15608 			{
15609 				const int iLen = strlen ( sWord );
15610 				char * sWordChecked = new char [iLen+1];
15611 				strncpy ( sWordChecked, sWord, iLen+1 );
15612 				tCP.m_sWord = sWordChecked;
15613 			} else
15614 				tCP.m_iWordID = uNewWordid;
15615 		}
15616 
15617 		uWordid = uNewWordid;
15618 		iDoclistOffset = iNewDoclistOffset;
15619 		iWordsTotal++;
15620 	}
15621 
15622 	// check the checkpoints
15623 	if ( dCheckpoints.GetLength()!=m_tWordlist.m_dCheckpoints.GetLength() )
15624 		LOC_FAIL(( fp, "checkpoint count mismatch (read=%d, calc=%d)",
15625 			m_tWordlist.m_dCheckpoints.GetLength(), dCheckpoints.GetLength() ));
15626 
15627 	for ( int i=0; i < Min ( dCheckpoints.GetLength(), m_tWordlist.m_dCheckpoints.GetLength() ); i++ )
15628 	{
15629 		const CSphWordlistCheckpoint & tRefCP = dCheckpoints[i];
15630 		const CSphWordlistCheckpoint & tCP = m_tWordlist.m_dCheckpoints[i];
15631 		const int iLen = bWordDict ? strlen ( tCP.m_sWord ) : 0;
15632 		if ( bWordDict && ( strlen ( tRefCP.m_sWord )==0 || strlen ( tCP.m_sWord )==0 ) )
15633 		{
15634 			LOC_FAIL(( fp, "empty checkpoint %d (read_word=%s, read_len=%u, readpos="INT64_FMT", calc_word=%s, calc_len=%u, calcpos="INT64_FMT")",
15635 				i, tCP.m_sWord, (DWORD)strlen ( tCP.m_sWord ), (int64_t)tCP.m_iWordlistOffset,
15636 					tRefCP.m_sWord, (DWORD)strlen ( tRefCP.m_sWord ), (int64_t)tRefCP.m_iWordlistOffset ));
15637 
15638 		} else if ( sphCheckpointCmpStrictly ( tCP.m_sWord, iLen, tCP.m_iWordID, bWordDict, tRefCP )
15639 			|| tRefCP.m_iWordlistOffset!=tCP.m_iWordlistOffset )
15640 		{
15641 			if ( bWordDict )
15642 			{
15643 				LOC_FAIL(( fp, "checkpoint %d differs (read_word=%s, readpos="INT64_FMT", calc_word=%s, calcpos="INT64_FMT")",
15644 					i,
15645 					tCP.m_sWord,
15646 					(int64_t)tCP.m_iWordlistOffset,
15647 					tRefCP.m_sWord,
15648 					(int64_t)tRefCP.m_iWordlistOffset ));
15649 			} else
15650 			{
15651 				LOC_FAIL(( fp, "checkpoint %d differs (readid="UINT64_FMT", readpos="INT64_FMT", calcid="UINT64_FMT", calcpos="INT64_FMT")",
15652 					i,
15653 					(uint64_t)tCP.m_iWordID,
15654 					(int64_t)tCP.m_iWordlistOffset,
15655 					(uint64_t)tRefCP.m_iWordID,
15656 					(int64_t)tRefCP.m_iWordlistOffset ));
15657 			}
15658 		}
15659 	}
15660 	if ( bWordDict )
15661 		ARRAY_FOREACH ( i, dCheckpoints )
15662 			SafeDeleteArray ( dCheckpoints[i].m_sWord );
15663 
15664 	dCheckpoints.Reset ();
15665 
15666 	///////////////////////
15667 	// check docs and hits
15668 	///////////////////////
15669 
15670 	fprintf ( fp, "checking data...\n" );
15671 
15672 	int64_t iDocsSize = rdDocs.GetFilesize();
15673 
15674 	rdDict.SeekTo ( 1, READ_NO_SIZE_HINT );
15675 	rdDocs.SeekTo ( 1, READ_NO_SIZE_HINT );
15676 	rdHits.SeekTo ( 1, READ_NO_SIZE_HINT );
15677 
15678 	uWordid = 0;
15679 	iDoclistOffset = 0;
15680 	int iDictDocs, iDictHits;
15681 
15682 	int iWordsChecked = 0;
15683 	for ( ;rdDict.GetPos()<m_tWordlist.m_iCheckpointsPos; )
15684 	{
15685 		const SphWordID_t iDeltaWord = bWordDict ? rdDict.GetByte() : rdDict.UnzipWordid();
15686 		if ( !iDeltaWord )
15687 		{
15688 			rdDict.UnzipOffset();
15689 
15690 			uWordid = 0;
15691 			iDoclistOffset = 0;
15692 			continue;
15693 		}
15694 
15695 		if ( bWordDict )
15696 		{
15697 			// unpack next word
15698 			// must be in sync with DictEnd()!
15699 			BYTE uPack = (BYTE)iDeltaWord;
15700 
15701 			int iMatch, iDelta;
15702 			if ( uPack & 0x80 )
15703 			{
15704 				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
15705 				iMatch = uPack & 15;
15706 			} else
15707 			{
15708 				iDelta = uPack & 127;
15709 				iMatch = rdDict.GetByte();
15710 			}
15711 			const int iLastWordLen = strlen(sWord);
15712 			if ( iMatch+iDelta>=(int)sizeof(sWord)-1 || iMatch>iLastWordLen )
15713 				rdDict.SkipBytes ( iDelta );
15714 			else
15715 			{
15716 				rdDict.GetBytes ( sWord + iMatch, iDelta );
15717 				sWord [ iMatch+iDelta ] = '\0';
15718 			}
15719 
15720 			iDoclistOffset = rdDict.UnzipOffset();
15721 			iDictDocs = rdDict.UnzipInt();
15722 			iDictHits = rdDict.UnzipInt();
15723 			DoclistHintUnpack ( iDictDocs, ( iDictDocs>=DOCLIST_HINT_THRESH ) ? rdDict.GetByte() : 0 );
15724 		} else
15725 		{
15726 			// finish reading the entire entry
15727 			uWordid = uWordid + iDeltaWord;
15728 			iDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
15729 			iDictDocs = rdDict.UnzipInt();
15730 			if ( m_dHitlessWords.BinarySearch ( uWordid ) )
15731 				iDictDocs &= 0x7fffffff;
15732 			iDictHits = rdDict.UnzipInt();
15733 		}
15734 
15735 		// check whether the offset is as expected
15736 		if ( iDoclistOffset!=rdDocs.GetPos() )
15737 		{
15738 			if ( !bWordDict )
15739 				LOC_FAIL(( fp, "unexpected doclist offset (wordid="UINT64_FMT"(%s)(%d), dictpos="INT64_FMT", doclistpos="INT64_FMT")",
15740 					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, (int64_t)rdDocs.GetPos() ));
15741 
15742 			if ( iDoclistOffset>=iDocsSize || iDoclistOffset<0 )
15743 			{
15744 				LOC_FAIL(( fp, "unexpected doclist offset, off the file (wordid="UINT64_FMT"(%s)(%d), dictpos="INT64_FMT", doclistsize="INT64_FMT")",
15745 					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, iDocsSize ));
15746 				iWordsChecked++;
15747 				continue;
15748 			} else
15749 				rdDocs.SeekTo ( iDoclistOffset, READ_NO_SIZE_HINT );
15750 		}
15751 
15752 		// create and manually setup doclist reader
15753 		DiskIndexQwordTraits_c * pQword = NULL;
15754 		WITH_QWORD ( this, false, T, pQword = new T ( false, false ) );
15755 
15756 		pQword->m_tDoc.Reset ( m_tSchema.GetDynamicSize() );
15757 		pQword->m_iMinID = m_pMin->m_iDocID;
15758 		pQword->m_tDoc.m_iDocID = m_pMin->m_iDocID;
15759 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
15760 		{
15761 			pQword->m_iInlineAttrs = m_tSchema.GetDynamicSize();
15762 			pQword->m_pInlineFixup = m_pMin->m_pDynamic;
15763 		} else
15764 		{
15765 			pQword->m_iInlineAttrs = 0;
15766 			pQword->m_pInlineFixup = NULL;
15767 		}
15768 		pQword->m_iDocs = 0;
15769 		pQword->m_iHits = 0;
15770 		pQword->m_rdDoclist.SetFile ( rdDocs.GetFD(), rdDocs.GetFilename().cstr() );
15771 		pQword->m_rdDoclist.SeekTo ( rdDocs.GetPos(), READ_NO_SIZE_HINT );
15772 		pQword->m_rdHitlist.SetFile ( rdHits.GetFD(), rdHits.GetFilename().cstr() );
15773 		pQword->m_rdHitlist.SeekTo ( rdHits.GetPos(), READ_NO_SIZE_HINT );
15774 
15775 		CSphRowitem * pInlineStorage = NULL;
15776 		if ( pQword->m_iInlineAttrs )
15777 			pInlineStorage = new CSphRowitem [ pQword->m_iInlineAttrs ];
15778 
15779 		// loop the doclist
15780 		SphDocID_t uLastDocid = 0;
15781 		int iDoclistDocs = 0;
15782 		int iDoclistHits = 0;
15783 		int iHitlistHits = 0;
15784 
15785 		// FIXME!!! dict=keywords + hitless_words=some
15786 		bool bHitless = ( m_tSettings.m_eHitless==SPH_HITLESS_ALL ||
15787 			( m_tSettings.m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.BinarySearch ( uWordid ) ) );
15788 		pQword->m_bHasHitlist = !bHitless;
15789 
15790 		for ( ;; )
15791 		{
15792 			const CSphMatch & tDoc = pQword->GetNextDoc ( pInlineStorage );
15793 			if ( !tDoc.m_iDocID )
15794 				break;
15795 
15796 			// checks!
15797 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
15798 			{
15799 				const CSphRowitem * pFound = FindDocinfo ( tDoc.m_iDocID );
15800 				if ( !pFound )
15801 					LOC_FAIL(( fp, "row not found (wordid="UINT64_FMT"(%s), docid="DOCID_FMT")",
15802 						uint64_t(uWordid), sWord, tDoc.m_iDocID ));
15803 
15804 				if ( pFound )
15805 					if ( tDoc.m_iDocID!=DOCINFO2ID(pFound) )
15806 						LOC_FAIL(( fp, "row found but id mismatches (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", found="DOCID_FMT")",
15807 							uint64_t(uWordid), sWord, tDoc.m_iDocID, DOCINFO2ID(pFound) ));
15808 			}
15809 
15810 			if ( tDoc.m_iDocID<=uLastDocid )
15811 				LOC_FAIL(( fp, "docid decreased (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", lastid="DOCID_FMT")",
15812 					uint64_t(uWordid), sWord, tDoc.m_iDocID, uLastDocid ));
15813 
15814 			uLastDocid = tDoc.m_iDocID;
15815 			iDoclistDocs++;
15816 			iDoclistHits += pQword->m_uMatchHits;
15817 
15818 			// check position in case of regular (not-inline) hit
15819 			if (!( pQword->m_iHitlistPos>>63 ))
15820 			{
15821 				if ( !bWordDict && pQword->m_iHitlistPos!=pQword->m_rdHitlist.GetPos() )
15822 					LOC_FAIL(( fp, "unexpected hitlist offset (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", expected="INT64_FMT", actual="INT64_FMT")",
15823 						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID,
15824 						(int64_t)pQword->m_iHitlistPos, (int64_t)pQword->m_rdHitlist.GetPos() ));
15825 			}
15826 
15827 			// aim
15828 			pQword->SeekHitlist ( pQword->m_iHitlistPos );
15829 
15830 			// loop the hitlist
15831 			int iDocHits = 0;
15832 			CSphSmallBitvec dFieldMask;
15833 			dFieldMask.Unset();
15834 			Hitpos_t uLastHit = EMPTY_HIT;
15835 
15836 			while ( !bHitless )
15837 			{
15838 				Hitpos_t uHit = pQword->GetNextHit();
15839 				if ( uHit==EMPTY_HIT )
15840 					break;
15841 
15842 				if (!( uLastHit<uHit ))
15843 					LOC_FAIL(( fp, "hit decreased (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", hit=%u, last=%u)",
15844 						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, uHit, uLastHit ));
15845 				uLastHit = uHit;
15846 
15847 				int iField = HITMAN::GetField ( uHit );
15848 				if ( iField<0 || iField>=SPH_MAX_FIELDS )
15849 				{
15850 					LOC_FAIL(( fp, "hit field out of bounds (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", field=%d)",
15851 						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, iField ));
15852 
15853 				} else if ( iField>=m_tSchema.m_dFields.GetLength() )
15854 				{
15855 					LOC_FAIL(( fp, "hit field out of schema (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", field=%d)",
15856 						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, iField ));
15857 				}
15858 
15859 				dFieldMask.Set(iField);
15860 				iDocHits++; // to check doclist entry
15861 				iHitlistHits++; // to check dictionary entry
15862 			}
15863 
15864 			// check hit count
15865 			if ( iDocHits!=(int)pQword->m_uMatchHits && !bHitless )
15866 				LOC_FAIL(( fp, "doc hit count mismatch (wordid="UINT64_FMT"(%s), docid="DOCID_FMT", doclist=%d, hitlist=%d)",
15867 					(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID, pQword->m_uMatchHits, iDocHits ));
15868 
15869 			// check the mask
15870 			if ( dFieldMask!=pQword->m_dQwordFields && !bHitless )
15871 				LOC_FAIL(( fp, "field mask mismatch (wordid="UINT64_FMT"(%s), docid="DOCID_FMT")",
15872 					(uint64_t)uWordid, sWord, pQword->m_tDoc.m_iDocID ));
15873 
15874 			// update my hitlist reader
15875 			rdHits.SeekTo ( pQword->m_rdHitlist.GetPos(), READ_NO_SIZE_HINT );
15876 		}
15877 
15878 		// do checks
15879 		if ( iDictDocs!=iDoclistDocs )
15880 			LOC_FAIL(( fp, "doc count mismatch (wordid="UINT64_FMT"(%s), dict=%d, doclist=%d, hitless=%s)",
15881 				uint64_t(uWordid), sWord, iDictDocs, iDoclistDocs, ( bHitless?"true":"false" ) ));
15882 
15883 		if ( ( iDictHits!=iDoclistHits || iDictHits!=iHitlistHits ) && !bHitless )
15884 			LOC_FAIL(( fp, "hit count mismatch (wordid="UINT64_FMT"(%s), dict=%d, doclist=%d, hitlist=%d)",
15885 				uint64_t(uWordid), sWord, iDictHits, iDoclistHits, iHitlistHits ));
15886 
15887 		// move my reader instance forward too
15888 		rdDocs.SeekTo ( pQword->m_rdDoclist.GetPos(), READ_NO_SIZE_HINT );
15889 
15890 		// cleanup
15891 		SafeDelete ( pInlineStorage );
15892 		SafeDelete ( pQword );
15893 
15894 		// progress bar
15895 		if ( (++iWordsChecked)%1000==0 && bProgress )
15896 		{
15897 			fprintf ( fp, "%d/%d\r", iWordsChecked, iWordsTotal );
15898 			fflush ( fp );
15899 		}
15900 	}
15901 
15902 	///////////////////////////
15903 	// check rows (attributes)
15904 	///////////////////////////
15905 
15906 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_pDocinfo.IsEmpty() )
15907 	{
15908 		fprintf ( fp, "checking rows...\n" );
15909 
15910 		// sizes and counts
15911 		DWORD uRowsTotal = m_uDocinfo;
15912 		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
15913 
15914 		DWORD uAllRowsTotal = uRowsTotal;
15915 		uAllRowsTotal += 2*(1+m_uDocinfoIndex); // should had been fixed up to v.20 by the loader
15916 
15917 		if ( uAllRowsTotal*uStride!=m_pDocinfo.GetNumEntries() )
15918 			LOC_FAIL(( fp, "rowitems count mismatch (expected=%u, loaded="INT64_FMT")",
15919 				uAllRowsTotal*uStride, (int64_t)m_pDocinfo.GetNumEntries() ));
15920 
15921 		// extract rowitem indexes for MVAs etc
15922 		// (ie. attr types that we can and will run additional checks on)
15923 		CSphVector<int> dMvaItems;
15924 		CSphVector<CSphAttrLocator> dFloatItems;
15925 		CSphVector<CSphAttrLocator> dStrItems;
15926 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
15927 		{
15928 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
15929 			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
15930 			{
15931 				if ( tAttr.m_tLocator.m_iBitCount!=ROWITEM_BITS )
15932 				{
15933 					LOC_FAIL(( fp, "unexpected MVA bitcount (attr=%d, expected=%d, got=%d)",
15934 						i, ROWITEM_BITS, tAttr.m_tLocator.m_iBitCount ));
15935 					continue;
15936 				}
15937 				if ( ( tAttr.m_tLocator.m_iBitOffset % ROWITEM_BITS )!=0 )
15938 				{
15939 					LOC_FAIL(( fp, "unaligned MVA bitoffset (attr=%d, bitoffset=%d)",
15940 						i, tAttr.m_tLocator.m_iBitOffset ));
15941 					continue;
15942 				}
15943 				if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
15944 				dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
15945 			} else if ( tAttr.m_eAttrType==SPH_ATTR_FLOAT )
15946 				dFloatItems.Add	( tAttr.m_tLocator );
15947 			else if ( tAttr.m_eAttrType==SPH_ATTR_STRING )
15948 				dStrItems.Add ( tAttr.m_tLocator );
15949 		}
15950 		int iMva64 = dMvaItems.GetLength();
15951 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
15952 		{
15953 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
15954 			if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
15955 				dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
15956 		}
15957 
15958 		// walk string data, build a list of acceptable start offsets
15959 		// must be sorted by construction
15960 		CSphVector<DWORD> dStringOffsets;
15961 		if ( m_pStrings.GetNumEntries()>1 )
15962 		{
15963 			const BYTE * pBase = m_pStrings.GetWritePtr();
15964 			const BYTE * pCur = pBase + 1;
15965 			const BYTE * pMax = pBase + m_pStrings.GetNumEntries();
15966 			while ( pCur<pMax )
15967 			{
15968 				const BYTE * pStr = NULL;
15969 				const int iLen = sphUnpackStr ( pCur, &pStr );
15970 
15971 				// 4 bytes must be enough to encode string length, hence pCur+4
15972 				if ( pStr+iLen>pMax || pStr<pCur || pStr>pCur+4 )
15973 				{
15974 					LOC_FAIL(( fp, "string length out of bounds (offset=%u, len=%d)", (DWORD)(pCur-pBase), iLen ));
15975 					break;
15976 				}
15977 
15978 				dStringOffsets.Add ( (DWORD)(pCur-pBase) );
15979 				pCur = pStr + iLen;
15980 			}
15981 		}
15982 
15983 		// loop the rows
15984 		const CSphRowitem * pRow = m_pDocinfo.GetWritePtr();
15985 		const DWORD * pMvaBase = m_pMva.GetWritePtr();
15986 		const DWORD * pMvaMax = pMvaBase + m_pMva.GetNumEntries();
15987 		const DWORD * pMva = pMvaBase;
15988 
15989 		int iOrphan = 0;
15990 		SphDocID_t uLastID = 0;
15991 
15992 		for ( DWORD uRow=0; uRow<uRowsTotal; uRow++, pRow+=uStride )
15993 		{
15994 			// check that ids are ascending
15995 			bool bIsSpaValid = uLastID < DOCINFO2ID(pRow);
15996 			if ( !bIsSpaValid )
15997 				LOC_FAIL(( fp, "docid decreased (row=%u, id="DOCID_FMT", lastid="DOCID_FMT")",
15998 					uRow, DOCINFO2ID(pRow), uLastID ));
15999 
16000 			uLastID = DOCINFO2ID(pRow);
16001 
16002 			///////////////////////////
16003 			// check MVAs
16004 			///////////////////////////
16005 
16006 			if ( dMvaItems.GetLength() )
16007 			{
16008 				const DWORD * pMvaSpaFixed = NULL;
16009 				const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
16010 				bool bHasValues = false;
16011 				ARRAY_FOREACH ( iItem, dMvaItems )
16012 				{
16013 					const DWORD uOffset = pAttrs[dMvaItems[iItem]];
16014 					bHasValues |= uOffset!=0;
16015 
16016 					if ( pMvaBase+uOffset>=pMvaMax )
16017 					{
16018 						bIsSpaValid = false;
16019 						LOC_FAIL(( fp, "MVA index out of bounds (row=%u, mvaattr=%d, docid="DOCID_FMT", index=%u)",
16020 							uRow, iItem, uLastID, uOffset ));
16021 					}
16022 
16023 					if ( uOffset && pMvaBase+uOffset<pMvaMax && !pMvaSpaFixed )
16024 						pMvaSpaFixed = pMvaBase + uOffset - sizeof(SphDocID_t) / sizeof(DWORD);
16025 				}
16026 
16027 				// MVAs ptr recovery from previous errors only if current spa record is valid
16028 				if ( pMva!=pMvaSpaFixed && bIsSpaValid && pMvaSpaFixed )
16029 					pMva = pMvaSpaFixed;
16030 
16031 				bool bLastIDChecked = false;
16032 
16033 				SphDocID_t uLastMvaID = 0;
16034 				while ( pMva<pMvaMax && DOCINFO2ID(pMva)<=uLastID )
16035 				{
16036 					const SphDocID_t uMvaID = DOCINFO2ID(pMva);
16037 					pMva = DOCINFO2ATTRS(pMva);
16038 
16039 					if ( bLastIDChecked && uLastID==uMvaID )
16040 						LOC_FAIL(( fp, "duplicate docid found (row=%u, docid expected="DOCID_FMT", got="DOCID_FMT", index=%u)",
16041 							uRow, uLastID, uMvaID, (DWORD)(pMva-pMvaBase) ));
16042 
16043 					if ( uMvaID<uLastMvaID )
16044 						LOC_FAIL(( fp, "MVA docid decreased (row=%u, spa docid="DOCID_FMT", last MVA docid="DOCID_FMT", MVA docid="DOCID_FMT", index=%u)",
16045 							uRow, uLastID, uLastMvaID, uMvaID, (DWORD)(pMva-pMvaBase) ));
16046 
16047 					bool bIsMvaCorrect = uLastMvaID<=uMvaID && uMvaID<=uLastID;
16048 					uLastMvaID = uMvaID;
16049 
16050 					// loop MVAs
16051 					ARRAY_FOREACH_COND ( iItem, dMvaItems, bIsMvaCorrect )
16052 					{
16053 						const DWORD uSpaOffset = pAttrs[dMvaItems[iItem]];
16054 
16055 						// zero offset means empty MVA in rt index
16056 						if ( !uSpaOffset )
16057 							continue;
16058 
16059 						// check offset (index)
16060 						if ( uMvaID==uLastID && bIsSpaValid && pMva!=pMvaBase+uSpaOffset )
16061 						{
16062 							LOC_FAIL(( fp, "unexpected MVA docid (row=%u, mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", expected=%u, got=%u)",
16063 								uRow, iItem, uLastID, uMvaID, (DWORD)(pMva-pMvaBase), uSpaOffset ));
16064 							// it's unexpected but it's our best guess
16065 							// but do fix up only once, to prevent infinite loop
16066 							if ( !bLastIDChecked )
16067 								pMva = pMvaBase+uSpaOffset;
16068 						}
16069 
16070 						if ( pMva>=pMvaMax )
16071 						{
16072 							LOC_FAIL(( fp, "MVA index out of bounds (row=%u, mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", index=%u)",
16073 								uRow, iItem, uLastID, uMvaID, (DWORD)(pMva-pMvaBase) ));
16074 							bIsMvaCorrect = false;
16075 							continue;
16076 						}
16077 
16078 						// check values
16079 						DWORD uValues = *pMva++;
16080 
16081 						if ( pMva+uValues-1>=pMvaMax )
16082 						{
16083 							LOC_FAIL(( fp, "MVA count out of bounds (row=%u, mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", count=%u)",
16084 								uRow, iItem, uLastID, uMvaID, uValues ));
16085 							pMva += uValues;
16086 							bIsMvaCorrect = false;
16087 							continue;
16088 						}
16089 						// check that values are ascending
16090 						for ( DWORD uVal=(iItem>=iMva64 ? 2 : 1); uVal<uValues && bIsMvaCorrect; )
16091 						{
16092 							int64_t iPrev, iCur;
16093 							if ( iItem>=iMva64 )
16094 							{
16095 								iPrev = MVA_UPSIZE ( pMva+uVal-2 );
16096 								iCur = MVA_UPSIZE ( pMva+uVal );
16097 								uVal += 2;
16098 							} else
16099 							{
16100 								iPrev = pMva[uVal-1];
16101 								iCur = pMva[uVal];
16102 								uVal++;
16103 							}
16104 
16105 							if ( iCur<=iPrev )
16106 							{
16107 								LOC_FAIL(( fp, "unsorted MVA values (row=%u, mvaattr=%d, docid expected="DOCID_FMT", got="DOCID_FMT", val[%u]=%u, val[%u]=%u)",
16108 									uRow, iItem, uLastID, uMvaID, ( iItem>=iMva64 ? uVal-2 : uVal-1 ), (unsigned int)iPrev, uVal, (unsigned int)iCur ));
16109 								bIsMvaCorrect = false;
16110 							}
16111 
16112 							uVal += ( iItem>=iMva64 ? 2 : 1 );
16113 						}
16114 						pMva += uValues;
16115 					}
16116 
16117 					if ( !bIsMvaCorrect )
16118 						break;
16119 
16120 					// orphan only ON no errors && ( not matched ids || ids matched multiply times )
16121 					if ( bIsMvaCorrect && ( uMvaID!=uLastID || ( uMvaID==uLastID && bLastIDChecked ) ) )
16122 						iOrphan++;
16123 
16124 					bLastIDChecked |= uLastID==uMvaID;
16125 				}
16126 
16127 				if ( !bLastIDChecked && bHasValues )
16128 					LOC_FAIL(( fp, "missed or damaged MVA (row=%u, docid expected="DOCID_FMT")",
16129 						uRow, uLastID ));
16130 			}
16131 
16132 			///////////////////////////
16133 			// check floats
16134 			///////////////////////////
16135 
16136 			ARRAY_FOREACH ( iItem, dFloatItems )
16137 			{
16138 				const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
16139 				const DWORD uValue = (DWORD)sphGetRowAttr ( pAttrs, dFloatItems[ iItem ] );
16140 				const DWORD uExp = ( uValue >> 23 ) & 0xff;
16141 				const DWORD uMantissa = uValue & 0x003fffff;
16142 
16143 				// check normalized
16144 				if ( uExp==0 && uMantissa!=0 )
16145 					LOC_FAIL(( fp, "float attribute value is unnormalized (row=%u, attr=%d, id="DOCID_FMT", raw=0x%x, value=%f)",
16146 						uRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
16147 
16148 				// check +-inf
16149 				if ( uExp==0xff && uMantissa==0 )
16150 					LOC_FAIL(( fp, "float attribute is infinity (row=%u, attr=%d, id="DOCID_FMT", raw=0x%x, value=%f)",
16151 						uRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
16152 			}
16153 
16154 			/////////////////
16155 			// check strings
16156 			/////////////////
16157 
16158 			ARRAY_FOREACH ( iItem, dStrItems )
16159 			{
16160 				const CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
16161 
16162 				const DWORD uOffset = (DWORD)sphGetRowAttr ( pAttrs, dStrItems[ iItem ] );
16163 				if ( uOffset>=m_pStrings.GetNumEntries() )
16164 				{
16165 					LOC_FAIL(( fp, "string offset out of bounds (row=%u, stringattr=%d, docid="DOCID_FMT", index=%u)",
16166 						uRow, iItem, uLastID, uOffset ));
16167 					continue;
16168 				}
16169 
16170 				if ( !uOffset )
16171 					continue;
16172 
16173 				const BYTE * pStr = NULL;
16174 				const int iLen = sphUnpackStr ( m_pStrings.GetWritePtr() + uOffset, &pStr );
16175 
16176 				// check that length is sane
16177 				if ( pStr+iLen-1>=m_pStrings.GetWritePtr()+m_pStrings.GetLength() )
16178 				{
16179 					LOC_FAIL(( fp, "string length out of bounds (row=%u, stringattr=%d, docid="DOCID_FMT", index=%u)",
16180 						uRow, iItem, uLastID, (unsigned int)( pStr-m_pStrings.GetWritePtr()+iLen-1 ) ));
16181 					continue;
16182 				}
16183 
16184 				// check that offset is one of the good ones
16185 				// (that is, that we don't point in the middle of some other data)
16186 				if ( !dStringOffsets.BinarySearch ( uOffset ) )
16187 				{
16188 					LOC_FAIL(( fp, "string offset is not a string start (row=%u, stringattr=%d, docid="DOCID_FMT", offset=%u)",
16189 						uRow, iItem, uLastID, uOffset ));
16190 				}
16191 			}
16192 
16193 			// progress bar
16194 			if ( uRow%1000==0 && bProgress )
16195 			{
16196 				fprintf ( fp, "%d/%d\r", uRow, uRowsTotal );
16197 				fflush ( fp );
16198 			}
16199 		}
16200 
16201 		if ( iOrphan )
16202 			fprintf ( fp, "WARNING: %d orphaned MVA entries were found\n", iOrphan );
16203 
16204 		///////////////////////////
16205 		// check blocks index
16206 		///////////////////////////
16207 
16208 		fprintf ( fp, "checking attribute blocks index...\n" );
16209 
16210 		// check size
16211 		const DWORD uTempDocinfoIndex = ( m_uDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
16212 		if ( uTempDocinfoIndex!=m_uDocinfoIndex )
16213 			LOC_FAIL(( fp, "block count differs (expected=%d, got=%d)",
16214 				uTempDocinfoIndex, m_uDocinfoIndex ));
16215 
16216 		const DWORD uMinMaxStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
16217 		const DWORD * pDocinfoIndexMax = m_pDocinfoIndex + 2*( 1+m_uDocinfoIndex )*uMinMaxStride;
16218 
16219 		for ( DWORD uIndexEntry=0; uIndexEntry<m_uDocinfo; uIndexEntry++ )
16220 		{
16221 			const DWORD uBlock = uIndexEntry / DOCINFO_INDEX_FREQ;
16222 
16223 			// we have to do some checks in border cases, for example: when move from 1st to 2nd block
16224 			const DWORD uPrevEntryBlock = ( uIndexEntry-1 )/DOCINFO_INDEX_FREQ;
16225 			const bool bIsBordersCheckTime = uPrevEntryBlock!=uBlock;
16226 
16227 			const DWORD * pAttr = m_pDocinfo.GetWritePtr() + uIndexEntry * uMinMaxStride;
16228 			const SphDocID_t uDocID = DOCINFO2ID(pAttr);
16229 
16230 			const DWORD * pMinEntry = m_pDocinfoIndex + 2 * uBlock * uMinMaxStride;
16231 			const DWORD * pMaxEntry = pMinEntry + uMinMaxStride;
16232 			const DWORD * pMinAttrs = DOCINFO2ATTRS ( pMinEntry );
16233 			const DWORD * pMaxAttrs = pMinAttrs + uMinMaxStride;
16234 
16235 			// check docid vs global range
16236 			if ( pMaxEntry+uMinMaxStride > pDocinfoIndexMax )
16237 				LOC_FAIL(( fp, "unexpected block index end (row=%u, docid="DOCID_FMT", block=%d, max=%u, cur=%u)",
16238 					uIndexEntry, uDocID, uBlock, (DWORD)(pDocinfoIndexMax-m_pDocinfoIndex), (DWORD)(pMaxEntry+uMinMaxStride-m_pDocinfoIndex) ));
16239 
16240 			// check attribute location vs global range
16241 			if ( pMaxAttrs+uMinMaxStride > pDocinfoIndexMax )
16242 				LOC_FAIL(( fp, "attribute position out of blocks index (row=%u, docid="DOCID_FMT", block=%u, expected<%u, got=%u)",
16243 					uIndexEntry, uDocID, uBlock, (DWORD)(pDocinfoIndexMax-m_pDocinfoIndex), (DWORD)(pMaxAttrs+uMinMaxStride-m_pDocinfoIndex) ));
16244 
16245 			const SphDocID_t uMinDocID = *(SphDocID_t*)pMinEntry;
16246 			const SphDocID_t uMaxDocID = *(SphDocID_t*)pMaxEntry;
16247 
16248 			// checks is docid min max range valid
16249 			if ( uMinDocID > uMaxDocID && bIsBordersCheckTime )
16250 				LOC_FAIL(( fp, "invalid docid range (row=%u, block=%d, min="DOCID_FMT", max="DOCID_FMT")",
16251 					uIndexEntry, uBlock, uMinDocID, uMaxDocID ));
16252 
16253 			// checks docid vs blocks range
16254 			if ( uDocID < uMinDocID || uDocID > uMaxDocID )
16255 				LOC_FAIL(( fp, "unexpected docid range (row=%u, docid="DOCID_FMT", block=%d, min="DOCID_FMT", max="DOCID_FMT")",
16256 					uIndexEntry, uDocID, uBlock, uMinDocID, uMaxDocID ));
16257 
16258 			bool bIsFirstMva = true;
16259 
16260 			// check values vs blocks range
16261 			const DWORD * pSpaRow = DOCINFO2ATTRS(pAttr);
16262 			for ( int iItem=0; iItem<m_tSchema.GetAttrsCount(); iItem++ )
16263 			{
16264 				const CSphColumnInfo & tCol = m_tSchema.GetAttr(iItem);
16265 
16266 				switch ( tCol.m_eAttrType )
16267 				{
16268 				case SPH_ATTR_INTEGER:
16269 				case SPH_ATTR_TIMESTAMP:
16270 				case SPH_ATTR_BOOL:
16271 				case SPH_ATTR_BIGINT:
16272 					{
16273 						const SphAttr_t uVal = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
16274 						const SphAttr_t uMin = sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
16275 						const SphAttr_t uMax = sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
16276 
16277 						// checks is attribute min max range valid
16278 						if ( uMin > uMax && bIsBordersCheckTime )
16279 							LOC_FAIL(( fp, "invalid attribute range (row=%u, block=%d, min="INT64_FMT", max="INT64_FMT")",
16280 								uIndexEntry, uBlock, uMin, uMax ));
16281 
16282 						if ( uVal < uMin || uVal > uMax )
16283 							LOC_FAIL(( fp, "unexpected attribute value (row=%u, attr=%u, docid="DOCID_FMT", block=%d, value=0x%x, min=0x%x, max=0x%x)",
16284 								uIndexEntry, iItem, uDocID, uBlock, (DWORD)uVal, (DWORD)uMin, (DWORD)uMax ));
16285 					}
16286 					break;
16287 
16288 				case SPH_ATTR_FLOAT:
16289 					{
16290 						const float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pSpaRow, tCol.m_tLocator ) );
16291 						const float fMin = sphDW2F ( (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator ) );
16292 						const float fMax = sphDW2F ( (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator ) );
16293 
16294 						// checks is attribute min max range valid
16295 						if ( fMin > fMax && bIsBordersCheckTime )
16296 							LOC_FAIL(( fp, "invalid attribute range (row=%u, block=%d, min=%f, max=%f)",
16297 								uIndexEntry, uBlock, fMin, fMax ));
16298 
16299 						if ( fVal < fMin || fVal > fMax )
16300 							LOC_FAIL(( fp, "unexpected attribute value (row=%u, attr=%u, docid="DOCID_FMT", block=%d, value=%f, min=%f, max=%f)",
16301 								uIndexEntry, iItem, uDocID, uBlock, fVal, fMin, fMax ));
16302 					}
16303 					break;
16304 
16305 				case SPH_ATTR_UINT32SET:
16306 					{
16307 						const DWORD uMin = (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
16308 						const DWORD uMax = (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
16309 
16310 						// checks is MVA attribute min max range valid
16311 						if ( uMin > uMax && bIsBordersCheckTime && uMin!=0xffffffff && uMax!=0 )
16312 							LOC_FAIL(( fp, "invalid MVA range (row=%u, block=%d, min=0x%x, max=0x%x)",
16313 							uIndexEntry, uBlock, uMin, uMax ));
16314 
16315 						SphAttr_t uOff = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
16316 						if ( !uOff )
16317 							break;
16318 
16319 						pMva = m_pMva.GetWritePtr() + uOff;
16320 						const DWORD * pMvaDocID = bIsFirstMva ? ( pMva - sizeof(SphDocID_t) / sizeof(DWORD) ) : NULL;
16321 						bIsFirstMva = false;
16322 
16323 						if ( uOff>=(SphAttr_t)m_pMva.GetNumEntries() )
16324 							break;
16325 
16326 						if ( pMvaDocID && DOCINFO2ID ( pMvaDocID )!=uDocID )
16327 						{
16328 							LOC_FAIL(( fp, "unexpected MVA docid (row=%u, mvaattr=%d, expected="DOCID_FMT", got="DOCID_FMT", block=%d, index=%u)",
16329 								uIndexEntry, iItem, uDocID, DOCINFO2ID ( pMvaDocID ), uBlock, (DWORD)uOff ));
16330 							break;
16331 						}
16332 
16333 						// check values
16334 						const DWORD uValues = *pMva++;
16335 						if ( uOff+uValues>(SphAttr_t)m_pMva.GetNumEntries() )
16336 							break;
16337 
16338 						for ( DWORD iVal=0; iVal<uValues; iVal++ )
16339 						{
16340 							const DWORD uVal = *pMva++;
16341 							if ( uVal < uMin || uVal > uMax )
16342 								LOC_FAIL(( fp, "unexpected MVA value (row=%u, attr=%u, docid="DOCID_FMT", block=%d, index=%u, value=0x%x, min=0x%x, max=0x%x)",
16343 								uIndexEntry, iItem, uDocID, uBlock, iVal, (DWORD)uVal, (DWORD)uMin, (DWORD)uMax ));
16344 						}
16345 					}
16346 					break;
16347 
16348 				default:
16349 					break;
16350 				}
16351 			}
16352 
16353 			// progress bar
16354 			if ( uIndexEntry%1000==0 && bProgress )
16355 			{
16356 				fprintf ( fp, "%d/%d\r", uIndexEntry, m_uDocinfo );
16357 				fflush ( fp );
16358 			}
16359 		}
16360 	}
16361 
16362 	///////////////////////////
16363 	// check kill-list
16364 	///////////////////////////
16365 
16366 	fprintf ( fp, "checking kill-list...\n" );
16367 
16368 	// check size
16369 	if ( m_pKillList.GetNumEntries()!=m_iKillListSize )
16370 		LOC_FAIL(( fp, "kill-list size differs (expected=%d, got="INT64_FMT")",
16371 			m_iKillListSize, (int64_t)m_pKillList.GetNumEntries() ));
16372 
16373 	// check that ids are ascending
16374 	for ( DWORD uID=1; uID<m_pKillList.GetNumEntries(); uID++ )
16375 		if ( m_pKillList[uID]<=m_pKillList[uID-1] )
16376 			LOC_FAIL(( fp, "unsorted kill-list values (val[%d]=%d, val[%d]=%d)",
16377 				uID-1, (DWORD)m_pKillList[uID-1], uID, (DWORD)m_pKillList[uID] ));
16378 
16379 	///////////////////////////
16380 	// all finished
16381 	///////////////////////////
16382 
16383 	// well, no known kinds of failures, maybe some unknown ones
16384 	tmCheck = sphMicroTimer() - tmCheck;
16385 	if ( !iFails )
16386 		fprintf ( fp, "check passed" );
16387 	else if ( iFails!=iFailsPrinted )
16388 		fprintf ( fp, "check FAILED, %d of %d failures reported", iFailsPrinted, iFails );
16389 	else
16390 		fprintf ( fp, "check FAILED, %d failures reported", iFails );
16391 	fprintf ( fp, ", %d.%d sec elapsed\n", (int)(tmCheck/1000000), (int)((tmCheck/100000)%10) );
16392 
16393 	return Min ( iFails, 255 ); // this is the exitcode; so cap it
16394 } // NOLINT function length
16395 
16396 
16397 //////////////////////////////////////////////////////////////////////////
16398 
/// morphology processor identifiers
/// (built-in processors come first; ids starting at SPH_MORPH_LIBSTEMMER_FIRST
/// are handed out to libstemmer instances in the order they get registered)
enum
{
	SPH_MORPH_STEM_EN			= 0,
	SPH_MORPH_STEM_RU_CP1251	= 1,
	SPH_MORPH_STEM_RU_UTF8		= 2,
	SPH_MORPH_STEM_CZ			= 3,
	SPH_MORPH_SOUNDEX			= 4,
	SPH_MORPH_METAPHONE_SBCS	= 5,
	SPH_MORPH_METAPHONE_UTF8	= 6,
	SPH_MORPH_LIBSTEMMER_FIRST	= 7,
	SPH_MORPH_LIBSTEMMER_LAST	= SPH_MORPH_LIBSTEMMER_FIRST + 64
};
16412 
16413 
16414 /////////////////////////////////////////////////////////////////////////////
16415 // BASE DICTIONARY INTERFACE
16416 /////////////////////////////////////////////////////////////////////////////
16417 
DictBegin(CSphAutofile &,CSphAutofile &,int)16418 void CSphDict::DictBegin ( CSphAutofile &, CSphAutofile &, int )						{}
DictEntry(SphWordID_t,BYTE *,int,int,SphOffset_t,SphOffset_t)16419 void CSphDict::DictEntry ( SphWordID_t, BYTE *, int, int, SphOffset_t, SphOffset_t )	{}
DictEndEntries(SphOffset_t)16420 void CSphDict::DictEndEntries ( SphOffset_t )											{}
DictEnd(SphOffset_t *,int *,int,CSphString &)16421 bool CSphDict::DictEnd ( SphOffset_t *, int *, int, CSphString & )						{ return true; }
DictIsError() const16422 bool CSphDict::DictIsError () const														{ return true; }
16423 
16424 /////////////////////////////////////////////////////////////////////////////
16425 // CRC32/64 DICTIONARIES
16426 /////////////////////////////////////////////////////////////////////////////
16427 
16428 /// wordform container
16429 struct WordformContainer_t
16430 {
16431 	int							m_iRefCount;
16432 	CSphString					m_sFilename;
16433 	struct_stat					m_tStat;
16434 	DWORD						m_uCRC32;
16435 	uint64_t					m_uTokenizerFNV;
16436 	CSphString					m_sIndexName;
16437 	CSphVector <CSphString>		m_dNormalForms;
16438 	CSphMultiformContainer * m_pMultiWordforms;
16439 	CSphOrderedHash < int, CSphString, CSphStrHashFunc, 1048576 >	m_dHash;
16440 
16441 	WordformContainer_t ();
16442 	~WordformContainer_t ();
16443 
16444 	bool						IsEqual ( const char * szFile, DWORD uCRC32 );
16445 };
16446 
16447 
16448 /// common CRC32/64 dictionary stuff
16449 struct CSphDictCRCTraits : CSphDict
16450 {
16451 						CSphDictCRCTraits ();
16452 	virtual				~CSphDictCRCTraits ();
16453 
16454 	virtual void		LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer );
16455 	virtual bool		LoadWordforms ( const char * szFile, ISphTokenizer * pTokenizer, const char * sIndex );
16456 	virtual bool		SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError );
16457 	virtual bool		HasMorphology() const;
16458 	virtual void		ApplyStemmers ( BYTE * pWord );
16459 
SetupCSphDictCRCTraits16460 	virtual void		Setup ( const CSphDictSettings & tSettings ) { m_tSettings = tSettings; }
GetSettingsCSphDictCRCTraits16461 	virtual const CSphDictSettings & GetSettings () const { return m_tSettings; }
GetStopwordsFileInfosCSphDictCRCTraits16462 	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_dSWFileInfos; }
GetWordformsFileInfoCSphDictCRCTraits16463 	virtual const CSphSavedFile & GetWordformsFileInfo () { return m_tWFFileInfo; }
GetMultiWordformsCSphDictCRCTraits16464 	virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pWordforms ? m_pWordforms->m_pMultiWordforms : NULL; }
16465 
16466 	static void			SweepWordformContainers ( const char * szFile, DWORD uCRC32 );
16467 
16468 	virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit );
16469 	virtual void DictEntry ( SphWordID_t uWordID, BYTE * sKeyword, int iDocs, int iHits, SphOffset_t iDoclistOffset, SphOffset_t iDoclistLength );
16470 	virtual void DictEndEntries ( SphOffset_t iDoclistOffset );
16471 	virtual bool DictEnd ( SphOffset_t * pCheckpointsPos, int * pCheckpointsCount, int iMemLimit, CSphString & sError );
DictIsErrorCSphDictCRCTraits16472 	virtual bool DictIsError () const { return m_wrDict.IsError(); }
16473 
16474 protected:
16475 	CSphVector < int >	m_dMorph;
16476 #if USE_LIBSTEMMER
16477 	CSphVector < sb_stemmer * >	m_dStemmers;
16478 	struct DescStemmer_t
16479 	{
16480 		CSphString m_sAlgo;
16481 		CSphString m_sEnc;
16482 	};
16483 	CSphVector<DescStemmer_t> m_dDescStemmers;
16484 #endif
16485 
16486 	int					m_iStopwords;	///< stopwords count
16487 	SphWordID_t *		m_pStopwords;	///< stopwords ID list
16488 	CSphFixedVector<SphWordID_t> m_dStopwordContainer;
16489 
16490 protected:
16491 	bool				ToNormalForm ( BYTE * pWord );
16492 	bool				ParseMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError );
16493 	SphWordID_t			FilterStopword ( SphWordID_t uID ) const;	///< filter ID against stopwords list
16494 	CSphDict *			CloneBase ( CSphDictCRCTraits * pDict ) const;
16495 	virtual bool		HasState () const;
16496 
16497 	CSphTightVector<CSphWordlistCheckpoint>	m_dCheckpoints;		///< checkpoint offsets
16498 
16499 	CSphWriter			m_wrDict;			///< final dict file writer
16500 	CSphString			m_sWriterError;		///< writer error message storage
16501 	int					m_iEntries;			///< dictionary entries stored
16502 	SphOffset_t			m_iLastDoclistPos;
16503 	SphWordID_t			m_iLastWordID;
16504 
16505 private:
16506 	WordformContainer_t *		m_pWordforms;
16507 	CSphVector<CSphSavedFile>	m_dSWFileInfos;
16508 	CSphSavedFile				m_tWFFileInfo;
16509 	CSphDictSettings			m_tSettings;
16510 
16511 	static CSphVector<WordformContainer_t*>		m_dWordformContainers;
16512 
16513 	WordformContainer_t * GetWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex );
16514 	WordformContainer_t * LoadWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex );
16515 
16516 	bool				InitMorph ( const char * szMorph, int iLength, bool bUseUTF8, CSphString & sError );
16517 	bool				AddMorph ( int iMorph );
16518 	bool				StemById ( BYTE * pWord, int iStemmer );
16519 };
16520 
16521 CSphVector < WordformContainer_t * > CSphDictCRCTraits::m_dWordformContainers;
16522 
16523 
16524 /// specialized CRC32/64 implementations
16525 template < bool CRC32DICT >
16526 struct CSphDictCRC : public CSphDictCRCTraits
16527 {
16528 	inline SphWordID_t		DoCrc ( const BYTE * pWord ) const;
16529 	inline SphWordID_t		DoCrc ( const BYTE * pWord, int iLen ) const;
16530 
16531 	virtual SphWordID_t		GetWordID ( BYTE * pWord );
16532 	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
16533 	virtual SphWordID_t		GetWordIDWithMarkers ( BYTE * pWord );
16534 	virtual SphWordID_t		GetWordIDNonStemmed ( BYTE * pWord );
16535 	virtual bool			IsStopWord ( const BYTE * pWord ) const;
16536 
CloneCSphDictCRC16537 	virtual CSphDict *		Clone () const { return CloneBase ( new CSphDictCRC<CRC32DICT>() ); }
16538 };
16539 
16540 /////////////////////////////////////////////////////////////////////////////
16541 
16542 DWORD g_dSphinxCRC32 [ 256 ] =
16543 {
16544 	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
16545 	0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
16546 	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
16547 	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
16548 	0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
16549 	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
16550 	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
16551 	0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
16552 	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
16553 	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
16554 	0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
16555 	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
16556 	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
16557 	0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
16558 	0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
16559 	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
16560 	0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
16561 	0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
16562 	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
16563 	0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
16564 	0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
16565 	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
16566 	0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
16567 	0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
16568 	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
16569 	0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
16570 	0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
16571 	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
16572 	0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
16573 	0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
16574 	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
16575 	0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
16576 	0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
16577 	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
16578 	0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
16579 	0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
16580 	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
16581 	0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
16582 	0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
16583 	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
16584 	0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
16585 	0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
16586 	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
16587 	0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
16588 	0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
16589 	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
16590 	0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
16591 	0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
16592 	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
16593 	0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
16594 	0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
16595 	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
16596 	0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
16597 	0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
16598 	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
16599 	0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
16600 	0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
16601 	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
16602 	0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
16603 	0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
16604 	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
16605 	0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
16606 	0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
16607 	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
16608 };
16609 
16610 
sphCRC32(const BYTE * pString)16611 DWORD sphCRC32 ( const BYTE * pString )
16612 {
16613 	// calc CRC
16614 	DWORD crc = ~((DWORD)0);
16615 	for ( const BYTE * p=pString; *p; p++ )
16616 		crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ (*p)) & 0xff ];
16617 	return ~crc;
16618 }
16619 
sphCRC32(const BYTE * pString,int iLen)16620 DWORD sphCRC32 ( const BYTE * pString, int iLen )
16621 {
16622 	// calc CRC
16623 	DWORD crc = ~((DWORD)0);
16624 	for ( int i=0; i<iLen; i++ )
16625 		crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ pString[i]) & 0xff ];
16626 	return ~crc;
16627 }
16628 
sphCRC32(const BYTE * pString,int iLen,DWORD uPrevCRC)16629 DWORD sphCRC32 ( const BYTE * pString, int iLen, DWORD uPrevCRC )
16630 {
16631 	// calc CRC
16632 	DWORD crc = ~((DWORD)uPrevCRC);
16633 	for ( int i=0; i<iLen; i++ )
16634 		crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ pString[i]) & 0xff ];
16635 	return ~crc;
16636 }
16637 
16638 /////////////////////////////////////////////////////////////////////////////
16639 
/// 64-bit FNV hash of a zero-terminated byte string (xor the octet in, then multiply)
uint64_t sphFNV64 ( const BYTE * s )
{
	// 2^40 + 2^8 + 0xb3; same factor as the shift-and-add form
	// h + (h<<1) + (h<<4) + (h<<5) + (h<<7) + (h<<8) + (h<<40), modulo 2^64
	const uint64_t uPrime = 0x100000001b3ULL;

	uint64_t uHash = 0xcbf29ce484222325ULL;
	for ( ; *s; s++ )
	{
		uHash ^= (uint64_t)*s;
		uHash *= uPrime;
	}
	return uHash;
}
16653 
16654 
/// 64-bit FNV hash of the first iLen bytes of s, continuing from the hash value uPrev
/// (non-positive iLen hashes nothing and returns uPrev)
uint64_t sphFNV64 ( const BYTE * s, int iLen, uint64_t uPrev )
{
	// 2^40 + 2^8 + 0xb3; same factor as the shift-and-add form
	// h + (h<<1) + (h<<4) + (h<<5) + (h<<7) + (h<<8) + (h<<40), modulo 2^64
	const uint64_t uPrime = 0x100000001b3ULL;

	uint64_t uHash = uPrev;
	for ( int i=0; i<iLen; i++ )
	{
		uHash ^= (uint64_t)s[i];
		uHash *= uPrime;
	}
	return uHash;
}
16668 
16669 /////////////////////////////////////////////////////////////////////////////
16670 
sphCalcFileCRC32(const char * szFilename,DWORD & uCRC32)16671 bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 )
16672 {
16673 	uCRC32 = 0;
16674 
16675 	if ( !szFilename )
16676 		return false;
16677 
16678 	FILE * pFile = fopen ( szFilename, "rb" );
16679 	if ( !pFile )
16680 		return false;
16681 
16682 	DWORD crc = ~((DWORD)0);
16683 
16684 	const int BUFFER_SIZE = 131072;
16685 	static BYTE * pBuffer = NULL;
16686 	if ( !pBuffer )
16687 		pBuffer = new BYTE [ BUFFER_SIZE ];
16688 
16689 	int iBytesRead;
16690 	while ( ( iBytesRead = fread ( pBuffer, 1, BUFFER_SIZE, pFile ) )!=0 )
16691 	{
16692 		for ( int i=0; i<iBytesRead; i++ )
16693 			crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ pBuffer[i]) & 0xff ];
16694 	}
16695 
16696 	fclose ( pFile );
16697 
16698 	uCRC32 = ~crc;
16699 	return true;
16700 }
16701 
16702 
GetFileStats(const char * szFilename,CSphSavedFile & tInfo,CSphString * pError)16703 static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo, CSphString * pError )
16704 {
16705 	if ( !szFilename || !*szFilename )
16706 	{
16707 		memset ( &tInfo, 0, sizeof ( tInfo ) );
16708 		return true;
16709 	}
16710 
16711 	tInfo.m_sFilename = szFilename;
16712 
16713 	struct_stat tStat;
16714 	memset ( &tStat, 0, sizeof ( tStat ) );
16715 	if ( stat ( szFilename, &tStat ) < 0 )
16716 	{
16717 		if ( pError )
16718 			*pError = strerror ( errno );
16719 		memset ( &tStat, 0, sizeof ( tStat ) );
16720 		return false;
16721 	}
16722 
16723 	tInfo.m_uSize = tStat.st_size;
16724 	tInfo.m_uCTime = tStat.st_ctime;
16725 	tInfo.m_uMTime = tStat.st_mtime;
16726 
16727 	DWORD uCRC32 = 0;
16728 	sphCalcFileCRC32 ( szFilename, uCRC32 );
16729 
16730 	tInfo.m_uCRC32 = uCRC32;
16731 	return true;
16732 }
16733 
16734 /////////////////////////////////////////////////////////////////////////////
16735 
WordformContainer_t()16736 WordformContainer_t::WordformContainer_t ()
16737 	: m_iRefCount ( 0 )
16738 	, m_uTokenizerFNV ( 0 )
16739 	, m_pMultiWordforms ( NULL )
16740 {
16741 }
16742 
16743 
~WordformContainer_t()16744 WordformContainer_t::~WordformContainer_t ()
16745 {
16746 	if ( m_pMultiWordforms )
16747 	{
16748 		m_pMultiWordforms->m_Hash.IterateStart ();
16749 		while ( m_pMultiWordforms->m_Hash.IterateNext () )
16750 		{
16751 			CSphMultiforms * pWordforms = m_pMultiWordforms->m_Hash.IterateGet ();
16752 			ARRAY_FOREACH ( i, pWordforms->m_dWordforms )
16753 				SafeDelete ( pWordforms->m_dWordforms[i] );
16754 
16755 			SafeDelete ( pWordforms );
16756 		}
16757 
16758 		SafeDelete ( m_pMultiWordforms );
16759 	}
16760 }
16761 
16762 
IsEqual(const char * szFile,DWORD uCRC32)16763 bool WordformContainer_t::IsEqual ( const char * szFile, DWORD uCRC32 )
16764 {
16765 	if ( !szFile )
16766 		return false;
16767 
16768 	struct_stat FileStat;
16769 	if ( stat ( szFile, &FileStat ) < 0 )
16770 		return false;
16771 
16772 	return m_sFilename==szFile && m_tStat.st_ctime==FileStat.st_ctime
16773 		&& m_tStat.st_mtime==FileStat.st_mtime && m_tStat.st_size==FileStat.st_size && m_uCRC32==uCRC32;
16774 }
16775 
16776 /////////////////////////////////////////////////////////////////////////////
16777 
CSphDictCRCTraits()16778 CSphDictCRCTraits::CSphDictCRCTraits ()
16779 	: m_iStopwords	( 0 )
16780 	, m_pStopwords	( NULL )
16781 	, m_dStopwordContainer ( 0 )
16782 	, m_iEntries ( 0 )
16783 	, m_iLastDoclistPos ( 0 )
16784 	, m_iLastWordID ( 0 )
16785 	, m_pWordforms	( NULL )
16786 {
16787 }
16788 
16789 
~CSphDictCRCTraits()16790 CSphDictCRCTraits::~CSphDictCRCTraits ()
16791 {
16792 #if USE_LIBSTEMMER
16793 	ARRAY_FOREACH ( i, m_dStemmers )
16794 		sb_stemmer_delete ( m_dStemmers[i] );
16795 #endif
16796 
16797 	if ( m_pWordforms )
16798 		--m_pWordforms->m_iRefCount;
16799 }
16800 
16801 
FilterStopword(SphWordID_t uID) const16802 SphWordID_t CSphDictCRCTraits::FilterStopword ( SphWordID_t uID ) const
16803 {
16804 	if ( !m_iStopwords )
16805 		return uID;
16806 
16807 	// OPTIMIZE: binary search is not too good, could do some hashing instead
16808 	SphWordID_t * pStart = m_pStopwords;
16809 	SphWordID_t * pEnd = m_pStopwords + m_iStopwords - 1;
16810 	do
16811 	{
16812 		if ( uID==*pStart || uID==*pEnd )
16813 			return 0;
16814 
16815 		if ( uID<*pStart || uID>*pEnd )
16816 			return uID;
16817 
16818 		SphWordID_t * pMid = pStart + (pEnd-pStart)/2;
16819 		if ( uID==*pMid )
16820 			return 0;
16821 
16822 		if ( uID<*pMid )
16823 			pEnd = pMid;
16824 		else
16825 			pStart = pMid;
16826 	} while ( pEnd-pStart>1 );
16827 
16828 	return uID;
16829 }
16830 
16831 
ToNormalForm(BYTE * pWord)16832 bool CSphDictCRCTraits::ToNormalForm ( BYTE * pWord )
16833 {
16834 	if ( !m_pWordforms )
16835 		return false;
16836 
16837 	int * pIndex = m_pWordforms->m_dHash ( (char *)pWord );
16838 	if ( !pIndex )
16839 		return false;
16840 
16841 	if ( *pIndex<0 || *pIndex>=m_pWordforms->m_dNormalForms.GetLength () )
16842 		return false;
16843 
16844 	if ( m_pWordforms->m_dNormalForms [*pIndex].IsEmpty () )
16845 		return false;
16846 
16847 	strcpy ( (char *)pWord, m_pWordforms->m_dNormalForms[*pIndex].cstr() ); // NOLINT
16848 	return true;
16849 }
16850 
16851 
ParseMorphology(const char * szMorph,bool bUseUTF8,CSphString & sError)16852 bool CSphDictCRCTraits::ParseMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError )
16853 {
16854 	const char * szStart = szMorph;
16855 
16856 	while ( *szStart )
16857 	{
16858 		while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
16859 			++szStart;
16860 
16861 		if ( !*szStart )
16862 			break;
16863 
16864 		const char * szWordStart = szStart;
16865 
16866 		while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
16867 			++szStart;
16868 
16869 		if ( szStart - szWordStart > 0 )
16870 		{
16871 			if ( !InitMorph ( szWordStart, szStart - szWordStart, bUseUTF8, sError ) )
16872 				return false;
16873 		}
16874 	}
16875 
16876 	return true;
16877 }
16878 
16879 
InitMorph(const char * szMorph,int iLength,bool bUseUTF8,CSphString & sError)16880 bool CSphDictCRCTraits::InitMorph ( const char * szMorph, int iLength, bool bUseUTF8, CSphString & sError )
16881 {
16882 	if ( iLength==0 )
16883 		return true;
16884 
16885 	if ( iLength==4 && !strncmp ( szMorph, "none", iLength ) )
16886 		return true;
16887 
16888 	if ( iLength==7 && !strncmp ( szMorph, "stem_en", iLength ) )
16889 	{
16890 		stem_en_init ();
16891 		return AddMorph ( SPH_MORPH_STEM_EN );
16892 	}
16893 
16894 	if ( iLength==7 && !strncmp ( szMorph, "stem_ru", iLength ) )
16895 	{
16896 		stem_ru_init ();
16897 		return AddMorph ( bUseUTF8 ? SPH_MORPH_STEM_RU_UTF8 : SPH_MORPH_STEM_RU_CP1251 );
16898 	}
16899 
16900 	if ( iLength==7 && !strncmp ( szMorph, "stem_cz", iLength ) )
16901 	{
16902 		stem_cz_init ();
16903 		return AddMorph ( SPH_MORPH_STEM_CZ );
16904 	}
16905 
16906 	if ( iLength==9 && !strncmp ( szMorph, "stem_enru", iLength ) )
16907 	{
16908 		stem_en_init ();
16909 		stem_ru_init ();
16910 
16911 		if ( !AddMorph ( SPH_MORPH_STEM_EN ) )
16912 			return false;
16913 
16914 		return AddMorph ( bUseUTF8 ? SPH_MORPH_STEM_RU_UTF8 : SPH_MORPH_STEM_RU_CP1251 );
16915 	}
16916 
16917 	if ( iLength==7 && !strncmp ( szMorph, "soundex", iLength ) )
16918 		return AddMorph ( SPH_MORPH_SOUNDEX );
16919 
16920 	if ( iLength==9 && !strncmp ( szMorph, "metaphone", iLength ) )
16921 		return AddMorph ( bUseUTF8 ? SPH_MORPH_METAPHONE_UTF8 : SPH_MORPH_METAPHONE_SBCS );
16922 
16923 	sError = "";
16924 
16925 #if USE_LIBSTEMMER
16926 	const int LIBSTEMMER_LEN = 11;
16927 	const int MAX_ALGO_LENGTH = 64;
16928 	if ( iLength > LIBSTEMMER_LEN && iLength - LIBSTEMMER_LEN < MAX_ALGO_LENGTH && !strncmp ( szMorph, "libstemmer_", LIBSTEMMER_LEN ) )
16929 	{
16930 		CSphString sAlgo;
16931 		CSphString sEnc;
16932 		sAlgo.SetBinary ( szMorph+LIBSTEMMER_LEN, iLength - LIBSTEMMER_LEN );
16933 
16934 		sb_stemmer * pStemmer = NULL;
16935 
16936 		if ( bUseUTF8 )
16937 		{
16938 			sEnc = "UTF_8";
16939 			pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
16940 		} else
16941 		{
16942 			sEnc = "ISO_8859_1";
16943 			pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
16944 
16945 			if ( !pStemmer )
16946 			{
16947 				sEnc = "ISO_8859_2";
16948 				pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
16949 			}
16950 
16951 			if ( !pStemmer )
16952 			{
16953 				sEnc = "KOI8_R";
16954 				pStemmer = sb_stemmer_new ( sAlgo.cstr(), sEnc.cstr() );
16955 			}
16956 		}
16957 
16958 		if ( !pStemmer )
16959 		{
16960 			sError.SetSprintf ( "libstemmer morphology algorithm '%s' not available for %s encoding - IGNORED",
16961 				sAlgo.cstr(), bUseUTF8 ? "UTF-8" : "SBCS" );
16962 			return false;
16963 		}
16964 
16965 		AddMorph ( SPH_MORPH_LIBSTEMMER_FIRST + m_dStemmers.GetLength () );
16966 		ARRAY_FOREACH ( i, m_dStemmers )
16967 		{
16968 			if ( m_dStemmers[i]==pStemmer )
16969 			{
16970 				sb_stemmer_delete ( pStemmer );
16971 				return false;
16972 			}
16973 		}
16974 
16975 		m_dStemmers.Add ( pStemmer );
16976 		DescStemmer_t & tDesc = m_dDescStemmers.Add();
16977 		tDesc.m_sAlgo.Swap ( sAlgo );
16978 		tDesc.m_sEnc.Swap ( sEnc );
16979 		return true;
16980 	}
16981 #endif
16982 
16983 	return false;
16984 }
16985 
16986 
AddMorph(int iMorph)16987 bool CSphDictCRCTraits::AddMorph ( int iMorph )
16988 {
16989 	ARRAY_FOREACH ( i, m_dMorph )
16990 		if ( m_dMorph[i]==iMorph )
16991 			return false;
16992 
16993 	m_dMorph.Add ( iMorph );
16994 	return true;
16995 }
16996 
16997 
16998 
ApplyStemmers(BYTE * pWord)16999 void CSphDictCRCTraits::ApplyStemmers ( BYTE * pWord )
17000 {
17001 	// try wordforms
17002 	if ( ToNormalForm ( pWord ) )
17003 		return;
17004 
17005 	// check length
17006 	if ( m_tSettings.m_iMinStemmingLen>1 )
17007 		if ( sphUTF8Len ( (const char*)pWord )<m_tSettings.m_iMinStemmingLen )
17008 			return;
17009 
17010 	// try stemmers
17011 	ARRAY_FOREACH ( i, m_dMorph )
17012 		if ( StemById ( pWord, m_dMorph[i] ) )
17013 			break;
17014 }
17015 
CloneBase(CSphDictCRCTraits * pDict) const17016 CSphDict * CSphDictCRCTraits::CloneBase ( CSphDictCRCTraits * pDict ) const
17017 {
17018 	assert ( pDict );
17019 	pDict->m_tSettings = m_tSettings;
17020 	pDict->m_iStopwords = m_iStopwords;
17021 	pDict->m_pStopwords = m_pStopwords;
17022 	pDict->m_pWordforms = m_pWordforms;
17023 	if ( m_pWordforms )
17024 		m_pWordforms->m_iRefCount++;
17025 
17026 	pDict->m_dMorph = m_dMorph;
17027 #if USE_LIBSTEMMER
17028 	assert ( m_dDescStemmers.GetLength()==m_dStemmers.GetLength() );
17029 	pDict->m_dDescStemmers = m_dDescStemmers;
17030 	ARRAY_FOREACH ( i, m_dDescStemmers )
17031 	{
17032 		pDict->m_dStemmers.Add ( sb_stemmer_new ( m_dDescStemmers[i].m_sAlgo.cstr(), m_dDescStemmers[i].m_sEnc.cstr() ) );
17033 		assert ( pDict->m_dStemmers.Last() );
17034 	}
17035 #endif
17036 
17037 	return pDict;
17038 }
17039 
HasState() const17040 bool CSphDictCRCTraits::HasState() const
17041 {
17042 #if !USE_LIBSTEMMER
17043 	return false;
17044 #else
17045 	return ( m_dDescStemmers.GetLength()>0 );
17046 #endif
17047 }
17048 
17049 /////////////////////////////////////////////////////////////////////////////
17050 
17051 template<>
DoCrc(const BYTE * pWord) const17052 SphWordID_t CSphDictCRC<true>::DoCrc ( const BYTE * pWord ) const
17053 {
17054 	return sphCRC32 ( pWord );
17055 }
17056 
17057 
17058 template<>
DoCrc(const BYTE * pWord) const17059 SphWordID_t CSphDictCRC<false>::DoCrc ( const BYTE * pWord ) const
17060 {
17061 	return (SphWordID_t) sphFNV64 ( pWord );
17062 }
17063 
17064 
17065 template<>
DoCrc(const BYTE * pWord,int iLen) const17066 SphWordID_t CSphDictCRC<true>::DoCrc ( const BYTE * pWord, int iLen ) const
17067 {
17068 	return sphCRC32 ( pWord, iLen );
17069 }
17070 
17071 
17072 template<>
DoCrc(const BYTE * pWord,int iLen) const17073 SphWordID_t CSphDictCRC<false>::DoCrc ( const BYTE * pWord, int iLen ) const
17074 {
17075 	return (SphWordID_t) sphFNV64 ( pWord, iLen );
17076 }
17077 
17078 
17079 template < bool CRC32DICT >
GetWordID(BYTE * pWord)17080 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( BYTE * pWord )
17081 {
17082 	// skip stemmers for magic words
17083 	if ( pWord[0]>=0x20 )
17084 		ApplyStemmers ( pWord );
17085 	return FilterStopword ( DoCrc ( pWord ) );
17086 }
17087 
17088 
17089 template < bool CRC32DICT >
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)17090 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
17091 {
17092 	SphWordID_t uId = DoCrc ( pWord, iLen );
17093 	return bFilterStops ? FilterStopword ( uId ) : uId;
17094 }
17095 
17096 
17097 template < bool CRC32DICT >
GetWordIDWithMarkers(BYTE * pWord)17098 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDWithMarkers ( BYTE * pWord )
17099 {
17100 	ApplyStemmers ( pWord + 1 );
17101 	SphWordID_t uWordId = DoCrc ( pWord + 1 );
17102 	int iLength = strlen ( (const char *)(pWord + 1) );
17103 	pWord [iLength + 1] = MAGIC_WORD_TAIL;
17104 	pWord [iLength + 2] = '\0';
17105 	return FilterStopword ( uWordId ) ? DoCrc ( pWord ) : 0;
17106 }
17107 
17108 
17109 template < bool CRC32DICT >
GetWordIDNonStemmed(BYTE * pWord)17110 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDNonStemmed ( BYTE * pWord )
17111 {
17112 	SphWordID_t uWordId = DoCrc ( pWord + 1 );
17113 	if ( !FilterStopword ( uWordId ) )
17114 		return 0;
17115 
17116 	return DoCrc ( pWord );
17117 }
17118 
17119 
17120 template < bool CRC32DICT >
IsStopWord(const BYTE * pWord) const17121 bool CSphDictCRC<CRC32DICT>::IsStopWord ( const BYTE * pWord ) const
17122 {
17123 	return FilterStopword ( DoCrc ( pWord ) )==0;
17124 }
17125 
17126 //////////////////////////////////////////////////////////////////////////
17127 
LoadStopwords(const char * sFiles,ISphTokenizer * pTokenizer)17128 void CSphDictCRCTraits::LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer )
17129 {
17130 	assert ( !m_pStopwords );
17131 	assert ( !m_iStopwords );
17132 
17133 	// tokenize file list
17134 	if ( !sFiles || !*sFiles )
17135 		return;
17136 
17137 	m_dSWFileInfos.Resize ( 0 );
17138 
17139 	char * sList = new char [ 1+strlen(sFiles) ];
17140 	strcpy ( sList, sFiles ); // NOLINT
17141 
17142 	char * pCur = sList;
17143 	char * sName = NULL;
17144 
17145 	CSphVector<SphWordID_t> dStop;
17146 
17147 	for ( ;; )
17148 	{
17149 		// find next name start
17150 		while ( *pCur && isspace(*pCur) ) pCur++;
17151 		if ( !*pCur ) break;
17152 		sName = pCur;
17153 
17154 		// find next name end
17155 		while ( *pCur && !isspace(*pCur) ) pCur++;
17156 		if ( *pCur ) *pCur++ = '\0';
17157 
17158 		BYTE * pBuffer = NULL;
17159 
17160 		CSphSavedFile tInfo;
17161 		tInfo.m_sFilename = sName;
17162 		GetFileStats ( sName, tInfo, NULL );
17163 		m_dSWFileInfos.Add ( tInfo );
17164 
17165 		// open file
17166 		struct_stat st;
17167 		if ( stat ( sName, &st )==0 )
17168 			pBuffer = new BYTE [(size_t)st.st_size];
17169 		else
17170 		{
17171 			sphWarn ( "stopwords: failed to get file size for '%s'", sName );
17172 			continue;
17173 		}
17174 
17175 		FILE * fp = fopen ( sName, "rb" );
17176 		if ( !fp )
17177 		{
17178 			sphWarn ( "failed to load stopwords from '%s'", sName );
17179 			SafeDeleteArray ( pBuffer );
17180 			continue;
17181 		}
17182 
17183 		// tokenize file
17184 		int iLength = (int)fread ( pBuffer, 1, (size_t)st.st_size, fp );
17185 
17186 		BYTE * pToken;
17187 		pTokenizer->SetBuffer ( pBuffer, iLength );
17188 		while ( ( pToken = pTokenizer->GetToken() )!=NULL )
17189 			dStop.Add ( GetWordID ( pToken ) );
17190 
17191 		// close file
17192 		fclose ( fp );
17193 
17194 		SafeDeleteArray ( pBuffer );
17195 	}
17196 
17197 	SafeDeleteArray ( sList );
17198 
17199 	// sort stopwords
17200 	dStop.Uniq();
17201 
17202 	// store IDs
17203 	if ( dStop.GetLength() )
17204 	{
17205 		m_dStopwordContainer.Reset ( dStop.GetLength() );
17206 		ARRAY_FOREACH ( i, dStop )
17207 			m_dStopwordContainer[i] = dStop[i];
17208 
17209 		m_iStopwords = m_dStopwordContainer.GetLength ();
17210 		m_pStopwords = m_dStopwordContainer.Begin();
17211 	}
17212 }
17213 
17214 
SweepWordformContainers(const char * szFile,DWORD uCRC32)17215 void CSphDictCRCTraits::SweepWordformContainers ( const char * szFile, DWORD uCRC32 )
17216 {
17217 	for ( int i = 0; i < m_dWordformContainers.GetLength (); )
17218 	{
17219 		WordformContainer_t * WC = m_dWordformContainers[i];
17220 		if ( WC->m_iRefCount==0 && !WC->IsEqual ( szFile, uCRC32 ) )
17221 		{
17222 			delete WC;
17223 			m_dWordformContainers.Remove ( i );
17224 		} else
17225 			++i;
17226 	}
17227 }
17228 
17229 
GetWordformContainer(const char * szFile,DWORD uCRC32,const ISphTokenizer * pTokenizer,const char * sIndex)17230 WordformContainer_t * CSphDictCRCTraits::GetWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex )
17231 {
17232 	uint64_t uTokenizerFNV = pTokenizer->GetSettingsFNV();
17233 	ARRAY_FOREACH ( i, m_dWordformContainers )
17234 		if ( m_dWordformContainers[i]->IsEqual ( szFile, uCRC32 ) )
17235 		{
17236 			WordformContainer_t * pContainer = m_dWordformContainers[i];
17237 			if ( uTokenizerFNV==pContainer->m_uTokenizerFNV )
17238 				return pContainer;
17239 
17240 			sphWarning ( "index %s: wordforms file %s is shared with index %s, but tokenizer settings are different", sIndex, szFile, pContainer->m_sIndexName.cstr() );
17241 			break;
17242 		}
17243 
17244 	WordformContainer_t * pContainer = LoadWordformContainer ( szFile, uCRC32, pTokenizer, sIndex );
17245 	if ( pContainer )
17246 		m_dWordformContainers.Add ( pContainer );
17247 
17248 	return pContainer;
17249 }
17250 
17251 
LoadWordformContainer(const char * szFile,DWORD uCRC32,const ISphTokenizer * pTokenizer,const char * sIndex)17252 WordformContainer_t * CSphDictCRCTraits::LoadWordformContainer ( const char * szFile, DWORD uCRC32, const ISphTokenizer * pTokenizer, const char * sIndex )
17253 {
17254 	// stat it; we'll store stats for later checks
17255 	struct_stat FileStat;
17256 	if ( !szFile || !*szFile || stat ( szFile, &FileStat )<0 )
17257 		return NULL;
17258 
17259 	// allocate it
17260 	WordformContainer_t * pContainer = new WordformContainer_t;
17261 	if ( !pContainer )
17262 		return NULL;
17263 	pContainer->m_sFilename = szFile;
17264 	pContainer->m_tStat = FileStat;
17265 	pContainer->m_uCRC32 = uCRC32;
17266 	pContainer->m_uTokenizerFNV = pTokenizer->GetSettingsFNV();
17267 	pContainer->m_sIndexName = sIndex;
17268 
17269 	// open it
17270 	CSphString sError;
17271 	CSphAutoreader rdWordforms;
17272 	if ( !rdWordforms.Open ( szFile, sError ) )
17273 		return NULL;
17274 
17275 	// my tokenizer
17276 	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( false ) );
17277 	pMyTokenizer->AddSpecials ( ">" );
17278 
17279 	// scan it line by line
17280 	char sBuffer [ 6*SPH_MAX_WORD_LEN + 512 ]; // enough to hold 2 UTF-8 words, plus some whitespace overhead
17281 	int iLineLen;
17282 	bool bSeparatorFound = false;
17283 	CSphString sFrom;
17284 	CSphVector<CSphString> dKeys;
17285 	while ( ( iLineLen = rdWordforms.GetLine ( sBuffer, sizeof(sBuffer) ) )>=0 )
17286 	{
17287 		// parse the line
17288 		pMyTokenizer->SetBuffer ( (BYTE*)sBuffer, iLineLen );
17289 
17290 		CSphString sKey;
17291 		CSphString sBlended;
17292 		bool bStopwordsPresent = false;
17293 		bool bMultiform = false;
17294 		dKeys.Resize ( 0 );
17295 
17296 		BYTE * pFrom = NULL;
17297 		while ( ( pFrom = pMyTokenizer->GetToken () )!=NULL )
17298 		{
17299 			// blended got tokenized
17300 			// left> as separate parts and whole blended
17301 			// >right as single token
17302 			if ( pMyTokenizer->TokenIsBlended() )
17303 			{
17304 				if ( !sBlended.IsEmpty() && ( sBlended!=(const char *)pFrom ) )
17305 				{
17306 					sphWarning ( "wordform contain multiple blended (might be 1 blended keyword) ( wordforms='%s' ). Fix your wordforms file '%s'.",
17307 						sBuffer, szFile );
17308 					break;
17309 				}
17310 
17311 				sBlended = (const char*)pFrom;
17312 				continue;
17313 			}
17314 
17315 			const BYTE * pCur = (const BYTE *) pMyTokenizer->GetBufferPtr ();
17316 
17317 			while ( isspace(*pCur) ) pCur++;
17318 			if ( *pCur=='>' )
17319 			{
17320 				// TODO: check and warn on tail tokens at right part of wordform
17321 				sFrom = (const char*)pFrom;
17322 				bSeparatorFound = true;
17323 				pMyTokenizer->SetBufferPtr ( (const char*) pCur+1 );
17324 				break;
17325 			} else
17326 			{
17327 				bMultiform = true;
17328 				if ( sKey.IsEmpty() )
17329 				{
17330 					sKey = (const char*)pFrom;
17331 				} else
17332 				{
17333 					dKeys.Add ( (const char*)pFrom );
17334 					if ( !bStopwordsPresent && !GetWordID ( pFrom, dKeys.Last().Length(), true ) )
17335 						bStopwordsPresent = true;
17336 				}
17337 			}
17338 		}
17339 
17340 		if ( !pFrom ) continue; // FIXME! report parsing error
17341 		if ( !bSeparatorFound ) continue; // FIXME! report parsing error
17342 
17343 		BYTE * pTo = pMyTokenizer->GetToken ();
17344 		if ( !pTo ) continue; // FIXME! report parsing error
17345 
17346 		CSphString sTo ( (const char *)pTo );
17347 
17348 		int iLastTokenLen = pMyTokenizer->GetLastTokenLen();
17349 		if ( !pMyTokenizer->TokenIsBlended () && pMyTokenizer->GetToken () )
17350 			sphWarning ( "invalid mapping (must be exactly 1 destination keyword) ( wordforms='%s' ). Fix your wordforms file '%s'.",
17351 						sBuffer, szFile );
17352 
17353 		if ( bMultiform )
17354 		{
17355 			dKeys.Add ( sFrom );
17356 
17357 			bool bToIsStopword = !GetWordID ( pTo, sTo.Length(), true );
17358 			bool bKeyIsStopword = !GetWordID ( (BYTE *)sKey.cstr(), sKey.Length(), true );
17359 
17360 			if ( bToIsStopword || bStopwordsPresent || bKeyIsStopword )
17361 			{
17362 				const int MAX_REPORT_LEN = 1024;
17363 				char szStopwordReport[MAX_REPORT_LEN];
17364 				szStopwordReport[0] = '\0';
17365 
17366 				ARRAY_FOREACH ( i, dKeys )
17367 				{
17368 					int iStrLen = dKeys[i].Length();
17369 					int iLen = strlen ( szStopwordReport );
17370 					if ( iLen + iStrLen + 2 > MAX_REPORT_LEN )
17371 						break;
17372 
17373 					strncat ( szStopwordReport, dKeys[i].cstr(), iLen );	// NOLINT
17374 					iLen += iStrLen;
17375 					szStopwordReport[iLen] = ' ';
17376 					szStopwordReport[iLen+1] = '\0';
17377 				}
17378 
17379 				sphWarning ( "wordforms contain stopwords ( wordform='%s %s> %s' ). Fix your wordforms file '%s'.",
17380 					sKey.cstr(), szStopwordReport, sTo.cstr(), szFile );
17381 			}
17382 
17383 			if ( bToIsStopword )
17384 				continue;
17385 
17386 			if ( bStopwordsPresent )
17387 				ARRAY_FOREACH ( i, dKeys )
17388 					if ( !GetWordID ( (BYTE *)( dKeys[i].cstr() ), dKeys[i].Length(), true ) )
17389 					{
17390 						dKeys.Remove(i);
17391 						i--;
17392 					}
17393 
17394 			if ( bKeyIsStopword )
17395 			{
17396 				if ( dKeys.GetLength() )
17397 				{
17398 					sKey.Swap ( dKeys[0] );
17399 					dKeys.Remove(0);
17400 				} else
17401 					continue;
17402 			}
17403 
17404 			if ( !dKeys.GetLength() )
17405 			{
17406 				sFrom = sKey;
17407 			}
17408 		} else
17409 		{
17410 			if ( !GetWordID ( (BYTE *)sFrom.cstr(), sFrom.Length(), true ) || !GetWordID ( pTo, sTo.Length(), true ) )
17411 			{
17412 				sphWarning ( "wordforms contain stopwords ( wordform='%s > %s' ). Fix your wordforms file '%s'.",
17413 					sFrom.cstr(), sTo.cstr(), szFile );
17414 
17415 				continue;
17416 			}
17417 		}
17418 
17419 		const CSphString & sSourceWordform = ( bMultiform ? sTo : sFrom );
17420 
17421 		// check wordform that source token is a new token or has same destination token
17422 		int * pRefTo = pContainer->m_dHash ( sSourceWordform );
17423 		assert ( !pRefTo || ( *pRefTo>=0 && *pRefTo<pContainer->m_dNormalForms.GetLength() ) );
17424 		if ( !bMultiform && pRefTo && pContainer->m_dNormalForms[*pRefTo]!=sTo )
17425 		{
17426 			const CSphString & sRefTo = pContainer->m_dNormalForms[*pRefTo];
17427 			sphWarning ( "duplicate wordform found - skipped ( current='%s > %s', stored='%s > %s' ). Fix your wordforms file '%s'.",
17428 				sSourceWordform.cstr(), sTo.cstr(), sSourceWordform.cstr(), sRefTo.cstr(), szFile );
17429 		}
17430 
17431 		if ( pRefTo && !bMultiform )
17432 			continue;
17433 
17434 		if ( !pRefTo )
17435 		{
17436 			pContainer->m_dNormalForms.AddUnique ( sTo );
17437 			pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, sSourceWordform );
17438 		}
17439 
17440 		if ( bMultiform )
17441 		{
17442 			CSphMultiform * pMultiWordform = new CSphMultiform;
17443 			pMultiWordform->m_sNormalForm = sTo;
17444 			pMultiWordform->m_iNormalTokenLen = iLastTokenLen;
17445 			pMultiWordform->m_dTokens = dKeys;
17446 			if ( !pContainer->m_pMultiWordforms )
17447 				pContainer->m_pMultiWordforms = new CSphMultiformContainer;
17448 
17449 			CSphMultiforms ** pWordforms = pContainer->m_pMultiWordforms->m_Hash ( sKey );
17450 			if ( pWordforms )
17451 			{
17452 				(*pWordforms)->m_dWordforms.Add ( pMultiWordform );
17453 				(*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
17454 				(*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
17455 				pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
17456 			} else
17457 			{
17458 				CSphMultiforms * pNewWordforms = new CSphMultiforms;
17459 				pNewWordforms->m_dWordforms.Add ( pMultiWordform );
17460 				pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
17461 				pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
17462 				pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
17463 				pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, sKey );
17464 			}
17465 
17466 			if ( !sBlended.IsEmpty() )
17467 			{
17468 				pMultiWordform = new CSphMultiform;
17469 				pMultiWordform->m_sNormalForm = sTo;
17470 				pMultiWordform->m_iNormalTokenLen = iLastTokenLen;
17471 				pMultiWordform->m_dTokens.Reserve ( dKeys.GetLength()+1 );
17472 				pMultiWordform->m_dTokens.Add ( sKey );
17473 				ARRAY_FOREACH ( i, dKeys )
17474 					pMultiWordform->m_dTokens.Add ( dKeys[i] );
17475 
17476 				pWordforms = pContainer->m_pMultiWordforms->m_Hash ( sBlended );
17477 				if ( pWordforms )
17478 				{
17479 					(*pWordforms)->m_dWordforms.Add ( pMultiWordform );
17480 					(*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
17481 					(*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
17482 					pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
17483 				} else
17484 				{
17485 					CSphMultiforms * pNewWordforms = new CSphMultiforms;
17486 					pNewWordforms->m_dWordforms.Add ( pMultiWordform );
17487 					pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
17488 					pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
17489 					pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
17490 					pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, sBlended );
17491 				}
17492 			}
17493 		}
17494 	}
17495 
17496 	return pContainer;
17497 }
17498 
17499 
LoadWordforms(const char * szFile,ISphTokenizer * pTokenizer,const char * sIndex)17500 bool CSphDictCRCTraits::LoadWordforms ( const char * szFile, ISphTokenizer * pTokenizer, const char * sIndex )
17501 {
17502 	CSphString sError;
17503 	bool bGotStat = GetFileStats ( szFile, m_tWFFileInfo, &sError );
17504 	if ( szFile && *szFile && !bGotStat )
17505 	{
17506 		sphWarning ( "wordforms: failed to read file %s", szFile );
17507 		return false;
17508 	}
17509 
17510 	DWORD uCRC32 = m_tWFFileInfo.m_uCRC32;
17511 
17512 	SweepWordformContainers ( szFile, uCRC32 );
17513 	m_pWordforms = GetWordformContainer ( szFile, uCRC32, pTokenizer, sIndex );
17514 	if ( m_pWordforms )
17515 		m_pWordforms->m_iRefCount++;
17516 
17517 	return !!m_pWordforms;
17518 }
17519 
17520 
SetMorphology(const char * szMorph,bool bUseUTF8,CSphString & sError)17521 bool CSphDictCRCTraits::SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError )
17522 {
17523 	m_dMorph.Reset ();
17524 
17525 #if USE_LIBSTEMMER
17526 	ARRAY_FOREACH ( i, m_dStemmers )
17527 		sb_stemmer_delete ( m_dStemmers[i] );
17528 
17529 	m_dStemmers.Reset ();
17530 #endif
17531 
17532 	if ( !szMorph )
17533 		return true;
17534 
17535 	CSphString sOption = szMorph;
17536 	sOption.ToLower ();
17537 
17538 	sError = "";
17539 	if ( !ParseMorphology ( sOption.cstr (), bUseUTF8, sError ) )
17540 	{
17541 		m_dMorph.Reset ();
17542 
17543 		if ( sError.IsEmpty () )
17544 			sError.SetSprintf ( "invalid morphology option '%s' - IGNORED", sOption.cstr() );
17545 
17546 		return false;
17547 	}
17548 
17549 	return true;
17550 }
17551 
17552 
HasMorphology() const17553 bool CSphDictCRCTraits::HasMorphology() const
17554 {
17555 	return ( m_dMorph.GetLength()>0 );
17556 }
17557 
17558 
17559 /// common id-based stemmer
StemById(BYTE * pWord,int iStemmer)17560 bool CSphDictCRCTraits::StemById ( BYTE * pWord, int iStemmer )
17561 {
17562 	char szBuf [ MAX_KEYWORD_BYTES ];
17563 
17564 	// safe quick strncpy without (!) padding and with a side of strlen
17565 	char * p = szBuf;
17566 	char * pMax = szBuf + sizeof(szBuf) - 1;
17567 	BYTE * pLastSBS = NULL;
17568 	while ( *pWord && p<pMax )
17569 	{
17570 		pLastSBS = ( *pWord )<0x80 ? pWord : pLastSBS;
17571 		*p++ = *pWord++;
17572 	}
17573 	int iLen = p - szBuf;
17574 	*p = '\0';
17575 	pWord -= iLen;
17576 
17577 	switch ( iStemmer )
17578 	{
17579 	case SPH_MORPH_STEM_EN:
17580 		stem_en ( pWord, iLen );
17581 		break;
17582 
17583 	case SPH_MORPH_STEM_RU_CP1251:
17584 		stem_ru_cp1251 ( pWord );
17585 		break;
17586 
17587 	case SPH_MORPH_STEM_RU_UTF8:
17588 		// skip stemming in case of SBC at the end of the word
17589 		if ( pLastSBS && ( pLastSBS-pWord+1 )>=iLen )
17590 			break;
17591 
17592 		// stem only UTF8 tail
17593 		if ( !pLastSBS )
17594 		{
17595 			stem_ru_utf8 ( (WORD*)pWord );
17596 		} else
17597 		{
17598 			stem_ru_utf8 ( (WORD *)( pLastSBS+1 ) );
17599 		}
17600 		break;
17601 
17602 	case SPH_MORPH_STEM_CZ:
17603 		stem_cz ( pWord );
17604 		break;
17605 
17606 	case SPH_MORPH_SOUNDEX:
17607 		stem_soundex ( pWord );
17608 		break;
17609 
17610 	case SPH_MORPH_METAPHONE_SBCS:
17611 		stem_dmetaphone ( pWord, false );
17612 		break;
17613 
17614 	case SPH_MORPH_METAPHONE_UTF8:
17615 		stem_dmetaphone ( pWord, true );
17616 		break;
17617 
17618 	default:
17619 #if USE_LIBSTEMMER
17620 		if ( iStemmer>=SPH_MORPH_LIBSTEMMER_FIRST && iStemmer<SPH_MORPH_LIBSTEMMER_LAST )
17621 		{
17622 			sb_stemmer * pStemmer = m_dStemmers [iStemmer - SPH_MORPH_LIBSTEMMER_FIRST];
17623 			assert ( pStemmer );
17624 
17625 			const sb_symbol * sStemmed = sb_stemmer_stem ( pStemmer, (sb_symbol*)pWord, strlen ( (const char*)pWord ) );
17626 			int iLen = sb_stemmer_length ( pStemmer );
17627 
17628 			memcpy ( pWord, sStemmed, iLen );
17629 			pWord[iLen] = '\0';
17630 		} else
17631 			return false;
17632 
17633 	break;
17634 #else
17635 		return false;
17636 #endif
17637 	}
17638 
17639 	return strcmp ( (char *)pWord, szBuf )!=0;
17640 }
17641 
DictBegin(CSphAutofile &,CSphAutofile & tDictFile,int)17642 void CSphDictCRCTraits::DictBegin ( CSphAutofile &, CSphAutofile & tDictFile, int )
17643 {
17644 	m_wrDict.CloseFile ();
17645 	m_wrDict.SetFile ( tDictFile, NULL, m_sWriterError );
17646 	m_wrDict.PutByte ( 1 );
17647 }
17648 
DictEnd(SphOffset_t * pCheckpointsPos,int * pCheckpointsCount,int,CSphString & sError)17649 bool CSphDictCRCTraits::DictEnd ( SphOffset_t * pCheckpointsPos, int * pCheckpointsCount, int, CSphString & sError )
17650 {
17651 	// flush wordlist checkpoints
17652 	*pCheckpointsPos = m_wrDict.GetPos();
17653 	*pCheckpointsCount = m_dCheckpoints.GetLength();
17654 
17655 	ARRAY_FOREACH ( i, m_dCheckpoints )
17656 	{
17657 		assert ( m_dCheckpoints[i].m_iWordlistOffset );
17658 		m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordID );
17659 		m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );
17660 	}
17661 
17662 	m_wrDict.CloseFile ();
17663 
17664 	if ( m_wrDict.IsError() )
17665 		sError = m_sWriterError;
17666 	return !m_wrDict.IsError();
17667 }
17668 
DictEntry(SphWordID_t uWordID,BYTE *,int iDocs,int iHits,SphOffset_t iDoclistOffset,SphOffset_t)17669 void CSphDictCRCTraits::DictEntry ( SphWordID_t uWordID, BYTE *, int iDocs, int iHits, SphOffset_t iDoclistOffset, SphOffset_t )
17670 {
17671 	// insert wordlist checkpoint
17672 	if ( ( m_iEntries % SPH_WORDLIST_CHECKPOINT )==0 )
17673 	{
17674 		if ( m_iEntries ) // but not the 1st entry
17675 		{
17676 			assert ( iDoclistOffset > m_iLastDoclistPos );
17677 			m_wrDict.ZipInt ( 0 ); // indicate checkpoint
17678 			m_wrDict.ZipOffset ( iDoclistOffset - m_iLastDoclistPos ); // store last length
17679 		}
17680 
17681 		// restart delta coding, once per SPH_WORDLIST_CHECKPOINT entries
17682 		m_iLastWordID = 0;
17683 		m_iLastDoclistPos = 0;
17684 
17685 		// begin new wordlist entry
17686 		assert ( m_wrDict.GetPos()<=UINT_MAX );
17687 
17688 		CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add();
17689 		tCheckpoint.m_iWordID = uWordID;
17690 		tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();
17691 	}
17692 
17693 	assert ( iDoclistOffset > m_iLastDoclistPos );
17694 	m_wrDict.ZipOffset ( uWordID - m_iLastWordID ); // FIXME! slow with 32bit wordids
17695 	m_wrDict.ZipOffset ( iDoclistOffset - m_iLastDoclistPos );
17696 
17697 	m_iLastWordID = uWordID;
17698 	m_iLastDoclistPos = iDoclistOffset;
17699 
17700 	assert ( iDocs );
17701 	assert ( iHits );
17702 	m_wrDict.ZipInt ( iDocs );
17703 	m_wrDict.ZipInt ( iHits );
17704 
17705 	m_iEntries++;
17706 }
17707 
DictEndEntries(SphOffset_t iDoclistOffset)17708 void CSphDictCRCTraits::DictEndEntries ( SphOffset_t iDoclistOffset )
17709 {
17710 	assert ( iDoclistOffset>=m_iLastDoclistPos );
17711 	m_wrDict.ZipInt ( 0 ); // indicate checkpoint
17712 	m_wrDict.ZipOffset ( iDoclistOffset - m_iLastDoclistPos ); // store last doclist length
17713 }
17714 
17715 //////////////////////////////////////////////////////////////////////////
17716 // KEYWORDS STORING DICTIONARY
17717 //////////////////////////////////////////////////////////////////////////
17718 
/// keywords storing dictionary
/// derives from the CRC32 dictionary flavor and additionally keeps the keywords themselves;
/// word ids are CRC based and get adjusted when different keywords collide
class CSphDictKeywords : public CSphDictCRC<true>
{
private:
	static const int				SLOTS			= 65536;	///< number of hash slots in m_dHash
	static const int				ENTRY_CHUNK		= 65536;	///< hash entries allocated per entry chunk
	static const int				KEYWORD_CHUNK	= 1048576;	///< bytes allocated per keyword storage chunk
	static const int				DICT_CHUNK		= 65536;

public:
	// OPTIMIZE? change pointers to 8:24 locators to save RAM on x64 gear?
	/// hash entry for a keyword seen in the current hitblock
	struct HitblockKeyword_t
	{
		SphWordID_t					m_uWordid;			// locally unique word id (crc value, adjusted in case of collision)
		HitblockKeyword_t *			m_pNextHash;		// next hashed entry
		char *						m_pKeyword;			// keyword
	};

	/// keyword whose word id differs from its crc; ordered by word id
	struct HitblockException_t
	{
		HitblockKeyword_t *			m_pEntry;			// hash entry
		SphWordID_t					m_uCRC;				// original unadjusted crc

		bool operator < ( const HitblockException_t & rhs ) const
		{
			return m_pEntry->m_uWordid < rhs.m_pEntry->m_uWordid;
		}
	};

	/// dictionary entry, used when building the final dict
	struct DictKeyword_t
	{
		char *						m_sKeyword;
		SphOffset_t					m_uOff;
		int							m_iDocs;
		int							m_iHits;
		BYTE						m_uHint;
	};

	/// location of a block of dict entries
	struct DictBlock_t
	{
		SphOffset_t					m_iPos;
		int							m_iLen;
	};

private:
	HitblockKeyword_t *				m_dHash [ SLOTS ];	///< hash by wordid (!)
	CSphVector<HitblockException_t>	m_dExceptions;

	bool							m_bHitblock;		///< should we store words on GetWordID or not
	int								m_iMemUse;			///< current memory use by all the chunks
	int								m_iDictLimit;		///< allowed memory limit for dict block collection

	CSphVector<HitblockKeyword_t*>	m_dEntryChunks;		///< hash chunks, only used when indexing hitblocks
	HitblockKeyword_t *				m_pEntryChunk;
	int								m_iEntryChunkFree;

	CSphVector<BYTE*>				m_dKeywordChunks;	///< keyword storage
	BYTE *							m_pKeywordChunk;
	int								m_iKeywordChunkFree;

	CSphVector<DictKeyword_t*>		m_dDictChunks;		///< dict entry chunks, only used when sorting final dict
	DictKeyword_t *					m_pDictChunk;
	int								m_iDictChunkFree;

	int								m_iTmpFD;			///< temp dict file descriptor
	CSphWriter						m_wrTmpDict;		///< temp dict writer
	CSphVector<DictBlock_t>			m_dDictBlocks;		///< on-disk locations of dict entry blocks

	char							m_sClippedWord[MAX_KEYWORD_BYTES]; ///< keyword storage for clipped word

private:
	SphWordID_t						HitblockGetID ( const char * pWord, int iLen, SphWordID_t uCRC );
	HitblockKeyword_t *				HitblockAddKeyword ( DWORD uHash, const char * pWord, int iLen, SphWordID_t uID );

public:
	explicit				CSphDictKeywords ();
	virtual					~CSphDictKeywords ();

	virtual void			HitblockBegin () { m_bHitblock = true; }
	virtual void			HitblockPatch ( CSphWordHit * pHits, int iHits ) const;
	virtual const char *	HitblockGetKeyword ( SphWordID_t uWordID );
	virtual int				HitblockGetMemUse () { return m_iMemUse; }
	virtual void			HitblockReset ();

	virtual void			DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit );
	virtual void			DictEntry ( SphWordID_t uWordID, BYTE * sKeyword, int iDocs, int iHits, SphOffset_t iDoclistOffset, SphOffset_t iDoclistLength );
	virtual void			DictEndEntries ( SphOffset_t ) {}
	virtual bool			DictEnd ( SphOffset_t * pCheckpointsPos, int * pCheckpointsCount, int iMemLimit, CSphString & sError );

	virtual SphWordID_t		GetWordID ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDWithMarkers ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDNonStemmed ( BYTE * pWord );
	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );

private:
	void					DictFlush ();
};
17815 
17816 //////////////////////////////////////////////////////////////////////////
17817 
CSphDictKeywords()17818 CSphDictKeywords::CSphDictKeywords ()
17819 	: m_bHitblock ( false )
17820 	, m_iMemUse ( 0 )
17821 	, m_iDictLimit ( 0 )
17822 	, m_pEntryChunk ( NULL )
17823 	, m_iEntryChunkFree ( 0 )
17824 	, m_pKeywordChunk ( NULL )
17825 	, m_iKeywordChunkFree ( 0 )
17826 	, m_pDictChunk ( NULL )
17827 	, m_iDictChunkFree ( 0 )
17828 {
17829 	memset ( m_dHash, 0, sizeof(m_dHash) );
17830 }
17831 
~CSphDictKeywords()17832 CSphDictKeywords::~CSphDictKeywords ()
17833 {
17834 	HitblockReset();
17835 }
17836 
HitblockReset()17837 void CSphDictKeywords::HitblockReset()
17838 {
17839 	m_dExceptions.Resize ( 0 );
17840 
17841 	ARRAY_FOREACH ( i, m_dEntryChunks )
17842 		SafeDeleteArray ( m_dEntryChunks[i] );
17843 	m_dEntryChunks.Resize ( 0 );
17844 	m_pEntryChunk = NULL;
17845 	m_iEntryChunkFree = 0;
17846 
17847 	ARRAY_FOREACH ( i, m_dKeywordChunks )
17848 		SafeDeleteArray ( m_dKeywordChunks[i] );
17849 	m_dKeywordChunks.Resize ( 0 );
17850 	m_pKeywordChunk = NULL;
17851 	m_iKeywordChunkFree = 0;
17852 
17853 	m_iMemUse = 0;
17854 
17855 	memset ( m_dHash, 0, sizeof(m_dHash) );
17856 }
17857 
HitblockAddKeyword(DWORD uHash,const char * sWord,int iLen,SphWordID_t uID)17858 CSphDictKeywords::HitblockKeyword_t * CSphDictKeywords::HitblockAddKeyword ( DWORD uHash, const char * sWord, int iLen, SphWordID_t uID )
17859 {
17860 	assert ( iLen<MAX_KEYWORD_BYTES );
17861 
17862 	// alloc entry
17863 	if ( !m_iEntryChunkFree )
17864 	{
17865 		m_pEntryChunk = new HitblockKeyword_t [ ENTRY_CHUNK ];
17866 		m_iEntryChunkFree = ENTRY_CHUNK;
17867 		m_dEntryChunks.Add ( m_pEntryChunk );
17868 		m_iMemUse += sizeof(HitblockKeyword_t)*ENTRY_CHUNK;
17869 	}
17870 	HitblockKeyword_t * pEntry = m_pEntryChunk++;
17871 	m_iEntryChunkFree--;
17872 
17873 	// alloc keyword
17874 	iLen++;
17875 	if ( m_iKeywordChunkFree < iLen )
17876 	{
17877 		m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
17878 		m_iKeywordChunkFree = KEYWORD_CHUNK;
17879 		m_dKeywordChunks.Add ( m_pKeywordChunk );
17880 		m_iMemUse += KEYWORD_CHUNK;
17881 	}
17882 
17883 	// fill it
17884 	memcpy ( m_pKeywordChunk, sWord, iLen );
17885 	m_pKeywordChunk[iLen-1] = '\0';
17886 	pEntry->m_pKeyword = (char*)m_pKeywordChunk;
17887 	pEntry->m_uWordid = uID;
17888 	m_pKeywordChunk += iLen;
17889 	m_iKeywordChunkFree -= iLen;
17890 
17891 	// mtf it
17892 	pEntry->m_pNextHash = m_dHash [ uHash ];
17893 	m_dHash [ uHash ] = pEntry;
17894 
17895 	return pEntry;
17896 }
17897 
/// map a keyword to its hitblock wordid, resolving crc collisions along the way
///
/// sWord is the keyword, iLen is its length in bytes, uCRC is its precomputed crc.
/// words of MAX_KEYWORD_BYTES-4 bytes or longer get clipped (with a warning) and re-hashed.
///
/// the keyword is looked up in m_dHash by crc:
/// - known keyword: its entry is moved to the front of the hash chain, and its wordid is returned
/// - unseen crc: the keyword is added via HitblockAddKeyword() with wordid equal to crc
/// - crc taken by a different keyword: the incoming word is either found in m_dExceptions,
///   or gets the first free wordid above crc and is registered as an exception
SphWordID_t CSphDictKeywords::HitblockGetID ( const char * sWord, int iLen, SphWordID_t uCRC )
{
	if ( iLen>=MAX_KEYWORD_BYTES-4 ) // fix of very long word (zones)
	{
		// keep the leading MAX_KEYWORD_BYTES-4 bytes, and zero out the 4 bytes after them
		memcpy ( m_sClippedWord, sWord, MAX_KEYWORD_BYTES-4 );
		memset ( m_sClippedWord+MAX_KEYWORD_BYTES-4, 0, 4 );

		CSphString sOrig;
		sOrig.SetBinary ( sWord, iLen );
		sphWarn ( "word overrun buffer, clipped!!!\nclipped (len=%d, word='%s')\noriginal (len=%d, word='%s')", MAX_KEYWORD_BYTES-4, m_sClippedWord, iLen, sOrig.cstr() );

		// from now on, work with the clipped copy and its own crc
		sWord = m_sClippedWord;
		iLen = MAX_KEYWORD_BYTES-4;
		uCRC = sphCRC32 ( (const BYTE *)m_sClippedWord, MAX_KEYWORD_BYTES-4 );
	}

	// is this a known one? find it
	// OPTIMIZE? in theory we could use something faster than crc32; but quick lookup3 test did not show any improvements
	const DWORD uHash = (DWORD)( uCRC % SLOTS );

	// ppEntry tracks the link that points at pEntry, so that a hit can be unlinked from the chain
	HitblockKeyword_t * pEntry = m_dHash [ uHash ];
	HitblockKeyword_t ** ppEntry = &m_dHash [ uHash ];
	while ( pEntry )
	{
		// check crc
		if ( pEntry->m_uWordid!=uCRC )
		{
			// crc mismatch, try next entry
			ppEntry = &pEntry->m_pNextHash;
			pEntry = pEntry->m_pNextHash;
			continue;
		}

		// crc matches, check keyword
		// NOTE(review): the comparison stops after iLen bytes, so a stored keyword that only
		// begins with sWord would match too; presumably iLen always equals the full keyword length
		register int iWordLen = iLen;
		register const char * a = pEntry->m_pKeyword;
		register const char * b = sWord;
		while ( *a==*b && iWordLen-- )
		{
			if ( !*a || !iWordLen )
			{
				// known word, mtf it, and return id
				(*ppEntry) = pEntry->m_pNextHash;
				pEntry->m_pNextHash = m_dHash [ uHash ];
				m_dHash [ uHash ] = pEntry;
				return pEntry->m_uWordid;
			}
			a++;
			b++;
		}

		// collision detected!
		// our crc is taken as a wordid, but keyword does not match
		// welcome to the land of very tricky magic
		//
		// pEntry might either be a known exception, or a regular keyword
		// sWord might either be a known exception, or a new one
		// if they are not known, they needed to be added as exceptions now
		//
		// in case sWord is new, we need to assign a new unique wordid
		// for that, we keep incrementing the crc until it is unique
		// a starting point for wordid search loop would be handy
		//
		// let's scan the exceptions vector and work on all this
		//
		// NOTE, beware of the order, it is wordid asc, which does NOT guarantee crc asc
		// example, assume crc(w1)==X, crc(w2)==X+1, crc(w3)==X (collides with w1)
		// wordids will be X, X+1, X+2 but crcs will be X, X+1, X
		//
		// OPTIMIZE, might make sense to use binary search
		// OPTIMIZE, add early out somehow
		SphWordID_t uWordid = uCRC + 1; // first candidate for the new wordid
		const int iExcLen = m_dExceptions.GetLength(); // exceptions count before this call adds any
		int iExc = m_dExceptions.GetLength(); // iExcLen means "not found yet"
		ARRAY_FOREACH ( i, m_dExceptions )
		{
			const HitblockKeyword_t * pExcWord = m_dExceptions[i].m_pEntry;

			// incoming word is a known exception? just return the pre-assigned wordid
			if ( m_dExceptions[i].m_uCRC==uCRC && strncmp ( pExcWord->m_pKeyword, sWord, iLen )==0 )
				return pExcWord->m_uWordid;

			// incoming word collided into a known exception? clear the matched entry; no need to re-add it (see below)
			if ( pExcWord==pEntry )
				pEntry = NULL;

			// find first exception with wordid greater or equal to our candidate
			if ( pExcWord->m_uWordid>=uWordid && iExc==iExcLen )
				iExc = i;
		}

		// okay, this is a new collision
		// if entry was a regular word, we have to add it
		if ( pEntry )
		{
			m_dExceptions.Add();
			m_dExceptions.Last().m_pEntry = pEntry;
			m_dExceptions.Last().m_uCRC = uCRC;
		}

		// need to assign a new unique wordid now
		// keep scanning both exceptions and keywords for collisions
		// (iExcLen is the pre-call count, so only the exceptions that existed before this call get scanned here)
		for ( ;; )
		{
			// iExc must be either the first exception greater or equal to current candidate, or out of bounds
			assert ( iExc==iExcLen || m_dExceptions[iExc].m_pEntry->m_uWordid>=uWordid );
			assert ( iExc==0 || m_dExceptions[iExc-1].m_pEntry->m_uWordid<uWordid );

			// candidate collides with a known exception? increment it, and keep looking
			if ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid==uWordid )
			{
				uWordid++;
				while ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid<uWordid )
					iExc++;
				continue;
			}

			// candidate collides with a keyword? must be a regular one; add it as an exception, and keep looking
			HitblockKeyword_t * pCheck = m_dHash [ (DWORD)( uWordid % SLOTS ) ];
			while ( pCheck )
			{
				if ( pCheck->m_uWordid==uWordid )
					break;
				pCheck = pCheck->m_pNextHash;
			}

			// no collisions; we've found our unique wordid!
			if ( !pCheck )
				break;

			// got a collision; add it
			HitblockException_t & tColl = m_dExceptions.Add();
			tColl.m_pEntry = pCheck;
			tColl.m_uCRC = pCheck->m_uWordid; // not a known exception; hence, wordid must equal crc

			// and keep looking
			uWordid++;
			continue;
		}

		// and finally, we have that precious new wordid
		// so hash our new unique under its new unique adjusted wordid
		pEntry = HitblockAddKeyword ( (DWORD)( uWordid % SLOTS ), sWord, iLen, uWordid );

		// add it as a collision too
		m_dExceptions.Add();
		m_dExceptions.Last().m_pEntry = pEntry;
		m_dExceptions.Last().m_uCRC = uCRC;

		// keep exceptions list sorted by wordid
		m_dExceptions.Sort();

		return pEntry->m_uWordid;
	}

	// new keyword with unique crc
	pEntry = HitblockAddKeyword ( uHash, sWord, iLen, uCRC );
	return pEntry->m_uWordid;
}
18057 
18058 struct DictKeywordTagged_t : public CSphDictKeywords::DictKeyword_t
18059 {
18060 	int m_iBlock;
18061 };
18062 
18063 struct DictKeywordTaggedCmp_fn
18064 {
IsLessDictKeywordTaggedCmp_fn18065 	static inline bool IsLess ( const DictKeywordTagged_t & a, const DictKeywordTagged_t & b )
18066 	{
18067 		return strcmp ( a.m_sKeyword, b.m_sKeyword ) < 0;
18068 	}
18069 };
18070 
DictReadEntry(CSphBin * pBin,DictKeywordTagged_t & tEntry,BYTE * pKeyword)18071 static void DictReadEntry ( CSphBin * pBin, DictKeywordTagged_t & tEntry, BYTE * pKeyword )
18072 {
18073 	int iKeywordLen = pBin->ReadByte ();
18074 	if ( iKeywordLen<0 )
18075 	{
18076 		// early eof or read error; flag must be raised
18077 		assert ( pBin->IsError() );
18078 		return;
18079 	}
18080 
18081 	assert ( iKeywordLen>0 && iKeywordLen<MAX_KEYWORD_BYTES-1 );
18082 	if ( pBin->ReadBytes ( pKeyword, iKeywordLen )<0 )
18083 	{
18084 		assert ( pBin->IsError() );
18085 		return;
18086 	}
18087 	pKeyword[iKeywordLen] = '\0';
18088 
18089 	tEntry.m_sKeyword = (char*)pKeyword;
18090 	tEntry.m_uOff = pBin->UnzipOffset();
18091 	tEntry.m_iDocs = pBin->UnzipInt();
18092 	tEntry.m_iHits = pBin->UnzipInt();
18093 	tEntry.m_uHint = (BYTE) pBin->ReadByte();
18094 }
18095 
DictBegin(CSphAutofile & tTempDict,CSphAutofile & tDict,int iDictLimit)18096 void CSphDictKeywords::DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit )
18097 {
18098 	m_iTmpFD = tTempDict.GetFD();
18099 	m_wrTmpDict.CloseFile ();
18100 	m_wrTmpDict.SetFile ( tTempDict, NULL, m_sWriterError );
18101 
18102 	m_wrDict.CloseFile ();
18103 	m_wrDict.SetFile ( tDict, NULL, m_sWriterError );
18104 	m_wrDict.PutByte ( 1 );
18105 
18106 	m_iDictLimit = Max ( iDictLimit, KEYWORD_CHUNK + DICT_CHUNK*(int)sizeof(DictKeyword_t) ); // can't use less than 1 chunk
18107 }
18108 
/// finish dictionary building; merge temporary dict blocks into the final dictionary
///
/// flushes the pending in-memory entries with DictFlush(), then merges all the blocks written
/// to the temporary file into m_wrDict, keywords ordered by DictKeywordTaggedCmp_fn (strcmp),
/// and finally writes out the checkpoints table.
///
/// pCheckpointsPos receives the m_wrDict position where the checkpoints table begins,
/// pCheckpointsCount receives the number of checkpoints,
/// iMemLimit is the merge memory budget (raised to fit the biggest block per every reader),
/// sError receives the message on failure.
///
/// returns false on temporary block read error, or dictionary write error
bool CSphDictKeywords::DictEnd ( SphOffset_t * pCheckpointsPos, int * pCheckpointsCount, int iMemLimit, CSphString & sError )
{
	DictFlush ();
	m_wrTmpDict.CloseFile (); // tricky: file is not owned, so it won't get closed, and iTmpFD won't get invalidated

	// nothing was ever flushed; the final dictionary is complete as is
	if ( !m_dDictBlocks.GetLength() )
		m_wrDict.CloseFile();

	if ( m_wrTmpDict.IsError() || m_wrDict.IsError() )
	{
		sError.SetSprintf ( "dictionary write error (out of space?)" );
		return false;
	}

	// empty dictionary; report zero checkpoints and bail
	if ( !m_dDictBlocks.GetLength() )
	{
		*pCheckpointsPos = m_wrDict.GetPos ();
		*pCheckpointsCount = 0;
		return true;
	}

	// initialize readers
	CSphVector<CSphBin*> dBins ( m_dDictBlocks.GetLength() );

	int iMaxBlock = 0;
	ARRAY_FOREACH ( i, m_dDictBlocks )
		iMaxBlock = Max ( iMaxBlock, m_dDictBlocks[i].m_iLen );

	// the budget is bumped to at least the biggest block length times the blocks count
	iMemLimit = Max ( iMemLimit, iMaxBlock*m_dDictBlocks.GetLength() );
	int iBinSize = CSphBin::CalcBinSize ( iMemLimit, m_dDictBlocks.GetLength(), "sort_dict" );

	// one reader per block; all of them go through the same temporary file descriptor
	SphOffset_t iSharedOffset = -1;
	ARRAY_FOREACH ( i, m_dDictBlocks )
	{
		dBins[i] = new CSphBin();
		dBins[i]->m_iFileLeft = m_dDictBlocks[i].m_iLen;
		dBins[i]->m_iFilePos = m_dDictBlocks[i].m_iPos;
		dBins[i]->Init ( m_iTmpFD, &iSharedOffset, iBinSize );
	}

	// keywords storage
	// (one MAX_KEYWORD_BYTES slot per reader; queued entries point into these slots)
	BYTE * pKeywords = new BYTE [ MAX_KEYWORD_BYTES*dBins.GetLength() ];

	// releases the readers and the keywords storage; used on every exit path below
	#define LOC_CLEANUP() \
		{ \
			ARRAY_FOREACH ( iIdx, dBins ) \
				SafeDelete ( dBins[iIdx] ); \
			SafeDeleteArray ( pKeywords ); \
		}

	// do the sort
	CSphQueue < DictKeywordTagged_t, DictKeywordTaggedCmp_fn > qWords ( dBins.GetLength() );
	DictKeywordTagged_t tEntry;

	// prime the queue with the first entry from every block
	ARRAY_FOREACH ( i, dBins )
	{
		DictReadEntry ( dBins[i], tEntry, pKeywords + i*MAX_KEYWORD_BYTES );
		if ( dBins[i]->IsError() )
		{
			sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", i, dBins.GetLength() );
			LOC_CLEANUP();
			return false;
		}

		tEntry.m_iBlock = i;
		qWords.Push ( tEntry );
	}

	// merge loop; take the queue root, write it out, refill from the block it came from
	CSphKeywordDeltaWriter tLastKeyword;
	int iWords = 0;
	while ( qWords.GetLength() )
	{
		const DictKeywordTagged_t & tWord = qWords.Root();
		const int iLen = strlen ( tWord.m_sKeyword ); // OPTIMIZE?

		// store checkpoints as needed
		if ( ( iWords % SPH_WORDLIST_CHECKPOINT )==0 )
		{
			// emit a checkpoint, unless we're at the very dict beginning
			if ( iWords )
			{
				m_wrDict.ZipInt ( 0 );
				m_wrDict.ZipInt ( 0 );
			}

			// the checkpoint keeps its own copy of the keyword; it gets freed once the table is written
			BYTE * sClone = new BYTE [ iLen+1 ]; // OPTIMIZE? pool these?
			memcpy ( sClone, tWord.m_sKeyword, iLen+1 );
			sClone[iLen] = '\0';

			CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add ();
			tCheckpoint.m_sWord = (char*) sClone;
			tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();

			// restart the keyword delta writer at every checkpoint
			tLastKeyword.Reset();
		}
		iWords++;

		// write final dict entry
		assert ( iLen );
		assert ( tWord.m_uOff );
		assert ( tWord.m_iDocs );
		assert ( tWord.m_iHits );

		tLastKeyword.PutDelta ( m_wrDict, (const BYTE *)tWord.m_sKeyword, iLen );
		m_wrDict.ZipOffset ( tWord.m_uOff );
		m_wrDict.ZipInt ( tWord.m_iDocs );
		m_wrDict.ZipInt ( tWord.m_iHits );
		if ( tWord.m_uHint ) // the hint byte is only stored when non-zero
			m_wrDict.PutByte ( tWord.m_uHint );

		// next
		// (the block index is saved before the pop, since tWord refers to the queue root)
		int iBin = tWord.m_iBlock;
		qWords.Pop ();

		if ( !dBins[iBin]->IsDone() )
		{
			DictReadEntry ( dBins[iBin], tEntry, pKeywords + iBin*MAX_KEYWORD_BYTES );
			if ( dBins[iBin]->IsError() )
			{
				sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", iBin, dBins.GetLength() );
				LOC_CLEANUP();
				return false;
			}

			tEntry.m_iBlock = iBin;
			qWords.Push ( tEntry );
		}
	}

	// end of dictionary block
	m_wrDict.ZipInt ( 0 );
	m_wrDict.ZipInt ( 0 );

	LOC_CLEANUP();
	#undef LOC_CLEANUP

	// flush wordlist checkpoints
	*pCheckpointsPos = m_wrDict.GetPos();
	*pCheckpointsCount = m_dCheckpoints.GetLength();

	// every checkpoint is stored as keyword length, keyword bytes, and wordlist offset
	ARRAY_FOREACH ( i, m_dCheckpoints )
	{
		const int iLen = strlen ( m_dCheckpoints[i].m_sWord );

		assert ( m_dCheckpoints[i].m_iWordlistOffset>0 );
		assert ( iLen>0 && iLen<MAX_KEYWORD_BYTES );

		m_wrDict.PutDword ( iLen );
		m_wrDict.PutBytes ( m_dCheckpoints[i].m_sWord, iLen );
		m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );

		// the cloned keyword is not needed anymore
		SafeDeleteArray ( m_dCheckpoints[i].m_sWord );
	}

	m_wrDict.CloseFile ();
	if ( m_wrDict.IsError() )
		sError.SetSprintf ( "dictionary write error (out of space?)" );

	return !m_wrDict.IsError();
}
18269 
18270 
18271 struct DictKeywordCmp_fn
18272 {
IsLessDictKeywordCmp_fn18273 	inline bool IsLess ( CSphDictKeywords::DictKeyword_t * a, CSphDictKeywords::DictKeyword_t * b ) const
18274 	{
18275 		return strcmp ( a->m_sKeyword, b->m_sKeyword ) < 0;
18276 	}
18277 };
18278 
DictFlush()18279 void CSphDictKeywords::DictFlush ()
18280 {
18281 	if ( !m_dDictChunks.GetLength() )
18282 		return;
18283 	assert ( m_dDictChunks.GetLength() && m_dKeywordChunks.GetLength() );
18284 
18285 	// sort em
18286 	int iTotalWords = m_dDictChunks.GetLength()*DICT_CHUNK - m_iDictChunkFree;
18287 	CSphVector<DictKeyword_t*> dWords ( iTotalWords );
18288 
18289 	int iIdx = 0;
18290 	ARRAY_FOREACH ( i, m_dDictChunks )
18291 	{
18292 		int iWords = DICT_CHUNK;
18293 		if ( i==m_dDictChunks.GetLength()-1 )
18294 			iWords -= m_iDictChunkFree;
18295 
18296 		DictKeyword_t * pWord = m_dDictChunks[i];
18297 		for ( int j=0; j<iWords; j++ )
18298 			dWords[iIdx++] = pWord++;
18299 	}
18300 
18301 	dWords.Sort ( DictKeywordCmp_fn() );
18302 
18303 	// write em
18304 	DictBlock_t & tBlock = m_dDictBlocks.Add();
18305 	tBlock.m_iPos = m_wrTmpDict.GetPos ();
18306 
18307 	ARRAY_FOREACH ( i, dWords )
18308 	{
18309 		const DictKeyword_t * pWord = dWords[i];
18310 		int iLen = strlen ( pWord->m_sKeyword );
18311 		m_wrTmpDict.PutByte ( iLen );
18312 		m_wrTmpDict.PutBytes ( pWord->m_sKeyword, iLen );
18313 		m_wrTmpDict.ZipOffset ( pWord->m_uOff );
18314 		m_wrTmpDict.ZipInt ( pWord->m_iDocs );
18315 		m_wrTmpDict.ZipInt ( pWord->m_iHits );
18316 		m_wrTmpDict.PutByte ( pWord->m_uHint );
18317 	}
18318 
18319 	tBlock.m_iLen = (int)( m_wrTmpDict.GetPos() - tBlock.m_iPos );
18320 
18321 	// clean up buffers
18322 	ARRAY_FOREACH ( i, m_dDictChunks )
18323 		SafeDeleteArray ( m_dDictChunks[i] );
18324 	m_dDictChunks.Resize ( 0 );
18325 	m_pDictChunk = NULL;
18326 	m_iDictChunkFree = 0;
18327 
18328 	ARRAY_FOREACH ( i, m_dKeywordChunks )
18329 		SafeDeleteArray ( m_dKeywordChunks[i] );
18330 	m_dKeywordChunks.Resize ( 0 );
18331 	m_pKeywordChunk = NULL;
18332 	m_iKeywordChunkFree = 0;
18333 
18334 	m_iMemUse = 0;
18335 }
18336 
DictEntry(SphWordID_t,BYTE * sKeyword,int iDocs,int iHits,SphOffset_t iDoclistOffset,SphOffset_t iDoclistLength)18337 void CSphDictKeywords::DictEntry ( SphWordID_t, BYTE * sKeyword, int iDocs, int iHits, SphOffset_t iDoclistOffset, SphOffset_t iDoclistLength )
18338 {
18339 	// they say, this might just happen during merge
18340 	// FIXME! can we make merge avoid sending such keywords to dict and assert here?
18341 	if ( !iDocs )
18342 		return;
18343 
18344 	assert ( iHits );
18345 	assert ( iDoclistLength>0 );
18346 
18347 	DictKeyword_t * pWord = NULL;
18348 	int iLen = strlen ( (char*)sKeyword ) + 1;
18349 
18350 	for ( ;; )
18351 	{
18352 		// alloc dict entry
18353 		if ( !m_iDictChunkFree )
18354 		{
18355 			if ( m_iDictLimit && ( m_iMemUse + (int)sizeof(DictKeyword_t)*DICT_CHUNK )>m_iDictLimit )
18356 				DictFlush ();
18357 
18358 			m_pDictChunk = new DictKeyword_t [ DICT_CHUNK ];
18359 			m_iDictChunkFree = DICT_CHUNK;
18360 			m_dDictChunks.Add ( m_pDictChunk );
18361 			m_iMemUse += sizeof(DictKeyword_t)*DICT_CHUNK;
18362 		}
18363 
18364 		// alloc keyword
18365 		if ( m_iKeywordChunkFree < iLen )
18366 		{
18367 			if ( m_iDictLimit && ( m_iMemUse + KEYWORD_CHUNK )>m_iDictLimit )
18368 			{
18369 				DictFlush ();
18370 				continue; // because we just flushed pWord
18371 			}
18372 
18373 			m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
18374 			m_iKeywordChunkFree = KEYWORD_CHUNK;
18375 			m_dKeywordChunks.Add ( m_pKeywordChunk );
18376 			m_iMemUse += KEYWORD_CHUNK;
18377 		}
18378 		// aw kay
18379 		break;
18380 	}
18381 
18382 	pWord = m_pDictChunk++;
18383 	m_iDictChunkFree--;
18384 	pWord->m_sKeyword = (char*)m_pKeywordChunk;
18385 	memcpy ( m_pKeywordChunk, sKeyword, iLen );
18386 	m_pKeywordChunk[iLen-1] = '\0';
18387 	m_pKeywordChunk += iLen;
18388 	m_iKeywordChunkFree -= iLen;
18389 
18390 	pWord->m_uOff = iDoclistOffset;
18391 	pWord->m_iDocs = iDocs;
18392 	pWord->m_iHits = iHits;
18393 	pWord->m_uHint = sphDoclistHintPack ( iDocs, iDoclistLength );
18394 }
18395 
GetWordID(BYTE * pWord)18396 SphWordID_t CSphDictKeywords::GetWordID ( BYTE * pWord )
18397 {
18398 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord );
18399 	if ( !uCRC || !m_bHitblock )
18400 		return uCRC;
18401 
18402 	int iLen = strlen ( (const char *)pWord );
18403 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
18404 }
18405 
GetWordIDWithMarkers(BYTE * pWord)18406 SphWordID_t CSphDictKeywords::GetWordIDWithMarkers ( BYTE * pWord )
18407 {
18408 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDWithMarkers ( pWord );
18409 	if ( !uCRC || !m_bHitblock )
18410 		return uCRC;
18411 
18412 	int iLen = strlen ( (const char *)pWord );
18413 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
18414 }
18415 
GetWordIDNonStemmed(BYTE * pWord)18416 SphWordID_t CSphDictKeywords::GetWordIDNonStemmed ( BYTE * pWord )
18417 {
18418 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDNonStemmed ( pWord );
18419 	if ( !uCRC || !m_bHitblock )
18420 		return uCRC;
18421 
18422 	int iLen = strlen ( (const char *)pWord );
18423 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
18424 }
18425 
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)18426 SphWordID_t CSphDictKeywords::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
18427 {
18428 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord, iLen, bFilterStops );
18429 	if ( !uCRC || !m_bHitblock )
18430 		return uCRC;
18431 
18432 	return HitblockGetID ( (const char *)pWord, iLen, uCRC ); // !COMMIT would break, we kind of strcmp inside; but must never get called?
18433 }
18434 
18435 /// binary search for the first hit with wordid greater than or equal to reference
FindFirstGte(CSphWordHit * pHits,int iHits,SphWordID_t uID)18436 static CSphWordHit * FindFirstGte ( CSphWordHit * pHits, int iHits, SphWordID_t uID )
18437 {
18438 	if ( pHits->m_iWordID==uID )
18439 		return pHits;
18440 
18441 	CSphWordHit * pL = pHits;
18442 	CSphWordHit * pR = pHits + iHits - 1;
18443 	if ( pL->m_iWordID > uID || pR->m_iWordID < uID )
18444 		return NULL;
18445 
18446 	while ( pR-pL!=1 )
18447 	{
18448 		CSphWordHit * pM = pL + ( pR-pL )/2;
18449 		if ( pM->m_iWordID < uID )
18450 			pL = pM;
18451 		else
18452 			pR = pM;
18453 	}
18454 
18455 	assert ( pR-pL==1 );
18456 	assert ( pL->m_iWordID<uID );
18457 	assert ( pR->m_iWordID>=uID );
18458 	return pR;
18459 }
18460 
18461 /// full crc and keyword check
FullIsLess(const CSphDictKeywords::HitblockException_t & a,const CSphDictKeywords::HitblockException_t & b)18462 static inline bool FullIsLess ( const CSphDictKeywords::HitblockException_t & a, const CSphDictKeywords::HitblockException_t & b )
18463 {
18464 	if ( a.m_uCRC!=b.m_uCRC )
18465 		return a.m_uCRC < b.m_uCRC;
18466 	return strcmp ( a.m_pEntry->m_pKeyword, b.m_pEntry->m_pKeyword ) < 0;
18467 }
18468 
18469 /// sort functor to compute collided hits reordering
18470 struct HitblockPatchSort_fn
18471 {
18472 	const CSphDictKeywords::HitblockException_t * m_pExc;
18473 
HitblockPatchSort_fnHitblockPatchSort_fn18474 	explicit HitblockPatchSort_fn ( const CSphDictKeywords::HitblockException_t * pExc )
18475 		: m_pExc ( pExc )
18476 	{}
18477 
IsLessHitblockPatchSort_fn18478 	bool IsLess ( int a, int b ) const
18479 	{
18480 		return FullIsLess ( m_pExc[a], m_pExc[b] );
18481 	}
18482 };
18483 
/// do hit block patching magic
///
/// walks m_dExceptions span by span (a span being a run of consecutive wordids), and for every
/// span that is not yet in (crc asc, keyword asc) order, rearranges the matching chunks of hits
/// in place, so that the chunks follow that order.
///
/// pHits points to iHits hits; the chunk lookups use binary search by wordid,
/// so the hits are presumably expected to be sorted by wordid already - TODO confirm against the caller
void CSphDictKeywords::HitblockPatch ( CSphWordHit * pHits, int iHits ) const
{
	if ( !pHits || iHits<=0 )
		return;

	const CSphVector<HitblockException_t> & dExc = m_dExceptions; // shortcut
	CSphVector<CSphWordHit*> dChunk; // chunk start pointers for the current span, plus one end pointer

	// reorder hit chunks for exceptions (aka crc collisions)
	for ( int iFirst = 0; iFirst < dExc.GetLength()-1; )
	{
		// find next span of collisions, iFirst inclusive, iMax exclusive ie. [iFirst,iMax)
		// (note that exceptions array is always sorted)
		SphWordID_t uFirstWordid = dExc[iFirst].m_pEntry->m_uWordid;
		assert ( dExc[iFirst].m_uCRC==uFirstWordid );

		int iMax = iFirst+1;
		SphWordID_t uSpan = uFirstWordid+1;
		while ( iMax < dExc.GetLength() && dExc[iMax].m_pEntry->m_uWordid==uSpan )
		{
			iMax++;
			uSpan++;
		}

		// check whether they are in proper order already
		bool bSorted = true;
		for ( int i=iFirst; i<iMax-1 && bSorted; i++ )
			if ( FullIsLess ( dExc[i+1], dExc[i] ) )
				bSorted = false;

		// order is ok; skip this span
		if ( bSorted )
		{
			iFirst = iMax;
			continue;
		}

		// we need to fix up these collision hits
		// convert them from arbitrary "wordid asc" to strict "crc asc, keyword asc" order
		// lets begin with looking up hit chunks for every wordid
		dChunk.Resize ( iMax-iFirst+1 );

		// find the end
		// (that is, the first hit past the span; no such hit means the span runs up to the last hit)
		dChunk.Last() = FindFirstGte ( pHits, iHits, uFirstWordid+iMax-iFirst );
		if ( !dChunk.Last() )
		{
			assert ( iMax==dExc.GetLength() && pHits[iHits-1].m_iWordID==uFirstWordid+iMax-1-iFirst );
			dChunk.Last() = pHits+iHits;
		}

		// find the start
		dChunk[0] = FindFirstGte ( pHits, dChunk.Last()-pHits, uFirstWordid );
		assert ( dChunk[0] && dChunk[0]->m_iWordID==uFirstWordid );

		// find the chunk starts
		for ( int i=1; i<dChunk.GetLength()-1; i++ )
		{
			dChunk[i] = FindFirstGte ( dChunk[i-1], dChunk.Last()-dChunk[i-1], uFirstWordid+i );
			assert ( dChunk[i] && dChunk[i]->m_iWordID==uFirstWordid+i );
		}

		CSphWordHit * pTemp;
		if ( iMax-iFirst==2 )
		{
			// most frequent case, just two collisions
			// the span is known to be out of order, so the two chunks simply get swapped
			// OPTIMIZE? allocate buffer for the smaller chunk, not just first chunk
			pTemp = new CSphWordHit [ dChunk[1]-dChunk[0] ];
			memcpy ( pTemp, dChunk[0], ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
			memmove ( dChunk[0], dChunk[1], ( dChunk[2]-dChunk[1] )*sizeof(CSphWordHit) );
			memcpy ( dChunk[0] + ( dChunk[2]-dChunk[1] ), pTemp, ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
		} else
		{
			// generic case, more than two
			// dReorder lists the chunk indexes within the span in the target order
			CSphVector<int> dReorder ( iMax-iFirst );
			ARRAY_FOREACH ( i, dReorder )
				dReorder[i] = i;

			HitblockPatchSort_fn fnSort ( &dExc[iFirst] );
			dReorder.Sort ( fnSort );

			// OPTIMIZE? could skip heading and trailing blocks that are already in position
			pTemp = new CSphWordHit [ dChunk.Last()-dChunk[0] ];
			CSphWordHit * pOut = pTemp;

			// assemble the chunks in the target order in the temporary buffer
			ARRAY_FOREACH ( i, dReorder )
			{
				int iChunk = dReorder[i];
				int iChunkHits = dChunk[iChunk+1] - dChunk[iChunk];
				memcpy ( pOut, dChunk[iChunk], iChunkHits*sizeof(CSphWordHit) );
				pOut += iChunkHits;
			}

			// and copy the whole span back over the original hits
			assert ( ( pOut-pTemp )==( dChunk.Last()-dChunk[0] ) );
			memcpy ( dChunk[0], pTemp, ( dChunk.Last()-dChunk[0] )*sizeof(CSphWordHit) );
		}

		// patching done
		SafeDeleteArray ( pTemp );
		iFirst = iMax;
	}
}
18586 
HitblockGetKeyword(SphWordID_t uWordID)18587 const char * CSphDictKeywords::HitblockGetKeyword ( SphWordID_t uWordID )
18588 {
18589 	const DWORD uHash = (DWORD)( uWordID % SLOTS );
18590 
18591 	HitblockKeyword_t * pEntry = m_dHash [ uHash ];
18592 	while ( pEntry )
18593 	{
18594 		// check crc
18595 		if ( pEntry->m_uWordid!=uWordID )
18596 		{
18597 			// crc mismatch, try next entry
18598 			pEntry = pEntry->m_pNextHash;
18599 			continue;
18600 		}
18601 
18602 		return pEntry->m_pKeyword;
18603 	}
18604 
18605 	assert ( m_dExceptions.GetLength() );
18606 	ARRAY_FOREACH ( i, m_dExceptions )
18607 		if ( m_dExceptions[i].m_pEntry->m_uWordid==uWordID )
18608 			return m_dExceptions[i].m_pEntry->m_pKeyword;
18609 
18610 	sphWarning ( "hash missing value in operator [] (wordid="INT64_FMT", hash=%d)", (int64_t)uWordID, uHash );
18611 	assert ( 0 && "hash missing value in operator []" );
18612 	return "\31oops";
18613 }
18614 
18615 
18616 //////////////////////////////////////////////////////////////////////////
18617 // KEYWORDS STORING DICTIONARY
18618 //////////////////////////////////////////////////////////////////////////
18619 
18620 class CRtDictKeywords : public ISphRtDictWraper
18621 {
18622 private:
18623 	CSphDict *				m_pBase;
18624 	SmallStringHash_T<int>	m_hKeywords;
18625 	CSphVector<BYTE>		m_dPackedKeywords;
18626 
18627 public:
CRtDictKeywords(CSphDict * pBase)18628 	explicit CRtDictKeywords ( CSphDict * pBase )
18629 		: m_pBase ( pBase )
18630 	{
18631 		m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
18632 	}
~CRtDictKeywords()18633 	virtual ~CRtDictKeywords() {}
18634 
GetWordID(BYTE * pWord)18635 	virtual SphWordID_t GetWordID ( BYTE * pWord )
18636 	{
18637 		SphWordID_t uCRC = m_pBase->GetWordID ( pWord );
18638 		if ( uCRC )
18639 			return AddKeyword ( pWord );
18640 		else
18641 			return 0;
18642 	}
18643 
GetWordIDWithMarkers(BYTE * pWord)18644 	virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord )
18645 	{
18646 		SphWordID_t uCRC = m_pBase->GetWordIDWithMarkers ( pWord );
18647 		if ( uCRC )
18648 			return AddKeyword ( pWord );
18649 		else
18650 			return 0;
18651 	}
18652 
GetWordIDNonStemmed(BYTE * pWord)18653 	virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord )
18654 	{
18655 		SphWordID_t uCRC = m_pBase->GetWordIDNonStemmed ( pWord );
18656 		if ( uCRC )
18657 			return AddKeyword ( pWord );
18658 		else
18659 			return 0;
18660 	}
18661 
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)18662 	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
18663 	{
18664 		SphWordID_t uCRC = m_pBase->GetWordID ( pWord, iLen, bFilterStops );
18665 		if ( uCRC )
18666 			return AddKeyword ( pWord );
18667 		else
18668 			return 0;
18669 	}
18670 
GetPackedKeywords()18671 	virtual const BYTE * GetPackedKeywords () { return m_dPackedKeywords.Begin(); }
GetPackedLen()18672 	virtual int GetPackedLen () { return m_dPackedKeywords.GetLength(); }
ResetKeywords()18673 	virtual void ResetKeywords()
18674 	{
18675 		m_dPackedKeywords.Resize ( 0 );
18676 		m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
18677 		m_hKeywords.Reset();
18678 	}
18679 
AddKeyword(const BYTE * pWord)18680 	SphWordID_t AddKeyword ( const BYTE * pWord )
18681 	{
18682 		int iLen = strlen ( (const char *)pWord );
18683 		CSphString sWord;
18684 		sWord.SetBinary ( (const char *)pWord, iLen );
18685 
18686 		int * pOff = m_hKeywords ( sWord );
18687 		if ( pOff )
18688 		{
18689 			return *pOff;
18690 		}
18691 
18692 		assert ( iLen<255 );
18693 		int iOff = m_dPackedKeywords.GetLength();
18694 		m_dPackedKeywords.Resize ( iOff+iLen+1 );
18695 		m_dPackedKeywords[iOff] = (BYTE)( iLen & 0xFF );
18696 		memcpy ( m_dPackedKeywords.Begin()+iOff+1, pWord, iLen );
18697 
18698 		m_hKeywords.Add ( iOff, sWord );
18699 
18700 		return iOff;
18701 	}
18702 
LoadStopwords(const char * sFiles,ISphTokenizer * pTokenizer)18703 	virtual void LoadStopwords ( const char * sFiles, ISphTokenizer * pTokenizer ) { m_pBase->LoadStopwords ( sFiles, pTokenizer ); }
LoadWordforms(const char * sFile,ISphTokenizer * pTokenizer,const char * sIndex)18704 	virtual bool LoadWordforms ( const char * sFile, ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pBase->LoadWordforms ( sFile, pTokenizer, sIndex ); }
SetMorphology(const char * szMorph,bool bUseUTF8,CSphString & sError)18705 	virtual bool SetMorphology ( const char * szMorph, bool bUseUTF8, CSphString & sError ) { return m_pBase->SetMorphology ( szMorph, bUseUTF8, sError ); }
Setup(const CSphDictSettings & tSettings)18706 	virtual void Setup ( const CSphDictSettings & tSettings ) { m_pBase->Setup ( tSettings ); }
GetSettings() const18707 	virtual const CSphDictSettings & GetSettings () const { return m_pBase->GetSettings(); }
GetStopwordsFileInfos()18708 	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pBase->GetStopwordsFileInfos(); }
GetWordformsFileInfo()18709 	virtual const CSphSavedFile & GetWordformsFileInfo () { return m_pBase->GetWordformsFileInfo(); }
GetMultiWordforms() const18710 	virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pBase->GetMultiWordforms(); }
IsStopWord(const BYTE * pWord) const18711 	virtual bool IsStopWord ( const BYTE * pWord ) const { return m_pBase->IsStopWord ( pWord ); }
18712 };
18713 
sphCreateRtKeywordsDictionaryWrapper(CSphDict * pBase)18714 ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase )
18715 {
18716 	return new CRtDictKeywords ( pBase );
18717 }
18718 
18719 
18720 //////////////////////////////////////////////////////////////////////////
18721 // DICTIONARY FACTORIES
18722 //////////////////////////////////////////////////////////////////////////
18723 
SetupDictionary(CSphDict * pDict,const CSphDictSettings & tSettings,ISphTokenizer * pTokenizer,CSphString & sError,const char * sIndex)18724 static CSphDict * SetupDictionary ( CSphDict * pDict, const CSphDictSettings & tSettings, ISphTokenizer * pTokenizer, CSphString & sError, const char * sIndex )
18725 {
18726 	assert ( pTokenizer );
18727 	assert ( pDict );
18728 
18729 	pDict->Setup ( tSettings );
18730 	if ( pDict->SetMorphology ( tSettings.m_sMorphology.cstr (), pTokenizer->IsUtf8(), sError ) )
18731 		sError = "";
18732 
18733 	pDict->LoadStopwords ( tSettings.m_sStopwords.cstr (), pTokenizer );
18734 	pDict->LoadWordforms ( tSettings.m_sWordforms.cstr (), pTokenizer, sIndex );
18735 	return pDict;
18736 }
18737 
18738 
sphCreateDictionaryCRC(const CSphDictSettings & tSettings,ISphTokenizer * pTokenizer,CSphString & sError,const char * sIndex)18739 CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings, ISphTokenizer * pTokenizer, CSphString & sError, const char * sIndex )
18740 {
18741 	CSphDict * pDict = NULL;
18742 	if ( tSettings.m_bCrc32 )
18743 		pDict = new CSphDictCRC<true> ();
18744 	else
18745 		pDict = new CSphDictCRC<false> ();
18746 	if ( !pDict )
18747 		return NULL;
18748 	return SetupDictionary ( pDict, tSettings, pTokenizer, sError, sIndex );
18749 }
18750 
18751 
sphCreateDictionaryKeywords(const CSphDictSettings & tSettings,ISphTokenizer * pTokenizer,CSphString & sError,const char * sIndex)18752 CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings, ISphTokenizer * pTokenizer, CSphString & sError, const char * sIndex )
18753 {
18754 	CSphDict * pDict = new CSphDictKeywords();
18755 	return SetupDictionary ( pDict, tSettings, pTokenizer, sError, sIndex );
18756 }
18757 
18758 
/// wordforms shutdown hook
/// runs the wordform containers sweep with a NULL and a zero for its arguments
/// NOTE(review): presumably that makes the sweep drop every container - confirm against SweepWordformContainers()
void sphShutdownWordforms ()
{
	CSphDictCRCTraits::SweepWordformContainers ( NULL, 0 );
}
18763 
18764 /////////////////////////////////////////////////////////////////////////////
18765 // HTML STRIPPER
18766 /////////////////////////////////////////////////////////////////////////////
18767 
sphIsTag(int c)18768 static inline int sphIsTag ( int c )
18769 {
18770 	return sphIsAlpha(c) || c=='.' || c==':';
18771 }
18772 
/// check whether the character can start a tag name
/// that is, an ASCII letter, an underscore, a dot, or a colon; returns 1 or 0
static inline int sphIsTagStart ( int c )
{
	// punctuation that is allowed up front
	switch ( c )
	{
		case '_':
		case '.':
		case ':':
			return 1;
		default:
			break;
	}

	// letters, either case
	const bool bLower = ( c>='a' && c<='z' );
	const bool bUpper = ( c>='A' && c<='Z' );
	return ( bLower || bUpper ) ? 1 : 0;
}
18777 
CSphHTMLStripper(bool bDefaultTags)18778 CSphHTMLStripper::CSphHTMLStripper ( bool bDefaultTags )
18779 {
18780 	if ( bDefaultTags )
18781 	{
18782 		// known inline tags
18783 		const char * dKnown[] =
18784 		{
18785 			"a", "b", "i", "s", "u",
18786 			"basefont", "big", "em", "font", "img",
18787 			"label", "small", "span", "strike", "strong",
18788 			"sub\0", "sup\0", // fix gcc 3.4.3 on solaris10 compiler bug
18789 			"tt"
18790 		};
18791 
18792 		m_dTags.Resize ( sizeof(dKnown)/sizeof(dKnown[0]) );
18793 		ARRAY_FOREACH ( i, m_dTags )
18794 		{
18795 			m_dTags[i].m_sTag = dKnown[i];
18796 			m_dTags[i].m_iTagLen = strlen ( dKnown[i] );
18797 			m_dTags[i].m_bInline = true;
18798 		}
18799 	}
18800 
18801 	UpdateTags ();
18802 }
18803 
18804 
GetCharIndex(int iCh) const18805 int CSphHTMLStripper::GetCharIndex ( int iCh ) const
18806 {
18807 	if ( iCh>='a' && iCh<='z' ) return iCh-'a';
18808 	if ( iCh>='A' && iCh<='Z' ) return iCh-'A';
18809 	if ( iCh=='_' ) return 26;
18810 	if ( iCh==':' ) return 27;
18811 	return -1;
18812 }
18813 
18814 
UpdateTags()18815 void CSphHTMLStripper::UpdateTags ()
18816 {
18817 	m_dTags.Sort ();
18818 
18819 	for ( int i=0; i<MAX_CHAR_INDEX; i++ )
18820 	{
18821 		m_dStart[i] = INT_MAX;
18822 		m_dEnd[i] = -1;
18823 	}
18824 
18825 	ARRAY_FOREACH ( i, m_dTags )
18826 	{
18827 		int iIdx = GetCharIndex ( m_dTags[i].m_sTag.cstr()[0] );
18828 		if ( iIdx<0 )
18829 			continue;
18830 
18831 		m_dStart[iIdx] = Min ( m_dStart[iIdx], i );
18832 		m_dEnd[iIdx] = Max ( m_dEnd[iIdx], i );
18833 	}
18834 }
18835 
18836 
SetIndexedAttrs(const char * sConfig,CSphString & sError)18837 bool CSphHTMLStripper::SetIndexedAttrs ( const char * sConfig, CSphString & sError )
18838 {
18839 	if ( !sConfig || !*sConfig )
18840 		return true;
18841 
18842 	char sTag[256], sAttr[256];
18843 
18844 	const char * p = sConfig, * s;
18845 	#define LOC_ERROR(_msg,_pos) { sError.SetSprintf ( "SetIndexedAttrs(): %s near '%s'", _msg, _pos ); return false; }
18846 
18847 	while ( *p )
18848 	{
18849 		// skip spaces
18850 		while ( *p && isspace(*p) ) p++;
18851 		if ( !*p ) break;
18852 
18853 		// check tag name
18854 		s = p; while ( sphIsTag(*p) ) p++;
18855 		if ( s==p ) LOC_ERROR ( "invalid character in tag name", s );
18856 
18857 		// get tag name
18858 		if ( p-s>=(int)sizeof(sTag) ) LOC_ERROR ( "tag name too long", s );
18859 		strncpy ( sTag, s, p-s );
18860 		sTag[p-s] = '\0';
18861 
18862 		// skip spaces
18863 		while ( *p && isspace(*p) ) p++;
18864 		if ( *p++!='=' ) LOC_ERROR ( "'=' expected", p-1 );
18865 
18866 		// add indexed tag entry, if not there yet
18867 		strlwr ( sTag );
18868 
18869 		int iIndexTag = -1;
18870 		ARRAY_FOREACH ( i, m_dTags )
18871 			if ( m_dTags[i].m_sTag==sTag )
18872 		{
18873 			iIndexTag = i;
18874 			break;
18875 		}
18876 		if ( iIndexTag<0 )
18877 		{
18878 			m_dTags.Add();
18879 			m_dTags.Last().m_sTag = sTag;
18880 			m_dTags.Last().m_iTagLen = strlen ( sTag );
18881 			iIndexTag = m_dTags.GetLength()-1;
18882 		}
18883 
18884 		m_dTags[iIndexTag].m_bIndexAttrs = true;
18885 		CSphVector<CSphString> & dAttrs = m_dTags[iIndexTag].m_dAttrs;
18886 
18887 		// scan attributes
18888 		while ( *p )
18889 		{
18890 			// skip spaces
18891 			while ( *p && isspace(*p) ) p++;
18892 			if ( !*p ) break;
18893 
18894 			// check attr name
18895 			s = p; while ( sphIsTag(*p) ) p++;
18896 			if ( s==p ) LOC_ERROR ( "invalid character in attribute name", s );
18897 
18898 			// get attr name
18899 			if ( p-s>=(int)sizeof(sAttr) ) LOC_ERROR ( "attribute name too long", s );
18900 			strncpy ( sAttr, s, p-s );
18901 			sAttr[p-s] = '\0';
18902 
18903 			// add attr, if not there yet
18904 			int iAttr;
18905 			for ( iAttr=0; iAttr<dAttrs.GetLength(); iAttr++ )
18906 				if ( dAttrs[iAttr]==sAttr )
18907 					break;
18908 
18909 			if ( iAttr==dAttrs.GetLength() )
18910 				dAttrs.Add ( sAttr );
18911 
18912 			// skip spaces
18913 			while ( *p && isspace(*p) ) p++;
18914 			if ( !*p ) break;
18915 
18916 			// check if there's next attr or tag
18917 			if ( *p==',' ) { p++; continue; } // next attr
18918 			if ( *p==';' ) { p++; break; } // next tag
18919 			LOC_ERROR ( "',' or ';' or end of line expected", p );
18920 		}
18921 	}
18922 
18923 	#undef LOC_ERROR
18924 
18925 	UpdateTags ();
18926 	return true;
18927 }
18928 
18929 
SetRemovedElements(const char * sConfig,CSphString &)18930 bool CSphHTMLStripper::SetRemovedElements ( const char * sConfig, CSphString & )
18931 {
18932 	if ( !sConfig || !*sConfig )
18933 		return true;
18934 
18935 	const char * p = sConfig;
18936 	while ( *p )
18937 	{
18938 		// skip separators
18939 		while ( *p && !sphIsTag(*p) ) p++;
18940 		if ( !*p ) break;
18941 
18942 		// get tag name
18943 		const char * s = p;
18944 		while ( sphIsTag(*p) ) p++;
18945 
18946 		CSphString sTag;
18947 		sTag.SetBinary ( s, p-s );
18948 		sTag.ToLower ();
18949 
18950 		// mark it
18951 		int iTag;
18952 		for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
18953 			if ( m_dTags[iTag].m_sTag==sTag )
18954 		{
18955 			m_dTags[iTag].m_bRemove = true;
18956 			break;
18957 		}
18958 
18959 		if ( iTag==m_dTags.GetLength() )
18960 		{
18961 			m_dTags.Add();
18962 			m_dTags.Last().m_sTag = sTag;
18963 			m_dTags.Last().m_iTagLen = strlen ( sTag.cstr() );
18964 			m_dTags.Last().m_bRemove = true;
18965 		}
18966 	}
18967 
18968 	UpdateTags ();
18969 	return true;
18970 }
18971 
18972 
EnableParagraphs()18973 void CSphHTMLStripper::EnableParagraphs ()
18974 {
18975 	// known block-level elements
18976 	const char * dBlock[] = { "address", "blockquote", "caption", "center",
18977 		"dd", "div", "dl", "dt", "h1", "h2", "h3", "h4", "h5", "li", "menu",
18978 		"ol", "p", "pre", "table", "tbody", "td", "tfoot", "th", "thead",
18979 		"tr", "ul", NULL };
18980 
18981 	for ( int iBlock=0; dBlock[iBlock]; iBlock++ )
18982 	{
18983 		const char * sTag = dBlock[iBlock];
18984 
18985 		// mark if known already
18986 		int iTag;
18987 		for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
18988 			if ( m_dTags[iTag].m_sTag==sTag )
18989 		{
18990 			m_dTags[iTag].m_bPara = true;
18991 			break;
18992 		}
18993 
18994 		// add if not known yet
18995 		if ( iTag==m_dTags.GetLength() )
18996 		{
18997 			m_dTags.Add();
18998 			m_dTags.Last().m_sTag = sTag;
18999 			m_dTags.Last().m_iTagLen = strlen(sTag);
19000 			m_dTags.Last().m_bPara = true;
19001 		}
19002 	}
19003 
19004 	UpdateTags ();
19005 }
19006 
19007 
SetZones(const char * sZones,CSphString & sError)19008 bool CSphHTMLStripper::SetZones ( const char * sZones, CSphString & sError )
19009 {
19010 	// yet another mini parser!
19011 	// index_zones = {tagname | prefix*} [, ...]
19012 	if ( !sZones || !*sZones )
19013 		return true;
19014 
19015 	const char * s = sZones;
19016 	while ( *s )
19017 	{
19018 		// skip spaces
19019 		while ( sphIsSpace(*s) )
19020 			s++;
19021 		if ( !*s )
19022 			break;
19023 
19024 		// expect ident
19025 		if ( !sphIsTagStart(*s) )
19026 		{
19027 			sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
19028 			return false;
19029 		}
19030 
19031 		// get ident (either tagname or prefix*)
19032 		const char * sTag = s;
19033 		while ( sphIsTag(*s) )
19034 			s++;
19035 
19036 		const char * sTagEnd = s;
19037 		bool bPrefix = false;
19038 		if ( *s=='*' )
19039 		{
19040 			s++;
19041 			bPrefix = true;
19042 		}
19043 
19044 		// skip spaces
19045 		while ( sphIsSpace(*s) )
19046 			s++;
19047 
19048 		// expect eof or comma after ident
19049 		if ( *s && *s!=',' )
19050 		{
19051 			sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
19052 			return false;
19053 		}
19054 		if ( *s==',' )
19055 			s++;
19056 
19057 		// got valid entry, handle it
19058 		CSphHTMLStripper::StripperTag_t & tTag = m_dTags.Add();
19059 		tTag.m_sTag.SetBinary ( sTag, sTagEnd-sTag );
19060 		tTag.m_iTagLen = (int)( sTagEnd-sTag );
19061 		tTag.m_bZone = true;
19062 		tTag.m_bZonePrefix = bPrefix;
19063 	}
19064 
19065 	UpdateTags ();
19066 	return true;
19067 }
19068 
19069 
/// skip a quoted attribute value
/// p must point to the opening apostrophe or quote
/// scans for the matching closing character, but no further than 512 bytes from the opening one;
/// returns the position right after the closing character when found; otherwise, the first
/// '>' or '\r' seen during the scan (the most probable end); otherwise, the position where the scan stopped
const BYTE * SkipQuoted ( const BYTE * p )
{
	const BYTE cQuote = *p; // either apostrophe or quote
	const BYTE * const pLimit = p + 512; // 512 bytes should be enough for a reasonable HTML attribute value, right?!
	const BYTE * pFallback = NULL; // where to stop if there is no matching quote
	const BYTE * pCur = p+1;

	for ( ; pCur<pLimit && *pCur && *pCur!=cQuote; pCur++ )
	{
		if ( !pFallback && ( *pCur=='>' || *pCur=='\r' ) )
			pFallback = pCur;
	}

	if ( *pCur==cQuote )
		return pCur+1;

	return pFallback ? pFallback : pCur;
}
19092 
19093 
/// one named HTML entity, as stored in the entity lookup table
struct HtmlEntity_t
{
	const char *	m_sName;	///< entity name, without the leading ampersand and the trailing semicolon
	int				m_iCode;	///< character code that the entity gets replaced with
};
19099 
19100 
HtmlEntityHash(const BYTE * str,int len)19101 static inline DWORD HtmlEntityHash ( const BYTE * str, int len )
19102 {
19103 	static const unsigned short asso_values[] =
19104 	{
19105 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19106 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19107 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19108 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19109 		421, 421, 421, 421, 421, 421, 421, 421, 421, 4,
19110 		6, 22, 1, 421, 421, 421, 421, 421, 421, 421,
19111 		421, 421, 421, 421, 421, 170, 48, 0, 5, 44,
19112 		0, 10, 10, 86, 421, 7, 0, 1, 42, 93,
19113 		41, 421, 0, 5, 8, 14, 421, 421, 5, 11,
19114 		8, 421, 421, 421, 421, 421, 421, 1, 25, 27,
19115 		9, 2, 113, 82, 14, 3, 179, 1, 81, 91,
19116 		12, 0, 1, 180, 56, 17, 5, 31, 60, 7,
19117 		3, 161, 2, 3, 421, 421, 421, 421, 421, 421,
19118 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19119 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19120 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19121 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19122 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19123 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19124 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19125 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19126 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19127 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19128 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19129 		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
19130 		421, 421, 421, 421, 421, 421, 421
19131 	};
19132 
19133 	register int hval = len;
19134 	switch ( hval )
19135 	{
19136 		default:	hval += asso_values [ str[4] ];
19137 		case 4:
19138 		case 3:		hval += asso_values [ str[2] ];
19139 		case 2:		hval += asso_values [ str[1]+1 ];
19140 		case 1:		hval += asso_values [ str[0] ];
19141 					break;
19142 	}
19143 	return hval + asso_values [ str[len-1] ];
19144 }
19145 
19146 
HtmlEntityLookup(const BYTE * str,int len)19147 static inline int HtmlEntityLookup ( const BYTE * str, int len )
19148 {
19149 	static const unsigned char lengthtable[] =
19150 	{
19151 		0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3,
19152 		4, 3, 3, 5, 3, 6, 5, 5, 3, 4, 4, 5, 3, 4,
19153 		4, 0, 5, 4, 5, 6, 5, 6, 4, 5, 3, 3, 5, 0,
19154 		0, 0, 0, 6, 0, 5, 5, 0, 5, 6, 6, 3, 0, 3,
19155 		5, 3, 0, 6, 0, 4, 3, 6, 3, 6, 6, 6, 6, 5,
19156 		5, 5, 5, 5, 5, 2, 6, 4, 0, 6, 3, 3, 3, 0,
19157 		4, 5, 4, 4, 4, 3, 7, 4, 3, 6, 2, 3, 6, 4,
19158 		3, 6, 5, 6, 5, 5, 4, 2, 0, 0, 4, 6, 8, 0,
19159 		0, 0, 5, 5, 0, 6, 6, 2, 2, 4, 4, 6, 6, 4,
19160 		4, 5, 6, 2, 3, 4, 6, 5, 0, 2, 0, 0, 6, 6,
19161 		6, 6, 6, 4, 6, 5, 0, 6, 4, 5, 4, 6, 6, 0,
19162 		0, 4, 6, 5, 6, 0, 6, 4, 5, 6, 5, 6, 4, 0,
19163 		3, 6, 0, 4, 4, 4, 5, 4, 6, 0, 4, 4, 6, 5,
19164 		6, 7, 2, 2, 6, 2, 5, 2, 5, 0, 0, 0, 4, 4,
19165 		2, 4, 2, 2, 4, 0, 4, 4, 4, 5, 5, 0, 3, 7,
19166 		5, 0, 5, 6, 5, 0, 6, 0, 6, 0, 4, 6, 4, 6,
19167 		6, 2, 6, 0, 5, 5, 4, 6, 6, 0, 5, 6, 4, 4,
19168 		4, 4, 0, 5, 0, 5, 0, 4, 5, 4, 0, 4, 4, 4,
19169 		0, 0, 0, 4, 0, 0, 0, 5, 6, 5, 3, 0, 0, 6,
19170 		5, 4, 5, 5, 5, 5, 0, 5, 5, 0, 5, 0, 0, 0,
19171 		4, 6, 0, 3, 0, 5, 5, 0, 0, 3, 6, 5, 0, 4,
19172 		0, 0, 0, 0, 5, 7, 5, 3, 5, 3, 0, 0, 6, 0,
19173 		6, 0, 0, 7, 0, 0, 5, 0, 5, 0, 0, 0, 0, 5,
19174 		4, 0, 0, 0, 0, 0, 7, 4, 0, 0, 3, 0, 0, 0,
19175 		3, 0, 6, 0, 0, 7, 5, 5, 0, 3, 0, 0, 0, 0,
19176 		0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5,
19177 		5, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19178 		0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19179 		0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0,
19180 		4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
19181 		5
19182 	};
19183 
19184 	static const struct HtmlEntity_t wordlist[] =
19185 	{
19186 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19187 		{"Rho", 929},
19188 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19189 		{"Chi", 935},
19190 		{"phi", 966},
19191 		{"iota", 953},
19192 		{"psi", 968},
19193 		{"int", 8747},
19194 		{"theta", 952},
19195 		{"amp", 38},
19196 		{"there4", 8756},
19197 		{"Theta", 920},
19198 		{"omega", 969},
19199 		{"and", 8743},
19200 		{"prop", 8733},
19201 		{"ensp", 8194},
19202 		{"image", 8465},
19203 		{"not", 172},
19204 		{"isin", 8712},
19205 		{"sdot", 8901},
19206 		{"", 0},
19207 		{"prime", 8242},
19208 		{"prod", 8719},
19209 		{"trade", 8482},
19210 		{"Scaron", 352},
19211 		{"kappa", 954},
19212 		{"thinsp", 8201},
19213 		{"emsp", 8195},
19214 		{"thorn", 254},
19215 		{"eta", 951},
19216 		{"chi", 967},
19217 		{"Kappa", 922},
19218 		{"", 0}, {"", 0}, {"", 0}, {"", 0},
19219 		{"scaron", 353},
19220 		{"", 0},
19221 		{"notin", 8713},
19222 		{"ndash", 8211},
19223 		{"", 0},
19224 		{"acute", 180},
19225 		{"otilde", 245},
19226 		{"atilde", 227},
19227 		{"Phi", 934},
19228 		{"", 0},
19229 		{"Psi", 936},
19230 		{"pound", 163},
19231 		{"cap", 8745},
19232 		{"", 0},
19233 		{"otimes", 8855},
19234 		{"", 0},
19235 		{"nbsp", 32},
19236 		{"rho", 961},
19237 		{"ntilde", 241},
19238 		{"eth", 240},
19239 		{"oacute", 243},
19240 		{"aacute", 225},
19241 		{"eacute", 233},
19242 		{"iacute", 237},
19243 		{"nabla", 8711},
19244 		{"Prime", 8243},
19245 		{"ocirc", 244},
19246 		{"acirc", 226},
19247 		{"ecirc", 234},
19248 		{"icirc", 238},
19249 		{"or", 8744},
19250 		{"Yacute", 221},
19251 		{"nsub", 8836},
19252 		{"", 0},
19253 		{"Uacute", 218},
19254 		{"Eta", 919},
19255 		{"ETH", 208},
19256 		{"sup", 8835},
19257 		{"", 0},
19258 		{"supe", 8839},
19259 		{"Ucirc", 219},
19260 		{"sup1", 185},
19261 		{"para", 182},
19262 		{"sup2", 178},
19263 		{"loz", 9674},
19264 		{"omicron", 959},
19265 		{"part", 8706},
19266 		{"cup", 8746},
19267 		{"Ntilde", 209},
19268 		{"Mu", 924},
19269 		{"tau", 964},
19270 		{"uacute", 250},
19271 		{"Iota", 921},
19272 		{"Tau", 932},
19273 		{"rsaquo", 8250},
19274 		{"alpha", 945},
19275 		{"Ccedil", 199},
19276 		{"ucirc", 251},
19277 		{"oline", 8254},
19278 		{"sup3", 179},
19279 		{"nu", 957},
19280 		{"", 0}, {"", 0},
19281 		{"sube", 8838},
19282 		{"Eacute", 201},
19283 		{"thetasym", 977},
19284 		{"", 0}, {"", 0}, {"", 0},
19285 		{"Omega", 937},
19286 		{"Ecirc", 202},
19287 		{"", 0},
19288 		{"lowast", 8727},
19289 		{"iquest", 191},
19290 		{"lt", 60},
19291 		{"gt", 62},
19292 		{"ordm", 186},
19293 		{"euro", 8364},
19294 		{"oslash", 248},
19295 		{"lsaquo", 8249},
19296 		{"zeta", 950},
19297 		{"cong", 8773},
19298 		{"mdash", 8212},
19299 		{"ccedil", 231},
19300 		{"ne", 8800},
19301 		{"sub", 8834},
19302 		{"Zeta", 918},
19303 		{"Lambda", 923},
19304 		{"Gamma", 915},
19305 		{"", 0},
19306 		{"Nu", 925},
19307 		{"", 0}, {"", 0},
19308 		{"ograve", 242},
19309 		{"agrave", 224},
19310 		{"egrave", 232},
19311 		{"igrave", 236},
19312 		{"frac14", 188},
19313 		{"ordf", 170},
19314 		{"Otilde", 213},
19315 		{"infin", 8734},
19316 		{"", 0},
19317 		{"frac12", 189},
19318 		{"beta", 946},
19319 		{"radic", 8730},
19320 		{"darr", 8595},
19321 		{"Iacute", 205},
19322 		{"Ugrave", 217},
19323 		{"", 0}, {"", 0},
19324 		{"harr", 8596},
19325 		{"hearts", 9829},
19326 		{"Icirc", 206},
19327 		{"Oacute", 211},
19328 		{"", 0},
19329 		{"frac34", 190},
19330 		{"cent", 162},
19331 		{"crarr", 8629},
19332 		{"curren", 164},
19333 		{"Ocirc", 212},
19334 		{"brvbar", 166},
19335 		{"sect", 167},
19336 		{"", 0},
19337 		{"ang", 8736},
19338 		{"ugrave", 249},
19339 		{"", 0},
19340 		{"Beta", 914},
19341 		{"uarr", 8593},
19342 		{"dArr", 8659},
19343 		{"asymp", 8776},
19344 		{"perp", 8869},
19345 		{"Dagger", 8225},
19346 		{"", 0},
19347 		{"hArr", 8660},
19348 		{"rang", 9002},
19349 		{"dagger", 8224},
19350 		{"exist", 8707},
19351 		{"Egrave", 200},
19352 		{"Omicron", 927},
19353 		{"mu", 956},
19354 		{"pi", 960},
19355 		{"weierp", 8472},
19356 		{"xi", 958},
19357 		{"clubs", 9827},
19358 		{"Xi", 926},
19359 		{"aring", 229},
19360 		{"", 0}, {"", 0}, {"", 0},
19361 		{"copy", 169},
19362 		{"uArr", 8657},
19363 		{"ni", 8715},
19364 		{"rarr", 8594},
19365 		{"le", 8804},
19366 		{"ge", 8805},
19367 		{"zwnj", 8204},
19368 		{"", 0},
19369 		{"apos", 39},
19370 		{"macr", 175},
19371 		{"lang", 9001},
19372 		{"gamma", 947},
19373 		{"Delta", 916},
19374 		{"", 0},
19375 		{"uml", 168},
19376 		{"alefsym", 8501},
19377 		{"delta", 948},
19378 		{"", 0},
19379 		{"bdquo", 8222},
19380 		{"lambda", 955},
19381 		{"equiv", 8801},
19382 		{"", 0},
19383 		{"Oslash", 216},
19384 		{"", 0},
19385 		{"hellip", 8230},
19386 		{"", 0},
19387 		{"rArr", 8658},
19388 		{"Atilde", 195},
19389 		{"larr", 8592},
19390 		{"spades", 9824},
19391 		{"Igrave", 204},
19392 		{"Pi", 928},
19393 		{"yacute", 253},
19394 		{"", 0},
19395 		{"diams", 9830},
19396 		{"sbquo", 8218},
19397 		{"fnof", 402},
19398 		{"Ograve", 210},
19399 		{"plusmn", 177},
19400 		{"", 0},
19401 		{"rceil", 8969},
19402 		{"Aacute", 193},
19403 		{"ouml", 246},
19404 		{"auml", 228},
19405 		{"euml", 235},
19406 		{"iuml", 239},
19407 		{"", 0},
19408 		{"Acirc", 194},
19409 		{"", 0},
19410 		{"rdquo", 8221},
19411 		{"", 0},
19412 		{"lArr", 8656},
19413 		{"rsquo", 8217},
19414 		{"Yuml", 376},
19415 		{"", 0},
19416 		{"quot", 34},
19417 		{"Uuml", 220},
19418 		{"bull", 8226},
19419 		{"", 0}, {"", 0}, {"", 0},
19420 		{"real", 8476},
19421 		{"", 0}, {"", 0}, {"", 0},
19422 		{"lceil", 8968},
19423 		{"permil", 8240},
19424 		{"upsih", 978},
19425 		{"sum", 8721},
19426 		{"", 0}, {"", 0},
19427 		{"divide", 247},
19428 		{"raquo", 187},
19429 		{"uuml", 252},
19430 		{"ldquo", 8220},
19431 		{"Alpha", 913},
19432 		{"szlig", 223},
19433 		{"lsquo", 8216},
19434 		{"", 0},
19435 		{"Sigma", 931},
19436 		{"tilde", 732},
19437 		{"", 0},
19438 		{"THORN", 222},
19439 		{"", 0}, {"", 0}, {"", 0},
19440 		{"Euml", 203},
19441 		{"rfloor", 8971},
19442 		{"", 0},
19443 		{"lrm", 8206},
19444 		{"", 0},
19445 		{"sigma", 963},
19446 		{"iexcl", 161},
19447 		{"", 0}, {"", 0},
19448 		{"deg", 176},
19449 		{"middot", 183},
19450 		{"laquo", 171},
19451 		{"", 0},
19452 		{"circ", 710},
19453 		{"", 0}, {"", 0}, {"", 0}, {"", 0},
19454 		{"frasl", 8260},
19455 		{"epsilon", 949},
19456 		{"oplus", 8853},
19457 		{"yen", 165},
19458 		{"micro", 181},
19459 		{"piv", 982},
19460 		{"", 0}, {"", 0},
19461 		{"lfloor", 8970},
19462 		{"", 0},
19463 		{"Agrave", 192},
19464 		{"", 0}, {"", 0},
19465 		{"Upsilon", 933},
19466 		{"", 0}, {"", 0},
19467 		{"times", 215},
19468 		{"", 0},
19469 		{"cedil", 184},
19470 		{"", 0}, {"", 0}, {"", 0}, {"", 0},
19471 		{"minus", 8722},
19472 		{"Iuml", 207},
19473 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19474 		{"upsilon", 965},
19475 		{"Ouml", 214},
19476 		{"", 0}, {"", 0},
19477 		{"rlm", 8207},
19478 		{"", 0}, {"", 0}, {"", 0},
19479 		{"reg", 174},
19480 		{"", 0},
19481 		{"forall", 8704},
19482 		{"", 0}, {"", 0},
19483 		{"Epsilon", 917},
19484 		{"empty", 8709},
19485 		{"OElig", 338},
19486 		{"", 0},
19487 		{"shy", 173},
19488 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19489 		{"", 0}, {"", 0}, {"", 0}, {"", 0},
19490 		{"Aring", 197},
19491 		{"", 0}, {"", 0}, {"", 0},
19492 		{"oelig", 339},
19493 		{"aelig", 230},
19494 		{"", 0},
19495 		{"zwj", 8205},
19496 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19497 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19498 		{"sim", 8764},
19499 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19500 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19501 		{"yuml", 255},
19502 		{"sigmaf", 962},
19503 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19504 		{"Auml", 196},
19505 		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
19506 		{"", 0}, {"", 0}, {"", 0}, {"", 0},
19507 		{"AElig", 198}
19508 	};
19509 
19510 	const int MIN_WORD_LENGTH		= 2;
19511 	const int MAX_WORD_LENGTH		= 8;
19512 	const int MAX_HASH_VALUE		= 420;
19513 
19514 	if ( len<=MAX_WORD_LENGTH && len>=MIN_WORD_LENGTH )
19515 	{
19516 		register int key = HtmlEntityHash ( str, len );
19517 		if ( key<=MAX_HASH_VALUE && key>=0 )
19518 			if ( len==lengthtable[key] )
19519 		{
19520 			register const char * s = wordlist[key].m_sName;
19521 			if ( *str==*s && !memcmp ( str+1, s+1, len-1 ) )
19522 				return wordlist[key].m_iCode;
19523 		}
19524 	}
19525 	return 0;
19526 }
19527 
19528 
Strip(BYTE * sData) const19529 void CSphHTMLStripper::Strip ( BYTE * sData ) const
19530 {
19531 	const BYTE * s = sData;
19532 	BYTE * d = sData;
19533 	for ( ;; )
19534 	{
19535 		/////////////////////////////////////
19536 		// scan until eof, or tag, or entity
19537 		/////////////////////////////////////
19538 
19539 		while ( *s && *s!='<' && *s!='&' )
19540 		{
19541 			if ( *s>=0x20 )
19542 				*d++ = *s;
19543 			else
19544 				*d++ = ' ';
19545 			s++;
19546 		}
19547 		if ( !*s )
19548 			break;
19549 
19550 		/////////////////
19551 		// handle entity
19552 		/////////////////
19553 
19554 		if ( *s=='&' )
19555 		{
19556 			if ( s[1]=='#' )
19557 			{
19558 				// handle "&#number;" form
19559 				int iCode = 0;
19560 				s += 2;
19561 				while ( isdigit(*s) )
19562 					iCode = iCode*10 + (*s++) - '0';
19563 
19564 				if ( ( iCode>=0 && iCode<=0x1f ) || *s!=';' ) // 0-31 are reserved codes
19565 					continue;
19566 
19567 				d += sphUTF8Encode ( d, iCode );
19568 				s++;
19569 
19570 			} else
19571 			{
19572 				// skip until ';' or max length
19573 				if ( ( s[1]>='a' && s[1]<='z' ) || ( s[1]>='A' && s[1]<='Z' ) )
19574 				{
19575 					const int MAX_ENTITY_LEN = 8;
19576 					const BYTE * sStart = s+1;
19577 					while ( *s && *s!=';' && s-sStart<=MAX_ENTITY_LEN )
19578 						s++;
19579 
19580 					if ( *s==';' )
19581 					{
19582 						int iCode = HtmlEntityLookup ( sStart, (int)(s-sStart) );
19583 						if ( iCode>0 )
19584 						{
19585 							// this is a known entity; encode it
19586 							d += sphUTF8Encode ( d, iCode );
19587 							s++;
19588 							continue;
19589 						}
19590 					}
19591 
19592 					// rollback
19593 					s = sStart-1;
19594 				}
19595 
19596 				// if we're here, it's not an entity; pass the leading ampersand and rescan
19597 				*d++ = *s++;
19598 			}
19599 			continue;
19600 		}
19601 
19602 		//////////////
19603 		// handle tag
19604 		//////////////
19605 
19606 		assert ( *s=='<' );
19607 		if ( GetCharIndex(s[1])<0 )
19608 		{
19609 			if ( s[1]=='/' )
19610 			{
19611 				// check if it's valid closing tag
19612 				if ( GetCharIndex(s[2])<0 )
19613 				{
19614 					*d++ = *s++;
19615 					continue;
19616 				}
19617 
19618 			} else if ( s[1]=='!' )
19619 			{
19620 				if ( s[2]=='-' && s[3]=='-' )
19621 				{
19622 					// it's valid comment; scan until comment end
19623 					s += 4; // skip opening '<!--'
19624 					while ( *s )
19625 					{
19626 						if ( s[0]=='-' && s[1]=='-' && s[2]=='>' )
19627 							break;
19628 						s++;
19629 					}
19630 					if ( !*s )
19631 						break;
19632 					s += 3; // skip closing '-->'
19633 					continue;
19634 
19635 				} else if ( isalpha(s[2]) )
19636 				{
19637 					// it's <!doctype> style PI; scan until PI end
19638 					s += 2;
19639 					while ( *s && *s!='>' )
19640 					{
19641 						if ( *s=='\'' || *s=='"' )
19642 						{
19643 							s = SkipQuoted ( s );
19644 							while ( isspace(*s) ) s++;
19645 						} else
19646 						{
19647 							s++;
19648 						}
19649 					}
19650 					if ( *s=='>' )
19651 						s++;
19652 					continue;
19653 
19654 				} else
19655 				{
19656 					// it's something malformed; just ignore
19657 					*d++ = *s++;
19658 					continue;
19659 				}
19660 
19661 			} else if ( s[1]=='?' )
19662 			{
19663 				// scan until PI end
19664 				s += 2; // skip opening '<?'
19665 				while ( *s )
19666 				{
19667 					if ( s[0]=='?' && s[1]=='>' )
19668 						break;
19669 					s++;
19670 				}
19671 				if ( !*s )
19672 					break;
19673 				s += 2; // skip closing '?>'
19674 				continue;
19675 
19676 			} else
19677 			{
19678 				// simply malformed
19679 				*d++ = *s++;
19680 				continue;
19681 			}
19682 		}
19683 		s++; // skip '<'
19684 
19685 		//////////////////////////////////////
19686 		// lookup this tag in known tags list
19687 		//////////////////////////////////////
19688 
19689 		const StripperTag_t * pTag = NULL;
19690 		int iZoneNameLen = 0;
19691 		const BYTE * sZoneName = NULL;
19692 		s = FindTag ( s, &pTag, &sZoneName, &iZoneNameLen );
19693 
19694 		/////////////////////////////////////
19695 		// process tag contents
19696 		// index attributes if needed
19697 		// gracefully handle malformed stuff
19698 		/////////////////////////////////////
19699 
19700 #define LOC_SKIP_SPACES() { while ( sphIsSpace(*s) ) s++; if ( !*s || *s=='>' ) break; }
19701 
19702 		bool bIndexAttrs = ( pTag && pTag->m_bIndexAttrs );
19703 		while ( *s && *s!='>' )
19704 		{
19705 			LOC_SKIP_SPACES();
19706 			if ( sphIsTagStart(*s) )
19707 			{
19708 				// skip attribute name while it's valid
19709 				const BYTE * sAttr = s;
19710 				while ( sphIsTag(*s) )
19711 					s++;
19712 
19713 				// blanks or a value after a valid attribute name?
19714 				if ( sphIsSpace(*s) || *s=='=' )
19715 				{
19716 					const int iAttrLen = (int)( s - sAttr );
19717 					LOC_SKIP_SPACES();
19718 
19719 					// a valid name but w/o a value; keep scanning
19720 					if ( *s!='=' )
19721 						continue;
19722 
19723 					// got value!
19724 					s++;
19725 					LOC_SKIP_SPACES();
19726 
19727 					// check attribute name
19728 					// OPTIMIZE! remove linear search
19729 					int iAttr = -1;
19730 					if ( bIndexAttrs )
19731 					{
19732 						for ( iAttr=0; iAttr<pTag->m_dAttrs.GetLength(); iAttr++ )
19733 						{
19734 							int iLen = strlen ( pTag->m_dAttrs[iAttr].cstr() );
19735 							if ( iLen==iAttrLen && !strncasecmp ( pTag->m_dAttrs[iAttr].cstr(), (const char*)sAttr, iLen ) )
19736 								break;
19737 						}
19738 						if ( iAttr==pTag->m_dAttrs.GetLength() )
19739 							iAttr = -1;
19740 					}
19741 
19742 					// process the value
19743 					const BYTE * sVal = s;
19744 					if ( *s=='\'' || *s=='"' )
19745 					{
19746 						// skip quoted value until a matching quote
19747 						s = SkipQuoted ( s );
19748 					} else
19749 					{
19750 						// skip unquoted value until tag end or whitespace
19751 						while ( *s && *s!='>' && !sphIsSpace(*s) )
19752 							s++;
19753 					}
19754 
19755 					// if this one is to be indexed, copy it
19756 					if ( iAttr>=0 )
19757 					{
19758 						const BYTE * sMax = s;
19759 						if ( *sVal=='\'' || *sVal=='"' )
19760 						{
19761 							if ( sMax[-1]==sVal[0] )
19762 								sMax--;
19763 							sVal++;
19764 						}
19765 						while ( sVal<sMax )
19766 							*d++ = *sVal++;
19767 						*d++ = ' ';
19768 					}
19769 
19770 					// handled the value; keep scanning
19771 					continue;
19772 				}
19773 
19774 				// nope, got an invalid character in the sequence (or maybe eof)
19775 				// fall through to an invalid name handler
19776 			}
19777 
19778 			// keep skipping until tag end or whitespace
19779 			while ( *s && *s!='>' && !sphIsSpace(*s) )
19780 				s++;
19781 		}
19782 
19783 #undef LOC_SKIP_SPACES
19784 
19785 		// skip closing angle bracket, if any
19786 		if ( *s )
19787 			s++;
19788 
19789 		// unknown tag is done; others might require a bit more work
19790 		if ( !pTag )
19791 		{
19792 			*d++ = ' '; // unknown tags are *not* inline by default
19793 			continue;
19794 		}
19795 
19796 		// handle zones
19797 		if ( pTag->m_bZone )
19798 		{
19799 			// should be at tag's end
19800 			assert ( s[0]=='\0' || s[-1]=='>' );
19801 
19802 			// emit secret codes
19803 			*d++ = MAGIC_CODE_ZONE;
19804 			for ( int i=0; i<iZoneNameLen; i++ )
19805 				*d++ = (BYTE) tolower ( sZoneName[i] );
19806 			if ( *d )
19807 				*d++ = MAGIC_CODE_ZONE;
19808 
19809 			if ( !*s )
19810 				break;
19811 			continue;
19812 		}
19813 
19814 		// handle paragraph boundaries
19815 		if ( pTag->m_bPara )
19816 		{
19817 			*d++ = MAGIC_CODE_PARAGRAPH;
19818 			continue;
19819 		}
19820 
19821 		// in all cases, the tag must be fully processed at this point
19822 		// not a remove-tag? we're done
19823 		if ( !pTag->m_bRemove )
19824 		{
19825 			if ( !pTag->m_bInline )
19826 				*d++ = ' ';
19827 			continue;
19828 		}
19829 
19830 		// sudden eof? bail out
19831 		if ( !*s )
19832 			break;
19833 
19834 		// must be a proper remove-tag end, then
19835 		assert ( pTag->m_bRemove && s[-1]=='>' );
19836 
19837 		// short-form? we're done
19838 		if ( s[-2]=='/' )
19839 			continue;
19840 
19841 		// skip everything until the closing tag
19842 		// FIXME! should we handle insane cases with quoted closing tag within tag?
19843 		for ( ;; )
19844 		{
19845 			while ( *s && ( s[0]!='<' || s[1]!='/' ) ) s++;
19846 			if ( !*s ) break;
19847 
19848 			s += 2; // skip </
19849 			if ( strncasecmp ( pTag->m_sTag.cstr(), (const char*)s, pTag->m_iTagLen )!=0 ) continue;
19850 			if ( !sphIsTag ( s[pTag->m_iTagLen] ) )
19851 			{
19852 				s += pTag->m_iTagLen; // skip tag
19853 				if ( *s=='>' ) s++;
19854 				break;
19855 			}
19856 		}
19857 
19858 		if ( !pTag->m_bInline ) *d++ = ' ';
19859 	}
19860 	*d++ = '\0';
19861 
19862 	// space, paragraph sequences elimination pass
19863 	s = sData;
19864 	d = sData;
19865 	bool bSpaceOut = false;
19866 	bool bParaOut = false;
19867 	bool bZoneOut = false;
19868 	while ( const char c = *s++ )
19869 	{
19870 		assert ( d<=s-1 );
19871 
19872 		// handle different character classes
19873 		if ( sphIsSpace(c) )
19874 		{
19875 			// handle whitespace, skip dupes
19876 			if ( !bSpaceOut )
19877 				*d++ = ' ';
19878 
19879 			bSpaceOut = true;
19880 			continue;
19881 
19882 		} else if ( c==MAGIC_CODE_PARAGRAPH )
19883 		{
19884 			// handle paragraph marker, skip dupes
19885 			if ( !bParaOut && !bZoneOut )
19886 			{
19887 				*d++ = c;
19888 				bParaOut = true;
19889 			}
19890 
19891 			bSpaceOut = true;
19892 			continue;
19893 
19894 		} else if ( c==MAGIC_CODE_ZONE )
19895 		{
19896 			// zone marker
19897 			// rewind preceding paragraph, if any, it is redundant
19898 			if ( bParaOut )
19899 			{
19900 				assert ( d>sData && d[-1]==MAGIC_CODE_PARAGRAPH );
19901 				d--;
19902 			}
19903 
19904 			// copy \4zoneid\4
19905 			*d++ = c;
19906 			while ( *s && *s!=MAGIC_CODE_ZONE )
19907 				*d++ = *s++;
19908 
19909 			if ( *s )
19910 				*d++ = *s++;
19911 
19912 			// update state
19913 			// no spaces paragraphs allowed
19914 			bSpaceOut = bZoneOut = true;
19915 			bParaOut = false;
19916 			continue;
19917 
19918 		} else
19919 		{
19920 			*d++ = c;
19921 			bSpaceOut = bParaOut = bZoneOut = false;
19922 		}
19923 	}
19924 	*d++ = '\0';
19925 }
19926 
/// look up the tag at the given position in the known tags list
///
/// sSrc points right after the opening angle bracket; it might begin with a slash (closing tag)
/// ppTag receives the matching tag descriptor; it is left untouched when nothing matches
/// ppZoneName always receives sSrc (so the slash of a closing tag is a part of the zone name)
/// pZoneNameLen receives the zone name length for zone tags, and 0 otherwise
///
/// a tag matches either exactly (case-insensitively, with no more name characters after it),
/// or by prefix, if it is a prefix zone
///
/// returns the position right after the matched tag name, or sSrc when nothing matches
const BYTE * CSphHTMLStripper::FindTag ( const BYTE * sSrc, const StripperTag_t ** ppTag, const BYTE ** ppZoneName, int * pZoneNameLen ) const
{
	assert ( sSrc && ppTag && ppZoneName && pZoneNameLen );
	assert ( sSrc[0]!='/' || sSrc[1]!='\0' );

	*ppZoneName = sSrc;
	*pZoneNameLen = 0;

	// closing tags are looked up by the name that follows the slash
	const BYTE * sName = sSrc;
	if ( *sName=='/' )
		sName++;

	const int iSlot = GetCharIndex ( sName[0] );
	assert ( iSlot>=0 && iSlot<MAX_CHAR_INDEX );

	// no known tags begin with this character
	if ( m_dEnd[iSlot]<0 )
		return sSrc;

	const int iLast = m_dEnd[iSlot];
	for ( int iTag=m_dStart[iSlot]; iTag<=iLast; iTag++ )
	{
		const int iLen = m_dTags[iTag].m_iTagLen;
		const int iCmp = strncasecmp ( m_dTags[iTag].m_sTag.cstr(), (const char*)sName, iLen );

		// the tags are sorted; so if current candidate is already greater, rest can be skipped
		if ( iCmp>0 )
			return sSrc;

		// not there yet
		if ( iCmp<0 )
			continue;

		// the candidate matches the beginning of the name
		const BYTE * sNameEnd = sName + iLen;

		// got exact match?
		if ( !sphIsTag ( *sNameEnd ) )
		{
			*ppTag = m_dTags.Begin() + iTag;
			if ( m_dTags[iTag].m_bZone )
				*pZoneNameLen = sNameEnd - *ppZoneName;
			return sNameEnd;
		}

		// got wildcard match?
		if ( m_dTags[iTag].m_bZonePrefix )
		{
			*ppTag = m_dTags.Begin() + iTag;
			while ( sphIsTag ( *sNameEnd ) )
				sNameEnd++;
			*pZoneNameLen = sNameEnd - *ppZoneName;
			return sNameEnd;
		}
	}

	return sSrc;
}
19983 
IsValidTagStart(int iCh) const19984 bool CSphHTMLStripper::IsValidTagStart ( int iCh ) const
19985 {
19986 	int i = GetCharIndex ( iCh );
19987 	return ( i>=0 && i<MAX_CHAR_INDEX );
19988 }
19989 
19990 
19991 /////////////////////////////////////////////////////////////////////////////
19992 // GENERIC SOURCE
19993 /////////////////////////////////////////////////////////////////////////////
19994 
CSphSourceSettings()19995 CSphSourceSettings::CSphSourceSettings ()
19996 	: m_iMinPrefixLen ( 0 )
19997 	, m_iMinInfixLen ( 0 )
19998 	, m_iBoundaryStep ( 0 )
19999 	, m_bIndexExactWords ( false )
20000 	, m_iOvershortStep ( 1 )
20001 	, m_iStopwordStep ( 1 )
20002 	, m_bIndexSP ( false )
20003 {}
20004 
20005 
GetWordpart(const char * sField,bool bWordDict)20006 ESphWordpart CSphSourceSettings::GetWordpart ( const char * sField, bool bWordDict )
20007 {
20008 	if ( bWordDict )
20009 		return SPH_WORDPART_WHOLE;
20010 
20011 	bool bPrefix = ( m_iMinPrefixLen>0 ) && ( m_dPrefixFields.GetLength()==0 || m_dPrefixFields.Contains ( sField ) );
20012 	bool bInfix = ( m_iMinInfixLen>0 ) && ( m_dInfixFields.GetLength()==0 || m_dInfixFields.Contains ( sField ) );
20013 
20014 	assert ( !( bPrefix && bInfix ) ); // no field must be marked both prefix and infix
20015 	if ( bPrefix )
20016 		return SPH_WORDPART_PREFIX;
20017 	if ( bInfix )
20018 		return SPH_WORDPART_INFIX;
20019 	return SPH_WORDPART_WHOLE;
20020 }
20021 
20022 //////////////////////////////////////////////////////////////////////////
20023 
CSphSource(const char * sName)20024 CSphSource::CSphSource ( const char * sName )
20025 	: m_pTokenizer ( NULL )
20026 	, m_pDict ( NULL )
20027 	, m_tSchema ( sName )
20028 	, m_bStripHTML ( false )
20029 	, m_iNullIds ( 0 )
20030 	, m_iMaxIds ( 0 )
20031 {
20032 	m_pStripper = new CSphHTMLStripper ( true );
20033 }
20034 
20035 
~CSphSource()20036 CSphSource::~CSphSource()
20037 {
20038 	delete m_pStripper;
20039 }
20040 
20041 
SetDict(CSphDict * pDict)20042 void CSphSource::SetDict ( CSphDict * pDict )
20043 {
20044 	assert ( pDict );
20045 	m_pDict = pDict;
20046 }
20047 
20048 
GetStats()20049 const CSphSourceStats & CSphSource::GetStats ()
20050 {
20051 	return m_tStats;
20052 }
20053 
20054 
SetStripHTML(const char * sExtractAttrs,const char * sRemoveElements,bool bDetectParagraphs,const char * sZones,CSphString & sError)20055 bool CSphSource::SetStripHTML ( const char * sExtractAttrs, const char * sRemoveElements, bool bDetectParagraphs, const char * sZones, CSphString & sError )
20056 {
20057 	if ( !m_pStripper->SetIndexedAttrs ( sExtractAttrs, sError ) )
20058 		return false;
20059 
20060 	if ( !m_pStripper->SetRemovedElements ( sRemoveElements, sError ) )
20061 		return false;
20062 
20063 	if ( bDetectParagraphs )
20064 		m_pStripper->EnableParagraphs ();
20065 
20066 	if ( !m_pStripper->SetZones ( sZones, sError ) )
20067 		return false;
20068 
20069 	m_bStripHTML = true;
20070 	return true;
20071 }
20072 
20073 
SetTokenizer(ISphTokenizer * pTokenizer)20074 void CSphSource::SetTokenizer ( ISphTokenizer * pTokenizer )
20075 {
20076 	assert ( pTokenizer );
20077 	m_pTokenizer = pTokenizer;
20078 }
20079 
20080 
UpdateSchema(CSphSchema * pInfo,CSphString & sError)20081 bool CSphSource::UpdateSchema ( CSphSchema * pInfo, CSphString & sError )
20082 {
20083 	assert ( pInfo );
20084 
20085 	// fill it
20086 	if ( pInfo->m_dFields.GetLength()==0 && pInfo->GetAttrsCount()==0 )
20087 	{
20088 		*pInfo = m_tSchema;
20089 		return true;
20090 	}
20091 
20092 	// check it
20093 	return m_tSchema.CompareTo ( *pInfo, sError );
20094 }
20095 
20096 
Setup(const CSphSourceSettings & tSettings)20097 void CSphSource::Setup ( const CSphSourceSettings & tSettings )
20098 {
20099 	m_iMinPrefixLen = Max ( tSettings.m_iMinPrefixLen, 0 );
20100 	m_iMinInfixLen = Max ( tSettings.m_iMinInfixLen, 0 );
20101 	m_iBoundaryStep = Max ( tSettings.m_iBoundaryStep, -1 );
20102 	m_bIndexExactWords = tSettings.m_bIndexExactWords;
20103 	m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );
20104 	m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );
20105 	m_bIndexSP = tSettings.m_bIndexSP;
20106 	m_dPrefixFields = tSettings.m_dPrefixFields;
20107 	m_dInfixFields = tSettings.m_dInfixFields;
20108 }
20109 
20110 
VerifyID(SphDocID_t uID)20111 SphDocID_t CSphSource::VerifyID ( SphDocID_t uID )
20112 {
20113 	if ( uID==0 )
20114 	{
20115 		m_iNullIds++;
20116 		return 0;
20117 	}
20118 
20119 	if ( uID==DOCID_MAX )
20120 	{
20121 		m_iMaxIds++;
20122 		return 0;
20123 	}
20124 
20125 	return uID;
20126 }
20127 
20128 
IterateJoinedHits(CSphString &)20129 ISphHits * CSphSource::IterateJoinedHits ( CSphString & )
20130 {
20131 	static ISphHits dDummy;
20132 	m_tDocInfo.m_iDocID = 0; // pretend that's an eof
20133 	return &dDummy;
20134 }
20135 
20136 /////////////////////////////////////////////////////////////////////////////
20137 // DOCUMENT SOURCE
20138 /////////////////////////////////////////////////////////////////////////////
20139 
/// write sLine to fp as a single-quoted literal
/// tab, single quote, and backslash are each prefixed with a backslash;
/// NULL or empty input is written as two quotes
static void FormatEscaped ( FILE * fp, const char * sLine )
{
	// handle empty lines
	if ( !sLine || !*sLine )
	{
		fputs ( "''", fp );
		return;
	}

	// pass one, count the needed buffer size
	// all lengths are size_t, so that a very long line never gets narrowed to int
	const size_t iLen = strlen ( sLine );
	size_t iOut = 2; // opening and closing quotes
	for ( size_t i=0; i<iLen; i++ )
	{
		const char c = sLine[i];
		iOut += ( c=='\t' || c=='\'' || c=='\\' ) ? 2 : 1;
	}

	// short lines are escaped on the stack; long ones get a heap buffer of the exact size
	char sMinibuffer[8192];
	char * sMaxibuffer = NULL;
	char * sBuffer = sMinibuffer;

	if ( iOut>sizeof(sMinibuffer) )
	{
		sMaxibuffer = new char [ iOut ];
		sBuffer = sMaxibuffer;
	}

	// pass two, escape it
	char * sOut = sBuffer;
	*sOut++ = '\'';

	for ( size_t i=0; i<iLen; i++ )
	{
		const char c = sLine[i];
		if ( c=='\t' || c=='\'' || c=='\\' )
			*sOut++ = '\\';
		*sOut++ = c;
	}
	*sOut++ = '\'';

	// print! (the buffer is not zero-terminated, so write it by length)
	assert ( sOut==sBuffer+iOut );
	fwrite ( sBuffer, 1, iOut, fp );

	// cleanup; deleting NULL is a no-op when the stack buffer was used
	delete [] sMaxibuffer;
}
20199 
CSphBuildHitsState_t()20200 CSphSource_Document::CSphBuildHitsState_t::CSphBuildHitsState_t ()
20201 	: m_bProcessingHits ( false )
20202 	, m_bDocumentDone ( false )
20203 	, m_dFields ( NULL )
20204 	, m_iStartPos ( 0 )
20205 	, m_iHitPos ( 0 )
20206 	, m_iField ( 0 )
20207 	, m_iStartField ( 0 )
20208 	, m_iEndField ( 0 )
20209 	, m_iBuildLastStep ( 1 )
20210 {
20211 }
20212 
CSphSource_Document(const char * sName)20213 CSphSource_Document::CSphSource_Document ( const char * sName )
20214 	: CSphSource ( sName )
20215 	, m_pReadFileBuffer ( NULL )
20216 	, m_iReadFileBufferSize ( 256 * 1024 )
20217 	, m_iMaxFileBufferSize ( 2 * 1024 * 1024 )
20218 	, m_eOnFileFieldError ( FFE_IGNORE_FIELD )
20219 	, m_fpDumpRows ( NULL )
20220 	, m_iPlainFieldsLength ( 0 )
20221 	, m_iMaxHits ( MAX_SOURCE_HITS )
20222 {
20223 }
20224 
20225 
IterateDocument(CSphString & sError)20226 bool CSphSource_Document::IterateDocument ( CSphString & sError )
20227 {
20228 	assert ( m_pTokenizer );
20229 	assert ( !m_tState.m_bProcessingHits );
20230 	PROFILE ( src_document );
20231 
20232 	m_tHits.m_dData.Resize ( 0 );
20233 
20234 	m_tState = CSphBuildHitsState_t();
20235 	m_tState.m_iEndField = m_iPlainFieldsLength;
20236 
20237 	m_dMva.Resize ( 1 ); // must not have zero offset
20238 
20239 	// fetch next document
20240 	for ( ;; )
20241 	{
20242 		m_tState.m_dFields = NextDocument ( sError );
20243 		if ( m_tDocInfo.m_iDocID==0 )
20244 			return true;
20245 
20246 		if ( !m_tState.m_dFields )
20247 			return false;
20248 
20249 		// tricky bit
20250 		// we can only skip document indexing from here, IterateHits() is too late
20251 		// so in case the user chose to skip documents with file field problems
20252 		// we need to check for those here
20253 		if ( m_eOnFileFieldError==FFE_SKIP_DOCUMENT || m_eOnFileFieldError==FFE_FAIL_INDEX )
20254 		{
20255 			bool bOk = true;
20256 			for ( int iField=0; iField<m_tState.m_iEndField && bOk; iField++ )
20257 			{
20258 				const BYTE * sFilename = m_tState.m_dFields[iField];
20259 				if ( m_tSchema.m_dFields[iField].m_bFilename )
20260 					bOk &= CheckFileField ( sFilename );
20261 
20262 				if ( !bOk && m_eOnFileFieldError==FFE_FAIL_INDEX )
20263 				{
20264 					sError.SetSprintf ( "error reading file field data (docid=" DOCID_FMT ", filename=%s)",
20265 						m_tDocInfo.m_iDocID, sFilename );
20266 					return false;
20267 				}
20268 			}
20269 			if ( !bOk && m_eOnFileFieldError==FFE_SKIP_DOCUMENT )
20270 				continue;
20271 		}
20272 
20273 		// we're good
20274 		break;
20275 	}
20276 
20277 	m_tStats.m_iTotalDocuments++;
20278 	return true;
20279 }
20280 
20281 
IterateHits(CSphString & sError)20282 ISphHits * CSphSource_Document::IterateHits ( CSphString & sError )
20283 {
20284 	if ( m_tState.m_bDocumentDone )
20285 		return NULL;
20286 
20287 	m_tHits.m_dData.Resize ( 0 );
20288 
20289 	BuildHits ( sError, false );
20290 
20291 	return &m_tHits;
20292 }
20293 
20294 
CheckFileField(const BYTE * sField)20295 bool CSphSource_Document::CheckFileField ( const BYTE * sField )
20296 {
20297 	CSphAutofile tFileSource;
20298 	CSphString sError;
20299 
20300 	if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
20301 	{
20302 		sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_iDocID, sError.cstr() );
20303 		return false;
20304 	}
20305 
20306 	int64_t iFileSize = tFileSource.GetSize();
20307 	if ( iFileSize+16 > m_iMaxFileBufferSize )
20308 	{
20309 		sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size="INT64_FMT", max_file_field_buffer=%d)",
20310 			m_tDocInfo.m_iDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
20311 		return false;
20312 	}
20313 
20314 	return true;
20315 }
20316 
20317 
20318 /// returns file size on success, and replaces *ppField with a pointer to data
20319 /// returns -1 on failure (and emits a warning)
LoadFileField(BYTE ** ppField,CSphString & sError)20320 int CSphSource_Document::LoadFileField ( BYTE ** ppField, CSphString & sError )
20321 {
20322 	CSphAutofile tFileSource;
20323 
20324 	BYTE * sField = *ppField;
20325 	if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
20326 	{
20327 		sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_iDocID, sError.cstr() );
20328 		return -1;
20329 	}
20330 
20331 	int64_t iFileSize = tFileSource.GetSize();
20332 	if ( iFileSize+16 > m_iMaxFileBufferSize )
20333 	{
20334 		sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size="INT64_FMT", max_file_field_buffer=%d)",
20335 			m_tDocInfo.m_iDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
20336 		return -1;
20337 	}
20338 
20339 	int iFieldBytes = (int)iFileSize;
20340 	if ( !iFieldBytes )
20341 		return 0;
20342 
20343 	int iBufSize = Max ( m_iReadFileBufferSize, 1 << sphLog2 ( iFieldBytes+15 ) );
20344 	if ( m_iReadFileBufferSize < iBufSize )
20345 		SafeDeleteArray ( m_pReadFileBuffer );
20346 
20347 	if ( !m_pReadFileBuffer )
20348 	{
20349 		m_pReadFileBuffer = new char [ iBufSize ];
20350 		m_iReadFileBufferSize = iBufSize;
20351 	}
20352 
20353 	if ( !tFileSource.Read ( m_pReadFileBuffer, iFieldBytes, sError ) )
20354 	{
20355 		sphWarning ( "docid=" DOCID_FMT ": read failed: %s", m_tDocInfo.m_iDocID, sError.cstr() );
20356 		return -1;
20357 	}
20358 
20359 	m_pReadFileBuffer[iFieldBytes] = '\0';
20360 
20361 	*ppField = (BYTE*)m_pReadFileBuffer;
20362 	return iFieldBytes;
20363 }
20364 
20365 //////////////////////////////////////////////////////////////////////////
20366 // HIT GENERATORS
20367 //////////////////////////////////////////////////////////////////////////
20368 
BuildZoneHits(SphDocID_t uDocid,BYTE * sWord)20369 bool CSphSource_Document::BuildZoneHits ( SphDocID_t uDocid, BYTE * sWord )
20370 {
20371 	if ( *sWord==MAGIC_CODE_SENTENCE || *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
20372 	{
20373 		m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_SENTENCE ), m_tState.m_iHitPos );
20374 
20375 		if ( *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
20376 			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_PARAGRAPH ), m_tState.m_iHitPos );
20377 
20378 		if ( *sWord==MAGIC_CODE_ZONE )
20379 		{
20380 			BYTE * pZone = (BYTE*) m_pTokenizer->GetBufferPtr();
20381 			BYTE * pEnd = pZone;
20382 			while ( *pEnd && *pEnd!=MAGIC_CODE_ZONE )
20383 			{
20384 				pEnd++;
20385 			}
20386 
20387 			if ( *pEnd && *pEnd==MAGIC_CODE_ZONE )
20388 			{
20389 				*pEnd = '\0';
20390 				m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( pZone-1 ), m_tState.m_iHitPos );
20391 				m_pTokenizer->SetBufferPtr ( (const char*) pEnd+1 );
20392 			}
20393 		}
20394 
20395 		m_tState.m_iBuildLastStep = 1;
20396 		return true;
20397 	}
20398 	return false;
20399 }
20400 
20401 
20402 // track blended start and reset on not blended token
TrackBlendedStart(const ISphTokenizer * pTokenizer,int iBlendedHitsStart,int iHitsCount)20403 static int TrackBlendedStart ( const ISphTokenizer * pTokenizer, int iBlendedHitsStart, int iHitsCount )
20404 {
20405 	iBlendedHitsStart = ( ( pTokenizer->TokenIsBlended() || pTokenizer->TokenIsBlendedPart() ) ? iBlendedHitsStart : -1 );
20406 	if ( pTokenizer->TokenIsBlended() )
20407 		iBlendedHitsStart = iHitsCount;
20408 
20409 	return iBlendedHitsStart;
20410 }
20411 
20412 
20413 #define BUILD_SUBSTRING_HITS_COUNT 4
20414 
/// tokenize the current tokenizer buffer and emit hits for every token,
/// along with hits for its prefixes (eWordpart is SPH_WORDPART_PREFIX) or infixes (otherwise)
/// stops early once the hits container gets too close to m_iMaxHits; in that case
/// m_tState.m_bProcessingHits is left set, and the next call resumes from the same buffer
/// bPayload disables position advancing; bSkipEndMarker disables end-of-field marking
void CSphSource_Document::BuildSubstringHits ( SphDocID_t uDocid, bool bPayload, ESphWordpart eWordpart, bool bSkipEndMarker )
{
	bool bPrefixField = ( eWordpart==SPH_WORDPART_PREFIX );
	bool bInfixMode = m_iMinInfixLen > 0;

	// minimum substring length for this field (prefix or infix setting, depending on the mode)
	int iMinInfixLen = bPrefixField ? m_iMinPrefixLen : m_iMinInfixLen;

	// a fresh field (as opposed to a resumed one) starts with the default step
	if ( !m_tState.m_bProcessingHits )
		m_tState.m_iBuildLastStep = 1;

	BYTE * sWord = NULL;

	// scratch copy of the token; byte 0 is reserved for a magic head marker, token goes to sBuf+1
	BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];

	// headroom (in hits) that must be available before the next token gets processed
	int iIterHitCount = BUILD_SUBSTRING_HITS_COUNT;
	if ( bPrefixField )
		iIterHitCount += SPH_MAX_WORD_LEN - m_iMinPrefixLen;
	else
		iIterHitCount += ( ( m_iMinInfixLen+SPH_MAX_WORD_LEN ) * ( SPH_MAX_WORD_LEN-m_iMinInfixLen ) / 2 );

	// FIELDEND_MASK at blended token stream should be set for HEAD token too
	int iBlendedHitsStart = -1;

	// index all infixes
	// (m_iMaxHits of 0 means no limit; the limit is checked before fetching a token)
	while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+iIterHitCount<m_iMaxHits )
		&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
	{
		iBlendedHitsStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );

		// advance the position by the pending step plus overshort and boundary adjustments
		if ( !bPayload )
		{
			HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
			if ( m_pTokenizer->GetBoundary() )
				HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
			m_tState.m_iBuildLastStep = 1;
		}

		// sentence, paragraph, and zone markers are handled separately
		if ( BuildZoneHits ( uDocid, sWord ) )
			continue;

		// iLen is the token length as reported by the tokenizer; iBytes (below) is its byte length
		int iLen = m_pTokenizer->GetLastTokenLen ();

		// always index full word (with magic head/tail marker(s))
		int iBytes = strlen ( (const char*)sWord );
		memcpy ( sBuf + 1, sWord, iBytes );
		sBuf[iBytes+1] = '\0';

		if ( m_bIndexExactWords )
		{
			sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
			m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
		}

		sBuf[0] = MAGIC_WORD_HEAD;

		// stemmed word w/markers
		// zero word id means the token is skipped; the next step is then the stopword step
		SphWordID_t iWord = m_pDict->GetWordIDWithMarkers ( sBuf );
		if ( !iWord )
		{
			m_tState.m_iBuildLastStep = m_iStopwordStep;
			continue;
		}
		m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );

		// a blended token is followed by its parts at the very same position
		m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;

		// restore stemmed word
		// (cuts the last byte of sBuf; presumably the tail marker left there by GetWordIDWithMarkers - TODO confirm)
		int iStemmedLen = strlen ( ( const char *)sBuf );
		sBuf [iStemmedLen - 1] = '\0';

		// stemmed word w/o markers
		// (only indexed when it differs from the original token)
		if ( strcmp ( (const char *)sBuf + 1, (const char *)sWord ) )
			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sBuf + 1, iStemmedLen - 2, true ), m_tState.m_iHitPos );

		// restore word
		// (original token again, now with the magic tail appended)
		memcpy ( sBuf + 1, sWord, iBytes );
		sBuf[iBytes+1] = MAGIC_WORD_TAIL;
		sBuf[iBytes+2] = '\0';

		// if there are no infixes, that's it
		if ( iMinInfixLen > iLen )
		{
			// index full word
			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sWord ), m_tState.m_iHitPos );
			continue;
		}

		// process all infixes
		// (prefix fields only ever start at the first character)
		int iMaxStart = bPrefixField ? 0 : ( iLen - iMinInfixLen );

		BYTE * sInfix = sBuf + 1;

		for ( int iStart=0; iStart<=iMaxStart; iStart++ )
		{
			// skip the first iMinInfixLen characters to get the shortest allowed substring
			BYTE * sInfixEnd = sInfix;
			for ( int i = 0; i < iMinInfixLen; i++ )
				sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );

			// then grow the substring one character at a time, up to the end of the word
			for ( int i=iMinInfixLen; i<=iLen-iStart; i++ )
			{
				m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix, false ), m_tState.m_iHitPos );

				// word start: add magic head
				if ( bInfixMode && iStart==0 )
					m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix - 1, sInfixEnd-sInfix + 1, false ), m_tState.m_iHitPos );

				// word end: add magic tail
				if ( bInfixMode && i==iLen-iStart )
					m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix+1, false ), m_tState.m_iHitPos );

				sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );
			}

			sInfix += m_pTokenizer->GetCodepointLength ( *sInfix );
		}
	}

	// a non-NULL last token means we stopped on the hits limit, and there is more to tokenize
	m_tState.m_bProcessingHits = ( sWord!=NULL );

	// mark trailing hits
	// (all the hits at the end of the container that share the last position)
	if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
	{
		CSphWordHit * pHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
		Hitpos_t uRefPos = pHit->m_iWordPos;

		for ( ; pHit>=m_tHits.First() && pHit->m_iWordPos==uRefPos; pHit-- )
			HITMAN::SetEndMarker ( &pHit->m_iWordPos );

		// mark blended HEAD as trailing too
		if ( iBlendedHitsStart>=0 )
		{
			assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
			pHit = const_cast < CSphWordHit * > ( m_tHits.First()+iBlendedHitsStart );
			uRefPos = pHit->m_iWordPos;

			const CSphWordHit * pEnd = m_tHits.First()+m_tHits.Length();
			for ( ; pHit<pEnd && pHit->m_iWordPos==uRefPos; pHit++ )
				HITMAN::SetEndMarker ( &pHit->m_iWordPos );
		}
	}
}
20553 
20554 
20555 #define BUILD_REGULAR_HITS_COUNT 6
20556 
/// tokenize the current tokenizer buffer and emit whole-word hits for every token
/// stops early once the hits container gets too close to m_iMaxHits; in that case
/// m_tState.m_bProcessingHits is left set, and the next call resumes from the same buffer
/// bPayload disables position advancing; bSkipEndMarker disables end-of-field marking
void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, bool bSkipEndMarker )
{
	bool bWordDict = m_pDict->GetSettings().m_bWordDict;

	// prefixes or infixes are enabled, and the dictionary is not in bWordDict mode;
	// in that case every word is also indexed with a magic head marker (see below)
	bool bGlobalPartialMatch = !bWordDict && ( m_iMinPrefixLen > 0 || m_iMinInfixLen > 0 );

	// a fresh field (as opposed to a resumed one) starts with the default step
	if ( !m_tState.m_bProcessingHits )
		m_tState.m_iBuildLastStep = 1;

	BYTE * sWord = NULL;

	// scratch copy of the token; byte 0 is reserved for a magic head marker, token goes to sBuf+1
	BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];

	// FIELDEND_MASK at blended token stream should be set for HEAD token too
	int iBlendedHitsStart = -1;

	// index words only
	// (m_iMaxHits of 0 means no limit; the limit is checked before fetching a token)
	while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
		&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
	{
		iBlendedHitsStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );

		// advance the position by the pending step plus overshort and boundary adjustments
		if ( !bPayload )
		{
			HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
			if ( m_pTokenizer->GetBoundary() )
				HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
		}

		// sentence, paragraph, and zone markers are handled separately
		if ( BuildZoneHits ( uDocid, sWord ) )
			continue;

		// extra hit for the word prefixed with the magic head marker
		if ( bGlobalPartialMatch )
		{
			int iBytes = strlen ( (const char*)sWord );
			memcpy ( sBuf + 1, sWord, iBytes );
			sBuf[0] = MAGIC_WORD_HEAD;
			sBuf[iBytes+1] = '\0';
			m_tHits.AddHit ( uDocid, m_pDict->GetWordIDWithMarkers ( sBuf ), m_tState.m_iHitPos );
		}

		// extra hit for the non-stemmed form, prefixed with its own magic marker
		if ( m_bIndexExactWords )
		{
			int iBytes = strlen ( (const char*)sWord );
			memcpy ( sBuf + 1, sWord, iBytes );
			sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
			sBuf[iBytes+1] = '\0';
			m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
		}

		// the word itself; zero word id means it is skipped, and the next step is the stopword step
		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );

			// a blended token is followed by its parts at the very same position
			m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
		} else
			m_tState.m_iBuildLastStep = m_iStopwordStep;
	}

	// a non-NULL last token means we stopped on the hits limit, and there is more to tokenize
	m_tState.m_bProcessingHits = ( sWord!=NULL );

	// mark trailing hit
	if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
	{
		CSphWordHit * pHit = const_cast < CSphWordHit * > ( m_tHits.Last() );
		HITMAN::SetEndMarker ( &pHit->m_iWordPos );

		// mark blended HEAD as trailing too
		if ( iBlendedHitsStart>=0 )
		{
			assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
			CSphWordHit * pBlendedHit = const_cast < CSphWordHit * > ( m_tHits.First() + iBlendedHitsStart );
			HITMAN::SetEndMarker ( &pBlendedHit->m_iWordPos );
		}
	}
}
20631 
20632 
BuildHits(CSphString & sError,bool bSkipEndMarker)20633 void CSphSource_Document::BuildHits ( CSphString & sError, bool bSkipEndMarker )
20634 {
20635 	SphDocID_t uDocid = m_tDocInfo.m_iDocID;
20636 
20637 	for ( ; m_tState.m_iField<m_tState.m_iEndField; m_tState.m_iField++ )
20638 	{
20639 		if ( !m_tState.m_bProcessingHits )
20640 		{
20641 			// get that field
20642 			BYTE * sField = m_tState.m_dFields[m_tState.m_iField-m_tState.m_iStartField];
20643 			if ( !sField || !(*sField) )
20644 				continue;
20645 
20646 			// load files
20647 			int iFieldBytes = m_tSchema.m_dFields[m_tState.m_iField].m_bFilename
20648 				? LoadFileField ( &sField, sError )
20649 				: (int) strlen ( (char*)sField );
20650 
20651 			if ( iFieldBytes<=0 )
20652 				continue;
20653 
20654 			// strip html
20655 			if ( m_bStripHTML )
20656 			{
20657 				m_pStripper->Strip ( sField );
20658 				iFieldBytes = (int) strlen ( (char*)sField );
20659 			}
20660 
20661 			// tokenize and build hits
20662 			m_tStats.m_iTotalBytes += iFieldBytes;
20663 
20664 			m_pTokenizer->SetBuffer ( sField, iFieldBytes );
20665 
20666 			m_tState.m_iHitPos = HITMAN::Create ( m_tState.m_iField, m_tState.m_iStartPos );
20667 		}
20668 
20669 		const CSphColumnInfo & tField = m_tSchema.m_dFields[m_tState.m_iField];
20670 
20671 		if ( tField.m_eWordpart!=SPH_WORDPART_WHOLE )
20672 			BuildSubstringHits ( uDocid, tField.m_bPayload, tField.m_eWordpart, bSkipEndMarker );
20673 		else
20674 			BuildRegularHits ( uDocid, tField.m_bPayload, bSkipEndMarker );
20675 
20676 		if ( m_tState.m_bProcessingHits )
20677 			break;
20678 	}
20679 
20680 	m_tState.m_bDocumentDone = !m_tState.m_bProcessingHits;
20681 }
20682 
20683 //////////////////////////////////////////////////////////////////////////
20684 
IterateFieldMVAStart(int iAttr)20685 SphRange_t CSphSource_Document::IterateFieldMVAStart ( int iAttr )
20686 {
20687 	SphRange_t tRange;
20688 	tRange.m_iStart = tRange.m_iLength = 0;
20689 
20690 	if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
20691 		return tRange;
20692 
20693 	const CSphColumnInfo & tMva = m_tSchema.GetAttr ( iAttr );
20694 	int uOff = MVA_DOWNSIZE ( m_tDocInfo.GetAttr ( tMva.m_tLocator ) );
20695 	if ( !uOff )
20696 		return tRange;
20697 
20698 	int iCount = m_dMva[uOff];
20699 	assert ( iCount );
20700 
20701 	tRange.m_iStart = uOff+1;
20702 	tRange.m_iLength = iCount;
20703 
20704 	return tRange;
20705 }
20706 
20707 
sphAddMva64(CSphVector<DWORD> & dStorage,int64_t iVal)20708 static int sphAddMva64 ( CSphVector<DWORD> & dStorage, int64_t iVal )
20709 {
20710 	int uOff = dStorage.GetLength();
20711 	dStorage.Resize ( uOff+2 );
20712 	dStorage[uOff] = MVA_DOWNSIZE ( iVal );
20713 	dStorage[uOff+1] = MVA_DOWNSIZE ( ( iVal>>32 ) & 0xffffffff );
20714 	return uOff;
20715 }
20716 
20717 
ParseFieldMVA(CSphVector<DWORD> & dMva,const char * szValue,bool bMva64)20718 int CSphSource_Document::ParseFieldMVA ( CSphVector < DWORD > & dMva, const char * szValue, bool bMva64 )
20719 {
20720 	if ( !szValue )
20721 		return 0;
20722 
20723 	const char * pPtr = szValue;
20724 	const char * pDigit = NULL;
20725 	const int MAX_NUMBER_LEN = 64;
20726 	char szBuf [MAX_NUMBER_LEN];
20727 
20728 	assert ( dMva.GetLength() ); // must not have zero offset
20729 	int uOff = dMva.GetLength();
20730 	dMva.Add ( 0 ); // reserve value for count
20731 
20732 	while ( *pPtr )
20733 	{
20734 		if ( ( *pPtr>='0' && *pPtr<='9' ) || ( bMva64 && *pPtr=='-' ) )
20735 		{
20736 			if ( !pDigit )
20737 				pDigit = pPtr;
20738 		} else
20739 		{
20740 			if ( pDigit )
20741 			{
20742 				if ( pPtr - pDigit < MAX_NUMBER_LEN )
20743 				{
20744 					strncpy ( szBuf, pDigit, pPtr - pDigit );
20745 					szBuf [pPtr - pDigit] = '\0';
20746 					if ( !bMva64 )
20747 						dMva.Add ( sphToDword ( szBuf ) );
20748 					else
20749 						sphAddMva64 ( dMva, sphToInt64 ( szBuf ) );
20750 				}
20751 
20752 				pDigit = NULL;
20753 			}
20754 		}
20755 
20756 		pPtr++;
20757 	}
20758 
20759 	if ( pDigit )
20760 	{
20761 		if ( !bMva64 )
20762 			dMva.Add ( sphToDword ( pDigit ) );
20763 		else
20764 			sphAddMva64 ( dMva, sphToInt64 ( pDigit ) );
20765 	}
20766 
20767 	int iCount = dMva.GetLength()-uOff-1;
20768 	if ( !iCount )
20769 	{
20770 		dMva.Pop(); // remove reserved value for count in case of 0 MVAs
20771 		return 0;
20772 	} else
20773 	{
20774 		dMva[uOff] = iCount;
20775 		return uOff; // return offset to ( count, [value] )
20776 	}
20777 }
20778 
20779 /////////////////////////////////////////////////////////////////////////////
20780 // GENERIC SQL SOURCE
20781 /////////////////////////////////////////////////////////////////////////////
20782 
CSphSourceParams_SQL()20783 CSphSourceParams_SQL::CSphSourceParams_SQL ()
20784 	: m_iRangeStep ( 1024 )
20785 	, m_iRefRangeStep ( 1024 )
20786 	, m_bPrintQueries ( false )
20787 	, m_iRangedThrottle ( 0 )
20788 	, m_iMaxFileBufferSize ( 0 )
20789 	, m_eOnFileFieldError ( FFE_IGNORE_FIELD )
20790 	, m_iPort ( 0 )
20791 {
20792 }
20793 
20794 
20795 const char * const CSphSource_SQL::MACRO_VALUES [ CSphSource_SQL::MACRO_COUNT ] =
20796 {
20797 	"$start",
20798 	"$end"
20799 };
20800 
20801 
CSphSource_SQL(const char * sName)20802 CSphSource_SQL::CSphSource_SQL ( const char * sName )
20803 	: CSphSource_Document	( sName )
20804 	, m_bSqlConnected		( false )
20805 	, m_uMinID				( 0 )
20806 	, m_uMaxID				( 0 )
20807 	, m_uCurrentID			( 0 )
20808 	, m_uMaxFetchedID		( 0 )
20809 	, m_iMultiAttr			( -1 )
20810 	, m_iSqlFields			( 0 )
20811 	, m_bCanUnpack			( false )
20812 	, m_bUnpackFailed		( false )
20813 	, m_bUnpackOverflow		( false )
20814 	, m_iJoinedHitField		( -1 )
20815 	, m_iJoinedHitID		( 0 )
20816 	, m_iJoinedHitPos		( 0 )
20817 {
20818 }
20819 
20820 
Setup(const CSphSourceParams_SQL & tParams)20821 bool CSphSource_SQL::Setup ( const CSphSourceParams_SQL & tParams )
20822 {
20823 	// checks
20824 	assert ( !tParams.m_sQuery.IsEmpty() );
20825 
20826 	m_tParams = tParams;
20827 
20828 	// defaults
20829 	#define LOC_FIX_NULL(_arg) if ( !m_tParams._arg.cstr() ) m_tParams._arg = "";
20830 	LOC_FIX_NULL ( m_sHost );
20831 	LOC_FIX_NULL ( m_sUser );
20832 	LOC_FIX_NULL ( m_sPass );
20833 	LOC_FIX_NULL ( m_sDB );
20834 	#undef LOC_FIX_NULL
20835 
20836 	#define LOC_FIX_QARRAY(_arg) \
20837 		ARRAY_FOREACH ( i, m_tParams._arg ) \
20838 			if ( m_tParams._arg[i].IsEmpty() ) \
20839 				m_tParams._arg.Remove ( i-- );
20840 	LOC_FIX_QARRAY ( m_dQueryPre );
20841 	LOC_FIX_QARRAY ( m_dQueryPost );
20842 	LOC_FIX_QARRAY ( m_dQueryPostIndex );
20843 	#undef LOC_FIX_QARRAY
20844 
20845 	// build and store default DSN for error reporting
20846 	char sBuf [ 1024 ];
20847 	snprintf ( sBuf, sizeof(sBuf), "sql://%s:***@%s:%d/%s",
20848 		m_tParams.m_sUser.cstr(), m_tParams.m_sHost.cstr(),
20849 		m_tParams.m_iPort, m_tParams.m_sDB.cstr() );
20850 	m_sSqlDSN = sBuf;
20851 
20852 	if ( m_tParams.m_iMaxFileBufferSize > 0 )
20853 		m_iMaxFileBufferSize = m_tParams.m_iMaxFileBufferSize;
20854 	m_eOnFileFieldError = m_tParams.m_eOnFileFieldError;
20855 
20856 	return true;
20857 }
20858 
20859 
RunQueryStep(const char * sQuery,CSphString & sError)20860 bool CSphSource_SQL::RunQueryStep ( const char * sQuery, CSphString & sError )
20861 {
20862 	sError = "";
20863 
20864 	if ( m_tParams.m_iRangeStep<=0 )
20865 		return false;
20866 	if ( m_uCurrentID>m_uMaxID )
20867 		return false;
20868 
20869 	static const int iBufSize = 32;
20870 	char * sRes = NULL;
20871 
20872 	sphSleepMsec ( m_tParams.m_iRangedThrottle );
20873 
20874 	//////////////////////////////////////////////
20875 	// range query with $start/$end interpolation
20876 	//////////////////////////////////////////////
20877 
20878 	assert ( m_uMinID>0 );
20879 	assert ( m_uMaxID>0 );
20880 	assert ( m_uMinID<=m_uMaxID );
20881 	assert ( sQuery );
20882 
20883 	char sValues [ MACRO_COUNT ] [ iBufSize ];
20884 	SphDocID_t uNextID = Min ( m_uCurrentID + (SphDocID_t)m_tParams.m_iRangeStep - 1, m_uMaxID );
20885 	snprintf ( sValues[0], iBufSize, DOCID_FMT, m_uCurrentID );
20886 	snprintf ( sValues[1], iBufSize, DOCID_FMT, uNextID );
20887 	g_iIndexerCurrentRangeMin = m_uCurrentID;
20888 	g_iIndexerCurrentRangeMax = uNextID;
20889 	m_uCurrentID = 1 + uNextID;
20890 
20891 	// OPTIMIZE? things can be precalculated
20892 	const char * sCur = sQuery;
20893 	int iLen = 0;
20894 	while ( *sCur )
20895 	{
20896 		if ( *sCur=='$' )
20897 		{
20898 			int i;
20899 			for ( i=0; i<MACRO_COUNT; i++ )
20900 				if ( strncmp ( MACRO_VALUES[i], sCur, strlen ( MACRO_VALUES[i] ) )==0 )
20901 			{
20902 				sCur += strlen ( MACRO_VALUES[i] );
20903 				iLen += strlen ( sValues[i] );
20904 				break;
20905 			}
20906 			if ( i<MACRO_COUNT )
20907 				continue;
20908 		}
20909 
20910 		sCur++;
20911 		iLen++;
20912 	}
20913 	iLen++; // trailing zero
20914 
20915 	// do interpolation
20916 	sRes = new char [ iLen ];
20917 	sCur = sQuery;
20918 
20919 	char * sDst = sRes;
20920 	while ( *sCur )
20921 	{
20922 		if ( *sCur=='$' )
20923 		{
20924 			int i;
20925 			for ( i=0; i<MACRO_COUNT; i++ )
20926 				if ( strncmp ( MACRO_VALUES[i], sCur, strlen ( MACRO_VALUES[i] ) )==0 )
20927 			{
20928 				strcpy ( sDst, sValues[i] ); // NOLINT
20929 				sCur += strlen ( MACRO_VALUES[i] );
20930 				sDst += strlen ( sValues[i] );
20931 				break;
20932 			}
20933 			if ( i<MACRO_COUNT )
20934 				continue;
20935 		}
20936 		*sDst++ = *sCur++;
20937 	}
20938 	*sDst++ = '\0';
20939 	assert ( sDst-sRes==iLen );
20940 
20941 	// run query
20942 	SqlDismissResult ();
20943 	bool bRes = SqlQuery ( sRes );
20944 
20945 	if ( !bRes )
20946 		sError.SetSprintf ( "sql_range_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
20947 
20948 	SafeDeleteArray ( sRes );
20949 	return bRes;
20950 }
20951 
20952 
20953 /// connect to SQL server
Connect(CSphString & sError)20954 bool CSphSource_SQL::Connect ( CSphString & sError )
20955 {
20956 	// do not connect twice
20957 	if ( m_bSqlConnected )
20958 		return true;
20959 
20960 	// try to connect
20961 	if ( !SqlConnect() )
20962 	{
20963 		sError.SetSprintf ( "sql_connect: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
20964 		return false;
20965 	}
20966 
20967 	m_tHits.m_dData.Reserve ( m_iMaxHits );
20968 
20969 	// all good
20970 	m_bSqlConnected = true;
20971 	return true;
20972 }
20973 
20974 
// local error-reporting helpers
// each one formats a message into a variable named sError (which must be in scope
// where the macro is expanded), and then returns false from the enclosing function
#define LOC_ERROR(_msg,_arg)			{ sError.SetSprintf ( _msg, _arg ); return false; }
#define LOC_ERROR2(_msg,_arg,_arg2)		{ sError.SetSprintf ( _msg, _arg, _arg2 ); return false; }
20977 
20978 /// setup them ranges (called both for document range-queries and MVA range-queries)
SetupRanges(const char * sRangeQuery,const char * sQuery,const char * sPrefix,CSphString & sError)20979 bool CSphSource_SQL::SetupRanges ( const char * sRangeQuery, const char * sQuery, const char * sPrefix, CSphString & sError )
20980 {
20981 	// check step
20982 	if ( m_tParams.m_iRangeStep<=0 )
20983 		LOC_ERROR ( "sql_range_step="INT64_FMT": must be non-zero positive", m_tParams.m_iRangeStep );
20984 
20985 	if ( m_tParams.m_iRangeStep<128 )
20986 		sphWarn ( "sql_range_step="INT64_FMT": too small; might hurt indexing performance!", m_tParams.m_iRangeStep );
20987 
20988 	// check query for macros
20989 	for ( int i=0; i<MACRO_COUNT; i++ )
20990 		if ( !strstr ( sQuery, MACRO_VALUES[i] ) )
20991 			LOC_ERROR2 ( "%s: macro '%s' not found in match fetch query", sPrefix, MACRO_VALUES[i] );
20992 
20993 	// run query
20994 	if ( !SqlQuery ( sRangeQuery ) )
20995 	{
20996 		sError.SetSprintf ( "%s: range-query failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
20997 		return false;
20998 	}
20999 
21000 	// fetch min/max
21001 	int iCols = SqlNumFields ();
21002 	if ( iCols!=2 )
21003 		LOC_ERROR2 ( "%s: expected 2 columns (min_id/max_id), got %d", sPrefix, iCols );
21004 
21005 	if ( !SqlFetchRow() )
21006 	{
21007 		sError.SetSprintf ( "%s: range-query fetch failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
21008 		return false;
21009 	}
21010 
21011 	if ( ( SqlColumn(0)==NULL || !SqlColumn(0)[0] ) && ( SqlColumn(1)==NULL || !SqlColumn(1)[0] ) )
21012 	{
21013 		// the source seems to be empty; workaround
21014 		m_uMinID = 1;
21015 		m_uMaxID = 1;
21016 
21017 	} else
21018 	{
21019 		// get and check min/max id
21020 		const char * sCol0 = SqlColumn(0);
21021 		const char * sCol1 = SqlColumn(1);
21022 		m_uMinID = sphToDocid ( sCol0 );
21023 		m_uMaxID = sphToDocid ( sCol1 );
21024 		if ( !sCol0 ) sCol0 = "(null)";
21025 		if ( !sCol1 ) sCol1 = "(null)";
21026 
21027 		if ( m_uMinID<=0 )
21028 			LOC_ERROR ( "sql_query_range: min_id='%s': must be positive 32/64-bit unsigned integer", sCol0 );
21029 		if ( m_uMaxID<=0 )
21030 			LOC_ERROR ( "sql_query_range: max_id='%s': must be positive 32/64-bit unsigned integer", sCol1 );
21031 		if ( m_uMinID>m_uMaxID )
21032 			LOC_ERROR2 ( "sql_query_range: min_id='%s', max_id='%s': min_id must be less than max_id", sCol0, sCol1 );
21033 	}
21034 
21035 	SqlDismissResult ();
21036 	return true;
21037 }
21038 
21039 
21040 /// issue main rows fetch query
IterateStart(CSphString & sError)21041 bool CSphSource_SQL::IterateStart ( CSphString & sError )
21042 {
21043 	assert ( m_bSqlConnected );
21044 
21045 	m_iNullIds = false;
21046 	m_iMaxIds = false;
21047 
21048 	// run pre-queries
21049 	ARRAY_FOREACH ( i, m_tParams.m_dQueryPre )
21050 	{
21051 		if ( !SqlQuery ( m_tParams.m_dQueryPre[i].cstr() ) )
21052 		{
21053 			sError.SetSprintf ( "sql_query_pre[%d]: %s (DSN=%s)", i, SqlError(), m_sSqlDSN.cstr() );
21054 			SqlDisconnect ();
21055 			return false;
21056 		}
21057 		SqlDismissResult ();
21058 	}
21059 
21060 	for ( ;; )
21061 	{
21062 		m_tParams.m_iRangeStep = 0;
21063 
21064 		// issue first fetch query
21065 		if ( !m_tParams.m_sQueryRange.IsEmpty() )
21066 		{
21067 			m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
21068 			// run range-query; setup ranges
21069 			if ( !SetupRanges ( m_tParams.m_sQueryRange.cstr(), m_tParams.m_sQuery.cstr(), "sql_query_range: ", sError ) )
21070 				return false;
21071 
21072 			// issue query
21073 			m_uCurrentID = m_uMinID;
21074 			if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
21075 				return false;
21076 		} else
21077 		{
21078 			// normal query; just issue
21079 			if ( !SqlQuery ( m_tParams.m_sQuery.cstr() ) )
21080 			{
21081 				sError.SetSprintf ( "sql_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
21082 				return false;
21083 			}
21084 		}
21085 		break;
21086 	}
21087 
21088 	// some post-query setup
21089 	m_tSchema.Reset();
21090 
21091 	for ( int i=0; i<SPH_MAX_FIELDS; i++ )
21092 		m_dUnpack[i] = SPH_UNPACK_NONE;
21093 
21094 	m_iSqlFields = SqlNumFields(); // for rowdump
21095 	int iCols = SqlNumFields() - 1; // skip column 0, which must be the id
21096 
21097 	CSphVector<bool> dFound;
21098 	dFound.Resize ( m_tParams.m_dAttrs.GetLength() );
21099 	ARRAY_FOREACH ( i, dFound )
21100 		dFound[i] = false;
21101 
21102 	const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
21103 
21104 	// map plain attrs from SQL
21105 	for ( int i=0; i<iCols; i++ )
21106 	{
21107 		const char * sName = SqlFieldName ( i+1 );
21108 		if ( !sName )
21109 			LOC_ERROR ( "column number %d has no name", i+1 );
21110 
21111 		CSphColumnInfo tCol ( sName );
21112 		ARRAY_FOREACH ( j, m_tParams.m_dAttrs )
21113 			if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dAttrs[j].m_sName.cstr() ) )
21114 		{
21115 			const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[j];
21116 
21117 			tCol.m_eAttrType = tAttr.m_eAttrType;
21118 			assert ( tCol.m_eAttrType!=SPH_ATTR_NONE );
21119 
21120 			if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
21121 				LOC_ERROR ( "multi-valued attribute '%s' of wrong source-type found in query; must be 'field'", tAttr.m_sName.cstr() );
21122 
21123 			tCol = tAttr;
21124 			dFound[j] = true;
21125 			break;
21126 		}
21127 
21128 		ARRAY_FOREACH ( j, m_tParams.m_dFileFields )
21129 		{
21130 			if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dFileFields[j].cstr() ) )
21131 				tCol.m_bFilename = true;
21132 		}
21133 
21134 		tCol.m_iIndex = i+1;
21135 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
21136 
21137 		if ( tCol.m_eAttrType==SPH_ATTR_NONE || tCol.m_bIndexed )
21138 		{
21139 			m_tSchema.m_dFields.Add ( tCol );
21140 			ARRAY_FOREACH ( k, m_tParams.m_dUnpack )
21141 			{
21142 				CSphUnpackInfo & tUnpack = m_tParams.m_dUnpack[k];
21143 				if ( tUnpack.m_sName==tCol.m_sName )
21144 				{
21145 					if ( !m_bCanUnpack )
21146 					{
21147 						sError.SetSprintf ( "this source does not support column unpacking" );
21148 						return false;
21149 					}
21150 					int iIndex = m_tSchema.m_dFields.GetLength() - 1;
21151 					if ( iIndex < SPH_MAX_FIELDS )
21152 					{
21153 						m_dUnpack[iIndex] = tUnpack.m_eFormat;
21154 						m_dUnpackBuffers[iIndex].Resize ( SPH_UNPACK_BUFFER_SIZE );
21155 					}
21156 					break;
21157 				}
21158 			}
21159 		}
21160 
21161 		if ( tCol.m_eAttrType!=SPH_ATTR_NONE )
21162 			m_tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
21163 	}
21164 
21165 	// map multi-valued attrs
21166 	ARRAY_FOREACH ( i, m_tParams.m_dAttrs )
21167 	{
21168 		const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[i];
21169 		if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
21170 		{
21171 			CSphColumnInfo tMva = tAttr;
21172 			tMva.m_iIndex = m_tSchema.GetAttrsCount();
21173 			m_tSchema.AddAttr ( tMva, true ); // all attributes are dynamic at indexing time
21174 			dFound[i] = true;
21175 		}
21176 	}
21177 
21178 	// warn if some attrs went unmapped
21179 	ARRAY_FOREACH ( i, dFound )
21180 		if ( !dFound[i] )
21181 			sphWarn ( "attribute '%s' not found - IGNORING", m_tParams.m_dAttrs[i].m_sName.cstr() );
21182 
21183 	// joined fields
21184 	m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength();
21185 
21186 	ARRAY_FOREACH ( i, m_tParams.m_dJoinedFields )
21187 	{
21188 		CSphColumnInfo tCol;
21189 		tCol.m_iIndex = -1;
21190 		tCol.m_sName = m_tParams.m_dJoinedFields[i].m_sName;
21191 		tCol.m_sQuery = m_tParams.m_dJoinedFields[i].m_sQuery;
21192 		tCol.m_bPayload = m_tParams.m_dJoinedFields[i].m_bPayload;
21193 		tCol.m_eSrc = m_tParams.m_dJoinedFields[i].m_sRanged.IsEmpty() ? SPH_ATTRSRC_QUERY : SPH_ATTRSRC_RANGEDQUERY;
21194 		tCol.m_sQueryRange = m_tParams.m_dJoinedFields[i].m_sRanged;
21195 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
21196 		m_tSchema.m_dFields.Add ( tCol );
21197 	}
21198 
21199 	// alloc storage
21200 	m_tDocInfo.Reset ( m_tSchema.GetRowSize() );
21201 	m_dStrAttrs.Resize ( m_tSchema.GetAttrsCount() );
21202 
21203 	// check it
21204 	if ( m_tSchema.m_dFields.GetLength()>SPH_MAX_FIELDS )
21205 		LOC_ERROR2 ( "too many fields (fields=%d, max=%d)",
21206 			m_tSchema.m_dFields.GetLength(), SPH_MAX_FIELDS );
21207 
21208 	// log it
21209 	if ( m_fpDumpRows )
21210 	{
21211 		const char * sTable = m_tSchema.m_sName.cstr();
21212 
21213 		time_t iNow = time ( NULL );
21214 		fprintf ( m_fpDumpRows, "#\n# === source %s ts %d\n# %s#\n", sTable, (int)iNow, ctime ( &iNow ) );
21215 		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
21216 			fprintf ( m_fpDumpRows, "# field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
21217 
21218 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
21219 		{
21220 			const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
21221 			fprintf ( m_fpDumpRows, "# %s = %s # attr %d\n", sphTypeDirective ( tCol.m_eAttrType ), tCol.m_sName.cstr(), i );
21222 		}
21223 
21224 		fprintf ( m_fpDumpRows, "#\n\nDROP TABLE IF EXISTS rows_%s;\nCREATE TABLE rows_%s (\n  id VARCHAR(32) NOT NULL,\n",
21225 			sTable, sTable );
21226 		for ( int i=1; i<m_iSqlFields; i++ )
21227 			fprintf ( m_fpDumpRows, "  %s VARCHAR(4096) NOT NULL,\n", SqlFieldName(i) );
21228 		fprintf ( m_fpDumpRows, "  KEY(id) );\n\n" );
21229 	}
21230 
21231 	return true;
21232 }
21233 
21234 #undef LOC_ERROR
21235 #undef LOC_ERROR2
21236 #undef LOC_SQL_ERROR
21237 
21238 
Disconnect()21239 void CSphSource_SQL::Disconnect ()
21240 {
21241 	SafeDeleteArray ( m_pReadFileBuffer );
21242 	m_tHits.m_dData.Reset();
21243 
21244 	if ( m_iNullIds )
21245 		sphWarn ( "source %s: skipped %d document(s) with zero/NULL ids", m_tSchema.m_sName.cstr(), m_iNullIds );
21246 
21247 	if ( m_iMaxIds )
21248 		sphWarn ( "source %s: skipped %d document(s) with DOCID_MAX ids", m_tSchema.m_sName.cstr(), m_iMaxIds );
21249 
21250 	m_iNullIds = 0;
21251 	m_iMaxIds = 0;
21252 
21253 	if ( m_bSqlConnected )
21254 		SqlDisconnect ();
21255 	m_bSqlConnected = false;
21256 }
21257 
21258 
/// fetch the next document row, and split it into fields and attributes
///
/// returns m_dFields (per-field text pointers) with m_tDocInfo filled in on success;
/// returns NULL with m_tDocInfo.m_iDocID==0 on legal eof (post-queries are run at that point);
/// returns NULL with m_tDocInfo.m_iDocID==1 and a message in sError on error
BYTE ** CSphSource_SQL::NextDocument ( CSphString & sError )
{
	PROFILE ( src_sql );
	assert ( m_bSqlConnected );

	// get next non-zero-id row
	// (rows for which VerifyID() yields 0 are skipped by the outer loop)
	do
	{
		// try to get next row
		bool bGotRow = SqlFetchRow ();

		// when the party's over...
		// (either an error, or the current result set is exhausted;
		// in the latter case, try the next ranged step, if any)
		while ( !bGotRow )
		{
			// is that an error?
			if ( SqlIsError() )
			{
				sError.SetSprintf ( "sql_fetch_row: %s", SqlError() );
				m_tDocInfo.m_iDocID = 1; // 0 means legal eof
				return NULL;
			}

			// maybe we can do next step yet?
			if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
			{
				// if there's a message, there's an error
				// otherwise, we're just over
				if ( !sError.IsEmpty() )
				{
					m_tDocInfo.m_iDocID = 1; // 0 means legal eof
					return NULL;
				}

			} else
			{
				// step went fine; try to fetch
				// (an empty step just sends us around the inner loop once more)
				bGotRow = SqlFetchRow ();
				continue;
			}

			// only reached when there are no more steps and no error
			SqlDismissResult ();

			// ok, we're over
			// run the post-queries; a failure only warns, and stops the remaining ones
			ARRAY_FOREACH ( i, m_tParams.m_dQueryPost )
			{
				if ( !SqlQuery ( m_tParams.m_dQueryPost[i].cstr() ) )
				{
					sphWarn ( "sql_query_post[%d]: error=%s, query=%s",
						i, SqlError(), m_tParams.m_dQueryPost[i].cstr() );
					break;
				}
				SqlDismissResult ();
			}

			m_tDocInfo.m_iDocID = 0; // 0 means legal eof
			return NULL;
		}

		// get him!
		// column 0 is the document id; also track the max id fetched so far
		m_tDocInfo.m_iDocID = VerifyID ( sphToDocid ( SqlColumn(0) ) );
		m_uMaxFetchedID = Max ( m_uMaxFetchedID, m_tDocInfo.m_iDocID );
	} while ( !m_tDocInfo.m_iDocID );

	// cleanup attrs
	for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
		m_tDocInfo.m_pDynamic[i] = 0;

	// split columns into fields and attrs
	// only the plain fields come from this row; joined fields (stored after them) are not touched here
	for ( int i=0; i<m_iPlainFieldsLength; i++ )
	{
		// get that field
		#if USE_ZLIB
		if ( m_dUnpack[i]!=SPH_UNPACK_NONE )
		{
			// packed column; SqlUnpackColumn() may return NULL if unpacking fails
			m_dFields[i] = (BYTE*) SqlUnpackColumn ( i, m_dUnpack[i] );
			continue;
		}
		#endif
		m_dFields[i] = (BYTE*) SqlColumn ( m_tSchema.m_dFields[i].m_iIndex );
	}

	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i); // shortcut

		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
		{
			// field-sourced MVAs are parsed right away, and the value returned by ParseFieldMVA() gets stored;
			// MVAs from other sources just get 0 stored here
			int uOff = 0;
			if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
			{
				uOff = ParseFieldMVA ( m_dMva, SqlColumn ( tAttr.m_iIndex ), tAttr.m_eAttrType==SPH_ATTR_INT64SET );
			}
			m_tDocInfo.SetAttr ( tAttr.m_tLocator, uOff );
			continue;
		}

		switch ( tAttr.m_eAttrType )
		{
			case SPH_ATTR_ORDINAL:
			case SPH_ATTR_STRING:
			case SPH_ATTR_WORDCOUNT:
				// memorize string, fixup NULLs
				// (the text is kept in m_dStrAttrs; the docinfo slot itself is zeroed)
				m_dStrAttrs[i] = SqlColumn ( tAttr.m_iIndex );
				if ( !m_dStrAttrs[i].cstr() )
					m_dStrAttrs[i] = "";

				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
				break;

			case SPH_ATTR_FLOAT:
				m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;

			case SPH_ATTR_BIGINT:
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;

			default:
				// just store as uint by default
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;
		}
	}

	// log it
	// (dump the raw row as an INSERT statement, all columns including the id)
	if ( m_fpDumpRows )
	{
		fprintf ( m_fpDumpRows, "INSERT INTO rows_%s VALUES (", m_tSchema.m_sName.cstr() );
		for ( int i=0; i<m_iSqlFields; i++ )
		{
			if ( i )
				fprintf ( m_fpDumpRows, ", " );
			FormatEscaped ( m_fpDumpRows, SqlColumn(i) );
		}
		fprintf ( m_fpDumpRows, ");\n" );
	}

	return m_dFields;
}
21398 
21399 
PostIndex()21400 void CSphSource_SQL::PostIndex ()
21401 {
21402 	if ( !m_tParams.m_dQueryPostIndex.GetLength() )
21403 		return;
21404 
21405 	assert ( !m_bSqlConnected );
21406 
21407 	#define LOC_SQL_ERROR(_msg) { sSqlError = _msg; break; }
21408 
21409 	const char * sSqlError = NULL;
21410 	for ( ;; )
21411 	{
21412 		if ( !SqlConnect () )
21413 			LOC_SQL_ERROR ( "mysql_real_connect" );
21414 
21415 		ARRAY_FOREACH ( i, m_tParams.m_dQueryPostIndex )
21416 		{
21417 			char * sQuery = sphStrMacro ( m_tParams.m_dQueryPostIndex[i].cstr(), "$maxid", m_uMaxFetchedID );
21418 			bool bRes = SqlQuery ( sQuery );
21419 			delete [] sQuery;
21420 
21421 			if ( !bRes )
21422 				LOC_SQL_ERROR ( "sql_query_post_index" );
21423 
21424 			SqlDismissResult ();
21425 		}
21426 
21427 		break;
21428 	}
21429 
21430 	if ( sSqlError )
21431 		sphWarn ( "%s: %s (DSN=%s)", sSqlError, SqlError(), m_sSqlDSN.cstr() );
21432 
21433 	#undef LOC_SQL_ERROR
21434 
21435 	SqlDisconnect ();
21436 }
21437 
21438 
IterateMultivaluedStart(int iAttr,CSphString & sError)21439 bool CSphSource_SQL::IterateMultivaluedStart ( int iAttr, CSphString & sError )
21440 {
21441 	if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
21442 		return false;
21443 
21444 	m_iMultiAttr = iAttr;
21445 	const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);
21446 
21447 	if ( !(tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) )
21448 		return false;
21449 
21450 	CSphString sPrefix;
21451 	switch ( tAttr.m_eSrc )
21452 	{
21453 	case SPH_ATTRSRC_FIELD:
21454 		return false;
21455 
21456 	case SPH_ATTRSRC_QUERY:
21457 		// run simple query
21458 		if ( !SqlQuery ( tAttr.m_sQuery.cstr() ) )
21459 		{
21460 			sError.SetSprintf ( "multi-valued attr '%s' query failed: %s", tAttr.m_sName.cstr(), SqlError() );
21461 			return false;
21462 		}
21463 		break;
21464 
21465 	case SPH_ATTRSRC_RANGEDQUERY:
21466 			m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
21467 
21468 			// setup ranges
21469 			sPrefix.SetSprintf ( "multi-valued attr '%s' ranged query: ", tAttr.m_sName.cstr() );
21470 			if ( !SetupRanges ( tAttr.m_sQueryRange.cstr(), tAttr.m_sQuery.cstr(), sPrefix.cstr(), sError ) )
21471 				return false;
21472 
21473 			// run first step (in order to report errors)
21474 			m_uCurrentID = m_uMinID;
21475 			if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sError ) )
21476 				return false;
21477 
21478 			break;
21479 
21480 	default:
21481 		sError.SetSprintf ( "INTERNAL ERROR: unknown multi-valued attr source type %d", tAttr.m_eSrc );
21482 		return false;
21483 	}
21484 
21485 	// check fields count
21486 	if ( SqlNumFields()!=2 )
21487 	{
21488 		sError.SetSprintf ( "multi-valued attr '%s' query returned %d fields (expected 2)", tAttr.m_sName.cstr(), SqlNumFields() );
21489 		SqlDismissResult ();
21490 		return false;
21491 	}
21492 	return true;
21493 }
21494 
21495 
IterateMultivaluedNext()21496 bool CSphSource_SQL::IterateMultivaluedNext ()
21497 {
21498 	const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( m_iMultiAttr );
21499 
21500 	assert ( m_bSqlConnected );
21501 	assert ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET );
21502 
21503 	// fetch next row
21504 	bool bGotRow = SqlFetchRow ();
21505 	while ( !bGotRow )
21506 	{
21507 		if ( SqlIsError() )
21508 			sphDie ( "sql_fetch_row: %s", SqlError() ); // FIXME! this should be reported
21509 
21510 		if ( tAttr.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
21511 			return false;
21512 
21513 		CSphString sTmp;
21514 		if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sTmp ) ) // FIXME! this should be reported
21515 			return false;
21516 
21517 		bGotRow = SqlFetchRow ();
21518 		continue;
21519 	}
21520 
21521 	// return that tuple or offset to storage for MVA64 value
21522 	m_tDocInfo.m_iDocID = sphToDocid ( SqlColumn(0) );
21523 	m_dMva.Resize ( 0 );
21524 	if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
21525 		m_dMva.Add ( sphToDword ( SqlColumn(1) ) );
21526 	else
21527 		sphAddMva64 ( m_dMva, sphToInt64 ( SqlColumn(1) ) );
21528 
21529 	return true;
21530 }
21531 
21532 
IterateKillListStart(CSphString & sError)21533 bool CSphSource_SQL::IterateKillListStart ( CSphString & sError )
21534 {
21535 	if ( m_tParams.m_sQueryKilllist.IsEmpty () )
21536 		return false;
21537 
21538 	if ( !SqlQuery ( m_tParams.m_sQueryKilllist.cstr () ) )
21539 	{
21540 		sError.SetSprintf ( "killlist query failed: %s", SqlError() );
21541 		return false;
21542 	}
21543 
21544 	return true;
21545 }
21546 
21547 
IterateKillListNext(SphDocID_t & tDocId)21548 bool CSphSource_SQL::IterateKillListNext ( SphDocID_t & tDocId )
21549 {
21550 	if ( SqlFetchRow () )
21551 		tDocId = sphToDocid ( SqlColumn(0) );
21552 	else
21553 	{
21554 		if ( SqlIsError() )
21555 			sphDie ( "sql_query_killlist: %s", SqlError() ); // FIXME! this should be reported
21556 		else
21557 		{
21558 			SqlDismissResult ();
21559 			return false;
21560 		}
21561 	}
21562 
21563 	return true;
21564 }
21565 
21566 
ReportUnpackError(int iIndex,int iError)21567 void CSphSource_SQL::ReportUnpackError ( int iIndex, int iError )
21568 {
21569 	if ( !m_bUnpackFailed )
21570 	{
21571 		m_bUnpackFailed = true;
21572 		sphWarn ( "failed to unpack column '%s', error=%d, docid=" DOCID_FMT, SqlFieldName(iIndex), iError, m_tDocInfo.m_iDocID );
21573 	}
21574 }
21575 
21576 
21577 #if !USE_ZLIB
21578 
SqlUnpackColumn(int iFieldIndex,ESphUnpackFormat)21579 const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat )
21580 {
21581 	return SqlColumn ( m_tSchema.m_dFields[iFieldIndex].m_iIndex );
21582 }
21583 
21584 #else
21585 
/// unpack a compressed column into the per-field buffer
///
/// iFieldIndex is the index into m_tSchema.m_dFields (and into m_dUnpackBuffers);
/// eFormat selects the packing format
///
/// returns a pointer to zero-terminated unpacked data stored in m_dUnpackBuffers[iFieldIndex]
/// (or the raw column pointer for SPH_UNPACK_NONE);
/// returns NULL if the column is NULL or empty, or if unpacking failed
const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat eFormat )
{
	int iIndex = m_tSchema.m_dFields[iFieldIndex].m_iIndex;
	const char * pData = SqlColumn(iIndex);

	if ( pData==NULL )
		return NULL;

	int iPackedLen = SqlColumnLength(iIndex);
	if ( iPackedLen<=0 )
		return NULL;


	CSphVector<char> & tBuffer = m_dUnpackBuffers[iFieldIndex];
	switch ( eFormat )
	{
		case SPH_UNPACK_MYSQL_COMPRESS:
		{
			// the data is expected to start with a 4-byte header, followed by the compressed stream
			if ( iPackedLen<=4 )
			{
				if ( !m_bUnpackFailed )
				{
					m_bUnpackFailed = true;
					sphWarn ( "failed to unpack '%s', invalid column size (size=%d), docid="DOCID_FMT, SqlFieldName(iIndex), iPackedLen, m_tDocInfo.m_iDocID );
				}
				return NULL;
			}

			// decode the header as a little-endian integer, keep the low 30 bits as the unpacked size
			unsigned long uSize = 0;
			for ( int i=0; i<4; i++ )
				uSize += ((unsigned long)((BYTE)pData[i])) << ( 8*i );
			uSize &= 0x3FFFFFFF;

			// refuse to allocate more than the configured limit (warns once)
			if ( uSize > m_tParams.m_uUnpackMemoryLimit )
			{
				if ( !m_bUnpackOverflow )
				{
					m_bUnpackOverflow = true;
					sphWarn ( "failed to unpack '%s', column size limit exceeded (size=%d), docid="DOCID_FMT, SqlFieldName(iIndex), (int)uSize, m_tDocInfo.m_iDocID );
				}
				return NULL;
			}

			int iResult;
			tBuffer.Resize ( uSize + 1 ); // one extra byte for the trailing zero
			unsigned long uLen = iPackedLen-4;
			iResult = uncompress ( (Bytef *)tBuffer.Begin(), &uSize, (Bytef *)pData + 4, uLen );
			if ( iResult==Z_OK )
			{
				// uSize was passed by pointer, and is used here as the actual unpacked length
				tBuffer[uSize] = 0;
				return &tBuffer[0];
			} else
				ReportUnpackError ( iIndex, iResult );
			return NULL;
		}

		case SPH_UNPACK_ZLIB:
		{
			// the unpacked size is not known upfront here;
			// so inflate into the buffer, and double it whenever it fills up
			char * sResult = 0;
			int iBufferOffset = 0;
			int iResult;

			z_stream tStream;
			tStream.zalloc = Z_NULL;
			tStream.zfree = Z_NULL;
			tStream.opaque = Z_NULL;
			tStream.avail_in = iPackedLen;
			tStream.next_in = (Bytef *)SqlColumn(iIndex);

			iResult = inflateInit ( &tStream );
			if ( iResult!=Z_OK )
				return NULL;

			for ( ;; )
			{
				// continue right after what was already unpacked; keep one byte for the trailing zero
				tStream.next_out = (Bytef *)&tBuffer[iBufferOffset];
				tStream.avail_out = tBuffer.GetLength() - iBufferOffset - 1;

				iResult = inflate ( &tStream, Z_NO_FLUSH );
				if ( iResult==Z_STREAM_END )
				{
					// all done; terminate and return
					tBuffer [ tStream.total_out ] = 0;
					sResult = &tBuffer[0];
					break;
				} else if ( iResult==Z_OK )
				{
					// not done yet; the code expects this to mean that the output space ran out
					assert ( tStream.avail_out==0 );

					tBuffer.Resize ( tBuffer.GetLength()*2 );
					iBufferOffset = tStream.total_out;
				} else
				{
					// anything else is a failure; sResult stays NULL
					ReportUnpackError ( iIndex, iResult );
					break;
				}
			}

			inflateEnd ( &tStream );
			return sResult;
		}

		case SPH_UNPACK_NONE:
			return pData;
	}
	return NULL;
}
21692 #endif // USE_ZLIB
21693 
21694 
/// fetch rows of the joined fields, and build hits from them
///
/// the hits storage is cleared on entry; joined fields are walked one by one
/// (m_iJoinedHitField is the current one), with the field query (or its next ranged step)
/// issued whenever the current result set runs out
///
/// returns &m_tHits on success; m_tDocInfo.m_iDocID is set to 0 when all the joined fields are over
/// and there are no pending hits to return;
/// returns NULL with a message in sError on error
ISphHits * CSphSource_SQL::IterateJoinedHits ( CSphString & sError )
{
	m_tHits.m_dData.Resize ( 0 );

	// eof check
	if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
	{
		m_tDocInfo.m_iDocID = 0;
		return &m_tHits;
	}

	// whether the current ranged field still has steps to run
	// (cleared when switching queries; set again by a successful ranged step)
	bool bProcessingRanged = true;

	// my fetch loop
	while ( m_iJoinedHitField<m_tSchema.m_dFields.GetLength() )
	{
		// m_bProcessingHits is checked first, so no new row is fetched while it is set;
		// presumably BuildHits() leaves it set when it stopped in the middle of a row - TODO confirm
		if ( m_tState.m_bProcessingHits || SqlFetchRow() )
		{
			// next row
			m_tDocInfo.m_iDocID = sphToDocid ( SqlColumn(0) ); // FIXME! handle conversion errors and zero/max values?

			// field start? restart ids
			if ( !m_iJoinedHitID )
				m_iJoinedHitID = m_tDocInfo.m_iDocID;

			// docid asc requirement violated? report an error
			if ( m_iJoinedHitID>m_tDocInfo.m_iDocID )
			{
				sError.SetSprintf ( "joined field '%s': query MUST return document IDs in ASC order",
					m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr() );
				return NULL;
			}

			// next document? update tracker, reset position
			if ( m_iJoinedHitID<m_tDocInfo.m_iDocID )
			{
				m_iJoinedHitID = m_tDocInfo.m_iDocID;
				m_iJoinedHitPos = 0;
			}

			// fresh row; reset the hit building state, and restrict it to the current joined field only
			if ( !m_tState.m_bProcessingHits )
			{
				m_tState = CSphBuildHitsState_t();
				m_tState.m_iField = m_iJoinedHitField;
				m_tState.m_iStartField = m_iJoinedHitField;
				m_tState.m_iEndField = m_iJoinedHitField+1;

				// payload fields take the start position from column 2;
				// the others continue from the position reached within this document so far
				if ( m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload )
					m_tState.m_iStartPos = sphToDword ( SqlColumn(2) );
				else
					m_tState.m_iStartPos = m_iJoinedHitPos;
			}

			// build those hits
			// (column 1 is the text)
			BYTE * dText[] = { (BYTE *)SqlColumn(1) };
			m_tState.m_dFields = dText;

			BuildHits ( sError, true );

			// update current position
			if ( !m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload && !m_tState.m_bProcessingHits && m_tHits.Length() )
				m_iJoinedHitPos = HITMAN::GetPos ( m_tHits.Last()->m_iWordPos );

			// this row is not fully processed yet; hand the hits over, and resume on the next call
			if ( m_tState.m_bProcessingHits )
				break;
		} else if ( SqlIsError() )
		{
			// error while fetching row
			sError = SqlError();
			return NULL;

		} else
		{
			// the current result set is exhausted (or nothing was issued yet)
			int iLastField = m_iJoinedHitField;
			bool bRanged = ( m_iJoinedHitField>=m_iPlainFieldsLength && m_iJoinedHitField<m_tSchema.m_dFields.GetLength()
				&& m_tSchema.m_dFields[m_iJoinedHitField].m_eSrc==SPH_ATTRSRC_RANGEDQUERY );

			// current field is over, continue to next field
			// (a negative index jumps to the first joined field;
			// a ranged field that still has steps to run stays current)
			if ( m_iJoinedHitField<0 )
				m_iJoinedHitField = m_iPlainFieldsLength;
			else if ( !bRanged || !bProcessingRanged )
				m_iJoinedHitField++;

			// eof check
			if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
			{
				m_tDocInfo.m_iDocID = ( m_tHits.Length() ? 1 : 0 ); // to eof or not to eof
				return &m_tHits;
			}

			SqlDismissResult ();

			bProcessingRanged = false;
			bool bCheckNumFields = true;
			CSphColumnInfo & tJoined = m_tSchema.m_dFields[m_iJoinedHitField];

			// start fetching next field
			if ( tJoined.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
			{
				if ( !SqlQuery ( tJoined.m_sQuery.cstr() ) )
				{
					sError = SqlError();
					return NULL;
				}
			} else
			{
				m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;

				// setup ranges for next field
				// (only when the field changed; otherwise just proceed with the next step)
				if ( iLastField!=m_iJoinedHitField )
				{
					CSphString sPrefix;
					sPrefix.SetSprintf ( "joined field '%s' ranged query: ", tJoined.m_sName.cstr() );
					if ( !SetupRanges ( tJoined.m_sQueryRange.cstr(), tJoined.m_sQuery.cstr(), sPrefix.cstr(), sError ) )
						return NULL;

					m_uCurrentID = m_uMinID;
				}

				// run first step (in order to report errors)
				bool bRes = RunQueryStep ( tJoined.m_sQuery.cstr(), sError );
				bProcessingRanged = bRes; // select next documents in range or loop once to process next field
				bCheckNumFields = bRes;

				if ( !sError.IsEmpty() )
					return NULL;
			}

			// payload fields need 3 columns; the others need 2
			const int iExpected = m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload ? 3 : 2;
			if ( bCheckNumFields && SqlNumFields()!=iExpected )
			{
				const char * sName = m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr();
				sError.SetSprintf ( "joined field '%s': query MUST return exactly %d columns, got %d", sName, iExpected, SqlNumFields() );
				return NULL;
			}

			// new result set; restart the docid and position trackers
			m_iJoinedHitID = 0;
			m_iJoinedHitPos = 0;
		}
	}

	return &m_tHits;
}
21838 
21839 /////////////////////////////////////////////////////////////////////////////
21840 // MYSQL SOURCE
21841 /////////////////////////////////////////////////////////////////////////////
21842 
21843 #if USE_MYSQL
21844 
CSphSourceParams_MySQL()21845 CSphSourceParams_MySQL::CSphSourceParams_MySQL ()
21846 	: m_iFlags ( 0 )
21847 {
21848 	m_iPort = 3306;
21849 }
21850 
21851 
CSphSource_MySQL(const char * sName)21852 CSphSource_MySQL::CSphSource_MySQL ( const char * sName )
21853 	: CSphSource_SQL	( sName )
21854 	, m_pMysqlResult	( NULL )
21855 	, m_pMysqlFields	( NULL )
21856 	, m_tMysqlRow		( NULL )
21857 	, m_pMysqlLengths	( NULL )
21858 {
21859 	m_bCanUnpack = true;
21860 }
21861 
21862 
SqlDismissResult()21863 void CSphSource_MySQL::SqlDismissResult ()
21864 {
21865 	if ( !m_pMysqlResult )
21866 		return;
21867 
21868 	while ( m_pMysqlResult )
21869 	{
21870 		mysql_free_result ( m_pMysqlResult );
21871 		m_pMysqlResult = NULL;
21872 
21873 		// stored procedures might return multiple result sets
21874 		// FIXME? we might want to index all of them
21875 		// but for now, let's simply dismiss additional result sets
21876 		if ( mysql_next_result ( &m_tMysqlDriver )==0 )
21877 		{
21878 			m_pMysqlResult = mysql_use_result ( &m_tMysqlDriver );
21879 
21880 			static bool bOnce = false;
21881 			if ( !bOnce && m_pMysqlResult && mysql_num_rows ( m_pMysqlResult ) )
21882 			{
21883 				sphWarn ( "indexing of multiple result sets is not supported yet; some results sets were dismissed!" );
21884 				bOnce = true;
21885 			}
21886 		}
21887 	}
21888 
21889 	m_pMysqlFields = NULL;
21890 	m_pMysqlLengths = NULL;
21891 }
21892 
21893 
SqlQuery(const char * sQuery)21894 bool CSphSource_MySQL::SqlQuery ( const char * sQuery )
21895 {
21896 	if ( mysql_query ( &m_tMysqlDriver, sQuery ) )
21897 	{
21898 		if ( m_tParams.m_bPrintQueries )
21899 			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
21900 		return false;
21901 	}
21902 	if ( m_tParams.m_bPrintQueries )
21903 		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
21904 
21905 	m_pMysqlResult = mysql_use_result ( &m_tMysqlDriver );
21906 	m_pMysqlFields = NULL;
21907 	return true;
21908 }
21909 
21910 
SqlIsError()21911 bool CSphSource_MySQL::SqlIsError ()
21912 {
21913 	return mysql_errno ( &m_tMysqlDriver )!=0;
21914 }
21915 
21916 
SqlError()21917 const char * CSphSource_MySQL::SqlError ()
21918 {
21919 	return mysql_error ( &m_tMysqlDriver );
21920 }
21921 
21922 
SqlConnect()21923 bool CSphSource_MySQL::SqlConnect ()
21924 {
21925 	mysql_init ( &m_tMysqlDriver );
21926 
21927 	if ( !m_sSslKey.IsEmpty() || !m_sSslCert.IsEmpty() || !m_sSslCA.IsEmpty() )
21928 		mysql_ssl_set ( &m_tMysqlDriver, m_sSslKey.cstr(), m_sSslCert.cstr(), m_sSslCA.cstr(), NULL, NULL );
21929 
21930 	m_iMysqlConnectFlags |= CLIENT_MULTI_RESULTS; // we now know how to handle this
21931 	bool bRes = ( NULL!=mysql_real_connect ( &m_tMysqlDriver,
21932 		m_tParams.m_sHost.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr(),
21933 		m_tParams.m_sDB.cstr(), m_tParams.m_iPort, m_sMysqlUsock.cstr(), m_iMysqlConnectFlags ) );
21934 	if ( m_tParams.m_bPrintQueries )
21935 		fprintf ( stdout, bRes ? "SQL-CONNECT: ok\n" : "SQL-CONNECT: FAIL\n" );
21936 	return bRes;
21937 }
21938 
21939 
SqlDisconnect()21940 void CSphSource_MySQL::SqlDisconnect ()
21941 {
21942 	if ( m_tParams.m_bPrintQueries )
21943 		fprintf ( stdout, "SQL-DISCONNECT\n" );
21944 
21945 	mysql_close ( &m_tMysqlDriver );
21946 }
21947 
21948 
SqlNumFields()21949 int CSphSource_MySQL::SqlNumFields ()
21950 {
21951 	if ( !m_pMysqlResult )
21952 		return -1;
21953 
21954 	return mysql_num_fields ( m_pMysqlResult );
21955 }
21956 
21957 
SqlFetchRow()21958 bool CSphSource_MySQL::SqlFetchRow ()
21959 {
21960 	if ( !m_pMysqlResult )
21961 		return false;
21962 
21963 	m_tMysqlRow = mysql_fetch_row ( m_pMysqlResult );
21964 	return m_tMysqlRow!=NULL;
21965 }
21966 
21967 
SqlColumn(int iIndex)21968 const char * CSphSource_MySQL::SqlColumn ( int iIndex )
21969 {
21970 	if ( !m_pMysqlResult )
21971 		return NULL;
21972 
21973 	return m_tMysqlRow[iIndex];
21974 }
21975 
21976 
SqlFieldName(int iIndex)21977 const char * CSphSource_MySQL::SqlFieldName ( int iIndex )
21978 {
21979 	if ( !m_pMysqlResult )
21980 		return NULL;
21981 
21982 	if ( !m_pMysqlFields )
21983 		m_pMysqlFields = mysql_fetch_fields ( m_pMysqlResult );
21984 
21985 	return m_pMysqlFields[iIndex].name;
21986 }
21987 
21988 
SqlColumnLength(int iIndex)21989 DWORD CSphSource_MySQL::SqlColumnLength ( int iIndex )
21990 {
21991 	if ( !m_pMysqlResult )
21992 		return 0;
21993 
21994 	if ( !m_pMysqlLengths )
21995 		m_pMysqlLengths = mysql_fetch_lengths ( m_pMysqlResult );
21996 
21997 	return m_pMysqlLengths[iIndex];
21998 }
21999 
22000 
Setup(const CSphSourceParams_MySQL & tParams)22001 bool CSphSource_MySQL::Setup ( const CSphSourceParams_MySQL & tParams )
22002 {
22003 	if ( !CSphSource_SQL::Setup ( tParams ) )
22004 		return false;
22005 
22006 	m_sMysqlUsock = tParams.m_sUsock;
22007 	m_iMysqlConnectFlags = tParams.m_iFlags;
22008 	m_sSslKey = tParams.m_sSslKey;
22009 	m_sSslCert = tParams.m_sSslCert;
22010 	m_sSslCA = tParams.m_sSslCA;
22011 
22012 	// build and store DSN for error reporting
22013 	char sBuf [ 1024 ];
22014 	snprintf ( sBuf, sizeof(sBuf), "mysql%s", m_sSqlDSN.cstr()+3 );
22015 	m_sSqlDSN = sBuf;
22016 
22017 	return true;
22018 }
22019 
22020 #endif // USE_MYSQL
22021 
22022 /////////////////////////////////////////////////////////////////////////////
22023 // PGSQL SOURCE
22024 /////////////////////////////////////////////////////////////////////////////
22025 
22026 #if USE_PGSQL
22027 
CSphSourceParams_PgSQL()22028 CSphSourceParams_PgSQL::CSphSourceParams_PgSQL ()
22029 {
22030 	m_iRangeStep = 1024;
22031 	m_iPort = 5432;
22032 }
22033 
22034 
CSphSource_PgSQL(const char * sName)22035 CSphSource_PgSQL::CSphSource_PgSQL ( const char * sName )
22036 	: CSphSource_SQL	( sName )
22037 	, m_pPgResult		( NULL )
22038 	, m_iPgRows			( 0 )
22039 	, m_iPgRow			( 0 )
22040 {
22041 }
22042 
22043 
SqlIsError()22044 bool CSphSource_PgSQL::SqlIsError ()
22045 {
22046 	return ( m_iPgRow<m_iPgRows ); // if we're over, it's just last row
22047 }
22048 
22049 
SqlError()22050 const char * CSphSource_PgSQL::SqlError ()
22051 {
22052 	return PQerrorMessage ( m_tPgDriver );
22053 }
22054 
22055 
Setup(const CSphSourceParams_PgSQL & tParams)22056 bool CSphSource_PgSQL::Setup ( const CSphSourceParams_PgSQL & tParams )
22057 {
22058 	// checks
22059 	CSphSource_SQL::Setup ( tParams );
22060 
22061 	m_sPgClientEncoding = tParams.m_sClientEncoding;
22062 	if ( !m_sPgClientEncoding.cstr() )
22063 		m_sPgClientEncoding = "";
22064 
22065 	// build and store DSN for error reporting
22066 	char sBuf [ 1024 ];
22067 	snprintf ( sBuf, sizeof(sBuf), "pgsql%s", m_sSqlDSN.cstr()+3 );
22068 	m_sSqlDSN = sBuf;
22069 
22070 	return true;
22071 }
22072 
22073 
IterateStart(CSphString & sError)22074 bool CSphSource_PgSQL::IterateStart ( CSphString & sError )
22075 {
22076 	bool bResult = CSphSource_SQL::IterateStart ( sError );
22077 	if ( !bResult )
22078 		return false;
22079 
22080 	int iMaxIndex = 0;
22081 	for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
22082 		iMaxIndex = Max ( iMaxIndex, m_tSchema.GetAttr(i).m_iIndex );
22083 
22084 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
22085 		iMaxIndex = Max ( iMaxIndex, m_tSchema.m_dFields[i].m_iIndex );
22086 
22087 	m_dIsColumnBool.Resize ( iMaxIndex + 1 );
22088 	ARRAY_FOREACH ( i, m_dIsColumnBool )
22089 		m_dIsColumnBool[i] = false;
22090 
22091 	for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
22092 		m_dIsColumnBool [ m_tSchema.GetAttr(i).m_iIndex ] = ( m_tSchema.GetAttr(i).m_eAttrType==SPH_ATTR_BOOL );
22093 
22094 	return true;
22095 }
22096 
22097 
SqlConnect()22098 bool CSphSource_PgSQL::SqlConnect ()
22099 {
22100 	char sPort[64];
22101 	snprintf ( sPort, sizeof(sPort), "%d", m_tParams.m_iPort );
22102 	m_tPgDriver = PQsetdbLogin ( m_tParams.m_sHost.cstr(), sPort, NULL, NULL,
22103 		m_tParams.m_sDB.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr() );
22104 
22105 	if ( PQstatus ( m_tPgDriver )==CONNECTION_BAD )
22106 	{
22107 		if ( m_tParams.m_bPrintQueries )
22108 			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
22109 		return false;
22110 	}
22111 
22112 	// set client encoding
22113 	if ( !m_sPgClientEncoding.IsEmpty() )
22114 		if ( -1==PQsetClientEncoding ( m_tPgDriver, m_sPgClientEncoding.cstr() ) )
22115 	{
22116 		SqlDisconnect ();
22117 		if ( m_tParams.m_bPrintQueries )
22118 			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
22119 		return false;
22120 	}
22121 
22122 	if ( m_tParams.m_bPrintQueries )
22123 		fprintf ( stdout, "SQL-CONNECT: ok\n" );
22124 	return true;
22125 }
22126 
22127 
SqlDisconnect()22128 void CSphSource_PgSQL::SqlDisconnect ()
22129 {
22130 	if ( m_tParams.m_bPrintQueries )
22131 		fprintf ( stdout, "SQL-DISCONNECT\n" );
22132 
22133 	PQfinish ( m_tPgDriver );
22134 }
22135 
22136 
SqlQuery(const char * sQuery)22137 bool CSphSource_PgSQL::SqlQuery ( const char * sQuery )
22138 {
22139 	m_iPgRow = -1;
22140 	m_iPgRows = 0;
22141 
22142 	m_pPgResult = PQexec ( m_tPgDriver, sQuery );
22143 
22144 	ExecStatusType eRes = PQresultStatus ( m_pPgResult );
22145 	if ( ( eRes!=PGRES_COMMAND_OK ) && ( eRes!=PGRES_TUPLES_OK ) )
22146 	{
22147 		if ( m_tParams.m_bPrintQueries )
22148 			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
22149 		return false;
22150 	}
22151 	if ( m_tParams.m_bPrintQueries )
22152 		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
22153 
22154 	m_iPgRows = PQntuples ( m_pPgResult );
22155 	return true;
22156 }
22157 
22158 
SqlDismissResult()22159 void CSphSource_PgSQL::SqlDismissResult ()
22160 {
22161 	if ( !m_pPgResult )
22162 		return;
22163 
22164 	PQclear ( m_pPgResult );
22165 	m_pPgResult = NULL;
22166 }
22167 
22168 
SqlNumFields()22169 int CSphSource_PgSQL::SqlNumFields ()
22170 {
22171 	if ( !m_pPgResult )
22172 		return -1;
22173 
22174 	return PQnfields ( m_pPgResult );
22175 }
22176 
22177 
SqlColumn(int iIndex)22178 const char * CSphSource_PgSQL::SqlColumn ( int iIndex )
22179 {
22180 	if ( !m_pPgResult )
22181 		return NULL;
22182 
22183 	const char * szValue = PQgetvalue ( m_pPgResult, m_iPgRow, iIndex );
22184 	if ( m_dIsColumnBool.GetLength() && m_dIsColumnBool[iIndex] && szValue[0]=='t' && !szValue[1] )
22185 		return "1";
22186 
22187 	return szValue;
22188 }
22189 
22190 
SqlFieldName(int iIndex)22191 const char * CSphSource_PgSQL::SqlFieldName ( int iIndex )
22192 {
22193 	if ( !m_pPgResult )
22194 		return NULL;
22195 
22196 	return PQfname ( m_pPgResult, iIndex );
22197 }
22198 
22199 
SqlFetchRow()22200 bool CSphSource_PgSQL::SqlFetchRow ()
22201 {
22202 	if ( !m_pPgResult )
22203 		return false;
22204 	return ( ++m_iPgRow<m_iPgRows );
22205 }
22206 
22207 
SqlColumnLength(int)22208 DWORD CSphSource_PgSQL::SqlColumnLength ( int )
22209 {
22210 	return 0;
22211 }
22212 
22213 #endif // USE_PGSQL
22214 
22215 /////////////////////////////////////////////////////////////////////////////
22216 // XMLPIPE
22217 /////////////////////////////////////////////////////////////////////////////
22218 
CSphSource_XMLPipe(BYTE * dInitialBuf,int iBufLen,const char * sName)22219 CSphSource_XMLPipe::CSphSource_XMLPipe ( BYTE * dInitialBuf, int iBufLen, const char * sName )
22220 	: CSphSource	( sName )
22221 	, m_iBufferSize	( 1048576 )
22222 	, m_bEOF		( false )
22223 	, m_bWarned		( false )
22224 	, m_iInitialBufLen ( iBufLen )
22225 	, m_bHitsReady ( false )
22226 {
22227 	assert ( m_iBufferSize > iBufLen );
22228 
22229 	m_pTag = NULL;
22230 	m_iTagLength = 0;
22231 	m_pPipe = NULL;
22232 	m_pBuffer = NULL;
22233 	m_pBufferEnd = NULL;
22234 	m_sBuffer = new BYTE [m_iBufferSize];
22235 
22236 	if ( iBufLen )
22237 		memcpy ( m_sBuffer, dInitialBuf, iBufLen );
22238 }
22239 
22240 
~CSphSource_XMLPipe()22241 CSphSource_XMLPipe::~CSphSource_XMLPipe ()
22242 {
22243 	Disconnect ();
22244 	SafeDeleteArray ( m_sBuffer );
22245 }
22246 
22247 
Disconnect()22248 void CSphSource_XMLPipe::Disconnect ()
22249 {
22250 	m_iInitialBufLen = 0;
22251 
22252 	m_tHits.m_dData.Reset();
22253 	m_tSchema.Reset ();
22254 
22255 	if ( m_pPipe )
22256 	{
22257 		pclose ( m_pPipe );
22258 		m_pPipe = NULL;
22259 	}
22260 }
22261 
22262 
Setup(FILE * pPipe,const char * sCommand)22263 bool CSphSource_XMLPipe::Setup ( FILE * pPipe, const char * sCommand )
22264 {
22265 	assert ( sCommand );
22266 	m_pPipe = pPipe;
22267 	m_sCommand = sCommand;
22268 	return true;
22269 }
22270 
22271 
Connect(CSphString &)22272 bool CSphSource_XMLPipe::Connect ( CSphString & )
22273 {
22274 	m_bEOF = false;
22275 	m_bWarned = false;
22276 
22277 	m_tSchema.m_dFields.Reset ();
22278 	m_tSchema.m_dFields.Add ( CSphColumnInfo ( "title" ) );
22279 	m_tSchema.m_dFields.Add ( CSphColumnInfo ( "body" ) );
22280 
22281 	CSphColumnInfo tGid ( "gid", SPH_ATTR_INTEGER );
22282 	CSphColumnInfo tTs ( "ts", SPH_ATTR_TIMESTAMP );
22283 	m_tSchema.AddAttr ( tGid, true ); // all attributes are dynamic at indexing time
22284 	m_tSchema.AddAttr ( tTs, true ); // all attributes are dynamic at indexing time
22285 
22286 	m_tDocInfo.Reset ( m_tSchema.GetRowSize() );
22287 
22288 	m_pBuffer = m_iInitialBufLen > 0 ? m_sBuffer : NULL;
22289 	m_pBufferEnd = m_pBuffer ? m_pBuffer + m_iInitialBufLen : NULL;
22290 
22291 	char sBuf [ 1024 ];
22292 	snprintf ( sBuf, sizeof(sBuf), "xmlpipe(%s)", m_sCommand.cstr() );
22293 	m_tSchema.m_sName = sBuf;
22294 
22295 	m_tHits.m_dData.Reserve ( MAX_SOURCE_HITS );
22296 
22297 	return true;
22298 }
22299 
/// parse the next <document> from the pipe and tokenize its title and body into m_tHits
///
/// expected layout, in this order: <document>, <id>, optional <group> and <timestamp>,
/// <title>, <body>, </document>
///
/// returns true with m_tDocInfo.m_iDocID set to 0 when only whitespace is left before EOF;
/// returns true with m_bHitsReady set (if any hits were collected) on a parsed document;
/// returns false when a required tag is missing or the stream ends prematurely
bool CSphSource_XMLPipe::IterateDocument ( CSphString & sError )
{
	PROFILE ( src_xmlpipe );
	char sTitle [ 1024 ]; // FIXME?

	assert ( m_pPipe );
	assert ( m_pTokenizer );

	// start with an empty hit list for this document
	m_tHits.m_dData.Resize ( 0 );
	m_bHitsReady = false;

	/////////////////////////
	// parse document header
	/////////////////////////

	// check for eof
	// (docid 0 along with a true result is how the end of the stream is reported)
	if ( !SkipWhitespace() )
	{
		m_tDocInfo.m_iDocID = 0;
		return true;
	}

	// look for opening '<document>' tag
	SetTag ( "document" );
	if ( !SkipTag ( true, sError ) )
		return false;

	if ( !ScanInt ( "id", &m_tDocInfo.m_iDocID, sError ) )
		return false;
	m_tStats.m_iTotalDocuments++;

	// group and timestamp are optional and default to 1; note that SetAttr() is
	// unconditional, only the "uVal = 1" assignment depends on the ScanInt() result
	// NOTE(review): a failed optional ScanInt() still leaves its message in sError - confirm callers only read sError on a false result
	SphAttr_t uVal;
	if ( !ScanInt ( "group", &uVal, sError ) ) uVal = 1; m_tDocInfo.SetAttr ( m_tSchema.GetAttr(0).m_tLocator, uVal );
	if ( !ScanInt ( "timestamp", &uVal, sError ) ) uVal = 1; m_tDocInfo.SetAttr ( m_tSchema.GetAttr(1).m_tLocator, uVal );

	// the title is copied out of the read buffer (at most sizeof(sTitle)-1 bytes)
	if ( !ScanStr ( "title", sTitle, sizeof(sTitle), sError ) )
		return false;

	// index title
	{
		int iLen = (int)strlen ( sTitle );
		// presumably (field 0, position 1) - confirm against HITMAN
		Hitpos_t iPos = HITMAN::Create ( 0, 1 );
		BYTE * sWord;

		m_pTokenizer->SetBuffer ( (BYTE*)sTitle, iLen );
		while ( ( sWord = m_pTokenizer->GetToken() )!=NULL && m_tHits.Length()<MAX_SOURCE_HITS )
		{
			m_tHits.AddHit ( m_tDocInfo.m_iDocID, m_pDict->GetWordID ( sWord ), iPos );
			HITMAN::AddPos ( &iPos, 1 );
		}
	}

	// warn if the title got clipped at MAX_SOURCE_HITS
	CheckHitsCount ( "title" );

	SetTag ( "body" );
	if ( !SkipTag ( true, sError ) )
		return false;

	m_iWordPos = 0;

	/////////////////////////////
	// parse body chunk by chunk
	/////////////////////////////

	// check for body tag end in this buffer
	const char * szBodyEnd = "</body>";

	bool bFirstPass = true;
	bool bBodyEnd = false;
	BYTE * p = m_pBuffer;

	// at most two scans: one over the data already buffered, and, if the closing tag
	// was not found, one more over the buffer after a single refill
	while ( !bBodyEnd )
	{
		p = m_pBuffer;
		while ( p<m_pBufferEnd && !bBodyEnd )
		{
			// try to match "</body>" starting at p
			BYTE * pBufTemp = p;
			BYTE * pEndTemp = (BYTE *)szBodyEnd;
			while ( pBufTemp < m_pBufferEnd && *pEndTemp && *pBufTemp==*pEndTemp )
			{
				++pBufTemp;
				++pEndTemp;
			}

			// the whole pattern matched; p is left pointing at its '<'
			if ( !*pEndTemp )
				bBodyEnd = true;
			else
				p++;
		}

		if ( !bFirstPass )
			break;

		bFirstPass = false;

		if ( !bBodyEnd )
			UpdateBuffer ();
	}

	// still no closing tag after the refill; complain (once) and index what we have
	if ( !bBodyEnd )
	{
		if ( !m_bWarned )
		{
			sphWarn ( "xmlpipe: encountered body larger than %d bytes while scanning docid=" DOCID_FMT " body", m_iBufferSize, m_tDocInfo.m_iDocID );
			m_bWarned = true;
		}
	}

	// the body text is everything from the read cursor up to p
	m_pTokenizer->SetBuffer ( m_pBuffer, p-m_pBuffer );

	// tokenize
	// (body hits are created with field 1 and a running m_iWordPos)
	BYTE * sWord;
	while ( ( sWord = m_pTokenizer->GetToken () )!=NULL && m_tHits.Length()<MAX_SOURCE_HITS )
		m_tHits.AddHit ( m_tDocInfo.m_iDocID, m_pDict->GetWordID ( sWord ), HITMAN::Create ( 1, ++m_iWordPos ) );

	// warn if the body got clipped at MAX_SOURCE_HITS
	CheckHitsCount ( "body" );

	// consume the indexed text
	m_pBuffer = p;

	SetTag ( "body" );

	// some tag was found
	if ( bBodyEnd )
	{
		// let's check if it's '</body>' which is the only allowed tag at this point
		if ( !SkipTag ( false, sError ) )
			return false;
	} else
	{
		// search for '</body>' tag
		// (the rest of the oversized body is skipped, not indexed)
		bool bFound = false;

		while ( !bFound )
		{
			// fast-forward to the next tag candidate
			while ( m_pBuffer < m_pBufferEnd && *m_pBuffer!='<' )
				++m_pBuffer;

			BYTE * pBufferTmp = m_pBuffer;
			if ( m_pBuffer < m_pBufferEnd )
			{
				if ( !SkipTag ( false, sError ) )
				{
					if ( m_bEOF )
						return false;
					else
					{
						// not '</body>'; step over this '<' so that the scan makes progress
						if ( m_pBuffer==pBufferTmp )
							m_pBuffer = pBufferTmp + 1;
					}
				} else
					bFound = true;
			} else
				// buffer exhausted; refill, and bail out if nothing more could be read
				if ( !UpdateBuffer () )
					return false;
		}
	}


	// let's check if it's '</document>' which is the only allowed tag at this point
	SetTag ( "document" );
	if ( !SkipTag ( false, sError ) )
		return false;

	// if it was all correct, we have to flush our hits
	m_bHitsReady = m_tHits.Length()>0;
	return true;
}
22467 
22468 
IterateHits(CSphString &)22469 ISphHits * CSphSource_XMLPipe::IterateHits ( CSphString & )
22470 {
22471 	if ( !m_bHitsReady )
22472 		return NULL;
22473 
22474 	m_bHitsReady = false;
22475 
22476 	return &m_tHits;
22477 }
22478 
22479 
IterateFieldMVAStart(int)22480 SphRange_t	CSphSource_XMLPipe::IterateFieldMVAStart ( int )
22481 {
22482 	SphRange_t tRange;
22483 	tRange.m_iStart = tRange.m_iLength = 0;
22484 	return tRange;
22485 }
22486 
22487 
SetTag(const char * sTag)22488 void CSphSource_XMLPipe::SetTag ( const char * sTag )
22489 {
22490 	m_pTag = sTag;
22491 	m_iTagLength = (int)strlen ( sTag );
22492 }
22493 
22494 
UpdateBuffer()22495 bool CSphSource_XMLPipe::UpdateBuffer ()
22496 {
22497 	assert ( m_pBuffer!=m_sBuffer );
22498 
22499 	int iLeft = Max ( m_pBufferEnd-m_pBuffer, 0 );
22500 	if ( iLeft>0 )
22501 		memmove ( m_sBuffer, m_pBuffer, iLeft );
22502 
22503 	size_t iLen = fread ( &m_sBuffer [ iLeft ], 1, m_iBufferSize-iLeft, m_pPipe );
22504 	m_tStats.m_iTotalBytes += iLen;
22505 
22506 	m_pBuffer = m_sBuffer;
22507 	m_pBufferEnd = m_pBuffer+iLeft+iLen;
22508 
22509 	return ( iLen!=0 );
22510 }
22511 
22512 
SkipWhitespace()22513 bool CSphSource_XMLPipe::SkipWhitespace ()
22514 {
22515 	for ( ;; )
22516 	{
22517 		// suck in some data if needed
22518 		if ( m_pBuffer>=m_pBufferEnd )
22519 			if ( !UpdateBuffer() )
22520 				return false;
22521 
22522 		// skip whitespace
22523 		while ( (m_pBuffer<m_pBufferEnd) && isspace ( *m_pBuffer ) )
22524 			m_pBuffer++;
22525 
22526 		// did we anything non-whitspace?
22527 		if ( m_pBuffer<m_pBufferEnd )
22528 			break;
22529 	}
22530 
22531 	assert ( m_pBuffer<m_pBufferEnd );
22532 	return true;
22533 }
22534 
22535 
CheckTag(bool bOpen,CSphString & sError)22536 bool CSphSource_XMLPipe::CheckTag ( bool bOpen, CSphString & sError )
22537 {
22538 	int iAdd = bOpen ? 2 : 3;
22539 
22540 	// if case the tag is at buffer boundary, try to suck in some more data
22541 	if ( m_pBufferEnd-m_pBuffer < m_iTagLength+iAdd )
22542 		UpdateBuffer ();
22543 
22544 	if ( m_pBufferEnd-m_pBuffer < m_iTagLength+iAdd )
22545 	{
22546 		m_bEOF = true;
22547 		sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got EOF",
22548 			bOpen ? "" : "/", m_pTag );
22549 		return false;
22550 	}
22551 
22552 	// check tag
22553 	bool bOk = bOpen
22554 		? ( ( m_pBuffer[0]=='<' )
22555 			&& ( m_pBuffer[m_iTagLength+1]=='>' )
22556 			&& strncmp ( (char*)(m_pBuffer+1), m_pTag, m_iTagLength )==0 )
22557 		: ( ( m_pBuffer[0]=='<' )
22558 			&& ( m_pBuffer[1]=='/' )
22559 			&& ( m_pBuffer[m_iTagLength+2]=='>' )
22560 			&& strncmp ( (char*)(m_pBuffer+2), m_pTag, m_iTagLength )==0 );
22561 	if ( !bOk )
22562 	{
22563 		char sGot[64];
22564 		int iCopy = Min ( m_pBufferEnd-m_pBuffer, (int)sizeof(sGot)-1 );
22565 
22566 		strncpy ( sGot, (char*)m_pBuffer, iCopy );
22567 		sGot [ iCopy ] = '\0';
22568 
22569 		sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got '%s'",
22570 			bOpen ? "" : "/", m_pTag, sGot );
22571 		return false;
22572 	}
22573 
22574 	// got tag
22575 	m_pBuffer += iAdd+m_iTagLength;
22576 	assert ( m_pBuffer<=m_pBufferEnd );
22577 	return true;
22578 }
22579 
22580 
SkipTag(bool bOpen,CSphString & sError)22581 bool CSphSource_XMLPipe::SkipTag ( bool bOpen, CSphString & sError )
22582 {
22583 	if ( !SkipWhitespace() )
22584 	{
22585 		m_bEOF = true;
22586 		sError.SetSprintf ( "xmlpipe: expected '<%s%s>', got EOF",
22587 			bOpen ? "" : "/", m_pTag );
22588 		return false;
22589 	}
22590 
22591 	return CheckTag ( bOpen, sError );
22592 }
22593 
22594 
ScanInt(const char * sTag,DWORD * pRes,CSphString & sError)22595 bool CSphSource_XMLPipe::ScanInt ( const char * sTag, DWORD * pRes, CSphString & sError )
22596 {
22597 	uint64_t uRes;
22598 	if ( !ScanInt ( sTag, &uRes, sError ) )
22599 		return false;
22600 
22601 	(*pRes) = (DWORD)uRes;
22602 	return true;
22603 }
22604 
22605 
ScanInt(const char * sTag,uint64_t * pRes,CSphString & sError)22606 bool CSphSource_XMLPipe::ScanInt ( const char * sTag, uint64_t * pRes, CSphString & sError )
22607 {
22608 	assert ( sTag );
22609 	assert ( pRes );
22610 
22611 	// scan for <sTag>
22612 	SetTag ( sTag );
22613 	if ( !SkipTag ( true, sError ) )
22614 		return false;
22615 
22616 	if ( !SkipWhitespace() )
22617 	{
22618 		sError.SetSprintf ( "xmlpipe: expected <%s> data, got EOF", m_pTag );
22619 		return false;
22620 	}
22621 
22622 	*pRes = 0;
22623 	while ( m_pBuffer<m_pBufferEnd )
22624 	{
22625 		// FIXME! could check for overflow
22626 		while ( isdigit ( *m_pBuffer ) && m_pBuffer<m_pBufferEnd )
22627 			(*pRes) = 10*(*pRes) + (int)( (*m_pBuffer++)-'0' );
22628 
22629 		if ( m_pBuffer<m_pBufferEnd )
22630 			break;
22631 		else
22632 			UpdateBuffer ();
22633 	}
22634 
22635 	// scan for </sTag>
22636 	if ( !SkipTag ( false, sError ) )
22637 		return false;
22638 
22639 	return true;
22640 }
22641 
22642 
ScanStr(const char * sTag,char * pRes,int iMaxLength,CSphString & sError)22643 bool CSphSource_XMLPipe::ScanStr ( const char * sTag, char * pRes, int iMaxLength, CSphString & sError )
22644 {
22645 	assert ( sTag );
22646 	assert ( pRes );
22647 
22648 	char * pEnd = pRes+iMaxLength-1;
22649 
22650 	// scan for <sTag>
22651 	SetTag ( sTag );
22652 	if ( !SkipTag ( true, sError ) )
22653 		return false;
22654 
22655 	if ( !SkipWhitespace() )
22656 	{
22657 		sError.SetSprintf ( "xmlpipe: expected <%s> data, got EOF", m_pTag );
22658 		return false;
22659 	}
22660 
22661 	while ( m_pBuffer<m_pBufferEnd )
22662 	{
22663 		while ( (*m_pBuffer)!='<' && pRes<pEnd && m_pBuffer<m_pBufferEnd )
22664 			*pRes++ = *m_pBuffer++;
22665 
22666 		if ( m_pBuffer<m_pBufferEnd )
22667 			break;
22668 		else
22669 			UpdateBuffer ();
22670 	}
22671 	*pRes++ = '\0';
22672 
22673 	// scan for </sTag>
22674 	if ( !SkipTag ( false, sError ) )
22675 		return false;
22676 
22677 	return true;
22678 }
22679 
22680 
CheckHitsCount(const char * sField)22681 void CSphSource_XMLPipe::CheckHitsCount ( const char * sField )
22682 {
22683 	if ( m_tHits.Length()>=MAX_SOURCE_HITS && m_pTokenizer->GetTokenEnd()!=m_pTokenizer->GetBufferEnd() )
22684 		sphWarn ( "xmlpipe: collected hits larger than %d(MAX_SOURCE_HITS) while scanning docid=" DOCID_FMT " %s - clipped!!!", MAX_SOURCE_HITS, m_tDocInfo.m_iDocID, sField );
22685 }
22686 
22687 
22688 /////////////////////////////////////////////////////////////////////////////
22689 // XMLPIPE (v2)
22690 /////////////////////////////////////////////////////////////////////////////
22691 
22692 #if USE_LIBEXPAT || USE_LIBXML
22693 
/// XML pipe source implementation (v2)
///
/// parses the incoming stream with either expat or libxml (whichever is compiled in);
/// the parser reports elements and character data back through StartElement(),
/// EndElement() and Characters()
class CSphSource_XMLPipe2 : public CSphSource_Document
{
public:
	/// the first iBufLen bytes of dInitialBuf are copied into the read buffer;
	/// iFieldBufferMax is raised to at least 65536
					CSphSource_XMLPipe2 ( BYTE * dInitialBuf, int iBufLen, const char * sName, int iFieldBufferMax, bool bFixupUTF8 );
					~CSphSource_XMLPipe2 ();

	bool			Setup ( FILE * pPipe, const CSphConfigSection & hSource );			///< memorize the command
	virtual bool	Connect ( CSphString & sError );			///< run the command and open the pipe
	virtual void	Disconnect ();								///< close the pipe

	virtual bool	IterateStart ( CSphString & ) { m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength(); return true; }	///< Connect() starts getting documents automatically, so this one is empty
	virtual BYTE **	NextDocument ( CSphString & sError );			///< parse incoming chunk and emit some hits

	virtual bool	HasAttrsConfigured ()							{ return true; }	///< xmlpipe always has some attrs for now
	virtual bool	IterateMultivaluedStart ( int, CSphString & )	{ return false; }	///< always reports false
	virtual bool	IterateMultivaluedNext ()						{ return false; }	///< always reports false
	virtual bool	IterateKillListStart ( CSphString & );
	virtual bool	IterateKillListNext ( SphDocID_t & tDocId );


	// parser event handlers (public because they get invoked from the free callback functions)
	void			StartElement ( const char * szName, const char ** pAttrs );
	void			EndElement ( const char * pName );
	void			Characters ( const char * pCharacters, int iLen );

#if USE_LIBXML
	int				ReadBuffer ( BYTE * pBuffer, int iLen );	///< invoked from the xmlReadBuffers() callback
	void			ProcessNode ( xmlTextReaderPtr pReader );
#endif

	/// store a printf-style error message; only the first error is kept
	void			Error ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );

private:
	/// a single parsed document
	struct Document_t
	{
		SphDocID_t				m_iDocID;
		CSphVector<CSphString>	m_dFields;
		CSphVector<CSphString>	m_dAttrs;
	};

	Document_t *				m_pCurDocument;
	CSphVector<Document_t *>	m_dParsedDocuments;	///< entries are deleted in the destructor

	FILE *			m_pPipe;			///< incoming stream
	CSphString		m_sCommand;			///< my command
	CSphString		m_sError;			///< first error reported through Error(), if any
	CSphVector<CSphString> m_dDefaultAttrs;
	CSphVector<CSphString> m_dInvalid;
	CSphVector<CSphString> m_dWarned;
	int				m_iElementDepth;

	BYTE *			m_pBuffer;			///< read buffer, m_iBufferSize bytes, allocated in the constructor
	int				m_iBufferSize;		///< read buffer size, in bytes

	CSphVector<BYTE*>m_dFieldPtrs;
	bool			m_bRemoveParsed;

	// parser state flags; all of them start out false
	bool			m_bInDocset;
	bool			m_bInSchema;
	bool			m_bInDocument;
	bool			m_bInKillList;
	bool			m_bInId;
	bool			m_bInIgnoredTag;
	bool			m_bFirstTagAfterDocset;

	int				m_iKillListIterator;
	CSphVector < SphDocID_t > m_dKillList;

	int				m_iMVA;
	int				m_iMVAIterator;
	CSphVector < CSphVector <DWORD> > m_dFieldMVAs;
	CSphVector < int > m_dAttrToMVA;

	int				m_iCurField;		///< starts out as -1
	int				m_iCurAttr;			///< starts out as -1

#if USE_LIBEXPAT
	XML_Parser		m_pParser;			///< expat parser; freed in Disconnect()
#endif

#if USE_LIBXML
	xmlTextReaderPtr m_pParser;			///< libxml reader; freed in Disconnect()

	BYTE *			m_pBufferPtr;
	BYTE *			m_pBufferEnd;
	bool			m_bPassedBufferEnd;
	CSphVector <const char *> m_dAttrs;
#endif

	int				m_iInitialBufSize;	///< amount of data pre-copied into m_pBuffer by the constructor; zeroed in Disconnect()

	int				m_iFieldBufferMax;	///< field buffer size, in bytes (at least 65536)
	BYTE * 			m_pFieldBuffer;		///< field buffer, m_iFieldBufferMax bytes, allocated in the constructor
	int				m_iFieldBufferLen;

	bool			m_bFixupUTF8;		///< whether to replace invalid utf-8 codepoints with spaces
	int				m_iReparseStart;	///< utf-8 fixerupper might need to postpone a few bytes, starting at this offset
	int				m_iReparseLen;		///< and this much bytes (under 4)

	const char *	DecorateMessage ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
	const char *	DecorateMessageVA ( const char * sTemplate, va_list ap );

	void			ConfigureAttrs ( const CSphVariant * pHead, ESphAttr eAttrType );
	void			ConfigureFields ( const CSphVariant * pHead );
	void			AddFieldToSchema ( const char * szName );
	void			UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment );

#if USE_LIBEXPAT
	bool			ParseNextChunk ( int iBufferLen, CSphString & sError );
#endif

#if USE_LIBXML
	int				ParseNextChunk ( CSphString & sError );
#endif

	/// report a <sphinx:document> element found in a place where it is not allowed
	void DocumentError ( const char * sWhere )
	{
		Error ( "malformed source, <sphinx:document> found inside %s", sWhere );

		// Ideally I'd like to display a notice on the next line that
		// would say where exactly it's allowed. E.g.:
		//
		// <sphinx:document> must be contained in <sphinx:docset>
	}
};
22819 
22820 
22821 #if USE_LIBEXPAT
22822 // callbacks
xmlStartElement(void * user_data,const XML_Char * name,const XML_Char ** attrs)22823 static void XMLCALL xmlStartElement ( void * user_data, const XML_Char * name, const XML_Char ** attrs )
22824 {
22825 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
22826 	pSource->StartElement ( name, attrs );
22827 }
22828 
22829 
xmlEndElement(void * user_data,const XML_Char * name)22830 static void XMLCALL xmlEndElement ( void * user_data, const XML_Char * name )
22831 {
22832 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
22833 	pSource->EndElement ( name );
22834 }
22835 
22836 
xmlCharacters(void * user_data,const XML_Char * ch,int len)22837 static void XMLCALL xmlCharacters ( void * user_data, const XML_Char * ch, int len )
22838 {
22839 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
22840 	pSource->Characters ( ch, len );
22841 }
22842 
22843 #if USE_LIBICONV
xmlUnknownEncoding(void *,const XML_Char * name,XML_Encoding * info)22844 static int XMLCALL xmlUnknownEncoding ( void *, const XML_Char * name, XML_Encoding * info )
22845 {
22846 	iconv_t pDesc = iconv_open ( "UTF-16", name );
22847 	if ( !pDesc )
22848 		return XML_STATUS_ERROR;
22849 
22850 	for ( size_t i = 0; i < 256; i++ )
22851 	{
22852 		char cIn = (char) i;
22853 		char dOut[4];
22854 		memset ( dOut, 0, sizeof ( dOut ) );
22855 #if ICONV_INBUF_CONST
22856 		const char * pInbuf = &cIn;
22857 #else
22858 		char * pInbuf = &cIn;
22859 #endif
22860 		char * pOutbuf = dOut;
22861 		size_t iInBytesLeft = 1;
22862 		size_t iOutBytesLeft = 4;
22863 
22864 		if ( iconv ( pDesc, &pInbuf, &iInBytesLeft, &pOutbuf, &iOutBytesLeft )!=size_t(-1) )
22865 			info->map[i] = int ( BYTE ( dOut[0] ) ) << 8 | int ( BYTE ( dOut[1] ) );
22866 		else
22867 			info->map[i] = 0;
22868 	}
22869 
22870 	iconv_close ( pDesc );
22871 
22872 	return XML_STATUS_OK;
22873 }
22874 #endif
22875 
22876 #endif
22877 
22878 #if USE_LIBXML
xmlReadBuffers(void * context,char * buffer,int len)22879 int	xmlReadBuffers ( void * context, char * buffer, int len )
22880 {
22881 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) context;
22882 	return pSource->ReadBuffer ( (BYTE*)buffer, len );
22883 }
22884 
xmlErrorHandler(void * arg,const char * msg,xmlParserSeverities severity,xmlTextReaderLocatorPtr locator)22885 void xmlErrorHandler ( void * arg, const char * msg, xmlParserSeverities severity, xmlTextReaderLocatorPtr locator )
22886 {
22887 	if ( severity==XML_PARSER_SEVERITY_ERROR )
22888 	{
22889 		int iLine = xmlTextReaderLocatorLineNumber ( locator );
22890 		CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) arg;
22891 		pSource->Error ( "%s (line=%d)", msg, iLine );
22892 	}
22893 }
22894 #endif
22895 
22896 
/// Create an xmlpipe2 source named sName.
/// The first iBufLen bytes of dInitialBuf are copied to the front of the newly allocated
/// read buffer (m_iBufferSize bytes). The field buffer is allocated with at least 65536 bytes,
/// or iFieldBufferMax if that is larger. bFixupUTF8 is stored in m_bFixupUTF8.
CSphSource_XMLPipe2::CSphSource_XMLPipe2 ( BYTE * dInitialBuf, int iBufLen, const char * sName, int iFieldBufferMax, bool bFixupUTF8 )
	: CSphSource_Document ( sName )
	, m_pCurDocument	( NULL )
	, m_pPipe			( NULL )
	, m_iElementDepth	( 0 )
	, m_iBufferSize		( 1048576 )
	, m_bRemoveParsed	( false )
	, m_bInDocset		( false )
	, m_bInSchema		( false )
	, m_bInDocument		( false )
	, m_bInKillList		( false )
	, m_bInId			( false )
	, m_bInIgnoredTag	( false )
	, m_bFirstTagAfterDocset	( false )
	, m_iKillListIterator		( 0 )
	, m_iMVA			( 0 )
	, m_iMVAIterator	( 0 )
	, m_iCurField		( -1 )
	, m_iCurAttr		( -1 )
	, m_pParser			( NULL )
#if USE_LIBXML
	, m_pBufferPtr			( NULL )
	, m_pBufferEnd			( NULL )
	, m_bPassedBufferEnd	( false )
#endif
	, m_iInitialBufSize	( iBufLen )
	, m_iFieldBufferLen	( 0 )
	, m_bFixupUTF8		( bFixupUTF8 )
	, m_iReparseStart	( 0 )
	, m_iReparseLen		( 0 )
{
	// the initial data must fit into the read buffer; note this is only checked by assert
	assert ( m_iBufferSize > iBufLen );

	m_pBuffer = new BYTE [m_iBufferSize];
	m_iFieldBufferMax = Max ( iFieldBufferMax, 65536 );
	m_pFieldBuffer = new BYTE [ m_iFieldBufferMax ];

	// preload the caller-supplied initial data into the read buffer
	if ( iBufLen )
		memcpy ( m_pBuffer, dInitialBuf, iBufLen );

	// same value as already set in the initializer list above
	m_iInitialBufSize = iBufLen;
}
22939 
22940 
~CSphSource_XMLPipe2()22941 CSphSource_XMLPipe2::~CSphSource_XMLPipe2 ()
22942 {
22943 	Disconnect ();
22944 	SafeDeleteArray ( m_pBuffer );
22945 	SafeDeleteArray ( m_pFieldBuffer );
22946 	ARRAY_FOREACH ( i, m_dParsedDocuments )
22947 		SafeDelete ( m_dParsedDocuments[i] );
22948 }
22949 
22950 
Disconnect()22951 void CSphSource_XMLPipe2::Disconnect ()
22952 {
22953 	if ( m_pPipe )
22954 	{
22955 		pclose ( m_pPipe );
22956 		m_pPipe = NULL;
22957 	}
22958 
22959 #if USE_LIBEXPAT
22960 	if ( m_pParser )
22961 	{
22962 		XML_ParserFree ( m_pParser );
22963 		m_pParser = NULL;
22964 	}
22965 #endif
22966 
22967 #if USE_LIBXML
22968 	if ( m_pParser )
22969 	{
22970 		xmlFreeTextReader ( m_pParser );
22971 		m_pParser = NULL;
22972 	}
22973 #endif
22974 
22975 	m_tHits.m_dData.Reset();
22976 
22977 	m_iInitialBufSize = 0;
22978 }
22979 
22980 
Error(const char * sTemplate,...)22981 void CSphSource_XMLPipe2::Error ( const char * sTemplate, ... )
22982 {
22983 	if ( !m_sError.IsEmpty() )
22984 		return;
22985 
22986 	va_list ap;
22987 	va_start ( ap, sTemplate );
22988 	m_sError = DecorateMessageVA ( sTemplate, ap );
22989 	va_end ( ap );
22990 }
22991 
22992 
DecorateMessage(const char * sTemplate,...)22993 const char * CSphSource_XMLPipe2::DecorateMessage ( const char * sTemplate, ... )
22994 {
22995 	va_list ap;
22996 	va_start ( ap, sTemplate );
22997 	const char * sRes = DecorateMessageVA ( sTemplate, ap );
22998 	va_end ( ap );
22999 	return sRes;
23000 }
23001 
23002 
DecorateMessageVA(const char * sTemplate,va_list ap)23003 const char * CSphSource_XMLPipe2::DecorateMessageVA ( const char * sTemplate, va_list ap )
23004 {
23005 	static char sBuf[1024];
23006 
23007 	snprintf ( sBuf, sizeof(sBuf), "source '%s': ", m_tSchema.m_sName.cstr() );
23008 	int iBufLen = strlen ( sBuf );
23009 	int iLeft = sizeof(sBuf) - iBufLen;
23010 	char * szBufStart = sBuf + iBufLen;
23011 
23012 	vsnprintf ( szBufStart, iLeft, sTemplate, ap );
23013 	iBufLen = strlen ( sBuf );
23014 	iLeft = sizeof(sBuf) - iBufLen;
23015 	szBufStart = sBuf + iBufLen;
23016 
23017 #if USE_LIBEXPAT
23018 	if ( m_pParser )
23019 	{
23020 		SphDocID_t uFailedID = 0;
23021 		if ( m_dParsedDocuments.GetLength() )
23022 			uFailedID = m_dParsedDocuments.Last()->m_iDocID;
23023 
23024 		snprintf ( szBufStart, iLeft, " (line=%d, pos=%d, docid=" DOCID_FMT ")",
23025 			(int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
23026 			uFailedID );
23027 	}
23028 #endif
23029 
23030 #if USE_LIBXML
23031 	if ( m_pParser )
23032 	{
23033 		SphDocID_t uFailedID = 0;
23034 		if ( m_dParsedDocuments.GetLength() )
23035 			uFailedID = m_dParsedDocuments.Last()->m_iDocID;
23036 
23037 		snprintf ( szBufStart, iLeft, " (docid=" DOCID_FMT ")", uFailedID );
23038 	}
23039 #endif
23040 
23041 	return sBuf;
23042 }
23043 
23044 
AddFieldToSchema(const char * szName)23045 void CSphSource_XMLPipe2::AddFieldToSchema ( const char * szName )
23046 {
23047 	CSphColumnInfo tCol ( szName );
23048 	tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), m_pDict && m_pDict->GetSettings().m_bWordDict );
23049 	m_tSchema.m_dFields.Add ( tCol );
23050 }
23051 
23052 
ConfigureAttrs(const CSphVariant * pHead,ESphAttr eAttrType)23053 void CSphSource_XMLPipe2::ConfigureAttrs ( const CSphVariant * pHead, ESphAttr eAttrType )
23054 {
23055 	for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
23056 	{
23057 		CSphColumnInfo tCol ( pCur->cstr(), eAttrType );
23058 		char * pColon = strchr ( const_cast<char*> ( tCol.m_sName.cstr() ), ':' );
23059 		if ( pColon )
23060 		{
23061 			*pColon = '\0';
23062 
23063 			if ( eAttrType==SPH_ATTR_INTEGER )
23064 			{
23065 				int iBits = strtol ( pColon+1, NULL, 10 );
23066 				if ( iBits<=0 || iBits>ROWITEM_BITS )
23067 				{
23068 					sphWarn ( "%s", DecorateMessage ( "attribute '%s': invalid bitcount=%d (bitcount ignored)", tCol.m_sName.cstr(), iBits ) );
23069 					iBits = -1;
23070 				}
23071 
23072 				tCol.m_tLocator.m_iBitCount = iBits;
23073 			} else
23074 				sphWarn ( "%s", DecorateMessage ( "attribute '%s': bitcount is only supported for integer types", tCol.m_sName.cstr() ) );
23075 		}
23076 
23077 		tCol.m_iIndex = m_tSchema.GetAttrsCount ();
23078 
23079 		if ( eAttrType==SPH_ATTR_UINT32SET || eAttrType==SPH_ATTR_INT64SET )
23080 		{
23081 			tCol.m_eAttrType = eAttrType;
23082 			tCol.m_eSrc = SPH_ATTRSRC_FIELD;
23083 		}
23084 
23085 		m_tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
23086 	}
23087 }
23088 
23089 
ConfigureFields(const CSphVariant * pHead)23090 void CSphSource_XMLPipe2::ConfigureFields ( const CSphVariant * pHead )
23091 {
23092 	for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
23093 	{
23094 		CSphString sFieldName = pCur->cstr ();
23095 
23096 		bool bFound = false;
23097 		for ( int i = 0; i < m_tSchema.m_dFields.GetLength () && !bFound; i++ )
23098 			bFound = m_tSchema.m_dFields[i].m_sName==sFieldName;
23099 
23100 		if ( bFound )
23101 			sphWarn ( "%s", DecorateMessage ( "duplicate field '%s'", sFieldName.cstr () ) );
23102 		else
23103 			AddFieldToSchema ( sFieldName.cstr () );
23104 	}
23105 }
23106 
23107 
Setup(FILE * pPipe,const CSphConfigSection & hSource)23108 bool CSphSource_XMLPipe2::Setup ( FILE * pPipe, const CSphConfigSection & hSource )
23109 {
23110 	m_pPipe = pPipe;
23111 
23112 	m_tSchema.Reset ();
23113 
23114 	m_sCommand = hSource["xmlpipe_command"].cstr ();
23115 
23116 	ConfigureAttrs ( hSource("xmlpipe_attr_uint"),			SPH_ATTR_INTEGER );
23117 	ConfigureAttrs ( hSource("xmlpipe_attr_timestamp"),		SPH_ATTR_TIMESTAMP );
23118 	ConfigureAttrs ( hSource("xmlpipe_attr_str2ordinal"),	SPH_ATTR_ORDINAL );
23119 	ConfigureAttrs ( hSource("xmlpipe_attr_bool"),			SPH_ATTR_BOOL );
23120 	ConfigureAttrs ( hSource("xmlpipe_attr_float"),			SPH_ATTR_FLOAT );
23121 	ConfigureAttrs ( hSource("xmlpipe_attr_bigint"),		SPH_ATTR_BIGINT );
23122 	ConfigureAttrs ( hSource("xmlpipe_attr_multi"),			SPH_ATTR_UINT32SET );
23123 	ConfigureAttrs ( hSource("xmlpipe_attr_multi_64"),		SPH_ATTR_INT64SET );
23124 	ConfigureAttrs ( hSource("xmlpipe_attr_string"),		SPH_ATTR_STRING );
23125 	ConfigureAttrs ( hSource("xmlpipe_attr_wordcount"),		SPH_ATTR_WORDCOUNT );
23126 	ConfigureAttrs ( hSource("xmlpipe_field_string"),		SPH_ATTR_STRING );
23127 	ConfigureAttrs ( hSource("xmlpipe_field_wordcount"),	SPH_ATTR_WORDCOUNT );
23128 
23129 	m_tDocInfo.Reset ( m_tSchema.GetRowSize () );
23130 
23131 	ConfigureFields ( hSource("xmlpipe_field") );
23132 	ConfigureFields ( hSource("xmlpipe_field_string") );
23133 	ConfigureFields ( hSource("xmlpipe_field_wordcount") );
23134 
23135 	m_dStrAttrs.Resize ( m_tSchema.GetAttrsCount() );
23136 
23137 	return true;
23138 }
23139 
23140 
/// Prepare the source for document iteration.
/// Refreshes per-field wordpart settings, creates the XML parser (expat or libxml2, depending
/// on the build), resets all parsing state, parses the first chunk of input, and then builds
/// the attribute-to-MVA index map from the schema as it stands after that first parse.
/// Returns false and fills sError if the parser cannot be created or the first chunk fails to parse.
bool CSphSource_XMLPipe2::Connect ( CSphString & sError )
{
	// recompute wordpart mode for every field using the dictionary settings known at this point
	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
	{
		CSphColumnInfo & tCol = m_tSchema.m_dFields[i];
		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), m_pDict && m_pDict->GetSettings().m_bWordDict );
	}

#if USE_LIBEXPAT
	m_pParser = XML_ParserCreate(NULL);
	if ( !m_pParser )
	{
		sError.SetSprintf ( "xmlpipe: failed to create XML parser" );
		return false;
	}

	// route expat callbacks back to this object
	XML_SetUserData ( m_pParser, this );
	XML_SetElementHandler ( m_pParser, xmlStartElement, xmlEndElement );
	XML_SetCharacterDataHandler ( m_pParser, xmlCharacters );

#if USE_LIBICONV
	XML_SetUnknownEncodingHandler ( m_pParser, xmlUnknownEncoding, NULL );
#endif

#endif

#if USE_LIBXML
	// the read buffer initially holds only the data preloaded by the constructor
	m_pBufferPtr = m_pBuffer;
	m_pBufferEnd = m_pBuffer + m_iInitialBufSize;
	m_bPassedBufferEnd = false;

	m_dAttrs.Reserve ( 16 );
	m_dAttrs.Resize ( 0 );

	// libxml2 pulls its input through xmlReadBuffers(), with this object as the context
	m_pParser = xmlReaderForIO ( (xmlInputReadCallback)xmlReadBuffers, NULL, this, NULL, NULL, 0 );
	if ( !m_pParser )
	{
		sError.SetSprintf ( "xmlpipe: failed to create XML parser" );
		return false;
	}

	xmlTextReaderSetErrorHandler ( m_pParser, xmlErrorHandler, this );
#endif

	m_dKillList.Reserve ( 1024 );
	m_dKillList.Resize ( 0 );

	// reset the element-tracking state used by StartElement/EndElement/Characters
	m_bRemoveParsed = false;
	m_bInDocset = false;
	m_bInSchema = false;
	m_bInDocument = false;
	m_bInKillList = false;
	m_bInId = false;
	m_bFirstTagAfterDocset = false;
	m_iCurField = -1;
	m_iCurAttr = -1;
	m_iElementDepth = 0;

	m_dParsedDocuments.Reset ();
	m_dDefaultAttrs.Reset ();
	m_dInvalid.Reset ();
	m_dWarned.Reset ();

	m_dParsedDocuments.Reserve ( 1024 );
	m_dParsedDocuments.Resize ( 0 );

	m_iKillListIterator = 0;

	m_iMVA = 0;
	m_iMVAIterator = 0;

	m_sError = "";

#if USE_LIBEXPAT
	// top the buffer up after the preloaded bytes and parse the first chunk
	int iBytesRead = m_iInitialBufSize;
	iBytesRead += fread ( m_pBuffer + m_iInitialBufSize, 1, m_iBufferSize - m_iInitialBufSize, m_pPipe );

	if ( !ParseNextChunk ( iBytesRead, sError ) )
		return false;
#endif

#if USE_LIBXML
	// -1 is the error code of the libxml flavour of ParseNextChunk
	if ( ParseNextChunk ( sError )==-1 )
		return false;
#endif

	// map every attribute index to its field-MVA slot, or -1 for anything else;
	// done after the first parse because StartElement may replace the schema with an embedded one
	m_dAttrToMVA.Resize ( 0 );

	int iFieldMVA = 0;
	for ( int i = 0; i < m_tSchema.GetAttrsCount (); i++ )
	{
		const CSphColumnInfo & tCol = m_tSchema.GetAttr ( i );
		if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && tCol.m_eSrc==SPH_ATTRSRC_FIELD )
			m_dAttrToMVA.Add ( iFieldMVA++ );
		else
			m_dAttrToMVA.Add ( -1 );
	}

	m_dFieldMVAs.Resize ( iFieldMVA );
	ARRAY_FOREACH ( i, m_dFieldMVAs )
		m_dFieldMVAs[i].Reserve ( 16 );

	m_tHits.m_dData.Reserve ( m_iMaxHits );

	return true;
}
23247 
23248 
23249 #if USE_LIBXML
ParseNextChunk(CSphString & sError)23250 int CSphSource_XMLPipe2::ParseNextChunk ( CSphString & sError )
23251 {
23252 	int iRet = xmlTextReaderRead ( m_pParser );
23253 	while ( iRet==1 )
23254 	{
23255 		ProcessNode ( m_pParser );
23256 		if ( !m_sError.IsEmpty () )
23257 		{
23258 			sError = m_sError;
23259 			m_tDocInfo.m_iDocID = 1;
23260 			return false;
23261 		}
23262 
23263 		if ( m_bPassedBufferEnd )
23264 			break;
23265 
23266 		iRet = xmlTextReaderRead ( m_pParser );
23267 	}
23268 
23269 	m_bPassedBufferEnd = false;
23270 
23271 	if ( !m_sError.IsEmpty () || iRet==-1 )
23272 	{
23273 		sError = m_sError;
23274 		m_tDocInfo.m_iDocID = 1;
23275 		return -1;
23276 	}
23277 
23278 	return iRet;
23279 }
23280 #endif
23281 
23282 
23283 #if USE_LIBEXPAT
/// Feed the first iBufferLen bytes of m_pBuffer to expat.
/// When m_bFixupUTF8 is set, the chunk is first sanitized in place: control codes below 0x20
/// (except CR and LF), invalid lead bytes, sequences longer than 3 bytes, broken or overlong
/// sequences, surrogates and the 0xfff0..0xffff range are overwritten with spaces. A multi-byte
/// sequence cut off by the end of the chunk is excluded from this parse and described by
/// m_iReparseStart/m_iReparseLen.
/// Returns true on success (and for an empty chunk). On failure returns false, fills sError,
/// and sets m_tDocInfo.m_iDocID to 1.
bool CSphSource_XMLPipe2::ParseNextChunk ( int iBufferLen, CSphString & sError )
{
	if ( !iBufferLen )
		return true;

	// a chunk that does not fill the whole buffer is passed to expat as the final one
	bool bLast = ( iBufferLen!=m_iBufferSize );

	m_iReparseLen = 0;
	if ( m_bFixupUTF8 )
	{
		BYTE * p = m_pBuffer;
		BYTE * pMax = m_pBuffer + iBufferLen;

		while ( p<pMax )
		{
			BYTE v = *p;

			// fix control codes
			if ( v<0x20 && v!=0x0D && v!=0x0A )
			{
				*p++ = ' ';
				continue;
			}

			// accept ascii7 codes
			if ( v<128 )
			{
				p++;
				continue;
			}

			// remove invalid start bytes
			if ( v<0xC2 )
			{
				*p++ = ' ';
				continue;
			}

			// get and check byte count
			// (counts the leading 1-bits of the lead byte; v ends up holding the payload bits shifted left by iBytes)
			int iBytes = 0;
			while ( v & 0x80 )
			{
				iBytes++;
				v <<= 1;
			}
			if ( iBytes<2 || iBytes>3 )
			{
				*p++ = ' ';
				continue;
			}

			// if we're on a boundary, save these few bytes for the future
			if ( p+iBytes>pMax )
			{
				m_iReparseStart = (int)(p-m_pBuffer);
				m_iReparseLen = (int)(pMax-p);
				iBufferLen -= m_iReparseLen;
				break;
			}

			// otherwise (not a boundary), check them all
			// (iVal accumulates the decoded code point; i stops early at the first non-continuation byte)
			int i = 1;
			int iVal = ( v >> iBytes );
			for ( ; i<iBytes; i++ )
			{
				if ( ( p[i] & 0xC0 )!=0x80 )
					break;
				iVal = ( iVal<<6 ) + ( p[i] & 0x3f );
			}

			if ( i!=iBytes // remove invalid sequences
				|| ( iVal>=0xd800 && iVal<=0xdfff ) // and utf-16 surrogate pairs
				|| ( iBytes==3 && iVal<0x800 ) // and overlong 3-byte codes
				|| ( iVal>=0xfff0 && iVal<=0xffff ) ) // and kinda-valid specials expat chokes on anyway
			{
				// blank only the bytes that were actually examined
				iBytes = i;
				for ( i=0; i<iBytes; i++ )
					p[i] = ' ';
			}

			// only move forward by the amount of successfully processed bytes!
			p += i;
		}
	}

	if ( XML_Parse ( m_pParser, (const char*) m_pBuffer, iBufferLen, bLast )!=XML_STATUS_OK )
	{
		// report the docid of the last completely parsed document, 0 if there is none
		SphDocID_t uFailedID = 0;
		if ( m_dParsedDocuments.GetLength() )
			uFailedID = m_dParsedDocuments.Last()->m_iDocID;

		sError.SetSprintf ( "source '%s': XML parse error: %s (line=%d, pos=%d, docid=" DOCID_FMT ")",
			m_tSchema.m_sName.cstr(), XML_ErrorString ( XML_GetErrorCode ( m_pParser ) ),
			(int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
			uFailedID );
		m_tDocInfo.m_iDocID = 1;
		return false;
	}

	// expat accepted the chunk, but one of the element handlers may have stored an error via Error()
	if ( !m_sError.IsEmpty () )
	{
		sError = m_sError;
		m_tDocInfo.m_iDocID = 1;
		return false;
	}

	return true;
}
23392 #endif
23393 
23394 
/// Produce the next parsed document.
/// On success, fills m_tDocInfo (docid and attributes), keeps string-typed attribute values
/// in m_dStrAttrs, and returns a pointer to an array of per-field text pointers. Those pointers
/// refer to the queued document, which is released at the start of the following call.
/// Returns NULL otherwise; m_tDocInfo.m_iDocID is set to 0 when the read loop reported nothing
/// more to read, or when the schema has no fields.
BYTE **	CSphSource_XMLPipe2::NextDocument ( CSphString & sError )
{
	// release the document handed out by the previous call
	if ( m_bRemoveParsed )
	{
		SafeDelete ( m_dParsedDocuments[0] );
		m_dParsedDocuments.RemoveFast ( 0 );
		m_bRemoveParsed = false;
	}

	int iReadResult = 0;

#if USE_LIBEXPAT
	// keep reading and parsing until at least one complete document is queued, or input runs out
	while ( m_dParsedDocuments.GetLength()==0 )
	{
		// saved bytes to the front!
		if ( m_iReparseLen )
			memmove ( m_pBuffer, m_pBuffer+m_iReparseStart, m_iReparseLen );

		// read more data
		iReadResult = fread ( m_pBuffer+m_iReparseLen, 1, m_iBufferSize-m_iReparseLen, m_pPipe );
		if ( iReadResult==0 )
			break;

		// and parse it
		if ( !ParseNextChunk ( iReadResult+m_iReparseLen, sError ) )
			return NULL;
	}
#endif

#if USE_LIBXML
	// empty loop body: keep parsing while nothing is queued and the reader keeps returning 1
	while ( m_dParsedDocuments.GetLength()==0 && ( iReadResult = ParseNextChunk ( sError ) )==1 );
#endif

	while ( m_dParsedDocuments.GetLength()!=0 )
	{
		Document_t * pDocument = m_dParsedDocuments[0];
		int nAttrs = m_tSchema.GetAttrsCount ();

		// docid
		// (documents for which VerifyID() returns 0 are dropped and the next queued one is tried)
		m_tDocInfo.m_iDocID = VerifyID ( pDocument->m_iDocID );
		if ( m_tDocInfo.m_iDocID==0 )
		{
			SafeDelete ( m_dParsedDocuments[0] );
			m_dParsedDocuments.RemoveFast ( 0 );
			continue;
		}

		// attributes
		for ( int i = 0; i < nAttrs; i++ )
		{
			// an empty value falls back to the schema default, when defaults were collected
			const CSphString & sAttrValue = pDocument->m_dAttrs[i].IsEmpty () && m_dDefaultAttrs.GetLength () ? m_dDefaultAttrs[i] : pDocument->m_dAttrs[i];
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( i );

			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
			{
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, ParseFieldMVA ( m_dMva, sAttrValue.cstr (), tAttr.m_eAttrType==SPH_ATTR_INT64SET ) );
				continue;
			}

			switch ( tAttr.m_eAttrType )
			{
				// string-like values are kept aside in m_dStrAttrs; the row itself gets 0
				case SPH_ATTR_ORDINAL:
				case SPH_ATTR_STRING:
				case SPH_ATTR_WORDCOUNT:
					m_dStrAttrs[i] = sAttrValue.cstr ();
					if ( !m_dStrAttrs[i].cstr() )
						m_dStrAttrs[i] = "";

					m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
					break;

				case SPH_ATTR_FLOAT:
					m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( sAttrValue.cstr () ) );
					break;

				case SPH_ATTR_BIGINT:
					m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( sAttrValue.cstr () ) );
					break;

				default:
					m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( sAttrValue.cstr () ) );
					break;
			}
		}

		// the document stays queued while the caller uses it; it is freed on the next call
		m_bRemoveParsed = true;

		int nFields = m_tSchema.m_dFields.GetLength ();
		if ( !nFields )
		{
			m_tDocInfo.m_iDocID = 0;
			return NULL;
		}

		m_dFieldPtrs.Resize ( nFields );
		for ( int i = 0; i < nFields; ++i )
			m_dFieldPtrs[i] = (BYTE*)( pDocument->m_dFields [i].cstr() );

		return (BYTE **)&( m_dFieldPtrs[0] );
	}

	// no document could be produced; a zero read result is reported through docid 0
	if ( !iReadResult )
		m_tDocInfo.m_iDocID = 0;

	return NULL;
}
23501 
23502 
IterateKillListStart(CSphString &)23503 bool CSphSource_XMLPipe2::IterateKillListStart ( CSphString & )
23504 {
23505 	m_iKillListIterator = 0;
23506 	return true;
23507 }
23508 
23509 
IterateKillListNext(SphDocID_t & tDocId)23510 bool CSphSource_XMLPipe2::IterateKillListNext ( SphDocID_t & tDocId )
23511 {
23512 	if ( m_iKillListIterator>=m_dKillList.GetLength () )
23513 		return false;
23514 
23515 	tDocId = m_dKillList [ m_iKillListIterator++ ];
23516 	return true;
23517 }
23518 
/// xmlpipe2 structural elements recognized by LookupElement()
enum EXMLElem
{
	ELEM_DOCSET,
	ELEM_SCHEMA,
	ELEM_FIELD,
	ELEM_ATTR,
	ELEM_DOCUMENT,
	ELEM_KLIST,
	ELEM_NONE
};

/// Map an element name to its EXMLElem value; ELEM_NONE for anything that is not one of the
/// six "sphinx:*" structural tags (including NULL and empty names).
/// A tiny hash over the name length and its 8th character selects the single candidate,
/// which is then confirmed with one strcmp().
static EXMLElem LookupElement ( const char * szName )
{
	if ( !szName || szName[0]!='s' )
		return ELEM_NONE;

	// all known names are 11 to 15 characters long, so szName[7] is safe to read past this check
	int iLen = (int) strlen ( szName );
	if ( iLen<11 || iLen>15 )
		return ELEM_NONE;

	const char * szCandidate = NULL;
	EXMLElem eCandidate = ELEM_NONE;

	// every case selects exactly one candidate; no fall-through between cases
	switch ( ( iLen + szName[7] ) & 15 )
	{
	case 1:		szCandidate = "sphinx:docset";		eCandidate = ELEM_DOCSET;	break;
	case 0:		szCandidate = "sphinx:schema";		eCandidate = ELEM_SCHEMA;	break;
	case 2:		szCandidate = "sphinx:field";		eCandidate = ELEM_FIELD;	break;
	case 12:	szCandidate = "sphinx:attr";		eCandidate = ELEM_ATTR;		break;
	case 3:		szCandidate = "sphinx:document";	eCandidate = ELEM_DOCUMENT;	break;
	case 10:	szCandidate = "sphinx:killlist";	eCandidate = ELEM_KLIST;	break;
	default:	return ELEM_NONE;
	}

	return strcmp ( szName, szCandidate )==0 ? eCandidate : ELEM_NONE;
}
23552 
/// Opening-tag handler.
/// szName is the element name; pAttrs is a NULL-terminated array of alternating
/// attribute name / value strings. Structural "sphinx:*" tags update the parser state and,
/// inside <sphinx:schema>, extend the schema. Any other tag is matched against the schema's
/// fields and attributes while inside a document, or must be <id> while inside a kill-list.
/// Structural violations are reported through Error() / DocumentError().
void CSphSource_XMLPipe2::StartElement ( const char * szName, const char ** pAttrs )
{
	EXMLElem ePos = LookupElement ( szName );

	switch ( ePos )
	{
	case ELEM_DOCSET:
		m_bInDocset = true;
		m_bFirstTagAfterDocset = true;
		return;

	case ELEM_SCHEMA:
	{
		if ( !m_bInDocset || !m_bFirstTagAfterDocset )
		{
			Error ( "<sphinx:schema> is allowed immediately after <sphinx:docset> only" );
			return;
		}

		// an embedded schema replaces whatever was configured; the match row is reset along with it
		if ( m_tSchema.m_dFields.GetLength () > 0 || m_tSchema.GetAttrsCount () > 0 )
		{
			sphWarn ( "%s", DecorateMessage ( "both embedded and configured schemas found; using embedded" ) );
			m_tSchema.Reset ();
			CSphMatch tDocInfo;
			Swap ( m_tDocInfo, tDocInfo );
		}

		m_bFirstTagAfterDocset = false;
		m_bInSchema = true;
	}
	return;

	case ELEM_FIELD:
	{
		if ( !m_bInDocset || !m_bInSchema )
		{
			Error ( "<sphinx:field> is allowed inside <sphinx:schema> only" );
			return;
		}

		const char ** dAttrs = pAttrs;
		CSphColumnInfo Info;
		CSphString sDefault;
		bool bIsAttr = false;

		// walk name/value pairs; the walk stops at the first pair with an empty name or empty value
		while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] )
		{
			if ( !strcmp ( *dAttrs, "name" ) )
			{
				AddFieldToSchema ( dAttrs[1] );
				Info.m_sName = dAttrs[1];
			} else if ( !strcmp ( *dAttrs, "attr" ) )
			{
				bIsAttr = true;
				if ( !strcmp ( dAttrs[1], "string" ) )
					Info.m_eAttrType = SPH_ATTR_STRING;
				else if ( !strcmp ( dAttrs[1], "wordcount" ) )
					Info.m_eAttrType = SPH_ATTR_WORDCOUNT;

			} else if ( !strcmp ( *dAttrs, "default" ) )
				sDefault = dAttrs[1];

			dAttrs += 2;
		}

		// a field carrying attr="..." is additionally registered as an attribute
		if ( bIsAttr )
		{
			Info.m_iIndex = m_tSchema.GetAttrsCount ();
			m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
			m_dDefaultAttrs.Add ( sDefault );
		}
	}
	return;

	case ELEM_ATTR:
	{
		if ( !m_bInDocset || !m_bInSchema )
		{
			Error ( "<sphinx:attr> is allowed inside <sphinx:schema> only" );
			return;
		}

		bool bError = false;
		CSphString sDefault;

		// the type is integer unless a type="..." attribute says otherwise
		CSphColumnInfo Info;
		Info.m_eAttrType = SPH_ATTR_INTEGER;

		const char ** dAttrs = pAttrs;

		while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] && !bError )
		{
			if ( !strcmp ( *dAttrs, "name" ) )
				Info.m_sName = dAttrs[1];
			else if ( !strcmp ( *dAttrs, "bits" ) )
				Info.m_tLocator.m_iBitCount = strtol ( dAttrs[1], NULL, 10 );
			else if ( !strcmp ( *dAttrs, "default" ) )
				sDefault = dAttrs[1];
			else if ( !strcmp ( *dAttrs, "type" ) )
			{
				const char * szType = dAttrs[1];
				if ( !strcmp ( szType, "int" ) )				Info.m_eAttrType = SPH_ATTR_INTEGER;
				else if ( !strcmp ( szType, "timestamp" ) )		Info.m_eAttrType = SPH_ATTR_TIMESTAMP;
				else if ( !strcmp ( szType, "str2ordinal" ) )	Info.m_eAttrType = SPH_ATTR_ORDINAL;
				else if ( !strcmp ( szType, "bool" ) )			Info.m_eAttrType = SPH_ATTR_BOOL;
				else if ( !strcmp ( szType, "float" ) )			Info.m_eAttrType = SPH_ATTR_FLOAT;
				else if ( !strcmp ( szType, "bigint" ) )		Info.m_eAttrType = SPH_ATTR_BIGINT;
				else if ( !strcmp ( szType, "string" ) )		Info.m_eAttrType = SPH_ATTR_STRING;
				else if ( !strcmp ( szType, "wordcount" ) )		Info.m_eAttrType = SPH_ATTR_WORDCOUNT;
				else if ( !strcmp ( szType, "multi" ) )
				{
					Info.m_eAttrType = SPH_ATTR_UINT32SET;
					Info.m_eSrc = SPH_ATTRSRC_FIELD;
				} else if ( !strcmp ( szType, "multi_64" ) )
				{
					Info.m_eAttrType = SPH_ATTR_INT64SET;
					Info.m_eSrc = SPH_ATTRSRC_FIELD;
				} else
				{
					Error ( "unknown column type '%s'", szType );
					bError = true;
				}
			}

			dAttrs += 2;
		}

		// m_dDefaultAttrs grows in step with the schema attributes
		if ( !bError )
		{
			Info.m_iIndex = m_tSchema.GetAttrsCount ();
			m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
			m_dDefaultAttrs.Add ( sDefault );
		}
	}
	return;

	case ELEM_DOCUMENT:
	{
		// a document may not start outside the docset, or inside a schema, kill-list or another document
		if ( !m_bInDocset || m_bInSchema )
			return DocumentError ( "<sphinx:schema>" );

		if ( m_bInKillList )
			return DocumentError ( "<sphinx:killlist>" );

		if ( m_bInDocument )
			return DocumentError ( "<sphinx:document>" );

		if ( m_tSchema.m_dFields.GetLength()==0 && m_tSchema.GetAttrsCount()==0 )
		{
			Error ( "no schema configured, and no embedded schema found" );
			return;
		}

		m_bInDocument = true;

		// start a new document with one slot per schema field and attribute;
		// it is handed over to m_dParsedDocuments by EndElement()
		assert ( !m_pCurDocument );
		m_pCurDocument = new Document_t;

		m_pCurDocument->m_iDocID = 0;
		m_pCurDocument->m_dFields.Resize ( m_tSchema.m_dFields.GetLength () );
		m_pCurDocument->m_dAttrs.Resize ( m_tSchema.GetAttrsCount () );

		// only the first XML attribute is checked for "id"
		if ( pAttrs[0] && pAttrs[1] && pAttrs[0][0] && pAttrs[1][0] )
			if ( !strcmp ( pAttrs[0], "id" ) )
				m_pCurDocument->m_iDocID = sphToDocid ( pAttrs[1] );

		if ( m_pCurDocument->m_iDocID==0 )
			Error ( "attribute 'id' required in <sphinx:document>" );
	}
	return;

	case ELEM_KLIST:
	{
		if ( !m_bInDocset || m_bInDocument || m_bInSchema )
		{
			Error ( "<sphinx:killlist> is not allowed inside <sphinx:schema> or <sphinx:document>" );
			return;
		}

		m_bInKillList = true;
	}
	return;

	case ELEM_NONE: break; // avoid warning
	}

	// not a structural tag from here on
	if ( m_bInKillList )
	{
		// tags nested inside <id> are only counted, so that EndElement() can unwind them
		if ( m_bInId )
		{
			m_iElementDepth++;
			return;
		}

		if ( strcmp ( szName, "id" ) )
		{
			Error ( "only 'id' is allowed inside <sphinx:killlist>" );
			return;
		}

		m_bInId = true;

	} else if ( m_bInDocument )
	{
		// tags nested inside a field/attribute value are only counted
		if ( m_iCurField!=-1 || m_iCurAttr!=-1 )
		{
			m_iElementDepth++;
			return;
		}

		// the same name may match both a field and an attribute; both indexes are recorded
		for ( int i = 0; i < m_tSchema.m_dFields.GetLength () && m_iCurField==-1; i++ )
			if ( m_tSchema.m_dFields[i].m_sName==szName )
				m_iCurField = i;

		for ( int i = 0; i < m_tSchema.GetAttrsCount () && m_iCurAttr==-1; i++ )
			if ( m_tSchema.GetAttr(i).m_sName==szName )
				m_iCurAttr = i;

		if ( m_iCurAttr!=-1 || m_iCurField!=-1 )
			return;

		// unknown tag: its character data is ignored, and a warning is issued once per name
		m_bInIgnoredTag = true;

		bool bInvalidFound = false;
		for ( int i = 0; i < m_dInvalid.GetLength () && !bInvalidFound; i++ )
			bInvalidFound = m_dInvalid[i]==szName;

		if ( !bInvalidFound )
		{
			sphWarn ( "%s", DecorateMessage ( "unknown field/attribute '%s'; ignored", szName ) );
			m_dInvalid.Add ( szName );
		}
	}
}
23787 
23788 
EndElement(const char * szName)23789 void CSphSource_XMLPipe2::EndElement ( const char * szName )
23790 {
23791 	m_bInIgnoredTag = false;
23792 
23793 	EXMLElem ePos = LookupElement ( szName );
23794 
23795 	switch ( ePos )
23796 	{
23797 	case ELEM_DOCSET:
23798 		m_bInDocset = false;
23799 		return;
23800 
23801 	case ELEM_SCHEMA:
23802 		m_bInSchema = false;
23803 		m_tDocInfo.Reset ( m_tSchema.GetRowSize () );
23804 		m_dStrAttrs.Resize ( m_tSchema.GetAttrsCount() );
23805 		return;
23806 
23807 	case ELEM_DOCUMENT:
23808 		m_bInDocument = false;
23809 		if ( m_pCurDocument )
23810 			m_dParsedDocuments.Add ( m_pCurDocument );
23811 		m_pCurDocument = NULL;
23812 		return;
23813 
23814 	case ELEM_KLIST:
23815 		m_bInKillList = false;
23816 		return;
23817 
23818 	case ELEM_FIELD: // avoid warnings
23819 	case ELEM_ATTR:
23820 	case ELEM_NONE: break;
23821 	}
23822 
23823 	if ( m_bInKillList )
23824 	{
23825 		if ( m_iElementDepth!=0 )
23826 		{
23827 			m_iElementDepth--;
23828 			return;
23829 		}
23830 
23831 		if ( m_bInId )
23832 		{
23833 			m_pFieldBuffer [ Min ( m_iFieldBufferLen, m_iFieldBufferMax-1 ) ] = '\0';
23834 			m_dKillList.Add ( sphToDocid ( (const char *)m_pFieldBuffer ) );
23835 			m_iFieldBufferLen = 0;
23836 			m_bInId = false;
23837 		}
23838 
23839 	} else if ( m_bInDocument && ( m_iCurAttr!=-1 || m_iCurField!=-1 ) )
23840 	{
23841 		if ( m_iElementDepth!=0 )
23842 		{
23843 			m_iElementDepth--;
23844 			return;
23845 		}
23846 
23847 		if ( m_iCurField!=-1 )
23848 		{
23849 			assert ( m_pCurDocument );
23850 			if ( !m_pCurDocument->m_dFields [ m_iCurField ].IsEmpty () )
23851 				sphWarn ( "duplicate text node <%s> - using first value", m_tSchema.m_dFields [ m_iCurField ].m_sName.cstr() );
23852 			else
23853 				m_pCurDocument->m_dFields [ m_iCurField ].SetBinary ( (char*)m_pFieldBuffer, m_iFieldBufferLen );
23854 		}
23855 		if ( m_iCurAttr!=-1 )
23856 		{
23857 			assert ( m_pCurDocument );
23858 			if ( !m_pCurDocument->m_dAttrs [ m_iCurAttr ].IsEmpty () )
23859 				sphWarn ( "duplicate attribute node <%s> - using first value", m_tSchema.GetAttr ( m_iCurAttr ).m_sName.cstr() );
23860 			else
23861 				m_pCurDocument->m_dAttrs [ m_iCurAttr ].SetBinary ( (char*)m_pFieldBuffer, m_iFieldBufferLen );
23862 		}
23863 
23864 		m_iFieldBufferLen = 0;
23865 
23866 		m_iCurAttr = -1;
23867 		m_iCurField = -1;
23868 	}
23869 }
23870 
23871 
UnexpectedCharaters(const char * pCharacters,int iLen,const char * szComment)23872 void CSphSource_XMLPipe2::UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment )
23873 {
23874 	const int MAX_WARNING_LENGTH = 64;
23875 
23876 	bool bSpaces = true;
23877 	for ( int i = 0; i < iLen && bSpaces; i++ )
23878 		if ( !sphIsSpace ( pCharacters[i] ) )
23879 			bSpaces = false;
23880 
23881 	if ( !bSpaces )
23882 	{
23883 		CSphString sWarning;
23884 #if USE_LIBEXPAT
23885 		sWarning.SetBinary ( pCharacters, Min ( iLen, MAX_WARNING_LENGTH ) );
23886 		sphWarn ( "source '%s': unexpected string '%s' (line=%d, pos=%d) %s",
23887 			m_tSchema.m_sName.cstr(), sWarning.cstr (),
23888 			(int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ), szComment );
23889 #endif
23890 
23891 #if USE_LIBXML
23892 		int i = 0;
23893 		for ( i=0; i<iLen && sphIsSpace ( pCharacters[i] ); i++ );
23894 		sWarning.SetBinary ( pCharacters + i, Min ( iLen - i, MAX_WARNING_LENGTH ) );
23895 		for ( i=iLen-i-1; i>=0 && sphIsSpace ( sWarning.cstr()[i] ); i-- );
23896 		if ( i>=0 )
23897 			( (char *)sWarning.cstr() )[i+1] = '\0';
23898 
23899 		sphWarn ( "source '%s': unexpected string '%s' %s", m_tSchema.m_sName.cstr(), sWarning.cstr(), szComment );
23900 #endif
23901 	}
23902 }
23903 
23904 
Characters(const char * pCharacters,int iLen)23905 void CSphSource_XMLPipe2::Characters ( const char * pCharacters, int iLen )
23906 {
23907 	if ( m_bInIgnoredTag )
23908 		return;
23909 
23910 	if ( !m_bInDocset )
23911 	{
23912 		UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:docset>" );
23913 		return;
23914 	}
23915 
23916 	if ( !m_bInSchema && !m_bInDocument && !m_bInKillList )
23917 	{
23918 		UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:schema> and <sphinx:document>" );
23919 		return;
23920 	}
23921 
23922 	if ( m_iCurAttr==-1 && m_iCurField==-1 && !m_bInKillList )
23923 	{
23924 		UnexpectedCharaters ( pCharacters, iLen, m_bInDocument ? "inside <sphinx:document>" : ( m_bInSchema ? "inside <sphinx:schema>" : "" ) );
23925 		return;
23926 	}
23927 
23928 	if ( iLen + m_iFieldBufferLen < m_iFieldBufferMax )
23929 	{
23930 		memcpy ( m_pFieldBuffer + m_iFieldBufferLen, pCharacters, iLen );
23931 		m_iFieldBufferLen += iLen;
23932 
23933 	} else
23934 	{
23935 		const CSphString & sName = ( m_iCurField!=-1 ) ? m_tSchema.m_dFields[m_iCurField].m_sName : m_tSchema.GetAttr ( m_iCurAttr ).m_sName;
23936 
23937 		bool bWarned = false;
23938 		for ( int i = 0; i < m_dWarned.GetLength () && !bWarned; i++ )
23939 			bWarned = m_dWarned[i]==sName;
23940 
23941 		if ( !bWarned )
23942 		{
23943 #if USE_LIBEXPAT
23944 			sphWarn ( "source '%s': field/attribute '%s' length exceeds max length (line=%d, pos=%d, docid=" DOCID_FMT ")",
23945 				m_tSchema.m_sName.cstr(), sName.cstr(),
23946 				(int)XML_GetCurrentLineNumber ( m_pParser ), (int)XML_GetCurrentColumnNumber ( m_pParser ),
23947 				m_pCurDocument->m_iDocID );
23948 #endif
23949 
23950 #if USE_LIBXML
23951 			sphWarn ( "source '%s': field/attribute '%s' length exceeds max length (docid=" DOCID_FMT ")",
23952 				m_tSchema.m_sName.cstr(), sName.cstr(), m_pCurDocument->m_iDocID );
23953 #endif
23954 			m_dWarned.Add ( sName );
23955 		}
23956 	}
23957 }
23958 
23959 
23960 #if USE_LIBXML
ReadBuffer(BYTE * pBuffer,int iLen)23961 int CSphSource_XMLPipe2::ReadBuffer ( BYTE * pBuffer, int iLen )
23962 {
23963 	int iLeft = Max ( m_pBufferEnd - m_pBufferPtr, 0 );
23964 	if ( iLeft < iLen )
23965 	{
23966 		memmove ( m_pBuffer, m_pBufferPtr, iLeft );
23967 		size_t iRead = fread ( m_pBuffer + iLeft, 1, m_iBufferSize - iLeft, m_pPipe );
23968 
23969 		m_bPassedBufferEnd = ( ( m_iBufferSize - iLeft )==int(iRead) );
23970 
23971 		m_pBufferPtr = m_pBuffer;
23972 		m_pBufferEnd = m_pBuffer + iLeft + iRead;
23973 
23974 		iLeft = Max ( m_pBufferEnd - m_pBuffer, 0 );
23975 	}
23976 
23977 	int iToCopy = Min ( iLen, iLeft );
23978 	memcpy ( pBuffer, m_pBufferPtr, iToCopy );
23979 	m_pBufferPtr += iToCopy;
23980 
23981 	return iToCopy;
23982 }
23983 
23984 
ProcessNode(xmlTextReaderPtr pReader)23985 void CSphSource_XMLPipe2::ProcessNode ( xmlTextReaderPtr pReader )
23986 {
23987 	int iType = xmlTextReaderNodeType ( pReader );
23988 	switch ( iType )
23989 	{
23990 	case XML_READER_TYPE_ELEMENT:
23991 		{
23992 			const char * szName = (char*)xmlTextReaderName ( pReader );
23993 
23994 			m_dAttrs.Resize ( 0 );
23995 
23996 			if ( xmlTextReaderHasAttributes ( pReader ) )
23997 			{
23998 				if ( xmlTextReaderMoveToFirstAttribute ( pReader )!=1 )
23999 					return;
24000 
24001 				do
24002 				{
24003 					int iLen = m_dAttrs.GetLength ();
24004 					m_dAttrs.Resize ( iLen + 2 );
24005 					m_dAttrs[iLen] = (char*)xmlTextReaderName ( pReader );
24006 					m_dAttrs[iLen+1] = (char*)xmlTextReaderValue ( pReader );
24007 				}
24008 				while ( xmlTextReaderMoveToNextAttribute ( pReader )==1 );
24009 			}
24010 
24011 			int iLen = m_dAttrs.GetLength ();
24012 			m_dAttrs.Resize ( iLen + 2 );
24013 			m_dAttrs[iLen] = NULL;
24014 			m_dAttrs[iLen+1] = NULL;
24015 
24016 			StartElement ( szName, &m_dAttrs[0] );
24017 		}
24018 		break;
24019 	case XML_READER_TYPE_END_ELEMENT:
24020 		EndElement ( (char*)xmlTextReaderName ( pReader ) );
24021 		break;
24022 	case XML_TEXT_NODE:
24023 		{
24024 			const char * szText = (char*)xmlTextReaderValue	( pReader );
24025 			Characters ( szText, strlen ( szText ) );
24026 		}
24027 		break;
24028 	}
24029 }
24030 #endif
24031 
sphCreateSourceXmlpipe2(const CSphConfigSection * pSource,FILE * pPipe,BYTE * dInitialBuf,int iBufLen,const char * szSourceName,int iMaxFieldLen)24032 CSphSource * sphCreateSourceXmlpipe2 ( const CSphConfigSection * pSource, FILE * pPipe, BYTE * dInitialBuf, int iBufLen, const char * szSourceName, int iMaxFieldLen )
24033 {
24034 	CSphSource_XMLPipe2 * pXMLPipe = new CSphSource_XMLPipe2 ( dInitialBuf, iBufLen, szSourceName, iMaxFieldLen, pSource->GetInt ( "xmlpipe_fixup_utf8", 0 )!=0 );
24035 	if ( !pXMLPipe->Setup ( pPipe, *pSource ) )
24036 		SafeDelete ( pXMLPipe );
24037 
24038 	return pXMLPipe;
24039 }
24040 
24041 #endif
24042 
24043 
sphDetectXMLPipe(const char * szCommand,BYTE * dBuf,int & iBufSize,int iMaxBufSize,bool & bUsePipe2)24044 FILE * sphDetectXMLPipe ( const char * szCommand, BYTE * dBuf, int & iBufSize, int iMaxBufSize, bool & bUsePipe2 )
24045 {
24046 	bUsePipe2 = true; // default is xmlpipe2
24047 
24048 	FILE * pPipe = popen ( szCommand, "r" );
24049 	if ( !pPipe )
24050 		return NULL;
24051 
24052 	BYTE * pStart = dBuf;
24053 	iBufSize = (int)fread ( dBuf, 1, iMaxBufSize, pPipe );
24054 	BYTE * pEnd = pStart + iBufSize;
24055 
24056 	// BOM
24057 	if ( iBufSize>=3 )
24058 		if ( !strncmp ( (char*)pStart, "\xEF\xBB\xBF", 3 ) )
24059 			pStart += 3;
24060 
24061 	while ( isspace ( *pStart ) && pStart < pEnd )
24062 		pStart++;
24063 
24064 	if ( ( pEnd - pStart)>=5 )
24065 		bUsePipe2 = !strncasecmp ( (char *)pStart, "<?xml", 5 );
24066 
24067 	return pPipe;
24068 }
24069 
24070 
24071 #if USE_ODBC
24072 
CSphSourceParams_ODBC()24073 CSphSourceParams_ODBC::CSphSourceParams_ODBC ()
24074 	: m_bWinAuth	( false )
24075 	, m_bUnicode	( false )
24076 {
24077 }
24078 
24079 
CSphSource_ODBC(const char * sName)24080 CSphSource_ODBC::CSphSource_ODBC ( const char * sName )
24081 	: CSphSource_SQL	( sName )
24082 	, m_bWinAuth		( false )
24083 	, m_bUnicode		( false )
24084 	, m_hEnv			( NULL )
24085 	, m_hDBC			( NULL )
24086 	, m_hStmt			( NULL )
24087 	, m_nResultCols		( 0 )
24088 {
24089 }
24090 
24091 
SqlDismissResult()24092 void CSphSource_ODBC::SqlDismissResult ()
24093 {
24094 	if ( m_hStmt )
24095 	{
24096 		SQLCloseCursor ( m_hStmt );
24097 		SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );
24098 		m_hStmt = NULL;
24099 	}
24100 }
24101 
24102 
24103 #define MS_SQL_BUFFER_GAP 16
24104 
24105 
SqlQuery(const char * sQuery)24106 bool CSphSource_ODBC::SqlQuery ( const char * sQuery )
24107 {
24108 	if ( SQLAllocHandle ( SQL_HANDLE_STMT, m_hDBC, &m_hStmt )==SQL_ERROR )
24109 	{
24110 		if ( m_tParams.m_bPrintQueries )
24111 			fprintf ( stdout, "SQL-QUERY: %s: FAIL (SQLAllocHandle failed)\n", sQuery );
24112 		return false;
24113 	}
24114 
24115 	if ( SQLExecDirect ( m_hStmt, (SQLCHAR *)sQuery, SQL_NTS )==SQL_ERROR )
24116 	{
24117 		GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
24118 		if ( m_tParams.m_bPrintQueries )
24119 			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
24120 		return false;
24121 	}
24122 	if ( m_tParams.m_bPrintQueries )
24123 		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
24124 
24125 	SQLSMALLINT nCols = 0;
24126 	m_nResultCols = 0;
24127 	if ( SQLNumResultCols ( m_hStmt, &nCols )==SQL_ERROR )
24128 		return false;
24129 
24130 	m_nResultCols = nCols;
24131 
24132 	const int MAX_NAME_LEN = 512;
24133 	char szColumnName[MAX_NAME_LEN];
24134 
24135 	m_dColumns.Resize ( m_nResultCols );
24136 	int iTotalBuffer = 0;
24137 	ARRAY_FOREACH ( i, m_dColumns )
24138 	{
24139 		QueryColumn_t & tCol = m_dColumns[i];
24140 
24141 		SQLULEN uColSize = 0;
24142 		SQLSMALLINT iNameLen = 0;
24143 		SQLSMALLINT iDataType = 0;
24144 		if ( SQLDescribeCol ( m_hStmt, (SQLUSMALLINT)(i+1), (SQLCHAR*)szColumnName, MAX_NAME_LEN, &iNameLen, &iDataType, &uColSize, NULL, NULL )==SQL_ERROR )
24145 			return false;
24146 
24147 		tCol.m_sName = szColumnName;
24148 		tCol.m_sName.ToLower();
24149 
24150 		// deduce buffer size
24151 		// use a small buffer by default, and a bigger one for varchars
24152 		int iBuffLen = DEFAULT_COL_SIZE;
24153 		if ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR|| iDataType==SQL_VARCHAR )
24154 			iBuffLen = VARCHAR_COL_SIZE;
24155 
24156 		if ( m_hColBuffers ( tCol.m_sName ) )
24157 			iBuffLen = m_hColBuffers [ tCol.m_sName ]; // got explicit user override
24158 		else if ( uColSize )
24159 			iBuffLen = Min ( uColSize+1, (SQLULEN) MAX_COL_SIZE ); // got data from driver
24160 
24161 		tCol.m_dContents.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
24162 		tCol.m_dRaw.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
24163 		tCol.m_iInd = 0;
24164 		tCol.m_iBufferSize = iBuffLen;
24165 		tCol.m_bUnicode = m_bUnicode && ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR );
24166 		tCol.m_bTruncated = false;
24167 		iTotalBuffer += iBuffLen;
24168 
24169 		if ( SQLBindCol ( m_hStmt, (SQLUSMALLINT)(i+1),
24170 			tCol.m_bUnicode ? SQL_UNICODE : SQL_C_CHAR,
24171 			tCol.m_bUnicode ? tCol.m_dRaw.Begin() : tCol.m_dContents.Begin(),
24172 			iBuffLen, &(tCol.m_iInd) )==SQL_ERROR )
24173 				return false;
24174 	}
24175 
24176 	if ( iTotalBuffer>WARN_ROW_SIZE )
24177 		sphWarn ( "row buffer is over %d bytes; consider revising sql_column_buffers", iTotalBuffer );
24178 
24179 	return true;
24180 }
24181 
24182 
SqlIsError()24183 bool CSphSource_ODBC::SqlIsError ()
24184 {
24185 	return !m_sError.IsEmpty ();
24186 }
24187 
24188 
SqlError()24189 const char * CSphSource_ODBC::SqlError ()
24190 {
24191 	return m_sError.cstr();
24192 }
24193 
24194 
SqlConnect()24195 bool CSphSource_ODBC::SqlConnect ()
24196 {
24197 	if ( SQLAllocHandle ( SQL_HANDLE_ENV, NULL, &m_hEnv )==SQL_ERROR )
24198 	{
24199 		if ( m_tParams.m_bPrintQueries )
24200 			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
24201 		return false;
24202 	}
24203 
24204 	SQLSetEnvAttr ( m_hEnv, SQL_ATTR_ODBC_VERSION, (void*) SQL_OV_ODBC3, SQL_IS_INTEGER );
24205 
24206 	if ( SQLAllocHandle ( SQL_HANDLE_DBC, m_hEnv, &m_hDBC )==SQL_ERROR )
24207 	{
24208 		if ( m_tParams.m_bPrintQueries )
24209 			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
24210 		return false;
24211 	}
24212 
24213 	OdbcPostConnect ();
24214 
24215 	char szOutConn [2048];
24216 	SQLSMALLINT iOutConn = 0;
24217 	if ( SQLDriverConnect ( m_hDBC, NULL, (SQLTCHAR*) m_sOdbcDSN.cstr(), SQL_NTS, (SQLCHAR*)szOutConn, sizeof(szOutConn), &iOutConn, SQL_DRIVER_NOPROMPT )==SQL_ERROR )
24218 	{
24219 		GetSqlError ( SQL_HANDLE_DBC, m_hDBC );
24220 		if ( m_tParams.m_bPrintQueries )
24221 			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
24222 		return false;
24223 	}
24224 
24225 	if ( m_tParams.m_bPrintQueries )
24226 		fprintf ( stdout, "SQL-CONNECT: ok\n" );
24227 	return true;
24228 }
24229 
24230 
SqlDisconnect()24231 void CSphSource_ODBC::SqlDisconnect ()
24232 {
24233 	if ( m_tParams.m_bPrintQueries )
24234 		fprintf ( stdout, "SQL-DISCONNECT\n" );
24235 
24236 	if ( m_hStmt!=NULL )
24237 		SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );
24238 
24239 	if ( m_hDBC )
24240 	{
24241 		SQLDisconnect ( m_hDBC );
24242 		SQLFreeHandle ( SQL_HANDLE_DBC, m_hDBC );
24243 	}
24244 
24245 	if ( m_hEnv )
24246 		SQLFreeHandle ( SQL_HANDLE_ENV, m_hEnv );
24247 }
24248 
24249 
SqlNumFields()24250 int CSphSource_ODBC::SqlNumFields ()
24251 {
24252 	if ( !m_hStmt )
24253 		return -1;
24254 
24255 	return m_nResultCols;
24256 }
24257 
24258 
SqlFetchRow()24259 bool CSphSource_ODBC::SqlFetchRow ()
24260 {
24261 	if ( !m_hStmt )
24262 		return false;
24263 
24264 	SQLRETURN iRet = SQLFetch ( m_hStmt );
24265 	if ( iRet==SQL_ERROR || iRet==SQL_INVALID_HANDLE || iRet==SQL_NO_DATA )
24266 	{
24267 		GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
24268 		return false;
24269 	}
24270 
24271 	ARRAY_FOREACH ( i, m_dColumns )
24272 	{
24273 		QueryColumn_t & tCol = m_dColumns[i];
24274 		switch ( tCol.m_iInd )
24275 		{
24276 			case SQL_NULL_DATA:
24277 				tCol.m_dContents[0] = '\0';
24278 				tCol.m_dContents[0] = '\0';
24279 				break;
24280 
24281 			default:
24282 #if USE_WINDOWS // FIXME! support UCS-2 columns on Unix too
24283 				if ( tCol.m_bUnicode )
24284 				{
24285 					// WideCharToMultiByte should get NULL terminated string
24286 					memset ( tCol.m_dRaw.Begin()+tCol.m_iBufferSize, 0, MS_SQL_BUFFER_GAP );
24287 
24288 					int iConv = WideCharToMultiByte ( CP_UTF8, 0, LPCWSTR ( tCol.m_dRaw.Begin() ), tCol.m_iInd/sizeof(WCHAR),
24289 						LPSTR ( tCol.m_dContents.Begin() ), tCol.m_iBufferSize-1, NULL, NULL );
24290 
24291 					if ( iConv==0 )
24292 						if ( GetLastError()==ERROR_INSUFFICIENT_BUFFER )
24293 							iConv = tCol.m_iBufferSize-1;
24294 
24295 					tCol.m_dContents[iConv] = '\0';
24296 
24297 				} else
24298 #endif
24299 				{
24300 					if ( tCol.m_iInd>=0 && tCol.m_iInd<tCol.m_iBufferSize )
24301 					{
24302 						// data fetched ok; add trailing zero
24303 						tCol.m_dContents[tCol.m_iInd] = '\0';
24304 
24305 					} else if ( tCol.m_iInd>=tCol.m_iBufferSize && !tCol.m_bTruncated )
24306 					{
24307 						// out of buffer; warn about that (once)
24308 						tCol.m_bTruncated = true;
24309 						sphWarn ( "'%s' column truncated (buffer=%d, got=%d); consider revising sql_column_buffers",
24310 							tCol.m_sName.cstr(), tCol.m_iBufferSize-1, (int) tCol.m_iInd );
24311 					}
24312 				}
24313 			break;
24314 		}
24315 	}
24316 
24317 	return iRet!=SQL_NO_DATA;
24318 }
24319 
24320 
SqlColumn(int iIndex)24321 const char * CSphSource_ODBC::SqlColumn ( int iIndex )
24322 {
24323 	if ( !m_hStmt )
24324 		return NULL;
24325 
24326 	return &(m_dColumns [iIndex].m_dContents[0]);
24327 }
24328 
24329 
SqlFieldName(int iIndex)24330 const char * CSphSource_ODBC::SqlFieldName ( int iIndex )
24331 {
24332 	return m_dColumns[iIndex].m_sName.cstr();
24333 }
24334 
24335 
SqlColumnLength(int)24336 DWORD CSphSource_ODBC::SqlColumnLength ( int )
24337 {
24338 	return 0;
24339 }
24340 
24341 
Setup(const CSphSourceParams_ODBC & tParams)24342 bool CSphSource_ODBC::Setup ( const CSphSourceParams_ODBC & tParams )
24343 {
24344 	if ( !CSphSource_SQL::Setup ( tParams ) )
24345 		return false;
24346 
24347 	// parse column buffers spec, if any
24348 	if ( !tParams.m_sColBuffers.IsEmpty() )
24349 	{
24350 		const char * p = tParams.m_sColBuffers.cstr();
24351 		while ( *p )
24352 		{
24353 			// skip space
24354 			while ( sphIsSpace(*p) )
24355 				p++;
24356 
24357 			// expect eof or ident
24358 			if ( !*p )
24359 				break;
24360 			if ( !sphIsAlpha(*p) )
24361 			{
24362 				m_sError.SetSprintf ( "identifier expected in sql_column_buffers near '%s'", p );
24363 				return false;
24364 			}
24365 
24366 			// get ident
24367 			CSphString sCol;
24368 			const char * pIdent = p;
24369 			while ( sphIsAlpha(*p) )
24370 				p++;
24371 			sCol.SetBinary ( pIdent, p-pIdent );
24372 
24373 			// skip space
24374 			while ( sphIsSpace(*p) )
24375 				p++;
24376 
24377 			// expect assignment
24378 			if ( *p!='=' )
24379 			{
24380 				m_sError.SetSprintf ( "'=' expected in sql_column_buffers near '%s'", p );
24381 				return false;
24382 			}
24383 			p++;
24384 
24385 			// skip space
24386 			while ( sphIsSpace(*p) )
24387 				p++;
24388 
24389 			// expect number
24390 			if (!( *p>='0' && *p<='9' ))
24391 			{
24392 				m_sError.SetSprintf ( "number expected in sql_column_buffers near '%s'", p );
24393 				return false;
24394 			}
24395 
24396 			// get value
24397 			int iSize = 0;
24398 			while ( *p>='0' && *p<='9' )
24399 			{
24400 				iSize = 10*iSize + ( *p-'0' );
24401 				p++;
24402 			}
24403 			if ( *p=='K' )
24404 			{
24405 				iSize *= 1024;
24406 				p++;
24407 			} else if ( *p=='M' )
24408 			{
24409 				iSize *= 1048576;
24410 				p++;
24411 			}
24412 
24413 			// hash value
24414 			sCol.ToLower();
24415 			m_hColBuffers.Add ( iSize, sCol );
24416 
24417 			// skip space
24418 			while ( sphIsSpace(*p) )
24419 				p++;
24420 
24421 			// expect eof or comma
24422 			if ( !*p )
24423 				break;
24424 			if ( *p!=',' )
24425 			{
24426 				m_sError.SetSprintf ( "comma expected in sql_column_buffers near '%s'", p );
24427 				return false;
24428 			}
24429 			p++;
24430 		}
24431 	}
24432 
24433 	// ODBC specific params
24434 	m_sOdbcDSN = tParams.m_sOdbcDSN;
24435 	m_bWinAuth = tParams.m_bWinAuth;
24436 	m_bUnicode = tParams.m_bUnicode;
24437 
24438 	// build and store DSN for error reporting
24439 	char sBuf [ 1024 ];
24440 	snprintf ( sBuf, sizeof(sBuf), "odbc%s", m_sSqlDSN.cstr()+3 );
24441 	m_sSqlDSN = sBuf;
24442 
24443 	return true;
24444 }
24445 
24446 
GetSqlError(SQLSMALLINT iHandleType,SQLHANDLE hHandle)24447 void CSphSource_ODBC::GetSqlError ( SQLSMALLINT iHandleType, SQLHANDLE hHandle )
24448 {
24449 	if ( !hHandle )
24450 	{
24451 		m_sError.SetSprintf ( "invalid handle" );
24452 		return;
24453 	}
24454 
24455 	char szState[16] = "";
24456 	char szMessageText[1024] = "";
24457 	SQLINTEGER iError;
24458 	SQLSMALLINT iLen;
24459 	SQLGetDiagRec ( iHandleType, hHandle, 1, (SQLCHAR*)szState, &iError, (SQLCHAR*)szMessageText, 1024, &iLen );
24460 	m_sError = szMessageText;
24461 }
24462 
24463 //////////////////////////////////////////////////////////////////////////
24464 
OdbcPostConnect()24465 void CSphSource_MSSQL::OdbcPostConnect ()
24466 {
24467 	const int MAX_LEN = 1024;
24468 	char szDriver[MAX_LEN];
24469 	char szDriverAttrs[MAX_LEN];
24470 	SQLSMALLINT iDescLen = 0;
24471 	SQLSMALLINT iAttrLen = 0;
24472 	SQLSMALLINT iDir = SQL_FETCH_FIRST;
24473 
24474 	CSphString sDriver;
24475 	for ( ;; )
24476 	{
24477 		SQLRETURN iRet = SQLDrivers ( m_hEnv, iDir, (SQLCHAR*)szDriver, MAX_LEN, &iDescLen, (SQLCHAR*)szDriverAttrs, MAX_LEN, &iAttrLen );
24478 		if ( iRet==SQL_NO_DATA )
24479 			break;
24480 
24481 		iDir = SQL_FETCH_NEXT;
24482 		if ( !strcmp ( szDriver, "SQL Native Client" )
24483 			|| !strncmp ( szDriver, "SQL Server Native Client", strlen("SQL Server Native Client") ) )
24484 		{
24485 			sDriver = szDriver;
24486 			break;
24487 		}
24488 	}
24489 
24490 	if ( sDriver.IsEmpty() )
24491 		sDriver = "SQL Server";
24492 
24493 	if ( m_bWinAuth && m_tParams.m_sUser.IsEmpty () )
24494 	{
24495 		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};Database={%s};Trusted_Connection=yes",
24496 			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sDB.cstr () );
24497 
24498 	} else if ( m_bWinAuth )
24499 	{
24500 		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s};Trusted_Connection=yes",
24501 			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
24502 	} else
24503 	{
24504 		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s}",
24505 			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
24506 	}
24507 }
24508 
24509 #endif
24510 
24511 /////////////////////////////////////////////////////////////////////////////
24512 // MERGER HELPERS
24513 /////////////////////////////////////////////////////////////////////////////
24514 
Read(CSphReader & tReader)24515 void CSphDocMVA::Read ( CSphReader & tReader )
24516 {
24517 	m_iDocID = tReader.GetDocid();
24518 	if ( m_iDocID )
24519 	{
24520 		ARRAY_FOREACH ( i, m_dMVA )
24521 		{
24522 			DWORD iValues = tReader.GetDword();
24523 			m_dMVA[i].Resize ( iValues );
24524 			if ( iValues )
24525 				tReader.GetBytes ( m_dMVA[i].Begin(), iValues*sizeof(DWORD) );
24526 		}
24527 	}
24528 }
24529 
Write(CSphWriter & tWriter)24530 void CSphDocMVA::Write ( CSphWriter & tWriter )
24531 {
24532 	tWriter.PutDocid ( m_iDocID );
24533 	ARRAY_FOREACH ( i, m_dMVA )
24534 	{
24535 		m_dOffsets[i] = ( DWORD )tWriter.GetPos() / sizeof( DWORD );
24536 
24537 		int iValues = m_dMVA[i].GetLength();
24538 		tWriter.PutDword ( iValues );
24539 		if ( iValues )
24540 			tWriter.PutBytes ( m_dMVA[i].Begin(), iValues*sizeof(DWORD) );
24541 	}
24542 }
24543 
24544 /////////////////////////////////////////////////////////////////////////////
24545 
sphSetQuiet(bool bQuiet)24546 void sphSetQuiet ( bool bQuiet )
24547 {
24548 	g_bSphQuiet = bQuiet;
24549 }
24550 
24551 
sphSetDebugCheck()24552 void sphSetDebugCheck ()
24553 {
24554 	g_bDebugCheck = true;
24555 }
24556 
24557 
/// a as a percentage of b
/// computed in integer thousandths of a percent (so truncated, not rounded);
/// a zero total is reported as 100%
static inline float GetPercent ( int64_t a, int64_t b )
{
	if ( b!=0 )
	{
		const int64_t iMilliPercent = a*100000/b;
		return float(iMilliPercent)/1000;
	}

	return 100.0f;
}
24566 
24567 
BuildMessage() const24568 const char * CSphIndexProgress::BuildMessage() const
24569 {
24570 	static char sBuf[256];
24571 	switch ( m_ePhase )
24572 	{
24573 		case PHASE_COLLECT:
24574 			snprintf ( sBuf, sizeof(sBuf), "collected "INT64_FMT" docs, %.1f MB", m_iDocuments,
24575 				float(m_iBytes)/1000000.0f );
24576 			break;
24577 
24578 		case PHASE_SORT:
24579 			snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mhits, %.1f%% done", float(m_iHits)/1000000,
24580 				GetPercent ( m_iHits, m_iHitsTotal ) );
24581 			break;
24582 
24583 		case PHASE_COLLECT_MVA:
24584 			snprintf ( sBuf, sizeof(sBuf), "collected "INT64_FMT" attr values", m_iAttrs );
24585 			break;
24586 
24587 		case PHASE_SORT_MVA:
24588 			snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mvalues, %.1f%% done", float(m_iAttrs)/1000000,
24589 				GetPercent ( m_iAttrs, m_iAttrsTotal ) );
24590 			break;
24591 
24592 		case PHASE_MERGE:
24593 			snprintf ( sBuf, sizeof(sBuf), "merged %.1f Kwords", float(m_iWords)/1000 );
24594 			break;
24595 
24596 		case PHASE_PREREAD:
24597 			snprintf ( sBuf, sizeof(sBuf), "read %.1f of %.1f MB, %.1f%% done",
24598 				float(m_iBytes)/1000000.0f, float(m_iBytesTotal)/1000000.0f,
24599 				GetPercent ( m_iBytes, m_iBytesTotal ) );
24600 			break;
24601 
24602 		case PHASE_PRECOMPUTE:
24603 			snprintf ( sBuf, sizeof(sBuf), "indexing attributes, %d.%d%% done", m_iDone/10, m_iDone%10 );
24604 			break;
24605 
24606 		default:
24607 			assert ( 0 && "internal error: unhandled progress phase" );
24608 			snprintf ( sBuf, sizeof(sBuf), "(progress-phase-%d)", m_ePhase );
24609 			break;
24610 	}
24611 
24612 	sBuf[sizeof(sBuf)-1] = '\0';
24613 	return sBuf;
24614 }
24615 
24616 /////////////////////////////////////////////////////////////////////////////
24617 
sphDictCmp(const char * pStr1,int iLen1,const char * pStr2,int iLen2)24618 int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
24619 {
24620 	assert ( pStr1 && pStr2 );
24621 	assert ( iLen1 && iLen2 );
24622 	const int iCmpLen = Min ( iLen1, iLen2 );
24623 	return strncmp ( pStr1, pStr2, iCmpLen );
24624 }
24625 
sphDictCmpStrictly(const char * pStr1,int iLen1,const char * pStr2,int iLen2)24626 int sphDictCmpStrictly ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
24627 {
24628 	assert ( pStr1 && pStr2 );
24629 	assert ( iLen1 && iLen2 );
24630 	const int iCmpLen = Min ( iLen1, iLen2 );
24631 	const int iCmpRes = strncmp ( pStr1, pStr2, iCmpLen );
24632 	return iCmpRes==0 ? iLen1-iLen2 : iCmpRes;
24633 }
24634 
24635 
WordDictInfo_t()24636 WordDictInfo_t::WordDictInfo_t ()
24637 {
24638 	m_uOff = 0;
24639 	m_iDocs = 0;
24640 	m_iHits = 0;
24641 	m_iDoclistHint = 0;
24642 }
24643 
WordReaderContext_t()24644 WordReaderContext_t::WordReaderContext_t()
24645 {
24646 	m_sWord[0] = '\0';
24647 	m_iLen = 0;
24648 }
24649 
24650 
CWordlist()24651 CWordlist::CWordlist ()
24652 	: m_dCheckpoints ( 0 )
24653 {
24654 	m_iCheckpointsPos = 0;
24655 	m_iSize = 0;
24656 	m_iMaxChunk = 0;
24657 	m_bWordDict = false;
24658 	m_pWords = NULL;
24659 }
24660 
~CWordlist()24661 CWordlist::~CWordlist ()
24662 {
24663 	Reset();
24664 }
24665 
Reset()24666 void CWordlist::Reset ()
24667 {
24668 	m_tFile.Close ();
24669 	m_pBuf.Reset ();
24670 
24671 	m_dCheckpoints.Reset ( 0 );
24672 	SafeDeleteArray ( m_pWords );
24673 }
24674 
ReadCP(CSphAutofile & tFile,DWORD uVer,bool bWordDict,CSphString & sError)24675 bool CWordlist::ReadCP ( CSphAutofile & tFile, DWORD uVer, bool bWordDict, CSphString & sError )
24676 {
24677 	assert ( ( uVer>=21 && bWordDict ) || !bWordDict );
24678 	assert ( m_iCheckpointsPos>0 );
24679 	assert ( m_iSize-m_iCheckpointsPos<UINT_MAX );
24680 
24681 	const int iCheckpointOnlySize = (int)(m_iSize-m_iCheckpointsPos);
24682 	const int iCount = m_dCheckpoints.GetLength();
24683 
24684 	CSphReader tReader;
24685 	tReader.SetFile ( tFile );
24686 	tReader.SeekTo ( m_iCheckpointsPos, iCheckpointOnlySize );
24687 
24688 	m_bWordDict = bWordDict;
24689 
24690 	if ( m_bWordDict )
24691 	{
24692 		const int iArenaSize = iCheckpointOnlySize - (sizeof(DWORD)+sizeof(SphOffset_t))*iCount + sizeof(BYTE)*iCount;
24693 		assert ( iArenaSize>=0 );
24694 		m_pWords = new BYTE[iArenaSize];
24695 		assert ( m_pWords );
24696 
24697 		BYTE * pWord = m_pWords;
24698 		ARRAY_FOREACH ( i, m_dCheckpoints )
24699 		{
24700 			m_dCheckpoints[i].m_sWord = (char *)pWord;
24701 
24702 			const int iLen = tReader.GetDword();
24703 			assert ( iLen>0 );
24704 			assert ( iLen+1+(pWord-m_pWords)<=iArenaSize );
24705 			tReader.GetBytes ( pWord, iLen );
24706 			pWord[iLen] = '\0';
24707 			pWord += iLen+1;
24708 
24709 			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
24710 		}
24711 	} else if ( uVer>=11 )
24712 	{
24713 		// read v.14 checkpoints
24714 		ARRAY_FOREACH ( i, m_dCheckpoints )
24715 		{
24716 			m_dCheckpoints[i].m_iWordID = (SphWordID_t)tReader.GetOffset();
24717 			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
24718 		}
24719 	} else
24720 	{
24721 		// convert v.10 checkpoints
24722 		ARRAY_FOREACH ( i, m_dCheckpoints )
24723 		{
24724 #if USE_64BIT
24725 			m_dCheckpoints[i].m_iWordID = tReader.GetOffset();
24726 #else
24727 			m_dCheckpoints[i].m_iWordID = tReader.GetDword();
24728 #endif
24729 			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetDword();
24730 		}
24731 	}
24732 
24733 	SphOffset_t uMaxChunk = 0;
24734 	ARRAY_FOREACH ( i, m_dCheckpoints )
24735 	{
24736 		SphOffset_t uNextOffset = ( i+1 )==m_dCheckpoints.GetLength()
24737 			? m_iSize
24738 			: m_dCheckpoints[i+1].m_iWordlistOffset;
24739 		uMaxChunk = Max ( uMaxChunk, uNextOffset - m_dCheckpoints[i].m_iWordlistOffset );
24740 	}
24741 	assert ( uMaxChunk<UINT_MAX );
24742 	m_iMaxChunk = (int)uMaxChunk;
24743 
24744 	if ( tReader.GetErrorFlag() )
24745 		sError = tReader.GetErrorMessage();
24746 
24747 	return !tReader.GetErrorFlag();
24748 }
24749 
FindCheckpoint(const char * sWord,int iWordLen,SphWordID_t iWordID,bool bStarMode) const24750 const CSphWordlistCheckpoint * CWordlist::FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const
24751 {
24752 	return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, m_dCheckpoints.Begin(), &m_dCheckpoints.Last() );
24753 }
24754 
/// scan a keywords dictionary chunk for the next entry matching pStr
///
/// entries are prefix-compressed against the previous keyword, which is kept
/// in tCtx between the calls; in star mode an entry matches when pStr is its
/// prefix, otherwise the keyword must be exactly equal
///
/// on a match, fills tWord and returns the position right after the entry
/// (so that the scan can be continued); returns NULL when the chunk is over,
/// or when the entries got past pStr in sort order
const BYTE * CWordlist::GetWord ( const BYTE * pBuf, const char * pStr, int iLen, WordDictInfo_t & tWord, bool bStarMode, WordReaderContext_t & tCtx ) const
{
	assert ( pBuf );
	assert ( pStr && iLen>0 );

	for ( ;; )
	{
		// unpack next word
		// must be in sync with DictEnd()!
		const BYTE uHead = *pBuf++;
		if ( uHead==0 )
			return NULL; // zero byte ends the chunk

		// iKeep bytes are shared with the previous word, iTail new bytes follow
		int iKeep = 0;
		int iTail = 0;
		if ( uHead & 0x80 )
		{
			// short form, both lengths are packed in the head byte
			iTail = ( ( uHead>>4 ) & 7 ) + 1;
			iKeep = uHead & 15;
		} else
		{
			// long form, the shared length takes a byte of its own
			iTail = uHead & 127;
			iKeep = *pBuf++;
		}

		assert ( iKeep+iTail<(int)sizeof(tCtx.m_sWord)-1 );
		assert ( iKeep<=(int)strlen ( (char *)tCtx.m_sWord ) );

		memcpy ( tCtx.m_sWord + iKeep, pBuf, iTail );
		pBuf += iTail;

		tCtx.m_iLen = iKeep + iTail;
		tCtx.m_sWord[tCtx.m_iLen] = '\0';

		// list is sorted, so once we are past the key, there's no such word
		int iCmp;
		if ( bStarMode )
			iCmp = sphDictCmp ( pStr, iLen, (char*)tCtx.m_sWord, tCtx.m_iLen );
		else
			iCmp = sphDictCmpStrictly ( pStr, iLen, (char*)tCtx.m_sWord, tCtx.m_iLen );

		if ( iCmp<0 )
			return NULL;

		// the rest of the entry must be decoded anyway, to get to the next one
		const SphOffset_t uOff = sphUnzipOffset ( pBuf );
		const int iDocs = sphUnzipInt ( pBuf );
		const int iHits = sphUnzipInt ( pBuf );
		BYTE uHint = 0;
		if ( iDocs>=DOCLIST_HINT_THRESH )
			uHint = *pBuf++;

		// not there yet? (in star mode, the key must also not be longer than the entry)
		const bool bMatch = ( iCmp==0 && ( !bStarMode || iLen<=tCtx.m_iLen ) );
		if ( !bMatch )
			continue;

		tWord.m_sWord = (char*)tCtx.m_sWord;
		tWord.m_uOff = uOff;
		tWord.m_iDocs = iDocs;
		tWord.m_iHits = iHits;
		tWord.m_iDoclistHint = DoclistHintUnpack ( iDocs, uHint );
		return pBuf;
	}
}
24812 
GetWord(const BYTE * pBuf,SphWordID_t iWordID,WordDictInfo_t & tWord) const24813 bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, WordDictInfo_t & tWord ) const
24814 {
24815 	SphWordID_t iLastID = 0;
24816 	SphOffset_t uLastOff = 0;
24817 
24818 	for ( ;; )
24819 	{
24820 		// unpack next word ID
24821 		const SphWordID_t iDeltaWord = sphUnzipWordid ( pBuf ); // FIXME! slow with 32bit wordids
24822 
24823 		if ( iDeltaWord==0 ) // wordlist chunk is over
24824 			return false;
24825 
24826 		iLastID += iDeltaWord;
24827 
24828 		// list is sorted, so if there was no match, there's no such word
24829 		if ( iLastID>iWordID )
24830 			return false;
24831 
24832 		// unpack next offset
24833 		const SphOffset_t iDeltaOffset = sphUnzipOffset ( pBuf );
24834 		uLastOff += iDeltaOffset;
24835 
24836 		// unpack doc/hit count
24837 		const int iDocs = sphUnzipInt ( pBuf );
24838 		const int iHits = sphUnzipInt ( pBuf );
24839 
24840 		assert ( iDeltaOffset );
24841 		assert ( iDocs );
24842 		assert ( iHits );
24843 
24844 		// it matches?!
24845 		if ( iLastID==iWordID )
24846 		{
24847 			sphUnzipWordid ( pBuf ); // might be 0 at checkpoint
24848 			const SphOffset_t iDoclistLen = sphUnzipOffset ( pBuf );
24849 
24850 			tWord.m_uOff = uLastOff;
24851 			tWord.m_iDocs = iDocs;
24852 			tWord.m_iHits = iHits;
24853 			tWord.m_iDoclistHint = (int)iDoclistLen;
24854 
24855 			return true;
24856 		}
24857 	}
24858 }
24859 
AcquireDict(const CSphWordlistCheckpoint * pCheckpoint,int iFD,BYTE * pDictBuf) const24860 const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint, int iFD, BYTE * pDictBuf ) const
24861 {
24862 	assert ( pCheckpoint );
24863 	assert ( m_dCheckpoints.GetLength() );
24864 	assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
24865 	assert ( pCheckpoint->m_iWordlistOffset>0 && pCheckpoint->m_iWordlistOffset<=m_iSize );
24866 	assert ( m_pBuf.IsEmpty() || pCheckpoint->m_iWordlistOffset<(int64_t)m_pBuf.GetLength() );
24867 
24868 	const BYTE * pBuf = NULL;
24869 
24870 	if ( !m_pBuf.IsEmpty() )
24871 		pBuf = m_pBuf.GetWritePtr()+pCheckpoint->m_iWordlistOffset;
24872 	else
24873 	{
24874 		assert ( pDictBuf );
24875 		SphOffset_t iChunkLength = 0;
24876 
24877 		// not the end?
24878 		if ( pCheckpoint < &m_dCheckpoints.Last() )
24879 			iChunkLength = pCheckpoint[1].m_iWordlistOffset - pCheckpoint->m_iWordlistOffset;
24880 		else
24881 			iChunkLength = m_iSize - pCheckpoint->m_iWordlistOffset;
24882 
24883 		if ( (int)sphPread ( iFD, pDictBuf, (size_t)iChunkLength, pCheckpoint->m_iWordlistOffset )==iChunkLength )
24884 			pBuf = pDictBuf;
24885 	}
24886 
24887 	return pBuf;
24888 }
24889 
GetPrefixedWords(const char * sWord,int iWordLen,CSphVector<CSphNamedInt> & dPrefixedWords,BYTE * pDictBuf,int iFD) const24890 void CWordlist::GetPrefixedWords ( const char * sWord, int iWordLen, CSphVector<CSphNamedInt> & dPrefixedWords, BYTE * pDictBuf, int iFD ) const
24891 {
24892 	assert ( iWordLen>0 );
24893 
24894 	// empty index?
24895 	if ( !m_dCheckpoints.GetLength() )
24896 		return;
24897 
24898 	const CSphWordlistCheckpoint * pCheckpoint = FindCheckpoint ( sWord, iWordLen, 0, true );
24899 
24900 	while ( pCheckpoint )
24901 	{
24902 		// decode wordlist chunk
24903 		const BYTE * pBuf = AcquireDict ( pCheckpoint, iFD, pDictBuf );
24904 		assert ( pBuf );
24905 
24906 		WordReaderContext_t tReaderCtx;
24907 
24908 		while ( pBuf )
24909 		{
24910 			WordDictInfo_t tResWord;
24911 			pBuf = GetWord ( pBuf, sWord, iWordLen, tResWord, true, tReaderCtx );
24912 
24913 			if ( pBuf )
24914 			{
24915 				assert ( !tResWord.m_sWord.IsEmpty() );
24916 				CSphNamedInt & tPrefixed = dPrefixedWords.Add();
24917 				tPrefixed.m_sName = tResWord.m_sWord; // OPTIMIZE? swap mb?
24918 				tPrefixed.m_iValue = tResWord.m_iDocs;
24919 			}
24920 		}
24921 
24922 		pCheckpoint++;
24923 		if ( pCheckpoint > &m_dCheckpoints.Last() )
24924 			break;
24925 
24926 		if ( sphDictCmp ( sWord, iWordLen, pCheckpoint->m_sWord, strlen ( pCheckpoint->m_sWord ) )<0 )
24927 			break;
24928 	}
24929 }
24930 
Hash(const CSphString & sKey)24931 int CSphStrHashFunc::Hash ( const CSphString & sKey )
24932 {
24933 	return sKey.IsEmpty() ? 0 : sphCRC32 ( (const BYTE *)sKey.cstr() );
24934 }
24935 
24936 
// all indexes should produce the same terms for the same query
Set(const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat)24938 void SphWordStatChecker_t::Set ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat )
24939 {
24940 	m_dSrcWords.Reserve ( hStat.GetLength() );
24941 	hStat.IterateStart();
24942 	while ( hStat.IterateNext() )
24943 	{
24944 		if ( hStat.IterateGet().m_bExpanded )
24945 			continue;
24946 
24947 		m_dSrcWords.Add ( sphFNV64 ( (const BYTE*)hStat.IterateGetKey().cstr() ) );
24948 	}
24949 	m_dSrcWords.Sort();
24950 }
24951 
24952 
DumpDiffer(const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat,const char * sIndex,CSphString & sWarning)24953 void SphWordStatChecker_t::DumpDiffer ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat, const char * sIndex, CSphString & sWarning )
24954 {
24955 	if ( !m_dSrcWords.GetLength() )
24956 		return;
24957 
24958 	bool bGotHead = false;
24959 
24960 	hStat.IterateStart();
24961 	while ( hStat.IterateNext() )
24962 	{
24963 		if ( hStat.IterateGet().m_bExpanded )
24964 			continue;
24965 
24966 		uint64_t uHash = sphFNV64 ( (const BYTE *)hStat.IterateGetKey().cstr() );
24967 		if ( !m_dSrcWords.BinarySearch ( uHash ) )
24968 		{
24969 			if ( !bGotHead )
24970 			{
24971 				sWarning.SetSprintf ( "index '%s': query word(s) mismatch: %s", sIndex, hStat.IterateGetKey().cstr() );
24972 				bGotHead = true;
24973 			} else
24974 			{
24975 				sWarning.SetSprintf ( "%s, %s", sWarning.cstr(), hStat.IterateGetKey().cstr() );
24976 			}
24977 		}
24978 	}
24979 }
24980 
24981 //////////////////////////////////////////////////////////////////////////
24982 // CSphQueryResultMeta
24983 //////////////////////////////////////////////////////////////////////////
24984 
CSphQueryResultMeta()24985 CSphQueryResultMeta::CSphQueryResultMeta ()
24986 : m_iQueryTime ( 0 )
24987 , m_iCpuTime ( 0 )
24988 , m_iMultiplier ( 1 )
24989 , m_iMatches ( 0 )
24990 , m_iTotalMatches ( 0 )
24991 {
24992 }
24993 
24994 
AddStat(const CSphString & sWord,int64_t iDocs,int64_t iHits,bool bExpanded)24995 void CSphQueryResultMeta::AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits, bool bExpanded )
24996 {
24997 	CSphString sFixed;
24998 	const CSphString * pFixed = &sWord;
24999 	if ( sWord.cstr()[0]==MAGIC_WORD_HEAD )
25000 	{
25001 		sFixed = sWord;
25002 		*(char *)( sFixed.cstr() ) = '*';
25003 		pFixed = &sFixed;
25004 	} else if ( sWord.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
25005 	{
25006 		if ( !bExpanded )
25007 		{
25008 			sFixed = sWord;
25009 			*(char *)( sFixed.cstr() ) = '=';
25010 			pFixed = &sFixed;
25011 		} else
25012 		{
25013 			sFixed = sWord.SubString ( 1, sWord.Length()-1 );
25014 			pFixed = &sFixed;
25015 		}
25016 	}
25017 
25018 	WordStat_t * pStats = m_hWordStats ( *pFixed );
25019 	if ( !pStats )
25020 	{
25021 		CSphQueryResultMeta::WordStat_t tStats;
25022 		tStats.m_iDocs = iDocs;
25023 		tStats.m_iHits = iHits;
25024 		tStats.m_bExpanded = bExpanded;
25025 		m_hWordStats.Add ( tStats, *pFixed );
25026 	} else
25027 	{
25028 		pStats->m_iDocs += iDocs;
25029 		pStats->m_iHits += iHits;
25030 		pStats->m_bExpanded |= bExpanded;
25031 	}
25032 }
25033 
25034 
/// Copy constructor; all the copying is delegated to the assignment operator.
CSphQueryResultMeta::CSphQueryResultMeta ( const CSphQueryResultMeta & tMeta )
{
	operator= ( tMeta );
}
25039 
25040 
/// Member-wise assignment: copies counters, error/warning strings and per-word statistics.
CSphQueryResultMeta & CSphQueryResultMeta::operator= ( const CSphQueryResultMeta & tMeta )
{
	// timing and match counters
	m_iQueryTime	= tMeta.m_iQueryTime;
	m_iCpuTime		= tMeta.m_iCpuTime;
	m_iMultiplier	= tMeta.m_iMultiplier;
	m_iMatches		= tMeta.m_iMatches;
	m_iTotalMatches	= tMeta.m_iTotalMatches;

	// messages
	m_sError	= tMeta.m_sError;
	m_sWarning	= tMeta.m_sWarning;

	// per-word statistics
	m_hWordStats = tMeta.m_hWordStats;

	return *this;
}
25056 
25057 //
25058 // $Id: sphinx.cpp 4113 2013-08-26 07:43:28Z deogar $
25059 //
25060