1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #include "sphinx.h"
17 #include "sphinxstem.h"
18 #include "sphinxquery.h"
19 #include "sphinxutils.h"
20 #include "sphinxexpr.h"
21 #include "sphinxfilter.h"
22 #include "sphinxint.h"
23 #include "sphinxsearch.h"
24 #include "sphinxjson.h"
25 #include "sphinxplugin.h"
26 
27 #include <errno.h>
28 #include <ctype.h>
29 #include <fcntl.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <stdarg.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <limits.h>
36 #include <time.h>
37 #include <math.h>
38 #include <float.h>
39 
40 #define SPH_UNPACK_BUFFER_SIZE	4096
41 #define SPH_READ_PROGRESS_CHUNK (8192*1024)
42 #define SPH_READ_NOPROGRESS_CHUNK (32768*1024)
43 
44 #if USE_LIBSTEMMER
45 #include <libstemmer.h>
46 #endif
47 
48 #if USE_LIBEXPAT
49 #define XMLIMPORT
50 #include "expat.h"
51 
52 // workaround for expat versions prior to 1.95.7
53 #ifndef XMLCALL
54 #define XMLCALL
55 #endif
56 #endif
57 
58 #if USE_LIBICONV
59 #include "iconv.h"
60 #endif
61 
62 #if USE_ZLIB
63 #include <zlib.h>
64 #endif
65 
66 #if USE_ODBC
67 #include <sql.h>
68 #endif
69 
70 #if USE_RE2
71 #include <string>
72 #include <re2/re2.h>
73 #endif
74 
75 #if USE_RLP
76 #include "bt_rlp_c.h"
77 #include <bt_xwchar.h>
78 #endif
79 
80 #if USE_WINDOWS
81 	#include <io.h> // for open()
82 
83 	// workaround Windows quirks
84 	#define popen		_popen
85 	#define pclose		_pclose
86 	#define snprintf	_snprintf
87 	#define sphSeek		_lseeki64
88 
89 	#define stat		_stat64
90 	#define fstat		_fstat64
91 	#if _MSC_VER<1400
92 	#define struct_stat	__stat64
93 	#else
94 	#define struct_stat	struct _stat64
95 	#endif
96 
97 	#define ICONV_INBUF_CONST	1
98 #else
99 	#include <unistd.h>
100 	#include <sys/time.h>
101 
102 	#define sphSeek		lseek
103 
104 	#define struct_stat		struct stat
105 #endif
106 
107 #if ( USE_WINDOWS && !BUILD_WITH_CMAKE ) // on windows with cmake manual linkage is not necessary
108 #if ( USE_MYSQL )
109 	#pragma comment(linker, "/defaultlib:libmysql.lib")
110 	#pragma message("Automatically linking with libmysql.lib")
111 #endif
112 
113 #if ( USE_PGSQL )
114 	#pragma comment(linker, "/defaultlib:libpq.lib")
115 	#pragma message("Automatically linking with libpq.lib")
116 #endif
117 
118 #if ( USE_LIBSTEMMER )
119 	#pragma comment(linker, "/defaultlib:libstemmer_c.lib")
120 	#pragma message("Automatically linking with libstemmer_c.lib")
121 #endif
122 
123 #if ( USE_LIBEXPAT )
124 	#pragma comment(linker, "/defaultlib:libexpat.lib")
125 	#pragma message("Automatically linking with libexpat.lib")
126 #endif
127 
128 #if ( USE_LIBICONV )
129 	#pragma comment(linker, "/defaultlib:iconv.lib")
130 	#pragma message("Automatically linking with iconv.lib")
131 #endif
132 
133 #if ( USE_RE2 )
134 	#pragma comment(linker, "/defaultlib:re2.lib")
135 	#pragma message("Automatically linking with re2.lib")
136 #endif
137 #endif
138 
139 #if ( USE_WINDOWS && USE_RLP )
140 	#pragma comment(linker, "/defaultlib:btrlpc.lib")
141 	#pragma message("Automatically linking with btrlpc.lib")
142 	#pragma comment(linker, "/defaultlib:btutils.lib")
143 	#pragma message("Automatically linking with btutils.lib")
144 #endif
145 
146 /////////////////////////////////////////////////////////////////////////////
147 
148 // logf() is not there sometimes (eg. Solaris 9)
149 #if !USE_WINDOWS && !HAVE_LOGF
logf(float v)150 static inline float logf ( float v )
151 {
152 	return (float) log ( v );
153 }
154 #endif
155 
156 #if USE_WINDOWS
localtime_r(const time_t * clock,struct tm * res)157 void localtime_r ( const time_t * clock, struct tm * res )
158 {
159 	*res = *localtime ( clock );
160 }
161 
gmtime_r(const time_t * clock,struct tm * res)162 void gmtime_r ( const time_t * clock, struct tm * res )
163 {
164 	*res = *gmtime ( clock );
165 }
166 #endif
167 
168 // forward decl
169 void sphWarn ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 1, 2 ) ) );
170 static bool sphTruncate ( int iFD );
171 
172 /////////////////////////////////////////////////////////////////////////////
173 // GLOBALS
174 /////////////////////////////////////////////////////////////////////////////
175 
// default charset_table for UTF-8 sources: digits, underscore, latin and cyrillic letters, case-folded to lowercase
const char *		SPHINX_DEFAULT_UTF8_TABLE	= "0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+451, U+451";

// magic boundary tokens; the \3 prefix keeps them from clashing with real keywords
const char *		MAGIC_WORD_SENTENCE		= "\3sentence";		// emitted from source on sentence boundary, stored in dictionary
const char *		MAGIC_WORD_PARAGRAPH	= "\3paragraph";	// emitted from source on paragraph boundary, stored in dictionary

// JSON processing flags (only declared here; presumably set from config — see callers)
bool				g_bJsonStrict				= false;
bool				g_bJsonAutoconvNumbers		= false;
bool				g_bJsonKeynamesToLowercase	= false;

// read buffer defaults and lower bounds, bytes
static const int	DEFAULT_READ_BUFFER		= 262144;
static const int	DEFAULT_READ_UNHINTED	= 32768;
static const int	MIN_READ_BUFFER			= 8192;
static const int	MIN_READ_UNHINTED		= 1024;
#define READ_NO_SIZE_HINT 0

static int			g_iReadBuffer			= DEFAULT_READ_BUFFER;
static int			g_iReadUnhinted			= DEFAULT_READ_UNHINTED;

// fallback share dir; normally supplied by the build system
#ifndef SHAREDIR
#define SHAREDIR "."
#endif

CSphString			g_sLemmatizerBase		= SHAREDIR;	// lemmatizer base path

// RLP settings (defaults; only declared here)
#if USE_RLP
CSphString			g_sRLPRoot				= SHAREDIR;
CSphString			g_sRLPEnv				= SHAREDIR"/rlp-environment.xml";
int					g_iRLPMaxBatchSize		= 51200;
int					g_iRLPMaxBatchDocs		= 50;
#endif

// quick hack for indexer crash reporting
// one day, these might turn into a callback or something
int64_t		g_iIndexerCurrentDocID		= 0;
int64_t		g_iIndexerCurrentHits		= 0;
int64_t		g_iIndexerCurrentRangeMin	= 0;
int64_t		g_iIndexerCurrentRangeMax	= 0;
int64_t		g_iIndexerPoolStartDocID	= 0;
int64_t		g_iIndexerPoolStartHit		= 0;
215 
216 
217 /// global IDF
/// global IDF table
/// (declaration only; the method bodies live elsewhere in this file)
class CSphGlobalIDF
{
public:
	CSphGlobalIDF ()
		: m_iTotalDocuments ( 0 )
		, m_iTotalWords ( 0 )
	{}

	bool			Touch ( const CSphString & sFilename );
	bool			Preread ( const CSphString & sFilename, CSphString & sError );
	DWORD			GetDocs ( const CSphString & sWord ) const;
	float			GetIDF ( const CSphString & sWord, int64_t iDocsLocal, bool bPlainIDF );

protected:
	// pack to 4 bytes so the entry layout is exactly 12 bytes (asserted below)
#pragma pack(push,4)
	struct IDFWord_t
	{
		uint64_t				m_uWordID;	///< word ID
		DWORD					m_iDocs;	///< per-word document count
	};
#pragma pack(pop)
	STATIC_SIZE_ASSERT			( IDFWord_t, 12 );

	static const int			HASH_BITS = 16;
	int64_t						m_iTotalDocuments;
	int64_t						m_iTotalWords;
	SphOffset_t					m_uMTime;	///< NOTE(review): looks like a file mtime used by Touch() to detect changes — confirm in impl
	CSphSharedBuffer<IDFWord_t>	m_pWords;	///< word entries
	CSphSharedBuffer<int64_t>	m_pHash;	///< NOTE(review): presumably a lookup table over m_pWords — verify in Preread()
};
248 
249 
/// global idf definitions hash (filename -> loaded table), guarded by the mutex below
static SmallStringHash_T <CSphGlobalIDF * >	g_hGlobalIDFs;
static CSphStaticMutex						g_tGlobalIDFLock;
253 
254 /////////////////////////////////////////////////////////////////////////////
255 // COMPILE-TIME CHECKS
256 /////////////////////////////////////////////////////////////////////////////
257 
258 STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
259 
260 /////////////////////////////////////////////////////////////////////////////
261 
262 #if !USE_WINDOWS
263 
// whether this is the head (parent) process; children flip this off via sphSetProcessInfo()
bool g_bHeadProcess = true;

/// mark the current process as head (parent) or forked child
void sphSetProcessInfo ( bool bHead )
{
	g_bHeadProcess = bHead;
}
270 
271 #endif // USE_WINDOWS
272 
// whether to collect IO stats
274 static bool g_bCollectIOStats = false;
275 static SphThreadKey_t g_tIOStatsTls;
276 
277 
sphInitIOStats()278 bool sphInitIOStats ()
279 {
280 	if ( !sphThreadKeyCreate ( &g_tIOStatsTls ) )
281 		return false;
282 
283 	g_bCollectIOStats = true;
284 	return true;
285 }
286 
/// tear down IO stats collection (releases the TLS slot, then disables the flag)
void sphDoneIOStats ()
{
	sphThreadKeyDelete ( g_tIOStatsTls );
	g_bCollectIOStats = false;
}
292 
293 
/// all counters start at zero; m_pPrev links into the per-thread stats stack (see Start/Stop)
CSphIOStats::CSphIOStats ()
	: m_iReadTime ( 0 )
	, m_iReadOps ( 0 )
	, m_iReadBytes ( 0 )
	, m_iWriteTime ( 0 )
	, m_iWriteOps ( 0 )
	, m_iWriteBytes ( 0 )
	, m_pPrev ( NULL )
{}
303 
304 
CSphIOStats::~CSphIOStats ()
{
	Stop();	// make sure this object is unhooked from the per-thread stats stack
}
309 
310 
/// push this object on top of the calling thread's stats stack;
/// no-op unless sphInitIOStats() enabled collection
void CSphIOStats::Start()
{
	if ( !g_bCollectIOStats )
		return;

	// remember whoever was collecting before us, then take over
	m_pPrev = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
	sphThreadSet ( g_tIOStatsTls, this );
	m_bEnabled = true;
}
320 
/// pop this object off the calling thread's stats stack,
/// restoring whatever collector was active before Start()
void CSphIOStats::Stop()
{
	if ( !g_bCollectIOStats )
		return;

	m_bEnabled = false;
	sphThreadSet ( g_tIOStatsTls, m_pPrev );
}
329 
330 
Add(const CSphIOStats & b)331 void CSphIOStats::Add ( const CSphIOStats & b )
332 {
333 	m_iReadTime += b.m_iReadTime;
334 	m_iReadOps += b.m_iReadOps;
335 	m_iReadBytes += b.m_iReadBytes;
336 	m_iWriteTime += b.m_iWriteTime;
337 	m_iWriteOps += b.m_iWriteOps;
338 	m_iWriteBytes += b.m_iWriteBytes;
339 }
340 
341 
GetIOStats()342 static CSphIOStats * GetIOStats ()
343 {
344 	if ( !g_bCollectIOStats )
345 		return NULL;
346 
347 	CSphIOStats * pIOStats = (CSphIOStats *)sphThreadGet ( g_tIOStatsTls );
348 
349 	if ( !pIOStats || !pIOStats->IsEnabled() )
350 		return NULL;
351 	return pIOStats;
352 }
353 
354 // a tiny wrapper over ::read() which additionally performs IO stats update
sphRead(int iFD,void * pBuf,size_t iCount)355 static int64_t sphRead ( int iFD, void * pBuf, size_t iCount )
356 {
357 	CSphIOStats * pIOStats = GetIOStats();
358 	int64_t tmStart = 0;
359 	if ( pIOStats )
360 		tmStart = sphMicroTimer();
361 
362 	int64_t iRead = ::read ( iFD, pBuf, iCount );
363 
364 	if ( pIOStats )
365 	{
366 		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
367 		pIOStats->m_iReadOps++;
368 		pIOStats->m_iReadBytes += (-1==iRead) ? 0 : iCount;
369 	}
370 
371 	return iRead;
372 }
373 
374 
375 static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo, CSphString * pError );
376 
377 /////////////////////////////////////////////////////////////////////////////
378 // INTERNAL SPHINX CLASSES DECLARATIONS
379 /////////////////////////////////////////////////////////////////////////////
380 
/// default ctor; starts with no file attached (call Open() later)
CSphAutofile::CSphAutofile ()
	: m_iFD ( -1 )
	, m_bTemporary ( false )
	, m_bWouldTemporary ( false )
	, m_pStat ( NULL )
{
}
388 
389 
/// convenience ctor; opens the file right away (check GetFD()>=0 or sError for the outcome)
CSphAutofile::CSphAutofile ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
	: m_iFD ( -1 )
	, m_bTemporary ( false )
	, m_bWouldTemporary ( false )
	, m_pStat ( NULL )
{
	Open ( sName, iMode, sError, bTemp );
}
398 
399 
/// dtor; closes the descriptor and unlinks the file if it was marked temporary
CSphAutofile::~CSphAutofile ()
{
	Close ();
}
404 
405 
/// open a file; returns the new descriptor, or -1 (with sError set) on failure
/// bTemp marks the file for deletion on Close(), but only if the open succeeded
int CSphAutofile::Open ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp )
{
	assert ( m_iFD==-1 && m_sFilename.IsEmpty () );
	assert ( !sName.IsEmpty() );

#if USE_WINDOWS
	if ( iMode==SPH_O_READ )
	{
		// read-only opens get full sharing flags, so concurrent writers/deleters do not clash
		intptr_t tFD = (intptr_t)CreateFile ( sName.cstr(), GENERIC_READ , FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL );
		m_iFD = _open_osfhandle ( tFD, 0 );
	} else
		m_iFD = ::open ( sName.cstr(), iMode, 0644 );
#else
	m_iFD = ::open ( sName.cstr(), iMode, 0644 );
#endif
	m_sFilename = sName; // stored unconditionally; used for error reporting later even if the open failed

	if ( m_iFD<0 )
		sError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
	else
	{
		m_bTemporary = bTemp; // only if we managed to actually open it
		m_bWouldTemporary = true; // if something goes wrong later, we are allowed to delete the file
	}

	return m_iFD;
}
433 
434 
Close()435 void CSphAutofile::Close ()
436 {
437 	if ( m_iFD>=0 )
438 	{
439 		::close ( m_iFD );
440 		if ( m_bTemporary )
441 			::unlink ( m_sFilename.cstr() );
442 	}
443 
444 	m_iFD = -1;
445 	m_sFilename = "";
446 	m_bTemporary = false;
447 	m_bWouldTemporary = false;
448 }
449 
/// mark the file temporary (delete on Close()), but only if it was actually opened
/// (m_bWouldTemporary is only set after a successful Open())
void CSphAutofile::SetTemporary()
{
	m_bTemporary = m_bWouldTemporary;
}
454 
455 
/// get the name this file was opened with (also valid after a failed Open(), for error reporting)
const char * CSphAutofile::GetFilename () const
{
	assert ( m_sFilename.cstr() );
	return m_sFilename.cstr();
}
461 
462 
GetSize(SphOffset_t iMinSize,bool bCheckSizeT,CSphString & sError)463 SphOffset_t CSphAutofile::GetSize ( SphOffset_t iMinSize, bool bCheckSizeT, CSphString & sError )
464 {
465 	struct_stat st;
466 	if ( stat ( GetFilename(), &st )<0 )
467 	{
468 		sError.SetSprintf ( "failed to stat %s: %s", GetFilename(), strerror(errno) );
469 		return -1;
470 	}
471 	if ( st.st_size<iMinSize )
472 	{
473 		sError.SetSprintf ( "failed to load %s: bad size " INT64_FMT " (at least " INT64_FMT " bytes expected)",
474 			GetFilename(), (int64_t)st.st_size, (int64_t)iMinSize );
475 		return -1;
476 	}
477 	if ( bCheckSizeT )
478 	{
479 		size_t uCheck = (size_t)st.st_size;
480 		if ( st.st_size!=SphOffset_t(uCheck) )
481 		{
482 			sError.SetSprintf ( "failed to load %s: bad size " INT64_FMT " (out of size_t; 4 GB limit on 32-bit machine hit?)",
483 				GetFilename(), (int64_t)st.st_size );
484 			return -1;
485 		}
486 	}
487 	return st.st_size;
488 }
489 
490 
GetSize()491 SphOffset_t CSphAutofile::GetSize ()
492 {
493 	CSphString sTmp;
494 	return GetSize ( 0, false, sTmp );
495 }
496 
497 
/// read exactly iCount bytes into pBuf; returns false (with sError set) on
/// read error or premature EOF; retries transparently on EINTR and reports
/// progress through m_pStat, if one was attached
bool CSphAutofile::Read ( void * pBuf, int64_t iCount, CSphString & sError )
{
	int64_t iToRead = iCount;
	BYTE * pCur = (BYTE *)pBuf;
	while ( iToRead>0 )
	{
		// read in smaller chunks when progress reporting is on, so it fires more often
		int64_t iToReadOnce = ( m_pStat )
			? Min ( iToRead, SPH_READ_PROGRESS_CHUNK )
			: Min ( iToRead, SPH_READ_NOPROGRESS_CHUNK );
		int64_t iGot = sphRead ( GetFD(), pCur, (size_t)iToReadOnce );

		if ( iGot==-1 )
		{
			// interrupted by a signal - try again
			if ( errno==EINTR )
				continue;

			sError.SetSprintf ( "read error in %s (%s); " INT64_FMT " of " INT64_FMT " bytes read",
							GetFilename(), strerror(errno), iCount-iToRead, iCount );
			return false;
		}

		// EOF
		if ( iGot==0 )
		{
			sError.SetSprintf ( "unexpected EOF in %s (%s); " INT64_FMT " of " INT64_FMT " bytes read",
							GetFilename(), strerror(errno), iCount-iToRead, iCount );
			return false;
		}

		iToRead -= iGot;
		pCur += iGot;

		if ( m_pStat )
		{
			m_pStat->m_iBytes += iGot;
			m_pStat->Show ( false );
		}
	}

	// NOTE: defensive only; the loop above can only exit with iToRead==0
	if ( iToRead!=0 )
	{
		sError.SetSprintf ( "read error in %s (%s); " INT64_FMT " of " INT64_FMT " bytes read",
							GetFilename(), strerror(errno), iCount-iToRead, iCount );
		return false;
	}
	return true;
}
546 
547 
/// attach a progress tracker to Read(); pass NULL to detach (pointer is stored, not owned)
void CSphAutofile::SetProgressCallback ( CSphIndexProgress * pStat )
{
	m_pStat = pStat;
}
552 
553 
554 /////////////////////////////////////////////////////////////////////////////
555 
/// generic stateless priority queue
/// fixed capacity binary heap; the "worst" entry (per COMP::IsLess) sits at
/// the root, so pushing into a full queue evicts the worst entry first
template < typename T, typename COMP > class CSphQueue
{
protected:
	T *		m_pData;	///< flat heap storage
	int		m_iUsed;	///< entries currently stored
	int		m_iSize;	///< storage capacity

public:
	/// ctor; allocates storage for iSize entries
	explicit CSphQueue ( int iSize )
		: m_iUsed ( 0 )
		, m_iSize ( iSize )
	{
		assert ( iSize>0 );
		m_pData = new T [ iSize ];
		assert ( m_pData );
	}

	/// dtor; releases the storage
	~CSphQueue ()
	{
		SafeDeleteArray ( m_pData );
	}

	/// add entry to the queue
	bool Push ( const T & tEntry )
	{
		if ( m_iUsed==m_iSize )
		{
			// full queue; entries worse than the current root are simply dropped,
			// otherwise the current root gets evicted to make room
			if ( COMP::IsLess ( tEntry, m_pData[0] ) )
				return true;
			Pop ();
		}

		// append at the tail, then sift up until the heap property holds
		int iChild = m_iUsed++;
		m_pData[iChild] = tEntry;
		while ( iChild )
		{
			int iParent = ( iChild-1 ) >> 1;
			if ( !COMP::IsLess ( m_pData[iChild], m_pData[iParent] ) )
				break;

			Swap ( m_pData[iChild], m_pData[iParent] );
			iChild = iParent;
		}

		return true;
	}

	/// remove root (ie. top priority) entry
	void Pop ()
	{
		assert ( m_iUsed );
		if ( !(--m_iUsed) ) // emptied the queue? nothing left to fix up
			return;

		// promote the tail entry to the root, then sift it down
		m_pData[0] = m_pData[m_iUsed];
		int iEntry = 0;
		for ( ;; )
		{
			int iBest = ( iEntry<<1 ) + 1;
			if ( iBest>=m_iUsed )
				break;

			// pick the lesser of the two children, if both exist
			if ( iBest+1<m_iUsed && COMP::IsLess ( m_pData[iBest+1], m_pData[iBest] ) )
				iBest++;

			// stop once the entry is no worse than its best child
			if ( !COMP::IsLess ( m_pData[iBest], m_pData[iEntry] ) )
				break;

			Swap ( m_pData[iBest], m_pData[iEntry] );
			iEntry = iBest;
		}
	}

	/// get entries count
	inline int GetLength () const
	{
		return m_iUsed;
	}

	/// get current root
	inline const T & Root () const
	{
		assert ( m_iUsed );
		return m_pData[0];
	}
};
661 
662 //////////////////////////////////////////////////////////////////////////
663 
/// possible bin states
/// negative values are terminal conditions; non-negative ones say what delta the decoder expects next
enum ESphBinState
{
	BIN_ERR_READ	= -2,	///< bin read error
	BIN_ERR_END		= -1,	///< bin end
	BIN_POS			= 0,	///< bin is in "expects pos delta" state
	BIN_DOC			= 1,	///< bin is in "expects doc delta" state
	BIN_WORD		= 2		///< bin is in "expects word delta" state
};
673 
674 
/// bin read/precache result codes
enum ESphBinRead
{
	BIN_READ_OK,			///< bin read ok
	BIN_READ_EOF,			///< bin end
	BIN_READ_ERROR,			///< bin read error
	BIN_PRECACHE_OK,		///< precache ok
	BIN_PRECACHE_ERROR		///< precache failed
};
683 
684 
/// aggregated hit info
/// note the m_iWordPos overloading: for regular hits it is a position,
/// for aggregate hits (non-zero field mask) it carries the hit count instead
struct CSphAggregateHit
{
	SphDocID_t		m_uDocID;		///< document ID
	SphWordID_t		m_uWordID;		///< word ID in current dictionary
	const BYTE *	m_sKeyword;		///< word itself (in keywords dictionary case only)
	Hitpos_t		m_iWordPos;		///< word position in current document, or hit count in case of aggregate hit
	FieldMask_t	m_dFieldMask;	///< mask of fields containing this word, 0 for regular hits, non-0 for aggregate hits

	CSphAggregateHit()
		: m_uDocID ( 0 )
		, m_uWordID ( 0 )
		, m_sKeyword ( NULL )
	{}

	/// fetch the aggregate hit count (only valid when the field mask is non-zero)
	int GetAggrCount () const
	{
		assert ( !m_dFieldMask.TestAll ( false ) );
		return m_iWordPos;
	}

	/// store an aggregate hit count into the (overloaded) position slot
	void SetAggrCount ( int iVal )
	{
		m_iWordPos = iVal;
	}
};
711 
712 
713 static const int MAX_KEYWORD_BYTES = SPH_MAX_WORD_LEN*3+4;
714 
715 
/// bin, block input buffer
/// (declaration only; method bodies live elsewhere in this file —
/// field comments below describe apparent usage)
struct CSphBin
{
	static const int	MIN_SIZE	= 8192;		///< smallest allowed per-bin buffer, bytes
	static const int	WARN_SIZE	= 262144;	///< warn when the per-bin buffer drops below this

protected:
	ESphHitless			m_eMode;		///< hitless mode this bin was created with
	int					m_iSize;		///< buffer size, bytes

	BYTE *				m_dBuffer;		///< read buffer
	BYTE *				m_pCurrent;		///< current cursor within m_dBuffer
	int					m_iLeft;		///< unread bytes left in the buffer
	int					m_iDone;		///< "fully consumed" flag
	ESphBinState		m_eState;		///< decoder state, ie. what delta is expected next
	bool				m_bWordDict;	///< whether a keywords dictionary is in use
	bool				m_bError;	// FIXME? sort of redundant, but states are a mess

	CSphAggregateHit	m_tHit;									///< currently decoded hit
	BYTE				m_sKeyword [ MAX_KEYWORD_BYTES ];	///< currently decoded hit keyword (in keywords dict mode)

#ifndef NDEBUG
	SphWordID_t			m_iLastWordID;
	BYTE				m_sLastKeyword [ MAX_KEYWORD_BYTES ];
#endif

	int					m_iFile;		///< my file
	SphOffset_t *		m_pFilePos;		///< shared current offset in file
	ThrottleState_t *	m_pThrottle;	///< IO throttle state (set via SetThrottle(); stored, not owned)

public:
	SphOffset_t			m_iFilePos;		///< my current offset in file
	int					m_iFileLeft;	///< how much data is still unread from the file

public:
	explicit 			CSphBin ( ESphHitless eMode = SPH_HITLESS_NONE, bool bWordDict = false );
						~CSphBin ();

	static int			CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn = true );
	void				Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize );

	SphWordID_t			ReadVLB ();
	int					ReadByte ();
	ESphBinRead			ReadBytes ( void * pDest, int iBytes );
	int					ReadHit ( CSphAggregateHit * pHit, int iRowitems, CSphRowitem * pRowitems );

	DWORD				UnzipInt ();
	SphOffset_t			UnzipOffset ();

	bool				IsEOF () const;
	bool				IsDone () const;
	bool				IsError () const { return m_bError; }
	ESphBinRead			Precache ();
	void				SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }
};
771 
772 /////////////////////////////////////////////////////////////////////////////
773 
774 class CSphIndex_VLN;
775 
/// everything required to setup search term
class DiskIndexQwordSetup_c : public ISphQwordSetup
{
public:
	const CSphAutofile &	m_tDoclist;			///< doclist file (references an external object; must outlive this setup)
	const CSphAutofile &	m_tHitlist;			///< hitlist file (same lifetime caveat)
	bool					m_bSetupReaders;	///< NOTE(review): appears to gate reader attachment in Setup() — impl elsewhere
	const BYTE *			m_pSkips;			///< skiplist data, may be NULL
	CSphQueryProfile *		m_pProfile;			///< query profile to report into, may be NULL

public:
	DiskIndexQwordSetup_c ( const CSphAutofile & tDoclist, const CSphAutofile & tHitlist, const BYTE * pSkips, CSphQueryProfile * pProfile )
		: m_tDoclist ( tDoclist )
		, m_tHitlist ( tHitlist )
		, m_bSetupReaders ( false )
		, m_pSkips ( pSkips )
		, m_pProfile ( pProfile )
	{
	}

	virtual ISphQword *					QwordSpawn ( const XQKeyword_t & tWord ) const;
	virtual bool						QwordSetup ( ISphQword * ) const;

	bool								Setup ( ISphQword * ) const;
};
801 
802 
803 /// query word from the searcher's point of view
804 class DiskIndexQwordTraits_c : public ISphQword
805 {
806 	static const int	MINIBUFFER_LEN = 1024;
807 
808 public:
809 	/// tricky bit
810 	/// m_uHitPosition is always a current position in the .spp file
811 	/// base ISphQword::m_iHitlistPos carries the inlined hit data when m_iDocs==1
812 	/// but this one is always a real position, used for delta coding
813 	SphOffset_t		m_uHitPosition;
814 	Hitpos_t		m_uInlinedHit;
815 	DWORD			m_uHitState;
816 
817 	CSphMatch		m_tDoc;			///< current match (partial)
818 	Hitpos_t		m_iHitPos;		///< current hit postition, from hitlist
819 
820 	BYTE			m_dDoclistBuf [ MINIBUFFER_LEN ];
821 	BYTE			m_dHitlistBuf [ MINIBUFFER_LEN ];
822 	CSphReader		m_rdDoclist;	///< my doclist reader
823 	CSphReader		m_rdHitlist;	///< my hitlist reader
824 
825 	SphDocID_t		m_iMinID;		///< min ID to fixup
826 	int				m_iInlineAttrs;	///< inline attributes count
827 
828 	const CSphRowitem *	m_pInlineFixup;	///< inline attributes fixup (POINTER TO EXTERNAL DATA, NOT MANAGED BY THIS CLASS!)
829 
830 #ifndef NDEBUG
831 	bool			m_bHitlistOver;
832 #endif
833 
834 public:
DiskIndexQwordTraits_c(bool bUseMini,bool bExcluded)835 	explicit DiskIndexQwordTraits_c ( bool bUseMini, bool bExcluded )
836 		: m_uHitPosition ( 0 )
837 		, m_uHitState ( 0 )
838 		, m_iHitPos ()
839 		, m_rdDoclist ( bUseMini ? m_dDoclistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
840 		, m_rdHitlist ( bUseMini ? m_dHitlistBuf : NULL, bUseMini ? MINIBUFFER_LEN : 0 )
841 		, m_iMinID ( 0 )
842 		, m_iInlineAttrs ( 0 )
843 		, m_pInlineFixup ( NULL )
844 #ifndef NDEBUG
845 		, m_bHitlistOver ( true )
846 #endif
847 	{
848 		m_iHitPos = EMPTY_HIT;
849 		m_bExcluded = bExcluded;
850 	}
851 
ResetDecoderState()852 	void ResetDecoderState ()
853 	{
854 		ISphQword::Reset();
855 		m_uHitPosition = 0;
856 		m_uInlinedHit = 0;
857 		m_uHitState = 0;
858 		m_tDoc.m_uDocID = m_iMinID;
859 		m_iHitPos = EMPTY_HIT;
860 	}
861 
862 	virtual bool Setup ( const DiskIndexQwordSetup_c * pSetup ) = 0;
863 };
864 
865 
// comparison helpers so FindSpan() (see HintDocid() below) can binary-search a skiplist by base docid
bool operator < ( const SkiplistEntry_t & a, SphDocID_t b )		{ return a.m_iBaseDocid<b; }
bool operator == ( const SkiplistEntry_t & a, SphDocID_t b )	{ return a.m_iBaseDocid==b; }
bool operator < ( SphDocID_t a, const SkiplistEntry_t & b )		{ return a<b.m_iBaseDocid; }
869 
870 
871 /// query word from the searcher's point of view
872 template < bool INLINE_HITS, bool INLINE_DOCINFO, bool DISABLE_HITLIST_SEEK >
873 class DiskIndexQword_c : public DiskIndexQwordTraits_c
874 {
875 public:
DiskIndexQword_c(bool bUseMinibuffer,bool bExcluded)876 	DiskIndexQword_c ( bool bUseMinibuffer, bool bExcluded )
877 		: DiskIndexQwordTraits_c ( bUseMinibuffer, bExcluded )
878 	{}
879 
Reset()880 	virtual void Reset ()
881 	{
882 		m_rdDoclist.Reset ();
883 		m_rdDoclist.Reset ();
884 		m_iInlineAttrs = 0;
885 		ResetDecoderState();
886 	}
887 
GetHitlistEntry()888 	void GetHitlistEntry ()
889 	{
890 		assert ( !m_bHitlistOver );
891 		DWORD iDelta = m_rdHitlist.UnzipInt ();
892 		if ( iDelta )
893 		{
894 			m_iHitPos += iDelta;
895 		} else
896 		{
897 			m_iHitPos = EMPTY_HIT;
898 #ifndef NDEBUG
899 			m_bHitlistOver = true;
900 #endif
901 		}
902 	}
903 
HintDocid(SphDocID_t uMinID)904 	virtual void HintDocid ( SphDocID_t uMinID )
905 	{
906 		// tricky bit
907 		// FindSpan() will match a block where BaseDocid is >= RefValue
908 		// meaning that the subsequent ids decoded will be strictly > RefValue
909 		// meaning that if previous (!) blocks end with uMinID exactly,
910 		// and we use uMinID itself as RefValue, that document gets lost!
911 		// OPTIMIZE? keep last matched block index maybe?
912 		int iBlock = FindSpan ( m_dSkiplist, uMinID - m_iMinID - 1 );
913 		if ( iBlock<0 )
914 			return;
915 		const SkiplistEntry_t & t = m_dSkiplist [ iBlock ];
916 		if ( t.m_iOffset<=m_rdDoclist.GetPos() )
917 			return;
918 		m_rdDoclist.SeekTo ( t.m_iOffset, -1 );
919 		m_tDoc.m_uDocID = t.m_iBaseDocid + m_iMinID;
920 		m_uHitPosition = m_iHitlistPos = t.m_iBaseHitlistPos;
921 	}
922 
GetNextDoc(DWORD * pDocinfo)923 	virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )
924 	{
925 		SphDocID_t uDelta = m_rdDoclist.UnzipDocid();
926 		if ( uDelta )
927 		{
928 			m_bAllFieldsKnown = false;
929 			m_tDoc.m_uDocID += uDelta;
930 			if_const ( INLINE_DOCINFO )
931 			{
932 				assert ( pDocinfo );
933 				for ( int i=0; i<m_iInlineAttrs; i++ )
934 					pDocinfo[i] = m_rdDoclist.UnzipInt() + m_pInlineFixup[i];
935 			}
936 
937 			if_const ( INLINE_HITS )
938 			{
939 				m_uMatchHits = m_rdDoclist.UnzipInt();
940 				const DWORD uFirst = m_rdDoclist.UnzipInt();
941 				if ( m_uMatchHits==1 && m_bHasHitlist )
942 				{
943 					DWORD uField = m_rdDoclist.UnzipInt(); // field and end marker
944 					m_iHitlistPos = uFirst | ( uField << 23 ) | ( U64C(1)<<63 );
945 					m_dQwordFields.UnsetAll();
946 					// want to make sure bad field data not cause crash
947 					m_dQwordFields.Set ( ( uField >> 1 ) & ( (DWORD)SPH_MAX_FIELDS-1 ) );
948 					m_bAllFieldsKnown = true;
949 				} else
950 				{
951 					m_dQwordFields.Assign32 ( uFirst );
952 					m_uHitPosition += m_rdDoclist.UnzipOffset();
953 					m_iHitlistPos = m_uHitPosition;
954 				}
955 			} else
956 			{
957 				SphOffset_t iDeltaPos = m_rdDoclist.UnzipOffset();
958 				assert ( iDeltaPos>=0 );
959 
960 				m_iHitlistPos += iDeltaPos;
961 
962 				m_dQwordFields.Assign32 ( m_rdDoclist.UnzipInt() );
963 				m_uMatchHits = m_rdDoclist.UnzipInt();
964 			}
965 		} else
966 		{
967 			m_tDoc.m_uDocID = 0;
968 		}
969 		return m_tDoc;
970 	}
971 
SeekHitlist(SphOffset_t uOff)972 	virtual void SeekHitlist ( SphOffset_t uOff )
973 	{
974 		if ( uOff >> 63 )
975 		{
976 			m_uHitState = 1;
977 			m_uInlinedHit = (DWORD)uOff; // truncate high dword
978 		} else
979 		{
980 			m_uHitState = 0;
981 			m_iHitPos = EMPTY_HIT;
982 			if_const ( DISABLE_HITLIST_SEEK )
983 				assert ( m_rdHitlist.GetPos()==uOff ); // make sure we're where caller thinks we are.
984 			else
985 				m_rdHitlist.SeekTo ( uOff, READ_NO_SIZE_HINT );
986 		}
987 #ifndef NDEBUG
988 		m_bHitlistOver = false;
989 #endif
990 	}
991 
GetNextHit()992 	virtual Hitpos_t GetNextHit ()
993 	{
994 		assert ( m_bHasHitlist );
995 		switch ( m_uHitState )
996 		{
997 			case 0: // read hit from hitlist
998 				GetHitlistEntry ();
999 				return m_iHitPos;
1000 
1001 			case 1: // return inlined hit
1002 				m_uHitState = 2;
1003 				return m_uInlinedHit;
1004 
1005 			case 2: // return end-of-hitlist marker after inlined hit
1006 				#ifndef NDEBUG
1007 				m_bHitlistOver = true;
1008 				#endif
1009 				m_uHitState = 0;
1010 				return EMPTY_HIT;
1011 		}
1012 		sphDie ( "INTERNAL ERROR: impossible hit emitter state" );
1013 		return EMPTY_HIT;
1014 	}
1015 
Setup(const DiskIndexQwordSetup_c * pSetup)1016 	bool Setup ( const DiskIndexQwordSetup_c * pSetup )
1017 	{
1018 		return pSetup->Setup ( this );
1019 	}
1020 };
1021 
1022 //////////////////////////////////////////////////////////////////////////////
1023 
/// dispatch helper: picks the DiskIndexQword_c specialization matching the
/// index settings (hits inlined? docinfo inlined?), typedefs it as NAME,
/// and executes ACTION with that typedef in scope
#define WITH_QWORD(INDEX, NO_SEEK, NAME, ACTION)													\
{																									\
	CSphIndex_VLN * INDEX##pIndex = (CSphIndex_VLN *)INDEX;												\
	DWORD INDEX##uInlineHits = INDEX##pIndex->m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE;					\
	DWORD INDEX##uInlineDocinfo = INDEX##pIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE;						\
																									\
	switch ( ( INDEX##uInlineHits<<1 ) | INDEX##uInlineDocinfo )													\
	{																								\
		case 0: { typedef DiskIndexQword_c < false, false, NO_SEEK > NAME; ACTION; break; }			\
		case 1: { typedef DiskIndexQword_c < false, true, NO_SEEK > NAME; ACTION; break; }			\
		case 2: { typedef DiskIndexQword_c < true, false, NO_SEEK > NAME; ACTION; break; }			\
		case 3: { typedef DiskIndexQword_c < true, true, NO_SEEK > NAME; ACTION; break; }			\
		default:																					\
			sphDie ( "INTERNAL ERROR: impossible qword settings" );									\
	}																								\
}
1040 
1041 /////////////////////////////////////////////////////////////////////////////
1042 
// hitless document entries: the top bit of the doc value flags "no hits", the rest is the payload
#define HITLESS_DOC_MASK 0x7FFFFFFF
#define HITLESS_DOC_FLAG 0x80000000
1045 
1046 
/// a doclist slice: 64-bit file offset plus a length
/// (the length is used as a read-size hint; see DiskPayloadQword_c::SetupReader())
struct Slice64_t
{
	uint64_t	m_uOff;
	int			m_iLen;
};
1052 
/// payload carrying a set of doclist slices plus precomputed totals;
/// consumed by DiskPayloadQword_c, which iterates the slices in order
struct DiskSubstringPayload_t : public ISphSubstringPayload
{
	explicit DiskSubstringPayload_t ( int iDoclists )
		: m_dDoclist ( iDoclists )
		, m_iTotalDocs ( 0 )
		, m_iTotalHits ( 0 )
	{}
	CSphFixedVector<Slice64_t>	m_dDoclist;		///< the doclist slices
	int							m_iTotalDocs;	///< total documents over all slices
	int							m_iTotalHits;	///< total hits over all slices
};
1064 
1065 
/// qword that iterates a payload's multiple doclist slices as one virtual doclist
template < bool INLINE_HITS >
class DiskPayloadQword_c : public DiskIndexQword_c<INLINE_HITS, false, false>
{
	typedef DiskIndexQword_c<INLINE_HITS, false, false> BASE;

public:
	/// binds readers over the given doclist/hitlist files; doc/hit totals come straight from the payload
	explicit DiskPayloadQword_c ( const DiskSubstringPayload_t * pPayload, bool bExcluded,
		const CSphAutofile & tDoclist, const CSphAutofile & tHitlist, CSphQueryProfile * pProfile )
		: BASE ( true, bExcluded )
	{
		m_pPayload = pPayload;
		this->m_iDocs = m_pPayload->m_iTotalDocs;
		this->m_iHits = m_pPayload->m_iTotalHits;
		m_iDoclist = 0;

		this->m_rdDoclist.SetFile ( tDoclist );
		this->m_rdDoclist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
		this->m_rdDoclist.m_pProfile = pProfile;
		this->m_rdDoclist.m_eProfileState = SPH_QSTATE_READ_DOCS;

		this->m_rdHitlist.SetFile ( tHitlist );
		this->m_rdHitlist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
		this->m_rdHitlist.m_pProfile = pProfile;
		this->m_rdHitlist.m_eProfileState = SPH_QSTATE_READ_HITS;
	}

	/// chains the payload slices: when the current slice ends (docid 0),
	/// seek to the next slice and continue decoding from there
	virtual const CSphMatch & GetNextDoc ( DWORD * pDocinfo )
	{
		const CSphMatch & tMatch = BASE::GetNextDoc ( pDocinfo );
		assert ( &tMatch==&this->m_tDoc );
		if ( !tMatch.m_uDocID && m_iDoclist<m_pPayload->m_dDoclist.GetLength() )
		{
			BASE::ResetDecoderState();
			SetupReader();
			BASE::GetNextDoc ( pDocinfo );
			assert ( this->m_tDoc.m_uDocID );
		}

		return this->m_tDoc;
	}

	/// returns false once all the payload slices were consumed
	bool Setup ( const DiskIndexQwordSetup_c * )
	{
		if ( m_iDoclist>=m_pPayload->m_dDoclist.GetLength() )
			return false;

		SetupReader();
		return true;
	}

private:
	/// seek the doclist reader to the next payload slice (slice length doubles as the read hint)
	void SetupReader ()
	{
		uint64_t uDocOff = m_pPayload->m_dDoclist[m_iDoclist].m_uOff;
		int iHint = m_pPayload->m_dDoclist[m_iDoclist].m_iLen;
		m_iDoclist++;

		this->m_rdDoclist.SeekTo ( uDocOff, iHint );
	}

	const DiskSubstringPayload_t *	m_pPayload;	///< slices and totals (stored, not owned)
	int								m_iDoclist;	///< index of the next slice to read
};
1129 
1130 /////////////////////////////////////////////////////////////////////////////
1131 
1132 
1133 /////////////////////////////////////////////////////////////////////////////
1134 
/// wordlist checkpoint: identifies a dictionary block by its leading word
/// and stores the block offset within the wordlist file
struct CSphWordlistCheckpoint
{
	union
	{
		SphWordID_t		m_uWordID;	// leading word id (crc dicts)
		const char *	m_sWord;	// leading keyword text (keyword dicts)
	};
	SphOffset_t			m_iWordlistOffset;	// block offset in the wordlist file
};
1144 
// pre-v11 wordlist checkpoint; note the narrower 32-bit offset
// (kept only to read legacy index headers)
struct CSphWordlistCheckpoint_v10
{
	SphWordID_t			m_uWordID;			// leading word id
	DWORD				m_iWordlistOffset;	// block offset, 32 bits in the old format
};
1151 
1152 /////////////////////////////////////////////////////////////////////////////
1153 
ReadFileInfo(CSphReader & tReader,const char * szFilename,CSphSavedFile & tFile,CSphString * sWarning)1154 static void ReadFileInfo ( CSphReader & tReader, const char * szFilename, CSphSavedFile & tFile, CSphString * sWarning )
1155 {
1156 	tFile.m_uSize = tReader.GetOffset ();
1157 	tFile.m_uCTime = tReader.GetOffset ();
1158 	tFile.m_uMTime = tReader.GetOffset ();
1159 	tFile.m_uCRC32 = tReader.GetDword ();
1160 	tFile.m_sFilename = szFilename;
1161 
1162 	if ( szFilename && *szFilename && sWarning )
1163 	{
1164 		struct_stat tFileInfo;
1165 		if ( stat ( szFilename, &tFileInfo ) < 0 )
1166 			sWarning->SetSprintf ( "failed to stat %s: %s", szFilename, strerror(errno) );
1167 		else
1168 		{
1169 			DWORD uMyCRC32 = 0;
1170 			if ( !sphCalcFileCRC32 ( szFilename, uMyCRC32 ) )
1171 				sWarning->SetSprintf ( "failed to calculate CRC32 for %s", szFilename );
1172 			else
1173 				if ( uMyCRC32!=tFile.m_uCRC32 || tFileInfo.st_size!=tFile.m_uSize
1174 					|| tFileInfo.st_ctime!=tFile.m_uCTime || tFileInfo.st_mtime!=tFile.m_uMTime )
1175 						sWarning->SetSprintf ( "'%s' differs from the original", szFilename );
1176 		}
1177 	}
1178 }
1179 
1180 
WriteFileInfo(CSphWriter & tWriter,const CSphSavedFile & tInfo)1181 static void WriteFileInfo ( CSphWriter & tWriter, const CSphSavedFile & tInfo )
1182 {
1183 	tWriter.PutOffset ( tInfo.m_uSize );
1184 	tWriter.PutOffset ( tInfo.m_uCTime );
1185 	tWriter.PutOffset ( tInfo.m_uMTime );
1186 	tWriter.PutDword ( tInfo.m_uCRC32 );
1187 }
1188 
1189 
/// dict=keywords block reader
/// sequentially unpacks dictionary entries out of one in-memory keywords
/// block; the current entry's data lives in the CSphDictEntry base
class KeywordsBlockReader_c : public CSphDictEntry
{
private:
	const BYTE *	m_pBuf;							// current read position within the block
	BYTE			m_sWord [ MAX_KEYWORD_BYTES ];	// current (unpacked) keyword
	int				m_iLen;							// current keyword length, bytes
	BYTE			m_uHint;						// current entry's doclist size hint
	bool			m_bHaveSkips;					// whether the index carries skiplist data

public:
	explicit		KeywordsBlockReader_c ( const BYTE * pBuf, bool bHaveSkiplists );

	// advance to the next dictionary entry
	// NOTE(review): assumed to return false once the block is exhausted —
	// confirm against the out-of-line definition
	bool			UnpackWord();

	const char *	GetWord() const			{ return (const char*)m_sWord; }
	int				GetWordLen() const		{ return m_iLen; }
};
1207 
1208 
// dictionary header
// records where the checkpoint and infix data live within the dictionary
struct DictHeader_t
{
	int				m_iDictCheckpoints;			///< how many dict checkpoints (keyword blocks) are there
	SphOffset_t		m_iDictCheckpointsOffset;	///< dict checkpoints file position

	int				m_iInfixCodepointBytes;		///< max bytes per infix codepoint (0 means no infixes)
	int64_t			m_iInfixBlocksOffset;		///< infix blocks file position (stored as unsigned 32bit int as keywords dictionary is pretty small)
	int				m_iInfixBlocksWordsSize;	///< infix checkpoints size

	/// everything starts out zeroed, ie. "no checkpoints, no infixes"
	DictHeader_t()
		: m_iDictCheckpoints ( 0 )
		, m_iDictCheckpointsOffset ( 0 )
		, m_iInfixCodepointBytes ( 0 )
		, m_iInfixBlocksOffset ( 0 )
		, m_iInfixBlocksWordsSize ( 0 )
	{}
};
1227 
1228 
// !COMMIT eliminate this, move it to proper dict impls
/// on-disk wordlist accessor: holds the checkpoints, the infix blocks,
/// and the (possibly fully cached) wordlist file
class CWordlist : public ISphWordlist, public DictHeader_t
{
public:
	CSphFixedVector<CSphWordlistCheckpoint>	m_dCheckpoints;		///< checkpoint offsets
	CSphVector<InfixBlock_t>				m_dInfixBlocks;		///< infix checkpoint blocks

	CSphAutofile						m_tFile;				///< file
	int64_t								m_iSize;				///< file size
	CSphSharedBuffer<BYTE>				m_pBuf;					///< my cache
	int									m_iMaxChunk;			///< max size of entry between checkpoints
	SphOffset_t							m_iWordsEnd;			///< end of wordlist
	bool								m_bHaveSkips;			///< whether there are skiplists

	CSphFixedVector<BYTE>				m_pWords;				///< arena for checkpoint's words
	BYTE *								m_pInfixBlocksWords;	///< arena for infix checkpoint's words

public:
										CWordlist ();
										~CWordlist ();
	void								Reset ();

	/// load the checkpoints (and infix blocks, when present) from the file
	bool								ReadCP ( CSphAutofile & tFile, DWORD uVersion, bool bWordDict, CSphString & sError );

	/// locate the checkpoint whose block may contain the given word
	const CSphWordlistCheckpoint *		FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const;
	bool								GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const;

	const BYTE *						AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const;
	virtual void						GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const;
	virtual void						GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const;

private:
	bool								m_bWordDict;	///< dict=keywords mode flag (set by ReadCP)
};
1263 
1264 
1265 class CSphHitBuilder;
1266 
1267 
/// everything needed to write out an index header:
/// the source stats, the dict header, and the extra fields below
struct BuildHeader_t : public CSphSourceStats, public DictHeader_t
{
	/// seed from collected source stats (copies the document and byte totals)
	explicit BuildHeader_t ( const CSphSourceStats & tStat )
		: m_sHeaderExtension ( NULL )
		, m_pThrottle ( NULL )
		, m_pMinRow ( NULL )
		, m_uMinDocid ( 0 )
		, m_uKillListSize ( 0 )
		, m_iMinMaxIndex ( 0 )
		, m_iTotalDups ( 0 )
	{
		m_iTotalDocuments = tStat.m_iTotalDocuments;
		m_iTotalBytes = tStat.m_iTotalBytes;
	}

	const char *		m_sHeaderExtension;	///< extension to write the header under
	ThrottleState_t *	m_pThrottle;		///< I/O throttle to apply while writing
	const CSphRowitem *	m_pMinRow;			///< min attribute row
	SphDocID_t			m_uMinDocid;		///< min document id
	DWORD				m_uKillListSize;	///< kill-list size, in elements
	int64_t				m_iMinMaxIndex;		///< stored min/max cache offset
	int					m_iTotalDups;		///< duplicate documents count
};
1291 
CheckFmtMagic(DWORD uHeader)1292 const char* CheckFmtMagic ( DWORD uHeader )
1293 {
1294 	if ( uHeader!=INDEX_MAGIC_HEADER )
1295 	{
1296 		FlipEndianess ( &uHeader );
1297 		if ( uHeader==INDEX_MAGIC_HEADER )
1298 #if USE_LITTLE_ENDIAN
1299 			return "This instance is working on little-endian platform, but %s seems built on big-endian host.";
1300 #else
1301 			return "This instance is working on big-endian platform, but %s seems built on little-endian host.";
1302 #endif
1303 		else
1304 			return "%s is invalid header file (too old index version?)";
1305 	}
1306 	return NULL;
1307 }
1308 
ReadVersion(const char * sPath,CSphString & sError)1309 DWORD ReadVersion ( const char * sPath, CSphString & sError )
1310 {
1311 	BYTE dBuffer[8];
1312 	CSphAutoreader rdHeader ( dBuffer, sizeof(dBuffer) );
1313 	if ( !rdHeader.Open ( sPath, sError ) )
1314 		return 0;
1315 
1316 	// check magic header
1317 	const char* sMsg = CheckFmtMagic ( rdHeader.GetDword() );
1318 	if ( sMsg )
1319 	{
1320 		sError.SetSprintf ( sMsg, sPath );
1321 		return 0;
1322 	}
1323 
1324 	// get version
1325 	DWORD uVersion = rdHeader.GetDword();
1326 	if ( uVersion==0 || uVersion>INDEX_FORMAT_VERSION )
1327 	{
1328 		sError.SetSprintf ( "%s is v.%d, binary is v.%d", sPath, uVersion, INDEX_FORMAT_VERSION );
1329 		return 0;
1330 	}
1331 
1332 	return uVersion;
1333 }
1334 
1335 
// index file extension tables, per format version: the *17 tables cover
// pre-v31 formats, the *31 tables add the .spe (skiplist) file.
// NEW/OLD/CUR/LOC correspond to ESphExtType (presumably: freshly-built,
// rotation backup, live, and live-plus-lock sets — verify against the
// rotation code that consumes these via sphGetExts()).
static const char * g_dNewExts17[] = { ".new.sph", ".new.spa", ".new.spi", ".new.spd", ".new.spp", ".new.spm", ".new.spk", ".new.sps" };
static const char * g_dOldExts17[] = { ".old.sph", ".old.spa", ".old.spi", ".old.spd", ".old.spp", ".old.spm", ".old.spk", ".old.sps", ".old.mvp" };
static const char * g_dCurExts17[] = { ".sph", ".spa", ".spi", ".spd", ".spp", ".spm", ".spk", ".sps", ".mvp" };
static const char * g_dLocExts17[] = { ".sph", ".spa", ".spi", ".spd", ".spp", ".spm", ".spk", ".sps", ".spl" };

static const char * g_dNewExts31[] = { ".new.sph", ".new.spa", ".new.spi", ".new.spd", ".new.spp", ".new.spm", ".new.spk", ".new.sps", ".new.spe" };
static const char * g_dOldExts31[] = { ".old.sph", ".old.spa", ".old.spi", ".old.spd", ".old.spp", ".old.spm", ".old.spk", ".old.sps", ".old.spe", ".old.mvp" };
static const char * g_dCurExts31[] = { ".sph", ".spa", ".spi", ".spd", ".spp", ".spm", ".spk", ".sps", ".spe", ".mvp" };
static const char * g_dLocExts31[] = { ".sph", ".spa", ".spi", ".spd", ".spp", ".spm", ".spk", ".sps", ".spe", ".spl" };

// current-version tables, indexed by ESphExtType (used by sphGetExt)
static const char ** g_pppAllExts[] = { g_dCurExts31, g_dNewExts31, g_dOldExts31, g_dLocExts31 };
1347 
1348 
sphGetExts(ESphExtType eType,DWORD uVersion)1349 const char ** sphGetExts ( ESphExtType eType, DWORD uVersion )
1350 {
1351 	if ( uVersion<31 )
1352 	{
1353 		switch ( eType )
1354 		{
1355 		case SPH_EXT_TYPE_NEW: return g_dNewExts17;
1356 		case SPH_EXT_TYPE_OLD: return g_dOldExts17;
1357 		case SPH_EXT_TYPE_CUR: return g_dCurExts17;
1358 		case SPH_EXT_TYPE_LOC: return g_dLocExts17;
1359 		}
1360 
1361 	} else
1362 	{
1363 		switch ( eType )
1364 		{
1365 		case SPH_EXT_TYPE_NEW: return g_dNewExts31;
1366 		case SPH_EXT_TYPE_OLD: return g_dOldExts31;
1367 		case SPH_EXT_TYPE_CUR: return g_dCurExts31;
1368 		case SPH_EXT_TYPE_LOC: return g_dLocExts31;
1369 		}
1370 	}
1371 
1372 	assert ( 0 && "Unknown extension type" );
1373 	return NULL;
1374 }
1375 
sphGetExtCount(DWORD uVersion)1376 int sphGetExtCount ( DWORD uVersion )
1377 {
1378 	if ( uVersion<31 )
1379 		return 8;
1380 	else
1381 		return 9;
1382 }
1383 
sphGetExt(ESphExtType eType,ESphExt eExt)1384 const char * sphGetExt ( ESphExtType eType, ESphExt eExt )
1385 {
1386 	if ( eExt==SPH_EXT_MVP )
1387 	{
1388 		assert ( eType==SPH_EXT_TYPE_CUR || eType==SPH_EXT_TYPE_OLD );
1389 		return g_pppAllExts[eType][eExt];
1390 	}
1391 
1392 	assert ( eExt>=0 && eExt<=(int)sizeof(g_pppAllExts[0])/(int)sizeof(g_pppAllExts[0][0]));
1393 
1394 	return g_pppAllExts[eType][eExt];
1395 }
1396 
/// this pseudo-index used to store and manage the tokenizer
/// without any footprint in real files
//////////////////////////////////////////////////////////////////////////
static CSphSourceStats g_tTmpDummyStat;	// zeroed stats shared by all tokenizer pseudo-indexes
class CSphTokenizerIndex : public CSphIndex
{
public:
	CSphTokenizerIndex () : CSphIndex ( NULL, NULL ) {}

	// every override below is a do-nothing stub; the only real functionality
	// this pseudo-index provides is GetKeywords() (defined out of line)
	virtual SphDocID_t *		GetKillList () const { return NULL; }
	virtual int					GetKillListSize () const { return 0 ; }
	virtual bool				HasDocid ( SphDocID_t ) const { return false; }
	virtual int					Build ( const CSphVector<CSphSource*> & , int , int ) { return 0; }
	virtual bool				Merge ( CSphIndex * , const CSphVector<CSphFilterSettings> & , bool ) {return false; }
	virtual bool				Prealloc ( bool , bool , CSphString & ) { return false; }
	virtual void				Dealloc () {}
	virtual bool				Preread () { return false; }
	virtual void				SetBase ( const char * ) {}
	virtual bool				Rename ( const char * ) { return false; }
	virtual bool				Lock () { return false; }
	virtual void				Unlock () {}
	virtual bool				Mlock () { return false; }
	virtual void				PostSetup() {}
	virtual bool				EarlyReject ( CSphQueryContext * , CSphMatch & ) const { return false; }
	virtual const CSphSourceStats &	GetStats () const { return g_tTmpDummyStat; }
	virtual void			GetStatus ( CSphIndexStatus* pRes ) const { assert (pRes); if ( pRes ) { pRes->m_iDiskUse = 0; pRes->m_iRamUse = 0;}}
	virtual bool				MultiQuery ( const CSphQuery * , CSphQueryResult * , int , ISphMatchSorter ** , const CSphMultiQueryArgs & ) const { return false; }
	virtual bool				MultiQueryEx ( int , const CSphQuery * , CSphQueryResult ** , ISphMatchSorter ** , const CSphMultiQueryArgs & ) const { return false; }
	virtual bool				GetKeywords ( CSphVector <CSphKeywordInfo> & , const char * , bool , CSphString * ) const;
	virtual bool				FillKeywords ( CSphVector <CSphKeywordInfo> & ) const { return true; }
	virtual int					UpdateAttributes ( const CSphAttrUpdate & , int , CSphString & , CSphString & ) { return -1; }
	virtual bool				SaveAttributes ( CSphString & ) const { return false; }
	virtual DWORD				GetAttributeStatus () const { return 0; }
	virtual bool				CreateModifiedFiles ( bool , const CSphString & , ESphAttr , int , CSphString & ) { return true; }
	virtual bool				AddRemoveAttribute ( bool, const CSphString &, ESphAttr, int, CSphString & ) { return true; }
	virtual void				DebugDumpHeader ( FILE *, const char *, bool ) {}
	virtual void				DebugDumpDocids ( FILE * ) {}
	virtual void				DebugDumpHitlist ( FILE * , const char * , bool ) {}
	virtual int					DebugCheck ( FILE * ) { return 0; } // NOLINT
	virtual void				DebugDumpDict ( FILE * ) {}
	virtual	void				SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn ) {}
};
1438 
1439 
/// query filter for the tokenizer pseudo-index: records each keyword with
/// zeroed docs/hits counters (there is no real index to fetch stats from)
struct CSphTemplateQueryFilter : public ISphQueryFilter
{
	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords )
	{
		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
		if ( !iWord )
			return;	// zero word id: the dict dropped this token (eg. a stopword)

		CSphKeywordInfo & tInfo = dKeywords.Add();
		tInfo.m_sTokenized = (const char *)sTokenized;
		tInfo.m_sNormalized = (const char*)sWord;
		tInfo.m_iDocs = 0;
		tInfo.m_iHits = 0;
		tInfo.m_iQpos = iQpos;

		// replace the internal exact-form marker with the user-facing '='
		// (in-place write into the just-copied string buffer)
		if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
			*(char *)tInfo.m_sNormalized.cstr() = '=';
	}
};
1459 
1460 
GetKeywords(CSphVector<CSphKeywordInfo> & dKeywords,const char * szQuery,bool,CSphString *) const1461 bool CSphTokenizerIndex::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool, CSphString * ) const
1462 {
1463 	// short-cut if no query or keywords to fill
1464 	if ( !szQuery || !szQuery[0] )
1465 		return true;
1466 
1467 	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( SPH_CLONE_INDEX ) ); // avoid race
1468 	pTokenizer->EnableTokenizedMultiformTracking ();
1469 
1470 	// need to support '*' and '=' but not the other specials
1471 	// so m_pQueryTokenizer does not work for us, gotta clone and setup one manually
1472 	if ( IsStarDict() )
1473 		pTokenizer->AddPlainChar ( '*' );
1474 	if ( m_tSettings.m_bIndexExactWords )
1475 		pTokenizer->AddPlainChar ( '=' );
1476 
1477 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
1478 	CSphDict * pDictBase = m_pDict;
1479 	if ( pDictBase->HasState() )
1480 		tDictCloned = pDictBase = pDictBase->Clone();
1481 
1482 	CSphDict * pDict = pDictBase;
1483 	if ( IsStarDict() )
1484 		pDict = new CSphDictStar ( pDictBase );
1485 
1486 	if ( m_tSettings.m_bIndexExactWords )
1487 		pDict = new CSphDictExact ( pDict );
1488 
1489 	dKeywords.Resize ( 0 );
1490 
1491 	pTokenizer->SetBuffer ( (const BYTE*)szQuery, strlen(szQuery) );
1492 
1493 	CSphTemplateQueryFilter tAotFilter;
1494 	tAotFilter.m_pTokenizer = pTokenizer.Ptr();
1495 	tAotFilter.m_pDict = pDict;
1496 	tAotFilter.m_pSettings = &m_tSettings;
1497 
1498 	tAotFilter.GetKeywords ( dKeywords );
1499 
1500 	return true;
1501 }
1502 
1503 
sphCreateIndexTemplate()1504 CSphIndex * sphCreateIndexTemplate ( )
1505 {
1506 	return new CSphTokenizerIndex();
1507 }
1508 
1509 
/// this is my actual VLN-compressed phrase index implementation
/// (the standard on-disk index: build, search, merge, attribute updates)
class CSphIndex_VLN : public CSphIndex
{
	friend class DiskIndexQwordSetup_c;
	friend class CSphMerger;
	friend class AttrIndexBuilder_t<SphDocID_t>;
	friend struct SphFinalMatchCalc_t;

public:
	explicit					CSphIndex_VLN ( const char* sIndexName, const char * sFilename );
								~CSphIndex_VLN ();

	// indexing
	virtual int					Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer );
	virtual	void				SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn pfnProgress ) { m_tProgress.m_fnProgress = pfnProgress; }

	// header load/save
	virtual bool				LoadHeader ( const char * sHeaderName, bool bStripPath, CSphEmbeddedFiles & tEmbeddedFiles, CSphString & sWarning );
	virtual bool				WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const;

	// debugging and integrity checking
	virtual void				DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig );
	virtual void				DebugDumpDocids ( FILE * fp );
	virtual void				DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID );
	virtual void				DebugDumpDict ( FILE * fp );
	virtual void				SetDebugCheck ();
	virtual int					DebugCheck ( FILE * fp );
	template <class Qword> void	DumpHitlist ( FILE * fp, const char * sKeyword, bool bID );

	// lifecycle: prealloc, preread, lock, rename
	virtual bool				Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning );
	virtual bool				Mlock ();
	virtual void				Dealloc ();
	virtual void				SetEnableOndiskAttributes ( bool bPool );

	virtual bool				Preread ();
	template<typename T> bool	PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer, const char * sExt, int64_t iExpected=0, int64_t iOffset=0 );

	virtual void				SetBase ( const char * sNewBase );
	virtual bool				Rename ( const char * sNewBase );

	virtual bool				Lock ();
	virtual void				Unlock ();
	virtual void				PostSetup() {}

	// searching
	virtual bool				MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const;
	virtual bool				MultiQueryEx ( int iQueries, const CSphQuery * pQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const;
	virtual bool				GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString * pError ) const;
	template <class Qword> bool	DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, bool bFillOnly, CSphString * pError ) const;
	virtual bool 				FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const;

	// merging
	virtual bool				Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists );

	template <class QWORDDST, class QWORDSRC>
	static bool					MergeWords ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex, const ISphFilter * pFilter, const CSphVector<SphDocID_t> & dKillList, SphDocID_t uMinID, CSphHitBuilder * pHitBuilder, CSphString & sError, CSphSourceStats & tStat, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop );
	static bool					DoMerge ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex, bool bMergeKillLists, ISphFilter * pFilter, const CSphVector<SphDocID_t> & dKillList, CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop );

	// attribute updates
	virtual int					UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError, CSphString & sWarning );
	virtual bool				SaveAttributes ( CSphString & sError ) const;
	virtual DWORD				GetAttributeStatus () const;

	virtual bool				CreateModifiedFiles ( bool bAddAttr, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError );
	virtual bool				AddRemoveAttribute ( bool bAdd, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError );

	bool						EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const;

	virtual void				SetKeepAttrs ( const CSphString & sKeepAttrs ) { m_sKeepAttrs = sKeepAttrs; }

	// kill-list access
	virtual SphDocID_t *		GetKillList () const;
	virtual int					GetKillListSize () const { return m_uKillListSize; }
	virtual bool				HasDocid ( SphDocID_t uDocid ) const;

	virtual const CSphSourceStats &		GetStats () const { return m_tStats; }
	virtual int64_t *					GetFieldLens() const { return m_tSettings.m_bIndexFieldLens ? m_dFieldLens.Begin() : NULL; }
	virtual void				GetStatus ( CSphIndexStatus* ) const;
	virtual bool 				BuildDocList ( SphAttr_t ** ppDocList, int64_t * pCount, CSphString * pError ) const;
	virtual bool				ReplaceKillList ( const SphDocID_t * pKillist, int iCount );

private:

	static const int			MIN_WRITE_BUFFER		= 262144;	///< min write buffer size
	static const int			DEFAULT_WRITE_BUFFER	= 1048576;	///< default write buffer size

private:
	// common stuff
	int							m_iLockFD;			///< .spl lock file descriptor
	CSphSourceStats				m_tStats;			///< my stats
	int							m_iTotalDups;		///< duplicate documents count
	CSphFixedVector<CSphRowitem>	m_dMinRow;		///< min attribute row
	SphDocID_t						m_uMinDocid;	///< min document id
	CSphFixedVector<int64_t>		m_dFieldLens;	///< total per-field lengths summed over entire indexed data, in tokens

private:

	CSphIndexProgress			m_tProgress;		///< indexing progress reporting

	bool						LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords );

private:
	// searching-only, per-index
	static const int			DOCINFO_HASH_BITS	= 18;	// FIXME! make this configurable

	int64_t						m_iDocinfo;				///< my docinfo cache size
	CSphSharedBuffer<DWORD>		m_pDocinfoHash;			///< hashed ids, to accelerate lookups
	int64_t						m_iDocinfoIndex;		///< docinfo "index" entries count (each entry is 2x docinfo rows, for min/max)
	DWORD *						m_pDocinfoIndex;		///< docinfo "index", to accelerate filtering during full-scan (2x rows for each block, and 2x rows for the whole index, 1+m_uDocinfoIndex entries)

	// attribute storage: either shared-memory copies or mmapped files,
	// with the m_t* traits pointing at whichever variant is active
	CSphSharedBuffer<DWORD>		m_dAttrShared;			///< my docinfo cache
	CSphSharedBuffer<DWORD>		m_dMvaShared;			///< my multi-valued attrs cache
	CSphSharedBuffer<BYTE>		m_dStringShared;		///< my in-RAM strings cache

	CSphMappedBuffer<DWORD>		m_dAttrMapped;
	CSphMappedBuffer<DWORD>		m_dMvaMapped;
	CSphMappedBuffer<BYTE>		m_dStringMapped;

	CSphBufferTrait<DWORD>		m_tAttr;
	CSphBufferTrait<DWORD>		m_tMva;
	CSphBufferTrait<BYTE>		m_tString;

	bool						m_bOndiskAllAttr;		///< keep all attributes on disk (mmapped)
	bool						m_bOndiskPoolAttr;		///< keep only MVA/string pools on disk
	bool						m_bArenaProhibit;

	CWordlist					m_tWordlist;			///< my wordlist

	CSphString					m_sKeepAttrs;			///< retain attributes of that index reindexing

	CSphSharedBuffer<SphDocID_t>	m_pKillList;		///< killlist
	DWORD						m_uKillListSize;		///< killlist size (in elements)

	CSphSharedBuffer<BYTE>		m_pSkiplists;			///< (compressed) skiplists data

	int64_t						m_iMinMaxIndex;			///< stored min/max cache offset (counted in DWORDs)

	CSphAutofile				m_tDoclistFile;			///< doclist file
	CSphAutofile				m_tHitlistFile;			///< hitlist file

#define SPH_SHARED_VARS_COUNT 2

	DWORD *						m_pPreread;
	DWORD *						m_pAttrsStatus;
	CSphSharedBuffer<DWORD>		m_dShared;				///< are we ready to search

	bool						m_bPreallocated;		///< are we ready to preread
	DWORD						m_uVersion;				///< data files version
	bool						m_bUse64;				///< whether the header is id64
	bool						m_bHaveSkips;			///< whether we have skiplists

	int							m_iIndexTag;			///< my ids for MVA updates pool
	static volatile int			m_iIndexTagSeq;			///< static ids sequence

	bool						m_bIsEmpty;				///< do we have actually indexed documents (m_iTotalDocuments is just fetched documents, not indexed!)
	bool						m_bDebugCheck;

private:
	CSphString					GetIndexFileName ( const char * sExt ) const;

	bool						ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict, const CSphMultiQueryArgs & tArgs, CSphQueryNodeCache * pNodeCache, const SphWordStatChecker_t & tStatDiff ) const;
	bool						MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const;
	void						MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters, ISphRanker * pRanker, int iTag, int iIndexWeight ) const;

	const DWORD *				FindDocinfo ( SphDocID_t uDocID ) const;
	void						CopyDocinfo ( const CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const;

	bool						BuildMVA ( const CSphVector<CSphSource*> & dSources, CSphFixedVector<CSphWordHit> & dHits, int iArenaSize, int iFieldFD, int nFieldMVAs, int iFieldMVAInPool, CSphIndex_VLN * pPrevIndex );

	bool						IsStarDict() const;
	CSphDict *					SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const;
	CSphDict *					SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const;

	bool						RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize, SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset );
	bool						PrecomputeMinMax();

private:
	bool						LoadPersistentMVA ( CSphString & sError );

	bool						JuggleFile ( const char* szExt, CSphString & sError, bool bNeedOrigin=true ) const;
	XQNode_t *					ExpandPrefix ( XQNode_t * pNode, CSphQueryResultMeta * pResult, CSphScopedPayload * pPayloads ) const;

	const CSphRowitem *			CopyRow ( const CSphRowitem * pDocinfo, DWORD * pTmpDocinfo, const CSphColumnInfo * pNewAttr, int iOldStride ) const;

	bool						BuildDone ( const BuildHeader_t & tBuildHeader, CSphString & sError ) const;
};
1689 
// index tag sequence, shared across all CSphIndex_VLN instances
volatile int CSphIndex_VLN::m_iIndexTagSeq = 0;
1691 
1692 /////////////////////////////////////////////////////////////////////////////
1693 // UTILITY FUNCTIONS
1694 /////////////////////////////////////////////////////////////////////////////
1695 
/// indexer warning: print a printf-formatted "WARNING: ..." line to stdout
void sphWarn ( const char * sTemplate, ... )
{
	fprintf ( stdout, "WARNING: " );

	va_list ap;
	va_start ( ap, sTemplate );
	vfprintf ( stdout, sTemplate, ap );
	va_end ( ap );

	fprintf ( stdout, "\n" );
}
1706 
1707 //////////////////////////////////////////////////////////////////////////
1708 
static ThrottleState_t g_tThrottle;	// global (default) I/O throttle settings

/// set the global I/O throttling limits; 0 disables the respective limit
void sphSetThrottling ( int iMaxIOps, int iMaxIOSize )
{
	g_tThrottle.m_iMaxIOps = iMaxIOps;
	g_tThrottle.m_iMaxIOSize = iMaxIOSize;
}
1716 
1717 
sphThrottleSleep(ThrottleState_t * pState)1718 static inline void sphThrottleSleep ( ThrottleState_t * pState )
1719 {
1720 	assert ( pState );
1721 	if ( pState->m_iMaxIOps>0 )
1722 	{
1723 		int64_t tmTimer = sphMicroTimer();
1724 		int64_t tmSleep = Max ( pState->m_tmLastIOTime + 1000000/pState->m_iMaxIOps - tmTimer, 0 );
1725 		sphSleepMsec ( (int)(tmSleep/1000) );
1726 		pState->m_tmLastIOTime = tmTimer + tmSleep;
1727 	}
1728 }
1729 
1730 
/// write iCount bytes from pBuf to iFD, honoring the throttle settings
/// big writes are sliced into chunks (1 GB by default, or max_iosize when
/// it is sane), with a throttle sleep before every chunk; write time, ops
/// and bytes are accounted into the per-thread IO stats when available
/// returns false and fills sError on a write error or a short write
bool sphWriteThrottled ( int iFD, const void * pBuf, int64_t iCount, const char * sName, CSphString & sError, ThrottleState_t * pThrottle )
{
	assert ( pThrottle );
	if ( iCount<=0 )
		return true;

	// by default, slice ios by at most 1 GB
	int iChunkSize = ( 1UL<<30 );

	// when there's a sane max_iosize (4K to 1GB), use it
	if ( pThrottle->m_iMaxIOSize>=4096 )
		iChunkSize = Min ( iChunkSize, pThrottle->m_iMaxIOSize );

	CSphIOStats * pIOStats = GetIOStats();

	// while there's data, write it chunk by chunk
	const BYTE * p = (const BYTE*) pBuf;
	while ( iCount>0 )
	{
		// wait for a timely occasion
		sphThrottleSleep ( pThrottle );

		// write (and maybe time)
		int64_t tmTimer = 0;
		if ( pIOStats )
			tmTimer = sphMicroTimer();

		// clamp the chunk to whatever is left to write
		int iToWrite = iChunkSize;
		if ( iCount<iChunkSize )
			iToWrite = (int)iCount;
		int iWritten = ::write ( iFD, p, iToWrite );

		if ( pIOStats )
		{
			pIOStats->m_iWriteTime += sphMicroTimer() - tmTimer;
			pIOStats->m_iWriteOps++;
			pIOStats->m_iWriteBytes += iToWrite; // NOTE(review): accounts requested bytes even on a short write
		}

		// success? rinse, repeat
		if ( iWritten==iToWrite )
		{
			iCount -= iToWrite;
			p += iToWrite;
			continue;
		}

		// failure? report, bailout
		if ( iWritten<0 )
			sError.SetSprintf ( "%s: write error: %s", sName, strerror(errno) );
		else
			sError.SetSprintf ( "%s: write error: %d of %d bytes written", sName, iWritten, iToWrite );
		return false;
	}
	return true;
}
1787 
1788 
sphReadThrottled(int iFD,void * pBuf,size_t iCount,ThrottleState_t * pThrottle)1789 static size_t sphReadThrottled ( int iFD, void * pBuf, size_t iCount, ThrottleState_t * pThrottle )
1790 {
1791 	assert ( pThrottle );
1792 	if ( pThrottle->m_iMaxIOSize && int(iCount) > pThrottle->m_iMaxIOSize )
1793 	{
1794 		size_t nChunks = iCount / pThrottle->m_iMaxIOSize;
1795 		size_t nBytesLeft = iCount % pThrottle->m_iMaxIOSize;
1796 
1797 		size_t nBytesRead = 0;
1798 		size_t iRead = 0;
1799 
1800 		for ( size_t i=0; i<nChunks; i++ )
1801 		{
1802 			iRead = sphReadThrottled ( iFD, (char *)pBuf + i*pThrottle->m_iMaxIOSize, pThrottle->m_iMaxIOSize, pThrottle );
1803 			nBytesRead += iRead;
1804 			if ( iRead!=(size_t)pThrottle->m_iMaxIOSize )
1805 				return nBytesRead;
1806 		}
1807 
1808 		if ( nBytesLeft > 0 )
1809 		{
1810 			iRead = sphReadThrottled ( iFD, (char *)pBuf + nChunks*pThrottle->m_iMaxIOSize, nBytesLeft, pThrottle );
1811 			nBytesRead += iRead;
1812 			if ( iRead!=nBytesLeft )
1813 				return nBytesRead;
1814 		}
1815 
1816 		return nBytesRead;
1817 	}
1818 
1819 	sphThrottleSleep ( pThrottle );
1820 	return (size_t)sphRead ( iFD, pBuf, iCount ); // FIXME? we sure this is under 2gb?
1821 }
1822 
/// close a descriptor if it is valid, and reset it to -1 either way
void SafeClose ( int & iFD )
{
	int iOld = iFD;
	iFD = -1;
	if ( iOld>=0 )
		::close ( iOld );
}
1829 
1830 //////////////////////////////////////////////////////////////////////////
1831 
1832 #if !USE_WINDOWS
/// in-place ASCII lowercase fold
/// returns the string itself, matching the conventional strlwr() contract
/// (the previous version returned a pointer to the terminating NUL instead)
char * strlwr ( char * s )
{
	// cast through unsigned char: passing a negative plain char
	// to tolower() is undefined behavior
	for ( char * p = s; *p; p++ )
		*p = (char) tolower ( (unsigned char)*p );
	return s;
}
1842 #endif
1843 
1844 
/// expand every occurrence of sMacro in sTemplate with uValue (printed
/// via DOCID_FMT); returns a newly allocated string, caller must delete[]
static char * sphStrMacro ( const char * sTemplate, const char * sMacro, SphDocID_t uValue )
{
	// expand macro
	char sExp[32];
	snprintf ( sExp, sizeof(sExp), DOCID_FMT, uValue );

	// calc lengths
	int iExp = strlen ( sExp );
	int iMacro = strlen ( sMacro );
	int iDelta = iExp-iMacro;

	// calc result length: count occurrences, each one shifts size by iDelta
	int iRes = strlen ( sTemplate );
	const char * sCur = sTemplate;
	while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
	{
		iRes += iDelta;
		sCur++;
	}

	// build result
	char * sRes = new char [ iRes+1 ];
	char * sOut = sRes;
	const char * sLast = sTemplate;
	sCur = sTemplate;

	// copy literal spans and splice in the expanded value at each match
	while ( ( sCur = strstr ( sCur, sMacro ) )!=NULL )
	{
		strncpy ( sOut, sLast, sCur-sLast ); sOut += sCur-sLast;
		strcpy ( sOut, sExp ); sOut += iExp; // NOLINT
		sCur += iMacro;
		sLast = sCur;
	}

	// copy the trailing literal span, if any (also writes the final NUL)
	if ( *sLast )
		strcpy ( sOut, sLast ); // NOLINT

	assert ( (int)strlen(sRes)==iRes );
	return sRes;
}
1885 
1886 
/// string to float conversion; NULL-safe (NULL yields 0.0f)
static float sphToFloat ( const char * s )
{
	return s ? (float)strtod ( s, NULL ) : 0.0f;
}
1892 
1893 
sphToDword(const char * s)1894 static DWORD sphToDword ( const char * s )
1895 {
1896 	if ( !s ) return 0;
1897 	return strtoul ( s, NULL, 10 );
1898 }
1899 
1900 
/// string to unsigned 64-bit (base 10); NULL-safe (NULL yields 0)
static uint64_t sphToUint64 ( const char * s )
{
	return s ? strtoull ( s, NULL, 10 ) : 0;
}
1906 
1907 
/// string to signed 64-bit (base 10); NULL-safe (NULL yields 0)
static int64_t sphToInt64 ( const char * s )
{
	return s ? strtoll ( s, NULL, 10 ) : 0;
}
1913 
1914 
1915 #if USE_64BIT
1916 #define sphToDocid sphToUint64
1917 #else
1918 #define sphToDocid sphToDword
1919 #endif
1920 
1921 
1922 #if USE_WINDOWS
1923 
/// acquire an exclusive lock on 1 byte at offset 0 of the file (Windows)
/// when bWait is set, blocks until the lock can be granted; returns false
/// on failure, or when the CRT descriptor has no valid OS handle
bool sphLockEx ( int iFile, bool bWait )
{
	// translate the CRT descriptor into a native handle
	HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
	if ( hHandle!=INVALID_HANDLE_VALUE )
	{
		// zeroed OVERLAPPED means "lock starting at offset 0"
		OVERLAPPED tOverlapped;
		memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
		return !!LockFileEx ( hHandle, LOCKFILE_EXCLUSIVE_LOCK | ( bWait ? 0 : LOCKFILE_FAIL_IMMEDIATELY ), 0, 1, 0, &tOverlapped );
	}

	return false;
}
1936 
/// release the 1-byte lock taken by sphLockEx() (Windows)
void sphLockUn ( int iFile )
{
	HANDLE hHandle = (HANDLE) _get_osfhandle ( iFile );
	if ( hHandle!=INVALID_HANDLE_VALUE )
	{
		// must match the lock region used in sphLockEx: 1 byte at offset 0
		OVERLAPPED tOverlapped;
		memset ( &tOverlapped, 0, sizeof ( tOverlapped ) );
		UnlockFileEx ( hHandle, 0, 1, 0, &tOverlapped );
	}
}
1947 
1948 #else
1949 
/// acquire an exclusive whole-file write lock via fcntl()
/// when bWait is set, block until the lock can be granted
bool sphLockEx ( int iFile, bool bWait )
{
	struct flock tLock;
	tLock.l_type = F_WRLCK;		// exclusive (write) lock
	tLock.l_whence = SEEK_SET;
	tLock.l_start = 0;
	tLock.l_len = 0;			// zero length locks the entire file

	// FIXME! check for HAVE_F_SETLKW?
	return fcntl ( iFile, bWait ? F_SETLKW : F_SETLK, &tLock )!=-1;
}
1961 
1962 
/// release the fcntl() whole-file lock held on the given descriptor
void sphLockUn ( int iFile )
{
	struct flock tLock;
	memset ( &tLock, 0, sizeof(tLock) ); // l_start=0, l_len=0 covers the whole file
	tLock.l_type = F_UNLCK;
	tLock.l_whence = SEEK_SET;

	fcntl ( iFile, F_SETLK, &tLock );
}
1973 #endif
1974 
1975 
/// portable millisecond sleep; negative durations are a no-op
void sphSleepMsec ( int iMsec )
{
	if ( iMsec<0 )
		return;

#if USE_WINDOWS
	Sleep ( iMsec );

#else
	// emulate the sleep with a select() timeout on empty fd sets
	struct timeval tvTimeout;
	tvTimeout.tv_sec = iMsec / 1000;				// whole seconds
	tvTimeout.tv_usec = ( iMsec % 1000 ) * 1000;	// leftover msec, as usec

	select ( 0, NULL, NULL, NULL, &tvTimeout ); // FIXME? could handle EINTR
#endif
}
1992 
1993 
sphIsReadable(const char * sPath,CSphString * pError)1994 bool sphIsReadable ( const char * sPath, CSphString * pError )
1995 {
1996 	int iFD = ::open ( sPath, O_RDONLY );
1997 
1998 	if ( iFD<0 )
1999 	{
2000 		if ( pError )
2001 			pError->SetSprintf ( "%s unreadable: %s", sPath, strerror(errno) );
2002 		return false;
2003 	}
2004 
2005 	close ( iFD );
2006 	return true;
2007 }
2008 
2009 
sphOpenFile(const char * sFile,CSphString & sError)2010 int sphOpenFile ( const char * sFile, CSphString & sError )
2011 {
2012 	int iFD = ::open ( sFile, SPH_O_READ, 0644 );
2013 	if ( iFD<0 )
2014 	{
2015 		sError.SetSprintf ( "failed to open file '%s': '%s'", sFile, strerror(errno) );
2016 		return -1;
2017 	}
2018 
2019 	return iFD;
2020 }
2021 
2022 
sphGetFileSize(int iFD,CSphString & sError)2023 int64_t sphGetFileSize ( int iFD, CSphString & sError )
2024 {
2025 	if ( iFD<0 )
2026 	{
2027 		sError.SetSprintf ( "invalid descriptor to fstat '%d'", iFD );
2028 		return -1;
2029 	}
2030 
2031 	struct_stat st;
2032 	if ( fstat ( iFD, &st )<0 )
2033 	{
2034 		sError.SetSprintf ( "failed to fstat file '%d': '%s'", iFD, strerror(errno) );
2035 		return -1;
2036 	}
2037 
2038 	return st.st_size;
2039 }
2040 
2041 
2042 
sphSetReadBuffers(int iReadBuffer,int iReadUnhinted)2043 void sphSetReadBuffers ( int iReadBuffer, int iReadUnhinted )
2044 {
2045 	if ( iReadBuffer<=0 )
2046 		iReadBuffer = DEFAULT_READ_BUFFER;
2047 	g_iReadBuffer = Max ( iReadBuffer, MIN_READ_BUFFER );
2048 
2049 	if ( iReadUnhinted<=0 )
2050 		iReadUnhinted = DEFAULT_READ_UNHINTED;
2051 	g_iReadUnhinted = Max ( iReadUnhinted, MIN_READ_UNHINTED );
2052 }
2053 
2054 //////////////////////////////////////////////////////////////////////////
2055 // DOCINFO
2056 //////////////////////////////////////////////////////////////////////////
2057 
2058 static DWORD *				g_pMvaArena = NULL;		///< initialized by sphArenaInit()
2059 
2060 // OPTIMIZE! try to inline or otherwise simplify maybe
// OPTIMIZE! try to inline or otherwise simplify maybe
/// resolve an MVA attribute into a pointer to its value list
/// returns NULL when the attribute is unset (zero index); values flagged
/// with MVA_ARENA_FLAG live in the global MVA arena (unless arena use is
/// prohibited); everything else is an offset into the supplied pPool
const DWORD * CSphMatch::GetAttrMVA ( const CSphAttrLocator & tLoc, const DWORD * pPool, bool bArenaProhibit ) const
{
	DWORD uIndex = MVA_DOWNSIZE ( GetAttr ( tLoc ) );
	if ( !uIndex )
		return NULL;

	// arena-stored value: strip the flag bit, index into the shared arena
	if ( !bArenaProhibit && ( uIndex & MVA_ARENA_FLAG ) )
		return g_pMvaArena + ( uIndex & MVA_OFFSET_MASK );

	assert ( pPool );
	return pPool + uIndex;
}
2073 
2074 /////////////////////////////////////////////////////////////////////////////
2075 // TOKENIZING EXCEPTIONS
2076 /////////////////////////////////////////////////////////////////////////////
2077 
2078 /// exceptions trie, stored in a tidy simple blob
2079 /// we serialize each trie node as follows:
2080 ///
2081 /// int result_offset, 0 if no output mapping
2082 /// BYTE num_bytes, 0 if no further valid bytes can be accepted
2083 /// BYTE values[num_bytes], known accepted byte values
2084 /// BYTE offsets[num_bytes], and the respective next node offsets
2085 ///
2086 /// output mappings themselves are serialized just after the nodes,
2087 /// as plain old ASCIIZ strings
class ExceptionsTrie_c
{
	friend class		ExceptionsTrieGen_c;

protected:
	int					m_dFirst[256];	///< table to speedup 1st byte lookup
	CSphVector<BYTE>	m_dData;		///< data blob
	int					m_iCount;		///< number of exceptions
	int					m_iMappings;	///< offset where the nodes end, and output mappings start

public:
	/// return the output mapping (ASCIIZ) of the node at offset i,
	/// or NULL if that node has no output mapping
	const BYTE * GetMapping ( int i ) const
	{
		assert ( i>=0 && i<m_iMappings );
		// first 4 node bytes are the (already fixed-up) mapping offset; 0 means none
		int p = *(int*)&m_dData[i];
		if ( !p )
			return NULL;
		assert ( p>=m_iMappings && p<m_dData.GetLength() );
		return &m_dData[p];
	}

	/// root-level transition: offset of the node accepting first byte v, or -1
	int GetFirst ( BYTE v ) const
	{
		return m_dFirst[v];
	}

	/// transition from node at offset i on byte v; returns the next node
	/// offset, or -1 when v is not accepted
	int GetNext ( int i, BYTE v ) const
	{
		assert ( i>=0 && i<m_iMappings );
		if ( i==0 )
			return m_dFirst[v];
		// node layout: int offset, BYTE n, BYTE values[n], int offsets[n]
		const BYTE * p = &m_dData[i];
		int n = p[4];
		p += 5;
		for ( i=0; i<n; i++ )
			if ( p[i]==v )
				return *(int*)&p [ n + 4*i ]; // FIXME? unaligned
		return -1;
	}

public:
	/// dump all exceptions as "from => to" text lines into the writer,
	/// preceded by the total count
	void Export ( CSphWriter & w ) const
	{
		CSphVector<BYTE> dPrefix;
		int iCount = 0;

		w.PutDword ( m_iCount );
		Export ( w, dPrefix, 0, &iCount );
		assert ( iCount==m_iCount );
	}

protected:
	/// recursive worker for Export(); dPrefix accumulates the current
	/// "from" byte sequence while walking the trie depth-first
	void Export ( CSphWriter & w, CSphVector<BYTE> & dPrefix, int iNode, int * pCount ) const
	{
		assert ( iNode>=0 && iNode<m_iMappings );
		const BYTE * p = &m_dData[iNode];

		// emit a line if this node carries an output mapping
		int iTo = *(int*)p;
		if ( iTo>0 )
		{
			CSphString s;
			const char * sTo = (char*)&m_dData[iTo];
			s.SetBinary ( (char*)dPrefix.Begin(), dPrefix.GetLength() );
			s.SetSprintf ( "%s => %s\n", s.cstr(), sTo );
			w.PutString ( s.cstr() );
			(*pCount)++;
		}

		// recurse into every accepted byte value
		int n = p[4];
		if ( n==0 )
			return;

		p += 5;
		for ( int i=0; i<n; i++ )
		{
			dPrefix.Add ( p[i] );
			Export ( w, dPrefix, *(int*)&p[n+4*i], pCount );
			dPrefix.Pop();
		}
	}
};
2169 
2170 
2171 /// intermediate exceptions trie node
2172 /// only used by ExceptionsTrieGen_c, while building a blob
2173 class ExceptionsTrieNode_c
2174 {
2175 	friend class						ExceptionsTrieGen_c;
2176 
2177 protected:
2178 	struct Entry_t
2179 	{
2180 		BYTE					m_uValue;
2181 		ExceptionsTrieNode_c *	m_pKid;
2182 	};
2183 
2184 	CSphString					m_sTo;		///< output mapping for current prefix, if any
2185 	CSphVector<Entry_t>			m_dKids;	///< known and accepted incoming byte values
2186 
2187 public:
~ExceptionsTrieNode_c()2188 	~ExceptionsTrieNode_c()
2189 	{
2190 		ARRAY_FOREACH ( i, m_dKids )
2191 			SafeDelete ( m_dKids[i].m_pKid );
2192 	}
2193 
2194 	/// returns false on a duplicate "from" part, or true on success
AddMapping(const BYTE * sFrom,const BYTE * sTo)2195 	bool AddMapping ( const BYTE * sFrom, const BYTE * sTo )
2196 	{
2197 		// no more bytes to consume? this is our output mapping then
2198 		if ( !*sFrom )
2199 		{
2200 			if ( !m_sTo.IsEmpty() )
2201 				return false;
2202 			m_sTo = (const char*)sTo;
2203 			return true;
2204 		}
2205 
2206 		int i;
2207 		for ( i=0; i<m_dKids.GetLength(); i++ )
2208 			if ( m_dKids[i].m_uValue==*sFrom )
2209 				break;
2210 		if ( i==m_dKids.GetLength() )
2211 		{
2212 			Entry_t & t = m_dKids.Add();
2213 			t.m_uValue = *sFrom;
2214 			t.m_pKid = new ExceptionsTrieNode_c();
2215 		}
2216 		return m_dKids[i].m_pKid->AddMapping ( sFrom+1, sTo );
2217 	}
2218 };
2219 
2220 
2221 /// exceptions trie builder
2222 /// plain old text mappings in, nice useful trie out
2223 class ExceptionsTrieGen_c
2224 {
2225 protected:
2226 	ExceptionsTrieNode_c *	m_pRoot;
2227 	int						m_iCount;
2228 
2229 public:
ExceptionsTrieGen_c()2230 	ExceptionsTrieGen_c()
2231 	{
2232 		m_pRoot = new ExceptionsTrieNode_c();
2233 		m_iCount = 0;
2234 	}
2235 
~ExceptionsTrieGen_c()2236 	~ExceptionsTrieGen_c()
2237 	{
2238 		SafeDelete ( m_pRoot );
2239 	}
2240 
2241 	/// trims left/right whitespace, folds inner whitespace
FoldSpace(char * s) const2242 	void FoldSpace ( char * s ) const
2243 	{
2244 		// skip leading spaces
2245 		char * d = s;
2246 		while ( *s && sphIsSpace(*s) )
2247 			s++;
2248 
2249 		// handle degenerate (empty string) case
2250 		if ( !*s )
2251 		{
2252 			*d = '\0';
2253 			return;
2254 		}
2255 
2256 		while ( *s )
2257 		{
2258 			// copy another token, add exactly 1 space after it, and skip whitespace
2259 			while ( *s && !sphIsSpace(*s) )
2260 				*d++ = *s++;
2261 			*d++ = ' ';
2262 			while ( sphIsSpace(*s) )
2263 				s++;
2264 		}
2265 
2266 		// replace that last space that we added
2267 		d[-1] = '\0';
2268 	}
2269 
ParseLine(char * sBuffer,CSphString & sError)2270 	bool ParseLine ( char * sBuffer, CSphString & sError )
2271 	{
2272 		#define LOC_ERR(_arg) { sError = _arg; return false; }
2273 		assert ( m_pRoot );
2274 
2275 		// extract map-from and map-to parts
2276 		char * sSplit = strstr ( sBuffer, "=>" );
2277 		if ( !sSplit )
2278 			LOC_ERR ( "mapping token (=>) not found" );
2279 
2280 		char * sFrom = sBuffer;
2281 		char * sTo = sSplit + 2; // skip "=>"
2282 		*sSplit = '\0';
2283 
2284 		// trim map-from, map-to
2285 		FoldSpace ( sFrom );
2286 		FoldSpace ( sTo );
2287 		if ( !*sFrom )
2288 			LOC_ERR ( "empty map-from part" );
2289 		if ( !*sTo )
2290 			LOC_ERR ( "empty map-to part" );
2291 		if ( (int)strlen(sFrom) > MAX_KEYWORD_BYTES )
2292 			LOC_ERR ( "map-from part too long" );
2293 		if ( (int)strlen(sTo)>MAX_KEYWORD_BYTES )
2294 			LOC_ERR ( "map-from part too long" );
2295 
2296 		// all parsed ok; add it!
2297 		if ( m_pRoot->AddMapping ( (BYTE*)sFrom, (BYTE*)sTo ) )
2298 			m_iCount++;
2299 		else
2300 			LOC_ERR ( "duplicate map-from part" );
2301 
2302 		return true;
2303 		#undef LOC_ERR
2304 	}
2305 
Build()2306 	ExceptionsTrie_c * Build()
2307 	{
2308 		if ( !m_pRoot || !m_pRoot->m_sTo.IsEmpty() || m_pRoot->m_dKids.GetLength()==0 )
2309 			return NULL;
2310 
2311 		ExceptionsTrie_c * pRes = new ExceptionsTrie_c();
2312 		pRes->m_iCount = m_iCount;
2313 
2314 		// save the nodes themselves
2315 		CSphVector<BYTE> dMappings;
2316 		SaveNode ( pRes, m_pRoot, dMappings );
2317 
2318 		// append and fixup output mappings
2319 		CSphVector<BYTE> & d = pRes->m_dData;
2320 		pRes->m_iMappings = d.GetLength();
2321 		memcpy ( d.AddN ( dMappings.GetLength() ), dMappings.Begin(), dMappings.GetLength() );
2322 
2323 		BYTE * p = d.Begin();
2324 		BYTE * pMax = p + pRes->m_iMappings;
2325 		while ( p<pMax )
2326 		{
2327 			// fixup offset in the current node, if needed
2328 			int * pOff = (int*)p; // FIXME? unaligned
2329 			if ( (*pOff)<0 )
2330 				*pOff = 0; // convert -1 to 0 for non-outputs
2331 			else
2332 				(*pOff) += pRes->m_iMappings; // fixup offsets for outputs
2333 
2334 			// proceed to the next node
2335 			int n = p[4];
2336 			p += 5 + 5*n;
2337 		}
2338 		assert ( p==pMax );
2339 
2340 		// build the speedup table for the very 1st byte
2341 		for ( int i=0; i<256; i++ )
2342 			pRes->m_dFirst[i] = -1;
2343 		int n = d[4];
2344 		for ( int i=0; i<n; i++ )
2345 			pRes->m_dFirst [ d[5+i] ] = *(int*)&pRes->m_dData [ 5+n+4*i ];
2346 
2347 		SafeDelete ( m_pRoot );
2348 		m_pRoot = new ExceptionsTrieNode_c();
2349 		m_iCount = 0;
2350 		return pRes;
2351 	}
2352 
2353 protected:
SaveInt(CSphVector<BYTE> & v,int p,int x)2354 	void SaveInt ( CSphVector<BYTE> & v, int p, int x )
2355 	{
2356 #if USE_LITTLE_ENDIAN
2357 		v[p] = x & 0xff;
2358 		v[p+1] = (x>>8) & 0xff;
2359 		v[p+2] = (x>>16) & 0xff;
2360 		v[p+3] = (x>>24) & 0xff;
2361 #else
2362 		v[p] = (x>>24) & 0xff;
2363 		v[p+1] = (x>>16) & 0xff;
2364 		v[p+2] = (x>>8) & 0xff;
2365 		v[p+3] = x & 0xff;
2366 #endif
2367 	}
2368 
SaveNode(ExceptionsTrie_c * pRes,ExceptionsTrieNode_c * pNode,CSphVector<BYTE> & dMappings)2369 	int SaveNode ( ExceptionsTrie_c * pRes, ExceptionsTrieNode_c * pNode, CSphVector<BYTE> & dMappings )
2370 	{
2371 		CSphVector<BYTE> & d = pRes->m_dData; // shortcut
2372 
2373 		// remember the start node offset
2374 		int iRes = d.GetLength();
2375 		int n = pNode->m_dKids.GetLength();
2376 		assert (!( pNode->m_sTo.IsEmpty() && n==0 ));
2377 
2378 		// save offset into dMappings, or temporary (!) save -1 if there is no output mapping
2379 		// note that we will fixup those -1's to 0's afterwards
2380 		int iOff = -1;
2381 		if ( !pNode->m_sTo.IsEmpty() )
2382 		{
2383 			iOff = dMappings.GetLength();
2384 			int iLen = pNode->m_sTo.Length();
2385 			memcpy ( dMappings.AddN ( iLen+1 ), pNode->m_sTo.cstr(), iLen+1 );
2386 		}
2387 		d.AddN(4);
2388 		SaveInt ( d, d.GetLength()-4, iOff );
2389 
2390 		// sort children nodes by value
2391 		pNode->m_dKids.Sort ( bind ( &ExceptionsTrieNode_c::Entry_t::m_uValue ) );
2392 
2393 		// save num_values, and values[]
2394 		d.Add ( (BYTE)n );
2395 		ARRAY_FOREACH ( i, pNode->m_dKids )
2396 			d.Add ( pNode->m_dKids[i].m_uValue );
2397 
2398 		// save offsets[], and the respective child nodes
2399 		int p = d.GetLength();
2400 		d.AddN ( 4*n );
2401 		for ( int i=0; i<n; i++, p+=4 )
2402 			SaveInt ( d, p, SaveNode ( pRes, pNode->m_dKids[i].m_pKid, dMappings ) );
2403 		assert ( p==iRes+5+5*n );
2404 
2405 		// done!
2406 		return iRes;
2407 	}
2408 };
2409 
2410 /////////////////////////////////////////////////////////////////////////////
2411 // TOKENIZERS
2412 /////////////////////////////////////////////////////////////////////////////
2413 
2414 inline int sphUTF8Decode ( const BYTE * & pBuf ); // forward ref for GCC
2415 inline int sphUTF8Encode ( BYTE * pBuf, int iCode ); // forward ref for GCC
2416 
2417 
/// common base for the concrete tokenizers: owns the input buffer
/// pointers, the folded-token accumulator, blended-token bookkeeping,
/// and the exceptions trie hook
class CSphTokenizerBase : public ISphTokenizer
{
public:
	CSphTokenizerBase();
	~CSphTokenizerBase();

	virtual bool			SetCaseFolding ( const char * sConfig, CSphString & sError );
	virtual bool			LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError );
	virtual void			WriteSynonyms ( CSphWriter & tWriter );
	virtual void			CloneBase ( const CSphTokenizerBase * pFrom, ESphTokenizerClone eMode );

	// buffer/token position accessors; all return raw pointers into m_pBuffer
	virtual const char *	GetTokenStart () const		{ return (const char *) m_pTokenStart; }
	virtual const char *	GetTokenEnd () const		{ return (const char *) m_pTokenEnd; }
	virtual const char *	GetBufferPtr () const		{ return (const char *) m_pCur; }
	virtual const char *	GetBufferEnd () const		{ return (const char *) m_pBufferMax; }
	virtual void			SetBufferPtr ( const char * sNewPtr );
	virtual uint64_t		GetSettingsFNV () const;

	virtual bool			SetBlendChars ( const char * sConfig, CSphString & sError );
	virtual bool			WasTokenMultiformDestination ( bool &, int & ) const { return false; }

public:
	// lightweight clones must impose a lockdown on some methods
	// (specifically those that change the lowercaser data table)

	virtual void AddPlainChar ( char c )
	{
		assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
		ISphTokenizer::AddPlainChar ( c );
	}

	virtual void AddSpecials ( const char * sSpecials )
	{
		assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
		ISphTokenizer::AddSpecials ( sSpecials );
	}

	virtual void Setup ( const CSphTokenizerSettings & tSettings )
	{
		assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
		ISphTokenizer::Setup ( tSettings );
	}

	virtual bool RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError )
	{
		assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
		return ISphTokenizer::RemapCharacters ( sConfig, uFlags, sSource, bCanRemap, sError );
	}

protected:
	bool	BlendAdjust ( const BYTE * pPosition );
	int		CodepointArbitrationI ( int iCodepoint );
	int		CodepointArbitrationQ ( int iCodepoint, bool bWasEscaped, BYTE uNextByte );

protected:
	const BYTE *		m_pBuffer;							///< my buffer
	const BYTE *		m_pBufferMax;						///< max buffer ptr, exclusive (ie. this ptr is invalid, but every ptr below is ok)
	const BYTE *		m_pCur;								///< current position
	const BYTE *		m_pTokenStart;						///< last token start point
	const BYTE *		m_pTokenEnd;						///< last token end point

	BYTE				m_sAccum [ 3*SPH_MAX_WORD_LEN+3 ];	///< folded token accumulator
	BYTE *				m_pAccum;							///< current accumulator position
	int					m_iAccum;							///< boundary token size

	BYTE				m_sAccumBlend [ 3*SPH_MAX_WORD_LEN+3 ];	///< blend-acc, an accumulator copy for additional blended variants
	int					m_iBlendNormalStart;					///< points to first normal char in the accumulators (might be NULL)
	int					m_iBlendNormalEnd;						///< points just past (!) last normal char in the accumulators (might be NULL)

	ExceptionsTrie_c *	m_pExc;								///< exceptions trie, if any

	bool				m_bHasBlend;						///< whether any blend chars are configured
	const BYTE *		m_pBlendStart;
	const BYTE *		m_pBlendEnd;

	ESphTokenizerClone	m_eMode;							///< clone mode (affects which setup methods are allowed)
};
2495 
2496 
2497 /// methods that get specialized with regards to charset type
2498 /// aka GetCodepoint() decoder and everything that depends on it
/// methods that get specialized with regards to charset type
/// aka GetCodepoint() decoder and everything that depends on it
class CSphTokenizerBase2 : public CSphTokenizerBase
{
protected:
	/// get codepoint
	/// skips over invalid UTF-8 sequences; returns -1 at end of buffer
	inline int GetCodepoint ()
	{
		while ( m_pCur<m_pBufferMax )
		{
			int iCode = sphUTF8Decode ( m_pCur );
			if ( iCode>=0 )
				return iCode; // successful decode
		}
		return -1; // eof
	}

	/// accum codepoint
	/// appends the codepoint to the accumulator, silently dropping it when
	/// the token is already SPH_MAX_WORD_LEN codepoints long or the encoded
	/// bytes would no longer fit into m_sAccum
	inline void AccumCodepoint ( int iCode )
	{
		assert ( iCode>0 );
		assert ( m_iAccum>=0 );

		// throw away everything which is over the token size
		bool bFit = ( m_iAccum<SPH_MAX_WORD_LEN );
		bFit &= ( m_pAccum-m_sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(m_sAccum));

		if ( bFit )
		{
			m_pAccum += sphUTF8Encode ( m_pAccum, iCode );
			assert ( m_pAccum>=m_sAccum && m_pAccum<m_sAccum+sizeof(m_sAccum) );
			m_iAccum++;
		}
	}

protected:
	BYTE *			GetBlendedVariant ();
	bool			CheckException ( const BYTE * pStart, const BYTE * pCur, bool bQueryMode );

	// shared tokenizing worker, specialized by query-mode and blended-chars flags
	template < bool IS_QUERY, bool IS_BLEND >
	BYTE *						DoGetToken();

	void						FlushAccum ();

public:
	virtual int		SkipBlended ();
};
2544 
2545 
2546 /// UTF-8 tokenizer
/// UTF-8 tokenizer
/// IS_QUERY selects query-mode codepoint arbitration at compile time
template < bool IS_QUERY >
class CSphTokenizer_UTF8 : public CSphTokenizerBase2
{
public:
								CSphTokenizer_UTF8 ();
	virtual void				SetBuffer ( const BYTE * sBuffer, int iLength );
	virtual BYTE *				GetToken ();
	virtual ISphTokenizer *		Clone ( ESphTokenizerClone eMode ) const;
	virtual int					GetCodepointLength ( int iCode ) const;
	virtual int					GetMaxCodepointLength () const { return m_tLC.GetMaxCodepointLength(); }
};
2558 
2559 
2560 /// UTF-8 tokenizer with n-grams
/// UTF-8 tokenizer with n-grams
/// splits runs of configured n-gram characters (eg. CJK) into n-grams
template < bool IS_QUERY >
class CSphTokenizer_UTF8Ngram : public CSphTokenizer_UTF8<IS_QUERY>
{
public:
						CSphTokenizer_UTF8Ngram () : m_iNgramLen ( 1 ) {}

public:
	virtual bool		SetNgramChars ( const char * sConfig, CSphString & sError );
	virtual void		SetNgramLen ( int iLen );
	virtual BYTE *		GetToken ();

protected:
	int					m_iNgramLen;		///< n-gram length, defaults to unigrams
	CSphString			m_sNgramCharsStr;	///< raw ngram_chars config string
};
2576 
2577 
/// a single destination (normalized) form of a multiform rule
struct CSphNormalForm
{
	CSphString				m_sForm;		///< resulting token text
	int						m_iLengthCP;	///< its length, in codepoints
};
2583 
/// a single multiform rule: a source token sequence and its normal form(s)
struct CSphMultiform
{
	int								m_iFileId;		///< source file id (presumably the wordforms file it came from — verify against loader)
	CSphTightVector<CSphNormalForm>	m_dNormalForm;	///< destination form(s)
	CSphTightVector<CSphString>		m_dTokens;		///< source token sequence to match
};
2590 
2591 
/// a group of multiform rules, with cached min/max rule lengths
struct CSphMultiforms
{
	int							m_iMinTokens;	///< shortest rule length in this group, tokens
	int							m_iMaxTokens;	///< longest rule length in this group, tokens
	CSphVector<CSphMultiform*>	m_pForms;		// OPTIMIZE? blobify?
};
2598 
2599 
2600 struct CSphMultiformContainer
2601 {
CSphMultiformContainerCSphMultiformContainer2602 							CSphMultiformContainer () : m_iMaxTokens ( 0 ) {}
2603 
2604 	int						m_iMaxTokens;
2605 	typedef CSphOrderedHash < CSphMultiforms *, CSphString, CSphStrHashFunc, 131072 > CSphMultiformHash;
2606 	CSphMultiformHash	m_Hash;
2607 };
2608 
2609 
2610 /// token filter for multiforms support
/// token filter for multiforms support
/// wraps another tokenizer; buffers lookahead tokens in m_dStoredTokens so
/// that multi-token source sequences can be matched and replaced; most
/// setup methods simply forward to the wrapped tokenizer
class CSphMultiformTokenizer : public CSphTokenFilter
{
public:
	CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );
	~CSphMultiformTokenizer ();

	// plain forwarders to the wrapped tokenizer
	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError )	{ return m_pTokenizer->SetCaseFolding ( sConfig, sError ); }
	virtual void					AddPlainChar ( char c )											{ m_pTokenizer->AddPlainChar ( c ); }
	virtual void					AddSpecials ( const char * sSpecials )							{ m_pTokenizer->AddSpecials ( sSpecials ); }
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError )	{ return m_pTokenizer->SetIgnoreChars ( sIgnored, sError ); }
	virtual bool					SetNgramChars ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetNgramChars ( sConfig, sError ); }
	virtual void					SetNgramLen ( int iLen )										{ m_pTokenizer->SetNgramLen ( iLen ); }
	virtual bool					LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) { return m_pTokenizer->LoadSynonyms ( sFilename, pFiles, sError ); }
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetBoundary ( sConfig, sError ); }
	virtual void					Setup ( const CSphTokenizerSettings & tSettings )				{ m_pTokenizer->Setup ( tSettings ); }
	virtual const CSphTokenizerSettings &	GetSettings () const									{ return m_pTokenizer->GetSettings (); }
	virtual const CSphSavedFile &	GetSynFileInfo () const											{ return m_pTokenizer->GetSynFileInfo (); }
	virtual bool					EnableSentenceIndexing ( CSphString & sError )					{ return m_pTokenizer->EnableSentenceIndexing ( sError ); }
	virtual bool					EnableZoneIndexing ( CSphString & sError )						{ return m_pTokenizer->EnableZoneIndexing ( sError ); }

public:
	virtual void					SetBuffer ( const BYTE * sBuffer, int iLength );
	virtual BYTE *					GetToken ();
	virtual void					EnableTokenizedMultiformTracking ()			{ m_bBuildMultiform = true; }
	// the accessors below report the stored (buffered) token state while
	// any lookahead tokens remain, and fall back to the wrapped tokenizer
	virtual int						GetLastTokenLen () const					{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_iTokenLen : m_pTokenizer->GetLastTokenLen(); }
	virtual bool					GetBoundary ()								{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBoundary : m_pTokenizer->GetBoundary(); }
	virtual bool					WasTokenSpecial ()							{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bSpecial : m_pTokenizer->WasTokenSpecial(); }
	virtual int						GetOvershortCount ()						{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_iOvershortCount : m_pTokenizer->GetOvershortCount(); }
	virtual BYTE *					GetTokenizedMultiform ()					{ return m_sTokenizedMultiform[0] ? m_sTokenizedMultiform : NULL; }
	virtual bool					TokenIsBlended () const						{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBlended : m_pTokenizer->TokenIsBlended(); }
	virtual bool					TokenIsBlendedPart () const					{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_bBlendedPart : m_pTokenizer->TokenIsBlendedPart(); }
	virtual int						SkipBlended ();

public:
	virtual ISphTokenizer *			Clone ( ESphTokenizerClone eMode ) const;
	virtual const char *			GetTokenStart () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_szTokenStart : m_pTokenizer->GetTokenStart(); }
	virtual const char *			GetTokenEnd () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_szTokenEnd : m_pTokenizer->GetTokenEnd(); }
	virtual const char *			GetBufferPtr () const		{ return m_iStart<m_dStoredTokens.GetLength() ? m_dStoredTokens[m_iStart].m_pBufferPtr : m_pTokenizer->GetBufferPtr(); }
	virtual void					SetBufferPtr ( const char * sNewPtr );
	virtual uint64_t				GetSettingsFNV () const;
	virtual bool					WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const;

private:
	const CSphMultiformContainer *	m_pMultiWordforms;	///< the shared rule container (not owned)
	int								m_iStart;			///< current position in m_dStoredTokens
	int								m_iOutputPending;	///< remaining destination tokens to emit
	const CSphMultiform *			m_pCurrentForm;		///< rule currently being emitted, if any
	const char *					m_szPendingBufferPtr;

	bool				m_bBuildMultiform;
	BYTE				m_sTokenizedMultiform [ 3*SPH_MAX_WORD_LEN+4 ];

	CSphVector<StoredToken_t>		m_dStoredTokens;	///< lookahead token buffer
};
2665 
2666 
/// token filter for bigram indexing
///
/// passes tokens through until an eligible pair is found
/// then buffers and returns that pair as a blended token
/// then returns the first token as a regular one
/// then pops the first one and cycles again
///
/// pair (aka bigram) eligibility depends on bigram_index value
/// "all" means that all token pairs gets indexed
/// "first_freq" means that 1st token must be from bigram_freq_words
/// "both_freq" means that both tokens must be from bigram_freq_words
class CSphBigramTokenizer : public CSphTokenFilter
{
protected:
	enum
	{
		BIGRAM_CLEAN,	///< clean slate, nothing accumulated
		BIGRAM_PAIR,	///< just returned a pair from m_sBuf, and m_iFirst/m_pSecond are correct
		BIGRAM_FIRST	///< just returned a first token from m_sBuf, so m_iFirst/m_pSecond are still good
	}		m_eState;
	BYTE	m_sBuf [ MAX_KEYWORD_BYTES ];	///< pair buffer
	BYTE *	m_pSecond;						///< second token pointer
	int		m_iFirst;						///< first token length, bytes

	ESphBigram			m_eMode;			///< bigram indexing mode
	int					m_iMaxLen;			///< max bigram_freq_words length
	int					m_dWordsHash[256];	///< offsets into m_dWords hashed by 1st byte; 0 means "no words start with that byte"
	CSphVector<BYTE>	m_dWords;			///< case-folded, sorted bigram_freq_words; stored as length-prefixed strings, each hash block terminated by a zero length byte

public:
	/// builds the freq-words lookup from dWords
	/// words are deduplicated, zero-length ones dropped, lengths capped at 255,
	/// and entries grouped into per-first-byte blocks addressed via m_dWordsHash
	CSphBigramTokenizer ( ISphTokenizer * pTok, ESphBigram eMode, CSphVector<CSphString> & dWords )
		: CSphTokenFilter ( pTok )
	{
		assert ( pTok );
		assert ( eMode!=SPH_BIGRAM_NONE );
		assert ( eMode==SPH_BIGRAM_ALL || dWords.GetLength() );

		m_sBuf[0] = 0;
		m_pSecond = NULL;
		m_eState = BIGRAM_CLEAN;
		memset ( m_dWordsHash, 0, sizeof(m_dWordsHash) );

		m_eMode = eMode;
		m_iMaxLen = 0;

		// only keep unique, real, short enough words
		dWords.Uniq();
		ARRAY_FOREACH ( i, dWords )
		{
			int iLen = Min ( dWords[i].Length(), 255 );
			if ( !iLen )
				continue;
			m_iMaxLen = Max ( m_iMaxLen, iLen );

			// hash word blocks by the first letter
			BYTE uFirst = *(BYTE*)( dWords[i].cstr() );
			if ( !m_dWordsHash [ uFirst ] )
			{
				m_dWords.Add ( 0 ); // end marker for the previous block
				m_dWordsHash [ uFirst ] = m_dWords.GetLength(); // hash new block
			}

			// store that word (one length byte, then the raw bytes)
			int iPos = m_dWords.GetLength();
			m_dWords.Resize ( iPos+iLen+1 );

			m_dWords[iPos] = (BYTE)iLen;
			memcpy ( &m_dWords [ iPos+1 ], dWords[i].cstr(), iLen );
		}
		m_dWords.Add ( 0 ); // end marker for the last block
	}

	/// cloning ctor; copies the (immutable) freq-words data, resets runtime state
	CSphBigramTokenizer ( ISphTokenizer * pTok, const CSphBigramTokenizer * pBase )
		: CSphTokenFilter ( pTok )
	{
		m_sBuf[0] = 0;
		m_pSecond = NULL;
		m_eState = BIGRAM_CLEAN;
		m_eMode = pBase->m_eMode;
		m_iMaxLen = pBase->m_iMaxLen;
		memcpy ( m_dWordsHash, pBase->m_dWordsHash, sizeof(m_dWordsHash) );
		m_dWords = pBase->m_dWords;
	}

	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		ISphTokenizer * pTok = m_pTokenizer->Clone ( eMode );
		return new CSphBigramTokenizer ( pTok, this );
	}

	void SetBuffer ( const BYTE * sBuffer, int iLength )
	{
		m_pTokenizer->SetBuffer ( sBuffer, iLength );
	}

	/// a bigram pair is reported as blended; the first token re-issued after it is not
	bool TokenIsBlended() const
	{
		if ( m_eState==BIGRAM_PAIR )
			return true;
		if ( m_eState==BIGRAM_FIRST )
			return false;
		return m_pTokenizer->TokenIsBlended();
	}

	/// check whether a token of iLen bytes is listed in bigram_freq_words
	bool IsFreq ( int iLen, BYTE * sWord )
	{
		// early check
		if ( iLen>m_iMaxLen )
			return false;

		// hash lookup, then linear scan
		int iPos = m_dWordsHash [ *sWord ];
		if ( !iPos )
			return false;
		while ( m_dWords[iPos] )
		{
			if ( m_dWords[iPos]==iLen && !memcmp ( sWord, &m_dWords[iPos+1], iLen ) )
				break;
			iPos += 1+m_dWords[iPos]; // skip to the next length-prefixed entry
		}
		// non-zero length byte here means the scan stopped on a match
		return m_dWords[iPos]!=0;
	}

	/// state machine: CLEAN -> (pair found) PAIR -> FIRST -> CLEAN -> ...
	/// non-eligible tokens are passed through without leaving CLEAN state
	BYTE * GetToken()
	{
		if ( m_eState==BIGRAM_FIRST || m_eState==BIGRAM_CLEAN )
		{
			BYTE * pFirst;
			if ( m_eState==BIGRAM_FIRST )
			{
				// first out, clean slate again, actually
				// and second will now become our next first
				assert ( m_pSecond );
				m_eState = BIGRAM_CLEAN;
				pFirst = m_pSecond;
				m_pSecond = NULL;
			} else
			{
				// just clean slate
				// assure we're, well, clean
				assert ( !m_pSecond );
				pFirst = m_pTokenizer->GetToken();
			}

			// clean slate
			// get first non-blended token
			if ( !pFirst )
				return NULL;

			// pass through blended
			// could handle them as first too, but.. cumbersome
			if ( m_pTokenizer->TokenIsBlended() )
				return pFirst;

			// check pair
			// in first_freq and both_freq modes, 1st token must be listed
			m_iFirst = strlen ( (const char*)pFirst );
			if ( m_eMode!=SPH_BIGRAM_ALL && !IsFreq ( m_iFirst, pFirst ) )
				return pFirst;

			// copy it
			// subsequent calls can and will override token accumulator
			memcpy ( m_sBuf, pFirst, m_iFirst+1 );

			// grow a pair!
			// get a second one (lookahead, in a sense)
			BYTE * pSecond = m_pTokenizer->GetToken();

			// eof? oi
			if ( !pSecond )
				return m_sBuf;

			// got a pair!
			// check combined length
			m_pSecond = pSecond;
			int iSecond = strlen ( (const char*)pSecond );
			if ( m_iFirst+iSecond+1 > SPH_MAX_WORD_LEN )
			{
				// too long pair
				// return first token as is
				m_eState = BIGRAM_FIRST;
				return m_sBuf;
			}

			// check pair
			// in freq2 mode, both tokens must be listed
			if ( m_eMode==SPH_BIGRAM_BOTHFREQ && !IsFreq ( iSecond, m_pSecond ) )
			{
				m_eState = BIGRAM_FIRST;
				return m_sBuf;
			}

			// ok, this is a eligible pair
			// begin with returning first+second pair (as blended)
			m_eState = BIGRAM_PAIR;
			m_sBuf [ m_iFirst ] = MAGIC_WORD_BIGRAM;
			assert ( m_iFirst + strlen ( (const char*)pSecond ) < sizeof(m_sBuf) );
			strcpy ( (char*)m_sBuf+m_iFirst+1, (const char*)pSecond ); //NOLINT
			return m_sBuf;

		} else if ( m_eState==BIGRAM_PAIR )
		{
			// pair (aka bigram) out, return first token as a regular token
			// (truncate the buffered pair back to just the first token)
			m_eState = BIGRAM_FIRST;
			m_sBuf [ m_iFirst ] = 0;
			return m_sBuf;
		}

		assert ( 0 && "unhandled bigram tokenizer internal state" );
		return NULL;
	}

	/// settings hash includes the freq-words data, so indexes with different
	/// bigram_freq_words are correctly told apart
	uint64_t GetSettingsFNV () const
	{
		uint64_t uHash = CSphTokenFilter::GetSettingsFNV();
		uHash = sphFNV64 ( m_dWords.Begin(), m_dWords.GetLength(), uHash );
		return uHash;
	}
};
2886 
2887 /////////////////////////////////////////////////////////////////////////////
2888 
FillStoredTokenInfo(StoredToken_t & tToken,const BYTE * sToken,ISphTokenizer * pTokenizer)2889 void FillStoredTokenInfo ( StoredToken_t & tToken, const BYTE * sToken, ISphTokenizer * pTokenizer )
2890 {
2891 	assert ( sToken && pTokenizer );
2892 	strncpy ( (char *)tToken.m_sToken, (const char *)sToken, sizeof(tToken.m_sToken) );
2893 	tToken.m_szTokenStart = pTokenizer->GetTokenStart ();
2894 	tToken.m_szTokenEnd = pTokenizer->GetTokenEnd ();
2895 	tToken.m_iOvershortCount = pTokenizer->GetOvershortCount ();
2896 	tToken.m_iTokenLen = pTokenizer->GetLastTokenLen ();
2897 	tToken.m_pBufferPtr = pTokenizer->GetBufferPtr ();
2898 	tToken.m_pBufferEnd = pTokenizer->GetBufferEnd();
2899 	tToken.m_bBoundary = pTokenizer->GetBoundary ();
2900 	tToken.m_bSpecial = pTokenizer->WasTokenSpecial ();
2901 	tToken.m_bBlended = pTokenizer->TokenIsBlended();
2902 	tToken.m_bBlendedPart = pTokenizer->TokenIsBlendedPart();
2903 }
2904 
2905 
2906 #if USE_RLP
2907 
// process-wide RLP environment, shared by all RLP tokenizers (created on first use)
BT_RLP_EnvironmentC * g_pRLPEnv = NULL;
// number of live users of g_pRLPEnv; the environment is destroyed when this drops to zero
int g_iRLPEnvRefCount = 0;
2910 
RLPLog(void *,int iChannel,const char * szMessage)2911 static void RLPLog ( void *, int iChannel, const char * szMessage )
2912 {
2913 	switch ( iChannel )
2914 	{
2915 	case 0:
2916 		sphWarning ( "%s", szMessage );
2917 		break;
2918 
2919 	case 1:
2920 		sphLogFatal ( "%s", szMessage );
2921 		break;
2922 
2923 	default:
2924 		sphInfo ( "%s", szMessage );
2925 		break;
2926 	}
2927 }
2928 
sphRLPInit(const char * szRootPath,const char * szEnvPath,CSphString & sError)2929 static bool sphRLPInit ( const char * szRootPath, const char * szEnvPath, CSphString & sError )
2930 {
2931 	if ( !g_pRLPEnv )
2932 	{
2933 		if ( !BT_RLP_CLibrary_VersionIsCompatible ( BT_RLP_CLIBRARY_INTERFACE_VERSION ) )
2934 		{
2935 			sError.SetSprintf ( "RLP library mismatch: have %ld expect %d", BT_RLP_CLibrary_VersionNumber(), BT_RLP_CLIBRARY_INTERFACE_VERSION );
2936 			return false;
2937 		}
2938 
2939 		BT_RLP_Environment_SetBTRootDirectory ( szRootPath );
2940 		BT_RLP_Environment_SetLogCallbackFunction ( NULL, RLPLog );
2941 		BT_RLP_Environment_SetLogLevel ( "error" );
2942 
2943 		g_pRLPEnv = BT_RLP_Environment_Create();
2944 		if ( !g_pRLPEnv )
2945 		{
2946 			sError = "Unable to initialize RLP environment";
2947 			return false;
2948 		}
2949 
2950 		BT_Result iRes = BT_RLP_Environment_InitializeFromFile ( g_pRLPEnv, szEnvPath );
2951 		if ( iRes!=BT_OK )
2952 		{
2953 			sError = "Unable to initialize the RLP environment";
2954 			BT_RLP_Environment_Destroy ( g_pRLPEnv );
2955 			g_pRLPEnv = NULL;
2956 			return false;
2957 		}
2958 	}
2959 
2960 	g_iRLPEnvRefCount++;
2961 	return true;
2962 }
2963 
2964 
sphRLPFree()2965 static void sphRLPFree ()
2966 {
2967 	g_iRLPEnvRefCount--;
2968 	if ( !g_iRLPEnvRefCount )
2969 	{
2970 		assert ( g_pRLPEnv );
2971 		BT_RLP_Environment_Destroy ( g_pRLPEnv );
2972 		g_pRLPEnv = NULL;
2973 	}
2974 }
2975 
2976 
/// should this part-of-speech tag be dropped from the token stream?
/// drops missing/empty tags, end-of-sentence ("EO...") and punctuation ("PU...") tags
static bool sphIsJunkPOS ( const char * szPOS )
{
	if ( !szPOS || !szPOS[0] )
		return true;
	if ( szPOS[0]=='E' && szPOS[1]=='O' )
		return true;
	return szPOS[0]=='P' && szPOS[1]=='U';
}
2982 
2983 
2984 class CSphRLPTokenizer : public CSphTokenFilter
2985 {
2986 public:
CSphRLPTokenizer(ISphTokenizer * pTok,const char * szRootPath,const char * szEnvPath,const char * szCtxPath,bool bStandalone)2987 	CSphRLPTokenizer ( ISphTokenizer * pTok, const char * szRootPath, const char * szEnvPath, const char * szCtxPath, bool bStandalone )
2988 		: CSphTokenFilter ( pTok )
2989 		, m_pContext ( NULL )
2990 		, m_pFactory ( NULL )
2991 		, m_pTokenIterator ( NULL )
2992 		, m_sRootPath ( szRootPath )
2993 		, m_sEnvPath ( szEnvPath )
2994 		, m_sCtxPath ( szCtxPath )
2995 		, m_bChineseBuffer ( false )
2996 		, m_bStandalone ( bStandalone )
2997 		, m_bInitialized ( false )
2998 		, m_iLargeSegmentOffset ( 0 )
2999 		, m_iCurNonChineseToken ( 0 )
3000 		, m_eTokenType ( TOK_CHINESE )
3001 		, m_iNextCompoundComponent ( -1 )
3002 		, m_pTokenizerClone ( NULL )
3003 	{
3004 		assert ( pTok );
3005 		sphUTF8Encode ( m_pMarkerTokenSeparator, PROXY_TOKEN_SEPARATOR );
3006 	}
3007 
~CSphRLPTokenizer()3008 	virtual ~CSphRLPTokenizer()
3009 	{
3010 		if ( m_pTokenIterator )
3011 			BT_RLP_TokenIterator_Destroy ( m_pTokenIterator );
3012 
3013 		if ( m_pFactory )
3014 			BT_RLP_TokenIteratorFactory_Destroy ( m_pFactory );
3015 
3016 		if ( m_pContext )
3017 			BT_RLP_Environment_DestroyContext ( g_pRLPEnv, m_pContext );
3018 
3019 		sphRLPFree();
3020 
3021 		SafeDelete ( m_pTokenizerClone );
3022 	}
3023 
Init(CSphString & sError)3024 	bool Init ( CSphString & sError )
3025 	{
3026 		assert ( !m_bInitialized );
3027 
3028 		if ( !sphRLPInit ( m_sRootPath.cstr(), m_sEnvPath.cstr(), sError ) )
3029 			return false;
3030 
3031 		assert ( g_pRLPEnv );
3032 
3033 		BT_Result iRes = BT_RLP_Environment_GetContextFromFile ( g_pRLPEnv, m_sCtxPath.cstr(), &m_pContext );
3034 		if ( iRes!=BT_OK )
3035 		{
3036 			sError = "Unable to create RLP context";
3037 			return false;
3038 		}
3039 
3040 		m_pFactory = BT_RLP_TokenIteratorFactory_Create();
3041 		if ( !m_pFactory )
3042 		{
3043 			sError = "Unable to create RLP token iterator factory";
3044 			return false;
3045 		}
3046 
3047 		BT_RLP_TokenIteratorFactory_SetReturnCompoundComponents ( m_pFactory, true );
3048 
3049 		m_bInitialized = true;
3050 
3051 		return true;
3052 	}
3053 
Clone(ESphTokenizerClone eMode) const3054 	virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
3055 	{
3056 		CSphRLPTokenizer * pClone = new CSphRLPTokenizer ( m_pTokenizer->Clone ( eMode ), m_sRootPath.cstr(), m_sEnvPath.cstr(), m_sCtxPath.cstr(), m_bStandalone );
3057 		CSphString sError;
3058 		Verify ( pClone->Init ( sError ) );
3059 
3060 		return pClone;
3061 	}
3062 
SplitBufferIntoTokens(const BYTE * szBuffer,int iLength)3063 	void SplitBufferIntoTokens ( const BYTE * szBuffer, int iLength )
3064 	{
3065 		m_dDocBuffer.Resize(0);
3066 		m_dNonChineseTokens.Resize(0);
3067 		m_iCurNonChineseToken = 0;
3068 
3069 		m_pTokenizer->SetBuffer ( szBuffer, iLength );
3070 
3071 		BYTE * pToken;
3072 		while ( ( pToken = m_pTokenizer->GetToken() )!=NULL )
3073 		{
3074 			int iTokenLen = strlen ( (const char*)pToken );
3075 
3076 			int iOldBufferLen = m_dDocBuffer.GetLength();
3077 			m_dDocBuffer.Resize ( iOldBufferLen+Max ( PROXY_MARKER_LEN+1, iTokenLen )+1 );
3078 			BYTE * pCurDocPtr = &(m_dDocBuffer[iOldBufferLen]);
3079 
3080 			if ( sphDetectChinese ( pToken, iTokenLen ) )
3081 			{
3082 				// collect it in one big chinese token buffer that will be processed by RLP
3083 				memcpy ( pCurDocPtr, pToken, iTokenLen );
3084 				pCurDocPtr += iTokenLen;
3085 			} else
3086 			{
3087 				StoredToken_t & tStored = m_dNonChineseTokens.Add();
3088 				FillStoredTokenInfo ( tStored, pToken, m_pTokenizer );
3089 
3090 				// fixup a couple of fields
3091 				tStored.m_szTokenStart = (const char *)tStored.m_sToken;
3092 				tStored.m_szTokenEnd = (const char *)tStored.m_sToken+iTokenLen;
3093 
3094 				// add a 'non-chinese token' marker to the chinese token stream
3095 				*pCurDocPtr++ = ' ';
3096 				COPY_MARKER ( pCurDocPtr, m_pMarkerTokenSeparator );
3097 			}
3098 
3099 			*pCurDocPtr++ = ' ';
3100 
3101 			m_dDocBuffer.Resize ( pCurDocPtr-m_dDocBuffer.Begin() );
3102 		}
3103 
3104 		ProcessBufferRLP ( m_dDocBuffer.Begin(), m_dDocBuffer.GetLength() );
3105 	}
3106 
SetBuffer(const BYTE * szBuffer,int iBufferLength)3107 	virtual void SetBuffer ( const BYTE * szBuffer, int iBufferLength )
3108 	{
3109 		assert ( m_bInitialized && g_pRLPEnv && m_pContext && m_pFactory );
3110 
3111 		m_bChineseBuffer = m_bStandalone ? sphDetectChinese ( szBuffer, iBufferLength ) : true;
3112 		if ( m_bChineseBuffer )
3113 		{
3114 			int iLength = iBufferLength;
3115 			if ( iBufferLength>=MAX_CHUNK_SIZE )
3116 			{
3117 				m_dLargeBuffer.Resize ( iLength );
3118 				memcpy ( m_dLargeBuffer.Begin(), szBuffer, iBufferLength );
3119 				iLength = GetNextLengthToSegment ( szBuffer, iBufferLength );
3120 				m_iLargeSegmentOffset = iLength;
3121 			}
3122 
3123 			if ( m_bStandalone )
3124 				SplitBufferIntoTokens ( szBuffer, iLength );
3125 			else
3126 				ProcessBufferRLP ( szBuffer, iLength );
3127 		} else
3128 			m_pTokenizer->SetBuffer ( szBuffer, iBufferLength );
3129 	}
3130 
GetToken()3131 	virtual BYTE * GetToken()
3132 	{
3133 		assert ( m_bInitialized && g_pRLPEnv && m_pContext && m_pFactory );
3134 
3135 		if ( m_bChineseBuffer )
3136 		{
3137 			m_eTokenType = TOK_CHINESE;
3138 			bool bStopword;
3139 			BYTE * pToken = GetNextTokenRLP ( bStopword );
3140 			if ( pToken )
3141 			{
3142 				if ( m_bStandalone )
3143 				{
3144 					int iTokenLen = strlen ( (const char *)pToken );
3145 					if ( iTokenLen==PROXY_MARKER_LEN && CMP_MARKER ( pToken, m_pMarkerTokenSeparator ) )
3146 					{
3147 						m_eTokenType = TOK_NONCHINESE_STORED;
3148 						return m_dNonChineseTokens[m_iCurNonChineseToken++].m_sToken;
3149 					}
3150 
3151 					// it could still be a non-chinese token that sphinx tokenizer didn't separate
3152 					// but RLP tokenizer did
3153 					if ( !bStopword && !sphDetectChinese ( pToken, iTokenLen ) )
3154 					{
3155 						// we don't want to mess up the states in our base tokenizer, so we use a cloned tokenizer
3156 						if ( !m_pTokenizerClone )
3157 						{
3158 							m_pTokenizerClone = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
3159 							assert ( m_pTokenizerClone );
3160 						}
3161 
3162 						m_pTokenizerClone->SetBuffer ( pToken, iTokenLen );
3163 						pToken = m_pTokenizerClone->GetToken();
3164 						if ( pToken )
3165 						{
3166 							m_eTokenType = TOK_NONCHINESE_RLP;
3167 							FillStoredTokenInfo ( m_tStoredRLPToken, pToken, m_pTokenizer );
3168 							m_tStoredRLPToken.m_szTokenStart = (const char *)m_tStoredRLPToken.m_sToken;
3169 							m_tStoredRLPToken.m_szTokenEnd = (const char *)m_tStoredRLPToken.m_sToken+iTokenLen;
3170 							m_tStoredRLPToken.m_pBufferPtr = m_tStoredRLPToken.m_szTokenStart;
3171 							m_tStoredRLPToken.m_pBufferEnd = m_tStoredRLPToken.m_szTokenEnd;
3172 						}
3173 					}
3174 				}
3175 
3176 				if ( pToken )
3177 					return pToken;
3178 			}
3179 
3180 			DestroyIteratorRLP();
3181 
3182 			while ( m_dLargeBuffer.GetLength() )
3183 			{
3184 				BYTE * pBuffer = m_dLargeBuffer.Begin()+m_iLargeSegmentOffset;
3185 				int iBufferLength = m_dLargeBuffer.GetLength()-m_iLargeSegmentOffset;
3186 				int iLength = GetNextLengthToSegment ( pBuffer, iBufferLength );
3187 				if ( iLength )
3188 				{
3189 					ProcessBufferRLP ( pBuffer, iLength );
3190 					m_iLargeSegmentOffset += iLength;
3191 
3192 					if ( m_bStandalone )
3193 						SplitBufferIntoTokens ( pBuffer, iLength );
3194 					else
3195 						ProcessBufferRLP ( pBuffer, iLength );
3196 
3197 					pToken = GetToken();
3198 					if ( pToken )
3199 						return pToken;
3200 					else
3201 						DestroyIteratorRLP();
3202 				} else
3203 				{
3204 					m_dLargeBuffer.Resize(0);
3205 					m_iLargeSegmentOffset = 0;
3206 				}
3207 			}
3208 
3209 			return NULL;
3210 		} else
3211 			return m_pTokenizer->GetToken();
3212 	}
3213 
GetMorphFlag() const3214 	virtual bool GetMorphFlag () const
3215 	{
3216 		if ( !m_bChineseBuffer )
3217 			return true;
3218 
3219 		return m_eTokenType!=TOK_CHINESE;
3220 	}
3221 
GetLastTokenLen() const3222 	virtual int GetLastTokenLen() const
3223 	{
3224 		if ( m_bChineseBuffer )
3225 		{
3226 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3227 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_iTokenLen;
3228 
3229 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3230 				return m_tStoredRLPToken.m_iTokenLen;
3231 
3232 			return sphUTF8Len ( (const char*)m_dUTF8Buffer, sizeof ( m_dUTF8Buffer ) );
3233 		}
3234 
3235 		return m_pTokenizer->GetLastTokenLen();
3236 	}
3237 
GetBoundary()3238 	virtual bool GetBoundary()
3239 	{
3240 		if ( m_bChineseBuffer )
3241 		{
3242 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3243 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_bBoundary;
3244 
3245 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3246 				return m_tStoredRLPToken.m_bBoundary;
3247 
3248 			return false;
3249 		}
3250 
3251 		return m_pTokenizer->GetBoundary();
3252 	}
3253 
WasTokenSpecial()3254 	virtual bool WasTokenSpecial()
3255 	{
3256 		if ( m_bChineseBuffer )
3257 		{
3258 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3259 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_bSpecial;
3260 
3261 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3262 				return m_tStoredRLPToken.m_bSpecial;
3263 
3264 			return false;
3265 		}
3266 
3267 		return m_pTokenizer->WasTokenSpecial();
3268 	}
3269 
GetOvershortCount()3270 	virtual int	GetOvershortCount ()
3271 	{
3272 		if ( m_bChineseBuffer )
3273 		{
3274 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3275 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_iOvershortCount;
3276 
3277 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3278 				return m_tStoredRLPToken.m_iOvershortCount;
3279 
3280 			return 0;
3281 		}
3282 
3283 		return m_pTokenizer->GetOvershortCount();
3284 	}
3285 
TokenIsBlended() const3286 	virtual bool TokenIsBlended () const
3287 	{
3288 		if ( m_bChineseBuffer )
3289 		{
3290 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3291 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_bBlended;
3292 
3293 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3294 				return m_tStoredRLPToken.m_bBlended;
3295 
3296 			return false;
3297 		}
3298 
3299 		return m_pTokenizer->TokenIsBlended();
3300 	}
3301 
TokenIsBlendedPart() const3302 	virtual bool TokenIsBlendedPart () const
3303 	{
3304 		if ( m_bChineseBuffer )
3305 		{
3306 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3307 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_bBlendedPart;
3308 
3309 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3310 				return m_tStoredRLPToken.m_bBlendedPart;
3311 
3312 			return false;
3313 		}
3314 
3315 		return m_pTokenizer->TokenIsBlendedPart();
3316 	}
3317 
GetTokenStart() const3318 	virtual const char * GetTokenStart () const
3319 	{
3320 		if ( m_bChineseBuffer )
3321 		{
3322 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3323 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_szTokenStart;
3324 
3325 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3326 				return m_tStoredRLPToken.m_szTokenStart;
3327 
3328 			return (const char*)m_dUTF8Buffer;
3329 		}
3330 
3331 		return m_pTokenizer->GetTokenStart();
3332 	}
3333 
GetTokenEnd() const3334 	virtual const char * GetTokenEnd () const
3335 	{
3336 		if ( m_bChineseBuffer )
3337 		{
3338 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3339 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_szTokenEnd;
3340 
3341 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3342 				return m_tStoredRLPToken.m_szTokenEnd;
3343 
3344 			return (const char*)m_dUTF8Buffer + strlen ( (const char*)m_dUTF8Buffer );
3345 		}
3346 
3347 		return m_pTokenizer->GetTokenEnd();
3348 	}
3349 
SetBufferPtr(const char * sNewPtr)3350 	virtual void SetBufferPtr ( const char * sNewPtr )
3351 	{
3352 		// we'll handle it as a complete re-tokenization
3353 		DestroyIteratorRLP ();
3354 		m_iCurNonChineseToken = 0;
3355 		m_eTokenType = TOK_CHINESE;
3356 		SetBuffer ( (const BYTE*)sNewPtr, strlen ( sNewPtr ) );
3357 	}
3358 
GetBufferPtr() const3359 	virtual const char * GetBufferPtr () const
3360 	{
3361 		if ( m_bChineseBuffer )
3362 		{
3363 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3364 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_pBufferPtr;
3365 
3366 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3367 				return m_tStoredRLPToken.m_pBufferPtr;
3368 
3369 			return (const char*)m_dUTF8Buffer;
3370 		}
3371 
3372 		return m_pTokenizer->GetBufferPtr();
3373 	}
3374 
GetBufferEnd() const3375 	virtual const char * GetBufferEnd () const
3376 	{
3377 		if ( m_bChineseBuffer )
3378 		{
3379 			if ( m_eTokenType==TOK_NONCHINESE_STORED )
3380 				return m_dNonChineseTokens[m_iCurNonChineseToken-1].m_pBufferEnd;
3381 
3382 			if ( m_eTokenType==TOK_NONCHINESE_RLP )
3383 				return m_tStoredRLPToken.m_pBufferEnd;
3384 
3385 			return (const char*)m_dUTF8Buffer;
3386 		}
3387 
3388 		return m_pTokenizer->GetBufferEnd ();
3389 	}
3390 
GetRLPContext() const3391 	virtual const char * GetRLPContext () const
3392 	{
3393 		return m_sCtxPath.cstr();
3394 	}
3395 
3396 private:
3397 	enum TokType_e
3398 	{
3399 		TOK_CHINESE,
3400 		TOK_NONCHINESE_STORED,
3401 		TOK_NONCHINESE_RLP
3402 	};
3403 
3404 	BT_RLP_ContextC *		m_pContext;
3405 	BT_RLP_TokenIteratorFactoryC * m_pFactory;
3406 	BT_RLP_TokenIteratorC *	m_pTokenIterator;
3407 	CSphString				m_sRootPath;
3408 	CSphString				m_sEnvPath;
3409 	CSphString				m_sCtxPath;
3410 	bool					m_bChineseBuffer;
3411 	bool					m_bStandalone;
3412 	bool					m_bInitialized;
3413 	static const int		MAX_CHUNK_SIZE = 10485760;
3414 	static const int		MAX_TOKEN_LEN = 1024;
3415 	CSphTightVector<BYTE>	m_dLargeBuffer;
3416 	CSphVector<BYTE>		m_dDocBuffer;
3417 	CSphTightVector<StoredToken_t> m_dNonChineseTokens;
3418 	StoredToken_t			m_tStoredRLPToken;
3419 	int						m_iLargeSegmentOffset;
3420 	int						m_iCurNonChineseToken;
3421 	TokType_e				m_eTokenType;
3422 	int						m_iNextCompoundComponent;
3423 	BYTE					m_dUTF8Buffer[MAX_TOKEN_LEN];
3424 	BYTE					m_pMarkerTokenSeparator[PROXY_MARKER_LEN];
3425 	ISphTokenizer *			m_pTokenizerClone;
3426 
IsSpecialCode(int iCode) const3427 	bool IsSpecialCode ( int iCode ) const
3428 	{
3429 		return iCode==PROXY_DOCUMENT_START || iCode==PROXY_FIELD_START_CHINESE || iCode==PROXY_FIELD_START_NONCHINESE || iCode==PROXY_TOKENIZED;
3430 	}
3431 
IsChineseSeparator(int iCode) const3432 	bool IsChineseSeparator ( int iCode ) const
3433 	{
3434 		assert ( m_pTokenizer );
3435 		return ( iCode>=0x3000 && iCode<=0x303F ) || IsSpecialCode ( iCode ) || !m_pTokenizer->GetLowercaser().ToLower ( iCode );
3436 	}
3437 
ProcessBufferRLP(const BYTE * pBuffer,int iLength)3438 	void ProcessBufferRLP ( const BYTE * pBuffer, int iLength )
3439 	{
3440 		assert ( !m_pTokenIterator );
3441 
3442 		BT_Result iRes = BT_RLP_Context_ProcessBuffer ( m_pContext, pBuffer, iLength, BT_LANGUAGE_SIMPLIFIED_CHINESE, "UTF-8", NULL );
3443 		// iteration should still work ok in this case
3444 		if ( iRes!=BT_OK )
3445 			sphWarning ( "BT_RLP_Context_ProcessBuffer error" );
3446 
3447 		m_pTokenIterator = BT_RLP_TokenIteratorFactory_CreateIterator ( m_pFactory, m_pContext );
3448 		if ( !m_pTokenIterator )
3449 			sphWarning ( "BT_RLP_TokenIteratorFactory_CreateIterator error" );
3450 	}
3451 
GetNextTokenRLP(bool & bStopword)3452 	BYTE * GetNextTokenRLP ( bool & bStopword )
3453 	{
3454 		static const char * RPL_SPECIAL_STOPWORD = "rlpspecialstopword";
3455 		bStopword = false;
3456 
3457 		if ( !m_pTokenIterator )
3458 			return NULL;
3459 
3460 		if ( m_iNextCompoundComponent!=-1 )
3461 		{
3462 			if ( m_iNextCompoundComponent>=(int)BT_RLP_TokenIterator_GetNumberOfCompoundComponents ( m_pTokenIterator ) )
3463 				m_iNextCompoundComponent = -1;
3464 			else
3465 			{
3466 				const BT_Char16 * pToken = BT_RLP_TokenIterator_GetCompoundComponent ( m_pTokenIterator, m_iNextCompoundComponent++ );
3467 				if ( BT_RLP_TokenIterator_IsStopword ( m_pTokenIterator ) )
3468 				{
3469 					strncpy ( (char*)m_dUTF8Buffer, RPL_SPECIAL_STOPWORD, MAX_TOKEN_LEN );
3470 					bStopword = true;
3471 				} else
3472 				{
3473 					assert ( pToken );
3474 					bt_xutf16toutf8 ( (char*)m_dUTF8Buffer, pToken, sizeof(m_dUTF8Buffer) );
3475 				}
3476 
3477 				return &(m_dUTF8Buffer[0]);
3478 			}
3479 		}
3480 
3481 		while ( BT_RLP_TokenIterator_Next ( m_pTokenIterator ) )
3482 		{
3483 			const char * szPartOfSpeech = BT_RLP_TokenIterator_GetPartOfSpeech ( m_pTokenIterator );
3484 
3485 			if ( sphIsJunkPOS ( szPartOfSpeech ) )
3486 				continue;
3487 
3488 			const BT_Char16 * pToken;
3489 			int nCC = BT_RLP_TokenIterator_GetNumberOfCompoundComponents ( m_pTokenIterator );
3490 			if ( nCC>0 )
3491 			{
3492 				m_iNextCompoundComponent = 0;
3493 				pToken = BT_RLP_TokenIterator_GetCompoundComponent ( m_pTokenIterator, m_iNextCompoundComponent++ );
3494 			} else
3495 				pToken = BT_RLP_TokenIterator_GetToken ( m_pTokenIterator );
3496 
3497 			if ( BT_RLP_TokenIterator_IsStopword ( m_pTokenIterator ) )
3498 			{
3499 				strncpy ( (char*)m_dUTF8Buffer, RPL_SPECIAL_STOPWORD, MAX_TOKEN_LEN );
3500 				bStopword = true;
3501 			} else
3502 			{
3503 				assert ( pToken );
3504 				bt_xutf16toutf8 ( (char*)m_dUTF8Buffer, pToken, sizeof(m_dUTF8Buffer) );
3505 			}
3506 
3507 			return &(m_dUTF8Buffer[0]);
3508 		}
3509 
3510 		return NULL;
3511 	}
3512 
DestroyIteratorRLP()3513 	void DestroyIteratorRLP()
3514 	{
3515 		if ( m_pTokenIterator )
3516 		{
3517 			BT_RLP_TokenIterator_Destroy ( m_pTokenIterator );
3518 			m_pTokenIterator = NULL;
3519 		}
3520 
3521 		if ( m_pContext )
3522 			BT_RLP_Context_DestroyResultStorage ( m_pContext );
3523 	}
3524 
GetNextLengthToSegment(const BYTE * pBuffer,int iLength) const3525 	int GetNextLengthToSegment ( const BYTE * pBuffer, int iLength ) const
3526 	{
3527 		const BYTE * pCurBuf = pBuffer;
3528 		const BYTE * pLastSeparator = NULL;
3529 		int iLengthLeft = Min ( iLength, MAX_CHUNK_SIZE );
3530 		while ( pCurBuf<pBuffer+iLengthLeft )
3531 		{
3532 			int iCode = sphUTF8Decode ( pCurBuf );
3533 			if ( IsChineseSeparator ( iCode ) )
3534 				pLastSeparator = pCurBuf;
3535 		}
3536 
3537 		return pLastSeparator ? pLastSeparator-pBuffer : iLengthLeft;
3538 	}
3539 };
3540 
3541 
3542 class CSphRLPResultSplitter : public CSphTokenFilter
3543 {
3544 public:
	/// @param pTok base tokenizer to wrap
	/// @param szCtxPath RLP context file path, kept only so clones can be built
	CSphRLPResultSplitter ( ISphTokenizer * pTok, const char * szCtxPath )
		: CSphTokenFilter ( pTok )
		, m_bTokenized ( false )
		, m_bNonChineseToken ( false )
		, m_iStart ( 0 )
		, m_iTokenLenBytes ( 0 )
		, m_sCtxPath ( szCtxPath )
	{
		assert ( pTok );

		// precompute the UTF-8 encoded markers used to recognize pre-tokenized
		// buffers and non-Chinese tokens within them
		sphUTF8Encode ( m_pTokenizedMarker, PROXY_TOKENIZED );
		sphUTF8Encode ( m_pNonChineseMarker, PROXY_MORPH );
	}
3558 
	/// accepts either a pre-tokenized RLP result (prefixed with the
	/// "tokenized" marker plus one separator byte) or a plain buffer;
	/// plain buffers are passed straight to the base tokenizer
	virtual void SetBuffer ( const BYTE * szBuffer, int iBufferLength )
	{
		// detect if this is an already tokenized buffer or not
		m_bTokenized = false;
		m_iStart = 0;

		if ( iBufferLength>=PROXY_MARKER_LEN+1 )
		{
			const BYTE * pBufPtr = szBuffer;
			if ( CMP_MARKER ( pBufPtr, m_pTokenizedMarker ) )
			{
				// skip the marker and its separator, then keep a private
				// zero-terminated copy of the remainder (GetToken() will
				// zero-terminate tokens in place inside this copy)
				pBufPtr += PROXY_MARKER_LEN+1;
				m_bTokenized = true;
				int iBufSize = iBufferLength-(pBufPtr-szBuffer);
				m_dBuffer.Resize ( iBufSize+1 );
				memcpy ( m_dBuffer.Begin(), pBufPtr, iBufSize );
				m_dBuffer[iBufSize] = '\0';
			}
		}

		if ( !m_bTokenized )
			m_pTokenizer->SetBuffer ( szBuffer, iBufferLength );
	}
3582 
	/// returns the next space-separated token from a pre-tokenized buffer,
	/// decoding the packed metadata that follows non-Chinese tokens;
	/// plain buffers are delegated to the base tokenizer
	virtual BYTE * GetToken()
	{
		if ( m_bTokenized )
		{
			if ( m_iStart>=m_dBuffer.GetLength() )
				return NULL;

			// check if we have a marker
			// (non-Chinese tokens are preceded by the marker and followed by packed flags)
			if ( m_dBuffer.GetLength()-m_iStart>=PROXY_MARKER_LEN && CMP_MARKER ( (m_dBuffer.Begin()+m_iStart), m_pNonChineseMarker ) )
			{
				m_iStart += PROXY_MARKER_LEN;
				m_bNonChineseToken = true;
			} else
				m_bNonChineseToken = false;

			// scan to the next space and zero-terminate the token in place
			bool bFound = false;
			for ( int i = m_iStart; i < m_dBuffer.GetLength() && !bFound; i++ )
				if ( m_dBuffer[i]==' ' )
				{
					m_tCurToken.m_szTokenStart = (const char*)&(m_dBuffer[m_iStart]);
					m_dBuffer[i] = 0;
					m_iTokenLenBytes = i-m_iStart;
					m_iStart = i+1;
					bFound = true;
				}

			// no separator found means this is the last token in the buffer
			if ( !bFound )
			{
				m_tCurToken.m_szTokenStart = (const char*)&(m_dBuffer[m_iStart]);
				m_iTokenLenBytes = m_dBuffer.GetLength()-m_iStart;
				m_iStart = m_dBuffer.GetLength();
			}

			// collect extra token data from the text stream
			if ( m_bNonChineseToken )
			{
				// the first field is a hex number packing flags and the token length
				int iDataStart = m_iStart;
				while ( m_iStart < m_dBuffer.GetLength() && m_dBuffer[m_iStart]!=' ' )
					m_iStart++;

				DWORD uPacked = strtoul ( (const char*)&(m_dBuffer[iDataStart]), NULL, 16 );
				m_tCurToken.m_bBlended = !!(uPacked & PROXY_BLENDED_FLAG);
				m_tCurToken.m_bBlendedPart = !!(uPacked & PROXY_BLENDED_PART_FLAG);
				m_tCurToken.m_bBoundary = !!(uPacked & PROXY_BOUNDARY_FLAG);
				m_tCurToken.m_bSpecial = !!(uPacked & PROXY_SPECIAL_FLAG);
				// NOTE(review): token length is assumed to occupy the low 6 bits of
				// the packed word — confirm against the side that produces this buffer
				m_tCurToken.m_iTokenLen = uPacked & ( ( 1<<6 )-1 );

				// skip space
				if ( m_iStart < m_dBuffer.GetLength() )
				{
					assert ( m_dBuffer[m_iStart]==' ' );
					m_iStart++;
				}

				// an optional second hex field carries the overshort counter
				if ( uPacked & PROXY_HAVE_OVERSHORT )
				{
					iDataStart = m_iStart;
					while ( m_iStart < m_dBuffer.GetLength() && m_dBuffer[m_iStart]!=' ' )
						m_iStart++;

					// skip space
					if ( m_iStart < m_dBuffer.GetLength() )
					{
						assert ( m_dBuffer[m_iStart]==' ' );
						m_dBuffer[m_iStart++] = 0;
					}

					m_tCurToken.m_iOvershortCount = strtoul ( (const char*)&(m_dBuffer[iDataStart]), NULL, 16 );
				} else
					m_tCurToken.m_iOvershortCount = 0;
			}

			m_tCurToken.m_pBufferPtr = (const char*)m_dBuffer.Begin();
			m_tCurToken.m_szTokenEnd = m_tCurToken.m_szTokenStart+m_iTokenLenBytes;

			return (BYTE*)m_tCurToken.m_szTokenStart;
		} else
			return m_pTokenizer->GetToken();
	}
3662 
Clone(ESphTokenizerClone eMode) const3663 	virtual ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
3664 	{
3665 		return new CSphRLPResultSplitter ( m_pTokenizer->Clone ( eMode ), m_sCtxPath.cstr() );
3666 	}
3667 
GetMorphFlag() const3668 	virtual bool GetMorphFlag () const
3669 	{
3670 		if ( !m_bTokenized )
3671 			return true;
3672 
3673 		return m_bNonChineseToken;
3674 	}
3675 
GetLastTokenLen() const3676 	virtual int GetLastTokenLen() const
3677 	{
3678 		if ( !m_bTokenized )
3679 			return m_pTokenizer->GetLastTokenLen();
3680 
3681 		if ( m_bNonChineseToken )
3682 			return m_tCurToken.m_iTokenLen;
3683 
3684 		return sphUTF8Len ( m_tCurToken.m_szTokenStart, m_iTokenLenBytes );
3685 	}
3686 
GetBoundary()3687 	virtual bool GetBoundary()
3688 	{
3689 		if ( !m_bTokenized )
3690 			return m_pTokenizer->GetBoundary();
3691 
3692 		if ( m_bNonChineseToken )
3693 			return m_tCurToken.m_bBoundary;
3694 
3695 		return false;
3696 	}
3697 
WasTokenSpecial()3698 	virtual bool WasTokenSpecial()
3699 	{
3700 		if ( !m_bTokenized )
3701 			return m_pTokenizer->WasTokenSpecial();
3702 
3703 		if ( m_bNonChineseToken )
3704 			return m_tCurToken.m_bSpecial;
3705 
3706 		return false;
3707 	}
3708 
GetOvershortCount()3709 	virtual int	GetOvershortCount ()
3710 	{
3711 		if ( !m_bTokenized )
3712 			return m_pTokenizer->GetOvershortCount();
3713 
3714 		if ( m_bNonChineseToken )
3715 			return m_tCurToken.m_iOvershortCount;
3716 
3717 		return 0;
3718 	}
3719 
TokenIsBlended() const3720 	virtual bool TokenIsBlended () const
3721 	{
3722 		if ( !m_bTokenized )
3723 			return m_pTokenizer->TokenIsBlended();
3724 
3725 		if ( m_bNonChineseToken )
3726 			return m_tCurToken.m_bBlended;
3727 
3728 		return false;
3729 	}
3730 
TokenIsBlendedPart() const3731 	virtual bool TokenIsBlendedPart () const
3732 	{
3733 		if ( !m_bTokenized )
3734 			return m_pTokenizer->TokenIsBlendedPart();
3735 
3736 		if ( m_bNonChineseToken )
3737 			return m_tCurToken.m_bBlendedPart;
3738 
3739 		return false;
3740 	}
3741 
GetTokenStart() const3742 	virtual const char * GetTokenStart () const
3743 	{
3744 		if ( !m_bTokenized )
3745 			return m_pTokenizer->GetTokenStart();
3746 
3747 		return m_tCurToken.m_szTokenStart;
3748 	}
3749 
GetTokenEnd() const3750 	virtual const char * GetTokenEnd () const
3751 	{
3752 		if ( !m_bTokenized )
3753 			return m_pTokenizer->GetTokenEnd();
3754 
3755 		return m_tCurToken.m_szTokenEnd;
3756 	}
3757 
GetBufferPtr() const3758 	virtual const char * GetBufferPtr () const
3759 	{
3760 		if ( !m_bTokenized )
3761 			return m_pTokenizer->GetBufferPtr();
3762 
3763 		return m_tCurToken.m_pBufferPtr;
3764 	}
3765 
	/// path to the RLP context file this splitter was created with
	virtual const char * GetRLPContext () const
	{
		return m_sCtxPath.cstr();
	}
3770 
private:
	bool					m_bTokenized;		// is the buffer RLP proxy output (true), or do we pass tokens through?
	bool					m_bNonChineseToken;	// was the current token prefixed with the non-Chinese marker?
	int						m_iStart;			// current parse offset into m_dBuffer
	StoredToken_t			m_tCurToken;		// decoded current token (text, flags, counters)
	int						m_iTokenLenBytes;	// current token length in bytes
	CSphString				m_sCtxPath;			// RLP context file path, exposed via GetRLPContext()
	BYTE					m_pTokenizedMarker[PROXY_MARKER_LEN];	// marker flagging already-tokenized buffers
	BYTE					m_pNonChineseMarker[PROXY_MARKER_LEN];	// marker preceding non-Chinese tokens
	CSphTightVector<BYTE>	m_dBuffer;			// tokenization buffer
};
3782 
3783 #endif
3784 
3785 //////////////////////////////////////////////////////////////////////////
3786 
/// create a plain UTF-8 tokenizer
ISphTokenizer * sphCreateUTF8Tokenizer ()
{
	return new CSphTokenizer_UTF8<false> ();
}
3791 
/// create a UTF-8 tokenizer with n-gram support (for CJK-style indexing)
ISphTokenizer * sphCreateUTF8NgramTokenizer ()
{
	return new CSphTokenizer_UTF8Ngram<false> ();
}
3796 
3797 /////////////////////////////////////////////////////////////////////////////
3798 
/// codepoint flags stored in the high byte of lowercaser table entries;
/// the low 24 bits hold the remapped codepoint value itself
enum
{
	MASK_CODEPOINT			= 0x00ffffffUL,	// mask off codepoint flags
	MASK_FLAGS				= 0xff000000UL, // mask off codepoint value
	FLAG_CODEPOINT_SPECIAL	= 0x01000000UL,	// this codepoint is special
	FLAG_CODEPOINT_DUAL		= 0x02000000UL,	// this codepoint is special but also a valid word part
	FLAG_CODEPOINT_NGRAM	= 0x04000000UL,	// this codepoint is n-gram indexed
	FLAG_CODEPOINT_BOUNDARY	= 0x10000000UL,	// this codepoint is phrase boundary
	FLAG_CODEPOINT_IGNORE	= 0x20000000UL,	// this codepoint is ignored
	FLAG_CODEPOINT_BLEND	= 0x40000000UL	// this codepoint is "blended" (indexed both as a character, and as a separator)
};
3810 
3811 
/// construct an empty lowercaser; call Reset() or SetRemap() before use
CSphLowercaser::CSphLowercaser ()
	: m_pData ( NULL )
{
}
3816 
3817 
Reset()3818 void CSphLowercaser::Reset()
3819 {
3820 	SafeDeleteArray ( m_pData );
3821 	m_pData = new int [ CHUNK_SIZE ];
3822 	memset ( m_pData, 0, CHUNK_SIZE*sizeof(int) ); // NOLINT sizeof(int)
3823 	m_iChunks = 1;
3824 	m_pChunk[0] = m_pData; // chunk 0 must always be allocated, for utf-8 tokenizer shortcut to work
3825 	for ( int i=1; i<CHUNK_COUNT; i++ )
3826 		m_pChunk[i] = NULL;
3827 }
3828 
3829 
/// release the remap table
CSphLowercaser::~CSphLowercaser ()
{
	SafeDeleteArray ( m_pData );
}
3834 
3835 
SetRemap(const CSphLowercaser * pLC)3836 void CSphLowercaser::SetRemap ( const CSphLowercaser * pLC )
3837 {
3838 	if ( !pLC )
3839 		return;
3840 
3841 	SafeDeleteArray ( m_pData );
3842 
3843 	m_iChunks = pLC->m_iChunks;
3844 	m_pData = new int [ m_iChunks*CHUNK_SIZE ];
3845 	memcpy ( m_pData, pLC->m_pData, sizeof(int)*m_iChunks*CHUNK_SIZE ); // NOLINT sizeof(int)
3846 
3847 	for ( int i=0; i<CHUNK_COUNT; i++ )
3848 		m_pChunk[i] = pLC->m_pChunk[i]
3849 			? pLC->m_pChunk[i] - pLC->m_pData + m_pData
3850 			: NULL;
3851 }
3852 
3853 
/// merge a set of remap ranges into the lowercaser table,
/// OR-ing the given codepoint flags into every remapped entry;
/// grows the chunked table as needed to cover the new codepoints
void CSphLowercaser::AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags )
{
	if ( !dRemaps.GetLength() )
		return;

	// build new chunks map
	// 0 means "was unused"
	// 1 means "was used"
	// 2 means "is used now"
	int dUsed [ CHUNK_COUNT ];
	for ( int i=0; i<CHUNK_COUNT; i++ )
		dUsed[i] = m_pChunk[i] ? 1 : 0;

	int iNewChunks = m_iChunks;

	// mark the chunks touched by incoming ranges, counting newly needed ones
	ARRAY_FOREACH ( i, dRemaps )
	{
		const CSphRemapRange & tRemap = dRemaps[i];

		#define LOC_CHECK_RANGE(_a) assert ( (_a)>=0 && (_a)<MAX_CODE );
		LOC_CHECK_RANGE ( tRemap.m_iStart );
		LOC_CHECK_RANGE ( tRemap.m_iEnd );
		LOC_CHECK_RANGE ( tRemap.m_iRemapStart );
		LOC_CHECK_RANGE ( tRemap.m_iRemapStart + tRemap.m_iEnd - tRemap.m_iStart );
		#undef LOC_CHECK_RANGE

		for ( int iChunk=( tRemap.m_iStart >> CHUNK_BITS ); iChunk<=( tRemap.m_iEnd >> CHUNK_BITS ); iChunk++ )
			if ( dUsed[iChunk]==0 )
		{
			dUsed[iChunk] = 2;
			iNewChunks++;
		}
	}

	// alloc new tables and copy, if necessary
	if ( iNewChunks>m_iChunks )
	{
		int * pData = new int [ iNewChunks*CHUNK_SIZE ];
		memset ( pData, 0, sizeof(int)*iNewChunks*CHUNK_SIZE ); // NOLINT sizeof(int)

		// lay out all used chunks contiguously in the new allocation
		int * pChunk = pData;
		for ( int i=0; i<CHUNK_COUNT; i++ )
		{
			int * pOldChunk = m_pChunk[i];

			// build new ptr
			if ( dUsed[i] )
			{
				m_pChunk[i] = pChunk;
				pChunk += CHUNK_SIZE;
			}

			// copy old data
			if ( dUsed[i]==1 )
				memcpy ( m_pChunk[i], pOldChunk, sizeof(int)*CHUNK_SIZE ); // NOLINT sizeof(int)
		}
		assert ( pChunk-pData==iNewChunks*CHUNK_SIZE );

		SafeDeleteArray ( m_pData );
		m_pData = pData;
		m_iChunks = iNewChunks;
	}

	// fill new stuff
	ARRAY_FOREACH ( i, dRemaps )
	{
		const CSphRemapRange & tRemap = dRemaps[i];

		DWORD iRemapped = tRemap.m_iRemapStart;
		for ( int j=tRemap.m_iStart; j<=tRemap.m_iEnd; j++, iRemapped++ )
		{
			assert ( m_pChunk [ j >> CHUNK_BITS ] );
			int & iCodepoint = m_pChunk [ j >> CHUNK_BITS ] [ j & CHUNK_MASK ];
			// a codepoint counts as a word part if it already has a non-zero mapping
			bool bWordPart = ( iCodepoint & MASK_CODEPOINT )!=0;
			int iNew = iRemapped | uFlags | ( iCodepoint & MASK_FLAGS );
			// specials that are also word parts become "dual" codepoints
			if ( bWordPart && ( uFlags & FLAG_CODEPOINT_SPECIAL ) )
				iNew |= FLAG_CODEPOINT_DUAL;
			iCodepoint = iNew;
		}
	}
}
3935 
3936 
AddSpecials(const char * sSpecials)3937 void CSphLowercaser::AddSpecials ( const char * sSpecials )
3938 {
3939 	assert ( sSpecials );
3940 	int iSpecials = strlen(sSpecials);
3941 
3942 	CSphVector<CSphRemapRange> dRemaps;
3943 	dRemaps.Resize ( iSpecials );
3944 	ARRAY_FOREACH ( i, dRemaps )
3945 		dRemaps[i].m_iStart = dRemaps[i].m_iEnd = dRemaps[i].m_iRemapStart = sSpecials[i];
3946 
3947 	AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );
3948 }
3949 
/// assignment via SetRemap(); deep-copies the remap table
const CSphLowercaser & CSphLowercaser::operator = ( const CSphLowercaser & rhs )
{
	SetRemap ( &rhs );
	return * this;
}
3955 
GetFNV() const3956 uint64_t CSphLowercaser::GetFNV () const
3957 {
3958 	int iLen = ( sizeof(int) * m_iChunks * CHUNK_SIZE ) / sizeof(BYTE); // NOLINT
3959 	return sphFNV64 ( m_pData, iLen );
3960 }
3961 
GetMaxCodepointLength() const3962 int CSphLowercaser::GetMaxCodepointLength () const
3963 {
3964 	int iMax = 0;
3965 	for ( int iChunk=0; iChunk<CHUNK_COUNT; iChunk++ )
3966 	{
3967 		int * pChunk = m_pChunk[iChunk];
3968 		if ( !pChunk )
3969 			continue;
3970 
3971 		int * pMax = pChunk + CHUNK_SIZE;
3972 		while ( pChunk<pMax )
3973 		{
3974 			int iCode = *pChunk++ & MASK_CODEPOINT;
3975 			iMax = Max ( iMax, iCode );
3976 		}
3977 	}
3978 	if ( iMax<0x80 )
3979 		return 1;
3980 	if ( iMax<0x800 )
3981 		return 2;
3982 	return 3; // actually, 4 once we hit 0x10000
3983 }
3984 
3985 /////////////////////////////////////////////////////////////////////////////
3986 
3987 /// parser to build lowercaser from textual config
class CSphCharsetDefinitionParser
{
public:
						CSphCharsetDefinitionParser () : m_bError ( false ) {}
	/// parse a charset definition string into remap ranges; false on error
	bool				Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges );
	/// last error message, or NULL if there was no error
	const char *		GetLastError ();

protected:
	bool				m_bError;		// error flag checked by GetLastError()
	char				m_sError [ 1024 ];	// formatted error message
	const char *		m_pCurrent;		// current parse position

	bool				Error ( const char * sMessage );	// set error state; always returns false
	void				SkipSpaces ();
	bool				IsEof ();
	bool				CheckEof ();	// like IsEof(), but raises an error at EOF
	int					HexDigit ( int c );
	int					ParseCharsetCode ();
	bool				AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges );
};
4008 
4009 
GetLastError()4010 const char * CSphCharsetDefinitionParser::GetLastError ()
4011 {
4012 	return m_bError ? m_sError : NULL;
4013 }
4014 
4015 
IsEof()4016 bool CSphCharsetDefinitionParser::IsEof ()
4017 {
4018 	return ( *m_pCurrent )==0;
4019 }
4020 
4021 
CheckEof()4022 bool CSphCharsetDefinitionParser::CheckEof ()
4023 {
4024 	if ( IsEof() )
4025 	{
4026 		Error ( "unexpected end of line" );
4027 		return true;
4028 	} else
4029 	{
4030 		return false;
4031 	}
4032 }
4033 
4034 
Error(const char * sMessage)4035 bool CSphCharsetDefinitionParser::Error ( const char * sMessage )
4036 {
4037 	char sErrorBuffer[32];
4038 	strncpy ( sErrorBuffer, m_pCurrent, sizeof(sErrorBuffer) );
4039 	sErrorBuffer [ sizeof(sErrorBuffer)-1 ] = '\0';
4040 
4041 	snprintf ( m_sError, sizeof(m_sError), "%s near '%s'",
4042 		sMessage, sErrorBuffer );
4043 	m_sError [ sizeof(m_sError)-1 ] = '\0';
4044 
4045 	m_bError = true;
4046 	return false;
4047 }
4048 
4049 
HexDigit(int c)4050 int CSphCharsetDefinitionParser::HexDigit ( int c )
4051 {
4052 	if ( c>='0' && c<='9' ) return c-'0';
4053 	if ( c>='a' && c<='f' ) return c-'a'+10;
4054 	if ( c>='A' && c<='F' ) return c-'A'+10;
4055 	return 0;
4056 }
4057 
4058 
SkipSpaces()4059 void CSphCharsetDefinitionParser::SkipSpaces ()
4060 {
4061 	while ( ( *m_pCurrent ) && isspace ( (BYTE)*m_pCurrent ) )
4062 		m_pCurrent++;
4063 }
4064 
4065 
ParseCharsetCode()4066 int CSphCharsetDefinitionParser::ParseCharsetCode ()
4067 {
4068 	const char * p = m_pCurrent;
4069 	int iCode = 0;
4070 
4071 	if ( p[0]=='U' && p[1]=='+' )
4072 	{
4073 		p += 2;
4074 		while ( isxdigit(*p) )
4075 		{
4076 			iCode = iCode*16 + HexDigit ( *p++ );
4077 		}
4078 		while ( isspace(*p) )
4079 			p++;
4080 
4081 	} else
4082 	{
4083 		if ( (*(BYTE*)p)<32 || (*(BYTE*)p)>127 )
4084 		{
4085 			Error ( "non-ASCII characters not allowed, use 'U+00AB' syntax" );
4086 			return -1;
4087 		}
4088 
4089 		iCode = *p++;
4090 		while ( isspace(*p) )
4091 			p++;
4092 	}
4093 
4094 	m_pCurrent = p;
4095 	return iCode;
4096 }
4097 
AddRange(const CSphRemapRange & tRange,CSphVector<CSphRemapRange> & dRanges)4098 bool CSphCharsetDefinitionParser::AddRange ( const CSphRemapRange & tRange, CSphVector<CSphRemapRange> & dRanges )
4099 {
4100 	if ( tRange.m_iRemapStart>=0x20 )
4101 	{
4102 		dRanges.Add ( tRange );
4103 		return true;
4104 	}
4105 
4106 	CSphString sError;
4107 	sError.SetSprintf ( "dest range (U+%x) below U+20, not allowed", tRange.m_iRemapStart );
4108 	Error ( sError.cstr() );
4109 	return false;
4110 }
4111 
4112 
/// a named, pre-parsed charset definition (e.g. "english", "russian")
struct CharsetAlias_t
{
	CSphString					m_sName;	// alias name as written in charset_table
	int							m_iNameLen;	// cached name length, for prefix matching
	CSphVector<CSphRemapRange>	m_dRemaps;	// pre-parsed remap ranges
};
4119 
// built-in aliases as a flat, NULL-terminated list of (name, definition) pairs
static CSphVector<CharsetAlias_t> g_dCharsetAliases;
static const char * g_sDefaultCharsetAliases[] = { "english", "A..Z->a..z, a..z", "russian", "U+410..U+42F->U+430..U+44F, U+430..U+44F, U+401->U+451, U+451", NULL };
4122 
sphInitCharsetAliasTable(CSphString & sError)4123 bool sphInitCharsetAliasTable ( CSphString & sError ) // FIXME!!! move alias generation to config common section
4124 {
4125 	g_dCharsetAliases.Reset();
4126 	CSphCharsetDefinitionParser tParser;
4127 	CSphVector<CharsetAlias_t> dAliases;
4128 
4129 	for ( int i=0; g_sDefaultCharsetAliases[i]; i+=2 )
4130 	{
4131 		CharsetAlias_t & tCur = dAliases.Add();
4132 		tCur.m_sName = g_sDefaultCharsetAliases[i];
4133 		tCur.m_iNameLen = tCur.m_sName.Length();
4134 
4135 		if ( !tParser.Parse ( g_sDefaultCharsetAliases[i+1], tCur.m_dRemaps ) )
4136 		{
4137 			sError = tParser.GetLastError();
4138 			return false;
4139 		}
4140 	}
4141 
4142 	g_dCharsetAliases.SwapData ( dAliases );
4143 	return true;
4144 }
4145 
4146 
/// parse a full charset_table-style definition (aliases, single chars,
/// ranges, remaps, checkerboard ranges) into a sorted, merged range list;
/// returns false with the error message set on any syntax error
bool CSphCharsetDefinitionParser::Parse ( const char * sConfig, CSphVector<CSphRemapRange> & dRanges )
{
	m_pCurrent = sConfig;
	dRanges.Reset ();

	// do parse
	while ( *m_pCurrent )
	{
		SkipSpaces ();
		if ( IsEof () )
			break;

		// check for stray comma
		if ( *m_pCurrent==',' )
			return Error ( "stray ',' not allowed, use 'U+002C' instead" );

		// alias
		bool bGotAlias = false;
		ARRAY_FOREACH_COND ( i, g_dCharsetAliases, !bGotAlias )
		{
			const CharsetAlias_t & tCur = g_dCharsetAliases[i];
			// an alias must be followed by a comma or the end of line
			bGotAlias = ( strncmp ( tCur.m_sName.cstr(), m_pCurrent, tCur.m_iNameLen )==0 && ( !m_pCurrent[tCur.m_iNameLen] || m_pCurrent[tCur.m_iNameLen]==',' ) );
			if ( !bGotAlias )
				continue;

			// skip to next definition
			m_pCurrent += tCur.m_iNameLen;
			if ( *m_pCurrent && *m_pCurrent==',' )
				m_pCurrent++;

			// expand the alias into its pre-parsed ranges
			ARRAY_FOREACH ( iDef, tCur.m_dRemaps )
			{
				if ( !AddRange ( tCur.m_dRemaps[iDef], dRanges ) )
					return false;
			}
		}
		if ( bGotAlias )
			continue;

		// parse char code
		const char * pStart = m_pCurrent;
		int iStart = ParseCharsetCode();
		if ( iStart<0 )
			return false;

		// stray char?
		if ( !*m_pCurrent || *m_pCurrent==',' )
		{
			// stray char
			if ( !AddRange ( CSphRemapRange ( iStart, iStart, iStart ), dRanges ) )
				return false;

			if ( IsEof () )
				break;
			m_pCurrent++;
			continue;
		}

		// stray remap?
		if ( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' )
		{
			// parse and add
			m_pCurrent += 2;
			int iDest = ParseCharsetCode ();
			if ( iDest<0 )
				return false;
			if ( !AddRange ( CSphRemapRange ( iStart, iStart, iDest ), dRanges ) )
				return false;

			// it's either end of line now, or must be followed by comma
			if ( *m_pCurrent )
				if ( *m_pCurrent++!=',' )
					return Error ( "syntax error" );
			continue;
		}

		// range start?
		if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
			return Error ( "syntax error" );
		m_pCurrent += 2;

		SkipSpaces ();
		if ( CheckEof () )
			return false;

		// parse range end char code
		int iEnd = ParseCharsetCode ();
		if ( iEnd<0 )
			return false;
		if ( iStart>iEnd )
		{
			m_pCurrent = pStart;
			return Error ( "range end less than range start" );
		}

		// stray range?
		if ( !*m_pCurrent || *m_pCurrent==',' )
		{
			if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iStart ), dRanges ) )
				return false;

			if ( IsEof () )
				break;
			m_pCurrent++;
			continue;
		}

		// "checkerboard" range?
		if ( m_pCurrent[0]=='/' && m_pCurrent[1]=='2' )
		{
			// 'a..z/2' maps each odd-position char up to its successor
			// and keeps each even-position char as identity
			for ( int i=iStart; i<iEnd; i+=2 )
			{
				if ( !AddRange ( CSphRemapRange ( i, i, i+1 ), dRanges ) )
					return false;
				if ( !AddRange ( CSphRemapRange ( i+1, i+1, i+1 ), dRanges ) )
					return false;
			}

			// skip "/2", expect ","
			m_pCurrent += 2;
			SkipSpaces ();
			if ( *m_pCurrent )
				if ( *m_pCurrent++!=',' )
					return Error ( "expected end of line or ','" );
			continue;
		}

		// remapped range?
		if (!( m_pCurrent[0]=='-' && m_pCurrent[1]=='>' ))
			return Error ( "expected end of line, ',' or '-><char>'" );
		m_pCurrent += 2;

		SkipSpaces ();
		if ( CheckEof () )
			return false;

		// parse dest start
		const char * pRemapStart = m_pCurrent;
		int iRemapStart = ParseCharsetCode ();
		if ( iRemapStart<0 )
			return false;

		// expect '..'
		if ( CheckEof () )
			return false;
		if (!( m_pCurrent[0]=='.' && m_pCurrent[1]=='.' ))
			return Error ( "expected '..'" );
		m_pCurrent += 2;

		// parse dest end
		int iRemapEnd = ParseCharsetCode ();
		if ( iRemapEnd<0 )
			return false;

		// check dest range
		if ( iRemapStart>iRemapEnd )
		{
			m_pCurrent = pRemapStart;
			return Error ( "dest range end less than dest range start" );
		}

		// check for length mismatch
		if ( ( iRemapEnd-iRemapStart )!=( iEnd-iStart ) )
		{
			m_pCurrent = pStart;
			return Error ( "dest range length must match src range length" );
		}

		// remapped ok
		if ( !AddRange ( CSphRemapRange ( iStart, iEnd, iRemapStart ), dRanges ) )
			return false;

		if ( IsEof () )
			break;
		if ( *m_pCurrent!=',' )
			return Error ( "expected ','" );
		m_pCurrent++;
	}

	// sort, then merge adjacent/overlapping source ranges
	dRanges.Sort ();
	for ( int i=0; i<dRanges.GetLength()-1; i++ )
	{
		if ( dRanges[i].m_iEnd>=dRanges[i+1].m_iStart )
		{
			// FIXME! add an ambiguity check
			dRanges[i].m_iEnd = Max ( dRanges[i].m_iEnd, dRanges[i+1].m_iEnd );
			dRanges.Remove ( i+1 );
			i--;
		}
	}

	return true;
}
4340 
4341 //////////////////////////////////////////////////////////////////////////
4342 
/// convenience wrapper: parse a charset definition into remap ranges
bool sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps )
{
	CSphCharsetDefinitionParser tParser;
	return tParser.Parse ( sCharset, dRemaps );
}
4348 
4349 /////////////////////////////////////////////////////////////////////////////
4350 
/// zero out saved-file metadata (size, timestamps, checksum)
CSphSavedFile::CSphSavedFile ()
	: m_uSize	( 0 )
	, m_uCTime	( 0 )
	, m_uMTime	( 0 )
	, m_uCRC32	( 0 )
{
}
4358 
4359 
/// start with no files embedded
CSphEmbeddedFiles::CSphEmbeddedFiles ()
	: m_bEmbeddedSynonyms	( false )
	, m_bEmbeddedStopwords	( false )
	, m_bEmbeddedWordforms	( false )
{
}
4366 
4367 
Reset()4368 void CSphEmbeddedFiles::Reset()
4369 {
4370 	m_dSynonyms.Reset();
4371 	m_dStopwordFiles.Reset();
4372 	m_dStopwords.Reset();
4373 	m_dWordforms.Reset();
4374 	m_dWordformFiles.Reset();
4375 }
4376 
4377 
/// default tokenizer settings: plain utf-8, min word length 1, no n-grams
CSphTokenizerSettings::CSphTokenizerSettings ()
	: m_iType				( TOKENIZER_UTF8 )
	, m_iMinWordLen			( 1 )
	, m_iNgramLen			( 0 )
{
}
4384 
4385 
/// load tokenizer settings from an index header;
/// field order MUST match SaveTokenizerSettings()
bool LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings,
	CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning )
{
	// pre-v9 headers carry no tokenizer settings at all
	if ( uVersion<9 )
		return true;

	tSettings.m_iType = tReader.GetByte ();
	if ( tSettings.m_iType!=TOKENIZER_UTF8 && tSettings.m_iType!=TOKENIZER_NGRAM )
	{
		sWarning = "can't load an old index with SBCS tokenizer";
		return false;
	}

	tSettings.m_sCaseFolding = tReader.GetString ();
	tSettings.m_iMinWordLen = tReader.GetDword ();

	// v30+ headers may embed the synonyms file contents
	tEmbeddedFiles.m_bEmbeddedSynonyms = false;
	if ( uVersion>=30 )
	{
		tEmbeddedFiles.m_bEmbeddedSynonyms = !!tReader.GetByte();
		if ( tEmbeddedFiles.m_bEmbeddedSynonyms )
		{
			int nSynonyms = (int)tReader.GetDword();
			tEmbeddedFiles.m_dSynonyms.Resize ( nSynonyms );
			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dSynonyms )
				tEmbeddedFiles.m_dSynonyms[i] = tReader.GetString();
		}
	}

	// warn about a missing synonyms file only when it is not embedded
	tSettings.m_sSynonymsFile = tReader.GetString ();
	ReadFileInfo ( tReader, tSettings.m_sSynonymsFile.cstr (),
		tEmbeddedFiles.m_tSynonymFile, tEmbeddedFiles.m_bEmbeddedSynonyms ? NULL : &sWarning );
	tSettings.m_sBoundary = tReader.GetString ();
	tSettings.m_sIgnoreChars = tReader.GetString ();
	tSettings.m_iNgramLen = tReader.GetDword ();
	tSettings.m_sNgramChars = tReader.GetString ();
	if ( uVersion>=15 )
		tSettings.m_sBlendChars = tReader.GetString ();
	if ( uVersion>=24 )
		tSettings.m_sBlendMode = tReader.GetString();

	return true;
}
4428 
4429 
4430 /// gets called from and MUST be in sync with RtIndex_t::SaveDiskHeader()!
4431 /// note that SaveDiskHeader() occasionaly uses some PREVIOUS format version!
void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit )
{
	assert ( pTokenizer );

	// field order below MUST match LoadTokenizerSettings()
	const CSphTokenizerSettings & tSettings = pTokenizer->GetSettings ();
	tWriter.PutByte ( tSettings.m_iType );
	tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
	tWriter.PutDword ( tSettings.m_iMinWordLen );

	// embed the synonyms file contents when it fits the embedding limit
	bool bEmbedSynonyms = pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit;
	tWriter.PutByte ( bEmbedSynonyms ? 1 : 0 );
	if ( bEmbedSynonyms )
		pTokenizer->WriteSynonyms ( tWriter );

	tWriter.PutString ( tSettings.m_sSynonymsFile.cstr () );
	WriteFileInfo ( tWriter, pTokenizer->GetSynFileInfo () );
	tWriter.PutString ( tSettings.m_sBoundary.cstr () );
	tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
	tWriter.PutDword ( tSettings.m_iNgramLen );
	tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
	tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
	tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
4455 
4456 
LoadDictionarySettings(CSphReader & tReader,CSphDictSettings & tSettings,CSphEmbeddedFiles & tEmbeddedFiles,DWORD uVersion,CSphString & sWarning)4457 void LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings,
4458 	CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning )
4459 {
4460 	if ( uVersion<9 )
4461 		return;
4462 
4463 	tSettings.m_sMorphology = tReader.GetString ();
4464 
4465 	tEmbeddedFiles.m_bEmbeddedStopwords = false;
4466 	if ( uVersion>=30 )
4467 	{
4468 		tEmbeddedFiles.m_bEmbeddedStopwords = !!tReader.GetByte();
4469 		if ( tEmbeddedFiles.m_bEmbeddedStopwords )
4470 		{
4471 			int nStopwords = (int)tReader.GetDword();
4472 			tEmbeddedFiles.m_dStopwords.Resize ( nStopwords );
4473 			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dStopwords )
4474 				tEmbeddedFiles.m_dStopwords[i] = (SphWordID_t)tReader.UnzipOffset();
4475 		}
4476 	}
4477 
4478 	tSettings.m_sStopwords = tReader.GetString ();
4479 	int nFiles = tReader.GetDword ();
4480 
4481 	CSphString sFile;
4482 	tEmbeddedFiles.m_dStopwordFiles.Resize ( nFiles );
4483 	for ( int i = 0; i < nFiles; i++ )
4484 	{
4485 		sFile = tReader.GetString ();
4486 		ReadFileInfo ( tReader, sFile.cstr (), tEmbeddedFiles.m_dStopwordFiles[i], tEmbeddedFiles.m_bEmbeddedSynonyms ? NULL : &sWarning );
4487 	}
4488 
4489 	tEmbeddedFiles.m_bEmbeddedWordforms = false;
4490 	if ( uVersion>=30 )
4491 	{
4492 		tEmbeddedFiles.m_bEmbeddedWordforms = !!tReader.GetByte();
4493 		if ( tEmbeddedFiles.m_bEmbeddedWordforms )
4494 		{
4495 			int nWordforms = (int)tReader.GetDword();
4496 			tEmbeddedFiles.m_dWordforms.Resize ( nWordforms );
4497 			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dWordforms )
4498 				tEmbeddedFiles.m_dWordforms[i] = tReader.GetString();
4499 		}
4500 	}
4501 
4502 	if ( uVersion>=29 )
4503 		tSettings.m_dWordforms.Resize ( tReader.GetDword() );
4504 	else
4505 		tSettings.m_dWordforms.Resize(1);
4506 
4507 	tEmbeddedFiles.m_dWordformFiles.Resize ( tSettings.m_dWordforms.GetLength() );
4508 	ARRAY_FOREACH ( i, tSettings.m_dWordforms )
4509 	{
4510 		tSettings.m_dWordforms[i] = tReader.GetString();
4511 		ReadFileInfo ( tReader, tSettings.m_dWordforms[i].cstr(),
4512 			tEmbeddedFiles.m_dWordformFiles[i], tEmbeddedFiles.m_bEmbeddedWordforms ? NULL : &sWarning );
4513 	}
4514 
4515 	if ( uVersion>=13 )
4516 		tSettings.m_iMinStemmingLen = tReader.GetDword ();
4517 
4518 	tSettings.m_bWordDict = false; // default to crc for old indexes
4519 	if ( uVersion>=21 )
4520 	{
4521 		tSettings.m_bWordDict = ( tReader.GetByte()!=0 );
4522 		if ( !tSettings.m_bWordDict )
4523 			sphWarning ( "dict=crc deprecated, use dict=keywords instead" );
4524 	}
4525 
4526 	if ( uVersion>=36 )
4527 		tSettings.m_bStopwordsUnstemmed = ( tReader.GetByte()!=0 );
4528 
4529 	if ( uVersion>=37 )
4530 		tSettings.m_sMorphFingerprint = tReader.GetString();
4531 }
4532 
4533 
4534 /// gets called from and MUST be in sync with RtIndex_t::SaveDiskHeader()!
4535 /// note that SaveDiskHeader() occasionaly uses some PREVIOUS format version!
void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit )
{
	assert ( pDict );
	const CSphDictSettings & tSettings = pDict->GetSettings ();

	// field order below MUST match LoadDictionarySettings()
	tWriter.PutString ( tSettings.m_sMorphology.cstr () );

	// embed the stopwords when their combined size fits the embedding limit
	const CSphVector <CSphSavedFile> & dSWFileInfos = pDict->GetStopwordsFileInfos ();
	SphOffset_t uTotalSize = 0;
	ARRAY_FOREACH ( i, dSWFileInfos )
		uTotalSize += dSWFileInfos[i].m_uSize;

	bool bEmbedStopwords = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
	tWriter.PutByte ( bEmbedStopwords ? 1 : 0 );
	if ( bEmbedStopwords )
		pDict->WriteStopwords ( tWriter );

	tWriter.PutString ( tSettings.m_sStopwords.cstr () );
	tWriter.PutDword ( dSWFileInfos.GetLength () );
	ARRAY_FOREACH ( i, dSWFileInfos )
	{
		tWriter.PutString ( dSWFileInfos[i].m_sFilename.cstr () );
		WriteFileInfo ( tWriter, dSWFileInfos[i] );
	}

	// embed the wordforms when their combined size fits the embedding limit
	const CSphVector <CSphSavedFile> & dWFFileInfos = pDict->GetWordformsFileInfos ();
	uTotalSize = 0;
	ARRAY_FOREACH ( i, dWFFileInfos )
		uTotalSize += dWFFileInfos[i].m_uSize;

	bool bEmbedWordforms = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
	tWriter.PutByte ( bEmbedWordforms ? 1 : 0 );
	if ( bEmbedWordforms )
		pDict->WriteWordforms ( tWriter );

	tWriter.PutDword ( dWFFileInfos.GetLength() );
	ARRAY_FOREACH ( i, dWFFileInfos )
	{
		tWriter.PutString ( dWFFileInfos[i].m_sFilename.cstr() );
		WriteFileInfo ( tWriter, dWFFileInfos[i] );
	}

	tWriter.PutDword ( tSettings.m_iMinStemmingLen );
	tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
	tWriter.PutByte ( tSettings.m_bStopwordsUnstemmed );
	tWriter.PutString ( pDict->GetMorphDataFingerprint() );
}
4582 
4583 
/// load field filter (regexp) settings from an index header
/// NOTE(review): on nRegexps==0 this returns without consuming the trailing
/// deprecated utf-8 flag byte; this mirrors SaveFieldFilterSettings(), which
/// writes only a zero dword when there is no filter — confirm both sides stay
/// in sync before changing either
static void LoadFieldFilterSettings ( CSphReader & tReader, CSphFieldFilterSettings & tFieldFilterSettings )
{
	int nRegexps = tReader.GetDword();
	if ( !nRegexps )
		return;

	tFieldFilterSettings.m_dRegexps.Resize ( nRegexps );
	ARRAY_FOREACH ( i, tFieldFilterSettings.m_dRegexps )
		tFieldFilterSettings.m_dRegexps[i] = tReader.GetString();

	tReader.GetByte(); // deprecated utf-8 flag
}
4596 
4597 
/// save field filter (regexp) settings to an index header;
/// a NULL filter is stored as a bare zero dword (no trailing flag byte),
/// matching the early return in LoadFieldFilterSettings()
void SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter )
{
	if ( !pFieldFilter )
	{
		tWriter.PutDword ( 0 );
		return;
	}

	CSphFieldFilterSettings tSettings;
	pFieldFilter->GetSettings ( tSettings );

	tWriter.PutDword ( tSettings.m_dRegexps.GetLength() );
	ARRAY_FOREACH ( i, tSettings.m_dRegexps )
		tWriter.PutString ( tSettings.m_dRegexps[i] );

	tWriter.PutByte(1); // deprecated utf8 flag
}
4615 
4616 
ShortTokenFilter(BYTE * pToken,int iLen)4617 static inline bool ShortTokenFilter ( BYTE * pToken, int iLen )
4618 {
4619 	return pToken[0]=='*' || ( iLen > 0 && pToken[iLen-1]=='*' );
4620 }
4621 
4622 /////////////////////////////////////////////////////////////////////////////
4623 
/// initialize shared tokenizer state: no pending token, no boundaries,
/// blended-token machinery disabled until configured
ISphTokenizer::ISphTokenizer ()
	: m_iLastTokenLen ( 0 )
	, m_bTokenBoundary ( false )
	, m_bBoundary ( false )
	, m_bWasSpecial ( false )
	, m_bWasSynonym ( false )
	, m_bEscaped ( false )
	, m_iOvershortCount ( 0 )
	, m_eTokenMorph ( SPH_TOKEN_MORPH_RAW )
	, m_bBlended ( false )
	, m_bNonBlended ( true )
	, m_bBlendedPart ( false )
	, m_bBlendAdd ( false )
	, m_uBlendVariants ( BLEND_TRIM_NONE )
	, m_uBlendVariantsPending ( 0 )
	, m_bBlendSkipPure ( false )
	, m_bShortTokenFilter ( false )
	, m_bDetectSentences ( false )
	, m_bPhrase ( false )
{}
4644 
4645 
/// parse and apply a case folding (charset_table) definition;
/// out-of-range mappings are clamped (or dropped entirely) with a warning,
/// then the lowercaser table is rebuilt from scratch
bool ISphTokenizer::SetCaseFolding ( const char * sConfig, CSphString & sError )
{
	CSphVector<CSphRemapRange> dRemaps;
	CSphCharsetDefinitionParser tParser;
	if ( !tParser.Parse ( sConfig, dRemaps ) )
	{
		sError = tParser.GetLastError();
		return false;
	}

	// validate parsed ranges; codepoints below U+21 are never mappable
	const int MIN_CODE = 0x21;
	ARRAY_FOREACH ( i, dRemaps )
	{
		CSphRemapRange & tMap = dRemaps[i];

		if ( tMap.m_iStart<MIN_CODE || tMap.m_iStart>=m_tLC.MAX_CODE )
		{
			sphWarning ( "wrong character mapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
				tMap.m_iStart, MIN_CODE, m_tLC.MAX_CODE-1 );
			tMap.m_iStart = Min ( Max ( tMap.m_iStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
		}

		if ( tMap.m_iEnd<MIN_CODE || tMap.m_iEnd>=m_tLC.MAX_CODE )
		{
			sphWarning ( "wrong character mapping end specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
				tMap.m_iEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
			tMap.m_iEnd = Min ( Max ( tMap.m_iEnd, MIN_CODE ), m_tLC.MAX_CODE-1 );
		}

		if ( tMap.m_iRemapStart<MIN_CODE || tMap.m_iRemapStart>=m_tLC.MAX_CODE )
		{
			sphWarning ( "wrong character remapping start specified: U+%x, should be between U+%x and U+%x (inclusive); CLAMPED",
				tMap.m_iRemapStart, MIN_CODE, m_tLC.MAX_CODE-1 );
			tMap.m_iRemapStart = Min ( Max ( tMap.m_iRemapStart, MIN_CODE ), m_tLC.MAX_CODE-1 );
		}

		// an out-of-range remap end cannot be clamped without breaking the
		// 1:1 source/dest correspondence, so the whole range is dropped
		int iRemapEnd = tMap.m_iRemapStart+tMap.m_iEnd-tMap.m_iStart;
		if ( iRemapEnd<MIN_CODE || iRemapEnd>=m_tLC.MAX_CODE )
		{
			sphWarning ( "wrong character remapping end specified: U+%x, should be between U+%x and U+%x (inclusive); IGNORED",
				iRemapEnd, MIN_CODE, m_tLC.MAX_CODE-1 );
			dRemaps.Remove(i);
			i--;
		}
	}

	m_tLC.Reset ();
	m_tLC.AddRemaps ( dRemaps, 0 );
	return true;
}
4696 
4697 
/// register a single plain (non-special, no flags) character, mapped to itself
/// NOTE(review): 'c' is a plain char, so codepoints above 0x7F would arrive
/// sign-extended on signed-char platforms; callers appear to pass ASCII only --
/// confirm before reusing with high codepoints
void ISphTokenizer::AddPlainChar ( char c )
{
	CSphVector<CSphRemapRange> dTmp ( 1 );
	dTmp[0].m_iStart = dTmp[0].m_iEnd = dTmp[0].m_iRemapStart = c;
	m_tLC.AddRemaps ( dTmp, 0 );
}
4704 
4705 
/// register a set of special characters with the lowercaser
/// (thin forwarder; the actual flagging happens in CSphLowercaser)
void ISphTokenizer::AddSpecials ( const char * sSpecials )
{
	m_tLC.AddSpecials ( sSpecials );
}
4710 
4711 
/// store a copy of the tokenizer settings (type, min word len, ngram len, etc)
/// does NOT apply them; Create() drives the actual application step by step
void ISphTokenizer::Setup ( const CSphTokenizerSettings & tSettings )
{
	m_tSettings = tSettings;
}
4716 
4717 
/// tokenizer factory: instantiate by settings type, then apply every configured
/// option in order (charset_table, synonyms, phrase_boundary, ignore_chars,
/// blend_chars, blend_mode, ngram_len, ngram_chars)
/// returns NULL with sError set on any failure; the scoped pointer makes sure
/// the partially-configured tokenizer is destroyed on those early-out paths
ISphTokenizer * ISphTokenizer::Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError )
{
	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );

	switch ( tSettings.m_iType )
	{
		case TOKENIZER_UTF8:	pTokenizer = sphCreateUTF8Tokenizer (); break;
		case TOKENIZER_NGRAM:	pTokenizer = sphCreateUTF8NgramTokenizer (); break;
		default:
			sError.SetSprintf ( "failed to create tokenizer (unknown charset type '%d')", tSettings.m_iType );
			return NULL;
	}

	pTokenizer->Setup ( tSettings );

	// each error path prefixes the sub-call's error with the offending
	// config directive name (note: SetSprintf reads sError while writing it --
	// relies on CSphString building the new value before replacing the old)
	if ( !tSettings.m_sCaseFolding.IsEmpty () && !pTokenizer->SetCaseFolding ( tSettings.m_sCaseFolding.cstr (), sError ) )
	{
		sError.SetSprintf ( "'charset_table': %s", sError.cstr() );
		return NULL;
	}

	// synonyms may come from an embedded copy inside the index header
	// instead of the external file
	if ( !tSettings.m_sSynonymsFile.IsEmpty () && !pTokenizer->LoadSynonyms ( tSettings.m_sSynonymsFile.cstr (),
		pFiles && pFiles->m_bEmbeddedSynonyms ? pFiles : NULL, sError ) )
	{
		sError.SetSprintf ( "'synonyms': %s", sError.cstr() );
		return NULL;
	}

	if ( !tSettings.m_sBoundary.IsEmpty () && !pTokenizer->SetBoundary ( tSettings.m_sBoundary.cstr (), sError ) )
	{
		sError.SetSprintf ( "'phrase_boundary': %s", sError.cstr() );
		return NULL;
	}

	if ( !tSettings.m_sIgnoreChars.IsEmpty () && !pTokenizer->SetIgnoreChars ( tSettings.m_sIgnoreChars.cstr (), sError ) )
	{
		sError.SetSprintf ( "'ignore_chars': %s", sError.cstr() );
		return NULL;
	}

	if ( !tSettings.m_sBlendChars.IsEmpty () && !pTokenizer->SetBlendChars ( tSettings.m_sBlendChars.cstr (), sError ) )
	{
		sError.SetSprintf ( "'blend_chars': %s", sError.cstr() );
		return NULL;
	}

	// blend_mode is applied unconditionally (an empty spec selects the default)
	if ( !pTokenizer->SetBlendMode ( tSettings.m_sBlendMode.cstr (), sError ) )
	{
		sError.SetSprintf ( "'blend_mode': %s", sError.cstr() );
		return NULL;
	}

	pTokenizer->SetNgramLen ( tSettings.m_iNgramLen );

	if ( !tSettings.m_sNgramChars.IsEmpty () && !pTokenizer->SetNgramChars ( tSettings.m_sNgramChars.cstr (), sError ) )
	{
		sError.SetSprintf ( "'ngram_chars': %s", sError.cstr() );
		return NULL;
	}

	// success; hand ownership over to the caller
	return pTokenizer.LeakPtr ();
}
4780 
4781 
CreateMultiformFilter(ISphTokenizer * pTokenizer,const CSphMultiformContainer * pContainer)4782 ISphTokenizer * ISphTokenizer::CreateMultiformFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
4783 {
4784 	if ( !pContainer )
4785 		return pTokenizer;
4786 	return new CSphMultiformTokenizer ( pTokenizer, pContainer );
4787 }
4788 
4789 
/// wrap a tokenizer into a bigram filter, honoring the bigram_index mode
/// for SPH_BIGRAM_NONE the tokenizer is returned untouched; for modes that use
/// a frequent-words list, sBigramWords is tokenized with the wrapped tokenizer
/// itself so the list matches its folding/specials rules
/// on an empty word list the wrapped tokenizer is destroyed and NULL returned
ISphTokenizer * ISphTokenizer::CreateBigramFilter ( ISphTokenizer * pTokenizer, ESphBigram eBigramIndex, const CSphString & sBigramWords, CSphString & sError )
{
	assert ( pTokenizer );

	if ( eBigramIndex==SPH_BIGRAM_NONE )
		return pTokenizer;

	// SPH_BIGRAM_ALL needs no word list; all other modes do
	CSphVector<CSphString> dFreq;
	if ( eBigramIndex!=SPH_BIGRAM_ALL )
	{
		const BYTE * pTok = NULL;
		pTokenizer->SetBuffer ( (const BYTE*)sBigramWords.cstr(), sBigramWords.Length() );
		while ( ( pTok = pTokenizer->GetToken() )!=NULL )
			dFreq.Add ( (const char*)pTok );

		if ( !dFreq.GetLength() )
		{
			// we own pTokenizer at this point; destroy it before bailing
			SafeDelete ( pTokenizer );
			sError.SetSprintf ( "bigram_freq_words does not contain any valid words" );
			return NULL;
		}
	}

	return new CSphBigramTokenizer ( pTokenizer, eBigramIndex, dFreq );
}
4815 
4816 
/// token filter that routes every raw token through a UDF-style index plugin
/// (index_token_filter); the plugin may rewrite tokens, swallow them, or emit
/// extra ones, and may adjust position deltas
class PluginFilterTokenizer_c : public CSphTokenFilter
{
protected:
	const PluginTokenFilter_c *	m_pFilter;		///< plugin descriptor
	CSphString					m_sOptions;		///< options string for the plugin init()
	void *						m_pUserdata;	///< userdata returned from by the plugin init()
	bool						m_bGotExtra;	///< are we looping through extra tokens?
	int							m_iPosDelta;	///< position delta for the current token, see comments in GetToken()
	bool						m_bWasBlended;	///< whether the last raw token was blended

public:
	/// wrap pTok; bumps the plugin refcount and immediately runs the plugin's
	/// init() against an empty schema (see FIXME below re error handling)
	PluginFilterTokenizer_c ( ISphTokenizer * pTok, const PluginTokenFilter_c * pFilter, const char * sOptions )
		: CSphTokenFilter ( pTok )
		, m_pFilter ( pFilter )
		, m_sOptions ( sOptions )
		, m_pUserdata ( NULL )
		, m_bGotExtra ( false )
		, m_iPosDelta ( 0 )
		, m_bWasBlended ( false )
	{
		assert ( m_pTokenizer );
		assert ( m_pFilter );
		m_pFilter->Use();
		// FIXME!!! handle error in constructor \ move to setup?
		CSphString sError;
		SetFilterSchema ( CSphSchema(), sError );
	}

	/// deinit the plugin instance (if it exports deinit) and drop the refcount
	~PluginFilterTokenizer_c()
	{
		if ( m_pFilter->m_fnDeinit )
			m_pFilter->m_fnDeinit ( m_pUserdata );
		m_pFilter->Release();
	}

	/// clone wrapped tokenizer and re-wrap; the clone re-runs plugin init()
	/// via the ctor, so plugin userdata is NOT shared between clones
	ISphTokenizer * Clone ( ESphTokenizerClone eMode ) const
	{
		ISphTokenizer * pTok = m_pTokenizer->Clone ( eMode );
		return new PluginFilterTokenizer_c ( pTok, m_pFilter, m_sOptions.cstr() );
	}

	/// (re)initialize the plugin against a new index schema
	/// passes the field name list and the options string to the plugin init();
	/// returns false with sError filled from the plugin's error buffer
	virtual bool SetFilterSchema ( const CSphSchema & s, CSphString & sError )
	{
		if ( m_pUserdata && m_pFilter->m_fnDeinit )
			m_pFilter->m_fnDeinit ( m_pUserdata );

		CSphVector<const char*> dFields;
		ARRAY_FOREACH ( i, s.m_dFields )
			dFields.Add ( s.m_dFields[i].m_sName.cstr() );

		char sErrBuf[SPH_UDF_ERROR_LEN+1];
		if ( m_pFilter->m_fnInit ( &m_pUserdata, dFields.GetLength(), dFields.Begin(), m_sOptions.cstr(), sErrBuf )==0 )
			return true;
		sError = sErrBuf;
		return false;
	}

	/// per-document options; forwarded to the plugin begin_document() hook
	/// NOTE(review): m_fnBeginDocument is called unconditionally here, unlike
	/// the NULL-checked optional hooks below -- presumably mandatory; confirm
	virtual bool SetFilterOptions ( const char * sOptions, CSphString & sError )
	{
		char sErrBuf[SPH_UDF_ERROR_LEN+1];
		if ( m_pFilter->m_fnBeginDocument ( m_pUserdata, sOptions, sErrBuf )==0 )
			return true;
		sError = sErrBuf;
		return false;
	}

	/// notify the plugin that tokenization of a new field starts (optional hook)
	virtual void BeginField ( int iField )
	{
		if ( m_pFilter->m_fnBeginField )
			m_pFilter->m_fnBeginField ( m_pUserdata, iField );
	}

	virtual BYTE * GetToken ()
	{
		// we have two principal states here
		// a) have pending extra tokens, keep looping and returning those
		// b) no extras, keep pushing until plugin returns anything
		//
		// we also have to handle position deltas, and that story is a little tricky
		// positions are not assigned in the tokenizer itself (we might wanna refactor that)
		// however, tokenizer has some (partial) control over the keyword positions, too
		// when it skips some too-short tokens, it returns a non-zero value via GetOvershortCount()
		// when it returns a blended token, it returns true via TokenIsBlended()
		// so while the default position delta is 1, overshorts can increase it by N,
		// and blended flag can decrease it by 1, and that's under tokenizer's control
		//
		// so for the plugins, we simplify (well i hope!) this complexity a little
		// we compute a proper position delta here, pass it, and let the plugin modify it
		// we report all tokens as regular, and return the delta via GetOvershortCount()

		// state (a), just loop the pending extras
		if ( m_bGotExtra )
		{
			m_iPosDelta = 1; // default delta is 1
			BYTE * pTok = (BYTE*) m_pFilter->m_fnGetExtraToken ( m_pUserdata, &m_iPosDelta );
			if ( pTok )
				return pTok;
			m_bGotExtra = false;
		}

		// state (b), push raw tokens, return results
		for ( ;; )
		{
			// get next raw token, handle field end
			BYTE * pRaw = m_pTokenizer->GetToken();
			if ( !pRaw )
			{
				// no more hits? notify plugin of a field end,
				// and check if there are pending tokens
				m_bGotExtra = 0;
				if ( m_pFilter->m_fnEndField )
					if ( !m_pFilter->m_fnEndField ( m_pUserdata ) )
						return NULL;

				// got them, start fetching
				m_bGotExtra = true;
				return (BYTE*)m_pFilter->m_fnGetExtraToken ( m_pUserdata, &m_iPosDelta );
			}

			// compute proper position delta
			m_iPosDelta = ( m_bWasBlended ? 0 : 1 ) + m_pTokenizer->GetOvershortCount();
			m_bWasBlended = m_pTokenizer->TokenIsBlended();

			// push raw token to plugin, return a processed one, if any
			int iExtra = 0;
			BYTE * pTok = (BYTE*)m_pFilter->m_fnPushToken ( m_pUserdata, (char*)pRaw, &iExtra, &m_iPosDelta );
			m_bGotExtra = ( iExtra!=0 );
			if ( pTok )
				return pTok;
		}
	}

	/// overshort count doubles as the position-delta channel: the indexer adds
	/// 1 + GetOvershortCount() to the position, so we return (delta - 1)
	virtual int GetOvershortCount()
	{
		return m_iPosDelta-1;
	}

	/// all plugin output is reported as regular tokens; blending was already
	/// folded into the position delta in GetToken()
	virtual bool TokenIsBlended() const
	{
		return false;
	}
};
4959 
4960 
/// wrap a tokenizer into a plugin token filter per an index_token_filter spec
/// ("dll:filtername:options"); an empty spec returns the tokenizer untouched
/// NOTE(review): on lookup failure this overwrites whatever error
/// sphPluginAcquire may have set, and pTokenizer is not deleted on that path --
/// presumably the caller owns it until wrapping succeeds; confirm at call sites
ISphTokenizer * ISphTokenizer::CreatePluginFilter ( ISphTokenizer * pTokenizer, const CSphString & sSpec, CSphString & sError )
{
	CSphVector<CSphString> dPlugin; // dll, filtername, options
	if ( !sphPluginParseSpec ( sSpec, dPlugin, sError ) )
		return NULL;

	if ( !dPlugin.GetLength() )
		return pTokenizer;

	const PluginDesc_c * p = sphPluginAcquire ( dPlugin[0].cstr(), PLUGIN_INDEX_TOKEN_FILTER, dPlugin[1].cstr(), sError );
	if ( !p )
	{
		sError.SetSprintf ( "INTERNAL ERROR: plugin %s:%s loaded ok but lookup fails", dPlugin[0].cstr(), dPlugin[1].cstr() );
		return NULL;
	}
	// the filter ctor calls Use() on the descriptor, so drop our reference
	ISphTokenizer * pPluginTokenizer = new PluginFilterTokenizer_c ( pTokenizer, (const PluginTokenFilter_c *)p, dPlugin[2].cstr() );
	p->Release(); // plugin got owned by filter no need to leak counter
	return pPluginTokenizer;
}
4980 
4981 
4982 #if USE_RLP
/// wrap a tokenizer into an RLP (Chinese morphological segmentation) filter
/// no-op unless rlp_chinese is enabled for the index
/// on Init() failure pRLP is deleted (SafeDelete also NULLs it, so NULL is
/// returned with sError set); NOTE(review): presumably deleting the wrapper
/// also destroys the wrapped pTokenizer it owns -- confirm CSphRLPTokenizer dtor
ISphTokenizer * ISphTokenizer::CreateRLPFilter ( ISphTokenizer * pTokenizer, bool bChineseRLP, const char * szRLPRoot,
												const char * szRLPEnv, const char * szRLPCtx, bool bFilterChinese, CSphString & sError )
{
	assert ( pTokenizer );
	if ( !bChineseRLP )
		return pTokenizer;

	CSphRLPTokenizer * pRLP = new CSphRLPTokenizer ( pTokenizer, szRLPRoot, szRLPEnv, szRLPCtx, bFilterChinese );
	if ( !pRLP->Init ( sError ) )
		SafeDelete ( pRLP );

	return pRLP;
}
4996 
4997 
/// wrap a tokenizer into an RLP result splitter (splits pre-segmented RLP
/// output back into individual tokens); the wrapper owns pTokenizer
ISphTokenizer * ISphTokenizer::CreateRLPResultSplitter ( ISphTokenizer * pTokenizer, const char * szRLPCtx )
{
	assert ( pTokenizer );
	return new CSphRLPResultSplitter ( pTokenizer, szRLPCtx );
}
5003 
/// RAII holder for per-query RLP state (context + token iterator factory)
/// the dtor tears both down in reverse creation order against the global RLP
/// environment, then releases the global RLP refcount via sphRLPFree()
struct QueryRLP_t
{
	BT_RLP_ContextC *				m_pContext;
	BT_RLP_TokenIteratorFactoryC *	m_pFactory;

	QueryRLP_t ()
		: m_pContext ( NULL )
		, m_pFactory ( NULL )
	{ }

	~QueryRLP_t()
	{
		if ( m_pFactory )
			BT_RLP_TokenIteratorFactory_Destroy ( m_pFactory );

		if ( m_pContext )
			BT_RLP_Environment_DestroyContext ( g_pRLPEnv, m_pContext );

		// matches the sphRLPInit() done by ProcessQueryRLP()
		sphRLPFree();
	}
};
5025 
/// append one RLP token (UTF-16) to the output buffer as UTF-8,
/// optionally preceded by a separating space
/// grows the buffer by the worst case (3 bytes per UTF-16 unit for a word of
/// up to SPH_MAX_WORD_LEN, plus NUL), converts in place, then trims the buffer
/// back to the actual converted length
static void AddTokenRLP ( const BT_Char16 * pToken, CSphTightVector<char> & dBuf, bool bAddSpace )
{
	assert ( pToken );
	int iOff = dBuf.GetLength();
	dBuf.Resize ( iOff + SPH_MAX_WORD_LEN*3+1 );

	if ( bAddSpace )
	{
		dBuf[iOff] = ' ';
		iOff++;
	}

	bt_xutf16toutf8 ( dBuf.Begin() + iOff, pToken, SPH_MAX_WORD_LEN*3 );

	// the conversion NUL-terminates; strnlen finds the real token length
	// so the trailing slack (and the NUL itself) can be dropped
	int iTokLen = strnlen ( dBuf.Begin() + iOff, SPH_MAX_WORD_LEN*3 );
	dBuf.Resize ( iOff+iTokLen );
}
5043 
/// run one contiguous Chinese span through the RLP segmenter, appending the
/// resulting space-separated tokens to dBuf
/// bAddSpace controls whether the very first emitted token needs a leading
/// separator; processing errors only warn (iteration still works), while
/// iterator-creation failure sets sError and bails
static void TokenenizeRLP ( const BYTE * sToken, int iLen, bool bAddSpace, QueryRLP_t & tRLP, CSphTightVector<char> & dBuf, CSphString & sError )
{
	assert ( sToken && iLen );

	// iteration should still work ok in this case
	if ( BT_RLP_Context_ProcessBuffer ( tRLP.m_pContext, sToken, iLen, BT_LANGUAGE_SIMPLIFIED_CHINESE, "UTF-8", NULL )!=BT_OK )
		sphWarning ( "BT_RLP_Context_ProcessBuffer error" );

	BT_RLP_TokenIteratorC * pIt = BT_RLP_TokenIteratorFactory_CreateIterator ( tRLP.m_pFactory, tRLP.m_pContext );
	if ( !pIt )
	{
		sError = "BT_RLP_TokenIteratorFactory_CreateIterator error";
		return;
	}

	while ( BT_RLP_TokenIterator_Next ( pIt ) )
	{
		// compounds are expanded into their components (factory was configured
		// with SetReturnCompoundComponents); plain tokens are taken as-is
		int iComponents = BT_RLP_TokenIterator_GetNumberOfCompoundComponents ( pIt );
		if ( !iComponents )
		{
			if ( !BT_RLP_TokenIterator_IsStopword ( pIt ) ) // FIXME!!! manage Chinese stopwords properly or disable them at indexing too
			{
				AddTokenRLP ( BT_RLP_TokenIterator_GetToken ( pIt ), dBuf, bAddSpace );
				bAddSpace = true;
			}
		} else
		{
			for ( int i=0; i<iComponents; i++ )
			{
				if ( !BT_RLP_TokenIterator_IsStopword ( pIt ) ) // FIXME!!! manage Chinese stopwords properly or disable them at indexing too
				{
					AddTokenRLP ( BT_RLP_TokenIterator_GetCompoundComponent ( pIt, i ), dBuf, bAddSpace );
					bAddSpace = true;
				}
			}
		}
	}

	BT_RLP_TokenIterator_Destroy ( pIt );
}
5084 
/// query-syntax operators that must stay glued to the adjacent Chinese token
/// (no separating space inserted around them when splicing RLP output)
static bool IsPullCode ( int iChar )
{
	switch ( iChar )
	{
		case '!':
		case '^':
		case '$':
		case '*':
		case '=':
			return true;
		default:
			return false;
	}
}
5089 
5090 
/// preprocess a query string through RLP: Chinese spans are segmented into
/// space-separated tokens, everything else is copied through verbatim
/// if the query contains no Chinese at all, *sProcessed aliases sQuery and
/// dBuf is left untouched; otherwise *sProcessed points into dBuf
/// returns false with sError set on any RLP setup/iteration failure
bool ISphTokenizer::ProcessQueryRLP ( const char * sRLPContext, const char * sQuery, const char ** sProcessed, CSphTightVector<char> & dBuf, CSphString & sError )
{
	assert ( g_pRLPEnv && sRLPContext && *sRLPContext );
	assert ( sProcessed );

	// fast path: empty query or no Chinese codepoints, nothing to do
	int iQueryLen = sQuery ? strlen ( sQuery ) : 0;
	if ( !iQueryLen || !sphDetectChinese ( (const BYTE *)sQuery, iQueryLen ) )
	{
		*sProcessed = sQuery;
		return true;
	}

	// tRLP's dtor releases context, factory, and the sphRLPInit refcount
	QueryRLP_t tRLP;

	if ( !sphRLPInit ( g_sRLPRoot.cstr(), g_sRLPEnv.cstr(), sError ) )
		return false;

	if ( BT_RLP_Environment_GetContextFromFile ( g_pRLPEnv, sRLPContext, &tRLP.m_pContext )!=BT_OK )
	{
		sError = "Unable to create RLP context";
		return false;
	}

	tRLP.m_pFactory = BT_RLP_TokenIteratorFactory_Create();
	if ( !tRLP.m_pFactory )
	{
		sError = "Unable to create RLP token iterator factory";
		return false;
	}

	// TODO: check that query really doesn't need components
	BT_RLP_TokenIteratorFactory_SetReturnCompoundComponents ( tRLP.m_pFactory, true );

	dBuf.Reserve ( iQueryLen );

	// walk the query, splitting it into alternating Chinese/non-Chinese spans;
	// sSrc tracks the start of the current span, sCur the scan position
	const BYTE * sBegin = (const BYTE * )sQuery;
	const BYTE * sEnd = (const BYTE * )( sQuery + iQueryLen );
	const BYTE * sCur = (const BYTE * )sQuery;
	const BYTE * sSrc = (const BYTE * )sQuery;
	bool bWasChinese = sphIsChineseCode ( sphUTF8Decode ( sCur ) ); // is initial token RLP?

	while ( sCur<sEnd )
	{
		const BYTE * sTokenStart = sCur;
		int iCode = sphUTF8Decode ( sCur );
		bool bGotChinese = sphIsChineseCode ( iCode );

		// still inside the same kind of span; keep scanning
		if ( bGotChinese==bWasChinese )
			continue;

		// span boundary: flush [sSrc, sTokenStart)
		int iLen = sTokenStart - sSrc;
		if ( bWasChinese )
		{
			// check char right before Chinese token and make sure to keep specials together
			bool bAddSpace = ( sSrc-1>=sBegin && !IsPullCode ( sSrc[-1] ) );
			TokenenizeRLP ( sSrc, iLen, bAddSpace, tRLP, dBuf, sError );

			// check char right after Chinese token and make sure to keep specials together
			if ( !IsPullCode ( iCode ) )
				dBuf.Add ( ' ' );
		} else
		{
			// non-Chinese span; verbatim copy
			char * sDst = dBuf.AddN ( iLen );
			memcpy ( sDst, sSrc, iLen );
		}

		bWasChinese = bGotChinese;
		sSrc = sTokenStart;
	}

	// copy query tail
	int iLen = sCur - sSrc;
	if ( bWasChinese )
	{
		// check char right before Chinese token and make sure to keep specials together
		bool bAddSpace = ( sSrc-1>=sBegin && !IsPullCode ( sSrc[-1] ) );
		TokenenizeRLP ( sSrc, iLen, bAddSpace, tRLP, dBuf, sError );
	} else
	{
		char * sDst = dBuf.AddN ( iLen );
		memcpy ( sDst, sSrc, iLen );
	}
	// NOTE(review): the success branch below adds a second NUL, so the buffer
	// ends up double-terminated -- looks redundant; confirm nothing relies on it
	dBuf.Add ( '\0' );

	if ( sError.IsEmpty() )
	{
		dBuf.Add ( '\0' );
		*sProcessed = dBuf.Begin();
		return true;
	} else
	{
		dBuf.Reset();
		return false;
	}
}
5186 
5187 #endif
5188 
/// register SPZ (sentence/paragraph/zone) special characters
/// refuses (with sError naming the sDirective, e.g. "index_sp") if any of the
/// requested characters is already claimed by ngram_chars, phrase_boundary,
/// or ignore_chars, since those roles are mutually exclusive with specials
bool ISphTokenizer::AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError )
{
	for ( int i=0; sSpecials[i]; i++ )
	{
		// lowercaser stores per-codepoint role flags alongside the mapping
		int iCode = m_tLC.ToLower ( sSpecials[i] );
		if ( iCode & ( FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_BOUNDARY | FLAG_CODEPOINT_IGNORE ) )
		{
			sError.SetSprintf ( "%s requires that character '%c' is not in ngram_chars, phrase_boundary, or ignore_chars",
				sDirective, sSpecials[i] );
			return false;
		}
	}

	AddSpecials ( sSpecials );
	return true;
}
5205 
5206 
EnableSentenceIndexing(CSphString & sError)5207 bool ISphTokenizer::EnableSentenceIndexing ( CSphString & sError )
5208 {
5209 	const char sSpecials[] = { '.', '?', '!', MAGIC_CODE_PARAGRAPH, 0 };
5210 
5211 	if ( !AddSpecialsSPZ ( sSpecials, "index_sp", sError ) )
5212 		return false;
5213 
5214 	m_bDetectSentences = true;
5215 	return true;
5216 }
5217 
5218 
/// enable zone (index_zones) indexing by registering the zone magic code as a
/// special; same conflict rules as any SPZ special (see AddSpecialsSPZ)
bool ISphTokenizer::EnableZoneIndexing ( CSphString & sError )
{
	static const char sSpecials[] = { MAGIC_CODE_ZONE, 0 };
	return AddSpecialsSPZ ( sSpecials, "index_zones", sError );
}
5224 
/// FNV64 fingerprint of the tokenizer configuration (lowercaser table, blend
/// and short-token flags, blend variants, type, min word len, ngram len)
/// NOTE: the hashing order and the exact set of inputs are part of the
/// fingerprint format; changing them invalidates previously computed hashes
uint64_t ISphTokenizer::GetSettingsFNV () const
{
	uint64_t uHash = m_tLC.GetFNV();

	// pack boolean settings into distinct bits of one dword
	DWORD uFlags = 0;
	if ( m_bBlendSkipPure )
		uFlags |= 1<<1;
	if ( m_bShortTokenFilter )
		uFlags |= 1<<2;
	uHash = sphFNV64 ( &uFlags, sizeof(uFlags), uHash );
	uHash = sphFNV64 ( &m_uBlendVariants, sizeof(m_uBlendVariants), uHash );

	uHash = sphFNV64 ( &m_tSettings.m_iType, sizeof(m_tSettings.m_iType), uHash );
	uHash = sphFNV64 ( &m_tSettings.m_iMinWordLen, sizeof(m_tSettings.m_iMinWordLen), uHash );
	uHash = sphFNV64 ( &m_tSettings.m_iNgramLen, sizeof(m_tSettings.m_iNgramLen), uHash );

	return uHash;
}
5243 
5244 //////////////////////////////////////////////////////////////////////////
5245 
/// base buffer-driven tokenizer ctor; no input buffer attached yet, empty
/// accumulator, no exceptions (synonyms) trie, index-mode clone semantics
CSphTokenizerBase::CSphTokenizerBase ()
	: m_pBuffer		( NULL )
	, m_pBufferMax	( NULL )
	, m_pCur		( NULL )
	, m_pTokenStart ( NULL )
	, m_pTokenEnd	( NULL )
	, m_iAccum		( 0 )
	, m_pExc		( NULL )
	, m_bHasBlend	( false )
	, m_pBlendStart		( NULL )
	, m_pBlendEnd		( NULL )
	, m_eMode ( SPH_CLONE_INDEX )
{
	// accumulator write cursor starts at the head of the fixed accum buffer
	m_pAccum = m_sAccum;
}
5261 
5262 
/// dtor; the exceptions (synonyms) trie is the only owned heap resource
CSphTokenizerBase::~CSphTokenizerBase()
{
	SafeDelete ( m_pExc );
}
5267 
5268 
/// apply charset_table on top of the base implementation
/// must precede LoadSynonyms(): the exceptions trie is built against the
/// folding table, so refolding afterwards would invalidate it
bool CSphTokenizerBase::SetCaseFolding ( const char * sConfig, CSphString & sError )
{
	// lightweight clones share the lowercaser with their parent; never refold them
	assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
	if ( m_pExc )
	{
		sError = "SetCaseFolding() must not be called after LoadSynonyms()";
		return false;
	}
	// a fresh folding table resets any previously configured blend chars
	m_bHasBlend = false;
	return ISphTokenizer::SetCaseFolding ( sConfig, sError );
}
5280 
5281 
/// apply blend_chars and remember whether blending is active at all
/// (m_bHasBlend gates the whole blended-token machinery in GetToken paths)
bool CSphTokenizerBase::SetBlendChars ( const char * sConfig, CSphString & sError )
{
	assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
	m_bHasBlend = ISphTokenizer::SetBlendChars ( sConfig, sError );
	return m_bHasBlend;
}
5288 
5289 
/// build the exceptions (synonyms) trie, either from a copy embedded in the
/// index header (pFiles) or from the external file sFilename
/// malformed lines only warn and are skipped; an unreadable file fails;
/// an empty/absent filename (file branch only) is silently a no-op
bool CSphTokenizerBase::LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError )
{
	assert ( m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );

	ExceptionsTrieGen_c g;
	if ( pFiles )
	{
		m_tSynFileInfo = pFiles->m_tSynonymFile;
		ARRAY_FOREACH ( i, pFiles->m_dSynonyms )
		{
			// NOTE(review): 'i' here is 0-based, unlike the 1-based iLine in
			// the file branch below -- embedded warnings are off by one
			if ( !g.ParseLine ( (char*)pFiles->m_dSynonyms[i].cstr(), sError ) )
				sphWarning ( "%s line %d: %s", pFiles->m_tSynonymFile.m_sFilename.cstr(), i, sError.cstr() );
		}
	} else
	{
		if ( !sFilename || !*sFilename )
			return true;

		// record file size/mtime so index headers can embed/verify the file
		GetFileStats ( sFilename, m_tSynFileInfo, NULL );

		CSphAutoreader tReader;
		if ( !tReader.Open ( sFilename, sError ) )
			return false;

		char sBuffer[1024];
		int iLine = 0;
		while ( tReader.GetLine ( sBuffer, sizeof(sBuffer) )>=0 )
		{
			iLine++;
			if ( !g.ParseLine ( sBuffer, sError ) )
				sphWarning ( "%s line %d: %s", sFilename, iLine, sError.cstr() );
		}
	}

	// Build() hands over an owned trie (or NULL when there were no valid lines)
	m_pExc = g.Build();
	return true;
}
5327 
5328 
/// serialize the exceptions trie into an index header;
/// a single zero dword marks "no synonyms configured"
void CSphTokenizerBase::WriteSynonyms ( CSphWriter & tWriter )
{
	if ( m_pExc )
		m_pExc->Export ( tWriter );
	else
		tWriter.PutDword ( 0 );
}
5336 
5337 
/// copy shared state from pFrom into this freshly constructed clone,
/// specializing per clone mode:
///   INDEX             - full deep copy for indexing
///   QUERY             - deep copy, plus '\\' registered as a special (query
///                       syntax escaping) and blend variants disabled
///   QUERY_LIGHTWEIGHT - shares the parent's lowercaser chunk tables by
///                       pointer instead of copying them (parent must outlive
///                       the clone)
void CSphTokenizerBase::CloneBase ( const CSphTokenizerBase * pFrom, ESphTokenizerClone eMode )
{
	m_eMode = eMode;
	// deep-copy the exceptions trie so clones can be destroyed independently
	m_pExc = NULL;
	if ( pFrom->m_pExc )
	{
		m_pExc = new ExceptionsTrie_c();
		*m_pExc = *pFrom->m_pExc;
	}
	m_tSettings = pFrom->m_tSettings;
	m_bHasBlend = pFrom->m_bHasBlend;
	m_uBlendVariants = pFrom->m_uBlendVariants;
	m_bBlendSkipPure = pFrom->m_bBlendSkipPure;
	// query-time tokenizers let too-short wildcard tokens through
	m_bShortTokenFilter = ( eMode!=SPH_CLONE_INDEX );

	switch ( eMode )
	{
		case SPH_CLONE_INDEX:
			assert ( pFrom->m_eMode==SPH_CLONE_INDEX );
			m_tLC = pFrom->m_tLC;
			break;

		case SPH_CLONE_QUERY:
		{
			assert ( pFrom->m_eMode==SPH_CLONE_INDEX || pFrom->m_eMode==SPH_CLONE_QUERY );
			m_tLC = pFrom->m_tLC;

			// make backslash a special so query-syntax escapes get parsed
			CSphVector<CSphRemapRange> dRemaps;
			CSphRemapRange Range;
			Range.m_iStart = Range.m_iEnd = Range.m_iRemapStart = '\\';
			dRemaps.Add ( Range );
			m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_SPECIAL );

			m_uBlendVariants = BLEND_TRIM_NONE;
			break;
		}

		case SPH_CLONE_QUERY_LIGHTWEIGHT:
		{
			// FIXME? avoid double lightweight clones, too?
			assert ( pFrom->m_eMode!=SPH_CLONE_INDEX );
			assert ( pFrom->m_tLC.ToLower ( '\\' ) & FLAG_CODEPOINT_SPECIAL );

			// lightweight tokenizer clone
			// copy 3 KB of lowercaser chunk pointers, but do NOT copy the table data
			SafeDeleteArray ( m_tLC.m_pData );
			m_tLC.m_iChunks = 0;
			m_tLC.m_pData = NULL;
			for ( int i=0; i<CSphLowercaser::CHUNK_COUNT; i++ )
				m_tLC.m_pChunk[i] = pFrom->m_tLC.m_pChunk[i];
			break;
		}
	}
}
5392 
/// extend the base settings fingerprint with the blend-enabled flag
/// (same stability caveat: hash layout is part of the fingerprint format)
uint64_t CSphTokenizerBase::GetSettingsFNV () const
{
	uint64_t uHash = ISphTokenizer::GetSettingsFNV();

	DWORD uFlags = 0;
	if ( m_bHasBlend )
		uFlags |= 1<<0;
	uHash = sphFNV64 ( &uFlags, sizeof(uFlags), uHash );

	return uHash;
}
5404 
5405 
/// reposition the parse cursor inside the already attached buffer and reset
/// all per-token state (accumulator, token bounds, blending bounds)
/// sNewPtr is clamped into [m_pBuffer, m_pBufferMax] defensively even though
/// the assert requires it to be in range already
void CSphTokenizerBase::SetBufferPtr ( const char * sNewPtr )
{
	assert ( (BYTE*)sNewPtr>=m_pBuffer && (BYTE*)sNewPtr<=m_pBufferMax );
	m_pCur = Min ( m_pBufferMax, Max ( m_pBuffer, (const BYTE*)sNewPtr ) );
	m_iAccum = 0;
	m_pAccum = m_sAccum;
	m_pTokenStart = m_pTokenEnd = NULL;
	m_pBlendStart = m_pBlendEnd = NULL;
}
5415 
5416 
/// skip past the current blended token, counting how many qualifying
/// (min-length) non-blended subtokens it contains; returns 0 when there is
/// no blended token pending
/// works by temporarily capping the buffer at the blended token's end and
/// re-scanning it codepoint by codepoint
int CSphTokenizerBase2::SkipBlended()
{
	if ( !m_pBlendEnd )
		return 0;

	// temporarily shrink the buffer to the blended token bounds
	const BYTE * pMax = m_pBufferMax;
	m_pBufferMax = m_pBlendEnd;

	// loop until the blended token end
	int iBlended = 0; // how many blended subtokens we have seen so far
	int iAccum = 0; // how many non-blended chars in a row we have seen so far
	while ( m_pCur < m_pBufferMax )
	{
		int iCode = GetCodepoint();
		if ( iCode=='\\' )
			iCode = GetCodepoint(); // no boundary check, GetCP does it
		iCode = m_tLC.ToLower ( iCode ); // no -1 check, ToLower does it
		if ( iCode<0 )
			iCode = 0;
		if ( iCode & FLAG_CODEPOINT_BLEND )
			iCode = 0;
		if ( iCode & MASK_CODEPOINT )
		{
			// plain (non-blend) codepoint; extend the current subtoken
			iAccum++;
			continue;
		}
		// subtoken boundary; count it if it met the min length
		if ( iAccum>=m_tSettings.m_iMinWordLen )
			iBlended++;
		iAccum = 0;
	}
	// flush the trailing subtoken, if any
	if ( iAccum>=m_tSettings.m_iMinWordLen )
		iBlended++;

	// restore the real buffer bound
	m_pBufferMax = pMax;
	return iBlended;
}
5453 
5454 
/// adjusts blending magic when we're about to return a token (any token)
/// returns false if current token should be skipped, true otherwise
/// side effects: may rewind m_pCur to re-parse the blended token's subtokens,
/// and maintains m_pBlendStart/m_pBlendEnd/m_bBlendedPart across calls
bool CSphTokenizerBase::BlendAdjust ( const BYTE * pCur )
{
	// check if all we got is a bunch of blended characters (pure-blended case)
	if ( m_bBlended && !m_bNonBlended )
	{
		// we either skip this token, or pretend it was normal
		// in both cases, clear the flag
		m_bBlended = false;

		// do we need to skip it?
		if ( m_bBlendSkipPure )
		{
			m_pBlendStart = NULL;
			return false;
		}
	}
	m_bNonBlended = false;

	// adjust buffer pointers
	if ( m_bBlended && m_pBlendStart )
	{
		// called once per blended token, on processing start
		// at this point, full blended token is in the accumulator
		// and we're about to return it
		m_pCur = m_pBlendStart;
		m_pBlendEnd = pCur;
		m_pBlendStart = NULL;
		m_bBlendedPart = true;
	} else if ( pCur>=m_pBlendEnd )
	{
		// tricky bit, as at this point, token we're about to return
		// can either be a blended subtoken, or the next one
		m_bBlendedPart = ( m_pTokenStart!=NULL ) && ( m_pTokenStart<m_pBlendEnd );
		m_pBlendEnd = NULL;
		m_pBlendStart = NULL;
	} else if ( !m_pBlendEnd )
	{
		// we aren't re-parsing blended; so clear the "blended subtoken" flag
		m_bBlendedPart = false;
	}
	return true;
}
5499 
5500 
CopySubstring(BYTE * pDst,const BYTE * pSrc,int iLen)5501 static inline void CopySubstring ( BYTE * pDst, const BYTE * pSrc, int iLen )
5502 {
5503 	while ( iLen-->0 && *pSrc )
5504 		*pDst++ = *pSrc++;
5505 	*pDst++ = '\0';
5506 }
5507 
5508 
GetBlendedVariant()5509 BYTE * CSphTokenizerBase2::GetBlendedVariant ()
5510 {
5511 	// we can get called on several occasions
5512 	// case 1, a new blended token was just accumulated
5513 	if ( m_bBlended && !m_bBlendAdd )
5514 	{
5515 		// fast path for the default case (trim_none)
5516 		if ( m_uBlendVariants==BLEND_TRIM_NONE )
5517 			return m_sAccum;
5518 
5519 		// analyze the full token, find non-blended bounds
5520 		m_iBlendNormalStart = -1;
5521 		m_iBlendNormalEnd = -1;
5522 
5523 		// OPTIMIZE? we can skip this based on non-blended flag from adjust
5524 		const BYTE * p = m_sAccum;
5525 		while ( *p )
5526 		{
5527 			int iLast = (int)( p-m_sAccum );
5528 			int iCode = sphUTF8Decode(p);
5529 			if (!( m_tLC.ToLower ( iCode ) & FLAG_CODEPOINT_BLEND ))
5530 			{
5531 				m_iBlendNormalEnd = (int)( p-m_sAccum );
5532 				if ( m_iBlendNormalStart<0 )
5533 					m_iBlendNormalStart = iLast;
5534 			}
5535 		}
5536 
5537 		// build todo mask
5538 		// check and revert a few degenerate cases
5539 		m_uBlendVariantsPending = m_uBlendVariants;
5540 		if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
5541 		{
5542 			if ( m_iBlendNormalStart<0 )
5543 			{
5544 				// no heading blended; revert BOTH to TAIL
5545 				m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
5546 				m_uBlendVariantsPending |= BLEND_TRIM_TAIL;
5547 			} else if ( m_iBlendNormalEnd<0 )
5548 			{
5549 				// no trailing blended; revert BOTH to HEAD
5550 				m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
5551 				m_uBlendVariantsPending |= BLEND_TRIM_HEAD;
5552 			}
5553 		}
5554 		if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
5555 		{
5556 			// either no heading blended, or pure blended; revert HEAD to NONE
5557 			if ( m_iBlendNormalStart<=0 )
5558 			{
5559 				m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
5560 				m_uBlendVariantsPending |= BLEND_TRIM_NONE;
5561 			}
5562 		}
5563 		if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
5564 		{
5565 			// either no trailing blended, or pure blended; revert TAIL to NONE
5566 			if ( m_iBlendNormalEnd<=0 || m_sAccum[m_iBlendNormalEnd]==0 )
5567 			{
5568 				m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
5569 				m_uBlendVariantsPending |= BLEND_TRIM_NONE;
5570 			}
5571 		}
5572 
5573 		// ok, we are going to return a few variants after all, flag that
5574 		// OPTIMIZE? add fast path for "single" variants?
5575 		m_bBlendAdd = true;
5576 		assert ( m_uBlendVariantsPending );
5577 
5578 		// we also have to stash the original blended token
5579 		// because accumulator contents may get trashed by caller (say, when stemming)
5580 		strncpy ( (char*)m_sAccumBlend, (char*)m_sAccum, sizeof(m_sAccumBlend) );
5581 	}
5582 
5583 	// case 2, caller is checking for pending variants, have we even got any?
5584 	if ( !m_bBlendAdd )
5585 		return NULL;
5586 
5587 	// handle trim_none
5588 	// this MUST be the first handler, so that we could avoid copying below, and just return the original accumulator
5589 	if ( m_uBlendVariantsPending & BLEND_TRIM_NONE )
5590 	{
5591 		m_uBlendVariantsPending &= ~BLEND_TRIM_NONE;
5592 		m_bBlended = true;
5593 		return m_sAccum;
5594 	}
5595 
5596 	// handle trim_both
5597 	if ( m_uBlendVariantsPending & BLEND_TRIM_BOTH )
5598 	{
5599 		m_uBlendVariantsPending &= ~BLEND_TRIM_BOTH;
5600 		if ( m_iBlendNormalStart<0 )
5601 			m_uBlendVariantsPending |= BLEND_TRIM_TAIL; // no heading blended; revert BOTH to TAIL
5602 		else if ( m_iBlendNormalEnd<0 )
5603 			m_uBlendVariantsPending |= BLEND_TRIM_HEAD; // no trailing blended; revert BOTH to HEAD
5604 		else
5605 		{
5606 			assert ( m_iBlendNormalStart<m_iBlendNormalEnd );
5607 			CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, m_iBlendNormalEnd-m_iBlendNormalStart );
5608 			m_bBlended = true;
5609 			return m_sAccum;
5610 		}
5611 	}
5612 
5613 	// handle TRIM_HEAD
5614 	if ( m_uBlendVariantsPending & BLEND_TRIM_HEAD )
5615 	{
5616 		m_uBlendVariantsPending &= ~BLEND_TRIM_HEAD;
5617 		if ( m_iBlendNormalStart>=0 )
5618 		{
5619 			// FIXME! need we check for overshorts?
5620 			CopySubstring ( m_sAccum, m_sAccumBlend+m_iBlendNormalStart, sizeof(m_sAccum) );
5621 			m_bBlended = true;
5622 			return m_sAccum;
5623 		}
5624 	}
5625 
5626 	// handle TRIM_TAIL
5627 	if ( m_uBlendVariantsPending & BLEND_TRIM_TAIL )
5628 	{
5629 		m_uBlendVariantsPending &= ~BLEND_TRIM_TAIL;
5630 		if ( m_iBlendNormalEnd>0 )
5631 		{
5632 			// FIXME! need we check for overshorts?
5633 			CopySubstring ( m_sAccum, m_sAccumBlend, m_iBlendNormalEnd );
5634 			m_bBlended = true;
5635 			return m_sAccum;
5636 		}
5637 	}
5638 
5639 	// all clear, no more variants to go
5640 	m_bBlendAdd = false;
5641 	return NULL;
5642 }
5643 
5644 
IsCapital(int iCh)5645 static inline bool IsCapital ( int iCh )
5646 {
5647 	return iCh>='A' && iCh<='Z';
5648 }
5649 
5650 
IsWhitespace(BYTE c)5651 static inline bool IsWhitespace ( BYTE c )
5652 {
5653 	return ( c=='\0' || c==' ' || c=='\t' || c=='\r' || c=='\n' );
5654 }
5655 
5656 
IsWhitespace(int c)5657 static inline bool IsWhitespace ( int c )
5658 {
5659 	return ( c=='\0' || c==' ' || c=='\t' || c=='\r' || c=='\n' );
5660 }
5661 
5662 
IsBoundary(BYTE c,bool bPhrase)5663 static inline bool IsBoundary ( BYTE c, bool bPhrase )
5664 {
5665 	// FIXME? sorta intersects with specials
5666 	// then again, a shortened-down list (more strict syntax) is reasonble here too
5667 	return IsWhitespace(c) || c=='"' || ( !bPhrase && ( c=='(' || c==')' || c=='|' ) );
5668 }
5669 
5670 
IsPunctuation(int c)5671 static inline bool IsPunctuation ( int c )
5672 {
5673 	return ( c>=33 && c<=47 ) || ( c>=58 && c<=64 ) || ( c>=91 && c<=96 ) || ( c>=123 && c<=126 );
5674 }
5675 
5676 
CodepointArbitrationI(int iCode)5677 int CSphTokenizerBase::CodepointArbitrationI ( int iCode )
5678 {
5679 	if ( !m_bDetectSentences )
5680 		return iCode;
5681 
5682 	// detect sentence boundaries
5683 	// FIXME! should use charset_table (or add a new directive) and support languages other than English
5684 	int iSymbol = iCode & MASK_CODEPOINT;
5685 	if ( iSymbol=='?' || iSymbol=='!' )
5686 	{
5687 		// definitely a sentence boundary
5688 		return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
5689 	}
5690 
5691 	if ( iSymbol=='.' )
5692 	{
5693 		// inline dot ("in the U.K and"), not a boundary
5694 		bool bInwordDot = ( sphIsAlpha ( m_pCur[0] ) || m_pCur[0]==',' );
5695 
5696 		// followed by a small letter or an opening paren, not a boundary
5697 		// FIXME? might want to scan for more than one space
5698 		// Yoyodine Inc. exists ...
5699 		// Yoyodine Inc. (the company) ..
5700 		bool bInphraseDot = ( sphIsSpace ( m_pCur[0] )
5701 			&& ( ( 'a'<=m_pCur[1] && m_pCur[1]<='z' )
5702 				|| ( m_pCur[1]=='(' && 'a'<=m_pCur[2] && m_pCur[2]<='z' ) ) );
5703 
5704 		// preceded by something that looks like a middle name, opening first name, salutation
5705 		bool bMiddleName = false;
5706 		switch ( m_iAccum )
5707 		{
5708 			case 1:
5709 				// 1-char capital letter
5710 				// example: J. R. R. Tolkien, who wrote Hobbit ...
5711 				// example: John D. Doe ...
5712 				bMiddleName = IsCapital ( m_pCur[-2] );
5713 				break;
5714 			case 2:
5715 				// 2-char token starting with a capital
5716 				if ( IsCapital ( m_pCur[-3] ) )
5717 				{
5718 					// capital+small
5719 					// example: Known as Mr. Doe ...
5720 					if ( !IsCapital ( m_pCur[-2] ) )
5721 						bMiddleName = true;
5722 
5723 					// known capital+capital (MR, DR, MS)
5724 					if (
5725 						( m_pCur[-3]=='M' && m_pCur[-2]=='R' ) ||
5726 						( m_pCur[-3]=='M' && m_pCur[-2]=='S' ) ||
5727 						( m_pCur[-3]=='D' && m_pCur[-2]=='R' ) )
5728 							bMiddleName = true;
5729 				}
5730 				break;
5731 			case 3:
5732 				// preceded by a known 3-byte token (MRS, DRS)
5733 				// example: Survived by Mrs. Doe ...
5734 				if ( ( m_sAccum[0]=='m' || m_sAccum[0]=='d' ) && m_sAccum[1]=='r' && m_sAccum[2]=='s' )
5735 					bMiddleName = true;
5736 				break;
5737 		}
5738 
5739 		if ( !bInwordDot && !bInphraseDot && !bMiddleName )
5740 		{
5741 			// sentence boundary
5742 			return MAGIC_CODE_SENTENCE | FLAG_CODEPOINT_SPECIAL;
5743 		} else
5744 		{
5745 			// just a character
5746 			if ( ( iCode & MASK_FLAGS )==FLAG_CODEPOINT_SPECIAL )
5747 				return 0; // special only, not dual? then in this context, it is a separator
5748 			else
5749 				return iCode & ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL ); // perhaps it was blended, so return the original code
5750 		}
5751 	}
5752 
5753 	// pass-through
5754 	return iCode;
5755 }
5756 
5757 
CodepointArbitrationQ(int iCode,bool bWasEscaped,BYTE uNextByte)5758 int CSphTokenizerBase::CodepointArbitrationQ ( int iCode, bool bWasEscaped, BYTE uNextByte )
5759 {
5760 	if ( iCode & FLAG_CODEPOINT_NGRAM )
5761 		return iCode; // ngrams are handled elsewhere
5762 
5763 	int iSymbol = iCode & MASK_CODEPOINT;
5764 
5765 	// codepoints can't be blended and special at the same time
5766 	if ( ( iCode & FLAG_CODEPOINT_BLEND ) && ( iCode & FLAG_CODEPOINT_SPECIAL ) )
5767 	{
5768 		bool bBlend =
5769 			bWasEscaped || // escaped characters should always act as blended
5770 			( m_bPhrase && !sphIsModifier ( iSymbol ) && iSymbol!='"' ) || // non-modifier special inside phrase
5771 			( m_iAccum && ( iSymbol=='@' || iSymbol=='/' || iSymbol=='-' ) ); // some specials in the middle of a token
5772 
5773 		// clear special or blend flags
5774 		iCode &= bBlend
5775 			? ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_SPECIAL )
5776 			: ~( FLAG_CODEPOINT_DUAL | FLAG_CODEPOINT_BLEND );
5777 	}
5778 
5779 	// escaped specials are not special
5780 	// dash and dollar inside the word are not special (however, single opening modifier is not a word!)
5781 	// non-modifier specials within phrase are not special
5782 	bool bDashInside = ( m_iAccum && iSymbol=='-' && !( m_iAccum==1 && sphIsModifier ( m_sAccum[0] ) ));
5783 	if ( iCode & FLAG_CODEPOINT_SPECIAL )
5784 		if ( bWasEscaped
5785 			|| bDashInside
5786 			|| ( m_iAccum && iSymbol=='$' && !IsBoundary ( uNextByte, m_bPhrase ) )
5787 			|| ( m_bPhrase && iSymbol!='"' && !sphIsModifier ( iSymbol ) ) )
5788 	{
5789 		if ( iCode & FLAG_CODEPOINT_DUAL )
5790 			iCode &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
5791 		else
5792 			iCode = 0;
5793 	}
5794 
5795 	// if we didn't remove special by now, it must win
5796 	if ( iCode & FLAG_CODEPOINT_DUAL )
5797 	{
5798 		assert ( iCode & FLAG_CODEPOINT_SPECIAL );
5799 		iCode = iSymbol | FLAG_CODEPOINT_SPECIAL;
5800 	}
5801 
5802 	// ideally, all conflicts must be resolved here
5803 	// well, at least most
5804 	assert ( sphBitCount ( iCode & MASK_FLAGS )<=1 );
5805 	return iCode;
5806 }
5807 
5808 #if !USE_WINDOWS
5809 #define __forceinline inline
5810 #endif
5811 
IsSeparator(int iFolded,bool bFirst)5812 static __forceinline bool IsSeparator ( int iFolded, bool bFirst )
5813 {
5814 	// eternal separator
5815 	if ( iFolded<0 || ( iFolded & MASK_CODEPOINT )==0 )
5816 		return true;
5817 
5818 	// just a codepoint
5819 	if (!( iFolded & MASK_FLAGS ))
5820 		return false;
5821 
5822 	// any magic flag, besides dual
5823 	if (!( iFolded & FLAG_CODEPOINT_DUAL ))
5824 		return true;
5825 
5826 	// FIXME? n-grams currently also set dual
5827 	if ( iFolded & FLAG_CODEPOINT_NGRAM )
5828 		return true;
5829 
5830 	// dual depends on position
5831 	return bFirst;
5832 }
5833 
5834 // handles escaped specials that are not in the character set
5835 // returns true if the codepoint should be processed as a simple codepoint,
5836 // returns false if it should be processed as a whitespace
5837 // for example: aaa\!bbb => aaa bbb
Special2Simple(int & iCodepoint)5838 static inline bool Special2Simple ( int & iCodepoint )
5839 {
5840 	if ( ( iCodepoint & FLAG_CODEPOINT_DUAL ) || !( iCodepoint & FLAG_CODEPOINT_SPECIAL ) )
5841 	{
5842 		iCodepoint &= ~( FLAG_CODEPOINT_SPECIAL | FLAG_CODEPOINT_DUAL );
5843 		return true;
5844 	}
5845 
5846 	return false;
5847 }
5848 
5849 
RemapCharacters(const char * sConfig,DWORD uFlags,const char * sSource,bool bCanRemap,CSphString & sError)5850 bool ISphTokenizer::RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError )
5851 {
5852 	// parse
5853 	CSphVector<CSphRemapRange> dRemaps;
5854 	CSphCharsetDefinitionParser tParser;
5855 	if ( !tParser.Parse ( sConfig, dRemaps ) )
5856 	{
5857 		sError = tParser.GetLastError();
5858 		return false;
5859 	}
5860 
5861 	// check
5862 	ARRAY_FOREACH ( i, dRemaps )
5863 	{
5864 		const CSphRemapRange & r = dRemaps[i];
5865 
5866 		if ( !bCanRemap && r.m_iStart!=r.m_iRemapStart )
5867 		{
5868 			sError.SetSprintf ( "%s characters must not be remapped (map-from=U+%x, map-to=U+%x)",
5869 				sSource, r.m_iStart, r.m_iRemapStart );
5870 			return false;
5871 		}
5872 
5873 		for ( int j=r.m_iStart; j<=r.m_iEnd; j++ )
5874 			if ( m_tLC.ToLower(j) )
5875 		{
5876 			sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
5877 			return false;
5878 		}
5879 
5880 		if ( bCanRemap )
5881 			for ( int j=r.m_iRemapStart; j<=r.m_iRemapStart + r.m_iEnd - r.m_iStart; j++ )
5882 				if ( m_tLC.ToLower(j) )
5883 		{
5884 			sError.SetSprintf ( "%s characters must not be referenced anywhere else (code=U+%x)", sSource, j );
5885 			return false;
5886 		}
5887 	}
5888 
5889 	// add mapping
5890 	m_tLC.AddRemaps ( dRemaps, uFlags );
5891 	return true;
5892 }
5893 
SetBoundary(const char * sConfig,CSphString & sError)5894 bool ISphTokenizer::SetBoundary ( const char * sConfig, CSphString & sError )
5895 {
5896 	return RemapCharacters ( sConfig, FLAG_CODEPOINT_BOUNDARY, "phrase boundary", false, sError );
5897 }
5898 
SetIgnoreChars(const char * sConfig,CSphString & sError)5899 bool ISphTokenizer::SetIgnoreChars ( const char * sConfig, CSphString & sError )
5900 {
5901 	return RemapCharacters ( sConfig, FLAG_CODEPOINT_IGNORE, "ignored", false, sError );
5902 }
5903 
SetBlendChars(const char * sConfig,CSphString & sError)5904 bool ISphTokenizer::SetBlendChars ( const char * sConfig, CSphString & sError )
5905 {
5906 	return sConfig ? RemapCharacters ( sConfig, FLAG_CODEPOINT_BLEND, "blend", true, sError ) : false;
5907 }
5908 
5909 
sphStrncmp(const char * sCheck,int iCheck,const char * sRef)5910 static bool sphStrncmp ( const char * sCheck, int iCheck, const char * sRef )
5911 {
5912 	return ( iCheck==(int)strlen(sRef) && memcmp ( sCheck, sRef, iCheck )==0 );
5913 }
5914 
5915 
SetBlendMode(const char * sMode,CSphString & sError)5916 bool ISphTokenizer::SetBlendMode ( const char * sMode, CSphString & sError )
5917 {
5918 	if ( !sMode || !*sMode )
5919 	{
5920 		m_uBlendVariants = BLEND_TRIM_NONE;
5921 		m_bBlendSkipPure = false;
5922 		return true;
5923 	}
5924 
5925 	m_uBlendVariants = 0;
5926 	const char * p = sMode;
5927 	while ( *p )
5928 	{
5929 		while ( !sphIsAlpha(*p) )
5930 			p++;
5931 		if ( !*p )
5932 			break;
5933 
5934 		const char * sTok = p;
5935 		while ( sphIsAlpha(*p) )
5936 			p++;
5937 		if ( sphStrncmp ( sTok, p-sTok, "trim_none" ) )
5938 			m_uBlendVariants |= BLEND_TRIM_NONE;
5939 		else if ( sphStrncmp ( sTok, p-sTok, "trim_head" ) )
5940 			m_uBlendVariants |= BLEND_TRIM_HEAD;
5941 		else if ( sphStrncmp ( sTok, p-sTok, "trim_tail" ) )
5942 			m_uBlendVariants |= BLEND_TRIM_TAIL;
5943 		else if ( sphStrncmp ( sTok, p-sTok, "trim_both" ) )
5944 			m_uBlendVariants |= BLEND_TRIM_BOTH;
5945 		else if ( sphStrncmp ( sTok, p-sTok, "skip_pure" ) )
5946 			m_bBlendSkipPure = true;
5947 		else
5948 		{
5949 			sError.SetSprintf ( "unknown blend_mode option near '%s'", sTok );
5950 			return false;
5951 		}
5952 	}
5953 
5954 	if ( !m_uBlendVariants )
5955 	{
5956 		sError.SetSprintf ( "blend_mode must define at least one variant to index" );
5957 		m_uBlendVariants = BLEND_TRIM_NONE;
5958 		m_bBlendSkipPure = false;
5959 		return false;
5960 	}
5961 	return true;
5962 }
5963 
5964 /////////////////////////////////////////////////////////////////////////////
5965 
5966 template < bool IS_QUERY >
CSphTokenizer_UTF8()5967 CSphTokenizer_UTF8<IS_QUERY>::CSphTokenizer_UTF8 ()
5968 {
5969 	CSphString sTmp;
5970 	SetCaseFolding ( SPHINX_DEFAULT_UTF8_TABLE, sTmp );
5971 	m_bHasBlend = false;
5972 }
5973 
5974 
5975 template < bool IS_QUERY >
SetBuffer(const BYTE * sBuffer,int iLength)5976 void CSphTokenizer_UTF8<IS_QUERY>::SetBuffer ( const BYTE * sBuffer, int iLength )
5977 {
5978 	// check that old one is over and that new length is sane
5979 	assert ( iLength>=0 );
5980 
5981 	// set buffer
5982 	m_pBuffer = sBuffer;
5983 	m_pBufferMax = sBuffer + iLength;
5984 	m_pCur = sBuffer;
5985 	m_pTokenStart = m_pTokenEnd = NULL;
5986 	m_pBlendStart = m_pBlendEnd = NULL;
5987 
5988 	m_iOvershortCount = 0;
5989 	m_bBoundary = m_bTokenBoundary = false;
5990 }
5991 
5992 
5993 template < bool IS_QUERY >
GetToken()5994 BYTE * CSphTokenizer_UTF8<IS_QUERY>::GetToken ()
5995 {
5996 	m_bWasSpecial = false;
5997 	m_bBlended = false;
5998 	m_iOvershortCount = 0;
5999 	m_bTokenBoundary = false;
6000 	m_bWasSynonym = false;
6001 
6002 	return m_bHasBlend
6003 		? DoGetToken<IS_QUERY,true>()
6004 		: DoGetToken<IS_QUERY,false>();
6005 }
6006 
6007 
CheckException(const BYTE * pStart,const BYTE * pCur,bool bQueryMode)6008 bool CSphTokenizerBase2::CheckException ( const BYTE * pStart, const BYTE * pCur, bool bQueryMode )
6009 {
6010 	assert ( m_pExc );
6011 	assert ( pStart );
6012 
6013 	// at this point [pStart,pCur) is our regular tokenization candidate,
6014 	// and pCur is pointing at what normally is considered separtor
6015 	//
6016 	// however, it might be either a full exception (if we're really lucky)
6017 	// or (more likely) an exception prefix, so lets check for that
6018 	//
6019 	// interestingly enough, note that our token might contain a full exception
6020 	// as a prefix, for instance [USAF] token vs [USA] exception; but in that case
6021 	// we still need to tokenize regularly, because even exceptions need to honor
6022 	// word boundaries
6023 
6024 	// lets begin with a special (hopefully fast) check for the 1st byte
6025 	const BYTE * p = pStart;
6026 	if ( m_pExc->GetFirst ( *p )<0 )
6027 		return false;
6028 
6029 	// consume all the (character data) bytes until the first separator
6030 	int iNode = 0;
6031 	while ( p<pCur )
6032 	{
6033 		if ( bQueryMode && *p=='\\' )
6034 		{
6035 			p++;
6036 			continue;;
6037 		}
6038 		iNode = m_pExc->GetNext ( iNode, *p++ );
6039 		if ( iNode<0 )
6040 			return false;
6041 	}
6042 
6043 	const BYTE * pMapEnd = NULL; // the longest exception found so far is [pStart,pMapEnd)
6044 	const BYTE * pMapTo = NULL; // the destination mapping
6045 
6046 	// now, we got ourselves a valid exception prefix, so lets keep consuming more bytes,
6047 	// ie. until further separators, and keep looking for a full exception match
6048 	while ( iNode>=0 )
6049 	{
6050 		// in query mode, ignore quoting slashes
6051 		if ( bQueryMode && *p=='\\' )
6052 		{
6053 			p++;
6054 			continue;
6055 		}
6056 
6057 		// decode one more codepoint, check if it is a separator
6058 		bool bSep = true;
6059 		bool bSpace = sphIsSpace(*p); // okay despite utf-8, cause hard whitespace is all ascii-7
6060 
6061 		const BYTE * q = p;
6062 		if ( p<m_pBufferMax )
6063 			bSep = IsSeparator ( m_tLC.ToLower ( sphUTF8Decode(q) ), false ); // FIXME? sometimes they ARE first
6064 
6065 		// there is a separator ahead, so check if we have a full match
6066 		if ( bSep && m_pExc->GetMapping(iNode) )
6067 		{
6068 			pMapEnd = p;
6069 			pMapTo = m_pExc->GetMapping(iNode);
6070 		}
6071 
6072 		// eof? bail
6073 		if ( p>=m_pBufferMax )
6074 			break;
6075 
6076 		// not eof? consume those bytes
6077 		if ( bSpace )
6078 		{
6079 			// and fold (hard) whitespace while we're at it!
6080 			while ( sphIsSpace(*p) )
6081 				p++;
6082 			iNode = m_pExc->GetNext ( iNode, ' ' );
6083 		} else
6084 		{
6085 			// just consume the codepoint, byte-by-byte
6086 			while ( p<q && iNode>=0 )
6087 				iNode = m_pExc->GetNext ( iNode, *p++ );
6088 		}
6089 
6090 		// we just consumed a separator, so check for a full match again
6091 		if ( iNode>=0 && bSep && m_pExc->GetMapping(iNode) )
6092 		{
6093 			pMapEnd = p;
6094 			pMapTo = m_pExc->GetMapping(iNode);
6095 		}
6096 	}
6097 
6098 	// found anything?
6099 	if ( !pMapTo )
6100 		return false;
6101 
6102 	strncpy ( (char*)m_sAccum, (char*)pMapTo, sizeof(m_sAccum) );
6103 	m_pCur = pMapEnd;
6104 	m_pTokenStart = pStart;
6105 	m_pTokenEnd = pMapEnd;
6106 	m_iLastTokenLen = strlen ( (char*)m_sAccum );
6107 
6108 	m_bWasSynonym = true;
6109 	return true;
6110 }
6111 
/// the tokenization workhorse; scans the buffer for the next token, handling
/// query-mode escaping, ignored/blended/special codepoints, boundaries,
/// exceptions, and short-token filtering along the way
/// returns the token in the accumulator, or NULL when the buffer is exhausted
template < bool IS_QUERY, bool IS_BLEND >
BYTE * CSphTokenizerBase2::DoGetToken ()
{
	// return pending blending variants
	if_const ( IS_BLEND )
	{
		BYTE * pVar = GetBlendedVariant ();
		if ( pVar )
			return pVar;
		m_bBlendedPart = ( m_pBlendEnd!=NULL );
	}

	// in query mode, lets capture (soft-whitespace hard-whitespace) sequences and adjust overshort counter
	// sample queries would be (one NEAR $$$) or (one | $$$ two) where $ is not a valid character
	bool bGotNonToken = ( !IS_QUERY || m_bPhrase ); // only do this in query mode, never in indexing mode, never within phrases
	bool bGotSoft = false; // hey Beavis he said soft huh huhhuh

	m_pTokenStart = NULL;
	for ( ;; )
	{
		// get next codepoint
		const BYTE * const pCur = m_pCur; // to redo special char, if there's a token already

		int iCodePoint;
		int iCode;
		if ( pCur<m_pBufferMax && *pCur<128 )
		{
			// fast path; ASCII bytes fold via a direct table lookup
			iCodePoint = *m_pCur++;
			iCode = m_tLC.m_pChunk[0][iCodePoint];
		} else
		{
			iCodePoint = GetCodepoint(); // advances m_pCur
			iCode = m_tLC.ToLower ( iCodePoint );
		}

		// handle escaping
		bool bWasEscaped = ( IS_QUERY && iCodePoint=='\\' ); // whether current codepoint was escaped
		if ( bWasEscaped )
		{
			// fold the codepoint after the backslash instead;
			// non-dual specials degrade to separators
			iCodePoint = GetCodepoint();
			iCode = m_tLC.ToLower ( iCodePoint );
			if ( !Special2Simple ( iCode ) )
				iCode = 0;
		}

		// handle eof
		if ( iCode<0 )
		{
			FlushAccum ();

			// suddenly, exceptions
			if ( m_pExc && m_pTokenStart && CheckException ( m_pTokenStart, pCur, IS_QUERY ) )
				return m_sAccum;

			// skip trailing short word
			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen )
			{
				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) )
				{
					if ( m_iLastTokenLen )
						m_iOvershortCount++;
					m_iLastTokenLen = 0;
					if_const ( IS_BLEND )
						BlendAdjust ( pCur );
					return NULL;
				}
			}

			// keep token end here as BlendAdjust might change m_pCur
			m_pTokenEnd = m_pCur;

			// return trailing word
			if_const ( IS_BLEND && !BlendAdjust ( pCur ) )
				return NULL;
			if_const ( IS_BLEND && m_bBlended )
				return GetBlendedVariant();
			return m_sAccum;
		}

		// handle all the flags..
		if_const ( IS_QUERY )
			iCode = CodepointArbitrationQ ( iCode, bWasEscaped, *m_pCur );
		else if ( m_bDetectSentences )
			iCode = CodepointArbitrationI ( iCode );

		// handle ignored chars
		if ( iCode & FLAG_CODEPOINT_IGNORE )
			continue;

		// handle blended characters
		if_const ( IS_BLEND && ( iCode & FLAG_CODEPOINT_BLEND ) )
		{
			// a second blended region within one token acts as a separator
			if ( m_pBlendEnd )
				iCode = 0;
			else
			{
				m_bBlended = true;
				m_pBlendStart = m_iAccum ? m_pTokenStart : pCur;
			}
		}

		// handle soft-whitespace-only tokens
		if ( !bGotNonToken && !m_iAccum )
		{
			if ( !bGotSoft )
			{
				// detect opening soft whitespace
				if ( ( iCode==0 && !IsWhitespace ( iCodePoint ) && !IsPunctuation ( iCodePoint ) )
					|| ( ( iCode & FLAG_CODEPOINT_BLEND ) && !m_iAccum ) )
				{
					bGotSoft = true;
				}
			} else
			{
				// detect closing hard whitespace or special
				// (if there was anything meaningful in the meantime, we must never get past the outer if!)
				if ( IsWhitespace ( iCodePoint ) || ( iCode & FLAG_CODEPOINT_SPECIAL ) )
				{
					m_iOvershortCount++;
					bGotNonToken = true;
				}
			}
		}

		// handle whitespace and boundary
		if ( m_bBoundary && ( iCode==0 ) )
		{
			m_bTokenBoundary = true;
			m_iBoundaryOffset = pCur - m_pBuffer - 1;
		}
		m_bBoundary = ( iCode & FLAG_CODEPOINT_BOUNDARY )!=0;

		// handle separator (aka, most likely a token!)
		if ( iCode==0 || m_bBoundary )
		{
			FlushAccum ();

			// suddenly, exceptions
			if ( m_pExc && CheckException ( m_pTokenStart ? m_pTokenStart : pCur, pCur, IS_QUERY ) )
				return m_sAccum;

			if_const ( IS_BLEND && !BlendAdjust ( pCur ) )
				continue;

			// drop the token if it is too short (unless the filter keeps it)
			if ( m_iLastTokenLen<m_tSettings.m_iMinWordLen
				&& !( m_bShortTokenFilter && ShortTokenFilter ( m_sAccum, m_iLastTokenLen ) ) )
			{
				if ( m_iLastTokenLen )
					m_iOvershortCount++;
				continue;
			} else
			{
				m_pTokenEnd = pCur;
				if_const ( IS_BLEND && m_bBlended )
					return GetBlendedVariant();
				return m_sAccum;
			}
		}

		// handle specials
		if ( iCode & FLAG_CODEPOINT_SPECIAL )
		{
			// skip short words preceding specials
			if ( m_iAccum<m_tSettings.m_iMinWordLen )
			{
				m_sAccum[m_iAccum] = '\0';

				if ( !m_bShortTokenFilter || !ShortTokenFilter ( m_sAccum, m_iAccum ) )
				{
					if ( m_iAccum )
						m_iOvershortCount++;

					FlushAccum ();
				}
			}

			if ( m_iAccum==0 )
			{
				m_bNonBlended = m_bNonBlended || ( !( iCode & FLAG_CODEPOINT_BLEND ) && !( iCode & FLAG_CODEPOINT_SPECIAL ) );
				m_bWasSpecial = !( iCode & FLAG_CODEPOINT_NGRAM );
				m_pTokenStart = pCur;
				m_pTokenEnd = m_pCur;
				AccumCodepoint ( iCode & MASK_CODEPOINT ); // handle special as a standalone token
			} else
			{
				m_pCur = pCur; // we need to flush current accum and then redo special char again
				m_pTokenEnd = pCur;
			}

			FlushAccum ();

			// suddenly, exceptions
			if ( m_pExc && CheckException ( m_pTokenStart, pCur, IS_QUERY ) )
				return m_sAccum;

			if_const ( IS_BLEND )
			{
				if ( !BlendAdjust ( pCur ) )
					continue;
				if ( m_bBlended )
					return GetBlendedVariant();
			}
			return m_sAccum;
		}

		// a regular codepoint; remember where the token began
		if ( m_iAccum==0 )
			m_pTokenStart = pCur;

		// tricky bit
		// heading modifiers must not (!) affected blended status
		// eg. we want stuff like '=-' (w/o apostrophes) thrown away when pure_blend is on
		if_const ( IS_BLEND )
			if_const (!( IS_QUERY && !m_iAccum && sphIsModifier ( iCode & MASK_CODEPOINT ) ) )
				m_bNonBlended = m_bNonBlended || !( iCode & FLAG_CODEPOINT_BLEND );

		// just accumulate
		// manual inlining of utf8 encoder gives us a few extra percent
		// which is important here, this is a hotspot
		if ( m_iAccum<SPH_MAX_WORD_LEN && ( m_pAccum-m_sAccum+SPH_MAX_UTF8_BYTES<=(int)sizeof(m_sAccum) ) )
		{
			iCode &= MASK_CODEPOINT;
			m_iAccum++;
			SPH_UTF8_ENCODE ( m_pAccum, iCode );
		}
	}
}
6338 
6339 
FlushAccum()6340 void CSphTokenizerBase2::FlushAccum ()
6341 {
6342 	assert ( m_pAccum-m_sAccum < (int)sizeof(m_sAccum) );
6343 	m_iLastTokenLen = m_iAccum;
6344 	*m_pAccum = 0;
6345 	m_iAccum = 0;
6346 	m_pAccum = m_sAccum;
6347 }
6348 
6349 
6350 template < bool IS_QUERY >
Clone(ESphTokenizerClone eMode) const6351 ISphTokenizer * CSphTokenizer_UTF8<IS_QUERY>::Clone ( ESphTokenizerClone eMode ) const
6352 {
6353 	CSphTokenizerBase * pClone;
6354 	if ( eMode!=SPH_CLONE_INDEX )
6355 		pClone = new CSphTokenizer_UTF8<true>();
6356 	else
6357 		pClone = new CSphTokenizer_UTF8<false>();
6358 	pClone->CloneBase ( this, eMode );
6359 	return pClone;
6360 }
6361 
6362 
6363 template < bool IS_QUERY >
GetCodepointLength(int iCode) const6364 int CSphTokenizer_UTF8<IS_QUERY>::GetCodepointLength ( int iCode ) const
6365 {
6366 	if ( iCode<128 )
6367 		return 1;
6368 
6369 	int iBytes = 0;
6370 	while ( iCode & 0x80 )
6371 	{
6372 		iBytes++;
6373 		iCode <<= 1;
6374 	}
6375 
6376 	assert ( iBytes>=2 && iBytes<=4 );
6377 	return iBytes;
6378 }
6379 
6380 /////////////////////////////////////////////////////////////////////////////
6381 
6382 template < bool IS_QUERY >
SetNgramChars(const char * sConfig,CSphString & sError)6383 bool CSphTokenizer_UTF8Ngram<IS_QUERY>::SetNgramChars ( const char * sConfig, CSphString & sError )
6384 {
6385 	assert ( this->m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
6386 	CSphVector<CSphRemapRange> dRemaps;
6387 	CSphCharsetDefinitionParser tParser;
6388 	if ( !tParser.Parse ( sConfig, dRemaps ) )
6389 	{
6390 		sError = tParser.GetLastError();
6391 		return false;
6392 	}
6393 
6394 	// gcc braindamage requires this
6395 	this->m_tLC.AddRemaps ( dRemaps, FLAG_CODEPOINT_NGRAM | FLAG_CODEPOINT_SPECIAL ); // !COMMIT support other n-gram lengths than 1
6396 	m_sNgramCharsStr = sConfig;
6397 	return true;
6398 }
6399 
6400 
6401 template < bool IS_QUERY >
SetNgramLen(int iLen)6402 void CSphTokenizer_UTF8Ngram<IS_QUERY>::SetNgramLen ( int iLen )
6403 {
6404 	assert ( this->m_eMode!=SPH_CLONE_QUERY_LIGHTWEIGHT );
6405 	assert ( iLen>0 );
6406 	m_iNgramLen = iLen;
6407 }
6408 
6409 
6410 template < bool IS_QUERY >
GetToken()6411 BYTE * CSphTokenizer_UTF8Ngram<IS_QUERY>::GetToken ()
6412 {
6413 	// !COMMIT support other n-gram lengths than 1
6414 	assert ( m_iNgramLen==1 );
6415 	return CSphTokenizer_UTF8<IS_QUERY>::GetToken ();
6416 }
6417 
6418 //////////////////////////////////////////////////////////////////////////
6419 
CSphMultiformTokenizer(ISphTokenizer * pTokenizer,const CSphMultiformContainer * pContainer)6420 CSphMultiformTokenizer::CSphMultiformTokenizer ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer )
6421 	: CSphTokenFilter ( pTokenizer )
6422 	, m_pMultiWordforms ( pContainer )
6423 	, m_iStart	( 0 )
6424 	, m_iOutputPending ( -1 )
6425 	, m_pCurrentForm ( NULL )
6426 	, m_szPendingBufferPtr ( NULL )
6427 	, m_bBuildMultiform	( false )
6428 {
6429 	assert ( pTokenizer && pContainer );
6430 	m_dStoredTokens.Reserve ( pContainer->m_iMaxTokens + 6 ); // max form tokens + some blended tokens
6431 	m_sTokenizedMultiform[0] = '\0';
6432 }
6433 
6434 
~CSphMultiformTokenizer()6435 CSphMultiformTokenizer::~CSphMultiformTokenizer ()
6436 {
6437 	SafeDelete ( m_pTokenizer );
6438 }
6439 
6440 
GetToken()6441 BYTE * CSphMultiformTokenizer::GetToken ()
6442 {
6443 	if ( m_iOutputPending > -1 && m_pCurrentForm )
6444 	{
6445 		if ( ++m_iOutputPending>=m_pCurrentForm->m_dNormalForm.GetLength() )
6446 		{
6447 			m_iOutputPending = -1;
6448 			m_szPendingBufferPtr = NULL;
6449 			m_pCurrentForm = NULL;
6450 		} else
6451 		{
6452 			bool bLastForm = ( m_iOutputPending==m_pCurrentForm->m_dNormalForm.GetLength()-1 );
6453 
6454 			StoredToken_t & tStart = m_dStoredTokens[m_iStart];
6455 			strncpy ( (char *)tStart.m_sToken, m_pCurrentForm->m_dNormalForm[m_iOutputPending].m_sForm.cstr(), sizeof(tStart.m_sToken) );
6456 			if ( bLastForm )
6457 				tStart.m_pBufferPtr = m_szPendingBufferPtr;
6458 
6459 			tStart.m_iTokenLen = m_pCurrentForm->m_dNormalForm[m_iOutputPending].m_iLengthCP;
6460 			tStart.m_bBoundary = false;
6461 			tStart.m_bSpecial = false;
6462 			tStart.m_bBlended = false;
6463 			tStart.m_bBlendedPart = false;
6464 			return tStart.m_sToken;
6465 		}
6466 	}
6467 
6468 	m_sTokenizedMultiform[0] = '\0';
6469 	m_iStart++;
6470 
6471 	if ( m_iStart>=m_dStoredTokens.GetLength() )
6472 	{
6473 		m_iStart = 0;
6474 		m_dStoredTokens.Resize ( 0 );
6475 		const BYTE * pToken = m_pTokenizer->GetToken();
6476 		if ( !pToken )
6477 			return NULL;
6478 
6479 		FillStoredTokenInfo ( m_dStoredTokens.Add(), pToken, m_pTokenizer );
6480 		while ( m_dStoredTokens.Last().m_bBlended || m_dStoredTokens.Last().m_bBlendedPart )
6481 		{
6482 			pToken = m_pTokenizer->GetToken ();
6483 			if ( !pToken )
6484 				break;
6485 
6486 			FillStoredTokenInfo ( m_dStoredTokens.Add(), pToken, m_pTokenizer );
6487 		}
6488 	}
6489 
6490 	CSphMultiforms ** pWordforms = NULL;
6491 	int iTokensGot = 1;
6492 	bool bBlended = false;
6493 
6494 	// check multi-form
6495 	// only blended parts checked for multi-form with blended
6496 	// in case ALL blended parts got transformed primary blended got replaced by normal form
6497 	// otherwise blended tokens provided as is
6498 	if ( m_dStoredTokens[m_iStart].m_bBlended || m_dStoredTokens[m_iStart].m_bBlendedPart )
6499 	{
6500 		if ( m_dStoredTokens[m_iStart].m_bBlended && m_iStart+1<m_dStoredTokens.GetLength() && m_dStoredTokens[m_iStart+1].m_bBlendedPart )
6501 		{
6502 			pWordforms = m_pMultiWordforms->m_Hash ( (const char *)m_dStoredTokens[m_iStart+1].m_sToken );
6503 			if ( pWordforms )
6504 			{
6505 				bBlended = true;
6506 				for ( int i=m_iStart+2; i<m_dStoredTokens.GetLength(); i++ )
6507 				{
6508 					// break out on blended over or got completely different blended
6509 					if ( m_dStoredTokens[i].m_bBlended || !m_dStoredTokens[i].m_bBlendedPart )
6510 						break;
6511 
6512 					iTokensGot++;
6513 				}
6514 			}
6515 		}
6516 	} else
6517 	{
6518 		pWordforms = m_pMultiWordforms->m_Hash ( (const char *)m_dStoredTokens[m_iStart].m_sToken );
6519 		if ( pWordforms )
6520 		{
6521 			int iTokensNeed = (*pWordforms)->m_iMaxTokens + 1;
6522 			int iCur = m_iStart;
6523 			bool bGotBlended = false;
6524 
6525 			// collect up ahead to multi-form tokens or all blended tokens
6526 			while ( iTokensGot<iTokensNeed || bGotBlended )
6527 			{
6528 				iCur++;
6529 				if ( iCur>=m_dStoredTokens.GetLength() )
6530 				{
6531 					// fetch next token
6532 					const BYTE* pToken = m_pTokenizer->GetToken ();
6533 					if ( !pToken )
6534 						break;
6535 
6536 					FillStoredTokenInfo ( m_dStoredTokens.Add(), pToken, m_pTokenizer );
6537 				}
6538 
6539 				bool bCurBleneded = ( m_dStoredTokens[iCur].m_bBlended || m_dStoredTokens[iCur].m_bBlendedPart );
6540 				if ( bGotBlended && !bCurBleneded )
6541 					break;
6542 
6543 				bGotBlended = bCurBleneded;
6544 				// count only regular tokens; can not fold mixed (regular+blended) tokens to form
6545 				iTokensGot += ( bGotBlended ? 0 : 1 );
6546 			}
6547 		}
6548 	}
6549 
6550 	if ( !pWordforms || iTokensGot<(*pWordforms)->m_iMinTokens+1 )
6551 		return m_dStoredTokens[m_iStart].m_sToken;
6552 
6553 	int iStartToken = m_iStart + ( bBlended ? 1 : 0 );
6554 	ARRAY_FOREACH ( i, (*pWordforms)->m_pForms )
6555 	{
6556 		const CSphMultiform * pCurForm = (*pWordforms)->m_pForms[i];
6557 		int iFormTokCount = pCurForm->m_dTokens.GetLength();
6558 
6559 		if ( iTokensGot<iFormTokCount+1 || ( bBlended && iTokensGot!=iFormTokCount+1 ) )
6560 			continue;
6561 
6562 		int iForm = 0;
6563 		for ( ; iForm<iFormTokCount; iForm++ )
6564 		{
6565 			const StoredToken_t & tTok = m_dStoredTokens[iStartToken + 1 + iForm];
6566 			const char * szStored = (const char*)tTok.m_sToken;
6567 			const char * szNormal = pCurForm->m_dTokens[iForm].cstr ();
6568 
6569 			if ( *szNormal!=*szStored || strcasecmp ( szNormal, szStored ) )
6570 				break;
6571 		}
6572 
6573 		// early out - no destination form detected
6574 		if ( iForm!=iFormTokCount )
6575 			continue;
6576 
6577 		// tokens after folded form are valid tail that should be processed next time
6578 		if ( m_bBuildMultiform )
6579 		{
6580 			BYTE * pOut = m_sTokenizedMultiform;
6581 			BYTE * pMax = pOut + sizeof(m_sTokenizedMultiform);
6582 			for ( int j=0; j<iFormTokCount+1 && pOut<pMax; j++ )
6583 			{
6584 				const StoredToken_t & tTok = m_dStoredTokens[iStartToken+j];
6585 				const BYTE * sTok = tTok.m_sToken;
6586 				if ( j && pOut<pMax )
6587 					*pOut++ = ' ';
6588 				while ( *sTok && pOut<pMax )
6589 					*pOut++ = *sTok++;
6590 			}
6591 			*pOut = '\0';
6592 			*(pMax-1) = '\0';
6593 		}
6594 
6595 		if ( !bBlended )
6596 		{
6597 			// fold regular tokens to form
6598 			StoredToken_t & tStart = m_dStoredTokens[m_iStart];
6599 			StoredToken_t & tEnd = m_dStoredTokens[m_iStart+iFormTokCount];
6600 			m_iStart += iFormTokCount;
6601 
6602 			strncpy ( (char *)tEnd.m_sToken, pCurForm->m_dNormalForm[0].m_sForm.cstr(), sizeof(tEnd.m_sToken) );
6603 			tEnd.m_szTokenStart = tStart.m_szTokenStart;
6604 			tEnd.m_iTokenLen = pCurForm->m_dNormalForm[0].m_iLengthCP;
6605 
6606 			tEnd.m_bBoundary = false;
6607 			tEnd.m_bSpecial = false;
6608 			tEnd.m_bBlended = false;
6609 			tEnd.m_bBlendedPart = false;
6610 
6611 			if ( pCurForm->m_dNormalForm.GetLength()>1 )
6612 			{
6613 				m_iOutputPending = 0;
6614 				m_szPendingBufferPtr = tEnd.m_szTokenStart;
6615 				tEnd.m_pBufferPtr = m_szPendingBufferPtr;
6616 				m_pCurrentForm = pCurForm;
6617 			}
6618 		} else
6619 		{
6620 			// replace blended by form
6621 			// FIXME: add multiple destination token support here (if needed)
6622 			assert ( pCurForm->m_dNormalForm.GetLength()==1 );
6623 			StoredToken_t & tDst = m_dStoredTokens[m_iStart];
6624 			strncpy ( (char *)tDst.m_sToken, pCurForm->m_dNormalForm[0].m_sForm.cstr(), sizeof(tDst.m_sToken) );
6625 			tDst.m_iTokenLen = pCurForm->m_dNormalForm[0].m_iLengthCP;
6626 		}
6627 		break;
6628 	}
6629 
6630 	return m_dStoredTokens[m_iStart].m_sToken;
6631 }
6632 
6633 
Clone(ESphTokenizerClone eMode) const6634 ISphTokenizer * CSphMultiformTokenizer::Clone ( ESphTokenizerClone eMode ) const
6635 {
6636 	ISphTokenizer * pClone = m_pTokenizer->Clone ( eMode );
6637 	return CreateMultiformFilter ( pClone, m_pMultiWordforms );
6638 }
6639 
6640 
SetBufferPtr(const char * sNewPtr)6641 void CSphMultiformTokenizer::SetBufferPtr ( const char * sNewPtr )
6642 {
6643 	m_iStart = 0;
6644 	m_iOutputPending = -1;
6645 	m_pCurrentForm = NULL;
6646 	m_dStoredTokens.Resize ( 0 );
6647 	m_pTokenizer->SetBufferPtr ( sNewPtr );
6648 }
6649 
/// attach a new buffer; wrapped tokenizer gets it first, then our own
/// multiform state is reset by repositioning to the buffer start
void CSphMultiformTokenizer::SetBuffer ( const BYTE * sBuffer, int iLength )
{
	m_pTokenizer->SetBuffer ( sBuffer, iLength );
	SetBufferPtr ( (const char *)sBuffer );
}
6655 
GetSettingsFNV() const6656 uint64_t CSphMultiformTokenizer::GetSettingsFNV () const
6657 {
6658 	uint64_t uHash = CSphTokenFilter::GetSettingsFNV();
6659 	uHash ^= (uint64_t)m_pMultiWordforms;
6660 	return uHash;
6661 }
6662 
6663 
SkipBlended()6664 int CSphMultiformTokenizer::SkipBlended ()
6665 {
6666 	bool bGotBlended = ( m_iStart<m_dStoredTokens.GetLength() &&
6667 		( m_dStoredTokens[m_iStart].m_bBlended || m_dStoredTokens[m_iStart].m_bBlendedPart ) );
6668 	if ( !bGotBlended )
6669 		return 0;
6670 
6671 	int iWasStart = m_iStart;
6672 	for ( int iTok=m_iStart+1; iTok<m_dStoredTokens.GetLength() && m_dStoredTokens[iTok].m_bBlendedPart && !m_dStoredTokens[iTok].m_bBlended; iTok++ )
6673 		m_iStart = iTok;
6674 
6675 	return m_iStart-iWasStart;
6676 }
6677 
WasTokenMultiformDestination(bool & bHead,int & iDestCount) const6678 bool CSphMultiformTokenizer::WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const
6679 {
6680 	if ( m_iOutputPending>-1 && m_pCurrentForm && m_pCurrentForm->m_dNormalForm.GetLength()>1 && m_iOutputPending<m_pCurrentForm->m_dNormalForm.GetLength() )
6681 	{
6682 		bHead = ( m_iOutputPending==0 );
6683 		iDestCount = m_pCurrentForm->m_dNormalForm.GetLength();
6684 		return true;
6685 	} else
6686 	{
6687 		return false;
6688 	}
6689 }
6690 
6691 /////////////////////////////////////////////////////////////////////////////
6692 // FILTER
6693 /////////////////////////////////////////////////////////////////////////////
6694 
/// default filter: no attribute, inclusive, full 64-bit value range
// NOTE(review): m_eType is not initialized here — presumably defaulted in the
// class declaration or set by every caller before use; confirm in sphinx.h
CSphFilterSettings::CSphFilterSettings ()
	: m_sAttrName	( "" )
	, m_bExclude	( false )
	, m_bHasEqual		( true )
	, m_iMinValue	( LLONG_MIN )
	, m_iMaxValue	( LLONG_MAX )
	, m_pValues		( NULL )
	, m_nValues		( 0 )
{}
6704 
6705 
/// attach an externally-owned values array (not copied; caller keeps ownership
/// and must keep the array alive for the filter's lifetime)
void CSphFilterSettings::SetExternalValues ( const SphAttr_t * pValues, int nValues )
{
	m_pValues = pValues;
	m_nValues = nValues;
}
6711 
6712 
operator ==(const CSphFilterSettings & rhs) const6713 bool CSphFilterSettings::operator == ( const CSphFilterSettings & rhs ) const
6714 {
6715 	// check name, mode, type
6716 	if ( m_sAttrName!=rhs.m_sAttrName || m_bExclude!=rhs.m_bExclude || m_eType!=rhs.m_eType )
6717 		return false;
6718 
6719 	bool bSameStrings = false;
6720 	switch ( m_eType )
6721 	{
6722 		case SPH_FILTER_RANGE:
6723 			return m_iMinValue==rhs.m_iMinValue && m_iMaxValue==rhs.m_iMaxValue;
6724 
6725 		case SPH_FILTER_FLOATRANGE:
6726 			return m_fMinValue==rhs.m_fMinValue && m_fMaxValue==rhs.m_fMaxValue;
6727 
6728 		case SPH_FILTER_VALUES:
6729 			if ( m_dValues.GetLength()!=rhs.m_dValues.GetLength() )
6730 				return false;
6731 
6732 			ARRAY_FOREACH ( i, m_dValues )
6733 				if ( m_dValues[i]!=rhs.m_dValues[i] )
6734 					return false;
6735 
6736 			return true;
6737 
6738 		case SPH_FILTER_STRING:
6739 		case SPH_FILTER_USERVAR:
6740 		case SPH_FILTER_STRING_LIST:
6741 			if ( m_dStrings.GetLength()!=rhs.m_dStrings.GetLength() )
6742 				return false;
6743 			bSameStrings = ARRAY_ALL ( bSameStrings, m_dStrings, m_dStrings[_all]==rhs.m_dStrings[_all] );
6744 			return bSameStrings;
6745 
6746 		default:
6747 			assert ( 0 && "internal error: unhandled filter type in comparison" );
6748 			return false;
6749 	}
6750 }
6751 
6752 /////////////////////////////////////////////////////////////////////////////
6753 // QUERY
6754 /////////////////////////////////////////////////////////////////////////////
6755 
/// default query: match all indexes, extended matching, first 20 results,
/// default ranker/collation, no grouping/geo/outer-select
CSphQuery::CSphQuery ()
	: m_sIndexes	( "*" )
	, m_sQuery		( "" )
	, m_sRawQuery	( "" )
	, m_iOffset		( 0 )
	, m_iLimit		( 20 )
	, m_pWeights	( NULL )
	, m_iWeights	( 0 )
	, m_eMode		( SPH_MATCH_EXTENDED )
	, m_eRanker		( SPH_RANK_DEFAULT )
	, m_eSort		( SPH_SORT_RELEVANCE )
	, m_iRandSeed	( -1 )
	, m_iMaxMatches	( DEFAULT_MAX_MATCHES )
	, m_bSortKbuffer	( false )
	, m_bZSlist			( false )
	, m_bSimplify		( false )
	, m_bPlainIDF		( false )
	, m_bGlobalIDF		( false )
	, m_bNormalizedTFIDF ( true )
	, m_bLocalDF		( false )
	, m_eGroupFunc		( SPH_GROUPBY_ATTR )
	, m_sGroupSortBy	( "@groupby desc" )
	, m_sGroupDistinct	( "" )
	, m_iCutoff			( 0 )
	, m_iRetryCount		( 0 )
	, m_iRetryDelay		( 0 )
	, m_iAgentQueryTimeout	( 0 )
	, m_bGeoAnchor		( false )
	, m_fGeoLatitude	( 0.0f )
	, m_fGeoLongitude	( 0.0f )
	, m_uMaxQueryMsec	( 0 )
	, m_iMaxPredictedMsec ( 0 )
	, m_sComment		( "" )
	, m_sSelect			( "" )
	, m_iOuterOffset	( 0 )
	, m_iOuterLimit		( 0 )
	, m_bHasOuter		( false )
	, m_bReverseScan	( false )
	, m_bIgnoreNonexistent ( false )
	, m_bIgnoreNonexistentIndexes ( false )
	, m_bStrict			( false )
	, m_pTableFunc		( NULL )

	, m_iSQLSelectStart	( -1 )
	, m_iSQLSelectEnd	( -1 )
	, m_iGroupbyLimit	( 1 )

	, m_eCollation		( SPH_COLLATION_DEFAULT )
	, m_bAgent			( false )
	, m_bFacet			( false )
{}
6807 
6808 
/// trivial destructor; members release their own resources
CSphQuery::~CSphQuery ()
{
}
6812 
6813 
6814 //////////////////////////////////////////////////////////////////////////
6815 
/// byte offsets of a parsed select-list item within the select string
/// (used as the bison semantic value type, see YYSTYPE below)
struct SelectBounds_t
{
	int		m_iStart;	// offset of the first byte
	int		m_iEnd;		// offset one past the last byte
};
6821 #define YYSTYPE SelectBounds_t
6822 class SelectParser_t;
6823 
6824 #ifdef CMAKE_GENERATED_GRAMMAR
6825 	#include "bissphinxselect.h"
6826 #else
6827 	#include "yysphinxselect.h"
6828 #endif
6829 
6830 
/// parser state for SELECT-list parsing; drives the bison grammar included below
class SelectParser_t
{
public:
	int				GetToken ( YYSTYPE * lvalp );
	void			AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc=SPH_AGGR_NONE, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
	void			AddItem ( const char * pToken, YYSTYPE * pStart=NULL, YYSTYPE * pEnd=NULL );
	void			AliasLastItem ( YYSTYPE * pAlias );
	void			AddOption ( YYSTYPE * pOpt, YYSTYPE * pVal );

private:
	void			AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd );
	bool			IsTokenEqual ( YYSTYPE * pTok, const char * sRef );

public:
	CSphString		m_sParserError;		// set by yyerror() on parse failure
	const char *	m_pLastTokenStart;	// start of the last scanned token, for error messages

	const char *	m_pStart;			// select string start; token offsets are relative to it
	const char *	m_pCur;				// current scan position

	CSphQuery *		m_pQuery;			// query object that receives parsed items and options
};
6853 
/// bison lexer entry point; delegates tokenization to the parser object
int yylex ( YYSTYPE * lvalp, SelectParser_t * pParser )
{
	return pParser->GetToken ( lvalp );
}
6858 
/// bison error callback; stores the message plus the offending token position
/// into the parser state for later retrieval
void yyerror ( SelectParser_t * pParser, const char * sMessage )
{
	pParser->m_sParserError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastTokenStart );
}
6863 
6864 #ifdef CMAKE_GENERATED_GRAMMAR
6865 #include "bissphinxselect.c"
6866 #else
6867 
6868 #include "yysphinxselect.c"
6869 
6870 #endif
6871 
6872 
/// hand-written lexer for the select-list grammar; returns a bison token id
/// (or the raw character) and fills lvalp with the token's byte bounds
int SelectParser_t::GetToken ( YYSTYPE * lvalp )
{
	// skip whitespace, check eof
	while ( isspace ( *m_pCur ) )
		m_pCur++;
	if ( !*m_pCur )
		return 0;

	// begin working that token
	m_pLastTokenStart = m_pCur;
	lvalp->m_iStart = m_pCur-m_pStart;

	// check for constant
	if ( isdigit ( *m_pCur ) )
	{
		char * pEnd = NULL;
		double fDummy; // to avoid gcc unused result warning
		fDummy = strtod ( m_pCur, &pEnd );
		fDummy *= 2; // to avoid gcc unused variable warning

		// number value itself is not stored; the grammar re-reads it by bounds
		m_pCur = pEnd;
		lvalp->m_iEnd = m_pCur-m_pStart;
		return SEL_TOKEN;
	}

	// check for token (identifier or @-prefixed special attribute)
	if ( sphIsAttr ( m_pCur[0] ) || ( m_pCur[0]=='@' && sphIsAttr ( m_pCur[1] ) && !isdigit ( m_pCur[1] ) ) )
	{
		m_pCur++;
		while ( sphIsAttr ( *m_pCur ) ) m_pCur++;
		lvalp->m_iEnd = m_pCur-m_pStart;

		// keyword match: exact length plus case-insensitive prefix comparison
		#define LOC_CHECK(_str,_len,_ret) \
			if ( lvalp->m_iEnd==_len+lvalp->m_iStart && strncasecmp ( m_pStart+lvalp->m_iStart, _str, _len )==0 ) return _ret;

		LOC_CHECK ( "ID", 2, SEL_ID );
		LOC_CHECK ( "AS", 2, SEL_AS );
		LOC_CHECK ( "OR", 2, TOK_OR );
		LOC_CHECK ( "AND", 3, TOK_AND );
		LOC_CHECK ( "NOT", 3, TOK_NOT );
		LOC_CHECK ( "DIV", 3, TOK_DIV );
		LOC_CHECK ( "MOD", 3, TOK_MOD );
		LOC_CHECK ( "AVG", 3, SEL_AVG );
		LOC_CHECK ( "MIN", 3, SEL_MIN );
		LOC_CHECK ( "MAX", 3, SEL_MAX );
		LOC_CHECK ( "SUM", 3, SEL_SUM );
		LOC_CHECK ( "GROUP_CONCAT", 12, SEL_GROUP_CONCAT );
		LOC_CHECK ( "GROUPBY", 7, SEL_GROUPBY );
		LOC_CHECK ( "COUNT", 5, SEL_COUNT );
		LOC_CHECK ( "DISTINCT", 8, SEL_DISTINCT );
		LOC_CHECK ( "WEIGHT", 6, SEL_WEIGHT );
		LOC_CHECK ( "OPTION", 6, SEL_OPTION );
		LOC_CHECK ( "IS", 2, TOK_IS );
		LOC_CHECK ( "NULL", 4, TOK_NULL );
		LOC_CHECK ( "FOR", 3, TOK_FOR );
		LOC_CHECK ( "IN", 2, TOK_FUNC_IN );

		#undef LOC_CHECK

		return SEL_TOKEN;
	}

	// check for equality checks
	lvalp->m_iEnd = 1+lvalp->m_iStart;
	switch ( *m_pCur )
	{
		case '<':
			m_pCur++;
			if ( *m_pCur=='>' ) { m_pCur++; lvalp->m_iEnd++; return TOK_NE; }
			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_LTE; }
			return '<';

		case '>':
			m_pCur++;
			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; return TOK_GTE; }
			return '>';

		case '=':
			m_pCur++;
			if ( *m_pCur=='=' ) { m_pCur++; lvalp->m_iEnd++; }
			return TOK_EQ;

		case '\'':
		{
			// single-quoted string constant; scan for the closing quote
			// NOTE(review): the escape check only inspects the single char before
			// the quote, so a trailing "\\'" (escaped backslash then quote) is not
			// recognized as a terminator — confirm whether inputs can contain it
			const char cEnd = *m_pCur;
			for ( const char * s = m_pCur+1; *s; s++ )
			{
				if ( *s==cEnd && s-1>=m_pCur && *(s-1)!='\\' )
				{
					m_pCur = s+1;
					return TOK_CONST_STRING;
				}
			}
			// unterminated string constant
			return -1;
		}
	}

	// check for comment begin/end
	if ( m_pCur[0]=='/' && m_pCur[1]=='*' )
	{
		m_pCur += 2;
		lvalp->m_iEnd += 1;
		return SEL_COMMENT_OPEN;
	}
	if ( m_pCur[0]=='*' && m_pCur[1]=='/' )
	{
		m_pCur += 2;
		lvalp->m_iEnd += 1;
		return SEL_COMMENT_CLOSE;
	}

	// return char as a token
	return *m_pCur++;
}
6987 
AutoAlias(CSphQueryItem & tItem,YYSTYPE * pStart,YYSTYPE * pEnd)6988 void SelectParser_t::AutoAlias ( CSphQueryItem & tItem, YYSTYPE * pStart, YYSTYPE * pEnd )
6989 {
6990 	if ( pStart && pEnd )
6991 	{
6992 		tItem.m_sAlias.SetBinary ( m_pStart + pStart->m_iStart, pEnd->m_iEnd - pStart->m_iStart );
6993 		sphColumnToLowercase ( const_cast<char *>( tItem.m_sAlias.cstr() ) ); // as in SqlParser_c
6994 	} else
6995 		tItem.m_sAlias = tItem.m_sExpr;
6996 }
6997 
AddItem(YYSTYPE * pExpr,ESphAggrFunc eAggrFunc,YYSTYPE * pStart,YYSTYPE * pEnd)6998 void SelectParser_t::AddItem ( YYSTYPE * pExpr, ESphAggrFunc eAggrFunc, YYSTYPE * pStart, YYSTYPE * pEnd )
6999 {
7000 	CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
7001 	tItem.m_sExpr.SetBinary ( m_pStart + pExpr->m_iStart, pExpr->m_iEnd - pExpr->m_iStart );
7002 	sphColumnToLowercase ( const_cast<char *>( tItem.m_sExpr.cstr() ) );
7003 	tItem.m_eAggrFunc = eAggrFunc;
7004 	AutoAlias ( tItem, pStart, pEnd );
7005 }
7006 
AddItem(const char * pToken,YYSTYPE * pStart,YYSTYPE * pEnd)7007 void SelectParser_t::AddItem ( const char * pToken, YYSTYPE * pStart, YYSTYPE * pEnd )
7008 {
7009 	CSphQueryItem & tItem = m_pQuery->m_dItems.Add();
7010 	tItem.m_sExpr = pToken;
7011 	tItem.m_eAggrFunc = SPH_AGGR_NONE;
7012 	sphColumnToLowercase ( const_cast<char *>( tItem.m_sExpr.cstr() ) );
7013 	AutoAlias ( tItem, pStart, pEnd );
7014 }
7015 
AliasLastItem(YYSTYPE * pAlias)7016 void SelectParser_t::AliasLastItem ( YYSTYPE * pAlias )
7017 {
7018 	if ( pAlias )
7019 	{
7020 		CSphQueryItem & tItem = m_pQuery->m_dItems.Last();
7021 		tItem.m_sAlias.SetBinary ( m_pStart + pAlias->m_iStart, pAlias->m_iEnd - pAlias->m_iStart );
7022 		tItem.m_sAlias.ToLower();
7023 	}
7024 }
7025 
IsTokenEqual(YYSTYPE * pTok,const char * sRef)7026 bool SelectParser_t::IsTokenEqual ( YYSTYPE * pTok, const char * sRef )
7027 {
7028 	int iLen = strlen(sRef);
7029 	if ( iLen!=( pTok->m_iEnd - pTok->m_iStart ) )
7030 		return false;
7031 	return strncasecmp ( m_pStart + pTok->m_iStart, sRef, iLen )==0;
7032 }
7033 
AddOption(YYSTYPE * pOpt,YYSTYPE * pVal)7034 void SelectParser_t::AddOption ( YYSTYPE * pOpt, YYSTYPE * pVal )
7035 {
7036 	if ( IsTokenEqual ( pOpt, "reverse_scan" ) )
7037 	{
7038 		if ( IsTokenEqual ( pVal, "1" ) )
7039 			m_pQuery->m_bReverseScan = true;
7040 	} else if ( IsTokenEqual ( pOpt, "sort_method" ) )
7041 	{
7042 		if ( IsTokenEqual ( pVal, "kbuffer" ) )
7043 			m_pQuery->m_bSortKbuffer = true;
7044 	} else if ( IsTokenEqual ( pOpt, "max_predicted_time" ) )
7045 	{
7046 		char szNumber[256];
7047 		int iLen = pVal->m_iEnd-pVal->m_iStart;
7048 		assert ( iLen < (int)sizeof(szNumber) );
7049 		strncpy ( szNumber, m_pStart+pVal->m_iStart, iLen );
7050 		int64_t iMaxPredicted = strtoull ( szNumber, NULL, 10 );
7051 		m_pQuery->m_iMaxPredictedMsec = int(iMaxPredicted > INT_MAX ? INT_MAX : iMaxPredicted );
7052 	}
7053 }
7054 
ParseSelectList(CSphString & sError)7055 bool CSphQuery::ParseSelectList ( CSphString & sError )
7056 {
7057 	m_dItems.Reset ();
7058 	if ( m_sSelect.IsEmpty() )
7059 		return true; // empty is ok; will just return everything
7060 
7061 	SelectParser_t tParser;
7062 	tParser.m_pStart = m_sSelect.cstr();
7063 	tParser.m_pCur = tParser.m_pStart;
7064 	tParser.m_pQuery = this;
7065 
7066 	yyparse ( &tParser );
7067 
7068 	sError = tParser.m_sParserError;
7069 	return sError.IsEmpty ();
7070 }
7071 
7072 /////////////////////////////////////////////////////////////////////////////
7073 // QUERY STATS
7074 /////////////////////////////////////////////////////////////////////////////
7075 
/// zeroed stats, no attached nano-budget counter
CSphQueryStats::CSphQueryStats()
	: m_pNanoBudget ( NULL )
	, m_iFetchedDocs ( 0 )
	, m_iFetchedHits ( 0 )
	, m_iSkips ( 0 )
{
}
7083 
/// accumulate counters from another stats object
/// (m_pNanoBudget is deliberately left untouched; it is a shared pointer-style
/// budget reference, not a counter)
void CSphQueryStats::Add ( const CSphQueryStats & tStats )
{
	m_iFetchedDocs += tStats.m_iFetchedDocs;
	m_iFetchedHits += tStats.m_iFetchedHits;
	m_iSkips += tStats.m_iSkips;
}
7090 
7091 
7092 /////////////////////////////////////////////////////////////////////////////
7093 // SCHEMAS
7094 /////////////////////////////////////////////////////////////////////////////
7095 
/// format an attribute as "type name:bitcount@bitoffset" for error messages
static CSphString sphDumpAttr ( const CSphColumnInfo & tAttr )
{
	CSphString sRes;
	sRes.SetSprintf ( "%s %s:%d@%d", sphTypeName ( tAttr.m_eAttrType ), tAttr.m_sName.cstr(),
		tAttr.m_tLocator.m_iBitCount, tAttr.m_tLocator.m_iBitOffset );
	return sRes;
}
7103 
7104 
7105 /// make string lowercase but keep case of JSON.field
/// make string lowercase but keep case of JSON.field
/// (lowercases in place up to the first '.', ',' or '[' delimiter;
/// characters inside single quotes keep their original case)
void sphColumnToLowercase ( char * sVal )
{
	if ( !sVal || !*sVal )
		return;

	bool bQuoted = false;
	for ( char * p=sVal; *p && *p!='.' && *p!=',' && *p!='['; p++ )
	{
		if ( !bQuoted )
			*p = (char) tolower ( *p );
		if ( *p=='\'' )
			bQuoted = !bQuoted;
	}
}
7121 
7122 
/// column with a name and type; name is lowercased in place on construction
/// (keeping case of any JSON.field part, see sphColumnToLowercase)
CSphColumnInfo::CSphColumnInfo ( const char * sName, ESphAttr eType )
	: m_sName ( sName )
	, m_eAttrType ( eType )
	, m_eWordpart ( SPH_WORDPART_WHOLE )
	, m_bIndexed ( false )
	, m_iIndex ( -1 )
	, m_eSrc ( SPH_ATTRSRC_NONE )
	, m_pExpr ( NULL )
	, m_eAggrFunc ( SPH_AGGR_NONE )
	, m_eStage ( SPH_EVAL_STATIC )
	, m_bPayload ( false )
	, m_bFilename ( false )
	, m_bWeight ( false )
	, m_uNext ( 0xffff )	// 0xffff == end-of-chain marker in the schema name hash
{
	sphColumnToLowercase ( const_cast<char *>( m_sName.cstr() ) );
}
7140 
7141 //////////////////////////////////////////////////////////////////////////
7142 
/// forget the tracked string-pointer and factor attribute locations
void ISphSchema::Reset()
{
	m_dPtrAttrs.Reset();
	m_dFactorAttrs.Reset();
}
7148 
7149 
/// insert a column at iPos and assign it a bit location within the row;
/// dUsed tracks per-rowitem bit usage for the static or dynamic part
void ISphSchema::InsertAttr ( CSphVector<CSphColumnInfo> & dAttrs, CSphVector<int> & dUsed, int iPos, const CSphColumnInfo & tCol, bool bDynamic )
{
	assert ( 0<=iPos && iPos<=dAttrs.GetLength() );
	assert ( tCol.m_eAttrType!=SPH_ATTR_NONE && !tCol.m_tLocator.IsID() ); // not via this orifice bro
	if ( tCol.m_eAttrType==SPH_ATTR_NONE || tCol.m_tLocator.IsID() )
		return;

	dAttrs.Insert ( iPos, tCol );
	CSphAttrLocator & tLoc = dAttrs [ iPos ].m_tLocator;

	// pick the bit width: caller-specified, else type-driven defaults
	int iBits = ROWITEM_BITS;
	if ( tLoc.m_iBitCount>0 )
		iBits = tLoc.m_iBitCount;
	if ( tCol.m_eAttrType==SPH_ATTR_BOOL )
		iBits = 1;
	if ( tCol.m_eAttrType==SPH_ATTR_BIGINT || tCol.m_eAttrType==SPH_ATTR_JSON_FIELD )
		iBits = 64;

	// pointer-valued attributes occupy a full pointer and are also tracked
	// separately so matches can deep-copy/free them (see CopyPtrs/FreeStringPtrs)
	if ( tCol.m_eAttrType==SPH_ATTR_STRINGPTR || tCol.m_eAttrType==SPH_ATTR_FACTORS || tCol.m_eAttrType==SPH_ATTR_FACTORS_JSON )
	{
		assert ( bDynamic );
		iBits = ROWITEMPTR_BITS;
		CSphNamedInt & t = ( tCol.m_eAttrType==SPH_ATTR_STRINGPTR )
			? m_dPtrAttrs.Add()
			: m_dFactorAttrs.Add();
		t.m_iValue = dUsed.GetLength();
		t.m_sName = tCol.m_sName;
	}

	tLoc.m_iBitCount = iBits;
	tLoc.m_bDynamic = bDynamic;

	if ( iBits>=ROWITEM_BITS )
	{
		// full rowitems: append at the end, mark each item fully used
		tLoc.m_iBitOffset = dUsed.GetLength()*ROWITEM_BITS;
		int iItems = (iBits+ROWITEM_BITS-1) / ROWITEM_BITS;
		for ( int i=0; i<iItems; i++ )
			dUsed.Add ( ROWITEM_BITS );
	} else
	{
		// sub-item bitfield: first-fit into a rowitem with enough free bits
		int iItem;
		for ( iItem=0; iItem<dUsed.GetLength(); iItem++ )
			if ( dUsed[iItem]+iBits<=ROWITEM_BITS )
				break;
		if ( iItem==dUsed.GetLength() )
			dUsed.Add ( 0 );
		tLoc.m_iBitOffset = iItem*ROWITEM_BITS + dUsed[iItem];
		dUsed[iItem] += iBits;
	}
}
7200 
7201 
/// full match copy: release pDst's owned pointers, copy the row data,
/// then deep-copy pointer-valued attributes so both matches own their data
void ISphSchema::CloneWholeMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const
{
	assert ( pDst );
	FreeStringPtrs ( pDst );
	pDst->Combine ( rhs, GetRowSize() );
	CopyPtrs ( pDst, rhs );
}
7209 
7210 
/// deep-copy pointer-valued attributes from rhs into pDst, giving pDst
/// its own heap copies (string attrs via CSphString copy + Leak(),
/// factor blobs via a raw size-prefixed memcpy)
void ISphSchema::CopyPtrs ( CSphMatch * pDst, const CSphMatch & rhs ) const
{
	ARRAY_FOREACH ( i, m_dPtrAttrs )
		*(const char**) (pDst->m_pDynamic+m_dPtrAttrs[i].m_iValue) = CSphString (*(const char**)(rhs.m_pDynamic+m_dPtrAttrs[i].m_iValue)).Leak();

	// not immediately obvious: this is not needed while pushing matches to sorters; factors are held in an outer hash table
	// but it is necessary to copy factors when combining results from several indexes via a sorter because at this moment matches are the owners of factor data
	ARRAY_FOREACH ( i, m_dFactorAttrs )
	{
		int iOffset = m_dFactorAttrs[i].m_iValue;
		BYTE * pData = *(BYTE**)(rhs.m_pDynamic+iOffset);
		if ( pData )
		{
			// factor blobs store their total byte size in the first DWORD
			DWORD uDataSize = *(DWORD*)pData;
			assert ( uDataSize );

			BYTE * pCopy = new BYTE[uDataSize];
			memcpy ( pCopy, pData, uDataSize );
			*(BYTE**)(pDst->m_pDynamic+iOffset) = pCopy;
		}
	}
}
7233 
7234 
/// release heap data owned by a match's pointer-valued attributes
/// and zero the freed slots
void ISphSchema::FreeStringPtrs ( CSphMatch * pMatch ) const
{
	assert ( pMatch );
	if ( !pMatch->m_pDynamic )
		return;

	if ( m_dPtrAttrs.GetLength() )
	{
		// Adopt() takes over each stored char* so the CSphString frees it
		// (each subsequent Adopt releases the previously adopted buffer,
		// and the final one is freed when sStr goes out of scope)
		CSphString sStr;
		ARRAY_FOREACH ( i, m_dPtrAttrs )
		{
			sStr.Adopt ( (char**) (pMatch->m_pDynamic+m_dPtrAttrs[i].m_iValue));
		}
	}

	ARRAY_FOREACH ( i, m_dFactorAttrs )
	{
		int iOffset = m_dFactorAttrs[i].m_iValue;
		BYTE * pData = *(BYTE**)(pMatch->m_pDynamic+iOffset);
		if ( pData )
		{
			// factor blobs are plain new[] arrays, see CopyPtrs()
			delete [] pData;
			*(BYTE**)(pMatch->m_pDynamic+iOffset) = NULL;
		}
	}
}
7261 
7262 //////////////////////////////////////////////////////////////////////////
7263 
CSphSchema(const char * sName)7264 CSphSchema::CSphSchema ( const char * sName )
7265 	: m_sName ( sName )
7266 	, m_iStaticSize ( 0 )
7267 {
7268 	for ( int i=0; i<BUCKET_COUNT; i++ )
7269 		m_dBuckets[i] = 0xffff;
7270 }
7271 
7272 
/// compare two schemas; returns false and fills sError on the first mismatch.
/// bFullComparison uses full column equality, otherwise only name/type/
/// wordpart/indexed flag/locator position are compared
bool CSphSchema::CompareTo ( const CSphSchema & rhs, CSphString & sError, bool bFullComparison ) const
{
	// check attr count
	if ( GetAttrsCount()!=rhs.GetAttrsCount() )
	{
		sError.SetSprintf ( "attribute count mismatch (me=%s, in=%s, myattrs=%d, inattrs=%d)",
			m_sName.cstr(), rhs.m_sName.cstr(),
			GetAttrsCount(), rhs.GetAttrsCount() );
		return false;
	}

	// check attrs
	ARRAY_FOREACH ( i, m_dAttrs )
	{
		const CSphColumnInfo & tAttr1 = rhs.m_dAttrs[i];
		const CSphColumnInfo & tAttr2 = m_dAttrs[i];

		bool bMismatch;
		if ( bFullComparison )
			bMismatch = !(tAttr1==tAttr2);
		else
		{
			// relaxed comparison: ignore expression/aggregate/stage details
			ESphAttr eAttr1 = tAttr1.m_eAttrType;
			ESphAttr eAttr2 = tAttr2.m_eAttrType;

			bMismatch = tAttr1.m_sName!=tAttr2.m_sName || eAttr1!=eAttr2 || tAttr1.m_eWordpart!=tAttr2.m_eWordpart ||
				tAttr1.m_bIndexed!=tAttr2.m_bIndexed ||	tAttr1.m_tLocator.m_iBitCount!=tAttr2.m_tLocator.m_iBitCount ||
				tAttr1.m_tLocator.m_iBitOffset!=tAttr2.m_tLocator.m_iBitOffset;
		}

		if ( bMismatch )
		{
			sError.SetSprintf ( "attribute mismatch (me=%s, in=%s, idx=%d, myattr=%s, inattr=%s)",
				m_sName.cstr(), rhs.m_sName.cstr(), i, sphDumpAttr ( m_dAttrs[i] ).cstr(), sphDumpAttr ( rhs.m_dAttrs[i] ).cstr() );
			return false;
		}
	}

	// check field count
	if ( rhs.m_dFields.GetLength()!=m_dFields.GetLength() )
	{
		sError.SetSprintf ( "fulltext fields count mismatch (me=%s, in=%s, myfields=%d, infields=%d)",
			m_sName.cstr(), rhs.m_sName.cstr(),
			m_dFields.GetLength(), rhs.m_dFields.GetLength() );
		return false;
	}

	// check fulltext field names
	ARRAY_FOREACH ( i, rhs.m_dFields )
		if ( rhs.m_dFields[i].m_sName!=m_dFields[i].m_sName )
	{
		sError.SetSprintf ( "fulltext field mismatch (me=%s, myfield=%s, idx=%d, in=%s, infield=%s)",
			m_sName.cstr(), rhs.m_sName.cstr(),
			i, m_dFields[i].m_sName.cstr(), rhs.m_dFields[i].m_sName.cstr() );
		return false;
	}

	return true;
}
7332 
7333 
GetFieldIndex(const char * sName) const7334 int CSphSchema::GetFieldIndex ( const char * sName ) const
7335 {
7336 	if ( !sName )
7337 		return -1;
7338 	ARRAY_FOREACH ( i, m_dFields )
7339 		if ( strcasecmp ( m_dFields[i].m_sName.cstr(), sName )==0 )
7340 			return i;
7341 	return -1;
7342 }
7343 
7344 
GetAttrIndex(const char * sName) const7345 int CSphSchema::GetAttrIndex ( const char * sName ) const
7346 {
7347 	if ( !sName )
7348 		return -1;
7349 
7350 	if ( m_dAttrs.GetLength()>=HASH_THRESH )
7351 	{
7352 		DWORD uCrc = sphCRC32 ( sName );
7353 		DWORD uPos = m_dBuckets [ uCrc%BUCKET_COUNT ];
7354 		while ( uPos!=0xffff && m_dAttrs [ uPos ].m_sName!=sName )
7355 			uPos = m_dAttrs [ uPos ].m_uNext;
7356 
7357 		return (short)uPos; // 0xffff == -1 is our "end of list" marker
7358 	}
7359 
7360 	ARRAY_FOREACH ( i, m_dAttrs )
7361 		if ( m_dAttrs[i].m_sName==sName )
7362 			return i;
7363 
7364 	return -1;
7365 }
7366 
7367 
GetAttr(const char * sName) const7368 const CSphColumnInfo * CSphSchema::GetAttr ( const char * sName ) const
7369 {
7370 	int iIndex = GetAttrIndex ( sName );
7371 	if ( iIndex>=0 )
7372 		return &m_dAttrs[iIndex];
7373 	return NULL;
7374 }
7375 
7376 
Reset()7377 void CSphSchema::Reset ()
7378 {
7379 	ISphSchema::Reset();
7380 	m_dFields.Reset();
7381 	m_dAttrs.Reset();
7382 	for ( int i=0; i<BUCKET_COUNT; i++ )
7383 		m_dBuckets[i] = 0xffff;
7384 	m_dStaticUsed.Reset();
7385 	m_dDynamicUsed.Reset();
7386 	m_iStaticSize = 0;
7387 }
7388 
7389 
/// insert an attribute at iPos and keep the name hash consistent
void CSphSchema::InsertAttr ( int iPos, const CSphColumnInfo & tCol, bool bDynamic )
{
	// shift stored hash indexes for attrs displaced by the insertion;
	// it's redundant in case of AddAttr (append at the end)
	if ( iPos!=m_dAttrs.GetLength() )
		UpdateHash ( iPos-1, 1 );

	ISphSchema::InsertAttr ( m_dAttrs, bDynamic ? m_dDynamicUsed : m_dStaticUsed, iPos, tCol, bDynamic );

	// update static size
	m_iStaticSize = m_dStaticUsed.GetLength();

	// do hash add: build the hash the moment we hit the threshold,
	// afterwards just push the new attr onto its bucket's chain
	if ( m_dAttrs.GetLength()==HASH_THRESH )
		RebuildHash();
	else if ( m_dAttrs.GetLength()>HASH_THRESH )
	{
		WORD & uPos = GetBucketPos ( m_dAttrs [ iPos ].m_sName.cstr() );
		m_dAttrs [ iPos ].m_uNext = uPos;
		uPos = (WORD)iPos;
	}
}
7411 
7412 
/// remove a named attribute by rebuilding the attribute set without it;
/// re-adding everything recomputes locators, row layout and the name hash
void CSphSchema::RemoveAttr ( const char * szAttr, bool bDynamic )
{
	int iIndex = GetAttrIndex ( szAttr );
	if ( iIndex<0 )
		return;

	CSphVector<CSphColumnInfo> dBackup = m_dAttrs;

	// only the affected (static or dynamic) row layout is rebuilt
	if ( bDynamic )
		m_dDynamicUsed.Reset();
	else
	{
		m_dStaticUsed.Reset();
		m_iStaticSize = 0;
	}

	ISphSchema::Reset();
	m_dAttrs.Reset();

	ARRAY_FOREACH ( i, dBackup )
		if ( i!=iIndex )
			AddAttr ( dBackup[i], bDynamic );
}
7436 
7437 
/// append an attribute at the end of the schema
void CSphSchema::AddAttr ( const CSphColumnInfo & tCol, bool bDynamic )
{
	InsertAttr ( m_dAttrs.GetLength(), tCol, bDynamic );
}
7442 
7443 
IsReserved(const char * szToken)7444 bool CSphSchema::IsReserved ( const char * szToken )
7445 {
7446 	static const char * dReserved[] =
7447 	{
7448 		"AND", "AS", "BY", "DIV", "FACET", "FALSE", "FROM", "ID", "IN", "IS", "LIMIT",
7449 		"MOD", "NOT", "NULL", "OR", "ORDER", "SELECT", "TRUE", NULL
7450 	};
7451 
7452 	const char ** p = dReserved;
7453 	while ( *p )
7454 		if ( strcasecmp ( szToken, *p++ )==0 )
7455 			return true;
7456 	return false;
7457 }
7458 
7459 
/// map an attribute name to its hash bucket (returns a mutable reference
/// so callers can splice chain heads in place)
WORD & CSphSchema::GetBucketPos ( const char * sName )
{
	DWORD uCrc = sphCRC32 ( sName );
	return m_dBuckets [ uCrc % BUCKET_COUNT ];
}
7465 
7466 
RebuildHash()7467 void CSphSchema::RebuildHash ()
7468 {
7469 	if ( m_dAttrs.GetLength()<HASH_THRESH )
7470 		return;
7471 
7472 	for ( int i=0; i<BUCKET_COUNT; i++ )
7473 		m_dBuckets[i] = 0xffff;
7474 
7475 	ARRAY_FOREACH ( i, m_dAttrs )
7476 	{
7477 		WORD & uPos = GetBucketPos ( m_dAttrs[i].m_sName.cstr() );
7478 		m_dAttrs[i].m_uNext = uPos;
7479 		uPos = WORD(i);
7480 	}
7481 }
7482 
7483 
UpdateHash(int iStartIndex,int iAddVal)7484 void CSphSchema::UpdateHash ( int iStartIndex, int iAddVal )
7485 {
7486 	if ( m_dAttrs.GetLength()<HASH_THRESH )
7487 		return;
7488 
7489 	ARRAY_FOREACH ( i, m_dAttrs )
7490 	{
7491 		WORD & uPos = m_dAttrs[i].m_uNext;
7492 		if ( uPos!=0xffff && uPos>iStartIndex )
7493 			uPos = (WORD)( uPos + iAddVal );
7494 	}
7495 	for ( int i=0; i<BUCKET_COUNT; i++ )
7496 	{
7497 		WORD & uPos = m_dBuckets[i];
7498 		if ( uPos!=0xffff && uPos>iStartIndex )
7499 			uPos = (WORD)( uPos + iAddVal );
7500 	}
7501 }
7502 
7503 
// double-dispatch helper: copy this full index schema into a result-set schema
// (the actual work happens in CSphRsetSchema::operator=(const CSphSchema&))
void CSphSchema::AssignTo ( CSphRsetSchema & lhs ) const
{
	lhs = *this;
}
7508 
7509 //////////////////////////////////////////////////////////////////////////
7510 
// a result-set schema starts unbound; an index schema is attached later
// via operator=(const CSphSchema&)
CSphRsetSchema::CSphRsetSchema()
	: m_pIndexSchema ( NULL )
{}
7514 
7515 
// drop everything: base schema state, the attached index schema,
// and all attributes added on top of it
void CSphRsetSchema::Reset ()
{
	ISphSchema::Reset();
	m_pIndexSchema = NULL;
	m_dExtraAttrs.Reset();
	m_dDynamicUsed.Reset();
	m_dFields.Reset();
}
7524 
7525 
// append a new (always dynamic) attribute on top of the index schema;
// the locator is allocated from our own dynamic rowitems map
void CSphRsetSchema::AddDynamicAttr ( const CSphColumnInfo & tCol )
{
	ISphSchema::InsertAttr ( m_dExtraAttrs, m_dDynamicUsed, m_dExtraAttrs.GetLength(), tCol, true );
}
7530 
7531 
GetRowSize() const7532 int CSphRsetSchema::GetRowSize() const
7533 {
7534 	// we copy over dynamic map in case index schema has dynamic attributes
7535 	// (that happens in case of inline attributes, or RAM segments in RT indexes)
7536 	// so there is no need to add GetDynamicSize() here
7537 	return m_pIndexSchema
7538 		? m_dDynamicUsed.GetLength() + m_pIndexSchema->GetStaticSize()
7539 		: m_dDynamicUsed.GetLength();
7540 }
7541 
7542 
GetStaticSize() const7543 int CSphRsetSchema::GetStaticSize() const
7544 {
7545 	// result set schemas additions are always dynamic
7546 	return m_pIndexSchema ? m_pIndexSchema->GetStaticSize() : 0;
7547 }
7548 
7549 
// dynamic rowitems size; our map already includes anything copied over
// from the index schema on attach
int CSphRsetSchema::GetDynamicSize() const
{
	// we copy over dynamic map in case index schema has dynamic attributes
	return m_dDynamicUsed.GetLength();
}
7555 
7556 
GetAttrsCount() const7557 int CSphRsetSchema::GetAttrsCount() const
7558 {
7559 	return m_pIndexSchema
7560 		? m_dExtraAttrs.GetLength() + m_pIndexSchema->GetAttrsCount() - m_dRemoved.GetLength()
7561 		: m_dExtraAttrs.GetLength();
7562 }
7563 
7564 
// look up an attribute index by name; in the result-set numbering,
// surviving index-schema attributes come first (renumbered to skip removed
// ones), followed by our own extra attributes
int CSphRsetSchema::GetAttrIndex ( const char * sName ) const
{
	// extra attributes live after the (shrunken) index schema part
	ARRAY_FOREACH ( i, m_dExtraAttrs )
		if ( m_dExtraAttrs[i].m_sName==sName )
			return i + ( m_pIndexSchema ? m_pIndexSchema->GetAttrsCount() - m_dRemoved.GetLength() : 0 );

	if ( !m_pIndexSchema )
		return -1;

	int iRes = m_pIndexSchema->GetAttrIndex(sName);
	if ( iRes>=0 )
	{
		// removed attributes are invisible through the result-set schema
		if ( m_dRemoved.Contains ( iRes ) )
			return -1;
		// shift the raw index down by the number of removed attrs before it
		// (m_dRemoved is kept sorted, see RemoveStaticAttr)
		int iSub = 0;
		ARRAY_FOREACH_COND ( i, m_dRemoved, iRes>=m_dRemoved[i] )
			iSub++;
		return iRes - iSub;
	}
	return -1;
}
7586 
7587 
// fetch an attribute by its result-set index; indexes below the surviving
// index-schema count map back (skipping removed slots) into the index schema,
// higher ones map into our extra attributes
const CSphColumnInfo & CSphRsetSchema::GetAttr ( int iIndex ) const
{
	if ( !m_pIndexSchema )
		return m_dExtraAttrs[iIndex];

	if ( iIndex < m_pIndexSchema->GetAttrsCount() - m_dRemoved.GetLength() )
	{
		// remap: bump the index past every removed slot at or before it
		// (m_dRemoved is kept sorted, see RemoveStaticAttr)
		ARRAY_FOREACH_COND ( i, m_dRemoved, iIndex>=m_dRemoved[i] )
			iIndex++;
		return m_pIndexSchema->GetAttr(iIndex);
	}

	// past the index-schema part; index into our own additions
	return m_dExtraAttrs [ iIndex - m_pIndexSchema->GetAttrsCount() + m_dRemoved.GetLength() ];
}
7602 
7603 
GetAttr(const char * sName) const7604 const CSphColumnInfo * CSphRsetSchema::GetAttr ( const char * sName ) const
7605 {
7606 	ARRAY_FOREACH ( i, m_dExtraAttrs )
7607 		if ( m_dExtraAttrs[i].m_sName==sName )
7608 			return &m_dExtraAttrs[i];
7609 	if ( m_pIndexSchema )
7610 		return m_pIndexSchema->GetAttr ( sName );
7611 	return NULL;
7612 }
7613 
7614 
// assignment from an abstract schema; double-dispatches through AssignTo()
// so the concrete schema type picks the proper copy routine
CSphRsetSchema & CSphRsetSchema::operator = ( const ISphSchema & rhs )
{
	rhs.AssignTo ( *this );
	return *this;
}
7620 
7621 
// attach a full index schema; fields are copied, attributes are referenced
// (we keep a pointer and only store our own additions/removals on top)
CSphRsetSchema & CSphRsetSchema::operator = ( const CSphSchema & rhs )
{
	Reset();
	m_dFields = rhs.m_dFields; // OPTIMIZE? sad but copied
	m_pIndexSchema = &rhs;

	// copy over dynamic rowitems map
	// so that the new attributes we might add would not overlap
	m_dDynamicUsed = rhs.m_dDynamicUsed;
	return *this;
}
7633 
7634 
// hide one of the attached index schema's attributes from this result set;
// iAttr is given in result-set numbering (i.e. already adjusted for
// previously removed attributes)
void CSphRsetSchema::RemoveStaticAttr ( int iAttr )
{
	assert ( m_pIndexSchema );
	assert ( iAttr>=0 );
	assert ( iAttr<( m_pIndexSchema->GetAttrsCount() - m_dRemoved.GetLength() ) );

	// map from rset indexes (adjusted for removal) to index schema indexes (the original ones)
	ARRAY_FOREACH_COND ( i, m_dRemoved, iAttr>=m_dRemoved[i] )
		iAttr++;
	// keep m_dRemoved sorted and duplicate-free; GetAttr/GetAttrIndex rely on that
	m_dRemoved.Add ( iAttr );
	m_dRemoved.Uniq();
}
7647 
7648 
// replace our extra attributes with the given set and detach from the index
// schema; used when a final projection (e.g. aliases) supersedes the original
void CSphRsetSchema::SwapAttrs ( CSphVector<CSphColumnInfo> & dAttrs )
{
#ifndef NDEBUG
	// ensure that every incoming column has a matching original column
	// only check locators and attribute types, because at this stage,
	// names that are used in dAttrs are already overwritten by the aliases
	// (example: SELECT col1 a, col2 b, count(*) c FROM test)
	//
	// FIXME? maybe also lockdown the schema from further swaps, adds etc from here?
	//
	// NOTE: ARRAY_ANY is a multi-statement macro that supplies its own trailing
	// semicolon, hence the seemingly unterminated assignments below
	ARRAY_FOREACH ( i, dAttrs )
	{
		if ( dAttrs[i].m_tLocator.IsID() )
			continue;
		bool bFound1 = false;
		if ( m_pIndexSchema )
		{
			const CSphVector<CSphColumnInfo> & dSrc = m_pIndexSchema->m_dAttrs;
			bFound1 = ARRAY_ANY ( bFound1, dSrc, dSrc[_any].m_tLocator==dAttrs[i].m_tLocator && dSrc[_any].m_eAttrType==dAttrs[i].m_eAttrType )
		}
		bool bFound2 = ARRAY_ANY ( bFound2, m_dExtraAttrs,
			m_dExtraAttrs[_any].m_tLocator==dAttrs[i].m_tLocator && m_dExtraAttrs[_any].m_eAttrType==dAttrs[i].m_eAttrType )
			assert ( bFound1 || bFound2 );
	}
#endif
	m_dExtraAttrs.SwapData ( dAttrs );
	m_pIndexSchema = NULL;
}
7676 
7677 
// deep-copy a match into pDst: release pDst's owned strings first,
// then copy the row (static part referenced, dynamic part duplicated)
// and duplicate any string/MVA pointers the schema owns
void CSphRsetSchema::CloneMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const
{
	assert ( pDst );
	FreeStringPtrs ( pDst );
	pDst->Combine ( rhs, GetDynamicSize() );
	CopyPtrs ( pDst, rhs );
}
7685 
7686 
7687 ///////////////////////////////////////////////////////////////////////////////
7688 // BIT-ENCODED FILE OUTPUT
7689 ///////////////////////////////////////////////////////////////////////////////
7690 
// buffered file writer; starts closed (m_iFD=-1) with a default 256 KB buffer
// that is lazily allocated on OpenFile()/SetFile()
CSphWriter::CSphWriter ()
	: m_sName ( "" )
	, m_iPos ( -1 )
	, m_iWritten ( 0 )

	, m_iFD ( -1 )
	, m_iPoolUsed ( 0 )
	, m_pBuffer ( NULL )
	, m_pPool ( NULL )
	, m_bOwnFile ( false )
	, m_pSharedOffset ( NULL )
	, m_iBufferSize	( 262144 )

	, m_bError ( false )
	, m_pError ( NULL )
{
	m_pThrottle = &g_tThrottle;
}
7709 
7710 
SetBufferSize(int iBufferSize)7711 void CSphWriter::SetBufferSize ( int iBufferSize )
7712 {
7713 	if ( iBufferSize!=m_iBufferSize )
7714 	{
7715 		m_iBufferSize = Max ( iBufferSize, 262144 );
7716 		SafeDeleteArray ( m_pBuffer );
7717 	}
7718 }
7719 
7720 
// create (or truncate) a file for writing; we own the descriptor and will
// close it in CloseFile(). returns false and sets sErrorBuffer on failure
bool CSphWriter::OpenFile ( const CSphString & sName, CSphString & sErrorBuffer )
{
	assert ( !sName.IsEmpty() );
	assert ( m_iFD<0 && "already open" );

	m_bOwnFile = true;
	m_sName = sName;
	m_pError = &sErrorBuffer;

	// lazy buffer allocation; SetBufferSize() may have dropped it
	if ( !m_pBuffer )
		m_pBuffer = new BYTE [ m_iBufferSize ];

	m_iFD = ::open ( m_sName.cstr(), SPH_O_NEW, 0644 );
	m_pPool = m_pBuffer;
	m_iPoolUsed = 0;
	m_iPos = 0;
	m_iWritten = 0;
	m_bError = ( m_iFD<0 );

	if ( m_bError )
		m_pError->SetSprintf ( "failed to create %s: %s" , sName.cstr(), strerror(errno) );

	return !m_bError;
}
7745 
7746 
// attach to an externally owned descriptor; the caller keeps ownership and
// pSharedOffset lets several writers/readers coordinate the file position
void CSphWriter::SetFile ( CSphAutofile & tAuto, SphOffset_t * pSharedOffset, CSphString & sError )
{
	assert ( m_iFD<0 && "already open" );
	m_bOwnFile = false;

	// lazy buffer allocation, same as OpenFile()
	if ( !m_pBuffer )
		m_pBuffer = new BYTE [ m_iBufferSize ];

	m_iFD = tAuto.GetFD();
	m_sName = tAuto.GetFilename();
	m_pPool = m_pBuffer;
	m_iPoolUsed = 0;
	m_iPos = 0;
	m_iWritten = 0;
	m_pSharedOffset = pSharedOffset;
	m_pError = &sError;
	assert ( m_pError );
}
7765 
7766 
// flush and close (if we own the descriptor), then release the buffer
CSphWriter::~CSphWriter ()
{
	CloseFile ();
	SafeDeleteArray ( m_pBuffer );
}
7772 
7773 
CloseFile(bool bTruncate)7774 void CSphWriter::CloseFile ( bool bTruncate )
7775 {
7776 	if ( m_iFD>=0 )
7777 	{
7778 		Flush ();
7779 		if ( bTruncate )
7780 			sphTruncate ( m_iFD );
7781 		if ( m_bOwnFile )
7782 			::close ( m_iFD );
7783 		m_iFD = -1;
7784 	}
7785 }
7786 
UnlinkFile()7787 void CSphWriter::UnlinkFile()
7788 {
7789 	if ( m_bOwnFile )
7790 	{
7791 		if ( m_iFD>=0 )
7792 			::close ( m_iFD );
7793 
7794 		m_iFD = -1;
7795 		::unlink ( m_sName.cstr() );
7796 		m_sName = "";
7797 	}
7798 	SafeDeleteArray ( m_pBuffer );
7799 }
7800 
7801 
// append a single byte (low 8 bits of data), flushing the buffer when full
void CSphWriter::PutByte ( int data )
{
	assert ( m_pPool );
	if ( m_iPoolUsed==m_iBufferSize )
		Flush ();
	*m_pPool++ = BYTE ( data & 0xff );
	m_iPoolUsed++;
	m_iPos++;
}
7811 
7812 
// append an arbitrary byte range, splitting it into buffer-sized chunks
// and flushing whenever a chunk would not fit into the remaining buffer
void CSphWriter::PutBytes ( const void * pData, int64_t iSize )
{
	assert ( m_pPool );
	const BYTE * pBuf = (const BYTE *) pData;
	while ( iSize>0 )
	{
		// never copy more than one buffer's worth at a time
		int iPut = ( iSize<m_iBufferSize ? int(iSize) : m_iBufferSize ); // comparison int64 to int32
		if ( m_iPoolUsed+iPut>m_iBufferSize )
			Flush ();
		assert ( m_iPoolUsed+iPut<=m_iBufferSize );

		memcpy ( m_pPool, pBuf, iPut );
		m_pPool += iPut;
		m_iPoolUsed += iPut;
		m_iPos += iPut;

		pBuf += iPut;
		iSize -= iPut;
	}
}
7833 
7834 
ZipInt(DWORD uValue)7835 void CSphWriter::ZipInt ( DWORD uValue )
7836 {
7837 	int iBytes = 1;
7838 
7839 	DWORD u = ( uValue>>7 );
7840 	while ( u )
7841 	{
7842 		u >>= 7;
7843 		iBytes++;
7844 	}
7845 
7846 	while ( iBytes-- )
7847 		PutByte (
7848 			( 0x7f & ( uValue >> (7*iBytes) ) )
7849 			| ( iBytes ? 0x80 : 0 ) );
7850 }
7851 
7852 
ZipOffset(uint64_t uValue)7853 void CSphWriter::ZipOffset ( uint64_t uValue )
7854 {
7855 	int iBytes = 1;
7856 
7857 	uint64_t u = ((uint64_t)uValue)>>7;
7858 	while ( u )
7859 	{
7860 		u >>= 7;
7861 		iBytes++;
7862 	}
7863 
7864 	while ( iBytes-- )
7865 		PutByte (
7866 			( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
7867 			| ( iBytes ? 0x80 : 0 ) );
7868 }
7869 
7870 
ZipOffsets(CSphVector<SphOffset_t> * pData)7871 void CSphWriter::ZipOffsets ( CSphVector<SphOffset_t> * pData )
7872 {
7873 	assert ( pData );
7874 
7875 	SphOffset_t * pValue = &((*pData)[0]);
7876 	int n = pData->GetLength ();
7877 
7878 	while ( n-->0 )
7879 	{
7880 		SphOffset_t uValue = *pValue++;
7881 
7882 		int iBytes = 1;
7883 
7884 		uint64_t u = ((uint64_t)uValue)>>7;
7885 		while ( u )
7886 		{
7887 			u >>= 7;
7888 			iBytes++;
7889 		}
7890 
7891 		while ( iBytes-- )
7892 			PutByte (
7893 				( 0x7f & (DWORD)( uValue >> (7*iBytes) ) )
7894 				| ( iBytes ? 0x80 : 0 ) );
7895 	}
7896 }
7897 
7898 
// write the buffered bytes out to disk; when a shared offset is attached,
// reposition first (another writer/reader may have moved the file position)
// and publish our new position afterwards. sets m_bError on write failure
void CSphWriter::Flush ()
{
	if ( m_pSharedOffset && *m_pSharedOffset!=m_iWritten )
		sphSeek ( m_iFD, m_iWritten, SEEK_SET );

	if ( !sphWriteThrottled ( m_iFD, m_pBuffer, m_iPoolUsed, m_sName.cstr(), *m_pError, m_pThrottle ) )
		m_bError = true;

	m_iWritten += m_iPoolUsed;
	m_iPoolUsed = 0;
	m_pPool = m_pBuffer;

	if ( m_pSharedOffset )
		*m_pSharedOffset = m_iWritten;
}
7914 
7915 
PutString(const char * szString)7916 void CSphWriter::PutString ( const char * szString )
7917 {
7918 	int iLen = szString ? strlen ( szString ) : 0;
7919 	PutDword ( iLen );
7920 	if ( iLen )
7921 		PutBytes ( szString, iLen );
7922 }
7923 
7924 
PutString(const CSphString & sString)7925 void CSphWriter::PutString ( const CSphString & sString )
7926 {
7927 	int iLen = sString.Length();
7928 	PutDword ( iLen );
7929 	if ( iLen )
7930 		PutBytes ( sString.cstr(), iLen );
7931 }
7932 
7933 
Tag(const char * sTag)7934 void CSphWriter::Tag ( const char * sTag )
7935 {
7936 	assert ( sTag && *sTag ); // empty tags are nonsense
7937 	assert ( strlen(sTag)<64 ); // huge tags are nonsense
7938 	PutBytes ( sTag, strlen(sTag) );
7939 }
7940 
7941 
// reposition the writer (backward only); a seek inside the still-unflushed
// buffer window just moves the buffer cursor, anything earlier discards the
// buffer and seeks the descriptor
void CSphWriter::SeekTo ( SphOffset_t iPos )
{
	assert ( iPos>=0 );

	if ( iPos>=m_iWritten && iPos<=( m_iWritten + m_iPoolUsed ) )
	{
		// seeking inside the buffer
		m_iPoolUsed = (int)( iPos - m_iWritten );
		m_pPool = m_pBuffer + m_iPoolUsed;
	} else
	{
		assert ( iPos<m_iWritten ); // seeking forward in a writer, we don't support it
		sphSeek ( m_iFD, iPos, SEEK_SET );

		// seeking outside the buffer; so the buffer must be discarded
		// also, current write position must be adjusted
		m_pPool = m_pBuffer;
		m_iPoolUsed = 0;
		m_iWritten = iPos;
	}
	m_iPos = iPos;
}
7964 
7965 ///////////////////////////////////////////////////////////////////////////////
7966 // BIT-ENCODED FILE INPUT
7967 ///////////////////////////////////////////////////////////////////////////////
7968 
// buffered file reader; may be given an external buffer (pBuf/iSize),
// otherwise one is lazily allocated on the first UpdateCache()
CSphReader::CSphReader ( BYTE * pBuf, int iSize )
	: m_pProfile ( NULL )
	, m_eProfileState ( SPH_QSTATE_IO )
	, m_iFD ( -1 )
	, m_iPos ( 0 )
	, m_iBuffPos ( 0 )
	, m_iBuffUsed ( 0 )
	, m_pBuff ( pBuf )
	, m_iSizeHint ( 0 )
	, m_iBufSize ( iSize )
	, m_bBufOwned ( false )
	, m_iReadUnhinted ( DEFAULT_READ_UNHINTED )
	, m_bError ( false )
{
	// an external buffer must come with a positive size
	assert ( pBuf==NULL || iSize>0 );
	m_pThrottle = &g_tThrottle;
}
7986 
7987 
// only free the buffer if we allocated it ourselves (see UpdateCache);
// the descriptor is never owned by the reader
CSphReader::~CSphReader ()
{
	if ( m_bBufOwned )
		SafeDeleteArray ( m_pBuff );
}
7993 
7994 
// tune buffer sizes; the read buffer size only takes effect before the
// buffer is allocated (a later call silently keeps the existing buffer)
void CSphReader::SetBuffers ( int iReadBuffer, int iReadUnhinted )
{
	if ( !m_pBuff )
		m_iBufSize = iReadBuffer;
	m_iReadUnhinted = iReadUnhinted;
}
8001 
8002 
// attach to a descriptor (not owned) and rewind all cache state;
// the filename is kept for error reporting only
void CSphReader::SetFile ( int iFD, const char * sFilename )
{
	m_iFD = iFD;
	m_iPos = 0;
	m_iBuffPos = 0;
	m_iBuffUsed = 0;
	m_sFilename = sFilename;
}
8011 
8012 
// convenience overload: attach to an autofile's descriptor and name
void CSphReader::SetFile ( const CSphAutofile & tFile )
{
	SetFile ( tFile.GetFD(), tFile.GetFilename() );
}
8017 
8018 
// detach from the current file and clear cache state
void CSphReader::Reset ()
{
	SetFile ( -1, "" );
}
8023 
8024 
8025 /// sizehint > 0 means we expect to read approx that much bytes
8026 /// sizehint == 0 means no hint, use default (happens later in UpdateCache())
8027 /// sizehint == -1 means reposition and adjust current hint
/// sizehint > 0 means we expect to read approx that much bytes
/// sizehint == 0 means no hint, use default (happens later in UpdateCache())
/// sizehint == -1 means reposition and adjust current hint
void CSphReader::SeekTo ( SphOffset_t iPos, int iSizeHint )
{
	assert ( iPos>=0 );
	assert ( iSizeHint>=-1 );

#ifndef NDEBUG
#if PARANOID
	struct_stat tStat;
	fstat ( m_iFD, &tStat );
	if ( iPos > tStat.st_size )
		sphDie ( "INTERNAL ERROR: seeking past the end of file" );
#endif
#endif

	// target still inside the cached window? just move the cursor
	if ( iPos>=m_iPos && iPos<m_iPos+m_iBuffUsed )
	{
		m_iBuffPos = (int)( iPos-m_iPos ); // reposition to proper byte
		m_iSizeHint = iSizeHint - ( m_iBuffUsed - m_iBuffPos ); // we already have some bytes cached, so let's adjust size hint
		assert ( m_iBuffPos<m_iBuffUsed );
	} else
	{
		// outside the window; drop the cache, UpdateCache() will refill lazily
		m_iPos = iPos;
		m_iBuffPos = 0; // for GetPos() to work properly, aaaargh
		m_iBuffUsed = 0;

		if ( iSizeHint==-1 )
		{
			// the adjustment bureau
			// we need to seek but still keep the current hint
			// happens on a skiplist jump, for instance
			int64_t iHintLeft = m_iPos + m_iSizeHint - iPos;
			if ( iHintLeft>0 && iHintLeft<INT_MAX )
				iSizeHint = (int)iHintLeft;
			else
				iSizeHint = 0;
		}

		// get that hint
		assert ( iSizeHint>=0 );
		m_iSizeHint = iSizeHint;
	}
}
8070 
8071 
// advance the read position by iCount bytes, shrinking the size hint accordingly
void CSphReader::SkipBytes ( int iCount )
{
	// 0 means "no hint", so this clamp works alright
	SeekTo ( m_iPos+m_iBuffPos+iCount, Max ( m_iSizeHint-m_iBuffPos-iCount, 0 ) );
}
8077 
8078 
8079 #if USE_WINDOWS
8080 
8081 // atomic seek+read for Windows
// atomic seek+read for Windows
// emulates POSIX pread() via ReadFile with an OVERLAPPED offset, so the
// file position is not disturbed; returns bytes read, 0 on EOF, -1 on error
int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
{
	if ( iBytes==0 )
		return 0;

	CSphIOStats * pIOStats = GetIOStats();
	int64_t tmStart = 0;
	if ( pIOStats )
		tmStart = sphMicroTimer();

	// translate the CRT descriptor back into a Win32 handle
	HANDLE hFile;
	hFile = (HANDLE) _get_osfhandle ( iFD );
	if ( hFile==INVALID_HANDLE_VALUE )
		return -1;

	STATIC_SIZE_ASSERT ( SphOffset_t, 8 );
	OVERLAPPED tOverlapped = { 0 };
	tOverlapped.Offset = (DWORD)( iOffset & I64C(0xffffffff) );
	tOverlapped.OffsetHigh = (DWORD)( iOffset>>32 );

	DWORD uRes;
	if ( !ReadFile ( hFile, pBuf, iBytes, &uRes, &tOverlapped ) )
	{
		// reading past EOF is a normal condition, report it as 0 bytes
		DWORD uErr = GetLastError();
		if ( uErr==ERROR_HANDLE_EOF )
			return 0;

		errno = uErr; // FIXME! should remap from Win to POSIX
		return -1;
	}

	if ( pIOStats )
	{
		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
		pIOStats->m_iReadOps++;
		pIOStats->m_iReadBytes += iBytes;
	}

	return uRes;
}
8122 
8123 #else
8124 #if HAVE_PREAD
8125 
8126 // atomic seek+read for non-Windows systems with pread() call
sphPread(int iFD,void * pBuf,int iBytes,SphOffset_t iOffset)8127 int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
8128 {
8129 	CSphIOStats * pIOStats = GetIOStats();
8130 	if ( !pIOStats )
8131 		return ::pread ( iFD, pBuf, iBytes, iOffset );
8132 
8133 	int64_t tmStart = sphMicroTimer();
8134 	int iRes = (int) ::pread ( iFD, pBuf, iBytes, iOffset );
8135 	if ( pIOStats )
8136 	{
8137 		pIOStats->m_iReadTime += sphMicroTimer() - tmStart;
8138 		pIOStats->m_iReadOps++;
8139 		pIOStats->m_iReadBytes += iBytes;
8140 	}
8141 	return iRes;
8142 }
8143 
8144 #else
8145 
8146 // generic fallback; prone to races between seek and read
// generic fallback; prone to races between seek and read
// (another thread touching the same descriptor can move the position
// between the two calls — platforms without pread() get no better)
int sphPread ( int iFD, void * pBuf, int iBytes, SphOffset_t iOffset )
{
	if ( sphSeek ( iFD, iOffset, SEEK_SET )==-1 )
		return -1;

	return sphReadThrottled ( iFD, pBuf, iBytes, &g_tThrottle );
}
8154 
8155 #endif // HAVE_PREAD
8156 #endif // USE_WINDOWS
8157 
8158 
// refill the read cache from the current stream position; on success
// m_iBuffUsed bytes are available at m_pBuff and m_iBuffPos is reset to 0,
// on failure m_bError/m_sError are set and the cache is left empty
void CSphReader::UpdateCache ()
{
	// account the IO into the active profile, if any
	ESphQueryState eOld = SPH_QSTATE_TOTAL;
	if ( m_pProfile )
		eOld = m_pProfile->Switch ( m_eProfileState );

	assert ( m_iFD>=0 );

	// alloc buf on first actual read
	if ( !m_pBuff )
	{
		if ( m_iBufSize<=0 )
			m_iBufSize = DEFAULT_READ_BUFFER;

		m_bBufOwned = true;
		m_pBuff = new BYTE [ m_iBufSize ];
	}

	// stream position could be changed externally
	// so let's just hope that the OS optimizes redundant seeks
	SphOffset_t iNewPos = m_iPos + Min ( m_iBuffPos, m_iBuffUsed );

	// no hint means "read the default unhinted chunk"
	if ( m_iSizeHint<=0 )
		m_iSizeHint = ( m_iReadUnhinted>0 ) ? m_iReadUnhinted : DEFAULT_READ_UNHINTED;
	int iReadLen = Min ( m_iSizeHint, m_iBufSize );

	m_iBuffPos = 0;
	m_iBuffUsed = sphPread ( m_iFD, m_pBuff, iReadLen, iNewPos ); // FIXME! what about throttling?

	if ( m_iBuffUsed<0 )
	{
		m_iBuffUsed = m_iBuffPos = 0;
		m_bError = true;
		m_sError.SetSprintf ( "pread error in %s: pos=" INT64_FMT ", len=%d, code=%d, msg=%s",
			m_sFilename.cstr(), (int64_t)iNewPos, iReadLen, errno, strerror(errno) );
		if ( m_pProfile )
			m_pProfile->Switch ( eOld );
		return;
	}

	// all fine, adjust offset and hint
	m_iSizeHint -= m_iBuffUsed;
	m_iPos = iNewPos;
	if ( m_pProfile )
		m_pProfile->Switch ( eOld );
}
8205 
8206 
GetByte()8207 int CSphReader::GetByte ()
8208 {
8209 	if ( m_iBuffPos>=m_iBuffUsed )
8210 	{
8211 		UpdateCache ();
8212 		if ( m_iBuffPos>=m_iBuffUsed )
8213 			return 0; // unexpected io failure
8214 	}
8215 
8216 	assert ( m_iBuffPos<m_iBuffUsed );
8217 	return m_pBuff [ m_iBuffPos++ ];
8218 }
8219 
8220 
// read iSize bytes into pData; on io failure the destination is zero-filled
// and m_bError is set by UpdateCache()
void CSphReader::GetBytes ( void * pData, int iSize )
{
	BYTE * pOut = (BYTE*) pData;

	// phase 1: requests larger than the whole buffer are drained
	// buffer-by-buffer until the remainder fits
	while ( iSize>m_iBufSize )
	{
		int iLen = m_iBuffUsed - m_iBuffPos;
		assert ( iLen<=m_iBufSize );

		memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
		m_iBuffPos += iLen;
		pOut += iLen;
		iSize -= iLen;
		m_iSizeHint = iSize; // FIXME!

		if ( iSize>0 )
		{
			UpdateCache ();
			if ( !m_iBuffUsed )
			{
				memset ( pData, 0, iSize );
				return; // unexpected io failure
			}
		}
	}

	// phase 2: the remainder fits into one buffer, but may straddle
	// the current cache window
	if ( m_iBuffPos+iSize>m_iBuffUsed )
	{
		// move old buffer tail to buffer head to avoid losing the data
		const int iLen = m_iBuffUsed - m_iBuffPos;
		if ( iLen>0 )
		{
			memcpy ( pOut, m_pBuff+m_iBuffPos, iLen );
			m_iBuffPos += iLen;
			pOut += iLen;
			iSize -= iLen;
		}

		m_iSizeHint = iSize - m_iBuffUsed + m_iBuffPos; // FIXME!
		UpdateCache ();
		if ( m_iBuffPos+iSize>m_iBuffUsed )
		{
			memset ( pData, 0, iSize ); // unexpected io failure
			return;
		}
	}

	// the remainder is fully cached now; copy it out
	assert ( (m_iBuffPos+iSize)<=m_iBuffUsed );
	memcpy ( pOut, m_pBuff+m_iBuffPos, iSize );
	m_iBuffPos += iSize;
}
8272 
8273 
GetBytesZerocopy(const BYTE ** ppData,int iMax)8274 int CSphReader::GetBytesZerocopy ( const BYTE ** ppData, int iMax )
8275 {
8276 	if ( m_iBuffPos>=m_iBuffUsed )
8277 	{
8278 		UpdateCache ();
8279 		if ( m_iBuffPos>=m_iBuffUsed )
8280 			return 0; // unexpected io failure
8281 	}
8282 
8283 	int iChunk = Min ( m_iBuffUsed-m_iBuffPos, iMax );
8284 	*ppData = m_pBuff + m_iBuffPos;
8285 	m_iBuffPos += iChunk;
8286 	return iChunk;
8287 }
8288 
8289 
GetLine(char * sBuffer,int iMaxLen)8290 int CSphReader::GetLine ( char * sBuffer, int iMaxLen )
8291 {
8292 	int iOutPos = 0;
8293 	iMaxLen--; // reserve space for trailing '\0'
8294 
8295 	// grab as many chars as we can
8296 	while ( iOutPos<iMaxLen )
8297 	{
8298 		// read next chunk if necessary
8299 		if ( m_iBuffPos>=m_iBuffUsed )
8300 		{
8301 			UpdateCache ();
8302 			if ( m_iBuffPos>=m_iBuffUsed )
8303 			{
8304 				if ( iOutPos==0 ) return -1; // current line is empty; indicate eof
8305 				break; // return current line; will return eof next time
8306 			}
8307 		}
8308 
8309 		// break on CR or LF
8310 		if ( m_pBuff[m_iBuffPos]=='\r' || m_pBuff[m_iBuffPos]=='\n' )
8311 			break;
8312 
8313 		// one more valid char
8314 		sBuffer[iOutPos++] = m_pBuff[m_iBuffPos++];
8315 	}
8316 
8317 	// skip everything until the newline or eof
8318 	for ( ;; )
8319 	{
8320 		// read next chunk if necessary
8321 		if ( m_iBuffPos>=m_iBuffUsed )
8322 			UpdateCache ();
8323 
8324 		// eof?
8325 		if ( m_iBuffPos>=m_iBuffUsed )
8326 			break;
8327 
8328 		// newline?
8329 		if ( m_pBuff[m_iBuffPos++]=='\n' )
8330 			break;
8331 	}
8332 
8333 	// finalize
8334 	sBuffer[iOutPos] = '\0';
8335 	return iOutPos;
8336 }
8337 
// clear the sticky error flag and message so reading may resume
void CSphReader::ResetError()
{
	m_bError = false;
	m_sError = "";
}
8343 
8344 /////////////////////////////////////////////////////////////////////////////
8345 
#if PARANOID

// varint (7-bit, MSB-first, 0x80 continuation flag) decoder, paranoid flavor:
// additionally asserts that the encoded value fits the target type
#define SPH_VARINT_DECODE(_type,_getexpr) \
	DWORD b = 0; \
	_type v = 0; \
	int it = 0; \
	do { b = _getexpr; v = ( v<<7 ) + ( b&0x7f ); it++; } while ( b&0x80 ); \
	assert ( (it-1)*7<=sizeof(_type)*8 ); \
	return v;

#else

// varint (7-bit, MSB-first, 0x80 continuation flag) decoder;
// accumulates 7 bits per byte while the continuation bit is set
#define SPH_VARINT_DECODE(_type,_getexpr) \
	DWORD b = _getexpr; \
	_type res = 0; \
	while ( b & 0x80 ) \
	{ \
		res = ( res<<7 ) + ( b & 0x7f ); \
		b = _getexpr; \
	} \
	res = ( res<<7 ) + b; \
	return res;

#endif // PARANOID
8370 
// decode a varint from a memory buffer; advances the buffer pointer
DWORD sphUnzipInt ( const BYTE * & pBuf )			{ SPH_VARINT_DECODE ( DWORD, *pBuf++ ); }
// ditto, 64-bit flavor (used for file offsets)
SphOffset_t sphUnzipOffset ( const BYTE * & pBuf )	{ SPH_VARINT_DECODE ( SphOffset_t, *pBuf++ ); }

// decode a varint from the reader stream, byte by byte
DWORD CSphReader::UnzipInt ()			{ SPH_VARINT_DECODE ( DWORD, GetByte() ); }
uint64_t CSphReader::UnzipOffset ()	{ SPH_VARINT_DECODE ( uint64_t, GetByte() ); }
8376 
8377 
8378 #if USE_64BIT
8379 #define sphUnzipWordid sphUnzipOffset
8380 #else
8381 #define sphUnzipWordid sphUnzipInt
8382 #endif
8383 
8384 /////////////////////////////////////////////////////////////////////////////
8385 
// "copy" a reader by attaching to the same file and seeking to the same
// logical position; cache contents are NOT copied (they get refilled lazily)
const CSphReader & CSphReader::operator = ( const CSphReader & rhs )
{
	SetFile ( rhs.m_iFD, rhs.m_sFilename.cstr() );
	SeekTo ( rhs.m_iPos + rhs.m_iBuffPos, rhs.m_iSizeHint );
	return *this;
}
8392 
8393 
// read a raw (host byte order) DWORD; returns 0 on io failure
DWORD CSphReader::GetDword ()
{
	DWORD uRes = 0;
	GetBytes ( &uRes, sizeof(DWORD) );
	return uRes;
}
8400 
8401 
// read a raw (host byte order) 64-bit offset; returns 0 on io failure
SphOffset_t CSphReader::GetOffset ()
{
	SphOffset_t uRes = 0;
	GetBytes ( &uRes, sizeof(SphOffset_t) );
	return uRes;
}
8408 
8409 
// read a DWORD-length-prefixed string (the format PutString() writes);
// a zero length yields an empty string
CSphString CSphReader::GetString ()
{
	CSphString sRes;

	DWORD iLen = GetDword ();
	if ( iLen )
	{
		// the stored bytes carry no terminator; go through a temp buffer
		char * sBuf = new char [ iLen ];
		GetBytes ( sBuf, iLen );
		sRes.SetBinary ( sBuf, iLen );
		SafeDeleteArray ( sBuf );
	}

	return sRes;
}
8425 
// verify that the next bytes in the stream match the given marker tag;
// sets the error state and returns false on mismatch (or prior error)
// NOTE(review): sBuf is 64 bytes and only an assert guards strlen(sTag)<64;
// callers are expected to pass short internal tags only
bool CSphReader::Tag ( const char * sTag )
{
	if ( m_bError )
		return false;

	assert ( sTag && *sTag ); // empty tags are nonsense
	assert ( strlen(sTag)<64 ); // huge tags are nonsense

	int iLen = strlen(sTag);
	char sBuf[64];
	GetBytes ( sBuf, iLen );
	if ( !memcmp ( sBuf, sTag, iLen ) )
		return true;
	m_bError = true;
	m_sError.SetSprintf ( "expected tag %s was not found", sTag );
	return false;
}
8443 
8444 //////////////////////////////////////////////////////////////////////////
8445 
// autoreader owns its descriptor; close it on destruction
CSphAutoreader::~CSphAutoreader ()
{
	Close ();
}
8450 
8451 
Open(const CSphString & sFilename,CSphString & sError)8452 bool CSphAutoreader::Open ( const CSphString & sFilename, CSphString & sError )
8453 {
8454 	assert ( m_iFD<0 );
8455 	assert ( !sFilename.IsEmpty() );
8456 
8457 	m_iFD = ::open ( sFilename.cstr(), SPH_O_READ, 0644 );
8458 	m_iPos = 0;
8459 	m_iBuffPos = 0;
8460 	m_iBuffUsed = 0;
8461 	m_sFilename = sFilename;
8462 
8463 	if ( m_iFD<0 )
8464 		sError.SetSprintf ( "failed to open %s: %s", sFilename.cstr(), strerror(errno) );
8465 	return ( m_iFD>=0 );
8466 }
8467 
8468 
// close the owned descriptor; safe to call repeatedly
void CSphAutoreader::Close ()
{
	if ( m_iFD>=0 )
		::close ( m_iFD	);
	m_iFD = -1;
}
8475 
8476 
GetFilesize()8477 SphOffset_t CSphAutoreader::GetFilesize ()
8478 {
8479 	assert ( m_iFD>=0 );
8480 
8481 	struct_stat st;
8482 	if ( m_iFD<0 || fstat ( m_iFD, &st )<0 )
8483 		return -1;
8484 
8485 	return st.st_size;
8486 }
8487 
8488 /////////////////////////////////////////////////////////////////////////////
8489 // QUERY RESULT
8490 /////////////////////////////////////////////////////////////////////////////
8491 
// zero-initialize all counters and pointers of a fresh query result
CSphQueryResult::CSphQueryResult ()
{
	m_iQueryTime = 0;
	m_iRealQueryTime = 0;
	m_iCpuTime = 0;
	m_iMultiplier = 1;
	m_iTotalMatches = 0;
	m_pMva = NULL;
	m_pStrings = NULL;
	m_iOffset = 0;
	m_iCount = 0;
	m_iSuccesses = 0;
	m_pProfile = NULL;
	m_bArenaProhibit = false;
}
8507 
8508 
// free the owned storage blobs (unless LeakStorages() moved them away)
// and release per-match string pointers owned by the schema
CSphQueryResult::~CSphQueryResult ()
{
	ARRAY_FOREACH ( i, m_dStorage2Free )
	{
		SafeDeleteArray ( m_dStorage2Free[i] );
	}
	ARRAY_FOREACH ( i, m_dMatches )
		m_tSchema.FreeStringPtrs ( &m_dMatches[i] );
}
8518 
// transfer ownership of all storage blobs to tDst;
// after this call our destructor will no longer free them
void CSphQueryResult::LeakStorages ( CSphQueryResult & tDst )
{
	ARRAY_FOREACH ( i, m_dStorage2Free )
		tDst.m_dStorage2Free.Add ( m_dStorage2Free[i] );

	m_dStorage2Free.Reset();
}
8526 
8527 
8528 /////////////////////////////////////////////////////////////////////////////
8529 // CHUNK READER
8530 /////////////////////////////////////////////////////////////////////////////
8531 
// chunk reader over a shared temp file region; the buffer is allocated
// later in Init(). with a word dict, the current hit keyword points into
// our own m_sKeyword storage
CSphBin::CSphBin ( ESphHitless eMode, bool bWordDict )
	: m_eMode ( eMode )
	, m_dBuffer ( NULL )
	, m_pCurrent ( NULL )
	, m_iLeft ( 0 )
	, m_iDone ( 0 )
	, m_eState ( BIN_POS )
	, m_bWordDict ( bWordDict )
	, m_bError ( false )
	, m_iFile ( -1 )
	, m_pFilePos ( NULL )
	, m_iFilePos ( 0 )
	, m_iFileLeft ( 0 )
{
	m_tHit.m_sKeyword = bWordDict ? m_sKeyword : NULL;
	m_sKeyword[0] = '\0';
	m_pThrottle = &g_tThrottle;

#ifndef NDEBUG
	// debug-only tracking of hit ordering
	m_iLastWordID = 0;
	m_sLastKeyword[0] = '\0';
#endif
}
8555 
8556 
CalcBinSize(int iMemoryLimit,int iBlocks,const char * sPhase,bool bWarn)8557 int CSphBin::CalcBinSize ( int iMemoryLimit, int iBlocks, const char * sPhase, bool bWarn )
8558 {
8559 	if ( iBlocks<=0 )
8560 		return CSphBin::MIN_SIZE;
8561 
8562 	int iBinSize = ( ( iMemoryLimit/iBlocks + 2048 ) >> 12 ) << 12; // round to 4k
8563 
8564 	if ( iBinSize<CSphBin::MIN_SIZE )
8565 	{
8566 		iBinSize = CSphBin::MIN_SIZE;
8567 		sphWarn ( "%s: mem_limit=%d kb extremely low, increasing to %d kb",
8568 			sPhase, iMemoryLimit/1024, iBinSize*iBlocks/1024 );
8569 	}
8570 
8571 	if ( iBinSize<CSphBin::WARN_SIZE && bWarn )
8572 	{
8573 		sphWarn ( "%s: merge_block_size=%d kb too low, increasing mem_limit may improve performance",
8574 			sPhase, iBinSize/1024 );
8575 	}
8576 
8577 	return iBinSize;
8578 }
8579 
8580 
// attach the bin to its file region and allocate the read buffer;
// pSharedOffset coordinates the real file position among all bins
// that read from the same descriptor
void CSphBin::Init ( int iFD, SphOffset_t * pSharedOffset, const int iBinSize )
{
	assert ( !m_dBuffer );
	assert ( iBinSize>=MIN_SIZE );
	assert ( pSharedOffset );

	m_iFile = iFD;
	m_pFilePos = pSharedOffset;

	m_iSize = iBinSize;
	m_dBuffer = new BYTE [ iBinSize ];
	m_pCurrent = m_dBuffer;

	// reset the current-hit state to "nothing read yet"
	m_tHit.m_uDocID = 0;
	m_tHit.m_uWordID = 0;
	m_tHit.m_iWordPos = EMPTY_HIT;
	m_tHit.m_dFieldMask.UnsetAll();

	m_bError = false;
}
8601 
8602 
// release the read buffer; the descriptor is shared and not closed here
CSphBin::~CSphBin ()
{
	SafeDeleteArray ( m_dBuffer );
}
8607 
8608 
int CSphBin::ReadByte ()
{
	// Return the next byte of the bin, refilling the buffer from disk when
	// it runs dry. Returns -2 on a read error and -1 on unexpected EOF;
	// both also raise m_bError.
	BYTE r;

	if ( !m_iLeft )
	{
		// reposition the shared fd if another bin moved it since our last read
		if ( *m_pFilePos!=m_iFilePos )
		{
			sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
			*m_pFilePos = m_iFilePos;
		}

		// refill with up to one full buffer of the remaining file tail
		int n = m_iFileLeft > m_iSize
			? m_iSize
			: (int)m_iFileLeft;
		if ( n==0 )
		{
			// nothing left on disk; m_iLeft=1 keeps the decrement below benign
			m_iDone = 1;
			m_iLeft = 1;
		} else
		{
			assert ( m_dBuffer );

			if ( sphReadThrottled ( m_iFile, m_dBuffer, n, m_pThrottle )!=(size_t)n )
			{
				m_bError = true;
				return -2;
			}
			m_iLeft = n;

			// advance both our private and the shared file position
			m_iFilePos += n;
			m_iFileLeft -= n;
			m_pCurrent = m_dBuffer;
			*m_pFilePos += n;
		}
	}
	if ( m_iDone )
	{
		m_bError = true; // unexpected (!) eof
		return -1;
	}

	m_iLeft--;
	r = *(m_pCurrent);
	m_pCurrent++;
	return r;
}
8656 
8657 
ESphBinRead CSphBin::ReadBytes ( void * pDest, int iBytes )
{
	// Copy the next iBytes bytes of the bin into pDest, refilling the
	// buffer from disk as needed. iBytes must fit into the buffer.
	// Returns BIN_READ_OK, or BIN_READ_EOF/BIN_READ_ERROR on failure
	// (the latter two also raise m_bError where appropriate).
	assert ( iBytes>0 );
	assert ( iBytes<=m_iSize );

	if ( m_iDone )
		return BIN_READ_EOF;

	if ( m_iLeft<iBytes )
	{
		// reposition the shared fd if another bin moved it since our last read
		if ( *m_pFilePos!=m_iFilePos )
		{
			sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
			*m_pFilePos = m_iFilePos;
		}

		int n = Min ( m_iFileLeft, m_iSize - m_iLeft );
		if ( n==0 )
		{
			m_iDone = 1;
			m_bError = true; // unexpected (!) eof
			return BIN_READ_EOF;
		}

		// keep the unread tail at the buffer head, then top up from disk
		assert ( m_dBuffer );
		memmove ( m_dBuffer, m_pCurrent, m_iLeft );

		if ( sphReadThrottled ( m_iFile, m_dBuffer + m_iLeft, n, m_pThrottle )!=(size_t)n )
		{
			m_bError = true;
			return BIN_READ_ERROR;
		}

		// advance both our private and the shared file position
		m_iLeft += n;
		m_iFilePos += n;
		m_iFileLeft -= n;
		m_pCurrent = m_dBuffer;
		*m_pFilePos += n;
	}

	assert ( m_iLeft>=iBytes );
	m_iLeft -= iBytes;

	memcpy ( pDest, m_pCurrent, iBytes );
	m_pCurrent += iBytes;

	return BIN_READ_OK;
}
8706 
8707 
ReadVLB()8708 SphWordID_t CSphBin::ReadVLB ()
8709 {
8710 	SphWordID_t uValue = 0;
8711 	int iByte, iOffset = 0;
8712 	do
8713 	{
8714 		if ( ( iByte = ReadByte() )<0 )
8715 			return 0;
8716 		uValue += ( ( SphWordID_t ( iByte & 0x7f ) ) << iOffset );
8717 		iOffset += 7;
8718 	}
8719 	while ( iByte & 0x80 );
8720 	return uValue;
8721 }
8722 
UnzipInt()8723 DWORD CSphBin::UnzipInt ()
8724 {
8725 	int b = 0;
8726 	DWORD v = 0;
8727 	do
8728 	{
8729 		b = ReadByte();
8730 		if ( b<0 )
8731 			b = 0;
8732 		v = ( v<<7 ) + ( b & 0x7f );
8733 	} while ( b & 0x80 );
8734 	return v;
8735 }
8736 
UnzipOffset()8737 SphOffset_t CSphBin::UnzipOffset ()
8738 {
8739 	int b = 0;
8740 	SphOffset_t v = 0;
8741 	do
8742 	{
8743 		b = ReadByte();
8744 		if ( b<0 )
8745 			b = 0;
8746 		v = ( v<<7 ) + ( b & 0x7f );
8747 	} while ( b & 0x80 );
8748 	return v;
8749 }
8750 
int CSphBin::ReadHit ( CSphAggregateHit * pOut, int iRowitems, CSphRowitem * pRowitems )
{
	// Decode the next aggregate hit from the bin into pOut.
	// The bin is a delta-encoded stream of nested (word, doc, position)
	// levels; a zero delta pops one level up (POS -> DOC -> WORD -> EOB).
	// End-of-bin is reported as a hit with a zero m_uWordID.

	// expected EOB
	if ( m_iDone )
	{
		pOut->m_uWordID = 0;
		return 1;
	}

	CSphAggregateHit & tHit = m_tHit; // shortcut
	for ( ;; )
	{
		// SPH_MAX_WORD_LEN is now 42 only to keep ReadVLB() below
		// technically, we can just use different functions on different paths, if ever needed
		STATIC_ASSERT ( SPH_MAX_WORD_LEN*3<=127, KEYWORD_TOO_LONG );
		SphWordID_t uDelta = ReadVLB();

		if ( uDelta )
		{
			// non-zero delta: advance within the current level
			switch ( m_eState )
			{
				case BIN_WORD:
					if ( m_bWordDict )
					{
#ifdef NDEBUG
						// FIXME?! move this under PARANOID or something?
						// or just introduce an assert() checked release build?
						if ( uDelta>=sizeof(m_sKeyword) )
							sphDie ( "INTERNAL ERROR: corrupted keyword length (len=" UINT64_FMT ", deltapos=" UINT64_FMT ")",
								(uint64_t)uDelta, (uint64_t)(m_iFilePos-m_iLeft) );
#else
						assert ( uDelta>0 && uDelta<sizeof(m_sKeyword)-1 );
#endif

						// keyword dict: the delta is a keyword length, not an id delta
						ReadBytes ( m_sKeyword, (int)uDelta );
						m_sKeyword[uDelta] = '\0';
						tHit.m_uWordID = sphCRC32 ( m_sKeyword ); // must be in sync with dict!

#ifndef NDEBUG
						// bins must deliver keywords in strictly ascending (id, keyword) order
						assert ( ( m_iLastWordID<tHit.m_uWordID )
							|| ( m_iLastWordID==tHit.m_uWordID && strcmp ( (char*)m_sLastKeyword, (char*)m_sKeyword )<0 ) );
						strncpy ( (char*)m_sLastKeyword, (char*)m_sKeyword, sizeof(m_sLastKeyword) );
#endif

					} else
					{
						tHit.m_uWordID += uDelta;
					}
					// a new word resets the doc/pos levels
					tHit.m_uDocID = 0;
					tHit.m_iWordPos = EMPTY_HIT;
					tHit.m_dFieldMask.UnsetAll();
					m_eState = BIN_DOC;
					break;

				case BIN_DOC:
					// doc id
					m_eState = BIN_POS;
					tHit.m_uDocID += uDelta;
					tHit.m_iWordPos = EMPTY_HIT;
					for ( int i=0; i<iRowitems; i++, pRowitems++ )
						*pRowitems = (DWORD)ReadVLB(); // FIXME? check range?
					break;

				case BIN_POS:
					if ( m_eMode==SPH_HITLESS_ALL )
					{
						// fully hitless index: the payload is a field mask, not a position
						tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
						m_eState = BIN_DOC;

					} else if ( m_eMode==SPH_HITLESS_SOME )
					{
						// low bit flags whether this particular word is stored hitless
						if ( uDelta & 1 )
						{
							tHit.m_dFieldMask.Assign32 ( (DWORD)ReadVLB() );
							m_eState = BIN_DOC;
						}
						uDelta >>= 1;
					}
					tHit.m_iWordPos += (DWORD)uDelta;
					*pOut = tHit;
					return 1;

				default:
					sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
			}
		} else
		{
			// zero delta: current level is exhausted, pop one level up
			switch ( m_eState )
			{
				case BIN_POS:	m_eState = BIN_DOC; break;
				case BIN_DOC:	m_eState = BIN_WORD; break;
				case BIN_WORD:	m_iDone = 1; pOut->m_uWordID = 0; return 1;
				default:		sphDie ( "INTERNAL ERROR: unknown bin state (state=%d)", m_eState );
			}
		}
	}
}
8848 
8849 
IsEOF() const8850 bool CSphBin::IsEOF () const
8851 {
8852 	return m_iDone!=0 || m_iFileLeft<=0;
8853 }
8854 
8855 
IsDone() const8856 bool CSphBin::IsDone () const
8857 {
8858 	return m_iDone!=0 || ( m_iFileLeft<=0 && m_iLeft<=0 );
8859 }
8860 
8861 
Precache()8862 ESphBinRead CSphBin::Precache ()
8863 {
8864 	if ( m_iFileLeft > m_iSize-m_iLeft )
8865 	{
8866 		m_bError = true;
8867 		return BIN_PRECACHE_ERROR;
8868 	}
8869 
8870 	if ( !m_iFileLeft )
8871 		return BIN_PRECACHE_OK;
8872 
8873 	if ( *m_pFilePos!=m_iFilePos )
8874 	{
8875 		sphSeek ( m_iFile, m_iFilePos, SEEK_SET );
8876 		*m_pFilePos = m_iFilePos;
8877 	}
8878 
8879 	assert ( m_dBuffer );
8880 	memmove ( m_dBuffer, m_pCurrent, m_iLeft );
8881 
8882 	if ( sphReadThrottled ( m_iFile, m_dBuffer+m_iLeft, m_iFileLeft, m_pThrottle )!=(size_t)m_iFileLeft )
8883 	{
8884 		m_bError = true;
8885 		return BIN_READ_ERROR;
8886 	}
8887 
8888 	m_iLeft += m_iFileLeft;
8889 	m_iFilePos += m_iFileLeft;
8890 	m_iFileLeft -= m_iFileLeft;
8891 	m_pCurrent = m_dBuffer;
8892 	*m_pFilePos += m_iFileLeft;
8893 
8894 	return BIN_PRECACHE_OK;
8895 }
8896 
8897 
8898 //////////////////////////////////////////////////////////////////////////
8899 // INDEX SETTINGS
8900 //////////////////////////////////////////////////////////////////////////
8901 
// default-construct index settings with every feature disabled:
// no docinfo storage, plain hit format, no HTML stripping, no hitless
// words, no embedded-file limit, no bigrams, no AOT filters, no RLP
CSphIndexSettings::CSphIndexSettings ()
	: m_eDocinfo			( SPH_DOCINFO_NONE )
	, m_eHitFormat			( SPH_HIT_FORMAT_PLAIN )
	, m_bHtmlStrip			( false )
	, m_eHitless			( SPH_HITLESS_NONE )
	, m_bVerbose			( false )
	, m_iEmbeddedLimit		( 0 )
	, m_eBigramIndex		( SPH_BIGRAM_NONE )
	, m_uAotFilterMask		( 0 )
	, m_eChineseRLP			( SPH_RLP_NONE )
{
}
8914 
8915 //////////////////////////////////////////////////////////////////////////
8916 // GLOBAL MVA STORAGE ARENA
8917 //////////////////////////////////////////////////////////////////////////
8918 
/// visitor interface used to walk arena allocations (see CSphArena::ExamineTag);
/// Reset() is called once before the walk, TestData() once per allocation index
class tTester : public ISphNoncopyable
{
public:
	virtual void Reset() = 0;
	virtual void TestData ( int iData ) = 0;
	virtual ~tTester() {}
};
8926 
/// shared-memory arena allocator
/// manages small tagged dword strings, upto 4096 bytes in size
class CSphArena
{
public:
							CSphArena ();
							~CSphArena ();

	DWORD *					ReInit ( int uMaxBytes );
	const char *			GetError () const { return m_sError.cstr(); }

	int						TaggedAlloc ( int iTag, int iBytes );
	void					TaggedFreeIndex ( int iTag, int iIndex );
	void					TaggedFreeTag ( int iTag );

	void					ExamineTag ( tTester* pTest, int iTag );

protected:
	static const int		MIN_BITS	= 4;
	static const int		MAX_BITS	= 12;
	static const int		NUM_SIZES	= MAX_BITS-MIN_BITS+2;	///< one for 0 (empty pages), and one for each size from min to max

	static const int		PAGE_SIZE	= 1<<MAX_BITS;
	static const int		PAGE_ALLOCS	= 1<<( MAX_BITS-MIN_BITS);
	static const int		PAGE_BITMAP	= ( PAGE_ALLOCS+8*sizeof(DWORD)-1 )/( 8*sizeof(DWORD) );

	static const int		MAX_TAGS		= 1024;
	static const int		MAX_LOGENTRIES	= 29;

	///< page descriptor
	struct PageDesc_t
	{
		int					m_iSizeBits;			///< alloc size
		int					m_iPrev;				///< prev free page of this size
		int					m_iNext;				///< next free page of this size
		int					m_iUsed;				///< usage count
		DWORD				m_uBitmap[PAGE_BITMAP];	///< usage bitmap
	};

	///< tag descriptor
	struct TagDesc_t
	{
		int					m_iTag;					///< tag value
		int					m_iAllocs;				///< active allocs
		int					m_iLogHead;				///< pointer to head allocs log entry
	};

	///< allocs log entry
	struct AllocsLogEntry_t
	{
		int					m_iUsed;
		int					m_iNext;
		int					m_dEntries[MAX_LOGENTRIES];
	};
	STATIC_SIZE_ASSERT ( AllocsLogEntry_t, 124 );

protected:
	DWORD *					Init ( int uMaxBytes );
	int						RawAlloc ( int iBytes );
	void					RawFree ( int iIndex );
	void					RemoveTag ( TagDesc_t * pTag );

protected:
	CSphProcessSharedMutex	m_tProcMutex;
	CSphMutex				m_tThdMutex;

	int						m_iPages;			///< max pages count
	CSphSharedBuffer<DWORD>	m_pArena;			///< arena that stores everything (all other pointers point here)

	PageDesc_t *			m_pPages;			///< page descriptors
	int *					m_pFreelistHeads;	///< free-list heads
	int *					m_pTagCount;		///< active tag count (lives inside the shared arena)
	TagDesc_t *				m_pTags;			///< tag descriptors, kept sorted by m_iTag

	DWORD *					m_pBasePtr;			///< base data storage pointer
	CSphString				m_sError;			///< last error message, if any

#if ARENADEBUG
protected:
	int *					m_pTotalAllocs;
	int *					m_pTotalBytes;

public:
	void					CheckFreelists ();
#else
	inline void				CheckFreelists () {}
#endif // ARENADEBUG
};
9015 
/// tTester implementation that collects document ids referenced
/// by MVA arena allocations into a caller-supplied vector
class tDocCollector : public tTester
{
	CSphVector<SphDocID_t> * m_dCollection;
public:
	explicit tDocCollector ( CSphVector<SphDocID_t> & dCollection )
		: m_dCollection ( &dCollection )
	{}
	virtual void Reset()
	{
		m_dCollection->Reset();
	}
	virtual void TestData ( int iData )
	{
		if ( !g_pMvaArena )
			return;

		// the head of each arena entry is interpreted as the owning docid
		m_dCollection->Add ( *(SphDocID_t*)(g_pMvaArena + iData) );
	}
};
9035 
9036 //////////////////////////////////////////////////////////////////////////
// construct an uninitialized arena (no pages); call ReInit() to allocate storage
CSphArena::CSphArena ()
	: m_iPages ( 0 )
{
	m_tThdMutex.Init();
}
9042 
9043 
CSphArena::~CSphArena ()
{
	// notify callers that arena no longer exists
	g_pMvaArena = NULL;
	m_tThdMutex.Done();
}
9050 
ReInit(int uMaxBytes)9051 DWORD * CSphArena::ReInit ( int uMaxBytes )
9052 {
9053 	if ( m_iPages!=0 )
9054 	{
9055 		m_pArena.Reset();
9056 		m_iPages = 0;
9057 	}
9058 	return Init ( uMaxBytes );
9059 }
9060 
DWORD * CSphArena::Init ( int uMaxBytes )
{
	// Allocate the shared-memory region and lay out the arena inside it:
	// internal bookkeeping structures first, then the data pages.
	// Returns the base data pointer, or NULL on failure (m_sError is set).
	m_iPages = ( uMaxBytes+PAGE_SIZE-1 ) / PAGE_SIZE;

	int iData = m_iPages*PAGE_SIZE; // data size, bytes
	int iMyTaglist = sizeof(int) + MAX_TAGS*sizeof(TagDesc_t); // int length, TagDesc_t[] tags; NOLINT
	int iMy = m_iPages*sizeof(PageDesc_t) + NUM_SIZES*sizeof(int) + iMyTaglist; // my internal structures size, bytes; NOLINT
#if ARENADEBUG
	iMy += 2*sizeof(int); // debugging counters; NOLINT
#endif

	assert ( iData%sizeof(DWORD)==0 );
	assert ( iMy%sizeof(DWORD)==0 );

	CSphString sError, sWarning;
	if ( m_tProcMutex.GetError() || !m_pArena.Alloc ( (iData+iMy)/sizeof(DWORD), sError, sWarning ) )
	{
		m_iPages = 0;
		if ( m_tProcMutex.GetError() )
			m_sError = m_tProcMutex.GetError();
		else
			m_sError.SetSprintf ( "alloc, error='%s', warning='%s'", sError.cstr(), sWarning.cstr() );
		return NULL;
	}

	// setup internal pointers
	DWORD * pCur = m_pArena.GetWritePtr();

	m_pPages = (PageDesc_t*) pCur;
	pCur += sizeof(PageDesc_t)*m_iPages/sizeof(DWORD);

	m_pFreelistHeads = (int*) pCur;
	pCur += NUM_SIZES; // one for each size, and one extra for zero

	m_pTagCount = (int*) pCur++;
	m_pTags = (TagDesc_t*) pCur;
	pCur += sizeof(TagDesc_t)*MAX_TAGS/sizeof(DWORD);

#if ARENADEBUG
	m_pTotalAllocs = (int*) pCur++;
	m_pTotalBytes = (int*) pCur++;
	*m_pTotalAllocs = 0;
	*m_pTotalBytes = 0;
#endif

	// data pages start right after the bookkeeping area
	m_pBasePtr = m_pArena.GetWritePtr() + iMy/sizeof(DWORD);
	assert ( m_pBasePtr==pCur );

	// setup initial state: all pages empty, chained into the empty-pages list
	for ( int i=0; i<m_iPages; i++ )
	{
		m_pPages[i].m_iSizeBits = 0; // fully empty
		m_pPages[i].m_iPrev = ( i>0 ) ? i-1 : -1;
		m_pPages[i].m_iNext = ( i<m_iPages-1 ) ? i+1 : -1;
	}

	m_pFreelistHeads[0] = 0;
	for ( int i=1; i<NUM_SIZES; i++ )
		m_pFreelistHeads[i] = -1;

	*m_pTagCount = 0;

	return m_pBasePtr;
}
9125 
9126 
int CSphArena::RawAlloc ( int iBytes )
{
	// Allocate a raw chunk of at least iBytes bytes from the arena.
	// Returns the dword index of the usable area (two bookkeeping dwords,
	// tag and backtrack pointer, sit immediately before it), or -1 on failure.
	CheckFreelists ();

	if ( iBytes<=0 || iBytes>( ( 1 << MAX_BITS ) - (int)sizeof(int) ) )
		return -1;

	int iSizeBits = sphLog2 ( iBytes+2*sizeof(int)-1 ); // always reserve sizeof(int) for the tag and AllocsLogEntry_t backtrack; NOLINT
	iSizeBits = Max ( iSizeBits, MIN_BITS );
	assert ( iSizeBits>=MIN_BITS && iSizeBits<=MAX_BITS );

	int iSizeSlot = iSizeBits-MIN_BITS+1;
	assert ( iSizeSlot>=1 && iSizeSlot<NUM_SIZES );

	// get semi-free page for this size
	PageDesc_t * pPage = NULL;
	if ( m_pFreelistHeads[iSizeSlot]>=0 )
	{
		// got something in the free-list
		pPage = m_pPages + m_pFreelistHeads[iSizeSlot];

	} else
	{
		// nothing in free-list, alloc next empty one
		if ( m_pFreelistHeads[0]<0 )
			return -1; // out of memory

		// update the page
		pPage = m_pPages + m_pFreelistHeads[0];
		assert ( pPage->m_iPrev==-1 );

		// move the page from the empty list to this size's free-list
		m_pFreelistHeads[iSizeSlot] = m_pFreelistHeads[0];
		m_pFreelistHeads[0] = pPage->m_iNext;
		if ( pPage->m_iNext>=0 )
			m_pPages[pPage->m_iNext].m_iPrev = -1;

		pPage->m_iSizeBits = iSizeBits;
		pPage->m_iUsed = 0;
		pPage->m_iNext = -1;

		CheckFreelists ();

		// setup bitmap
		int iUsedBits = ( 1<<(MAX_BITS-iSizeBits) ); // max-used-bits = page-size/alloc-size = ( 1<<page-bitsize )/( 1<<alloc-bitsize )
		assert ( iUsedBits>0 && iUsedBits<=(PAGE_BITMAP<<5) );

		// mark the slots beyond iUsedBits as permanently taken
		for ( int i=0; i<PAGE_BITMAP; i++ )
			pPage->m_uBitmap[i] = ( ( i<<5 )>=iUsedBits ) ? 0xffffffffUL : 0;

		if ( iUsedBits<32 )
			pPage->m_uBitmap[0] = ( 0xffffffffUL<<iUsedBits );
	}

	// get free alloc slot and use it
	assert ( pPage );
	assert ( pPage->m_iSizeBits==iSizeBits );

	for ( int i=0; i<PAGE_BITMAP; i++ ) // FIXME! optimize, can scan less
	{
		if ( pPage->m_uBitmap[i]==0xffffffffUL )
			continue;

		int iFree = FindBit ( pPage->m_uBitmap[i] );
		pPage->m_uBitmap[i] |= ( 1<<iFree );

		pPage->m_iUsed++;
		if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits ) )
		{
			// this page is full now, unchain from the free-list
			assert ( m_pFreelistHeads[iSizeSlot]==pPage-m_pPages );
			m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
			if ( pPage->m_iNext>=0 )
			{
				assert ( m_pPages[pPage->m_iNext].m_iPrev==pPage-m_pPages );
				m_pPages[pPage->m_iNext].m_iPrev = -1;
			}
			pPage->m_iNext = -1;
		}

#if ARENADEBUG
		(*m_pTotalAllocs)++;
		(*m_pTotalBytes) += ( 1<<iSizeBits );
#endif

		CheckFreelists ();

		int iOffset = ( pPage-m_pPages )*PAGE_SIZE + ( i*32+iFree )*( 1<<iSizeBits ); // raw internal byte offset (FIXME! optimize with shifts?)
		int iIndex = 2 + ( iOffset/sizeof(DWORD) ); // dword index with tag and backtrack fixup

		m_pBasePtr[iIndex-1] = DWORD(-1); // untagged by default
		m_pBasePtr[iIndex-2] = DWORD(-1); // backtrack nothere
		return iIndex;
	}

	assert ( 0 && "internal error, no free slots in free page" );
	return -1;
}
9224 
9225 
RawFree(int iIndex)9226 void CSphArena::RawFree ( int iIndex )
9227 {
9228 	CheckFreelists ();
9229 
9230 	int iOffset = (iIndex-2)*sizeof(DWORD); // remove tag fixup, and go to raw internal byte offset
9231 	int iPage = iOffset / PAGE_SIZE;
9232 
9233 	if ( iPage<0 || iPage>m_iPages )
9234 	{
9235 		assert ( 0 && "internal error, freed index out of arena" );
9236 		return;
9237 	}
9238 
9239 	PageDesc_t * pPage = m_pPages + iPage;
9240 	int iBit = ( iOffset % PAGE_SIZE ) >> pPage->m_iSizeBits;
9241 	assert ( ( iOffset % PAGE_SIZE )==( iBit << pPage->m_iSizeBits ) && "internal error, freed offset is unaligned" );
9242 
9243 	if (!( pPage->m_uBitmap[iBit>>5] & ( 1UL<<(iBit & 31) ) ))
9244 	{
9245 		assert ( 0 && "internal error, freed index already freed" );
9246 		return;
9247 	}
9248 
9249 	pPage->m_uBitmap[iBit>>5] &= ~( 1UL << ( iBit & 31 ) );
9250 	pPage->m_iUsed--;
9251 
9252 #if ARENADEBUG
9253 	(*m_pTotalAllocs)--;
9254 	(*m_pTotalBytes) -= ( 1<<pPage->m_iSizeBits );
9255 #endif
9256 
9257 	CheckFreelists ();
9258 
9259 	int iSizeSlot = pPage->m_iSizeBits-MIN_BITS+1;
9260 
9261 	if ( pPage->m_iUsed==( PAGE_SIZE >> pPage->m_iSizeBits )-1 )
9262 	{
9263 		// this page was full, but it's semi-free now
9264 		// chain to free-list
9265 		assert ( pPage->m_iPrev==-1 ); // full pages must not be in any list
9266 		assert ( pPage->m_iNext==-1 );
9267 
9268 		pPage->m_iNext = m_pFreelistHeads[iSizeSlot];
9269 		if ( pPage->m_iNext>=0 )
9270 		{
9271 			assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
9272 			assert ( m_pPages[pPage->m_iNext].m_iSizeBits==pPage->m_iSizeBits );
9273 			m_pPages[pPage->m_iNext].m_iPrev = iPage;
9274 		}
9275 		m_pFreelistHeads[iSizeSlot] = iPage;
9276 	}
9277 
9278 	if ( pPage->m_iUsed==0 )
9279 	{
9280 		// this page is empty now
9281 		// unchain from free-list
9282 		if ( pPage->m_iPrev>=0 )
9283 		{
9284 			// non-head page
9285 			assert ( m_pPages[pPage->m_iPrev].m_iNext==iPage );
9286 			m_pPages[pPage->m_iPrev].m_iNext = pPage->m_iNext;
9287 
9288 			if ( pPage->m_iNext>=0 )
9289 			{
9290 				assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
9291 				m_pPages[pPage->m_iNext].m_iPrev = pPage->m_iPrev;
9292 			}
9293 
9294 		} else
9295 		{
9296 			// head page
9297 			assert ( m_pFreelistHeads[iSizeSlot]==iPage );
9298 			assert ( pPage->m_iPrev==-1 );
9299 
9300 			if ( pPage->m_iNext>=0 )
9301 			{
9302 				assert ( m_pPages[pPage->m_iNext].m_iPrev==iPage );
9303 				m_pPages[pPage->m_iNext].m_iPrev = -1;
9304 			}
9305 			m_pFreelistHeads[iSizeSlot] = pPage->m_iNext;
9306 		}
9307 
9308 		pPage->m_iSizeBits = 0;
9309 		pPage->m_iPrev = -1;
9310 		pPage->m_iNext = m_pFreelistHeads[0];
9311 		if ( pPage->m_iNext>=0 )
9312 		{
9313 			assert ( m_pPages[pPage->m_iNext].m_iPrev==-1 );
9314 			assert ( m_pPages[pPage->m_iNext].m_iSizeBits==0 );
9315 			m_pPages[pPage->m_iNext].m_iPrev = iPage;
9316 		}
9317 		m_pFreelistHeads[0] = iPage;
9318 	}
9319 
9320 	CheckFreelists ();
9321 }
9322 
9323 
int CSphArena::TaggedAlloc ( int iTag, int iBytes )
{
	// Allocate iBytes under the given tag, recording the allocation in the
	// tag's log so TaggedFreeTag()/ExamineTag() can walk it later.
	// Returns the allocation's dword index, or -1 on failure.
	if ( !m_iPages )
		return -1; // uninitialized

	assert ( iTag>=0 );
	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );

	// find that tag first
	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
	if ( !pTag )
	{
		if ( *m_pTagCount==MAX_TAGS )
			return -1; // out of tags

		// new tag: allocate its first (head) log entry
		int iLogHead = RawAlloc ( sizeof(AllocsLogEntry_t) );
		if ( iLogHead<0 )
			return -1; // out of memory

		assert ( iLogHead>=2 );
		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLogHead );
		pLog->m_iUsed = 0;
		pLog->m_iNext = -1;

		// add new tag
		pTag = m_pTags + (*m_pTagCount)++;
		pTag->m_iTag = iTag;
		pTag->m_iAllocs = 0;
		pTag->m_iLogHead = iLogHead;

		// re-sort
		// OPTIMIZE! full-blown sort is overkill here
		sphSort ( m_pTags, *m_pTagCount, sphMemberLess ( &TagDesc_t::m_iTag ) );

		// we must be able to find it now
		pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
		assert ( pTag && "internal error, fresh tag not found in TaggedAlloc()" );

		if ( !pTag )
			return -1; // internal error
	}

	// grow the log if needed
	int iLogEntry = pTag->m_iLogHead;
	AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + pTag->m_iLogHead );
	if ( pLog->m_iUsed==MAX_LOGENTRIES )
	{
		// head entry is full; prepend a fresh one to the chain
		int iNewEntry = RawAlloc ( sizeof(AllocsLogEntry_t) );
		if ( iNewEntry<0 )
			return -1; // out of memory

		assert ( iNewEntry>=2 );
		iLogEntry = iNewEntry;
		AllocsLogEntry_t * pNew = (AllocsLogEntry_t*) ( m_pBasePtr + iNewEntry );
		pNew->m_iUsed = 0;
		pNew->m_iNext = pTag->m_iLogHead;
		pTag->m_iLogHead = iNewEntry;
		pLog = pNew;
	}

	// do the alloc itself
	int iIndex = RawAlloc ( iBytes );
	if ( iIndex<0 )
		return -1; // out of memory

	assert ( iIndex>=2 );
	// tag it
	m_pBasePtr[iIndex-1] = iTag;
	// set data->AllocsLogEntry_t backtrack
	m_pBasePtr[iIndex-2] = iLogEntry;

	// log it
	assert ( pLog->m_iUsed<MAX_LOGENTRIES );
	pLog->m_dEntries [ pLog->m_iUsed++ ] = iIndex;
	pTag->m_iAllocs++;

	// and we're done
	return iIndex;
}
9404 
9405 
void CSphArena::TaggedFreeIndex ( int iTag, int iIndex )
{
	// Free a single tagged allocation, unregister it from its tag's log,
	// and drop the tag descriptor once its last allocation is gone.
	if ( !m_iPages )
		return; // uninitialized

	assert ( iTag>=0 );
	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );

	// find that tag
	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
	assert ( pTag && "internal error, unknown tag in TaggedFreeIndex()" );
	assert ( m_pBasePtr[iIndex-1]==DWORD(iTag) && "internal error, tag mismatch in TaggedFreeIndex()" );

	// defence against internal errors
	if ( !pTag )
		return;

	// untag it
	m_pBasePtr[iIndex-1] = DWORD(-1);

	// free it
	RawFree ( iIndex );

	// update AllocsLogEntry_t
	// NOTE(review): the backtrack dword is read after RawFree(); this relies
	// on RawFree() only flipping bitmap bits, not clobbering the data
	int iLogEntry = m_pBasePtr[iIndex-2];
	assert ( iLogEntry>=2 );
	m_pBasePtr[iIndex-2] = DWORD(-1);
	AllocsLogEntry_t * pLogEntry = (AllocsLogEntry_t*) ( m_pBasePtr + iLogEntry );
	for ( int i = 0; i<MAX_LOGENTRIES; i++ )
	{
		if ( pLogEntry->m_dEntries[i]!=iIndex )
			continue;

		pLogEntry->m_dEntries[i] = pLogEntry->m_dEntries[pLogEntry->m_iUsed-1]; // RemoveFast
		pLogEntry->m_iUsed--;
		break;
	}
	assert ( pLogEntry->m_iUsed>=0 );

	// remove from tag entries list
	if ( pLogEntry->m_iUsed==0 )
	{
		// the log entry is empty; unchain it from the tag's log list and free it
		if ( pTag->m_iLogHead==iLogEntry )
		{
			pTag->m_iLogHead = pLogEntry->m_iNext;
		} else
		{
			// walk the chain to find the predecessor of iLogEntry
			int iLog = pTag->m_iLogHead;
			while ( iLog>=0 )
			{
				AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
				if ( iLogEntry!=pLog->m_iNext )
				{
					iLog = pLog->m_iNext;
					continue;
				} else
				{
					pLog->m_iNext = pLogEntry->m_iNext;
					break;
				}
			}
		}
		RawFree ( iLogEntry );
	}

	// update the tag descriptor
	pTag->m_iAllocs--;
	assert ( pTag->m_iAllocs>=0 );

	// remove the descriptor if its empty now
	if ( pTag->m_iAllocs==0 )
		RemoveTag ( pTag );
}
9480 
9481 
void CSphArena::TaggedFreeTag ( int iTag )
{
	// Free every allocation registered under the given tag, then remove
	// the tag descriptor (and its log chain) itself.
	if ( !m_iPages )
		return; // uninitialized

	assert ( iTag>=0 );
	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );

	// find that tag
	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
	if ( !pTag )
		return;

	// walk the log and free it
	int iLog = pTag->m_iLogHead;
	while ( iLog>=0 )
	{
		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
		iLog = pLog->m_iNext;

		// free each alloc if tag still matches
		for ( int i=0; i<pLog->m_iUsed; i++ )
		{
			int iIndex = pLog->m_dEntries[i];
			if ( m_pBasePtr[iIndex-1]==DWORD(iTag) )
			{
				m_pBasePtr[iIndex-1] = DWORD(-1); // avoid double free
				RawFree ( iIndex );
				pTag->m_iAllocs--;
			}
		}
	}

	// check for mismatches
	assert ( pTag->m_iAllocs==0 );

	// remove the descriptor
	RemoveTag ( pTag );
}
9522 
void CSphArena::ExamineTag ( tTester* pTest, int iTag )
{
	// Invoke pTest->TestData() for every allocation registered under the
	// given tag; pTest->Reset() is called first regardless of outcome.
	if ( !pTest )
		return;

	pTest->Reset();

	if ( !m_iPages )
		return; // uninitialized

	assert ( iTag>=0 );
	CSphScopedLock<CSphMutex> tThdLock ( m_tThdMutex );
	CSphScopedLock<CSphProcessSharedMutex> tProcLock ( m_tProcMutex );

	// find that tag
	TagDesc_t * pTag = sphBinarySearch ( m_pTags, m_pTags+(*m_pTagCount)-1, bind ( &TagDesc_t::m_iTag ), iTag );
	if ( !pTag )
		return;

	// walk the log and tick it's chunks
	int iLog = pTag->m_iLogHead;
	while ( iLog>=0 )
	{
		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
		iLog = pLog->m_iNext;

		// tick each alloc
		for ( int i=0; i<pLog->m_iUsed; i++ )
			pTest->TestData ( pLog->m_dEntries[i] );
	}
}
9554 
void CSphArena::RemoveTag ( TagDesc_t * pTag )
{
	// Remove a tag descriptor that has no live allocations left:
	// free its log chain, then compact the sorted descriptor array.
	assert ( pTag );
	assert ( pTag->m_iAllocs==0 );

	// dealloc log chain
	int iLog = pTag->m_iLogHead;
	while ( iLog>=0 )
	{
		AllocsLogEntry_t * pLog = (AllocsLogEntry_t*) ( m_pBasePtr + iLog );
		int iNext = pLog->m_iNext;

		RawFree ( iLog );
		iLog = iNext;
	}

	// remove tag from the list
	int iTail = m_pTags + (*m_pTagCount) - pTag - 1;
	memmove ( pTag, pTag+1, iTail*sizeof(TagDesc_t) );
	(*m_pTagCount)--;
}
9576 
9577 
#if ARENADEBUG
// debug-only invariant check: every free-list head must point at a page
// whose size class matches the list it is chained into
void CSphArena::CheckFreelists ()
{
	assert ( m_pFreelistHeads[0]==-1 || m_pPages[m_pFreelistHeads[0]].m_iSizeBits==0 );
	for ( int iSizeSlot=1; iSizeSlot<NUM_SIZES; iSizeSlot++ )
		assert ( m_pFreelistHeads[iSizeSlot]==-1 || m_pPages[m_pFreelistHeads[iSizeSlot]].m_iSizeBits-MIN_BITS+1==iSizeSlot );
}
#endif // ARENADEBUG
9586 
9587 //////////////////////////////////////////////////////////////////////////
9588 
9589 static CSphArena g_tMvaArena; // global mega-arena
9590 
sphArenaInit(int iMaxBytes)9591 const char * sphArenaInit ( int iMaxBytes )
9592 {
9593 	if ( !g_pMvaArena )
9594 		g_pMvaArena = g_tMvaArena.ReInit ( iMaxBytes );
9595 
9596 	const char * sError = g_tMvaArena.GetError();
9597 	return sError;
9598 }
9599 
9600 
9601 //////////////////////////////////////////////////////////////////////////
9602 
// multi-query arguments: reference the kill-list, store the (positive)
// index weight, and default everything else to "off"
CSphMultiQueryArgs::CSphMultiQueryArgs ( const KillListVector & dKillList, int iIndexWeight )
	: m_dKillList ( dKillList )
	, m_iIndexWeight ( iIndexWeight )
	, m_iTag ( 0 )
	, m_uPackedFactorFlags ( SPH_FACTOR_DISABLE )
	, m_bLocalDF ( false )
	, m_pLocalDocs ( NULL )
	, m_iTotalDocs ( 0 )
{
	assert ( iIndexWeight>0 );
}
9614 
9615 
9616 /////////////////////////////////////////////////////////////////////////////
9617 // INDEX
9618 /////////////////////////////////////////////////////////////////////////////
9619 
// base index constructor: store the name and filename, default all
// tunables; tokenizer/dict/filter pointers start NULL and are installed
// (and owned) via the respective setters
CSphIndex::CSphIndex ( const char * sIndexName, const char * sFilename )
	: m_iTID ( 0 )
	, m_bExpandKeywords ( false )
	, m_iExpansionLimit ( 0 )
	, m_tSchema ( sFilename )
	, m_bInplaceSettings ( false )
	, m_iHitGap ( 0 )
	, m_iDocinfoGap ( 0 )
	, m_fRelocFactor ( 0.0f )
	, m_fWriteFactor ( 0.0f )
	, m_bKeepFilesOpen ( false )
	, m_bBinlog ( true )
	, m_bStripperInited ( true )
	, m_bId32to64 ( false )
	, m_pFieldFilter ( NULL )
	, m_pTokenizer ( NULL )
	, m_pQueryTokenizer ( NULL )
	, m_pDict ( NULL )
	, m_iMaxCachedDocs ( 0 )
	, m_iMaxCachedHits ( 0 )
	, m_sIndexName ( sIndexName )
	, m_sFilename ( sFilename )
{
}
9644 
9645 
CSphIndex::~CSphIndex ()
{
	// the index owns its filter, tokenizers, and dictionary
	// (unless they were released via the Leak*() accessors)
	SafeDelete ( m_pFieldFilter );
	SafeDelete ( m_pQueryTokenizer );
	SafeDelete ( m_pTokenizer );
	SafeDelete ( m_pDict );
}
9653 
9654 
// store in-place index update tuning (gaps and relocation/write factors)
// and flag that in-place settings are in effect
void CSphIndex::SetInplaceSettings ( int iHitGap, int iDocinfoGap, float fRelocFactor, float fWriteFactor )
{
	m_iHitGap = iHitGap;
	m_iDocinfoGap = iDocinfoGap;
	m_fRelocFactor = fRelocFactor;
	m_fWriteFactor = fWriteFactor;
	m_bInplaceSettings = true;
}
9663 
9664 
SetFieldFilter(ISphFieldFilter * pFieldFilter)9665 void CSphIndex::SetFieldFilter ( ISphFieldFilter * pFieldFilter )
9666 {
9667 	if ( m_pFieldFilter!=pFieldFilter )
9668 		SafeDelete ( m_pFieldFilter );
9669 	m_pFieldFilter = pFieldFilter;
9670 }
9671 
9672 
SetTokenizer(ISphTokenizer * pTokenizer)9673 void CSphIndex::SetTokenizer ( ISphTokenizer * pTokenizer )
9674 {
9675 	if ( m_pTokenizer!=pTokenizer )
9676 		SafeDelete ( m_pTokenizer );
9677 	m_pTokenizer = pTokenizer;
9678 }
9679 
9680 
void CSphIndex::SetupQueryTokenizer()
{
	// create and setup a master copy of query time tokenizer
	// that we can then use to create lightweight clones
	SafeDelete ( m_pQueryTokenizer );
	m_pQueryTokenizer = m_pTokenizer->Clone ( SPH_CLONE_QUERY );
	if ( IsStarDict() )
	{
		// wildcard characters must survive tokenization in star-dict mode
		m_pQueryTokenizer->AddPlainChar ( '*' );
		m_pQueryTokenizer->AddPlainChar ( '?' );
		m_pQueryTokenizer->AddPlainChar ( '%' );
	}
	if ( m_tSettings.m_bIndexExactWords )
	{
		// '=' doubles as both a plain char and a query special for exact-form matching
		m_pQueryTokenizer->AddPlainChar ( '=' );
		m_pQueryTokenizer->AddSpecials ( "()|-!@~\"/^$<=" );
	} else
	{
		m_pQueryTokenizer->AddSpecials ( "()|-!@~\"/^$<" );
	}
}
9702 
9703 
LeakTokenizer()9704 ISphTokenizer *	CSphIndex::LeakTokenizer ()
9705 {
9706 	ISphTokenizer * pTokenizer = m_pTokenizer;
9707 	m_pTokenizer = NULL;
9708 	return pTokenizer;
9709 }
9710 
9711 
SetDictionary(CSphDict * pDict)9712 void CSphIndex::SetDictionary ( CSphDict * pDict )
9713 {
9714 	if ( m_pDict!=pDict )
9715 		SafeDelete ( m_pDict );
9716 
9717 	m_pDict = pDict;
9718 }
9719 
9720 
LeakDictionary()9721 CSphDict * CSphIndex::LeakDictionary ()
9722 {
9723 	CSphDict * pDict = m_pDict;
9724 	m_pDict = NULL;
9725 	return pDict;
9726 }
9727 
9728 
Setup(const CSphIndexSettings & tSettings)9729 void CSphIndex::Setup ( const CSphIndexSettings & tSettings )
9730 {
9731 	m_bStripperInited = true;
9732 	m_tSettings = tSettings;
9733 }
9734 
9735 
SetCacheSize(int iMaxCachedDocs,int iMaxCachedHits)9736 void CSphIndex::SetCacheSize ( int iMaxCachedDocs, int iMaxCachedHits )
9737 {
9738 	m_iMaxCachedDocs = iMaxCachedDocs;
9739 	m_iMaxCachedHits = iMaxCachedHits;
9740 }
9741 
9742 
/// look up a word's IDF in the shared global-IDF table for this index's
/// configured IDF file; returns 0.0f when no global IDF is loaded
float CSphIndex::GetGlobalIDF ( const CSphString & sWord, int64_t iDocsLocal, bool bPlainIDF ) const
{
	// the global IDF hash is shared across indexes, so serialize access to it
	g_tGlobalIDFLock.Lock ();
	CSphGlobalIDF ** ppGlobalIDF = g_hGlobalIDFs ( m_sGlobalIDFPath );
	float fIDF = ppGlobalIDF && *ppGlobalIDF ? ( *ppGlobalIDF )->GetIDF ( sWord, iDocsLocal, bPlainIDF ) : 0.0f;
	g_tGlobalIDFLock.Unlock ();
	return fIDF;
}
9751 
9752 
BuildDocList(SphAttr_t ** ppDocList,int64_t * pCount,CSphString *) const9753 bool CSphIndex::BuildDocList ( SphAttr_t ** ppDocList, int64_t * pCount, CSphString * ) const
9754 {
9755 	assert ( *ppDocList && pCount );
9756 	*ppDocList = NULL;
9757 	*pCount = 0;
9758 	return true;
9759 }
9760 
9761 /////////////////////////////////////////////////////////////////////////////
9762 
/// factory: create a plain (disk-based) index instance; the caller owns
/// and must eventually delete the returned object
CSphIndex * sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename )
{
	return new CSphIndex_VLN ( szIndexName, sFilename );
}
9767 
9768 
/// plain disk index ctor: zero-init all runtime state; actual data is
/// attached later by the prealloc/preread machinery
CSphIndex_VLN::CSphIndex_VLN ( const char* sIndexName, const char * sFilename )
	: CSphIndex ( sIndexName, sFilename )
	, m_iLockFD ( -1 )
	, m_iTotalDups ( 0 )
	, m_dMinRow ( 0 )
	, m_dFieldLens ( SPH_MAX_FIELDS )
{
	m_sFilename = sFilename;

	m_iDocinfo = 0;
	m_iDocinfoIndex = 0;
	m_pDocinfoIndex = NULL;

	m_bPreallocated = false;
	m_uVersion = INDEX_FORMAT_VERSION;

	m_uKillListSize = 0;
	m_iMinMaxIndex = 0;

	// -1 means "no tagged MVA arena allocations belong to this index yet"
	m_iIndexTag = -1;
	m_bIsEmpty = true;

	m_pPreread = NULL;
	m_pAttrsStatus = NULL;

	m_uMinDocid = 0;

	m_bOndiskAllAttr = false;
	m_bOndiskPoolAttr = false;
	m_bDebugCheck = false;
	m_bArenaProhibit = false;

	// per-field token length counters start at zero
	ARRAY_FOREACH ( i, m_dFieldLens )
		m_dFieldLens[i] = 0;
}
9804 
9805 
/// dtor: release this index's tagged MVA arena allocations and drop the lock
CSphIndex_VLN::~CSphIndex_VLN ()
{
	// free all arena entries tagged with this index's tag; on non-Windows
	// builds only the head process is allowed to touch the shared arena
#if USE_WINDOWS
	if ( m_iIndexTag>=0 && g_pMvaArena )
#else
	if ( m_iIndexTag>=0 && g_bHeadProcess && g_pMvaArena )
#endif
		g_tMvaArena.TaggedFreeTag ( m_iIndexTag );

	// likewise, only the head process releases the index lock on non-Windows
#if !USE_WINDOWS
	if ( g_bHeadProcess )
#endif
	Unlock();
}
9820 
9821 
9822 /////////////////////////////////////////////////////////////////////////////
9823 
9824 
/// apply an attribute UPDATE (plain ints/floats, MVAs, and inline JSON fields)
/// directly to the in-memory docinfo pool, keeping per-block min/max ranges
/// consistent along the way
/// @param tUpd   parsed update: attribute names/types plus a flat DWORD value pool
/// @param iIndex single row (within tUpd) to apply, or negative to apply all rows
/// @param sError receives the error message on failure
/// @param sWarning receives non-fatal warnings (e.g. skipped JSON fields)
/// @return number of documents actually updated, or -1 on error
int CSphIndex_VLN::UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError, CSphString & sWarning )
{
	// check if we can
	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
	{
		sError.SetSprintf ( "docinfo=extern required for updates" );
		return -1;
	}
	if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
	{
		sError.SetSprintf ( "can not update ondisk_attrs enabled index" );
		return -1;
	}

	assert ( tUpd.m_dDocids.GetLength()==tUpd.m_dRows.GetLength() );
	assert ( tUpd.m_dDocids.GetLength()==tUpd.m_dRowOffset.GetLength() );
	DWORD uRows = tUpd.m_dDocids.GetLength();

	// check if we have to
	if ( !m_iDocinfo || !uRows )
		return 0;

	// log the update into the binlog before touching anything, so a crash
	// mid-update can be replayed
	if ( m_bBinlog && g_pBinlog )
		g_pBinlog->BinlogUpdateAttributes ( &m_iTID, m_sIndexName.cstr(), tUpd );

	// remap update schema to index schema
	int iUpdLen = tUpd.m_dAttrs.GetLength();
	CSphVector<CSphAttrLocator> dLocators ( iUpdLen );
	CSphBitvec dFloats ( iUpdLen );
	CSphBitvec dBigints ( iUpdLen );
	CSphBitvec dDoubles ( iUpdLen );
	CSphBitvec dJsonFields ( iUpdLen );
	CSphVector < CSphRefcountedPtr<ISphExpr> > dExpr ( iUpdLen );
	memset ( dLocators.Begin(), 0, dLocators.GetSizeBytes() );

	// uDst64 marks (bit per update column) the columns that are 64-bit MVAs in the schema
	uint64_t uDst64 = 0;
	ARRAY_FOREACH ( i, tUpd.m_dAttrs )
	{
		int iIdx = m_tSchema.GetAttrIndex ( tUpd.m_dAttrs[i] );

		if ( iIdx<0 )
		{
			// not a plain attribute; try to resolve it as a json.key path
			CSphString sJsonCol, sJsonKey;
			if ( sphJsonNameSplit ( tUpd.m_dAttrs[i], &sJsonCol, &sJsonKey ) )
			{
				iIdx = m_tSchema.GetAttrIndex ( sJsonCol.cstr() );
				if ( iIdx>=0 )
					dExpr[i] = sphExprParse ( tUpd.m_dAttrs[i], m_tSchema, NULL, NULL, sError, NULL );
			}
		}

		if ( iIdx>=0 )
		{
			// forbid updates on non-int columns
			const CSphColumnInfo & tCol = m_tSchema.GetAttr(iIdx);
			if ( !( tCol.m_eAttrType==SPH_ATTR_BOOL || tCol.m_eAttrType==SPH_ATTR_INTEGER || tCol.m_eAttrType==SPH_ATTR_TIMESTAMP
				|| tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET
				|| tCol.m_eAttrType==SPH_ATTR_BIGINT || tCol.m_eAttrType==SPH_ATTR_FLOAT || tCol.m_eAttrType==SPH_ATTR_JSON ))
			{
				sError.SetSprintf ( "attribute '%s' can not be updated "
					"(must be boolean, integer, bigint, float, timestamp, MVA or JSON)",
					tUpd.m_dAttrs[i] );
				return -1;
			}

			// forbid updates on MVA columns if there's no arena
			if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && !g_pMvaArena )
			{
				sError.SetSprintf ( "MVA attribute '%s' can not be updated (MVA arena not initialized)", tCol.m_sName.cstr() );
				return -1;
			}
			if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && m_bArenaProhibit )
			{
				sError.SetSprintf ( "MVA attribute '%s' can not be updated (already so many MVA " INT64_FMT ", should be less %d)",
					tCol.m_sName.cstr(), m_tMva.GetNumEntries(), INT_MAX );
				return -1;
			}

			// MVA-ness must agree between the update and the schema
			bool bSrcMva = ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET );
			bool bDstMva = ( tUpd.m_dTypes[i]==SPH_ATTR_UINT32SET || tUpd.m_dTypes[i]==SPH_ATTR_INT64SET );
			if ( bSrcMva!=bDstMva )
			{
				sError.SetSprintf ( "attribute '%s' MVA flag mismatch", tUpd.m_dAttrs[i] );
				return -1;
			}

			// 64-bit values can not be stored into a 32-bit MVA column
			if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET && tUpd.m_dTypes[i]==SPH_ATTR_INT64SET )
			{
				sError.SetSprintf ( "attribute '%s' MVA bits (dst=%d, src=%d) mismatch", tUpd.m_dAttrs[i],
					tCol.m_eAttrType, tUpd.m_dTypes[i] );
				return -1;
			}

			if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
				uDst64 |= ( U64C(1)<<i );

			if ( tCol.m_eAttrType==SPH_ATTR_FLOAT )
				dFloats.BitSet(i);
			else if ( tCol.m_eAttrType==SPH_ATTR_JSON )
				dJsonFields.BitSet(i);
			dLocators[i] = ( tCol.m_tLocator );
		} else if ( tUpd.m_bIgnoreNonexistent )
		{
			continue;
		} else
		{
			sError.SetSprintf ( "attribute '%s' not found", tUpd.m_dAttrs[i] );
			return -1;
		}

		// this is a hack
		// Query parser tries to detect an attribute type. And this is wrong because, we should
		// take attribute type from schema. Probably we'll rewrite updates in future but
		// for now this fix just works.
		// Fixes cases like UPDATE float_attr=1 WHERE id=1;
		assert ( iIdx>=0 );
		if ( tUpd.m_dTypes[i]==SPH_ATTR_INTEGER && m_tSchema.GetAttr(iIdx).m_eAttrType==SPH_ATTR_FLOAT )
		{
			const_cast<CSphAttrUpdate &>(tUpd).m_dTypes[i] = SPH_ATTR_FLOAT;
			const_cast<CSphAttrUpdate &>(tUpd).m_dPool[i] = sphF2DW ( (float)tUpd.m_dPool[i] );
		}

		if ( tUpd.m_dTypes[i]==SPH_ATTR_BIGINT )
			dBigints.BitSet(i);
		else if ( tUpd.m_dTypes[i]==SPH_ATTR_FLOAT )
			dDoubles.BitSet(i);
	}

	// FIXME! FIXME! FIXME! overwriting just-freed blocks might hurt concurrent searchers;
	// should implement a simplistic MVCC-style delayed-free to avoid that

	// do the update
	const int iFirst = ( iIndex<0 ) ? 0 : iIndex;
	const int iLast = ( iIndex<0 ) ? uRows : iIndex+1;

	// first pass, if needed
	// strict mode: dry-run all JSON in-place updates first (bApply=false), so
	// that we fail before mutating anything if any of them is incompatible
	if ( tUpd.m_bStrict )
	{
		for ( int iUpd=iFirst; iUpd<iLast; iUpd++ )
		{
			const DWORD * pEntry = ( tUpd.m_dRows[iUpd] ? tUpd.m_dRows[iUpd] : FindDocinfo ( tUpd.m_dDocids[iUpd] ) );
			if ( !pEntry )
				continue; // no such id

			// raw row might be from RT (another RAM segment or disk chunk)
			const DWORD * pRows = m_tAttr.GetWritePtr();
			const DWORD * pRowsEnd = pRows + m_tAttr.GetNumEntries();
			bool bValidRow = ( pRows<=pEntry && pEntry<pRowsEnd );
			if ( !bValidRow )
				continue;

			pEntry = DOCINFO2ATTRS(pEntry);
			int iPos = tUpd.m_dRowOffset[iUpd];
			ARRAY_FOREACH ( iCol, tUpd.m_dAttrs )
				if ( dJsonFields.BitGet ( iCol ) )
				{
					ESphJsonType eType = dDoubles.BitGet ( iCol )
						? JSON_DOUBLE
						: ( dBigints.BitGet ( iCol ) ? JSON_INT64 : JSON_INT32 );

					SphAttr_t uValue = dDoubles.BitGet ( iCol )
						? sphD2QW ( (double)sphDW2F ( tUpd.m_dPool[iPos] ) )
						: dBigints.BitGet ( iCol ) ? MVA_UPSIZE ( &tUpd.m_dPool[iPos] ) : tUpd.m_dPool[iPos];

					if ( !sphJsonInplaceUpdate ( eType, uValue, dExpr[iCol].Ptr(), m_tString.GetWritePtr(), pEntry, false ) )
					{
						sError.SetSprintf ( "attribute '%s' can not be updated (not found or incompatible types) ", tUpd.m_dAttrs[iCol] );
						return -1;
					}

					// bigint values occupy two pool DWORDs
					iPos += dBigints.BitGet ( iCol ) ? 2 : 1;
				}
		}
	}

	// row update must leave it in cosistent state; so let's preallocate all the needed MVA
	// storage upfront to avoid suddenly having to rollback if allocation fails later
	int iNumMVA = 0;
	ARRAY_FOREACH ( i, tUpd.m_dAttrs )
		if ( tUpd.m_dTypes[i]==SPH_ATTR_UINT32SET || tUpd.m_dTypes[i]==SPH_ATTR_INT64SET )
			iNumMVA++;

	// OPTIMIZE! execute the code below conditionally
	CSphVector<DWORD*> dRowPtrs;
	CSphVector<int> dMvaPtrs;

	dRowPtrs.Resize ( uRows );
	dMvaPtrs.Resize ( uRows*iNumMVA );
	dMvaPtrs.Fill ( -1 );

	// preallocate
	bool bFailed = false;
	for ( int iUpd=iFirst; iUpd<iLast && !bFailed; iUpd++ )
	{
		dRowPtrs[iUpd] = NULL;
		DWORD * pEntry = const_cast < DWORD * > ( tUpd.m_dRows[iUpd] ? tUpd.m_dRows[iUpd] : FindDocinfo ( tUpd.m_dDocids[iUpd] ) );
		if ( !pEntry )
			continue; // no such id

		// raw row might be from RT (another RAM segment or disk chunk) or another index from same update query
		const DWORD * pRows = m_tAttr.GetWritePtr();
		const DWORD * pRowsEnd = pRows + m_tAttr.GetNumEntries();
		bool bValidRow = ( pRows<=pEntry && pEntry<pRowsEnd );
		if ( !bValidRow )
			continue;

		dRowPtrs[iUpd] = pEntry;

		int iPoolPos = tUpd.m_dRowOffset[iUpd];
		int iMvaPtr = iUpd*iNumMVA;
		ARRAY_FOREACH_COND ( iCol, tUpd.m_dAttrs, !bFailed )
		{
			bool bSrcMva32 = ( tUpd.m_dTypes[iCol]==SPH_ATTR_UINT32SET );
			bool bSrcMva64 = ( tUpd.m_dTypes[iCol]==SPH_ATTR_INT64SET );
			if (!( bSrcMva32 || bSrcMva64 )) // FIXME! optimize using a prebuilt dword mask?
			{
				// non-MVA value; just skip it in the pool
				iPoolPos++;
				if ( dBigints.BitGet ( iCol ) )
					iPoolPos++;
				continue;
			}

			// get the requested new count
			int iNewCount = (int)tUpd.m_dPool[iPoolPos++];
			iPoolPos += iNewCount;

			// try to alloc
			int iAlloc = -1;
			if ( iNewCount )
			{
				bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
				assert ( (iNewCount%2)==0 );
				int iLen = ( bDst64 ? iNewCount : iNewCount/2 );
				// allocation layout: owner docid, then value count, then values
				iAlloc = g_tMvaArena.TaggedAlloc ( m_iIndexTag, (1+iLen)*sizeof(DWORD)+sizeof(SphDocID_t) );
				if ( iAlloc<0 )
					bFailed = true;
			}

			// whatever the outcome, move the pointer
			dMvaPtrs[iMvaPtr++] = iAlloc;
		}
	}

	// if there were any allocation failures, rollback everything
	if ( bFailed )
	{
		ARRAY_FOREACH ( i, dMvaPtrs )
			if ( dMvaPtrs[i]>=0 )
				g_tMvaArena.TaggedFreeIndex ( m_iIndexTag, dMvaPtrs[i] );

		sError.SetSprintf ( "out of pool memory on MVA update" );
		return -1;
	}

	// preallocation went OK; do the actual update
	int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	int iUpdated = 0;
	DWORD uUpdateMask = 0;
	int iJsonWarnings = 0;

	for ( int iUpd=iFirst; iUpd<iLast; iUpd++ )
	{
		bool bUpdated = false;

		DWORD * pEntry = dRowPtrs[iUpd];
		if ( !pEntry )
			continue; // no such id

		// locate the min/max block this row belongs to, plus the index-wide range row
		int64_t iBlock = int64_t ( pEntry-m_tAttr.GetWritePtr() ) / ( iRowStride*DOCINFO_INDEX_FREQ );
		DWORD * pBlockRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[iBlock*iRowStride*2] );
		DWORD * pIndexRanges = const_cast < DWORD * > ( &m_pDocinfoIndex[m_iDocinfoIndex*iRowStride*2] );
		assert ( iBlock>=0 && iBlock<m_iDocinfoIndex );

		pEntry = DOCINFO2ATTRS(pEntry);

		int iPos = tUpd.m_dRowOffset[iUpd];
		int iMvaPtr = iUpd*iNumMVA;
		ARRAY_FOREACH ( iCol, tUpd.m_dAttrs )
		{
			bool bSrcMva32 = ( tUpd.m_dTypes[iCol]==SPH_ATTR_UINT32SET );
			bool bSrcMva64 = ( tUpd.m_dTypes[iCol]==SPH_ATTR_INT64SET );
			bool bSrcJson = dJsonFields.BitGet ( iCol );
			if (!( bSrcMva32 || bSrcMva64 || bSrcJson )) // FIXME! optimize using a prebuilt dword mask?
			{
				// plain update
				SphAttr_t uValue = dBigints.BitGet ( iCol ) ? MVA_UPSIZE ( &tUpd.m_dPool[iPos] ) : tUpd.m_dPool[iPos];
				sphSetRowAttr ( pEntry, dLocators[iCol], uValue );

				// update block and index ranges
				for ( int i=0; i<2; i++ )
				{
					DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
					SphAttr_t uMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
					SphAttr_t uMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ) , dLocators[iCol] );
					if ( dFloats.BitGet ( iCol ) ) // update float's indexes assumes float comparision
					{
						float fValue = sphDW2F ( (DWORD) uValue );
						float fMin = sphDW2F ( (DWORD) uMin );
						float fMax = sphDW2F ( (DWORD) uMax );
						if ( fValue<fMin )
							sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], sphF2DW ( fValue ) );
						if ( fValue>fMax )
							sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], sphF2DW ( fValue ) );
					} else // update usual integers
					{
						if ( uValue<uMin )
							sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], uValue );
						if ( uValue>uMax )
							sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], uValue );
					}
				}

				bUpdated = true;
				uUpdateMask |= ATTRS_UPDATED;

				// next
				iPos += dBigints.BitGet ( iCol ) ? 2 : 1;
				continue;
			}

			if ( bSrcJson )
			{
				// in-place JSON field update (this time applying for real)
				ESphJsonType eType = dDoubles.BitGet ( iCol )
					? JSON_DOUBLE
					: ( dBigints.BitGet ( iCol ) ? JSON_INT64 : JSON_INT32 );

				SphAttr_t uValue = dDoubles.BitGet ( iCol )
					? sphD2QW ( (double)sphDW2F ( tUpd.m_dPool[iPos] ) )
					: dBigints.BitGet ( iCol ) ? MVA_UPSIZE ( &tUpd.m_dPool[iPos] ) : tUpd.m_dPool[iPos];

				if ( sphJsonInplaceUpdate ( eType, uValue, dExpr[iCol].Ptr(), m_tString.GetWritePtr(), pEntry, true ) )
				{
					bUpdated = true;
					uUpdateMask |= ATTRS_STRINGS_UPDATED;

				} else
					iJsonWarnings++;

				iPos += dBigints.BitGet ( iCol ) ? 2 : 1;
				continue;
			}

			// MVA update
			DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pEntry, dLocators[iCol] ) );

			// get new count, store new data if needed
			DWORD uNew = tUpd.m_dPool[iPos++];
			const DWORD * pSrc = tUpd.m_dPool.Begin() + iPos;
			iPos += uNew;

			int64_t iNewMin = LLONG_MAX, iNewMax = LLONG_MIN;
			int iNewIndex = dMvaPtrs[iMvaPtr++];
			if ( uNew )
			{
				assert ( iNewIndex>=0 );
				// arena entry starts with the owner docid, then the value list
				SphDocID_t* pDocid = (SphDocID_t *)(g_pMvaArena + iNewIndex);
				*pDocid++ = ( tUpd.m_dRows[iUpd] ? DOCINFO2ID ( tUpd.m_dRows[iUpd] ) : tUpd.m_dDocids[iUpd] );
				iNewIndex = (DWORD *)pDocid - g_pMvaArena;

				assert ( iNewIndex>=0 );
				DWORD * pDst = g_pMvaArena + iNewIndex;

				bool bDst64 = ( uDst64 & ( U64C(1) << iCol ) )!=0;
				assert ( ( uNew%2 )==0 );
				int iLen = ( bDst64 ? uNew : uNew/2 );
				// setup new value (flagged index) to store within row
				uNew = DWORD(iNewIndex) | MVA_ARENA_FLAG;

				// MVA values counter first
				*pDst++ = iLen;
				if ( bDst64 )
				{
					while ( iLen )
					{
						int64_t uValue = MVA_UPSIZE ( pSrc );
						iNewMin = Min ( iNewMin, uValue );
						iNewMax = Max ( iNewMax, uValue );
						*pDst++ = *pSrc++;
						*pDst++ = *pSrc++;
						iLen -= 2;
					}
				} else
				{
					// 32-bit target: the pool always packs values as 64-bit
					// pairs, so take the low DWORD and skip the high one
					while ( iLen-- )
					{
						DWORD uValue = *pSrc;
						pSrc += 2;
						*pDst++ = uValue;
						iNewMin = Min ( iNewMin, uValue );
						iNewMax = Max ( iNewMax, uValue );
					}
				}
			}

			// store new value
			sphSetRowAttr ( pEntry, dLocators[iCol], uNew );

			// update block and index ranges
			if ( uNew )
				for ( int i=0; i<2; i++ )
			{
				DWORD * pBlock = i ? pBlockRanges : pIndexRanges;
				int64_t iMin = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol] );
				int64_t iMax = sphGetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol] );
				if ( iNewMin<iMin || iNewMax>iMax )
				{
					sphSetRowAttr ( DOCINFO2ATTRS ( pBlock ), dLocators[iCol], Min ( iMin, iNewMin ) );
					sphSetRowAttr ( DOCINFO2ATTRS ( pBlock+iRowStride ), dLocators[iCol], Max ( iMax, iNewMax ) );
				}
			}

			// free old storage if needed
			if ( uOldIndex & MVA_ARENA_FLAG )
			{
				// step back over the leading docid to get the true allocation start
				uOldIndex = ((DWORD*)((SphDocID_t*)(g_pMvaArena + (uOldIndex & MVA_OFFSET_MASK))-1))-g_pMvaArena;
				g_tMvaArena.TaggedFreeIndex ( m_iIndexTag, uOldIndex );
			}

			bUpdated = true;
			uUpdateMask |= ATTRS_MVA_UPDATED;
		}

		if ( bUpdated )
			iUpdated++;
	}

	if ( iJsonWarnings>0 )
	{
		sWarning.SetSprintf ( "%d attribute(s) can not be updated (not found or incompatible types)", iJsonWarnings );
		if ( iUpdated==0 )
		{
			sError = sWarning;
			return -1;
		}
	}

	*m_pAttrsStatus |= uUpdateMask; // FIXME! add lock/atomic?
	return iUpdated;
}
10264 
/// load MVA updates persisted to the .mvp sidecar file (written by
/// SaveAttributes) and re-apply them into the MVA arena; a missing .mvp
/// simply means there is nothing to restore
bool CSphIndex_VLN::LoadPersistentMVA ( CSphString & sError )
{
	// prepare the file to load
	CSphAutoreader fdReader;
	if ( !fdReader.Open ( GetIndexFileName("mvp"), m_sLastError ) )
	{
		// no mvp means no saved attributes.
		m_sLastError = "";
		return true;
	}

	// check if we can
	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
	{
		sError.SetSprintf ( "docinfo=extern required for updates" );
		return false;
	}
	if ( m_bArenaProhibit )
	{
		sError.SetSprintf ( "MVA update disabled (already so many MVA " INT64_FMT ", should be less %d)", m_tMva.GetNumEntries(), INT_MAX );
		return false;
	}

	DWORD uDocs = fdReader.GetDword();

	// if we have docs to update
	// NOTE(review): returns false without setting sError when the .mvp lists
	// zero docs; looks like it should be benign — confirm callers treat it so
	if ( !uDocs )
		return false;

	CSphVector<SphDocID_t> dAffected ( uDocs );
	fdReader.GetBytes ( &dAffected[0], uDocs*sizeof(SphDocID_t) );

	// collect the indexes of MVA schema attributes
	// (32-bit MVA locators first, then 64-bit ones, matching the .mvp layout)
	CSphVector<CSphAttrLocator> dMvaLocators;
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
			dMvaLocators.Add ( tAttr.m_tLocator );
	}
#ifndef NDEBUG
	int iMva64 = dMvaLocators.GetLength();
#endif
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
		if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
			dMvaLocators.Add ( tAttr.m_tLocator );
	}
	assert ( dMvaLocators.GetLength()!=0 );

	if ( g_tMvaArena.GetError() ) // have to reset affected MVA in case of ( persistent MVA + no MVA arena )
	{
		ARRAY_FOREACH ( iDoc, dAffected )
		{
			DWORD * pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[iDoc] ) );
			assert ( pDocinfo );
			DWORD * pAttrs = DOCINFO2ATTRS ( pDocinfo );
			ARRAY_FOREACH ( iMva, dMvaLocators )
			{
				// reset MVA from arena
				if ( MVA_DOWNSIZE ( sphGetRowAttr ( pAttrs, dMvaLocators[iMva] ) ) & MVA_ARENA_FLAG )
					sphSetRowAttr ( pAttrs, dMvaLocators[iMva], 0 );
			}
		}

		sphWarning ( "index '%s' forced to reset persistent MVAs ( %s )", m_sIndexName.cstr(), g_tMvaArena.GetError() );
		fdReader.Close();
		return true;
	}

	CSphVector<DWORD*> dRowPtrs ( uDocs );
	CSphVector<int> dAllocs;
	dAllocs.Reserve ( uDocs );

	// prealloc values (and also preload)
	// all allocations are made before any row is touched, so a failure can be
	// rolled back without leaving the index half-updated
	bool bFailed = false;
	ARRAY_FOREACH ( i, dAffected )
	{
		DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
		assert ( pDocinfo );
		pDocinfo = DOCINFO2ATTRS ( pDocinfo );
		ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
		{
			// if this MVA was updated
			if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
			{
				DWORD uCount = fdReader.GetDword();
				if ( uCount )
				{
					assert ( j<iMva64 || ( uCount%2 )==0 );
					// arena entry layout: owner docid, value count, values
					int iAlloc = g_tMvaArena.TaggedAlloc ( m_iIndexTag, (1+uCount)*sizeof(DWORD)+sizeof(SphDocID_t) );
					if ( iAlloc<0 )
						bFailed = true;
					else
					{
						SphDocID_t *pDocid = (SphDocID_t*)(g_pMvaArena + iAlloc);
						*pDocid++ = dAffected[i];
						DWORD * pData = (DWORD*)pDocid;
						*pData++ = uCount;
						fdReader.GetBytes ( pData, uCount*sizeof(DWORD) );
						dAllocs.Add ( iAlloc );
					}
				}
			}
		}
		if ( bFailed )
			break;
		dRowPtrs[i] = pDocinfo;
	}
	fdReader.Close();

	if ( bFailed )
	{
		// roll back every allocation made so far
		ARRAY_FOREACH ( i, dAllocs )
			g_tMvaArena.TaggedFreeIndex ( m_iIndexTag, dAllocs[i] );

		sError.SetSprintf ( "out of pool memory on loading persistent MVA values" );
		return false;
	}

	// prealloc && load ok, fix the attributes now
	// point each updated row at its freshly loaded arena entry (skipping the
	// leading docid), with the arena flag set
	int iAllocIndex = 0;
	ARRAY_FOREACH ( i, dAffected )
	{
		DWORD* pDocinfo = dRowPtrs[i];
		assert ( pDocinfo );
		ARRAY_FOREACH_COND ( j, dMvaLocators, !bFailed )
			// if this MVA was updated
			if ( MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) ) & MVA_ARENA_FLAG )
				sphSetRowAttr ( pDocinfo, dMvaLocators[j],
					((DWORD*)(((SphDocID_t*)(g_pMvaArena + dAllocs[iAllocIndex++]))+1) - g_pMvaArena) | MVA_ARENA_FLAG );
	}
	return true;
}
10400 
10401 //////////////////////////////////////////////////////////////////////////
10402 
PrecomputeMinMax()10403 bool CSphIndex_VLN::PrecomputeMinMax()
10404 {
10405 	if ( !m_iDocinfo )
10406 		return true;
10407 
10408 	AttrIndexBuilder_c tBuilder ( m_tSchema );
10409 	tBuilder.Prepare ( m_pDocinfoIndex, m_pDocinfoIndex + ( m_iDocinfoIndex+1 ) * 2 * ( DOCINFO_IDSIZE + m_tSchema.GetRowSize() ) );
10410 
10411 	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
10412 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PRECOMPUTE;
10413 	m_tProgress.m_iDone = 0;
10414 	m_iMinMaxIndex = 0;
10415 
10416 	for ( int64_t iIndexEntry=0; iIndexEntry<m_iDocinfo; iIndexEntry++ )
10417 	{
10418 		if ( !tBuilder.Collect ( m_tAttr.GetWritePtr() + iIndexEntry * iStride, m_tMva.GetWritePtr(),
10419 				m_tMva.GetNumEntries(), m_sLastError, true ) )
10420 				return false;
10421 
10422 		m_iMinMaxIndex += iStride;
10423 
10424 		// show progress
10425 		int64_t iDone = (iIndexEntry+1)*1000/m_iDocinfoIndex;
10426 		if ( iDone!=m_tProgress.m_iDone )
10427 		{
10428 			m_tProgress.m_iDone = (int)iDone;
10429 			m_tProgress.Show ( m_tProgress.m_iDone==1000 );
10430 		}
10431 	}
10432 
10433 	tBuilder.FinishCollect();
10434 	return true;
10435 }
10436 
10437 // safely rename an index file
JuggleFile(const char * szExt,CSphString & sError,bool bNeedOrigin) const10438 bool CSphIndex_VLN::JuggleFile ( const char* szExt, CSphString & sError, bool bNeedOrigin ) const
10439 {
10440 	CSphString sExt = GetIndexFileName ( szExt );
10441 	CSphString sExtNew, sExtOld;
10442 	sExtNew.SetSprintf ( "%s.tmpnew", sExt.cstr() );
10443 	sExtOld.SetSprintf ( "%s.tmpold", sExt.cstr() );
10444 
10445 	if ( ::rename ( sExt.cstr(), sExtOld.cstr() ) )
10446 	{
10447 		if ( bNeedOrigin )
10448 		{
10449 			sError.SetSprintf ( "rename '%s' to '%s' failed: %s", sExt.cstr(), sExtOld.cstr(), strerror(errno) );
10450 			return false;
10451 		}
10452 	}
10453 
10454 	if ( ::rename ( sExtNew.cstr(), sExt.cstr() ) )
10455 	{
10456 		if ( bNeedOrigin && !::rename ( sExtOld.cstr(), sExt.cstr() ) )
10457 		{
10458 			// rollback failed too!
10459 			sError.SetSprintf ( "rollback rename to '%s' failed: %s; INDEX UNUSABLE; FIX FILE NAMES MANUALLY", sExt.cstr(), strerror(errno) );
10460 		} else
10461 		{
10462 			// rollback went ok
10463 			sError.SetSprintf ( "rename '%s' to '%s' failed: %s", sExtNew.cstr(), sExt.cstr(), strerror(errno) );
10464 		}
10465 		return false;
10466 	}
10467 
10468 	// all done
10469 	::unlink ( sExtOld.cstr() );
10470 	return true;
10471 }
10472 
/// flush updated attributes to disk: updated MVAs are persisted to a .mvp
/// sidecar, the docinfo pool to .spa, and updated strings/JSON to .sps;
/// each file is swapped in atomically via JuggleFile()
bool CSphIndex_VLN::SaveAttributes ( CSphString & sError ) const
{
	// nothing to do if nothing was updated (or the index is empty)
	if ( !m_pAttrsStatus || !*m_pAttrsStatus || !m_iDocinfo )
		return true;

	if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
	{
		sError.SetSprintf ( "ondisk_attrs enabled; saving is not (yet) possible" );
		return false;
	}

	// snapshot the status; concurrent updates may set more bits meanwhile
	DWORD uAttrStatus = *m_pAttrsStatus;

	sphLogDebugvv ( "index '%s' attrs (%d) saving...", m_sIndexName.cstr(), uAttrStatus );

	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_iDocinfo && m_tAttr.GetWritePtr() );

	// single-pass loop used only as a breakable scope for the MVA flush
	for ( ; uAttrStatus & ATTRS_MVA_UPDATED ; )
	{
		// collect the indexes of MVA schema attributes
		// (32-bit locators first, then 64-bit; LoadPersistentMVA reads them back in this order)
		CSphVector<CSphAttrLocator> dMvaLocators;
		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
		{
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
				dMvaLocators.Add ( tAttr.m_tLocator );
		}
#ifndef NDEBUG
		int iMva64 = dMvaLocators.GetLength();
#endif
		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
		{
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
			if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
				dMvaLocators.Add ( tAttr.m_tLocator );
		}

		// collect the list of all docids with changed MVA attributes
		CSphVector<SphDocID_t> dAffected;
		{
			tDocCollector dCollect ( dAffected );
			g_tMvaArena.ExamineTag ( &dCollect, m_iIndexTag );
		}
		dAffected.Uniq();

		if ( !dAffected.GetLength() )
			break;

		// prepare the file to save into;
		CSphWriter fdFlushMVA;
		fdFlushMVA.OpenFile ( GetIndexFileName("mvp.tmpnew"), sError );
		if ( fdFlushMVA.IsError() )
			return false;

		// save the vector of affected docids
		DWORD uPos = dAffected.GetLength();
		fdFlushMVA.PutDword ( uPos );
		fdFlushMVA.PutBytes ( &dAffected[0], uPos*sizeof(SphDocID_t) );

		// save the updated MVA vectors
		ARRAY_FOREACH ( i, dAffected )
		{
			DWORD* pDocinfo = const_cast<DWORD*> ( FindDocinfo ( dAffected[i] ) );
			assert ( pDocinfo );

			pDocinfo = DOCINFO2ATTRS ( pDocinfo );
			ARRAY_FOREACH ( j, dMvaLocators )
			{
				DWORD uOldIndex = MVA_DOWNSIZE ( sphGetRowAttr ( pDocinfo, dMvaLocators[j] ) );
				// if this MVA was updated
				if ( uOldIndex & MVA_ARENA_FLAG )
				{
					// write the value count followed by the raw values
					DWORD * pMva = g_pMvaArena + ( uOldIndex & MVA_OFFSET_MASK );
					DWORD uCount = *pMva;
					assert ( j<iMva64 || ( uCount%2 )==0 );
					fdFlushMVA.PutDword ( uCount );
					fdFlushMVA.PutBytes ( pMva+1, uCount*sizeof(DWORD) );
				}
			}
		}
		fdFlushMVA.CloseFile();
		if ( !JuggleFile ( "mvp", sError, false ) )
			return false;
		break;
	}

	if ( m_bId32to64 )
	{
		sError.SetSprintf ( "id32 index loaded by id64 binary; saving is not (yet) possible" );
		return false;
	}

	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && m_iDocinfo && m_tAttr.GetWritePtr() );

	// save current state
	CSphAutofile fdTmpnew ( GetIndexFileName("spa.tmpnew"), SPH_O_NEW, sError );
	if ( fdTmpnew.GetFD()<0 )
		return false;

	// docinfo rows, plus the min/max block index for format v20+
	int uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	int64_t iSize = m_iDocinfo*sizeof(DWORD)*uStride;
	if ( m_uVersion>=20 )
		iSize += (m_iDocinfoIndex+1)*uStride*sizeof(CSphRowitem)*2;

	if ( !sphWriteThrottled ( fdTmpnew.GetFD(), m_tAttr.GetWritePtr(), iSize, "docinfo", sError, &g_tThrottle ) )
		return false;

	fdTmpnew.Close ();

	if ( !JuggleFile ( "spa", sError ) )
		return false;

	// tell the binlog the updates are now durable on disk
	if ( m_bBinlog && g_pBinlog )
		g_pBinlog->NotifyIndexFlush ( m_sIndexName.cstr(), m_iTID, false );

	// save .sps file (inplace update only, no remapping/resizing)
	if ( uAttrStatus & ATTRS_STRINGS_UPDATED )
	{
		CSphWriter tStrWriter;
		if ( !tStrWriter.OpenFile ( GetIndexFileName("sps.tmpnew"), sError ) )
			return false;
		tStrWriter.PutBytes ( m_tString.GetWritePtr(), m_tString.GetLengthBytes() );
		tStrWriter.CloseFile();
		if ( !JuggleFile ( "sps", sError ) )
			return false;
	}

	// clear the status only if no new updates arrived while we were flushing
	if ( *m_pAttrsStatus==uAttrStatus )
		*m_pAttrsStatus = 0;

	sphLogDebugvv ( "index '%s' attrs (%d) saved", m_sIndexName.cstr(), *m_pAttrsStatus );

	return true;
}
10607 
GetAttributeStatus() const10608 DWORD CSphIndex_VLN::GetAttributeStatus () const
10609 {
10610 	assert ( m_pAttrsStatus );
10611 	return *m_pAttrsStatus;
10612 }
10613 
10614 
/// convert one docinfo row to a schema with one extra attribute;
/// copies the old attribute block, zeroes the new attribute, carries the docid over,
/// and returns a pointer to the next source row
const CSphRowitem * CSphIndex_VLN::CopyRow ( const CSphRowitem * pDocinfo, DWORD * pTmpDocinfo, const CSphColumnInfo * pNewAttr, int iOldStride ) const
{
	DWORD * pDstAttrs = DOCINFO2ATTRS ( pTmpDocinfo );

	// bulk-copy the old attribute payload into the (wider) target row
	memcpy ( pDstAttrs, DOCINFO2ATTRS ( pDocinfo ), (iOldStride - DOCINFO_IDSIZE)*sizeof(DWORD) );

	// the freshly added attribute starts out as zero
	sphSetRowAttr ( pDstAttrs, pNewAttr->m_tLocator, 0 );

	// document id goes over verbatim
	DOCINFOSETID ( pTmpDocinfo, DOCINFO2ID ( pDocinfo ) );
	return pDocinfo + iOldStride;
}
10624 
10625 
/// convert one docinfo row to a schema with attribute iAttrToRemove dropped;
/// every surviving attribute is fetched via its old locator and stored via its
/// new one (dAttrMap maps old attr index -> new attr index);
/// returns a pointer to the next source row
static const CSphRowitem * CopyRowAttrByAttr ( const CSphRowitem * pDocinfo, DWORD * pTmpDocinfo, const CSphSchema & tOldSchema, const CSphSchema & tNewSchema, int iAttrToRemove, const CSphVector<int> & dAttrMap, int iOldStride )
{
	// document id goes over verbatim
	DOCINFOSETID ( pTmpDocinfo, DOCINFO2ID ( pDocinfo ) );

	const DWORD * pSrcAttrs = DOCINFO2ATTRS ( pDocinfo );
	DWORD * pDstAttrs = DOCINFO2ATTRS ( pTmpDocinfo );

	int iAttrs = tOldSchema.GetAttrsCount();
	for ( int iAttr=0; iAttr<iAttrs; iAttr++ )
	{
		if ( iAttr==iAttrToRemove )
			continue;

		SphAttr_t tValue = sphGetRowAttr ( pSrcAttrs, tOldSchema.GetAttr ( iAttr ).m_tLocator );
		sphSetRowAttr ( pDstAttrs, tNewSchema.GetAttr ( dAttrMap[iAttr] ).m_tLocator, tValue );
	}

	return pDocinfo + iOldStride;
}
10639 
10640 
CreateAttrMap(CSphVector<int> & dAttrMap,const CSphSchema & tOldSchema,const CSphSchema & tNewSchema,int iAttrToRemove)10641 static void CreateAttrMap ( CSphVector<int> & dAttrMap, const CSphSchema & tOldSchema, const CSphSchema & tNewSchema, int iAttrToRemove )
10642 {
10643 	dAttrMap.Resize ( tOldSchema.GetAttrsCount() );
10644 	for ( int iAttr = 0; iAttr < tOldSchema.GetAttrsCount(); iAttr++ )
10645 		if ( iAttr!=iAttrToRemove )
10646 		{
10647 			dAttrMap[iAttr] = tNewSchema.GetAttrIndex ( tOldSchema.GetAttr ( iAttr ).m_sName.cstr() );
10648 			assert ( dAttrMap[iAttr]>=0 );
10649 		} else
10650 			dAttrMap[iAttr] = -1;
10651 }
10652 
10653 
CreateModifiedFiles(bool bAddAttr,const CSphString & sAttrName,ESphAttr eAttrType,int iPos,CSphString & sError)10654 bool CSphIndex_VLN::CreateModifiedFiles ( bool bAddAttr, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError )
10655 {
10656 	if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
10657 	{
10658 		sError.SetSprintf ( "ondisk_attrs enabled; adding and removing attributes is not (yet) possible" );
10659 		return false;
10660 	}
10661 
10662 	CSphSchema tNewSchema = m_tSchema;
10663 
10664 	if ( bAddAttr )
10665 	{
10666 		CSphColumnInfo tInfo ( sAttrName.cstr(), eAttrType );
10667 		tNewSchema.InsertAttr ( iPos, tInfo, false );
10668 	} else
10669 		tNewSchema.RemoveAttr ( sAttrName.cstr(), false );
10670 
10671 	CSphFixedVector<CSphRowitem> dMinRow ( tNewSchema.GetRowSize() );
10672 	int iOldStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
10673 	int iNewStride = DOCINFO_IDSIZE + tNewSchema.GetRowSize();
10674 
10675 	int64_t iNewMinMaxIndex = m_iDocinfo*iNewStride;
10676 
10677 	BuildHeader_t tBuildHeader ( m_tStats );
10678 	tBuildHeader.m_sHeaderExtension = "new.sph";
10679 	tBuildHeader.m_pThrottle = &g_tThrottle;
10680 	tBuildHeader.m_pMinRow = dMinRow.Begin();
10681 	tBuildHeader.m_uMinDocid = m_uMinDocid;
10682 	tBuildHeader.m_uKillListSize = m_uKillListSize;
10683 	tBuildHeader.m_iMinMaxIndex = iNewMinMaxIndex;
10684 
10685 	*(DictHeader_t*)&tBuildHeader = *(DictHeader_t*)&m_tWordlist;
10686 
10687 	CSphSchema tOldSchema = m_tSchema;
10688 	m_tSchema = tNewSchema;
10689 
10690 	// save the header
10691 	bool bBuildRes = BuildDone ( tBuildHeader, sError );
10692 	m_tSchema = tOldSchema;
10693 	if ( !bBuildRes )
10694 		return false;
10695 
10696 	// generate a new .SPA file
10697 	CSphWriter tSPAWriter;
10698 	tSPAWriter.SetBufferSize ( 524288 );
10699 	CSphString sSPAfile = GetIndexFileName ( "new.spa" );
10700 	if ( !tSPAWriter.OpenFile ( sSPAfile, sError ) )
10701 		return false;
10702 
10703 	const CSphRowitem * pDocinfo = m_tAttr.GetWritePtr();
10704 	if ( !pDocinfo )
10705 	{
10706 		sError = "index must have at least one attribute";
10707 		return false;
10708 	}
10709 
10710 	CSphFixedVector<DWORD> dTmpDocinfos ( iNewStride );
10711 	DWORD * pTmpDocinfo = dTmpDocinfos.Begin();
10712 
10713 	if ( bAddAttr )
10714 	{
10715 		const CSphColumnInfo * pNewAttr = tNewSchema.GetAttr ( sAttrName.cstr() );
10716 		assert ( pNewAttr );
10717 
10718 		for ( int i = 0; i < m_iDocinfo + (m_iDocinfoIndex+1)*2 && !tSPAWriter.IsError(); i++ )
10719 		{
10720 			pDocinfo = CopyRow ( pDocinfo, pTmpDocinfo, pNewAttr, iOldStride );
10721 			tSPAWriter.PutBytes ( pTmpDocinfo, iNewStride*sizeof(DWORD) );
10722 		}
10723 	} else
10724 	{
10725 		int iAttrToRemove = tOldSchema.GetAttrIndex ( sAttrName.cstr() );
10726 		assert ( iAttrToRemove>=0 );
10727 
10728 		CSphVector<int> dAttrMap;
10729 		CreateAttrMap ( dAttrMap, tOldSchema, tNewSchema, iAttrToRemove );
10730 
10731 		for ( int i = 0; i < m_iDocinfo + (m_iDocinfoIndex+1)*2 && !tSPAWriter.IsError(); i++ )
10732 		{
10733 			pDocinfo = CopyRowAttrByAttr ( pDocinfo, pTmpDocinfo, tOldSchema, tNewSchema, iAttrToRemove, dAttrMap, iOldStride );
10734 			tSPAWriter.PutBytes ( pTmpDocinfo, iNewStride*sizeof(DWORD) );
10735 		}
10736 	}
10737 
10738 	if ( tSPAWriter.IsError() )
10739 	{
10740 		sError.SetSprintf ( "error writing to %s", sSPAfile.cstr() );
10741 		return false;
10742 	}
10743 
10744 	return true;
10745 }
10746 
10747 
AddRemoveAttribute(bool bAdd,const CSphString & sAttrName,ESphAttr eAttrType,int iPos,CSphString & sError)10748 bool CSphIndex_VLN::AddRemoveAttribute ( bool bAdd, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError )
10749 {
10750 	if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
10751 	{
10752 		sError.SetSprintf ( "ondisk_attrs enabled; adding attribute is not (yet) possible" );
10753 		return false;
10754 	}
10755 
10756 	int iOldStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
10757 	CSphSchema tOldSchema = m_tSchema;
10758 
10759 	if ( bAdd )
10760 	{
10761 		CSphColumnInfo tInfo ( sAttrName.cstr(), eAttrType );
10762 		m_tSchema.InsertAttr ( iPos, tInfo, false );
10763 
10764 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
10765 			m_tSettings.m_eDocinfo = SPH_DOCINFO_EXTERN;
10766 	} else
10767 		m_tSchema.RemoveAttr ( sAttrName.cstr(), false );
10768 
10769 	int iNewStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
10770 	m_iMinMaxIndex = m_iDocinfo*iNewStride;
10771 
10772 	CSphString sWarning;
10773 	const CSphRowitem * pDocinfo = m_dAttrShared.GetWritePtr();
10774 	CSphSharedBuffer<DWORD> pNewDocinfo;
10775 
10776 	// fixme: this could cause inconsistency between on-disk and in-memory data
10777 	if ( !pNewDocinfo.Alloc ( m_iDocinfo*iNewStride + (m_iDocinfoIndex+1)*iNewStride*2, sError, sWarning ) )
10778 		return false;
10779 
10780 	DWORD * pNewDocinfos = pNewDocinfo.GetWritePtr();
10781 	assert ( pNewDocinfos );
10782 
10783 	if ( bAdd )
10784 	{
10785 		const CSphColumnInfo * pNewAttr = m_tSchema.GetAttr ( sAttrName.cstr() );
10786 		assert ( pNewAttr );
10787 
10788 		for ( int i = 0; i < m_iDocinfo + (m_iDocinfoIndex+1)*2; i++ )
10789 		{
10790 			pDocinfo = CopyRow ( pDocinfo, pNewDocinfos, pNewAttr, iOldStride );
10791 			pNewDocinfos += iNewStride;
10792 		}
10793 	} else
10794 	{
10795 		int iAttrToRemove = tOldSchema.GetAttrIndex ( sAttrName.cstr() );
10796 		assert ( iAttrToRemove>=0 );
10797 
10798 		CSphVector<int> dAttrMap;
10799 		CreateAttrMap ( dAttrMap, tOldSchema, m_tSchema, iAttrToRemove );
10800 
10801 		for ( int i = 0; i < m_iDocinfo + (m_iDocinfoIndex+1)*2; i++ )
10802 		{
10803 			pDocinfo = CopyRowAttrByAttr ( pDocinfo, pNewDocinfos, tOldSchema, m_tSchema, iAttrToRemove, dAttrMap, iOldStride );
10804 			pNewDocinfos += iNewStride;
10805 		}
10806 	}
10807 
10808 	m_dAttrShared.Swap ( pNewDocinfo );
10809 	m_tAttr.Set ( m_dAttrShared.GetWritePtr(), m_dAttrShared.GetNumEntries() );
10810 
10811 	m_pDocinfoIndex = m_dAttrShared.GetWritePtr() + m_iDocinfo*iNewStride;
10812 
10813 	return true;
10814 }
10815 
10816 
10817 /////////////////////////////////////////////////////////////////////////////
10818 
10819 struct CmpHit_fn
10820 {
IsLessCmpHit_fn10821 	inline bool IsLess ( const CSphWordHit & a, const CSphWordHit & b ) const
10822 	{
10823 		return ( a.m_uWordID<b.m_uWordID ) ||
10824 				( a.m_uWordID==b.m_uWordID && a.m_uDocID<b.m_uDocID ) ||
10825 				( a.m_uWordID==b.m_uWordID && a.m_uDocID==b.m_uDocID && HITMAN::GetPosWithField ( a.m_uWordPos )<HITMAN::GetPosWithField ( b.m_uWordPos ) );
10826 	}
10827 };
10828 
10829 
10830 /// sort baked docinfos by document ID
struct DocinfoSort_fn
{
	// sphSort() keeps medians by value in this type
	typedef SphDocID_t MEDIAN_TYPE;

	// row width in DWORDs (docid plus attributes)
	int m_iStride;

	explicit DocinfoSort_fn ( int iStride )
		: m_iStride ( iStride )
	{}

	// the sort key of a row is the document id stored at its start
	SphDocID_t Key ( DWORD * pData ) const
	{
		return DOCINFO2ID(pData);
	}

	void CopyKey ( SphDocID_t * pMed, DWORD * pVal ) const
	{
		*pMed = Key(pVal);
	}

	bool IsLess ( SphDocID_t a, SphDocID_t b ) const
	{
		return a < b;
	}

	// swap two full rows, element by element
	void Swap ( DWORD * a, DWORD * b ) const
	{
		for ( int i=0; i<m_iStride; i++ )
			::Swap ( a[i], b[i] );
	}

	// advance a row pointer by i rows (not DWORDs)
	DWORD * Add ( DWORD * p, int i ) const
	{
		return p+i*m_iStride;
	}

	// distance between two row pointers, in rows
	int Sub ( DWORD * b, DWORD * a ) const
	{
		return (int)((b-a)/m_iStride);
	}
};
10872 
10873 
sphSortDocinfos(DWORD * pBuf,int iCount,int iStride)10874 void sphSortDocinfos ( DWORD * pBuf, int iCount, int iStride )
10875 {
10876 	DocinfoSort_fn fnSort ( iStride );
10877 	sphSort ( pBuf, iCount, fnSort, fnSort );
10878 }
10879 
10880 
GetIndexFileName(const char * sExt) const10881 CSphString CSphIndex_VLN::GetIndexFileName ( const char * sExt ) const
10882 {
10883 	CSphString sRes;
10884 	sRes.SetSprintf ( "%s.%s", m_sFilename.cstr(), sExt );
10885 	return sRes;
10886 }
10887 
10888 
/// hit-stream-to-disk serializer; consumes sorted hits via cidxHit() and
/// writes the doclist, hitlist and skiplist files, feeding per-keyword
/// entries into the dictionary writer as each doclist is finished
class CSphHitBuilder
{
public:
	CSphHitBuilder ( const CSphIndexSettings & tSettings, const CSphVector<SphWordID_t> & dHitless, bool bMerging, int iBufSize, CSphDict * pDict, CSphString * sError );
	~CSphHitBuilder () {}

	bool	CreateIndexFiles ( const char * sDocName, const char * sHitName, const char * sSkipName, bool bInplace, int iWriteBuffer, CSphAutofile & tHit, SphOffset_t * pSharedOffset );
	void	HitReset ();
	void	cidxHit ( CSphAggregateHit * pHit, const CSphRowitem * pAttrs );
	bool	cidxDone ( int iMemLimit, int iMinInfixLen, int iMaxCodepointLen, DictHeader_t * pDictHeader );
	int		cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int iDocinfos, int iStride );

	SphOffset_t		GetHitfilePos () const { return m_wrHitlist.GetPos (); }
	void			CloseHitlist () { m_wrHitlist.CloseFile (); }
	bool			IsError () const { return ( m_pDict->DictIsError() || m_wrDoclist.IsError() || m_wrHitlist.IsError() ); }
	void			SetMin ( const CSphRowitem * pDynamic, int iDynamic );
	void			HitblockBegin () { m_pDict->HitblockBegin(); }
	bool			IsWordDict () const { return m_pDict->GetSettings().m_bWordDict; }
	void			SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }

private:
	void	DoclistBeginEntry ( SphDocID_t uDocid, const DWORD * pAttrs );
	void	DoclistEndEntry ( Hitpos_t uLastPos );
	void	DoclistEndList ();

	CSphWriter					m_wrDoclist;			///< wordlist writer
	CSphWriter					m_wrHitlist;			///< hitlist writer
	CSphWriter					m_wrSkiplist;			///< skiplist writer
	CSphFixedVector<BYTE>		m_dWriteBuffer;			///< my write buffer (for temp files)
	ThrottleState_t *			m_pThrottle;			///< I/O throttling state (not owned)

	CSphFixedVector<CSphRowitem>	m_dMinRow;			///< min attribute row; delta base for inline docinfo (see SetMin)

	CSphAggregateHit			m_tLastHit;				///< hitlist entry
	Hitpos_t					m_iPrevHitPos;			///< previous hit position
	bool						m_bGotFieldEnd;			///< is there a postponed field-end hit waiting to be flushed?
	BYTE						m_sLastKeyword [ MAX_KEYWORD_BYTES ];	///< backing storage for m_tLastHit.m_sKeyword

	const CSphVector<SphWordID_t> &	m_dHitlessWords;
	CSphDict *					m_pDict;				///< dictionary writer (not owned)
	CSphString *				m_pLastError;			///< error sink (not owned)

	SphOffset_t					m_iLastHitlistPos;		///< doclist entry
	SphOffset_t					m_iLastHitlistDelta;	///< doclist entry
	FieldMask_t					m_dLastDocFields;		///< doclist entry
	DWORD						m_uLastDocHits;			///< doclist entry

	CSphDictEntry				m_tWord;				///< dictionary entry

	ESphHitFormat				m_eHitFormat;			///< plain vs inline doclist format
	ESphHitless					m_eHitless;
	bool						m_bMerging;				///< merge mode relaxes the ascending-wordid assertion in cidxHit

	CSphVector<SkiplistEntry_t>	m_dSkiplist;			///< skiplist entries accumulated for the current keyword
};
10944 
10945 
/// note: the initializer list order must match the member declaration order
/// in CSphHitBuilder; the remaining members are reset in the body
CSphHitBuilder::CSphHitBuilder ( const CSphIndexSettings & tSettings,
	const CSphVector<SphWordID_t> & dHitless, bool bMerging, int iBufSize,
	CSphDict * pDict, CSphString * sError )
	: m_dWriteBuffer ( iBufSize )
	, m_dMinRow ( 0 )
	, m_iPrevHitPos ( 0 )
	, m_bGotFieldEnd ( false )
	, m_dHitlessWords ( dHitless )
	, m_pDict ( pDict )
	, m_pLastError ( sError )
	, m_eHitFormat ( tSettings.m_eHitFormat )
	, m_eHitless ( tSettings.m_eHitless )
	, m_bMerging ( bMerging )
{
	// start with an empty last keyword and no pending hit state
	m_sLastKeyword[0] = '\0';
	HitReset();

	// reset per-doclist-entry trackers
	m_iLastHitlistPos = 0;
	m_iLastHitlistDelta = 0;
	m_dLastDocFields.UnsetAll();
	m_uLastDocHits = 0;

	// reset per-keyword dictionary entry trackers
	m_tWord.m_iDoclistOffset = 0;
	m_tWord.m_iDocs = 0;
	m_tWord.m_iHits = 0;

	// dictionary and error sink are mandatory
	assert ( m_pDict );
	assert ( m_pLastError );

	m_pThrottle = &g_tThrottle;
}
10977 
10978 
SetMin(const CSphRowitem * pDynamic,int iDynamic)10979 void CSphHitBuilder::SetMin ( const CSphRowitem * pDynamic, int iDynamic )
10980 {
10981 	assert ( !iDynamic || pDynamic );
10982 
10983 	m_dMinRow.Reset ( iDynamic );
10984 	ARRAY_FOREACH ( i, m_dMinRow )
10985 	{
10986 		m_dMinRow[i] = pDynamic[i];
10987 	}
10988 }
10989 
10990 
CreateIndexFiles(const char * sDocName,const char * sHitName,const char * sSkipName,bool bInplace,int iWriteBuffer,CSphAutofile & tHit,SphOffset_t * pSharedOffset)10991 bool CSphHitBuilder::CreateIndexFiles ( const char * sDocName, const char * sHitName, const char * sSkipName,
10992 	bool bInplace, int iWriteBuffer, CSphAutofile & tHit, SphOffset_t * pSharedOffset )
10993 {
10994 	// doclist and hitlist files
10995 	m_wrDoclist.CloseFile();
10996 	m_wrHitlist.CloseFile();
10997 	m_wrSkiplist.CloseFile();
10998 
10999 	m_wrDoclist.SetBufferSize ( m_dWriteBuffer.GetLength() );
11000 	m_wrHitlist.SetBufferSize ( bInplace ? iWriteBuffer : m_dWriteBuffer.GetLength() );
11001 	m_wrDoclist.SetThrottle ( m_pThrottle );
11002 	m_wrHitlist.SetThrottle ( m_pThrottle );
11003 
11004 	if ( !m_wrDoclist.OpenFile ( sDocName, *m_pLastError ) )
11005 		return false;
11006 
11007 	if ( bInplace )
11008 	{
11009 		sphSeek ( tHit.GetFD(), 0, SEEK_SET );
11010 		m_wrHitlist.SetFile ( tHit, pSharedOffset, *m_pLastError );
11011 	} else
11012 	{
11013 		if ( !m_wrHitlist.OpenFile ( sHitName, *m_pLastError ) )
11014 			return false;
11015 	}
11016 
11017 	if ( !m_wrSkiplist.OpenFile ( sSkipName, *m_pLastError ) )
11018 		return false;
11019 
11020 	// put dummy byte (otherwise offset would start from 0, first delta would be 0
11021 	// and VLB encoding of offsets would fuckup)
11022 	BYTE bDummy = 1;
11023 	m_wrDoclist.PutBytes ( &bDummy, 1 );
11024 	m_wrHitlist.PutBytes ( &bDummy, 1 );
11025 	m_wrSkiplist.PutBytes ( &bDummy, 1 );
11026 	return true;
11027 }
11028 
11029 
HitReset()11030 void CSphHitBuilder::HitReset()
11031 {
11032 	m_tLastHit.m_uDocID = 0;
11033 	m_tLastHit.m_uWordID = 0;
11034 	m_tLastHit.m_iWordPos = EMPTY_HIT;
11035 	m_tLastHit.m_sKeyword = m_sLastKeyword;
11036 	m_iPrevHitPos = 0;
11037 	m_bGotFieldEnd = false;
11038 }
11039 
11040 
11041 // doclist entry format
11042 // (with the new and shiny "inline hit" format, that is)
11043 //
11044 // zint docid_delta
11045 // zint[] inline_attrs
11046 // zint doc_hits
11047 // if doc_hits==1:
11048 // 		zint field_pos
11049 // 		zint field_no
11050 // else:
11051 // 		zint field_mask
11052 // 		zint hlist_offset_delta
11053 //
11054 // so 4 bytes/doc minimum
11055 // avg 4-6 bytes/doc according to our tests
11056 
11057 
/// start a doclist entry for uDocid; pAttrs is the inline attribute row
/// (NULL unless inline docinfo is in use)
void CSphHitBuilder::DoclistBeginEntry ( SphDocID_t uDocid, const DWORD * pAttrs )
{
	// build skiplist
	// that is, save decoder state and doclist position per every 128 documents
	if ( ( m_tWord.m_iDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
	{
		SkiplistEntry_t & tBlock = m_dSkiplist.Add();
		tBlock.m_iBaseDocid = m_tLastHit.m_uDocID;
		tBlock.m_iOffset = m_wrDoclist.GetPos();
		tBlock.m_iBaseHitlistPos = m_iLastHitlistPos;
	}

	// begin doclist entry; docid is delta-coded against the previous one
	m_wrDoclist.ZipOffset ( uDocid - m_tLastHit.m_uDocID );
	assert ( !pAttrs || m_dMinRow.GetLength() );
	if ( pAttrs )
	{
		// inline attributes are delta-coded against the min row (see SetMin)
		ARRAY_FOREACH ( i, m_dMinRow )
			m_wrDoclist.ZipInt ( pAttrs[i] - m_dMinRow[i] );
	}
}
11079 
11080 
/// finish the doclist entry for the current document;
/// uLastPos is the position of the document's last hit
void CSphHitBuilder::DoclistEndEntry ( Hitpos_t uLastPos )
{
	// end doclist entry
	if ( m_eHitFormat==SPH_HIT_FORMAT_INLINE )
	{
		bool bIgnoreHits =
			( m_eHitless==SPH_HITLESS_ALL ) ||
			( m_eHitless==SPH_HITLESS_SOME && ( m_tWord.m_iDocs & HITLESS_DOC_FLAG ) );

		// inline the only hit into doclist (unless it is completely discarded)
		// and finish doclist entry
		m_wrDoclist.ZipInt ( m_uLastDocHits );
		if ( m_uLastDocHits==1 && !bIgnoreHits )
		{
			// rewind the hitlist writer: the single hit gets stored inline
			// (position split into 23 low bits and the remaining high bits)
			m_wrHitlist.SeekTo ( m_iLastHitlistPos );
			m_wrDoclist.ZipInt ( uLastPos & 0x7FFFFF );
			m_wrDoclist.ZipInt ( uLastPos >> 23 );
			m_iLastHitlistPos -= m_iLastHitlistDelta;
			assert ( m_iLastHitlistPos>=0 );

		} else
		{
			m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
			m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
		}
	} else // plain format - finish doclist entry
	{
		assert ( m_eHitFormat==SPH_HIT_FORMAT_PLAIN );
		m_wrDoclist.ZipOffset ( m_iLastHitlistDelta );
		m_wrDoclist.ZipInt ( m_dLastDocFields.GetMask32() );
		m_wrDoclist.ZipInt ( m_uLastDocHits );
	}
	// reset per-document trackers
	m_dLastDocFields.UnsetAll();
	m_uLastDocHits = 0;

	// update keyword stats
	m_tWord.m_iDocs++;
}
11119 
11120 
/// finish the doclist for the current keyword: emit the end-of-list marker
/// and, for doclists longer than one skiplist block, the delta-coded skiplist
void CSphHitBuilder::DoclistEndList ()
{
	// emit eof marker
	m_wrDoclist.ZipInt ( 0 );

	// emit skiplist
	// OPTIMIZE? placing it after doclist means an extra seek on searching
	// however placing it before means some (longer) doclist data moves while indexing
	if ( m_tWord.m_iDocs>SPH_SKIPLIST_BLOCK )
	{
		assert ( m_dSkiplist.GetLength() );
		assert ( m_dSkiplist[0].m_iOffset==m_tWord.m_iDoclistOffset );
		assert ( m_dSkiplist[0].m_iBaseDocid==0 );
		assert ( m_dSkiplist[0].m_iBaseHitlistPos==0 );

		m_tWord.m_iSkiplistOffset = m_wrSkiplist.GetPos();

		// delta coding, but with a couple of skiplist specific tricks
		// 1) first entry is omitted, it gets reconstructed from dict itself
		// both base values are zero, and offset equals doclist offset
		// 2) docids are at least SKIPLIST_BLOCK apart
		// doclist entries are at least 4*SKIPLIST_BLOCK bytes apart
		// so we additionally subtract that to improve delta coding
		// 3) zero deltas are allowed and *not* used as any markers,
		// as we know the exact skiplist entry count anyway
		SkiplistEntry_t tLast = m_dSkiplist[0];
		for ( int i=1; i<m_dSkiplist.GetLength(); i++ )
		{
			const SkiplistEntry_t & t = m_dSkiplist[i];
			assert ( t.m_iBaseDocid - tLast.m_iBaseDocid>=SPH_SKIPLIST_BLOCK );
			assert ( t.m_iOffset - tLast.m_iOffset>=4*SPH_SKIPLIST_BLOCK );
			m_wrSkiplist.ZipOffset ( t.m_iBaseDocid - tLast.m_iBaseDocid - SPH_SKIPLIST_BLOCK );
			m_wrSkiplist.ZipOffset ( t.m_iOffset - tLast.m_iOffset - 4*SPH_SKIPLIST_BLOCK );
			m_wrSkiplist.ZipOffset ( t.m_iBaseHitlistPos - tLast.m_iBaseHitlistPos );
			tLast = t;
		}
	}

	// in any event, reset skiplist
	m_dSkiplist.Resize ( 0 );
}
11162 
11163 
/// consume the next hit from the sorted hit stream and serialize it;
/// the asserts below require hits ordered by (wordid, docid, position);
/// a zero wordid with EMPTY_HIT position is a "flush-hit" that finalizes
/// the current doclist and dictionary state
void CSphHitBuilder::cidxHit ( CSphAggregateHit * pHit, const CSphRowitem * pAttrs )
{
	assert (
		( pHit->m_uWordID!=0 && pHit->m_iWordPos!=EMPTY_HIT && pHit->m_uDocID!=0 ) || // it's either ok hit
		( pHit->m_uWordID==0 && pHit->m_iWordPos==EMPTY_HIT ) ); // or "flush-hit"

	/////////////
	// next word
	/////////////

	// with a keyword dictionary, different keywords may share one wordid (crc),
	// so the keyword text must be compared too
	bool bNextWord = ( m_tLastHit.m_uWordID!=pHit->m_uWordID ||
		( m_pDict->GetSettings().m_bWordDict && strcmp ( (char*)m_tLastHit.m_sKeyword, (char*)pHit->m_sKeyword ) ) ); // OPTIMIZE?
	bool bNextDoc = bNextWord || ( m_tLastHit.m_uDocID!=pHit->m_uDocID );

	// flush a postponed field-end hit before switching word/document
	if ( m_bGotFieldEnd && ( bNextWord || bNextDoc ) )
	{
		// writing hits only without duplicates
		assert ( HITMAN::GetPosWithField ( m_iPrevHitPos )!=HITMAN::GetPosWithField ( m_tLastHit.m_iWordPos ) );
		HITMAN::SetEndMarker ( &m_tLastHit.m_iWordPos );
		m_wrHitlist.ZipInt ( m_tLastHit.m_iWordPos - m_iPrevHitPos );
		m_bGotFieldEnd = false;
	}


	if ( bNextDoc )
	{
		// finish hitlist, if any
		Hitpos_t uLastPos = m_tLastHit.m_iWordPos;
		if ( m_tLastHit.m_iWordPos!=EMPTY_HIT )
		{
			// zero delta terminates the hitlist of the previous document
			m_wrHitlist.ZipInt ( 0 );
			m_tLastHit.m_iWordPos = EMPTY_HIT;
			m_iPrevHitPos = EMPTY_HIT;
		}

		// finish doclist entry, if any
		if ( m_tLastHit.m_uDocID )
			DoclistEndEntry ( uLastPos );
	}

	if ( bNextWord )
	{
		// finish doclist, if any
		if ( m_tLastHit.m_uDocID )
		{
			// emit end-of-doclist marker
			DoclistEndList ();

			// emit dict entry
			m_tWord.m_uWordID = m_tLastHit.m_uWordID;
			m_tWord.m_sKeyword = m_tLastHit.m_sKeyword;
			m_tWord.m_iDoclistLength = m_wrDoclist.GetPos() - m_tWord.m_iDoclistOffset;
			m_pDict->DictEntry ( m_tWord );

			// reset trackers
			m_tWord.m_iDocs = 0;
			m_tWord.m_iHits = 0;

			m_tLastHit.m_uDocID = 0;
			m_iLastHitlistPos = 0;
		}

		// flush wordlist, if this is the end
		if ( pHit->m_iWordPos==EMPTY_HIT )
		{
			m_pDict->DictEndEntries ( m_wrDoclist.GetPos() );
			return;
		}

		// incoming words must be strictly ascending, except while merging
		// (the keyword-dict comparison allows equal crc with greater keyword text)
		assert ( pHit->m_uWordID > m_tLastHit.m_uWordID
			|| ( m_pDict->GetSettings().m_bWordDict &&
				pHit->m_uWordID==m_tLastHit.m_uWordID && strcmp ( (char*)pHit->m_sKeyword, (char*)m_tLastHit.m_sKeyword )>0 )
			|| m_bMerging );
		m_tWord.m_iDoclistOffset = m_wrDoclist.GetPos();
		m_tLastHit.m_uWordID = pHit->m_uWordID;
		if ( m_pDict->GetSettings().m_bWordDict )
		{
			assert ( strlen ( (char *)pHit->m_sKeyword )<sizeof(m_sLastKeyword)-1 );
			strncpy ( (char*)m_tLastHit.m_sKeyword, (char*)pHit->m_sKeyword, sizeof(m_sLastKeyword) ); // OPTIMIZE?
		}
	}

	if ( bNextDoc )
	{
		// begin new doclist entry for new doc id
		assert ( pHit->m_uDocID>m_tLastHit.m_uDocID );
		assert ( m_wrHitlist.GetPos()>=m_iLastHitlistPos );

		DoclistBeginEntry ( pHit->m_uDocID, pAttrs );
		m_iLastHitlistDelta = m_wrHitlist.GetPos() - m_iLastHitlistPos;

		m_tLastHit.m_uDocID = pHit->m_uDocID;
		m_iLastHitlistPos = m_wrHitlist.GetPos();
	}

	///////////
	// the hit
	///////////

	if ( !pHit->m_dFieldMask.TestAll(false) ) // merge aggregate hits into the current hit
	{
		int iHitCount = pHit->GetAggrCount();
		assert ( m_eHitless );
		assert ( iHitCount );
		assert ( !pHit->m_dFieldMask.TestAll(false) );

		// aggregate (hitless) hit: only counts and field mask are stored
		m_uLastDocHits += iHitCount;
		for ( int i=0; i<FieldMask_t::SIZE; i++ )
			m_dLastDocFields[i] |= pHit->m_dFieldMask[i];
		m_tWord.m_iHits += iHitCount;

		if ( m_eHitless==SPH_HITLESS_SOME )
			m_tWord.m_iDocs |= HITLESS_DOC_FLAG;

	} else // handle normal hits
	{
		Hitpos_t iHitPosPure = HITMAN::GetPosWithField ( pHit->m_iWordPos );

		// skip any duplicates and keep only 1st position in place
		// duplicates are hit with same position: [N, N] [N, N | FIELDEND_MASK] [N | FIELDEND_MASK, N] [N | FIELDEND_MASK, N | FIELDEND_MASK]
		if ( iHitPosPure==m_tLastHit.m_iWordPos )
			return;

		// storing previous hit that might have a field end flag
		if ( m_bGotFieldEnd )
		{
			if ( HITMAN::GetField ( pHit->m_iWordPos )!=HITMAN::GetField ( m_tLastHit.m_iWordPos ) ) // is field end flag real?
				HITMAN::SetEndMarker ( &m_tLastHit.m_iWordPos );

			m_wrHitlist.ZipInt ( m_tLastHit.m_iWordPos - m_iPrevHitPos );
			m_bGotFieldEnd = false;
		}

		/* duplicate hits from duplicated documents
		... 0x03, 0x03 ...
		... 0x8003, 0x8003 ...
		... 1, 0x8003, 0x03 ...
		... 1, 0x03, 0x8003 ...
		... 1, 0x8003, 0x04 ...
		... 1, 0x03, 0x8003, 0x8003 ...
		... 1, 0x03, 0x8003, 0x03 ...
		*/

		assert ( m_tLastHit.m_iWordPos < pHit->m_iWordPos );

		// add hit delta without field end marker
		// or postpone adding to hitlist till got another uniq hit
		if ( iHitPosPure==pHit->m_iWordPos )
		{
			m_wrHitlist.ZipInt ( pHit->m_iWordPos - m_tLastHit.m_iWordPos );
			m_tLastHit.m_iWordPos = pHit->m_iWordPos;
		} else
		{
			assert ( HITMAN::IsEnd ( pHit->m_iWordPos ) );
			m_bGotFieldEnd = true;
			m_iPrevHitPos = m_tLastHit.m_iWordPos;
			m_tLastHit.m_iWordPos = HITMAN::GetPosWithField ( pHit->m_iWordPos );
		}

		// update matched fields mask
		m_dLastDocFields.Set ( HITMAN::GetField ( pHit->m_iWordPos ) );

		m_uLastDocHits++;
		m_tWord.m_iHits++;
	}
}
11330 
11331 
/// read one field/attribute definition from an index header;
/// the read order here defines the on-disk format and must not change
static void ReadSchemaColumn ( CSphReader & rdInfo, CSphColumnInfo & tCol, DWORD uVersion )
{
	tCol.m_sName = rdInfo.GetString ();
	if ( tCol.m_sName.IsEmpty () )
		tCol.m_sName = "@emptyname";

	// column names are normalized to lowercase
	tCol.m_sName.ToLower ();
	tCol.m_eAttrType = (ESphAttr) rdInfo.GetDword (); // FIXME? check/fixup?

	if ( uVersion>=5 ) // m_uVersion for searching
	{
		rdInfo.GetDword (); // ignore rowitem
		tCol.m_tLocator.m_iBitOffset = rdInfo.GetDword ();
		tCol.m_tLocator.m_iBitCount = rdInfo.GetDword ();
	} else
	{
		// pre-v.5 headers carry no locator; mark it as unset
		tCol.m_tLocator.m_iBitOffset = -1;
		tCol.m_tLocator.m_iBitCount = -1;
	}

	if ( uVersion>=16 ) // m_uVersion for searching
		tCol.m_bPayload = ( rdInfo.GetByte()!=0 );

	// WARNING! max version used here must be in sync with RtIndex_t::Prealloc
}
11357 
11358 
ReadSchema(CSphReader & rdInfo,CSphSchema & m_tSchema,DWORD uVersion,bool bDynamic)11359 void ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion, bool bDynamic )
11360 {
11361 	m_tSchema.Reset ();
11362 
11363 	m_tSchema.m_dFields.Resize ( rdInfo.GetDword() );
11364 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
11365 		ReadSchemaColumn ( rdInfo, m_tSchema.m_dFields[i], uVersion );
11366 
11367 	int iNumAttrs = rdInfo.GetDword();
11368 
11369 	for ( int i=0; i<iNumAttrs; i++ )
11370 	{
11371 		CSphColumnInfo tCol;
11372 		ReadSchemaColumn ( rdInfo, tCol, uVersion );
11373 		m_tSchema.AddAttr ( tCol, bDynamic );
11374 	}
11375 }
11376 
11377 
WriteSchemaColumn(CSphWriter & fdInfo,const CSphColumnInfo & tCol)11378 static void WriteSchemaColumn ( CSphWriter & fdInfo, const CSphColumnInfo & tCol )
11379 {
11380 	int iLen = strlen ( tCol.m_sName.cstr() );
11381 	fdInfo.PutDword ( iLen );
11382 	fdInfo.PutBytes ( tCol.m_sName.cstr(), iLen );
11383 
11384 	ESphAttr eAttrType = tCol.m_eAttrType;
11385 	fdInfo.PutDword ( eAttrType );
11386 
11387 	fdInfo.PutDword ( tCol.m_tLocator.CalcRowitem() ); // for backwards compatibility
11388 	fdInfo.PutDword ( tCol.m_tLocator.m_iBitOffset );
11389 	fdInfo.PutDword ( tCol.m_tLocator.m_iBitCount );
11390 
11391 	fdInfo.PutByte ( tCol.m_bPayload );
11392 }
11393 
11394 
WriteSchema(CSphWriter & fdInfo,const CSphSchema & tSchema)11395 void WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema )
11396 {
11397 	// schema
11398 	fdInfo.PutDword ( tSchema.m_dFields.GetLength() );
11399 	ARRAY_FOREACH ( i, tSchema.m_dFields )
11400 		WriteSchemaColumn ( fdInfo, tSchema.m_dFields[i] );
11401 
11402 	fdInfo.PutDword ( tSchema.GetAttrsCount() );
11403 	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
11404 		WriteSchemaColumn ( fdInfo, tSchema.GetAttr(i) );
11405 }
11406 
11407 
/// serialize index settings into an index header; the write order defines
/// the on-disk format, so readers must consume fields in exactly this order
void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings )
{
	// prefix/infix indexing limits
	tWriter.PutDword ( tSettings.m_iMinPrefixLen );
	tWriter.PutDword ( tSettings.m_iMinInfixLen );
	tWriter.PutDword ( tSettings.m_iMaxSubstringLen );

	// html stripper settings
	tWriter.PutByte ( tSettings.m_bHtmlStrip ? 1 : 0 );
	tWriter.PutString ( tSettings.m_sHtmlIndexAttrs.cstr () );
	tWriter.PutString ( tSettings.m_sHtmlRemoveElements.cstr () );

	tWriter.PutByte ( tSettings.m_bIndexExactWords ? 1 : 0 );

	// hitless mode and hit storage format
	tWriter.PutDword ( tSettings.m_eHitless );
	tWriter.PutDword ( tSettings.m_eHitFormat );

	// sentence/paragraph and zone indexing
	tWriter.PutByte ( tSettings.m_bIndexSP );
	tWriter.PutString ( tSettings.m_sZones );

	// position increment steps
	tWriter.PutDword ( tSettings.m_iBoundaryStep );
	tWriter.PutDword ( tSettings.m_iStopwordStep );
	tWriter.PutDword ( tSettings.m_iOvershortStep );

	tWriter.PutDword ( tSettings.m_iEmbeddedLimit );

	// bigram indexing
	tWriter.PutByte ( tSettings.m_eBigramIndex );
	tWriter.PutString ( tSettings.m_sBigramWords );

	tWriter.PutByte ( tSettings.m_bIndexFieldLens );

	// RLP (Chinese segmentation) settings
	tWriter.PutByte ( tSettings.m_eChineseRLP );
	tWriter.PutString ( tSettings.m_sRLPContext );

	tWriter.PutString ( tSettings.m_sIndexTokenFilter );
}
11432 
11433 
/// serialize the complete index header: magic/version, bitness, docinfo mode,
/// schema, dictionary checkpoints, stats, settings, and embedded
/// tokenizer/dictionary/field-filter data; always returns true (IO errors are
/// detected by the caller via the writer's error state)
bool CSphIndex_VLN::WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const
{
	// version
	fdInfo.PutDword ( INDEX_MAGIC_HEADER );
	fdInfo.PutDword ( INDEX_FORMAT_VERSION );

	// bits
	fdInfo.PutDword ( USE_64BIT );

	// docinfo
	fdInfo.PutDword ( m_tSettings.m_eDocinfo );

	// schema
	WriteSchema ( fdInfo, m_tSchema );

	// min doc
	fdInfo.PutOffset ( tBuildHeader.m_uMinDocid ); // was dword in v.1
	// inline docinfo also stores the component-wise minimal row (for delta decoding)
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
		fdInfo.PutBytes ( tBuildHeader.m_pMinRow, m_tSchema.GetRowSize()*sizeof(CSphRowitem) );

	// wordlist checkpoints
	fdInfo.PutOffset ( tBuildHeader.m_iDictCheckpointsOffset );
	fdInfo.PutDword ( tBuildHeader.m_iDictCheckpoints );
	fdInfo.PutByte ( tBuildHeader.m_iInfixCodepointBytes );
	fdInfo.PutDword ( (DWORD)tBuildHeader.m_iInfixBlocksOffset );
	fdInfo.PutDword ( tBuildHeader.m_iInfixBlocksWordsSize );

	// index stats
	fdInfo.PutDword ( (DWORD)tBuildHeader.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
	fdInfo.PutOffset ( tBuildHeader.m_iTotalBytes );
	fdInfo.PutDword ( tBuildHeader.m_iTotalDups );

	// index settings
	SaveIndexSettings ( fdInfo, m_tSettings );

	// tokenizer info
	assert ( m_pTokenizer );
	SaveTokenizerSettings ( fdInfo, m_pTokenizer, m_tSettings.m_iEmbeddedLimit );

	// dictionary info
	assert ( m_pDict );
	SaveDictionarySettings ( fdInfo, m_pDict, false, m_tSettings.m_iEmbeddedLimit );

	// kill-list size, and offset of the min/max attribute index block
	fdInfo.PutDword ( tBuildHeader.m_uKillListSize );
	fdInfo.PutOffset ( tBuildHeader.m_iMinMaxIndex );

	// field filter info
	SaveFieldFilterSettings ( fdInfo, m_pFieldFilter );

	// average field lengths
	if ( m_tSettings.m_bIndexFieldLens )
		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
			fdInfo.PutOffset ( m_dFieldLens[i] );

	return true;
}
11490 
11491 
BuildDone(const BuildHeader_t & tBuildHeader,CSphString & sError) const11492 bool CSphIndex_VLN::BuildDone ( const BuildHeader_t & tBuildHeader, CSphString & sError ) const
11493 {
11494 	CSphWriter fdInfo;
11495 	fdInfo.SetThrottle ( tBuildHeader.m_pThrottle );
11496 	fdInfo.OpenFile ( GetIndexFileName ( tBuildHeader.m_sHeaderExtension ), sError );
11497 	if ( fdInfo.IsError() )
11498 		return false;
11499 
11500 	if ( !WriteHeader ( tBuildHeader, fdInfo ) )
11501 		return false;
11502 
11503 	// close header
11504 	fdInfo.CloseFile ();
11505 	return !fdInfo.IsError();
11506 }
11507 
11508 
/// finalize hit building: emit any pending field-end marker, finalize the
/// dictionary, and close the doclist/hitlist files
/// returns false if dictionary finalization or file IO failed
bool CSphHitBuilder::cidxDone ( int iMemLimit, int iMinInfixLen, int iMaxCodepointLen, DictHeader_t * pDictHeader )
{
	assert ( pDictHeader );

	// the very last hit may still carry a buffered field-end marker; flush it now
	if ( m_bGotFieldEnd )
	{
		HITMAN::SetEndMarker ( &m_tLastHit.m_iWordPos );
		m_wrHitlist.ZipInt ( m_tLastHit.m_iWordPos - m_iPrevHitPos );
		m_bGotFieldEnd = false;
	}

	// finalize dictionary
	// in dict=crc mode, just flushes wordlist checkpoints
	// in dict=keyword mode, also creates infix index, if needed

	if ( iMinInfixLen>0 && m_pDict->GetSettings().m_bWordDict )
		pDictHeader->m_iInfixCodepointBytes = iMaxCodepointLen;

	if ( !m_pDict->DictEnd ( pDictHeader, iMemLimit, *m_pLastError, m_pThrottle ) )
		return false;

	// close all data files
	m_wrDoclist.CloseFile ();
	m_wrHitlist.CloseFile ( true );
	return !IsError();
}
11535 
11536 
encodeVLB(BYTE * buf,DWORD v)11537 inline int encodeVLB ( BYTE * buf, DWORD v )
11538 {
11539 	BYTE b;
11540 	int n = 0;
11541 
11542 	do
11543 	{
11544 		b = (BYTE)(v & 0x7f);
11545 		v >>= 7;
11546 		if ( v )
11547 			b |= 0x80;
11548 		*buf++ = b;
11549 		n++;
11550 	} while ( v );
11551 	return n;
11552 }
11553 
11554 
encodeKeyword(BYTE * pBuf,const char * pKeyword)11555 inline int encodeKeyword ( BYTE * pBuf, const char * pKeyword )
11556 {
11557 	int iLen = strlen ( pKeyword ); // OPTIMIZE! remove this and memcpy and check if thats faster
11558 	assert ( iLen>0 && iLen<128 ); // so that ReadVLB()
11559 
11560 	*pBuf = (BYTE) iLen;
11561 	memcpy ( pBuf+1, pKeyword, iLen );
11562 	return 1+iLen;
11563 }
11564 
11565 
/// dump a sorted batch of raw hits (and, for docinfo=inline, their attributes)
/// into a temporary file as a delta/VLB-encoded byte stream
/// stream per hit: zero-VLB restart markers when word/doc changes, then wordid
/// delta (or the keyword itself for dict=keywords), docid delta, inline attrs,
/// and position delta; hitless words are aggregated to a (count, fieldmask) pair
/// returns bytes written, or -1 on write error (m_pLastError set)
int CSphHitBuilder::cidxWriteRawVLB ( int fd, CSphWordHit * pHit, int iHits, DWORD * pDocinfo, int iDocinfos, int iStride )
{
	assert ( pHit );
	assert ( iHits>0 );

	/////////////////////////////
	// do simple bitwise hashing
	/////////////////////////////

	static const int HBITS = 11;
	static const int HSIZE = ( 1 << HBITS );

	SphDocID_t uStartID = 0;
	int dHash [ HSIZE+1 ];
	int iShift = 0;

	if ( pDocinfo )
	{
		// build a coarse docid-range hash over the docinfo block (assumed sorted
		// by docid), so per-hit attribute lookups only search a small sub-range
		uStartID = DOCINFO2ID ( pDocinfo );
		int iBits = sphLog2 ( DOCINFO2ID ( pDocinfo + (iDocinfos-1)*iStride ) - uStartID );
		iShift = ( iBits<HBITS ) ? 0 : ( iBits-HBITS );

		#ifndef NDEBUG
		for ( int i=0; i<=HSIZE; i++ )
			dHash[i] = -1;
		#endif

		dHash[0] = 0;
		int iHashed = 0;
		for ( int i=0; i<iDocinfos; i++ )
		{
			int iHash = (int)( ( DOCINFO2ID ( pDocinfo+i*iStride ) - uStartID ) >> iShift );
			assert ( iHash>=0 && iHash<HSIZE );

			if ( iHash>iHashed )
			{
				dHash [ iHashed+1 ] = i-1; // right boundary for prev hash value
				dHash [ iHash ] = i; // left boundary for next hash value
				iHashed = iHash;
			}
		}
		dHash [ iHashed+1 ] = iDocinfos-1; // right boundary for last hash value
	}

	///////////////////////////////////////
	// encode through a small write buffer
	///////////////////////////////////////

	BYTE *pBuf, *maxP;
	int n = 0, w;
	SphWordID_t d1, l1 = 0;	// wordid delta, last wordid
	SphDocID_t d2, l2 = 0;	// docid delta, last docid
	DWORD d3, l3 = 0; // !COMMIT must be wide enough

	// keep a safety gap at the buffer end so that one fully encoded hit
	// (markers + keyword + deltas + inline attrs) can never overrun it
	int iGap = Max ( 16*sizeof(DWORD) + iStride*sizeof(DWORD) + ( m_pDict->GetSettings().m_bWordDict ? MAX_KEYWORD_BYTES : 0 ), 128u );
	pBuf = m_dWriteBuffer.Begin();
	maxP = m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() - iGap;

	SphDocID_t uAttrID = 0; // current doc id
	DWORD * pAttrs = NULL; // current doc attrs

	// hit aggregation state
	DWORD uHitCount = 0;
	DWORD uHitFieldMask = 0;

	const int iPositionShift = m_eHitless==SPH_HITLESS_SOME ? 1 : 0;

	while ( iHits-- )
	{
		// find attributes by id
		if ( pDocinfo && uAttrID!=pHit->m_uDocID )
		{
			int iHash = (int)( ( pHit->m_uDocID - uStartID ) >> iShift );
			assert ( iHash>=0 && iHash<HSIZE );

			int iStart = dHash[iHash];
			int iEnd = dHash[iHash+1];

			if ( pHit->m_uDocID==DOCINFO2ID ( pDocinfo + iStart*iStride ) )
			{
				pAttrs = DOCINFO2ATTRS ( pDocinfo + iStart*iStride );

			} else if ( pHit->m_uDocID==DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
			{
				pAttrs = DOCINFO2ATTRS ( pDocinfo + iEnd*iStride );

			} else
			{
				// binary search inside the bucket's [iStart, iEnd] row range
				pAttrs = NULL;
				while ( iEnd-iStart>1 )
				{
					// check if nothing found
					if (
						pHit->m_uDocID < DOCINFO2ID ( pDocinfo + iStart*iStride ) ||
						pHit->m_uDocID > DOCINFO2ID ( pDocinfo + iEnd*iStride ) )
							break;
					assert ( pHit->m_uDocID > DOCINFO2ID ( pDocinfo + iStart*iStride ) );
					assert ( pHit->m_uDocID < DOCINFO2ID ( pDocinfo + iEnd*iStride ) );

					int iMid = iStart + (iEnd-iStart)/2;
					if ( pHit->m_uDocID==DOCINFO2ID ( pDocinfo + iMid*iStride ) )
					{
						pAttrs = DOCINFO2ATTRS ( pDocinfo + iMid*iStride );
						break;
					}
					if ( pHit->m_uDocID<DOCINFO2ID ( pDocinfo + iMid*iStride ) )
						iEnd = iMid;
					else
						iStart = iMid;
				}
			}

			if ( !pAttrs )
				sphDie ( "INTERNAL ERROR: failed to lookup attributes while saving collected hits" );
			assert ( DOCINFO2ID ( pAttrs - DOCINFO_IDSIZE )==pHit->m_uDocID );
			uAttrID = pHit->m_uDocID;
		}

		// calc deltas
		d1 = pHit->m_uWordID - l1;
		d2 = pHit->m_uDocID - l2;
		d3 = pHit->m_uWordPos - l3;

		// ignore duplicate hits
		if ( d1==0 && d2==0 && d3==0 ) // OPTIMIZE? check if ( 0==(d1|d2|d3) ) is faster
		{
			pHit++;
			continue;
		}

		// checks below are intended handle several "fun" cases
		//
		// case 1, duplicate documents (same docid), different field contents, but ending with
		// the same keyword, ending up in multiple field end markers within the same keyword
		// eg. [foo] in positions {1, 0x800005} in 1st dupe, {3, 0x800007} in 2nd dupe
		//
		// case 2, blended token in the field end, duplicate parts, different positions (as expected)
		// for those parts but still multiple field end markers, eg. [U.S.S.R.] in the end of field

		// replacement of hit itself by field-end form
		if ( d1==0 && d2==0 && HITMAN::GetPosWithField ( pHit->m_uWordPos )==HITMAN::GetPosWithField ( l3 ) )
		{
			l3 = pHit->m_uWordPos;
			pHit++;
			continue;
		}

		// reset field-end inside token stream due of document duplicates
		if ( d1==0 && d2==0 && HITMAN::IsEnd ( l3 ) && HITMAN::GetField ( pHit->m_uWordPos )==HITMAN::GetField ( l3 ) )
		{
			l3 = HITMAN::GetPosWithField ( l3 );
			d3 = HITMAN::GetPosWithField ( pHit->m_uWordPos ) - l3;

			if ( d3==0 )
			{
				pHit++;
				continue;
			}
		}

		// non-zero delta restarts all the fields after it
		// because their deltas might now be negative
		if ( d1 ) d2 = pHit->m_uDocID;
		if ( d2 ) d3 = pHit->m_uWordPos;

		// when we moved to the next word or document
		bool bFlushed = false;
		if ( d1 || d2 )
		{
			// flush previous aggregate hit
			if ( uHitCount )
			{
				// we either skip all hits or the high bit must be available for marking
				// failing that, we can't produce a consistent index
				assert ( m_eHitless!=SPH_HITLESS_NONE );
				assert ( m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );

				// low bit tags aggregate entries when only some words are hitless
				if ( m_eHitless!=SPH_HITLESS_ALL )
					uHitCount = ( uHitCount << 1 ) | 1;
				pBuf += encodeVLB ( pBuf, uHitCount );
				pBuf += encodeVLB ( pBuf, uHitFieldMask );
				assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );

				uHitCount = 0;
				uHitFieldMask = 0;

				bFlushed = true;
			}

			// start aggregating if we're skipping all hits or this word is in a list of ignored words
			if ( ( m_eHitless==SPH_HITLESS_ALL ) ||
				( m_eHitless==SPH_HITLESS_SOME && m_dHitlessWords.BinarySearch ( pHit->m_uWordID ) ) )
			{
				uHitCount = 1;
				uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_uWordPos );
			}

		} else if ( uHitCount ) // next hit for the same word/doc pair, update state if we need it
		{
			uHitCount++;
			uHitFieldMask |= 1 << HITMAN::GetField ( pHit->m_uWordPos );
		}

		// encode enough restart markers
		if ( d1 ) pBuf += encodeVLB ( pBuf, 0 );
		if ( d2 && !bFlushed ) pBuf += encodeVLB ( pBuf, 0 );

		assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );

		// encode deltas
#if USE_64BIT
#define LOC_ENCODE sphEncodeVLB8
#else
#define LOC_ENCODE encodeVLB
#endif

		// encode keyword
		if ( d1 )
		{
			if ( m_pDict->GetSettings().m_bWordDict )
				pBuf += encodeKeyword ( pBuf, m_pDict->HitblockGetKeyword ( pHit->m_uWordID ) ); // keyword itself in case of keywords dict
			else
				pBuf += LOC_ENCODE ( pBuf, d1 ); // delta in case of CRC dict

			assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
		}

		// encode docid delta
		if ( d2 )
		{
			pBuf += LOC_ENCODE ( pBuf, d2 );
			assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
		}

#undef LOC_ENCODE

		// encode attrs
		if ( d2 && pAttrs )
		{
			for ( int i=0; i<iStride-DOCINFO_IDSIZE; i++ )
			{
				pBuf += encodeVLB ( pBuf, pAttrs[i] );
				assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
			}
		}

		assert ( d3 );
		if ( !uHitCount ) // encode position delta, unless accumulating hits
		{
			pBuf += encodeVLB ( pBuf, d3 << iPositionShift );
			assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
		}

		// update current state
		l1 = pHit->m_uWordID;
		l2 = pHit->m_uDocID;
		l3 = pHit->m_uWordPos;

		pHit++;

		// flush the write buffer once it crosses the safety threshold
		if ( pBuf>maxP )
		{
			w = (int)(pBuf - m_dWriteBuffer.Begin());
			assert ( w<m_dWriteBuffer.GetLength() );
			if ( !sphWriteThrottled ( fd, m_dWriteBuffer.Begin(), w, "raw_hits", *m_pLastError, m_pThrottle ) )
				return -1;
			n += w;
			pBuf = m_dWriteBuffer.Begin();
		}
	}

	// flush last aggregate
	if ( uHitCount )
	{
		assert ( m_eHitless!=SPH_HITLESS_NONE );
		assert ( m_eHitless==SPH_HITLESS_ALL || !( uHitCount & 0x80000000UL ) );

		if ( m_eHitless!=SPH_HITLESS_ALL )
			uHitCount = ( uHitCount << 1 ) | 1;
		pBuf += encodeVLB ( pBuf, uHitCount );
		pBuf += encodeVLB ( pBuf, uHitFieldMask );

		assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
	}

	// terminate the stream with three restart markers, then flush the tail
	pBuf += encodeVLB ( pBuf, 0 );
	pBuf += encodeVLB ( pBuf, 0 );
	pBuf += encodeVLB ( pBuf, 0 );
	assert ( pBuf<m_dWriteBuffer.Begin() + m_dWriteBuffer.GetLength() );
	w = (int)(pBuf - m_dWriteBuffer.Begin());
	assert ( w<m_dWriteBuffer.GetLength() );
	if ( !sphWriteThrottled ( fd, m_dWriteBuffer.Begin(), w, "raw_hits", *m_pLastError, m_pThrottle ) )
		return -1;
	n += w;

	return n;
}
11863 
11864 /////////////////////////////////////////////////////////////////////////////
11865 
11866 // OPTIMIZE?
SPH_CMPAGGRHIT_LESS(const CSphAggregateHit & a,const CSphAggregateHit & b)11867 inline bool SPH_CMPAGGRHIT_LESS ( const CSphAggregateHit & a, const CSphAggregateHit & b )
11868 {
11869 	if ( a.m_uWordID < b.m_uWordID )
11870 		return true;
11871 
11872 	if ( a.m_uWordID > b.m_uWordID )
11873 		return false;
11874 
11875 	if ( a.m_sKeyword )
11876 	{
11877 		int iCmp = strcmp ( (char*)a.m_sKeyword, (char*)b.m_sKeyword ); // OPTIMIZE?
11878 		if ( iCmp!=0 )
11879 			return ( iCmp<0 );
11880 	}
11881 
11882 	return
11883 		( a.m_uDocID < b.m_uDocID ) ||
11884 		( a.m_uDocID==b.m_uDocID && HITMAN::GetPosWithField ( a.m_iWordPos )<HITMAN::GetPosWithField ( b.m_iWordPos ) );
11885 }
11886 
11887 
/// hit priority queue entry
struct CSphHitQueueEntry : public CSphAggregateHit
{
	int m_iBin;		// index of the source bin this hit was read from
};
11893 
11894 
/// hit priority queue
/// fixed-capacity binary min-heap over aggregate hits, ordered by
/// SPH_CMPAGGRHIT_LESS(); root is the smallest entry
struct CSphHitQueue
{
public:
	CSphHitQueueEntry *		m_pData;	// heap storage, m_iSize entries
	int						m_iSize;	// fixed capacity
	int						m_iUsed;	// current entry count

public:
	/// create queue
	explicit CSphHitQueue ( int iSize )
	{
		assert ( iSize>0 );
		m_iSize = iSize;
		m_iUsed = 0;
		m_pData = new CSphHitQueueEntry [ iSize ];
	}

	/// destroy queue
	~CSphHitQueue ()
	{
		SafeDeleteArray ( m_pData );
	}

	/// add entry to the queue
	/// copies the hit fields (keyword pointer is shallow; the owning bin must
	/// keep its bytes alive while the entry is queued) and sifts up
	void Push ( CSphAggregateHit & tHit, int iBin )
	{
		// check for overflow and do add
		assert ( m_iUsed<m_iSize );
		m_pData [ m_iUsed ].m_uDocID = tHit.m_uDocID;
		m_pData [ m_iUsed ].m_uWordID = tHit.m_uWordID;
		m_pData [ m_iUsed ].m_sKeyword = tHit.m_sKeyword; // bin must hold the actual data for the queue
		m_pData [ m_iUsed ].m_iWordPos = tHit.m_iWordPos;
		m_pData [ m_iUsed ].m_dFieldMask = tHit.m_dFieldMask;
		m_pData [ m_iUsed ].m_iBin = iBin;

		int iEntry = m_iUsed++;

		// sift up if needed
		while ( iEntry )
		{
			int iParent = ( iEntry-1 ) >> 1;
			if ( SPH_CMPAGGRHIT_LESS ( m_pData[iEntry], m_pData[iParent] ) )
			{
				// entry is less than parent, should float to the top
				Swap ( m_pData[iEntry], m_pData[iParent] );
				iEntry = iParent;
			} else
			{
				break;
			}
		}
	}

	/// remove root (ie. top priority) entry
	void Pop ()
	{
		assert ( m_iUsed );
		if ( !(--m_iUsed) ) // empty queue? just return
			return;

		// make the last entry my new root
		m_pData[0] = m_pData[m_iUsed];

		// sift down if needed
		int iEntry = 0;
		for ( ;; )
		{
			// select child
			int iChild = (iEntry<<1) + 1;
			if ( iChild>=m_iUsed )
				break;

			// select smallest child
			if ( iChild+1<m_iUsed )
				if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild+1], m_pData[iChild] ) )
					iChild++;

			// if smallest child is less than entry, do float it to the top
			if ( SPH_CMPAGGRHIT_LESS ( m_pData[iChild], m_pData[iEntry] ) )
			{
				Swap ( m_pData[iChild], m_pData[iEntry] );
				iEntry = iChild;
				continue;
			}

			break;
		}
	}
};
11985 
11986 
/// comparator for sorting docinfo row indexes by the docid stored in a shared
/// static row pool (static members are set up by the caller before sorting)
struct CmpQueuedDocinfo_fn
{
	static DWORD *	m_pStorage;		// shared docinfo pool: rows of m_iStride DWORDs each
	static int		m_iStride;		// row stride, in DWORDs

	static inline bool IsLess ( const int a, const int b )
	{
		return DOCINFO2ID ( m_pStorage + a*m_iStride ) < DOCINFO2ID ( m_pStorage + b*m_iStride );
	}
};
DWORD *		CmpQueuedDocinfo_fn::m_pStorage		= NULL;
int			CmpQueuedDocinfo_fn::m_iStride		= 1;
11999 
12000 
#define MAX_SOURCE_HITS	32768	// per-batch hit cap; presumably bounds hits pulled from a source at a time -- TODO confirm at usage sites
static const int MIN_KEYWORDS_DICT	= 4*1048576;	// FIXME! ideally must be in sync with impl (ENTRY_CHUNKS, KEYWORD_CHUNKS)
12003 
12004 /////////////////////////////////////////////////////////////////////////////
12005 
12006 struct MvaEntry_t
12007 {
12008 	SphDocID_t	m_uDocID;
12009 	int			m_iAttr;
12010 	int64_t		m_iValue;
12011 
operator <MvaEntry_t12012 	inline bool operator < ( const MvaEntry_t & rhs ) const
12013 	{
12014 		if ( m_uDocID!=rhs.m_uDocID ) return m_uDocID<rhs.m_uDocID;
12015 		if ( m_iAttr!=rhs.m_iAttr ) return m_iAttr<rhs.m_iAttr;
12016 		return m_iValue<rhs.m_iValue;
12017 	}
12018 };
12019 
12020 
/// MvaEntry_t extended with the index of the bin it was read from,
/// for use in the k-way merge queue
struct MvaEntryTag_t : public MvaEntry_t
{
	int			m_iTag;		// source bin index
};
12025 
12026 
/// comparator adapter over MvaEntry_t::operator< for the merge queue
struct MvaEntryCmp_fn
{
	static inline bool IsLess ( const MvaEntry_t & a, const MvaEntry_t & b )
	{
		return a<b;
	}
};
12034 
12035 
/// collect multi-valued attributes from all sources, externally sort them by
/// (docid, attr, value) via spill blocks and a k-way merge, and write the
/// per-document MVA lists into the .spm file
/// reuses the hits arena (dHits) as collection space and resets it afterwards;
/// field-sourced MVAs arrive pre-collected through iFieldFD
/// returns false on any IO/source error (m_sLastError set)
bool CSphIndex_VLN::BuildMVA ( const CSphVector<CSphSource*> & dSources, CSphFixedVector<CSphWordHit> & dHits,
		int iArenaSize, int iFieldFD, int nFieldMVAs, int iFieldMVAInPool, CSphIndex_VLN * pPrevIndex )
{
	// initialize writer (data file must always exist)
	CSphWriter wrMva;
	if ( !wrMva.OpenFile ( GetIndexFileName("spm"), m_sLastError ) )
		return false;

	// calcs and checks
	bool bOnlyFieldMVAs = true;
	CSphVector<int> dMvaIndexes;
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
		{
			dMvaIndexes.Add ( i );
			if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
				bOnlyFieldMVAs = false;
		}
	}
	int iMva64 = dMvaIndexes.GetLength();
	// append mva64 attrs after the mva32 ones, so dMvaIndexes lists all mva32 first
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
		if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
		{
			dMvaIndexes.Add ( i );
			if ( tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
				bOnlyFieldMVAs = false;
		}
	}

	// no MVA attributes at all? nothing to collect
	if ( dMvaIndexes.GetLength()<=0 )
		return true;

	// reuse hits pool
	MvaEntry_t * pMvaPool = (MvaEntry_t*) dHits.Begin();
	MvaEntry_t * pMvaMax = pMvaPool + ( iArenaSize/sizeof(MvaEntry_t) );
	MvaEntry_t * pMva = pMvaPool;

	// create temp file
	CSphAutofile fdTmpMva ( GetIndexFileName("tmp3"), SPH_O_NEW, m_sLastError, true );
	if ( fdTmpMva.GetFD()<0 )
		return false;

	//////////////////////////////
	// collect and partially sort
	//////////////////////////////

	CSphVector<int> dBlockLens;
	dBlockLens.Reserve ( 1024 );

	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT_MVA;

	if ( !bOnlyFieldMVAs )
	{
		ARRAY_FOREACH ( iSource, dSources )
		{
			CSphSource * pSource = dSources[iSource];
			if ( !pSource->Connect ( m_sLastError ) )
				return false;

			ARRAY_FOREACH ( i, dMvaIndexes )
			{
				int iAttr = dMvaIndexes[i];
				const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);

				// field-sourced MVAs were already collected elsewhere (iFieldFD)
				if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
					continue;

				if ( !pSource->IterateMultivaluedStart ( iAttr, m_sLastError ) )
					return false;

				while ( pSource->IterateMultivaluedNext () )
				{
					// when merging with a previous index, skip rows it already has
					if ( pPrevIndex && pPrevIndex->FindDocinfo ( pSource->m_tDocInfo.m_uDocID ) )
						continue;

					pMva->m_uDocID = pSource->m_tDocInfo.m_uDocID;
					pMva->m_iAttr = i;
					if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
					{
						pMva->m_iValue = pSource->m_dMva[0];
					} else
					{
						pMva->m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() );
					}

					// arena full? sort this block and spill it to the temp file
					if ( ++pMva>=pMvaMax )
					{
						sphSort ( pMvaPool, pMva-pMvaPool );
						if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError, &g_tThrottle ) )
							return false;

						dBlockLens.Add ( pMva-pMvaPool );
						m_tProgress.m_iAttrs += pMva-pMvaPool;
						pMva = pMvaPool;

						m_tProgress.Show ( false );
					}
				}
			}

			pSource->Disconnect ();
		}

		// spill the last, partially filled block
		if ( pMva>pMvaPool )
		{
			sphSort ( pMvaPool, pMva-pMvaPool );
			if ( !sphWriteThrottled ( fdTmpMva.GetFD(), pMvaPool, (pMva-pMvaPool)*sizeof(MvaEntry_t), "temp_mva", m_sLastError, &g_tThrottle ) )
				return false;

			dBlockLens.Add ( pMva-pMvaPool );
			m_tProgress.m_iAttrs += pMva-pMvaPool;
		}
	}

	m_tProgress.Show ( true );

	///////////////////////////
	// free memory for sorting
	///////////////////////////

	dHits.Reset ( 0 );

	//////////////
	// fully sort
	//////////////

	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT_MVA;
	m_tProgress.m_iAttrs = m_tProgress.m_iAttrs + nFieldMVAs;
	m_tProgress.m_iAttrsTotal = m_tProgress.m_iAttrs;
	m_tProgress.Show ( false );

	int	nLastBlockFieldMVAs = iFieldMVAInPool ? ( nFieldMVAs % iFieldMVAInPool ) : 0;
	int nFieldBlocks = iFieldMVAInPool ? ( nFieldMVAs / iFieldMVAInPool + ( nLastBlockFieldMVAs ? 1 : 0 ) ) : 0;

	// initialize readers
	CSphVector<CSphBin*> dBins;
	dBins.Reserve ( dBlockLens.GetLength() + nFieldBlocks );

	int iBinSize = CSphBin::CalcBinSize ( iArenaSize, dBlockLens.GetLength() + nFieldBlocks, "sort_mva" );
	SphOffset_t iSharedOffset = -1;

	// bins over the spilled source-MVA blocks (temp file)
	ARRAY_FOREACH ( i, dBlockLens )
	{
		dBins.Add ( new CSphBin() );
		dBins[i]->m_iFileLeft = dBlockLens[i]*sizeof(MvaEntry_t);
		dBins[i]->m_iFilePos = ( i==0 ) ? 0 : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
		dBins[i]->Init ( fdTmpMva.GetFD(), &iSharedOffset, iBinSize );
	}

	// bins over the field-MVA blocks (separate file, iFieldFD)
	SphOffset_t iSharedFieldOffset = -1;
	SphOffset_t uStart = 0;
	for ( int i = 0; i < nFieldBlocks; i++ )
	{
		dBins.Add ( new CSphBin() );
		int iBin = dBins.GetLength () - 1;

		dBins[iBin]->m_iFileLeft = sizeof(MvaEntry_t)*( i==nFieldBlocks-1
			? ( nLastBlockFieldMVAs ? nLastBlockFieldMVAs : iFieldMVAInPool )
			: iFieldMVAInPool );
		dBins[iBin]->m_iFilePos = uStart;
		dBins[iBin]->Init ( iFieldFD, &iSharedFieldOffset, iBinSize );

		uStart += dBins [iBin]->m_iFileLeft;
	}

	// do the sort: warm up the k-way merge queue with one entry per bin
	CSphQueue < MvaEntryTag_t, MvaEntryCmp_fn > qMva ( Max ( 1, dBins.GetLength() ) );
	ARRAY_FOREACH ( i, dBins )
	{
		MvaEntryTag_t tEntry;
		if ( dBins[i]->ReadBytes ( (MvaEntry_t*) &tEntry, sizeof(MvaEntry_t) )!=BIN_READ_OK )
		{
			m_sLastError.SetSprintf ( "sort_mva: warmup failed (io error?)" );
			return false;
		}

		tEntry.m_iTag = i;
		qMva.Push ( tEntry );
	}

	// spm-file := info-list [ 0+ ]
	// info-list := docid, values-list [ index.schema.mva-count ]
	// values-list := values-count, value [ values-count ]
	// note that mva32 come first then mva64
	SphDocID_t uCurID = 0;
	CSphVector < CSphVector<int64_t> > dCurInfo;
	dCurInfo.Resize ( dMvaIndexes.GetLength() );

	for ( ;; )
	{
		// flush previous per-document info-list
		if ( !qMva.GetLength() || qMva.Root().m_uDocID!=uCurID )
		{
			if ( uCurID )
			{
				wrMva.PutDocid ( uCurID );
				ARRAY_FOREACH ( i, dCurInfo )
				{
					int iLen = dCurInfo[i].GetLength();
					if ( i>=iMva64 )
					{
						// 64-bit values are stored as dword pairs, hence the iLen*2 count
						wrMva.PutDword ( iLen*2 );
						wrMva.PutBytes ( dCurInfo[i].Begin(), sizeof(int64_t)*iLen );
					} else
					{
						wrMva.PutDword ( iLen );
						ARRAY_FOREACH ( iVal, dCurInfo[i] )
						{
							wrMva.PutDword ( (DWORD)dCurInfo[i][iVal] );
						}
					}
				}
			}

			if ( !qMva.GetLength() )
				break;

			uCurID = qMva.Root().m_uDocID;
			ARRAY_FOREACH ( i, dCurInfo )
				dCurInfo[i].Resize ( 0 );
		}

		// accumulate this entry
#if PARANOID
		assert ( dCurInfo [ qMva.Root().m_iAttr ].GetLength()==0
			|| dCurInfo [ qMva.Root().m_iAttr ].Last()<=qMva.Root().m_iValue );
#endif
		dCurInfo [ qMva.Root().m_iAttr ].AddUnique ( qMva.Root().m_iValue );

		// get next entry
		int iBin = qMva.Root().m_iTag;
		qMva.Pop ();

		MvaEntryTag_t tEntry;
		ESphBinRead iRes = dBins[iBin]->ReadBytes ( (MvaEntry_t*)&tEntry, sizeof(MvaEntry_t) );
		tEntry.m_iTag = iBin;

		// EOF on a bin simply drops it out of the merge
		if ( iRes==BIN_READ_OK )
			qMva.Push ( tEntry );

		if ( iRes==BIN_READ_ERROR )
		{
			m_sLastError.SetSprintf ( "sort_mva: read error" );
			return false;
		}
	}

	// clean up readers
	ARRAY_FOREACH ( i, dBins )
		SafeDelete ( dBins[i] );

	wrMva.CloseFile ();
	if ( wrMva.IsError() )
		return false;

	m_tProgress.Show ( true );

	return true;
}
12300 
12301 
/// redirect record for a field-sourced MVA attribute
/// NOTE(review): exact slot semantics inferred from member names -- confirm against usage sites
struct FieldMVARedirect_t
{
	CSphAttrLocator		m_tLocator;		// attribute locator within the docinfo row
	int					m_iAttr;		// attribute index in the schema
	int					m_iMVAAttr;		// presumably the index into the collected MVA attribute list
	bool				m_bMva64;		// true for 64-bit (SPH_ATTR_INT64SET) values
};
12309 
12310 
RelocateBlock(int iFile,BYTE * pBuffer,int iRelocationSize,SphOffset_t * pFileSize,CSphBin * pMinBin,SphOffset_t * pSharedOffset)12311 bool CSphIndex_VLN::RelocateBlock ( int iFile, BYTE * pBuffer, int iRelocationSize,
12312 	SphOffset_t * pFileSize, CSphBin * pMinBin, SphOffset_t * pSharedOffset )
12313 {
12314 	assert ( pBuffer && pFileSize && pMinBin && pSharedOffset );
12315 
12316 	SphOffset_t iBlockStart = pMinBin->m_iFilePos;
12317 	SphOffset_t iBlockLeft = pMinBin->m_iFileLeft;
12318 
12319 	ESphBinRead eRes = pMinBin->Precache ();
12320 	switch ( eRes )
12321 	{
12322 	case BIN_PRECACHE_OK:
12323 		return true;
12324 	case BIN_READ_ERROR:
12325 		m_sLastError = "block relocation: preread error";
12326 		return false;
12327 	default:
12328 		break;
12329 	}
12330 
12331 	int nTransfers = (int)( ( iBlockLeft+iRelocationSize-1) / iRelocationSize );
12332 
12333 	SphOffset_t uTotalRead = 0;
12334 	SphOffset_t uNewBlockStart = *pFileSize;
12335 
12336 	for ( int i = 0; i < nTransfers; i++ )
12337 	{
12338 		sphSeek ( iFile, iBlockStart + uTotalRead, SEEK_SET );
12339 
12340 		int iToRead = i==nTransfers-1 ? (int)( iBlockLeft % iRelocationSize ) : iRelocationSize;
12341 		size_t iRead = sphReadThrottled ( iFile, pBuffer, iToRead, &g_tThrottle );
12342 		if ( iRead!=size_t(iToRead) )
12343 		{
12344 			m_sLastError.SetSprintf ( "block relocation: read error (%d of %d bytes read): %s", (int)iRead, iToRead, strerror(errno) );
12345 			return false;
12346 		}
12347 
12348 		sphSeek ( iFile, *pFileSize, SEEK_SET );
12349 		uTotalRead += iToRead;
12350 
12351 		if ( !sphWriteThrottled ( iFile, pBuffer, iToRead, "block relocation", m_sLastError, &g_tThrottle ) )
12352 			return false;
12353 
12354 		*pFileSize += iToRead;
12355 	}
12356 
12357 	assert ( uTotalRead==iBlockLeft );
12358 
12359 	// update block pointers
12360 	pMinBin->m_iFilePos = uNewBlockStart;
12361 	*pSharedOffset = *pFileSize;
12362 
12363 	return true;
12364 }
12365 
12366 
LoadHitlessWords(CSphVector<SphWordID_t> & dHitlessWords)12367 bool CSphIndex_VLN::LoadHitlessWords ( CSphVector<SphWordID_t> & dHitlessWords )
12368 {
12369 	assert ( dHitlessWords.GetLength()==0 );
12370 
12371 	if ( m_tSettings.m_sHitlessFiles.IsEmpty() )
12372 		return true;
12373 
12374 	const char * szStart = m_tSettings.m_sHitlessFiles.cstr();
12375 
12376 	while ( *szStart )
12377 	{
12378 		while ( *szStart && ( sphIsSpace ( *szStart ) || *szStart==',' ) )
12379 			++szStart;
12380 
12381 		if ( !*szStart )
12382 			break;
12383 
12384 		const char * szWordStart = szStart;
12385 
12386 		while ( *szStart && !sphIsSpace ( *szStart ) && *szStart!=',' )
12387 			++szStart;
12388 
12389 		if ( szStart - szWordStart > 0 )
12390 		{
12391 			CSphString sFilename;
12392 			sFilename.SetBinary ( szWordStart, szStart-szWordStart );
12393 
12394 			CSphAutofile tFile ( sFilename.cstr(), SPH_O_READ, m_sLastError );
12395 			if ( tFile.GetFD()==-1 )
12396 				return false;
12397 
12398 			CSphVector<BYTE> dBuffer ( (int)tFile.GetSize() );
12399 			if ( !tFile.Read ( &dBuffer[0], dBuffer.GetLength(), m_sLastError ) )
12400 				return false;
12401 
12402 			// FIXME!!! dict=keywords + hitless_words=some
12403 			m_pTokenizer->SetBuffer ( &dBuffer[0], dBuffer.GetLength() );
12404 			while ( BYTE * sToken = m_pTokenizer->GetToken() )
12405 				dHitlessWords.Add ( m_pDict->GetWordID ( sToken ) );
12406 		}
12407 	}
12408 
12409 	dHitlessWords.Uniq();
12410 	return true;
12411 }
12412 
12413 
/// truncate the given file at its current seek position
/// @param iFD	open file descriptor, positioned at the desired new end of file
/// @return true on success
static bool sphTruncate ( int iFD )
{
#if USE_WINDOWS
	return SetEndOfFile ( (HANDLE) _get_osfhandle(iFD) )!=0;
#else
	// query the current position, then cut the file off right there
	off_t iPos = ::lseek ( iFD, 0, SEEK_CUR );
	return ::ftruncate ( iFD, iPos )==0;
#endif
}
12422 
12423 class DeleteOnFail : public ISphNoncopyable
12424 {
12425 public:
DeleteOnFail()12426 	DeleteOnFail() : m_bShitHappened ( true )
12427 	{}
~DeleteOnFail()12428 	~DeleteOnFail()
12429 	{
12430 		if ( m_bShitHappened )
12431 		{
12432 			ARRAY_FOREACH ( i, m_dWriters )
12433 				m_dWriters[i]->UnlinkFile();
12434 
12435 			ARRAY_FOREACH ( i, m_dAutofiles )
12436 				m_dAutofiles[i]->SetTemporary();
12437 		}
12438 	}
AddWriter(CSphWriter * pWr)12439 	void AddWriter ( CSphWriter * pWr )
12440 	{
12441 		if ( pWr )
12442 			m_dWriters.Add ( pWr );
12443 	}
AddAutofile(CSphAutofile * pAf)12444 	void AddAutofile ( CSphAutofile * pAf )
12445 	{
12446 		if ( pAf )
12447 			m_dAutofiles.Add ( pAf );
12448 	}
AllIsDone()12449 	void AllIsDone()
12450 	{
12451 		m_bShitHappened = false;
12452 	}
12453 private:
12454 	bool	m_bShitHappened;
12455 	CSphVector<CSphWriter*> m_dWriters;
12456 	CSphVector<CSphAutofile*> m_dAutofiles;
12457 };
12458 
12459 
Build(const CSphVector<CSphSource * > & dSources,int iMemoryLimit,int iWriteBuffer)12460 int CSphIndex_VLN::Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer )
12461 {
12462 	assert ( dSources.GetLength() );
12463 
12464 	CSphVector<SphWordID_t> dHitlessWords;
12465 
12466 	if ( !LoadHitlessWords ( dHitlessWords ) )
12467 		return 0;
12468 
12469 	int iHitBuilderBufferSize = ( iWriteBuffer>0 )
12470 		? Max ( iWriteBuffer, MIN_WRITE_BUFFER )
12471 		: DEFAULT_WRITE_BUFFER;
12472 
12473 	// vars shared between phases
12474 	CSphVector<CSphBin*> dBins;
12475 	SphOffset_t iSharedOffset = -1;
12476 
12477 	m_pDict->HitblockBegin();
12478 
12479 	// setup sources
12480 	ARRAY_FOREACH ( iSource, dSources )
12481 	{
12482 		CSphSource * pSource = dSources[iSource];
12483 		assert ( pSource );
12484 
12485 		pSource->SetDict ( m_pDict );
12486 		pSource->Setup ( m_tSettings );
12487 	}
12488 
12489 	// connect 1st source and fetch its schema
12490 	if ( !dSources[0]->Connect ( m_sLastError )
12491 		|| !dSources[0]->IterateStart ( m_sLastError )
12492 		|| !dSources[0]->UpdateSchema ( &m_tSchema, m_sLastError ) )
12493 	{
12494 		return 0;
12495 	}
12496 
12497 	if ( m_tSchema.m_dFields.GetLength()==0 )
12498 	{
12499 		m_sLastError.SetSprintf ( "No fields in schema - will not index" );
12500 		return 0;
12501 	}
12502 
12503 	// check docinfo
12504 	if ( m_tSchema.GetAttrsCount()==0 && m_tSettings.m_eDocinfo!=SPH_DOCINFO_NONE )
12505 	{
12506 		sphWarning ( "Attribute count is 0: switching to none docinfo" );
12507 		m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;
12508 	}
12509 
12510 	if ( dSources[0]->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
12511 	{
12512 		m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
12513 		return 0;
12514 	}
12515 
12516 	if ( m_tSchema.GetAttrsCount()>0 && m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
12517 	{
12518 		m_sLastError.SetSprintf ( "got attributes, but docinfo is 'none' (fix your config file)" );
12519 		return 0;
12520 	}
12521 
12522 	bool bHaveFieldMVAs = false;
12523 	int iFieldLens = -1;
12524 	CSphVector<int> dMvaIndexes;
12525 	CSphVector<CSphAttrLocator> dMvaLocators;
12526 
12527 	// strings storage
12528 	CSphVector<int> dStringAttrs;
12529 
12530 	// chunks to partically sort string attributes
12531 	CSphVector<DWORD> dStringChunks;
12532 	SphOffset_t uStringChunk = 0;
12533 
12534 	// Sphinx-BSON storage
12535 	CSphVector<BYTE> dBson;
12536 	dBson.Reserve ( 1024 );
12537 
12538 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
12539 	{
12540 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
12541 		switch ( tCol.m_eAttrType )
12542 		{
12543 			case SPH_ATTR_UINT32SET:
12544 				if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
12545 					bHaveFieldMVAs = true;
12546 				dMvaIndexes.Add ( i );
12547 				dMvaLocators.Add ( tCol.m_tLocator );
12548 				break;
12549 			case SPH_ATTR_STRING:
12550 			case SPH_ATTR_JSON:
12551 				dStringAttrs.Add ( i );
12552 				break;
12553 			case SPH_ATTR_TOKENCOUNT:
12554 				if ( iFieldLens<0 )
12555 					iFieldLens = i;
12556 				break;
12557 			default:
12558 				break;
12559 		}
12560 	}
12561 
12562 	// no field lengths for docinfo=inline
12563 	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN || iFieldLens==-1 );
12564 
12565 	// this loop must NOT be merged with the previous one;
12566 	// mva64 must intentionally be after all the mva32
12567 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
12568 	{
12569 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
12570 		if ( tCol.m_eAttrType!=SPH_ATTR_INT64SET )
12571 			continue;
12572 		if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
12573 			bHaveFieldMVAs = true;
12574 		dMvaIndexes.Add ( i );
12575 		dMvaLocators.Add ( tCol.m_tLocator );
12576 	}
12577 
12578 	bool bGotMVA = ( dMvaIndexes.GetLength()!=0 );
12579 	if ( bGotMVA && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
12580 	{
12581 		m_sLastError.SetSprintf ( "multi-valued attributes require docinfo=extern (fix your config file)" );
12582 		return 0;
12583 	}
12584 
12585 	if ( dStringAttrs.GetLength() && m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
12586 	{
12587 		m_sLastError.SetSprintf ( "string attributes require docinfo=extern (fix your config file)" );
12588 		return 0;
12589 	}
12590 
12591 	if ( !m_pTokenizer->SetFilterSchema ( m_tSchema, m_sLastError ) )
12592 		return 0;
12593 
12594 	CSphHitBuilder tHitBuilder ( m_tSettings, dHitlessWords, false, iHitBuilderBufferSize, m_pDict, &m_sLastError );
12595 
12596 	////////////////////////////////////////////////
12597 	// collect and partially sort hits and docinfos
12598 	////////////////////////////////////////////////
12599 
12600 	// killlist storage
12601 	CSphVector <SphDocID_t> dKillList;
12602 
12603 	// adjust memory requirements
12604 	int iOldLimit = iMemoryLimit;
12605 
12606 	// book memory to store at least 64K attribute rows
12607 	const int iDocinfoStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
12608 	int iDocinfoMax = Max ( iMemoryLimit/16/iDocinfoStride/sizeof(DWORD), 65536ul );
12609 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
12610 		iDocinfoMax = 1;
12611 
12612 	// book at least 32 KB for field MVAs, if needed
12613 	int iFieldMVAPoolSize = Max ( 32768, iMemoryLimit/16 );
12614 	if ( bHaveFieldMVAs==0 )
12615 		iFieldMVAPoolSize = 0;
12616 
12617 	// book at least 2 MB for keywords dict, if needed
12618 	int iDictSize = 0;
12619 	if ( m_pDict->GetSettings().m_bWordDict )
12620 		iDictSize = Max ( MIN_KEYWORDS_DICT, iMemoryLimit/8 );
12621 
12622 	// do we have enough left for hits?
12623 	int iHitsMax = 1048576;
12624 
12625 	iMemoryLimit -= iDocinfoMax*iDocinfoStride*sizeof(DWORD) + iFieldMVAPoolSize + iDictSize;
12626 	if ( iMemoryLimit < iHitsMax*(int)sizeof(CSphWordHit) )
12627 	{
12628 		iMemoryLimit = iOldLimit + iHitsMax*sizeof(CSphWordHit) - iMemoryLimit;
12629 		sphWarn ( "collect_hits: mem_limit=%d kb too low, increasing to %d kb",
12630 			iOldLimit/1024, iMemoryLimit/1024 );
12631 	} else
12632 	{
12633 		iHitsMax = iMemoryLimit / sizeof(CSphWordHit);
12634 	}
12635 
12636 	// allocate raw hits block
12637 	CSphFixedVector<CSphWordHit> dHits ( iHitsMax + MAX_SOURCE_HITS );
12638 	CSphWordHit * pHits = dHits.Begin();
12639 	CSphWordHit * pHitsMax = dHits.Begin() + iHitsMax;
12640 
12641 	// after finishing with hits this pool will be used to sort strings
12642 	int iPoolSize = dHits.GetSizeBytes();
12643 
12644 	// allocate docinfos buffer
12645 	CSphFixedVector<DWORD> dDocinfos ( iDocinfoMax*iDocinfoStride );
12646 	DWORD * pDocinfo = dDocinfos.Begin();
12647 	const DWORD * pDocinfoMax = dDocinfos.Begin() + iDocinfoMax*iDocinfoStride;
12648 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_NONE )
12649 	{
12650 		pDocinfo = NULL;
12651 		pDocinfoMax = NULL;
12652 	}
12653 
12654 	CSphVector < MvaEntry_t > dFieldMVAs;
12655 	dFieldMVAs.Reserve ( 16384 );
12656 
12657 	CSphVector < SphOffset_t > dFieldMVABlocks;
12658 	dFieldMVABlocks.Reserve ( 4096 );
12659 
12660 	CSphVector < FieldMVARedirect_t > dFieldMvaIndexes;
12661 
12662 	if ( bHaveFieldMVAs )
12663 		dFieldMvaIndexes.Reserve ( 8 );
12664 
12665 	int iMaxPoolFieldMVAs = iFieldMVAPoolSize / sizeof ( MvaEntry_t );
12666 	int nFieldMVAs = 0;
12667 
12668 	CSphScopedPtr<CSphIndex_VLN> pPrevIndex(NULL);
12669 	if ( !m_sKeepAttrs.IsEmpty() )
12670 	{
12671 		CSphString sWarning;
12672 		pPrevIndex = (CSphIndex_VLN *)sphCreateIndexPhrase ( "keep-attrs", m_sKeepAttrs.cstr() );
12673 		if ( !pPrevIndex->Prealloc ( false, false, sWarning ) || !pPrevIndex->Preread() )
12674 		{
12675 			CSphString sError;
12676 			if ( !sWarning.IsEmpty() )
12677 				sError.SetSprintf ( "warning: '%s',", sWarning.cstr() );
12678 			if ( !pPrevIndex->GetLastError().IsEmpty() )
12679 				sError.SetSprintf ( "%serror: '%s'", sError.scstr(), pPrevIndex->GetLastError().cstr() );
12680 			sphWarn ( "unable to load 'keep-attrs' index (%s); ignoring --keep-attrs", sError.cstr() );
12681 
12682 			pPrevIndex.Reset();
12683 		} else
12684 		{
12685 			// check schemas
12686 			CSphString sError;
12687 			if ( !m_tSchema.CompareTo ( pPrevIndex->m_tSchema, sError, false ) )
12688 			{
12689 				sphWarn ( "schemas are different (%s); ignoring --keep-attrs", sError.cstr() );
12690 				pPrevIndex.Reset();
12691 			}
12692 		}
12693 	}
12694 
12695 	// create temp files
12696 	CSphAutofile fdLock ( GetIndexFileName("tmp0"), SPH_O_NEW, m_sLastError, true );
12697 	CSphAutofile fdHits ( GetIndexFileName ( m_bInplaceSettings ? "spp" : "tmp1" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
12698 	CSphAutofile fdDocinfos ( GetIndexFileName ( m_bInplaceSettings ? "spa" : "tmp2" ), SPH_O_NEW, m_sLastError, !m_bInplaceSettings );
12699 	CSphAutofile fdTmpFieldMVAs ( GetIndexFileName("tmp7"), SPH_O_NEW, m_sLastError, true );
12700 	CSphWriter tStrWriter;
12701 	CSphWriter tStrFinalWriter;
12702 
12703 	if ( !tStrWriter.OpenFile ( GetIndexFileName("tmps"), m_sLastError ) )
12704 		return 0;
12705 	tStrWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
12706 
12707 	if ( !tStrFinalWriter.OpenFile ( GetIndexFileName("sps"), m_sLastError ) )
12708 		return 0;
12709 	tStrFinalWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
12710 
12711 	DeleteOnFail dFileWatchdog;
12712 
12713 	if ( m_bInplaceSettings )
12714 	{
12715 		dFileWatchdog.AddAutofile ( &fdHits );
12716 		dFileWatchdog.AddAutofile ( &fdDocinfos );
12717 	}
12718 
12719 	dFileWatchdog.AddWriter ( &tStrWriter );
12720 	dFileWatchdog.AddWriter ( &tStrFinalWriter );
12721 
12722 	if ( fdLock.GetFD()<0 || fdHits.GetFD()<0 || fdDocinfos.GetFD()<0 || fdTmpFieldMVAs.GetFD ()<0 )
12723 		return 0;
12724 
12725 	SphOffset_t iHitsGap = 0;
12726 	SphOffset_t iDocinfosGap = 0;
12727 
12728 	if ( m_bInplaceSettings )
12729 	{
12730 		const int HIT_SIZE_AVG = 4;
12731 		const float HIT_BLOCK_FACTOR = 1.0f;
12732 		const float DOCINFO_BLOCK_FACTOR = 1.0f;
12733 
12734 		if ( m_iHitGap )
12735 			iHitsGap = (SphOffset_t) m_iHitGap;
12736 		else
12737 			iHitsGap = (SphOffset_t)( iHitsMax*HIT_BLOCK_FACTOR*HIT_SIZE_AVG );
12738 
12739 		iHitsGap = Max ( iHitsGap, 1 );
12740 		sphSeek ( fdHits.GetFD (), iHitsGap, SEEK_SET );
12741 
12742 		if ( m_iDocinfoGap )
12743 			iDocinfosGap = (SphOffset_t) m_iDocinfoGap;
12744 		else
12745 			iDocinfosGap = (SphOffset_t)( iDocinfoMax*DOCINFO_BLOCK_FACTOR*iDocinfoStride*sizeof(DWORD) );
12746 
12747 		iDocinfosGap = Max ( iDocinfosGap, 1 );
12748 		sphSeek ( fdDocinfos.GetFD (), iDocinfosGap, SEEK_SET );
12749 	}
12750 
12751 	if ( !sphLockEx ( fdLock.GetFD(), false ) )
12752 	{
12753 		m_sLastError.SetSprintf ( "failed to lock '%s': another indexer running?", fdLock.GetFilename() );
12754 		return 0;
12755 	}
12756 
12757 	// setup accumulating docinfo IDs range
12758 	m_dMinRow.Reset ( m_tSchema.GetRowSize() );
12759 	m_uMinDocid = DOCID_MAX;
12760 	ARRAY_FOREACH ( i, m_dMinRow )
12761 		m_dMinRow[i] = ROWITEM_MAX;
12762 
12763 	m_tStats.Reset ();
12764 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_COLLECT;
12765 	m_tProgress.m_iAttrs = 0;
12766 
12767 	CSphVector<int> dHitBlocks;
12768 	dHitBlocks.Reserve ( 1024 );
12769 
12770 	int iDocinfoBlocks = 0;
12771 
12772 	ARRAY_FOREACH ( iSource, dSources )
12773 	{
12774 		// connect and check schema, if it's not the first one
12775 		CSphSource * pSource = dSources[iSource];
12776 
12777 		if ( iSource )
12778 		{
12779 			if ( !pSource->Connect ( m_sLastError )
12780 				|| !pSource->IterateStart ( m_sLastError )
12781 				|| !pSource->UpdateSchema ( &m_tSchema, m_sLastError ) )
12782 			{
12783 				return 0;
12784 			}
12785 
12786 			if ( pSource->HasJoinedFields() && m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
12787 			{
12788 				m_sLastError.SetSprintf ( "got joined fields, but docinfo is 'inline' (fix your config file)" );
12789 				return 0;
12790 			}
12791 		}
12792 
12793 		dFieldMvaIndexes.Resize ( 0 );
12794 
12795 		ARRAY_FOREACH ( i, dMvaIndexes )
12796 		{
12797 			int iAttr = dMvaIndexes[i];
12798 			const CSphColumnInfo & tCol = m_tSchema.GetAttr ( iAttr );
12799 			if ( tCol.m_eSrc==SPH_ATTRSRC_FIELD )
12800 			{
12801 				FieldMVARedirect_t & tRedirect = dFieldMvaIndexes.Add();
12802 				tRedirect.m_tLocator = tCol.m_tLocator;
12803 				tRedirect.m_iAttr = iAttr;
12804 				tRedirect.m_iMVAAttr = i;
12805 				tRedirect.m_bMva64 = ( tCol.m_eAttrType==SPH_ATTR_INT64SET );
12806 			}
12807 		}
12808 
12809 		// joined filter
12810 		bool bGotJoined = ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE ) && pSource->HasJoinedFields();
12811 
12812 		// fetch documents
12813 		for ( ;; )
12814 		{
12815 			// get next doc, and handle errors
12816 			bool bGotDoc = pSource->IterateDocument ( m_sLastError );
12817 			if ( !bGotDoc )
12818 				return 0;
12819 
12820 			// ensure docid is sane
12821 			if ( pSource->m_tDocInfo.m_uDocID==DOCID_MAX )
12822 			{
12823 				m_sLastError.SetSprintf ( "docid==DOCID_MAX (source broken?)" );
12824 				return 0;
12825 			}
12826 
12827 			// check for eof
12828 			if ( !pSource->m_tDocInfo.m_uDocID )
12829 				break;
12830 
12831 			// show progress bar
12832 			if ( ( pSource->GetStats().m_iTotalDocuments % 1000 )==0 )
12833 			{
12834 				m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
12835 				m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
12836 				m_tProgress.Show ( false );
12837 			}
12838 
12839 			// update crashdump
12840 			g_iIndexerCurrentDocID = pSource->m_tDocInfo.m_uDocID;
12841 			g_iIndexerCurrentHits = pHits-dHits.Begin();
12842 
12843 			DWORD * pPrevDocinfo = NULL;
12844 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pPrevIndex.Ptr() )
12845 				pPrevDocinfo = const_cast<DWORD*>( pPrevIndex->FindDocinfo ( pSource->m_tDocInfo.m_uDocID ) );
12846 
12847 			if ( dMvaIndexes.GetLength() && pPrevDocinfo && pPrevIndex->m_tMva.GetWritePtr() )
12848 			{
12849 				// fetch old mva values
12850 				ARRAY_FOREACH ( i, dMvaIndexes )
12851 				{
12852 					const CSphColumnInfo & tCol = m_tSchema.GetAttr ( dMvaIndexes[i] );
12853 					SphAttr_t uOff = sphGetRowAttr ( DOCINFO2ATTRS ( pPrevDocinfo ), tCol.m_tLocator );
12854 					if ( !uOff )
12855 						continue;
12856 
12857 					const DWORD * pMVA = pPrevIndex->m_tMva.GetWritePtr()+uOff;
12858 					int nMVAs = *pMVA++;
12859 					for ( int iMVA = 0; iMVA < nMVAs; iMVA++ )
12860 					{
12861 						MvaEntry_t & tMva = dFieldMVAs.Add();
12862 						tMva.m_uDocID = pSource->m_tDocInfo.m_uDocID;
12863 						tMva.m_iAttr = i;
12864 						if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
12865 						{
12866 							tMva.m_iValue = MVA_UPSIZE(pMVA);
12867 							pMVA++;
12868 						} else
12869 							tMva.m_iValue = *pMVA;
12870 
12871 						pMVA++;
12872 
12873 						int iLength = dFieldMVAs.GetLength ();
12874 						if ( iLength==iMaxPoolFieldMVAs )
12875 						{
12876 							dFieldMVAs.Sort();
12877 							if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
12878 								iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
12879 								return 0;
12880 
12881 							dFieldMVAs.Resize ( 0 );
12882 
12883 							nFieldMVAs += iMaxPoolFieldMVAs;
12884 						}
12885 					}
12886 				}
12887 
12888 			} else if ( bHaveFieldMVAs )
12889 			{
12890 				// store field MVAs
12891 				ARRAY_FOREACH ( i, dFieldMvaIndexes )
12892 				{
12893 					int iAttr = dFieldMvaIndexes[i].m_iAttr;
12894 					int iMVA = dFieldMvaIndexes[i].m_iMVAAttr;
12895 					bool bMva64 = dFieldMvaIndexes[i].m_bMva64;
12896 					int iStep = ( bMva64 ? 2 : 1 );
12897 
12898 					// store per-document MVAs
12899 					SphRange_t tFieldMva = pSource->IterateFieldMVAStart ( iAttr );
12900 					m_tProgress.m_iAttrs += ( tFieldMva.m_iLength / iStep );
12901 
12902 					assert ( ( tFieldMva.m_iStart + tFieldMva.m_iLength )<=pSource->m_dMva.GetLength() );
12903 					for ( int j=tFieldMva.m_iStart; j<( tFieldMva.m_iStart+tFieldMva.m_iLength); j+=iStep )
12904 					{
12905 						MvaEntry_t & tMva = dFieldMVAs.Add();
12906 						tMva.m_uDocID = pSource->m_tDocInfo.m_uDocID;
12907 						tMva.m_iAttr = iMVA;
12908 						if ( bMva64 )
12909 						{
12910 							tMva.m_iValue = MVA_UPSIZE ( pSource->m_dMva.Begin() + j );
12911 						} else
12912 						{
12913 							tMva.m_iValue = pSource->m_dMva[j];
12914 						}
12915 
12916 						int iLength = dFieldMVAs.GetLength ();
12917 						if ( iLength==iMaxPoolFieldMVAs )
12918 						{
12919 							dFieldMVAs.Sort();
12920 							if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
12921 								iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
12922 									return 0;
12923 
12924 							dFieldMVAs.Resize ( 0 );
12925 
12926 							nFieldMVAs += iMaxPoolFieldMVAs;
12927 						}
12928 					}
12929 				}
12930 			}
12931 
12932 			// store strings and JSON blobs
12933 			if ( pPrevDocinfo )
12934 			{
12935 				CSphRowitem * pPrevAttrs = DOCINFO2ATTRS ( pPrevDocinfo );
12936 				ARRAY_FOREACH ( i, dStringAttrs )
12937 				{
12938 					const CSphAttrLocator & tLoc = m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator;
12939 					SphAttr_t uPrevOff = sphGetRowAttr ( pPrevAttrs, tLoc );
12940 					BYTE * pBase = pPrevIndex->m_tString.GetWritePtr();
12941 					if ( !uPrevOff || !pBase )
12942 						sphSetRowAttr ( pPrevAttrs, tLoc, 0 );
12943 					else
12944 					{
12945 						const BYTE * pStr = NULL;
12946 						int iLen = sphUnpackStr ( pBase+uPrevOff, &pStr );
12947 						if ( !iLen )
12948 							sphSetRowAttr ( pPrevAttrs, tLoc, 0 );
12949 						else
12950 						{
12951 							SphOffset_t uOff = tStrWriter.GetPos();
12952 							if ( uint64_t(uOff)>>32 )
12953 							{
12954 								m_sLastError.SetSprintf ( "too many string attributes (current index format allows up to 4 GB)" );
12955 								return 0;
12956 							}
12957 
12958 							sphSetRowAttr ( pPrevAttrs, tLoc, DWORD(uOff) );
12959 
12960 							BYTE dPackedLen[4];
12961 							int iLenLen = sphPackStrlen ( dPackedLen, iLen );
12962 							tStrWriter.PutBytes ( &dPackedLen, iLenLen );
12963 							tStrWriter.PutBytes ( pStr, iLen );
12964 						}
12965 					}
12966 				}
12967 			} else
12968 			{
12969 				ARRAY_FOREACH ( i, dStringAttrs )
12970 				{
12971 					// FIXME! optimize locators etc?
12972 					// FIXME! support binary strings w/embedded zeroes?
12973 					// get data, calc length
12974 					const char * sData = pSource->m_dStrAttrs[dStringAttrs[i]].cstr();
12975 					int iLen = sData ? strlen ( sData ) : 0;
12976 
12977 					// no data
12978 					if ( !iLen )
12979 					{
12980 						pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
12981 						continue;
12982 					}
12983 
12984 					// handle JSON
12985 					if ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_eAttrType==SPH_ATTR_JSON ) // FIXME? optimize?
12986 					{
12987 						// WARNING, tricky bit
12988 						// flex lexer needs last two (!) bytes to be zeroes
12989 						// asciiz string supplies one, and we fill out the extra one
12990 						// and that works, because CSphString always allocates a small extra gap
12991 						char * pData = const_cast<char*>(sData);
12992 						pData[iLen+1] = '\0';
12993 
12994 						dBson.Resize ( 0 );
12995 						if ( !sphJsonParse ( dBson, pData, g_bJsonAutoconvNumbers, g_bJsonKeynamesToLowercase, m_sLastError ) )
12996 						{
12997 							m_sLastError.SetSprintf ( "document " DOCID_FMT ", attribute %s: JSON error: %s",
12998 								pSource->m_tDocInfo.m_uDocID, m_tSchema.GetAttr ( dStringAttrs[i] ).m_sName.cstr(),
12999 								m_sLastError.cstr() );
13000 
13001 							// bail?
13002 							if ( g_bJsonStrict )
13003 								return 0;
13004 
13005 							// warn and ignore
13006 							sphWarning ( "%s", m_sLastError.cstr() );
13007 							m_sLastError = "";
13008 							pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
13009 							continue;
13010 						}
13011 						if ( !dBson.GetLength() )
13012 						{
13013 							// empty SphinxBSON, need not save any data
13014 							pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, 0 );
13015 							continue;
13016 						}
13017 
13018 						// let's go save the newly built SphinxBSON blob
13019 						sData = (const char*)dBson.Begin();
13020 						iLen = dBson.GetLength();
13021 					}
13022 
13023 					// calc offset, do sanity checks
13024 					SphOffset_t uOff = tStrWriter.GetPos();
13025 					if ( uint64_t(uOff)>>32 )
13026 					{
13027 						m_sLastError.SetSprintf ( "too many string attributes (current index format allows up to 4 GB)" );
13028 						return 0;
13029 					}
13030 					pSource->m_tDocInfo.SetAttr ( m_tSchema.GetAttr ( dStringAttrs[i] ).m_tLocator, DWORD(uOff) );
13031 
13032 					// pack length, emit it, emit data
13033 					BYTE dPackedLen[4];
13034 					int iLenLen = sphPackStrlen ( dPackedLen, iLen );
13035 					tStrWriter.PutBytes ( &dPackedLen, iLenLen );
13036 					tStrWriter.PutBytes ( sData, iLen );
13037 
13038 					// check if current pos is the good one for sorting
13039 					if ( uOff+iLenLen+iLen-uStringChunk > iPoolSize )
13040 					{
13041 						dStringChunks.Add ( DWORD ( uOff-uStringChunk ) );
13042 						uStringChunk = uOff;
13043 					}
13044 				}
13045 			}
13046 
13047 			// docinfo=inline might be flushed while collecting hits
13048 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13049 			{
13050 				// store next entry
13051 				DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_uDocID );
13052 				memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
13053 				pDocinfo += iDocinfoStride;
13054 
13055 				// update min docinfo
13056 				assert ( pSource->m_tDocInfo.m_uDocID );
13057 				m_uMinDocid = Min ( m_uMinDocid, pSource->m_tDocInfo.m_uDocID );
13058 				ARRAY_FOREACH ( i, m_dMinRow )
13059 					m_dMinRow[i] = Min ( m_dMinRow[i], pSource->m_tDocInfo.m_pDynamic[i] );
13060 			}
13061 
13062 			// store hits
13063 			while ( const ISphHits * pDocHits = pSource->IterateHits ( m_sLastWarning ) )
13064 			{
13065 				int iDocHits = pDocHits->Length();
13066 #if PARANOID
13067 				for ( int i=0; i<iDocHits; i++ )
13068 				{
13069 					assert ( pDocHits->m_dData[i].m_uDocID==pSource->m_tDocInfo.m_uDocID );
13070 					assert ( pDocHits->m_dData[i].m_uWordID );
13071 					assert ( pDocHits->m_dData[i].m_iWordPos );
13072 				}
13073 #endif
13074 
13075 				assert ( ( pHits+iDocHits )<=( pHitsMax+MAX_SOURCE_HITS ) );
13076 
13077 				memcpy ( pHits, pDocHits->First(), iDocHits*sizeof(CSphWordHit) );
13078 				pHits += iDocHits;
13079 
13080 				// check if we need to flush
13081 				if ( pHits<pHitsMax
13082 					&& !( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE && pDocinfo>=pDocinfoMax )
13083 					&& !( iDictSize && m_pDict->HitblockGetMemUse() > iDictSize ) )
13084 				{
13085 					continue;
13086 				}
13087 
13088 				// update crashdump
13089 				g_iIndexerPoolStartDocID = pSource->m_tDocInfo.m_uDocID;
13090 				g_iIndexerPoolStartHit = pHits-dHits.Begin();
13091 
13092 				// sort hits
13093 				int iHits = pHits - dHits.Begin();
13094 				{
13095 					sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
13096 					m_pDict->HitblockPatch ( dHits.Begin(), iHits );
13097 				}
13098 				pHits = dHits.Begin();
13099 
13100 				if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13101 				{
13102 					// we're inlining, so let's flush both hits and docs
13103 					int iDocs = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
13104 					pDocinfo = dDocinfos.Begin();
13105 
13106 					sphSortDocinfos ( dDocinfos.Begin(), iDocs, iDocinfoStride );
13107 
13108 					dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
13109 						dDocinfos.Begin(), iDocs, iDocinfoStride ) );
13110 
13111 					// we are inlining, so if there are more hits in this document,
13112 					// we'll need to know it's info next flush
13113 					if ( iDocHits )
13114 					{
13115 						DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_uDocID );
13116 						memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSource->m_tDocInfo.m_pDynamic, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
13117 						pDocinfo += iDocinfoStride;
13118 					}
13119 				} else
13120 				{
13121 					// we're not inlining, so only flush hits, docs are flushed independently
13122 					dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
13123 						NULL, 0, 0 ) );
13124 				}
13125 				m_pDict->HitblockReset ();
13126 
13127 				if ( dHitBlocks.Last()<0 )
13128 					return 0;
13129 
13130 				// progress bar
13131 				m_tProgress.m_iHitsTotal += iHits;
13132 				m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments + pSource->GetStats().m_iTotalDocuments;
13133 				m_tProgress.m_iBytes = m_tStats.m_iTotalBytes + pSource->GetStats().m_iTotalBytes;
13134 				m_tProgress.Show ( false );
13135 			}
13136 
13137 			// update min docinfo
13138 			assert ( pSource->m_tDocInfo.m_uDocID );
13139 			m_uMinDocid = Min ( m_uMinDocid, pSource->m_tDocInfo.m_uDocID );
13140 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13141 			{
13142 				ARRAY_FOREACH ( i, m_dMinRow )
13143 					m_dMinRow[i] = Min ( m_dMinRow[i], pSource->m_tDocInfo.m_pDynamic[i] );
13144 			}
13145 
13146 			// update total field lengths
13147 			if ( iFieldLens>=0 )
13148 			{
13149 				ARRAY_FOREACH ( i, m_tSchema.m_dFields )
13150 					m_dFieldLens[i] += pSource->m_tDocInfo.GetAttr ( m_tSchema.GetAttr ( i+iFieldLens ).m_tLocator );
13151 			}
13152 
13153 			// store docinfo
13154 			// with the advent of SPH_ATTR_TOKENCOUNT, now MUST be done AFTER iterating the hits
13155 			// because field lengths are computed during that iterating
13156 			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
13157 			{
13158 				// store next entry
13159 				DOCINFOSETID ( pDocinfo, pSource->m_tDocInfo.m_uDocID );
13160 
13161 				// old docinfo found, use it instead of the new one
13162 				const DWORD * pSrc = pPrevDocinfo ? DOCINFO2ATTRS ( pPrevDocinfo ) : pSource->m_tDocInfo.m_pDynamic;
13163 				memcpy ( DOCINFO2ATTRS ( pDocinfo ), pSrc, sizeof(CSphRowitem)*m_tSchema.GetRowSize() );
13164 				pDocinfo += iDocinfoStride;
13165 
13166 				// if not inlining, flush buffer if it's full
13167 				// (if inlining, it will flushed later, along with the hits)
13168 				if ( pDocinfo>=pDocinfoMax )
13169 				{
13170 					assert ( pDocinfo==pDocinfoMax );
13171 					int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
13172 
13173 					sphSortDocinfos ( dDocinfos.Begin(), iDocinfoMax, iDocinfoStride );
13174 					if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos.Begin(), iLen, "raw_docinfos", m_sLastError, &g_tThrottle ) )
13175 						return 0;
13176 
13177 					pDocinfo = dDocinfos.Begin();
13178 					iDocinfoBlocks++;
13179 				}
13180 			}
13181 
13182 			// go on, loop next document
13183 		}
13184 
13185 		// FIXME! uncontrolled memory usage; add checks and/or diskbased sort in the future?
13186 		if ( pSource->IterateKillListStart ( m_sLastError ) )
13187 		{
13188 			SphDocID_t uDocId;
13189 			while ( pSource->IterateKillListNext ( uDocId ) )
13190 				dKillList.Add ( uDocId );
13191 		}
13192 
13193 		// fetch joined fields
13194 		if ( bGotJoined )
13195 		{
13196 			// flush tail of regular hits
13197 			int iHits = pHits - dHits.Begin();
13198 			if ( iDictSize && m_pDict->HitblockGetMemUse() && iHits )
13199 			{
13200 				sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
13201 				m_pDict->HitblockPatch ( dHits.Begin(), iHits );
13202 				pHits = dHits.Begin();
13203 				m_tProgress.m_iHitsTotal += iHits;
13204 				dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits, NULL, 0, 0 ) );
13205 				if ( dHitBlocks.Last()<0 )
13206 					return 0;
13207 				m_pDict->HitblockReset ();
13208 			}
13209 
13210 			for ( ;; )
13211 			{
13212 				// get next doc, and handle errors
13213 				ISphHits * pJoinedHits = pSource->IterateJoinedHits ( m_sLastError );
13214 				if ( !pJoinedHits )
13215 					return 0;
13216 
13217 				// ensure docid is sane
13218 				if ( pSource->m_tDocInfo.m_uDocID==DOCID_MAX )
13219 				{
13220 					m_sLastError.SetSprintf ( "joined_docid==DOCID_MAX (source broken?)" );
13221 					return 0;
13222 				}
13223 
13224 				// check for eof
13225 				if ( !pSource->m_tDocInfo.m_uDocID )
13226 					break;
13227 
13228 				int iJoinedHits = pJoinedHits->Length();
13229 				memcpy ( pHits, pJoinedHits->First(), iJoinedHits*sizeof(CSphWordHit) );
13230 				pHits += iJoinedHits;
13231 
13232 				// check if we need to flush
13233 				if ( pHits<pHitsMax && !( iDictSize && m_pDict->HitblockGetMemUse() > iDictSize ) )
13234 					continue;
13235 
13236 				// store hits
13237 				int iHits = pHits - dHits.Begin();
13238 				sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
13239 				m_pDict->HitblockPatch ( dHits.Begin(), iHits );
13240 
13241 				pHits = dHits.Begin();
13242 				m_tProgress.m_iHitsTotal += iHits;
13243 
13244 				dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits, NULL, 0, 0 ) );
13245 				if ( dHitBlocks.Last()<0 )
13246 					return 0;
13247 				m_pDict->HitblockReset ();
13248 			}
13249 		}
13250 
13251 		// this source is over, disconnect and update stats
13252 		pSource->Disconnect ();
13253 
13254 		m_tStats.m_iTotalDocuments += pSource->GetStats().m_iTotalDocuments;
13255 		m_tStats.m_iTotalBytes += pSource->GetStats().m_iTotalBytes;
13256 	}
13257 
13258 	if ( m_tStats.m_iTotalDocuments>=INT_MAX )
13259 	{
13260 		m_sLastError.SetSprintf ( "index over %d documents not supported (got documents count=" INT64_FMT ")", INT_MAX, m_tStats.m_iTotalDocuments );
13261 		return 0;
13262 	}
13263 
13264 	// flush last docinfo block
13265 	int iDocinfoLastBlockSize = 0;
13266 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pDocinfo>dDocinfos.Begin() )
13267 	{
13268 		iDocinfoLastBlockSize = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
13269 		assert ( pDocinfo==( dDocinfos.Begin() + iDocinfoLastBlockSize*iDocinfoStride ) );
13270 
13271 		int iLen = iDocinfoLastBlockSize*iDocinfoStride*sizeof(DWORD);
13272 		sphSortDocinfos ( dDocinfos.Begin(), iDocinfoLastBlockSize, iDocinfoStride );
13273 		if ( !sphWriteThrottled ( fdDocinfos.GetFD(), dDocinfos.Begin(), iLen, "raw_docinfos", m_sLastError, &g_tThrottle ) )
13274 			return 0;
13275 
13276 		iDocinfoBlocks++;
13277 	}
13278 
13279 	// flush last hit block
13280 	if ( pHits>dHits.Begin() )
13281 	{
13282 		int iHits = pHits - dHits.Begin();
13283 		{
13284 			sphSort ( dHits.Begin(), iHits, CmpHit_fn() );
13285 			m_pDict->HitblockPatch ( dHits.Begin(), iHits );
13286 		}
13287 		m_tProgress.m_iHitsTotal += iHits;
13288 
13289 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13290 		{
13291 			int iDocs = ( pDocinfo - dDocinfos.Begin() ) / iDocinfoStride;
13292 			sphSortDocinfos ( dDocinfos.Begin(), iDocs, iDocinfoStride );
13293 			dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits,
13294 				dDocinfos.Begin(), iDocs, iDocinfoStride ) );
13295 		} else
13296 		{
13297 			dHitBlocks.Add ( tHitBuilder.cidxWriteRawVLB ( fdHits.GetFD(), dHits.Begin(), iHits, NULL, 0, 0 ) );
13298 		}
13299 		m_pDict->HitblockReset ();
13300 
13301 		if ( dHitBlocks.Last()<0 )
13302 			return 0;
13303 	}
13304 
13305 	// flush last field MVA block
13306 	if ( bHaveFieldMVAs && dFieldMVAs.GetLength () )
13307 	{
13308 		int iLength = dFieldMVAs.GetLength ();
13309 		nFieldMVAs += iLength;
13310 
13311 		dFieldMVAs.Sort();
13312 		if ( !sphWriteThrottled ( fdTmpFieldMVAs.GetFD (), &dFieldMVAs[0],
13313 			iLength*sizeof(MvaEntry_t), "temp_field_mva", m_sLastError, &g_tThrottle ) )
13314 				return 0;
13315 
13316 		dFieldMVAs.Reset ();
13317 	}
13318 
13319 	m_tProgress.m_iDocuments = m_tStats.m_iTotalDocuments;
13320 	m_tProgress.m_iBytes = m_tStats.m_iTotalBytes;
13321 	m_tProgress.Show ( true );
13322 
13323 	///////////////////////////////////////
13324 	// collect and sort multi-valued attrs
13325 	///////////////////////////////////////
13326 	if ( !BuildMVA ( dSources, dHits, iHitsMax*sizeof(CSphWordHit), fdTmpFieldMVAs.GetFD (), nFieldMVAs, iMaxPoolFieldMVAs, pPrevIndex.Ptr() ) )
13327 		return 0;
13328 
13329 	// reset persistent mva update pool
13330 	::unlink ( GetIndexFileName("mvp").cstr() );
13331 
13332 	// reset hits pool
13333 	dHits.Reset ( 0 );
13334 
13335 	CSphString sFieldMVAFile = fdTmpFieldMVAs.GetFilename ();
13336 	fdTmpFieldMVAs.Close ();
13337 	::unlink ( sFieldMVAFile.cstr () );
13338 
13339 	/////////////////
13340 	// sort docinfos
13341 	/////////////////
13342 
13343 	// initialize MVA reader
13344 	CSphAutoreader rdMva;
13345 	if ( !rdMva.Open ( GetIndexFileName("spm"), m_sLastError ) )
13346 		return 0;
13347 
13348 	SphDocID_t uMvaID = rdMva.GetDocid();
13349 
13350 	// initialize writer
13351 	int iDocinfoFD = -1;
13352 	SphOffset_t iDocinfoWritePos = 0;
13353 	CSphScopedPtr<CSphAutofile> pfdDocinfoFinal ( NULL );
13354 
13355 	if ( m_bInplaceSettings )
13356 		iDocinfoFD = fdDocinfos.GetFD ();
13357 	else
13358 	{
13359 		pfdDocinfoFinal = new CSphAutofile ( GetIndexFileName("spa"), SPH_O_NEW, m_sLastError );
13360 		iDocinfoFD = pfdDocinfoFinal->GetFD();
13361 		if ( iDocinfoFD < 0 )
13362 			return 0;
13363 	}
13364 
13365 	int iDupes = 0;
13366 	int iMinBlock = -1;
13367 
13368 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && iDocinfoBlocks )
13369 	{
13370 		// initialize readers
13371 		assert ( dBins.GetLength()==0 );
13372 		dBins.Reserve ( iDocinfoBlocks );
13373 
13374 		float fReadFactor = 1.0f;
13375 		float fRelocFactor = 0.0f;
13376 		if ( m_bInplaceSettings )
13377 		{
13378 			assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
13379 			fRelocFactor = m_fRelocFactor;
13380 			fReadFactor -= fRelocFactor;
13381 		}
13382 
13383 		int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ), iDocinfoBlocks, "sort_docinfos" );
13384 		int iRelocationSize = m_bInplaceSettings ? int ( iMemoryLimit * fRelocFactor ) : 0;
13385 		CSphFixedVector<BYTE> dRelocationBuffer ( iRelocationSize );
13386 		iSharedOffset = -1;
13387 
13388 		for ( int i=0; i<iDocinfoBlocks; i++ )
13389 		{
13390 			dBins.Add ( new CSphBin() );
13391 			dBins[i]->m_iFileLeft = ( ( i==iDocinfoBlocks-1 ) ? iDocinfoLastBlockSize : iDocinfoMax )*iDocinfoStride*sizeof(DWORD);
13392 			dBins[i]->m_iFilePos = ( i==0 ) ? iDocinfosGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
13393 			dBins[i]->Init ( fdDocinfos.GetFD(), &iSharedOffset, iBinSize );
13394 		}
13395 
13396 		SphOffset_t iDocinfoFileSize = 0;
13397 		if ( iDocinfoBlocks )
13398 			iDocinfoFileSize = dBins [ iDocinfoBlocks-1 ]->m_iFilePos + dBins [ iDocinfoBlocks-1 ]->m_iFileLeft;
13399 
13400 		// docinfo queue
13401 		CSphFixedVector<DWORD> dDocinfoQueue ( iDocinfoBlocks*iDocinfoStride );
13402 		CSphQueue < int, CmpQueuedDocinfo_fn > tDocinfo ( iDocinfoBlocks );
13403 
13404 		CmpQueuedDocinfo_fn::m_pStorage = dDocinfoQueue.Begin();
13405 		CmpQueuedDocinfo_fn::m_iStride = iDocinfoStride;
13406 
13407 		pDocinfo = dDocinfoQueue.Begin();
13408 		for ( int i=0; i<iDocinfoBlocks; i++ )
13409 		{
13410 			if ( dBins[i]->ReadBytes ( pDocinfo, iDocinfoStride*sizeof(DWORD) )!=BIN_READ_OK )
13411 			{
13412 				m_sLastError.SetSprintf ( "sort_docinfos: warmup failed (io error?)" );
13413 				return 0;
13414 			}
13415 			pDocinfo += iDocinfoStride;
13416 			tDocinfo.Push ( i );
13417 		}
13418 
13419 		// while the queue has data for us
13420 		pDocinfo = dDocinfos.Begin();
13421 		SphDocID_t uLastId = 0;
13422 		m_iMinMaxIndex = 0;
13423 
13424 		// prepare the collector for min/max of attributes
13425 		AttrIndexBuilder_c tMinMax ( m_tSchema );
13426 		int64_t iMinMaxSize = tMinMax.GetExpectedSize ( m_tStats.m_iTotalDocuments );
13427 		if ( iMinMaxSize>INT_MAX || m_tStats.m_iTotalDocuments>INT_MAX )
13428 		{
13429 			m_sLastError.SetSprintf ( "attribute files (.spa) over 128 GB are not supported (min-max approximate=" INT64_FMT ", documents count=" INT64_FMT ")",
13430 				iMinMaxSize, m_tStats.m_iTotalDocuments );
13431 			return 0;
13432 		}
13433 		CSphFixedVector<DWORD> dMinMaxBuffer ( (int)iMinMaxSize );
13434 		memset ( dMinMaxBuffer.Begin(), 0, (int)iMinMaxSize*sizeof(DWORD) );
13435 
13436 		// { fixed row + dummy value ( zero offset elimination ) + mva data for that row } fixed row - for MinMaxBuilder
13437 		CSphVector < DWORD > dMvaPool;
13438 		tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() ); // FIXME!!! for over INT_MAX blocks
13439 		uint64_t uLastMvaOff = 0;
13440 
13441 		// the last (or, lucky, the only, string chunk)
13442 		dStringChunks.Add ( DWORD ( tStrWriter.GetPos()-uStringChunk ) );
13443 
13444 		tStrWriter.CloseFile();
13445 		if ( !dStringAttrs.GetLength() )
13446 			::unlink ( GetIndexFileName("tmps").cstr() );
13447 
13448 		SphDocID_t uLastDupe = 0;
13449 		while ( tDocinfo.GetLength() )
13450 		{
13451 			// obtain bin index and next entry
13452 			int iBin = tDocinfo.Root();
13453 			DWORD * pEntry = dDocinfoQueue.Begin() + iBin*iDocinfoStride;
13454 
13455 			assert ( DOCINFO2ID ( pEntry )>=uLastId && "descending documents" );
13456 
13457 			// skip duplicates
13458 			if ( DOCINFO2ID ( pEntry )==uLastId )
13459 			{
13460 				// dupe, report it
13461 				if ( m_tSettings.m_bVerbose && uLastDupe!=uLastId )
13462 					sphWarn ( "duplicated document id=" DOCID_FMT, uLastId );
13463 
13464 				uLastDupe = uLastId;
13465 				iDupes++;
13466 
13467 			} else
13468 			{
13469 				// new unique document, handle it
13470 				m_iMinMaxIndex += iDocinfoStride;
13471 
13472 				CSphRowitem * pCollectibleRow = pEntry;
13473 				// update MVA
13474 				if ( bGotMVA )
13475 				{
13476 					// go to next id
13477 					while ( uMvaID<DOCINFO2ID(pEntry) )
13478 					{
13479 						ARRAY_FOREACH ( i, dMvaIndexes )
13480 						{
13481 							int iCount = rdMva.GetDword();
13482 							rdMva.SkipBytes ( iCount*sizeof(DWORD) );
13483 						}
13484 
13485 						uMvaID = rdMva.GetDocid();
13486 						if ( !uMvaID )
13487 							uMvaID = DOCID_MAX;
13488 					}
13489 
13490 					assert ( uMvaID>=DOCINFO2ID(pEntry) );
13491 					if ( uMvaID==DOCINFO2ID(pEntry) )
13492 					{
13493 						// fixed row + dummy value ( zero offset elemination )
13494 						dMvaPool.Resize ( iDocinfoStride+1 );
13495 						memcpy ( dMvaPool.Begin(), pEntry, iDocinfoStride * sizeof(DWORD) );
13496 
13497 						CSphRowitem * pAttr = DOCINFO2ATTRS ( pEntry );
13498 						ARRAY_FOREACH ( i, dMvaIndexes )
13499 						{
13500 							uLastMvaOff = rdMva.GetPos()/sizeof(DWORD);
13501 							int iPoolOff = dMvaPool.GetLength();
13502 							if ( uLastMvaOff>UINT_MAX )
13503 								sphDie ( "MVA counter overflows " UINT64_FMT " at document " DOCID_FMT ", total MVA entries " UINT64_FMT " ( try to index less documents )", uLastMvaOff, uMvaID, rdMva.GetFilesize() );
13504 
13505 							sphSetRowAttr ( pAttr, dMvaLocators[i], uLastMvaOff );
13506 							// there is the cloned row at the beginning of MVA pool, lets skip it
13507 							sphSetRowAttr ( dMvaPool.Begin()+DOCINFO_IDSIZE, dMvaLocators[i], iPoolOff - iDocinfoStride );
13508 
13509 							DWORD iMvaCount = rdMva.GetDword();
13510 							dMvaPool.Resize ( iPoolOff+iMvaCount+1 );
13511 							dMvaPool[iPoolOff] = iMvaCount;
13512 							rdMva.GetBytes ( dMvaPool.Begin()+iPoolOff+1, sizeof(DWORD)*iMvaCount );
13513 						}
13514 						pCollectibleRow = dMvaPool.Begin();
13515 
13516 						uMvaID = rdMva.GetDocid();
13517 						if ( !uMvaID )
13518 							uMvaID = DOCID_MAX;
13519 					}
13520 				}
13521 
13522 				if ( !tMinMax.Collect ( pCollectibleRow, dMvaPool.Begin()+iDocinfoStride, dMvaPool.GetLength()-iDocinfoStride, m_sLastError, false ) )
13523 					return 0;
13524 				dMvaPool.Resize ( iDocinfoStride );
13525 
13526 				// emit it
13527 				memcpy ( pDocinfo, pEntry, iDocinfoStride*sizeof(DWORD) );
13528 				pDocinfo += iDocinfoStride;
13529 				uLastId = DOCINFO2ID(pEntry);
13530 
13531 				if ( pDocinfo>=pDocinfoMax )
13532 				{
13533 					int iLen = iDocinfoMax*iDocinfoStride*sizeof(DWORD);
13534 
13535 					if ( m_bInplaceSettings )
13536 					{
13537 						if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () )
13538 						{
13539 							iMinBlock = -1;
13540 							ARRAY_FOREACH ( i, dBins )
13541 								if ( !dBins[i]->IsEOF () && ( iMinBlock==-1 || dBins [i]->m_iFilePos<dBins[iMinBlock]->m_iFilePos ) )
13542 									iMinBlock = i;
13543 						}
13544 
13545 						if ( iMinBlock!=-1 && ( iDocinfoWritePos + iLen ) > dBins[iMinBlock]->m_iFilePos )
13546 						{
13547 							if ( !RelocateBlock ( iDocinfoFD, dRelocationBuffer.Begin(), iRelocationSize, &iDocinfoFileSize, dBins[iMinBlock], &iSharedOffset ) )
13548 								return 0;
13549 
13550 							iMinBlock = (iMinBlock+1) % dBins.GetLength ();
13551 						}
13552 
13553 						sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
13554 						iSharedOffset = iDocinfoWritePos;
13555 					}
13556 
13557 					if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos.Begin(), iLen, "sort_docinfo", m_sLastError, &g_tThrottle ) )
13558 						return 0;
13559 
13560 					iDocinfoWritePos += iLen;
13561 					pDocinfo = dDocinfos.Begin();
13562 				}
13563 			}
13564 
13565 			// pop its index, update it, push its index again
13566 			tDocinfo.Pop ();
13567 			ESphBinRead eRes = dBins[iBin]->ReadBytes ( pEntry, iDocinfoStride*sizeof(DWORD) );
13568 			if ( eRes==BIN_READ_ERROR )
13569 			{
13570 				m_sLastError.SetSprintf ( "sort_docinfo: failed to read entry" );
13571 				return 0;
13572 			}
13573 			if ( eRes==BIN_READ_OK )
13574 				tDocinfo.Push ( iBin );
13575 		}
13576 
13577 		if ( pDocinfo>dDocinfos.Begin() )
13578 		{
13579 			assert ( 0==( pDocinfo-dDocinfos.Begin() ) % iDocinfoStride );
13580 			int iLen = ( pDocinfo - dDocinfos.Begin() )*sizeof(DWORD);
13581 
13582 			if ( m_bInplaceSettings )
13583 				sphSeek ( iDocinfoFD, iDocinfoWritePos, SEEK_SET );
13584 
13585 			if ( !sphWriteThrottled ( iDocinfoFD, dDocinfos.Begin(), iLen, "sort_docinfo", m_sLastError, &g_tThrottle ) )
13586 				return 0;
13587 
13588 			if ( m_bInplaceSettings )
13589 				if ( !sphTruncate ( iDocinfoFD ) )
13590 					sphWarn ( "failed to truncate %s", fdDocinfos.GetFilename() );
13591 
13592 			iDocinfoWritePos += iLen;
13593 		}
13594 		tMinMax.FinishCollect();
13595 		int64_t iMinMaxRealSize = tMinMax.GetActualSize() * sizeof(DWORD);
13596 		if ( !sphWriteThrottled ( iDocinfoFD, dMinMaxBuffer.Begin(), iMinMaxRealSize, "minmax_docinfo", m_sLastError, &g_tThrottle ) )
13597 				return 0;
13598 
13599 		// clean up readers
13600 		ARRAY_FOREACH ( i, dBins )
13601 			SafeDelete ( dBins[i] );
13602 
13603 		dBins.Reset ();
13604 
13605 		if ( uLastMvaOff>INT_MAX )
13606 			sphWarning ( "MVA update disabled (collected MVA " INT64_FMT ", should be less %d)", uLastMvaOff, INT_MAX );
13607 	}
13608 
13609 	dDocinfos.Reset ( 0 );
13610 	pDocinfo = NULL;
13611 
13612 	// iDocinfoWritePos now contains the true size of pure attributes (without block indexes) in bytes
13613 	int iStringStride = dStringAttrs.GetLength();
13614 	SphOffset_t iNumDocs = iDocinfoWritePos/sizeof(DWORD)/iDocinfoStride;
13615 	CSphTightVector<DWORD> dStrOffsets;
13616 
13617 	if ( iStringStride )
13618 	{
13619 		// read only non-zero string locators
13620 		{
13621 			CSphReader tAttrReader;
13622 			tAttrReader.SetFile ( iDocinfoFD, GetIndexFileName ( "spa" ).cstr() );
13623 			CSphFixedVector<DWORD> dDocinfo ( iDocinfoStride );
13624 			pDocinfo = dDocinfo.Begin();
13625 			for ( SphOffset_t i=0; i<iNumDocs; ++i )
13626 			{
13627 				tAttrReader.GetBytes ( pDocinfo, iDocinfoStride*sizeof(DWORD) );
13628 				CSphRowitem * pAttrs = DOCINFO2ATTRS ( pDocinfo );
13629 				ARRAY_FOREACH ( j, dStringAttrs )
13630 				{
13631 					const CSphAttrLocator & tLoc = m_tSchema.GetAttr ( dStringAttrs[j] ).m_tLocator;
13632 					DWORD uData = (DWORD)sphGetRowAttr ( pAttrs, tLoc );
13633 					if ( uData )
13634 						dStrOffsets.Add ( uData );
13635 				}
13636 			}
13637 		} // the spa reader eliminates out of this scope
13638 		DWORD iNumStrings = dStrOffsets.GetLength();
13639 
13640 		// reopen strings for reading
13641 		CSphAutofile tRawStringsFile;
13642 		CSphReader tStrReader;
13643 		if ( tRawStringsFile.Open ( GetIndexFileName("tmps"), SPH_O_READ, m_sLastError, true )<0 )
13644 			return 0;
13645 		tStrReader.SetFile ( tRawStringsFile );
13646 
13647 		// now just load string chunks and resort them...
13648 		CSphFixedVector<BYTE> dStringPool ( iPoolSize );
13649 		BYTE* pStringsBegin = dStringPool.Begin();
13650 
13651 		// if we have more than 1 string chunks, we need several passes and bitmask to distinquish them
13652 		if ( dStringChunks.GetLength()>1 )
13653 		{
13654 			dStrOffsets.Resize ( iNumStrings+( iNumStrings>>5 )+1 );
13655 			DWORD* pDocinfoBitmap = &dStrOffsets [ iNumStrings ];
13656 			for ( DWORD i=0; i<1+( iNumStrings>>5 ); ++i )
13657 				pDocinfoBitmap[i] = 0;
13658 			SphOffset_t iMinStrings = 0;
13659 
13660 			ARRAY_FOREACH ( i, dStringChunks )
13661 			{
13662 				// read the current chunk
13663 				SphOffset_t iMaxStrings = iMinStrings + dStringChunks[i];
13664 				tStrReader.GetBytes ( pStringsBegin, dStringChunks[i] );
13665 
13666 				// walk throw the attributes and put the strings in the new order
13667 				DWORD uMaskOff = 0;
13668 				DWORD uMask = 1;
13669 				for ( DWORD k=0; k<iNumStrings; ++k )
13670 				{
13671 					if ( uMask==0x80000000 )
13672 					{
13673 						uMask = 1;
13674 						++uMaskOff;
13675 					} else
13676 						uMask <<= 1;
13677 					DWORD& uCurStr = dStrOffsets[k];
13678 					// already processed, or hit out of the the current chunk?
13679 					if ( pDocinfoBitmap[uMaskOff]&uMask || !uCurStr || uCurStr<iMinStrings || uCurStr>=iMaxStrings )
13680 						continue;
13681 
13682 					const BYTE * pStr = NULL;
13683 					int iLen = sphUnpackStr ( pStringsBegin + uCurStr - iMinStrings, &pStr );
13684 					if ( !iLen )
13685 						uCurStr = 0;
13686 					else
13687 					{
13688 						uCurStr = (DWORD)tStrFinalWriter.GetPos();
13689 						BYTE dPackedLen[4];
13690 						int iLenLen = sphPackStrlen ( dPackedLen, iLen );
13691 						tStrFinalWriter.PutBytes ( &dPackedLen, iLenLen );
13692 						tStrFinalWriter.PutBytes ( pStr, iLen );
13693 					}
13694 					pDocinfoBitmap[uMaskOff]|=uMask;
13695 				}
13696 				iMinStrings = iMaxStrings;
13697 			}
13698 		} else if ( dStringChunks.GetLength()==1 ) // only one chunk. Plain and simple!
13699 		{
13700 			DWORD iStringChunk = dStringChunks[0];
13701 			tStrReader.GetBytes ( pStringsBegin, iStringChunk );
13702 
13703 			// walk throw the attributes and put the strings in the new order
13704 			for ( DWORD k=0; k<iNumStrings; ++k )
13705 			{
13706 				DWORD& uOffset = dStrOffsets[k];
13707 				// already processed, or hit out of the the current chunk?
13708 				if ( uOffset<1 || uOffset>=iStringChunk )
13709 					continue;
13710 
13711 				const BYTE * pStr = NULL;
13712 				int iLen = sphUnpackStr ( pStringsBegin + uOffset, &pStr );
13713 				if ( !iLen )
13714 					uOffset = 0;
13715 				else
13716 				{
13717 					uOffset = (DWORD)tStrFinalWriter.GetPos();
13718 					BYTE dPackedLen[4];
13719 					int iLenLen = sphPackStrlen ( dPackedLen, iLen );
13720 					tStrFinalWriter.PutBytes ( &dPackedLen, iLenLen );
13721 					tStrFinalWriter.PutBytes ( pStr, iLen );
13722 				}
13723 			}
13724 		}
13725 		dStringPool.Reset(0);
13726 		// now save back patched string locators
13727 		{
13728 			DWORD iDocPoolSize = iPoolSize/iDocinfoStride/sizeof(DWORD);
13729 			CSphFixedVector<DWORD> dDocinfoPool ( iDocPoolSize*iDocinfoStride );
13730 			pDocinfo = dDocinfoPool.Begin();
13731 			DWORD iToRead = Min ( iDocPoolSize, DWORD(iNumDocs) );
13732 			SphOffset_t iPos = 0;
13733 			DWORD iStr = 0;
13734 			while ( iToRead )
13735 			{
13736 				sphSeek ( iDocinfoFD, iPos, SEEK_SET );
13737 				sphRead ( iDocinfoFD, pDocinfo, iToRead*iDocinfoStride*sizeof(DWORD));
13738 				for ( DWORD i=0; i<iToRead; ++i )
13739 				{
13740 					CSphRowitem * pAttrs = DOCINFO2ATTRS ( pDocinfo+i*iDocinfoStride );
13741 					ARRAY_FOREACH ( j, dStringAttrs )
13742 					{
13743 						const CSphAttrLocator& tLocator = m_tSchema.GetAttr ( dStringAttrs[j] ).m_tLocator;
13744 						if ( sphGetRowAttr ( pAttrs, tLocator ) )
13745 							sphSetRowAttr ( pAttrs, tLocator, dStrOffsets[iStr++] );
13746 					}
13747 				}
13748 				sphSeek ( iDocinfoFD, iPos, SEEK_SET );
13749 				sphWrite ( iDocinfoFD, pDocinfo, iToRead*iDocinfoStride*sizeof(DWORD));
13750 				iPos+=iToRead*iDocinfoStride*sizeof(DWORD);
13751 				iNumDocs-=iToRead;
13752 				iToRead = Min ( iDocPoolSize, DWORD(iNumDocs) );
13753 			}
13754 		} // all temporary buffers eliminates out of this scope
13755 	}
13756 
13757 	// it might be zero-length, but it must exist
13758 	if ( m_bInplaceSettings )
13759 		fdDocinfos.Close ();
13760 	else
13761 	{
13762 		assert ( pfdDocinfoFinal.Ptr () );
13763 		pfdDocinfoFinal->Close ();
13764 	}
13765 
13766 	// dump killlist
13767 	CSphAutofile tKillList ( GetIndexFileName("spk"), SPH_O_NEW, m_sLastError );
13768 	if ( tKillList.GetFD()<0 )
13769 		return 0;
13770 
13771 	if ( dKillList.GetLength () )
13772 	{
13773 		dKillList.Uniq ();
13774 
13775 		m_uKillListSize = dKillList.GetLength ();
13776 
13777 		if ( !sphWriteThrottled ( tKillList.GetFD(), &dKillList[0],
13778 			m_uKillListSize*sizeof(SphDocID_t), "kill list", m_sLastError, &g_tThrottle ) )
13779 				return 0;
13780 	}
13781 
13782 	tKillList.Close ();
13783 
13784 	///////////////////////////////////
13785 	// sort and write compressed index
13786 	///////////////////////////////////
13787 
13788 	// initialize readers
13789 	assert ( dBins.GetLength()==0 );
13790 	dBins.Reserve ( dHitBlocks.GetLength() );
13791 
13792 	iSharedOffset = -1;
13793 
13794 	float fReadFactor = 1.0f;
13795 	int iRelocationSize = 0;
13796 	iWriteBuffer = iHitBuilderBufferSize;
13797 
13798 	if ( m_bInplaceSettings )
13799 	{
13800 		assert ( m_fRelocFactor > 0.005f && m_fRelocFactor < 0.95f );
13801 		assert ( m_fWriteFactor > 0.005f && m_fWriteFactor < 0.95f );
13802 		assert ( m_fWriteFactor+m_fRelocFactor < 1.0f );
13803 
13804 		fReadFactor -= m_fRelocFactor + m_fWriteFactor;
13805 
13806 		iRelocationSize = int ( iMemoryLimit * m_fRelocFactor );
13807 		iWriteBuffer = int ( iMemoryLimit * m_fWriteFactor );
13808 	}
13809 
13810 	int iBinSize = CSphBin::CalcBinSize ( int ( iMemoryLimit * fReadFactor ),
13811 		dHitBlocks.GetLength() + m_pDict->GetSettings().m_bWordDict, "sort_hits" );
13812 
13813 	CSphFixedVector <BYTE> dRelocationBuffer ( iRelocationSize );
13814 	iSharedOffset = -1;
13815 
13816 	ARRAY_FOREACH ( i, dHitBlocks )
13817 	{
13818 		dBins.Add ( new CSphBin ( m_tSettings.m_eHitless, m_pDict->GetSettings().m_bWordDict ) );
13819 		dBins[i]->m_iFileLeft = dHitBlocks[i];
13820 		dBins[i]->m_iFilePos = ( i==0 ) ? iHitsGap : dBins[i-1]->m_iFilePos + dBins[i-1]->m_iFileLeft;
13821 		dBins[i]->Init ( fdHits.GetFD(), &iSharedOffset, iBinSize );
13822 	}
13823 
13824 	// if there were no hits, create zero-length index files
13825 	int iRawBlocks = dBins.GetLength();
13826 
13827 	//////////////////////////////
13828 	// create new index files set
13829 	//////////////////////////////
13830 
13831 	tHitBuilder.CreateIndexFiles ( GetIndexFileName("spd").cstr(), GetIndexFileName("spp").cstr(),
13832 		GetIndexFileName("spe").cstr(), m_bInplaceSettings, iWriteBuffer, fdHits, &iSharedOffset );
13833 
13834 	// dict files
13835 	CSphAutofile fdTmpDict ( GetIndexFileName("tmp8"), SPH_O_NEW, m_sLastError, true );
13836 	CSphAutofile fdDict ( GetIndexFileName("spi"), SPH_O_NEW, m_sLastError, false );
13837 	if ( fdTmpDict.GetFD()<0 || fdDict.GetFD()<0 )
13838 		return 0;
13839 	m_pDict->DictBegin ( fdTmpDict, fdDict, iBinSize, &g_tThrottle );
13840 
13841 	// adjust min IDs, and fill header
13842 	assert ( m_uMinDocid>0 );
13843 	m_uMinDocid--;
13844 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
13845 		ARRAY_FOREACH ( i, m_dMinRow )
13846 			m_dMinRow[i]--;
13847 
13848 	tHitBuilder.SetMin ( m_dMinRow.Begin(), m_dMinRow.GetLength() );
13849 
13850 	//////////////
13851 	// final sort
13852 	//////////////
13853 
13854 	if ( iRawBlocks )
13855 	{
13856 		int iLastBin = dBins.GetLength () - 1;
13857 		SphOffset_t iHitFileSize = dBins[iLastBin]->m_iFilePos + dBins [iLastBin]->m_iFileLeft;
13858 
13859 		CSphHitQueue tQueue ( iRawBlocks );
13860 		CSphAggregateHit tHit;
13861 
13862 		// initialize hitlist encoder state
13863 		tHitBuilder.HitReset();
13864 
13865 		// initial fill
13866 		int iRowitems = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ) ? m_tSchema.GetRowSize() : 0;
13867 		CSphFixedVector<CSphRowitem> dInlineAttrs ( iRawBlocks*iRowitems );
13868 
13869 		CSphFixedVector<BYTE> dActive ( iRawBlocks );
13870 		for ( int i=0; i<iRawBlocks; i++ )
13871 		{
13872 			if ( !dBins[i]->ReadHit ( &tHit, iRowitems, dInlineAttrs.Begin() + i * iRowitems ) )
13873 			{
13874 				m_sLastError.SetSprintf ( "sort_hits: warmup failed (io error?)" );
13875 				return 0;
13876 			}
13877 			dActive[i] = ( tHit.m_uWordID!=0 );
13878 			if ( dActive[i] )
13879 				tQueue.Push ( tHit, i );
13880 		}
13881 
13882 		// init progress meter
13883 		m_tProgress.m_ePhase = CSphIndexProgress::PHASE_SORT;
13884 		m_tProgress.m_iHits = 0;
13885 
13886 		// while the queue has data for us
13887 		// FIXME! analyze binsRead return code
13888 		int iHitsSorted = 0;
13889 		iMinBlock = -1;
13890 		while ( tQueue.m_iUsed )
13891 		{
13892 			int iBin = tQueue.m_pData->m_iBin;
13893 
13894 			// pack and emit queue root
13895 			tQueue.m_pData->m_uDocID -= m_uMinDocid;
13896 
13897 			if ( m_bInplaceSettings )
13898 			{
13899 				if ( iMinBlock==-1 || dBins[iMinBlock]->IsEOF () || !dActive[iMinBlock] )
13900 				{
13901 					iMinBlock = -1;
13902 					ARRAY_FOREACH ( i, dBins )
13903 						if ( !dBins[i]->IsEOF () && dActive[i] && ( iMinBlock==-1 || dBins[i]->m_iFilePos < dBins[iMinBlock]->m_iFilePos ) )
13904 							iMinBlock = i;
13905 				}
13906 
13907 				int iToWriteMax = 3*sizeof(DWORD);
13908 				if ( iMinBlock!=-1 && ( tHitBuilder.GetHitfilePos() + iToWriteMax ) > dBins[iMinBlock]->m_iFilePos )
13909 				{
13910 					if ( !RelocateBlock ( fdHits.GetFD (), dRelocationBuffer.Begin(), iRelocationSize, &iHitFileSize, dBins[iMinBlock], &iSharedOffset ) )
13911 						return 0;
13912 
13913 					iMinBlock = (iMinBlock+1) % dBins.GetLength ();
13914 				}
13915 			}
13916 
13917 			tHitBuilder.cidxHit ( tQueue.m_pData, iRowitems ? dInlineAttrs.Begin() + iBin * iRowitems : NULL );
13918 			if ( tHitBuilder.IsError() )
13919 				return 0;
13920 
13921 			// pop queue root and push next hit from popped bin
13922 			tQueue.Pop ();
13923 			if ( dActive[iBin] )
13924 			{
13925 				dBins[iBin]->ReadHit ( &tHit, iRowitems, dInlineAttrs.Begin() + iBin * iRowitems );
13926 				dActive[iBin] = ( tHit.m_uWordID!=0 );
13927 				if ( dActive[iBin] )
13928 					tQueue.Push ( tHit, iBin );
13929 			}
13930 
13931 			// progress
13932 			if ( ++iHitsSorted==1000000 )
13933 			{
13934 				m_tProgress.m_iHits += iHitsSorted;
13935 				m_tProgress.Show ( false );
13936 				iHitsSorted = 0;
13937 			}
13938 		}
13939 
13940 		m_tProgress.m_iHits = m_tProgress.m_iHitsTotal; // sum might be less than total because of dupes!
13941 		m_tProgress.Show ( true );
13942 
13943 		ARRAY_FOREACH ( i, dBins )
13944 			SafeDelete ( dBins[i] );
13945 		dBins.Reset ();
13946 
13947 		CSphAggregateHit tFlush;
13948 		tFlush.m_uDocID = 0;
13949 		tFlush.m_uWordID = 0;
13950 		tFlush.m_sKeyword = NULL;
13951 		tFlush.m_iWordPos = EMPTY_HIT;
13952 		tFlush.m_dFieldMask.UnsetAll();
13953 		tHitBuilder.cidxHit ( &tFlush, NULL );
13954 
13955 		if ( m_bInplaceSettings )
13956 		{
13957 			tHitBuilder.CloseHitlist();
13958 			if ( !sphTruncate ( fdHits.GetFD () ) )
13959 				sphWarn ( "failed to truncate %s", fdHits.GetFilename() );
13960 		}
13961 	}
13962 
13963 	if ( iDupes )
13964 		sphWarn ( "%d duplicate document id pairs found", iDupes );
13965 
13966 	BuildHeader_t tBuildHeader ( m_tStats );
13967 	if ( !tHitBuilder.cidxDone ( iMemoryLimit, m_tSettings.m_iMinInfixLen, m_pTokenizer->GetMaxCodepointLength(), &tBuildHeader ) )
13968 		return 0;
13969 
13970 	tBuildHeader.m_sHeaderExtension = "sph";
13971 	tBuildHeader.m_pMinRow = m_dMinRow.Begin();
13972 	tBuildHeader.m_uMinDocid = m_uMinDocid;
13973 	tBuildHeader.m_pThrottle = &g_tThrottle;
13974 	tBuildHeader.m_uKillListSize = m_uKillListSize;
13975 	tBuildHeader.m_iMinMaxIndex = m_iMinMaxIndex;
13976 	tBuildHeader.m_iTotalDups = iDupes;
13977 
13978 	// we're done
13979 	if ( !BuildDone ( tBuildHeader, m_sLastError ) )
13980 		return 0;
13981 
13982 	// when the party's over..
13983 	ARRAY_FOREACH ( i, dSources )
13984 		dSources[i]->PostIndex ();
13985 
13986 	dFileWatchdog.AllIsDone();
13987 	return 1;
13988 } // NOLINT function length
13989 
13990 
13991 /////////////////////////////////////////////////////////////////////////////
13992 // MERGER HELPERS
13993 /////////////////////////////////////////////////////////////////////////////
13994 
13995 
CopyFile(const char * sSrc,const char * sDst,CSphString & sErrStr,ThrottleState_t * pThrottle,volatile bool * pGlobalStop,volatile bool * pLocalStop)13996 static bool CopyFile ( const char * sSrc, const char * sDst, CSphString & sErrStr, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop )
13997 {
13998 	assert ( sSrc );
13999 	assert ( sDst );
14000 
14001 	const DWORD iMaxBufSize = 1024 * 1024;
14002 
14003 	CSphAutofile tSrcFile ( sSrc, SPH_O_READ, sErrStr );
14004 	CSphAutofile tDstFile ( sDst, SPH_O_NEW, sErrStr );
14005 
14006 	if ( tSrcFile.GetFD()<0 || tDstFile.GetFD()<0 )
14007 		return false;
14008 
14009 	SphOffset_t iFileSize = tSrcFile.GetSize();
14010 	DWORD iBufSize = (DWORD) Min ( iFileSize, (SphOffset_t)iMaxBufSize );
14011 
14012 	if ( iFileSize )
14013 	{
14014 		CSphFixedVector<BYTE> dData ( iBufSize );
14015 		bool bError = true;
14016 
14017 		while ( iFileSize > 0 )
14018 		{
14019 			if ( *pGlobalStop || *pLocalStop )
14020 				return false;
14021 
14022 			DWORD iSize = (DWORD) Min ( iFileSize, (SphOffset_t)iBufSize );
14023 
14024 			size_t iRead = sphReadThrottled ( tSrcFile.GetFD(), dData.Begin(), iSize, pThrottle );
14025 			if ( iRead!=iSize )
14026 			{
14027 				sErrStr.SetSprintf ( "read error in %s; " INT64_FMT " of %d bytes read", sSrc, (int64_t)iRead, iSize );
14028 				break;
14029 			}
14030 
14031 			if ( !sphWriteThrottled ( tDstFile.GetFD(), dData.Begin(), iSize, "CopyFile", sErrStr, pThrottle ) )
14032 				break;
14033 
14034 			iFileSize -= iSize;
14035 
14036 			if ( !iFileSize )
14037 				bError = false;
14038 		}
14039 
14040 		return ( bError==false );
14041 	}
14042 
14043 	return true;
14044 }
14045 
14046 
CopyRowString(const BYTE * pBase,const CSphVector<CSphAttrLocator> & dString,CSphRowitem * pRow,CSphWriter & wrTo)14047 static void CopyRowString ( const BYTE * pBase, const CSphVector<CSphAttrLocator> & dString, CSphRowitem * pRow, CSphWriter & wrTo )
14048 {
14049 	if ( !dString.GetLength() )
14050 		return;
14051 
14052 	CSphRowitem * pAttr = DOCINFO2ATTRS ( pRow );
14053 	ARRAY_FOREACH ( i, dString )
14054 	{
14055 		SphAttr_t uOff = sphGetRowAttr ( pAttr, dString[i] );
14056 		// magic offset? do nothing
14057 		if ( !uOff )
14058 			continue;
14059 
14060 		const BYTE * pStr = NULL;
14061 		int iLen = sphUnpackStr ( pBase + uOff, &pStr );
14062 
14063 		// no data? do nothing
14064 		if ( !iLen )
14065 			continue;
14066 
14067 		// copy bytes
14068 		uOff = (SphAttr_t)wrTo.GetPos();
14069 		assert ( uOff<UINT_MAX );
14070 		sphSetRowAttr ( pAttr, dString[i], uOff );
14071 
14072 		BYTE dPackedLen[4];
14073 		int iLenLen = sphPackStrlen ( dPackedLen, iLen );
14074 		wrTo.PutBytes ( &dPackedLen, iLenLen );
14075 		wrTo.PutBytes ( pStr, iLen );
14076 	}
14077 }
14078 
CopyRowMVA(const DWORD * pBase,const CSphVector<CSphAttrLocator> & dMva,SphDocID_t uDocid,CSphRowitem * pRow,CSphWriter & wrTo)14079 static void CopyRowMVA ( const DWORD * pBase, const CSphVector<CSphAttrLocator> & dMva,
14080 	SphDocID_t uDocid, CSphRowitem * pRow, CSphWriter & wrTo )
14081 {
14082 	if ( !dMva.GetLength() )
14083 		return;
14084 
14085 	CSphRowitem * pAttr = DOCINFO2ATTRS ( pRow );
14086 	bool bDocidWriten = false;
14087 	ARRAY_FOREACH ( i, dMva )
14088 	{
14089 		SphAttr_t uOff = sphGetRowAttr ( pAttr, dMva[i] );
14090 		if ( !uOff )
14091 			continue;
14092 
14093 		assert ( pBase );
14094 		if ( !bDocidWriten )
14095 		{
14096 			assert ( DOCINFO2ID ( pBase + uOff - DOCINFO_IDSIZE )==uDocid ); // there is DocID prior to 1st MVA
14097 			wrTo.PutDocid ( uDocid );
14098 			bDocidWriten = true;
14099 		}
14100 
14101 		assert ( wrTo.GetPos()/sizeof(DWORD)<=UINT_MAX );
14102 		SphAttr_t uNewOff = ( DWORD )wrTo.GetPos() / sizeof( DWORD );
14103 		sphSetRowAttr ( pAttr, dMva[i], uNewOff );
14104 
14105 		DWORD iValues = pBase[uOff];
14106 		wrTo.PutBytes ( pBase + uOff, ( iValues+1 )*sizeof(DWORD) );
14107 	}
14108 }
14109 
14110 
14111 static const int DOCLIST_HINT_THRESH = 256;
14112 
14113 // let uDocs be DWORD here to prevent int overflow in case of hitless word (highest bit is 1)
DoclistHintUnpack(DWORD uDocs,BYTE uHint)14114 static int DoclistHintUnpack ( DWORD uDocs, BYTE uHint )
14115 {
14116 	if ( uDocs<(DWORD)DOCLIST_HINT_THRESH )
14117 		return (int)Min ( 8*(int64_t)uDocs, INT_MAX );
14118 	else
14119 		return (int)Min ( 4*(int64_t)uDocs+( int64_t(uDocs)*uHint/64 ), INT_MAX );
14120 }
14121 
sphDoclistHintPack(SphOffset_t iDocs,SphOffset_t iLen)14122 BYTE sphDoclistHintPack ( SphOffset_t iDocs, SphOffset_t iLen )
14123 {
14124 	// we won't really store a hint for small lists
14125 	if ( iDocs<DOCLIST_HINT_THRESH )
14126 		return 0;
14127 
14128 	// for bigger lists len/docs varies 4x-6x on test indexes
14129 	// so lets assume that 4x-8x should be enough for everybody
14130 	SphOffset_t iDelta = Min ( Max ( iLen-4*iDocs, 0 ), 4*iDocs-1 ); // len delta over 4x, clamped to [0x..4x) range
14131 	BYTE uHint = (BYTE)( 64*iDelta/iDocs ); // hint now must be in [0..256) range
14132 	while ( uHint<255 && ( iDocs*uHint/64 )<iDelta ) // roundoff (suddenly, my guru math skillz failed me)
14133 		uHint++;
14134 
14135 	return uHint;
14136 }
14137 
// !COMMIT eliminate this, move to dict (or at least couple with CWordlist)
/// sequential reader over an on-disk dictionary (.spi file)
/// iterates word entries in file order, transparently consuming checkpoint
/// records; supports both crc and keywords dictionary formats
class CSphDictReader
{
public:
	// current word
	SphWordID_t		m_uWordID;			///< current word id (CRC32 of the keyword in keywords mode)
	SphOffset_t		m_iDoclistOffset;	///< doclist offset of the current word
	int				m_iDocs;			///< docs count (hitless flag masked out after Read())
	int				m_iHits;			///< hits count
	bool			m_bHasHitlist;		///< whether the current word carries a hitlist
	int				m_iHint;			///< packed doclist-length hint byte (keywords dict only)

private:
	ESphHitless		m_eHitless;			///< hitless mode of the index being read
	CSphAutoreader	m_tMyReader;		///< own file reader, used by the filename-based Setup()
	CSphReader *	m_pReader;			///< active reader (own or externally supplied)
	SphOffset_t		m_iMaxPos;			///< dictionary end offset; Read() stops here

	bool			m_bWordDict;		///< true for keywords dict, false for crc dict
	char			m_sWord[MAX_KEYWORD_BYTES];	///< current keyword text (keywords dict only)

	int				m_iCheckpoint;		///< 1-based index of the current checkpoint block
	bool			m_bHasSkips;		///< whether entries carry skiplist data

public:
	CSphDictReader()
		: m_uWordID ( 0 )
		, m_iDoclistOffset ( 0 )
		, m_iHint ( 0 )
		, m_iMaxPos ( 0 )
		, m_bWordDict ( true )
		, m_iCheckpoint ( 1 )
		, m_bHasSkips ( false )
	{
		m_sWord[0] = '\0';
	}

	/// open the given dictionary file and set up for reading
	/// returns false (with sError set) if the file could not be opened
	bool Setup ( const CSphString & sFilename, SphOffset_t iMaxPos, ESphHitless eHitless,
		CSphString & sError, bool bWordDict, ThrottleState_t * pThrottle, bool bHasSkips )
	{
		if ( !m_tMyReader.Open ( sFilename, sError ) )
			return false;
		Setup ( &m_tMyReader, iMaxPos, eHitless, bWordDict, pThrottle, bHasSkips );
		return true;
	}

	/// set up for reading from an already-opened reader
	/// seeks past the leading zero byte and resets the iteration state
	void Setup ( CSphReader * pReader, SphOffset_t iMaxPos, ESphHitless eHitless, bool bWordDict, ThrottleState_t * pThrottle, bool bHasSkips )
	{
		m_pReader = pReader;
		m_pReader->SetThrottle ( pThrottle );
		m_pReader->SeekTo ( 1, READ_NO_SIZE_HINT );

		m_iMaxPos = iMaxPos;
		m_eHitless = eHitless;
		m_bWordDict = bWordDict;
		m_sWord[0] = '\0';
		m_iCheckpoint = 1;
		m_bHasSkips = bHasSkips;
	}

	/// advance to the next word entry; returns false at end of dictionary
	/// on success, the public members describe the current word
	bool Read()
	{
		if ( m_pReader->GetPos()>=m_iMaxPos )
			return false;

		// get leading value
		SphWordID_t iWord0 = m_bWordDict ? m_pReader->GetByte() : m_pReader->UnzipWordid();
		if ( !iWord0 )
		{
			// handle checkpoint
			m_iCheckpoint++;
			m_pReader->UnzipOffset();

			// checkpoints reset the delta bases for both word and offset
			m_uWordID = 0;
			m_iDoclistOffset = 0;
			m_sWord[0] = '\0';

			if ( m_pReader->GetPos()>=m_iMaxPos )
				return false;

			iWord0 = m_bWordDict ? m_pReader->GetByte() : m_pReader->UnzipWordid(); // get next word
		}
		if ( !iWord0 )
			return false; // some failure

		// get word entry
		if ( m_bWordDict )
		{
			// unpack next word
			// must be in sync with DictEnd()!
			assert ( iWord0<=255 );
			BYTE uPack = (BYTE) iWord0;

			// keywords are front-coded: iMatch leading bytes are shared with
			// the previous word, iDelta new bytes follow in the file
			int iMatch, iDelta;
			if ( uPack & 0x80 )
			{
				// short form: both counts packed into the single lead byte
				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
				iMatch = uPack & 15;
			} else
			{
				// long form: lead byte is delta, next byte is match
				iDelta = uPack & 127;
				iMatch = m_pReader->GetByte();
			}
			assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
			assert ( iMatch<=(int)strlen(m_sWord) );

			m_pReader->GetBytes ( m_sWord + iMatch, iDelta );
			m_sWord [ iMatch+iDelta ] = '\0';

			m_iDoclistOffset = m_pReader->UnzipOffset();
			m_iDocs = m_pReader->UnzipInt();
			m_iHits = m_pReader->UnzipInt();
			m_iHint = 0;
			if ( m_iDocs>=DOCLIST_HINT_THRESH )
				m_iHint = m_pReader->GetByte();
			if ( m_bHasSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
				m_pReader->UnzipInt(); // skiplist offset; not needed here, just consume it

			m_uWordID = (SphWordID_t) sphCRC32 ( GetWord() ); // set wordID for indexing

		} else
		{
			// crc dict: wordid and doclist offset are delta-coded
			m_uWordID += iWord0;
			m_iDoclistOffset += m_pReader->UnzipOffset();
			m_iDocs = m_pReader->UnzipInt();
			m_iHits = m_pReader->UnzipInt();
			if ( m_bHasSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
				m_pReader->UnzipOffset(); // skiplist offset; consume and ignore
		}

		// in SPH_HITLESS_SOME mode the high bit of the docs count flags a hitless word
		m_bHasHitlist =
			( m_eHitless==SPH_HITLESS_NONE ) ||
			( m_eHitless==SPH_HITLESS_SOME && !( m_iDocs & HITLESS_DOC_FLAG ) );
		m_iDocs = m_eHitless==SPH_HITLESS_SOME ? ( m_iDocs & HITLESS_DOC_MASK ) : m_iDocs;

		return true; // FIXME? errorflag?
	}

	/// compare current words of two readers; <0, 0, >0 like strcmp
	/// (string comparison for keywords dict, wordid comparison for crc dict)
	int CmpWord ( const CSphDictReader & tOther ) const
	{
		if ( m_bWordDict )
			return strcmp ( m_sWord, tOther.m_sWord );

		int iRes = 0;
		iRes = m_uWordID<tOther.m_uWordID ? -1 : iRes;
		iRes = m_uWordID>tOther.m_uWordID ? 1 : iRes;
		return iRes;
	}

	/// current keyword text (valid for keywords dict only)
	BYTE * GetWord() const { return (BYTE *)m_sWord; }

	/// 1-based index of the checkpoint block being read
	int GetCheckpoint() const { return m_iCheckpoint; }
};
14291 
CreateMergeFilters(const CSphVector<CSphFilterSettings> & dSettings,const CSphSchema & tSchema,const DWORD * pMvaPool,const BYTE * pStrings,bool bArenaProhibit)14292 static ISphFilter * CreateMergeFilters ( const CSphVector<CSphFilterSettings> & dSettings,
14293 										const CSphSchema & tSchema, const DWORD * pMvaPool, const BYTE * pStrings, bool bArenaProhibit )
14294 {
14295 	CSphString sError;
14296 	ISphFilter * pResult = NULL;
14297 	ARRAY_FOREACH ( i, dSettings )
14298 	{
14299 		ISphFilter * pFilter = sphCreateFilter ( dSettings[i], tSchema, pMvaPool, pStrings, sError, SPH_COLLATION_DEFAULT, bArenaProhibit );
14300 		if ( pFilter )
14301 			pResult = sphJoinFilters ( pResult, pFilter );
14302 	}
14303 	return pResult;
14304 }
14305 
CheckDocsCount(int64_t iDocs,CSphString & sError)14306 static bool CheckDocsCount ( int64_t iDocs, CSphString & sError )
14307 {
14308 	if ( iDocs<INT_MAX )
14309 		return true;
14310 
14311 	sError.SetSprintf ( "index over %d documents not supported (got " INT64_FMT " documents)", INT_MAX, iDocs );
14312 	return false;
14313 }
14314 
14315 
/// helper that transfers documents and hits from source indexes into a
/// CSphHitBuilder during index merge, applying kill-lists and filters
class CSphMerger
{
private:
	CSphFixedVector<CSphRowitem> m_dInlineRow;	///< scratch row for inline attributes
	CSphHitBuilder *	m_pHitBuilder;			///< target hit builder (not owned)
	SphDocID_t			m_uMinID;				///< min docid of the resulting index; hits are rebased against it

public:
	explicit CSphMerger ( CSphHitBuilder * pHitBuilder, int iInlineCount, SphDocID_t uMinID )
		: m_dInlineRow ( iInlineCount )
		, m_pHitBuilder ( pHitBuilder )
		, m_uMinID ( uMinID )
	{
	}

	/// position a query-word reader at the doclist of the word that
	/// tReader currently points at, and reset its per-word state
	template < typename QWORD >
	static inline void PrepareQword ( QWORD & tQword, const CSphDictReader & tReader, SphDocID_t iMinID, bool bWordDict ) //NOLINT
	{
		tQword.m_iMinID = iMinID;
		tQword.m_tDoc.m_uDocID = iMinID;

		tQword.m_iDocs = tReader.m_iDocs;
		tQword.m_iHits = tReader.m_iHits;
		tQword.m_bHasHitlist = tReader.m_bHasHitlist;

		tQword.m_uHitPosition = 0;
		tQword.m_iHitlistPos = 0;

		// crc-dict qwords seek themselves; keywords-dict qwords need an explicit seek
		if ( bWordDict )
			tQword.m_rdDoclist.SeekTo ( tReader.m_iDoclistOffset, tReader.m_iHint );
	}

	/// advance to the next document of the current word that survives both
	/// the kill-list and the optional filter; returns false at doclist end
	/// (skipped documents have their hitlists drained to keep readers in sync)
	template < typename QWORD >
	inline bool NextDocument ( QWORD & tQword, const CSphIndex_VLN * pSourceIndex, const ISphFilter * pFilter, const CSphVector<SphDocID_t> & dKillList )
	{
		for ( ;; )
		{
			tQword.GetNextDoc ( m_dInlineRow.Begin() );
			if ( tQword.m_tDoc.m_uDocID )
			{
				tQword.SeekHitlist ( tQword.m_iHitlistPos );

				// killed document? drain its hits and move on
				if ( dKillList.BinarySearch ( tQword.m_tDoc.m_uDocID ) ) // optimize this somehow?
				{
					while ( tQword.m_bHasHitlist && tQword.GetNextHit()!=EMPTY_HIT );
					continue;
				}
				if ( pFilter )
				{
					CSphMatch tMatch;
					tMatch.m_uDocID = tQword.m_tDoc.m_uDocID;
					if ( pFilter->UsesAttrs() )
					{
						// attributes come from the inline row if present, else from docinfo
						if ( m_dInlineRow.GetLength() )
							tMatch.m_pDynamic = m_dInlineRow.Begin();
						else
						{
							const DWORD * pInfo = pSourceIndex->FindDocinfo ( tQword.m_tDoc.m_uDocID );
							tMatch.m_pStatic = pInfo?DOCINFO2ATTRS ( pInfo ):NULL;
						}
					}
					bool bResult = pFilter->Eval ( tMatch );
					tMatch.m_pDynamic = NULL; // detach borrowed row before tMatch is destroyed
					if ( !bResult )
					{
						// filtered out; drain hits and try the next document
						while ( tQword.m_bHasHitlist && tQword.GetNextHit()!=EMPTY_HIT );
						continue;
					}
				}
				return true;
			} else
				return false;
		}
	}

	/// stream every surviving document (and its hits) of one word into the hit builder
	template < typename QWORD >
	inline void TransferData ( QWORD & tQword, SphWordID_t iWordID, const BYTE * sWord,
							const CSphIndex_VLN * pSourceIndex, const ISphFilter * pFilter,
							const CSphVector<SphDocID_t> & dKillList, volatile bool * pGlobalStop, volatile bool * pLocalStop )
	{
		CSphAggregateHit tHit;
		tHit.m_uWordID = iWordID;
		tHit.m_sKeyword = sWord;
		tHit.m_dFieldMask.UnsetAll();

		while ( CSphMerger::NextDocument ( tQword, pSourceIndex, pFilter, dKillList ) && !*pGlobalStop && !*pLocalStop )
		{
			if ( tQword.m_bHasHitlist )
				TransferHits ( tQword, tHit );
			else
			{
				// convert to aggregate if there is no hit-list
				tHit.m_uDocID = tQword.m_tDoc.m_uDocID - m_uMinID;
				tHit.m_dFieldMask = tQword.m_dQwordFields;
				tHit.SetAggrCount ( tQword.m_uMatchHits );
				m_pHitBuilder->cidxHit ( &tHit, m_dInlineRow.Begin() );
			}
		}
	}

	/// emit every hit of the current document into the hit builder,
	/// rebasing the docid against the resulting index's min docid
	template < typename QWORD >
	inline void TransferHits ( QWORD & tQword, CSphAggregateHit & tHit )
	{
		assert ( tQword.m_bHasHitlist );
		tHit.m_uDocID = tQword.m_tDoc.m_uDocID - m_uMinID;
		for ( Hitpos_t uHit = tQword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tQword.GetNextHit() )
		{
			tHit.m_iWordPos = uHit;
			m_pHitBuilder->cidxHit ( &tHit, m_dInlineRow.Begin() );
		}
	}

	/// attach a query-word reader to the given doclist and hitlist files
	/// and configure its inline-attribute handling
	template < typename QWORD >
	static inline void ConfigureQword ( QWORD & tQword, const CSphAutofile & tHits, const CSphAutofile & tDocs,
		int iDynamic, int iInline, const CSphRowitem * pMin, ThrottleState_t * pThrottle )
	{
		tQword.m_iInlineAttrs = iInline;
		tQword.m_pInlineFixup = iInline ? pMin : NULL;

		tQword.m_rdHitlist.SetThrottle ( pThrottle );
		tQword.m_rdHitlist.SetFile ( tHits );
		tQword.m_rdHitlist.GetByte(); // skip the leading magic byte

		tQword.m_rdDoclist.SetThrottle ( pThrottle );
		tQword.m_rdDoclist.SetFile ( tDocs );
		tQword.m_rdDoclist.GetByte(); // skip the leading magic byte

		tQword.m_tDoc.Reset ( iDynamic );
	}

	/// read-only access to the inline-attribute scratch row
	const CSphRowitem * GetInline () const { return m_dInlineRow.Begin(); }
	/// writable access to the inline-attribute scratch row
	CSphRowitem * AcquireInline () const { return m_dInlineRow.Begin(); }
};
14449 
14450 
14451 template < typename QWORDDST, typename QWORDSRC >
MergeWords(const CSphIndex_VLN * pDstIndex,const CSphIndex_VLN * pSrcIndex,const ISphFilter * pFilter,const CSphVector<SphDocID_t> & dKillList,SphDocID_t uMinID,CSphHitBuilder * pHitBuilder,CSphString & sError,CSphSourceStats & tStat,CSphIndexProgress & tProgress,ThrottleState_t * pThrottle,volatile bool * pGlobalStop,volatile bool * pLocalStop)14452 bool CSphIndex_VLN::MergeWords ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex,
14453 								const ISphFilter * pFilter, const CSphVector<SphDocID_t> & dKillList, SphDocID_t uMinID,
14454 								CSphHitBuilder * pHitBuilder, CSphString & sError, CSphSourceStats & tStat,
14455 								CSphIndexProgress & tProgress, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop )
14456 {
14457 	CSphAutofile tDummy;
14458 	pHitBuilder->CreateIndexFiles ( pDstIndex->GetIndexFileName("tmp.spd").cstr(),
14459 		pDstIndex->GetIndexFileName("tmp.spp").cstr(),
14460 		pDstIndex->GetIndexFileName("tmp.spe").cstr(),
14461 		false, 0, tDummy, NULL );
14462 
14463 	CSphDictReader tDstReader;
14464 	CSphDictReader tSrcReader;
14465 
14466 	bool bWordDict = pHitBuilder->IsWordDict();
14467 
14468 	if ( !tDstReader.Setup ( pDstIndex->GetIndexFileName("spi"), pDstIndex->m_tWordlist.m_iWordsEnd,
14469 		pDstIndex->m_tSettings.m_eHitless, sError, bWordDict, pThrottle, pDstIndex->m_tWordlist.m_bHaveSkips ) )
14470 			return false;
14471 	if ( !tSrcReader.Setup ( pSrcIndex->GetIndexFileName("spi"), pSrcIndex->m_tWordlist.m_iWordsEnd,
14472 		pSrcIndex->m_tSettings.m_eHitless, sError, bWordDict, pThrottle, pSrcIndex->m_tWordlist.m_bHaveSkips ) )
14473 			return false;
14474 
14475 	const SphDocID_t uDstMinID = pDstIndex->m_uMinDocid;
14476 	const SphDocID_t uSrcMinID = pSrcIndex->m_uMinDocid;
14477 
14478 	/// prepare for indexing
14479 	pHitBuilder->HitblockBegin();
14480 	pHitBuilder->HitReset();
14481 	pHitBuilder->SetMin ( pDstIndex->m_dMinRow.Begin(), pDstIndex->m_dMinRow.GetLength() );
14482 
14483 	/// setup qwords
14484 
14485 	QWORDDST tDstQword ( false, false );
14486 	QWORDSRC tSrcQword ( false, false );
14487 
14488 	CSphAutofile tSrcDocs, tSrcHits;
14489 	tSrcDocs.Open ( pSrcIndex->GetIndexFileName("spd"), SPH_O_READ, sError );
14490 	tSrcHits.Open ( pSrcIndex->GetIndexFileName("spp"), SPH_O_READ, sError );
14491 
14492 	CSphAutofile tDstDocs, tDstHits;
14493 	tDstDocs.Open ( pDstIndex->GetIndexFileName("spd"), SPH_O_READ, sError );
14494 	tDstHits.Open ( pDstIndex->GetIndexFileName("spp"), SPH_O_READ, sError );
14495 
14496 	if ( !sError.IsEmpty() || *pGlobalStop || *pLocalStop )
14497 		return false;
14498 
14499 	int iDstInlineSize = pDstIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? pDstIndex->m_tSchema.GetRowSize() : 0;
14500 	int iSrcInlineSize = pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE ? pSrcIndex->m_tSchema.GetRowSize() : 0;
14501 
14502 	CSphMerger tMerger ( pHitBuilder, Max ( iDstInlineSize, iSrcInlineSize ), uMinID );
14503 
14504 	CSphMerger::ConfigureQword<QWORDDST> ( tDstQword, tDstHits, tDstDocs,
14505 		pDstIndex->m_tSchema.GetDynamicSize(), iDstInlineSize,
14506 		pDstIndex->m_dMinRow.Begin(), pThrottle );
14507 	CSphMerger::ConfigureQword<QWORDSRC> ( tSrcQword, tSrcHits, tSrcDocs,
14508 		pSrcIndex->m_tSchema.GetDynamicSize(), iSrcInlineSize,
14509 		pSrcIndex->m_dMinRow.Begin(), pThrottle );
14510 
14511 	/// merge
14512 
14513 	bool bDstWord = tDstReader.Read();
14514 	bool bSrcWord = tSrcReader.Read();
14515 
14516 	tProgress.m_ePhase = CSphIndexProgress::PHASE_MERGE;
14517 	tProgress.Show ( false );
14518 
14519 	int iWords = 0;
14520 	int iHitlistsDiscarded = 0;
14521 	for ( ; bDstWord || bSrcWord; iWords++ )
14522 	{
14523 		if ( iWords==1000 )
14524 		{
14525 			tProgress.m_iWords += 1000;
14526 			tProgress.Show ( false );
14527 			iWords = 0;
14528 		}
14529 
14530 		if ( *pGlobalStop || *pLocalStop )
14531 			return false;
14532 
14533 		const int iCmp = tDstReader.CmpWord ( tSrcReader );
14534 
14535 		if ( !bSrcWord || ( bDstWord && iCmp<0 ) )
14536 		{
14537 			// transfer documents and hits from destination
14538 			CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, uDstMinID, bWordDict );
14539 			tMerger.TransferData<QWORDDST> ( tDstQword, tDstReader.m_uWordID, tDstReader.GetWord(), pDstIndex, pFilter, dKillList, pGlobalStop, pLocalStop );
14540 			bDstWord = tDstReader.Read();
14541 
14542 		} else if ( !bDstWord || ( bSrcWord && iCmp>0 ) )
14543 		{
14544 			// transfer documents and hits from source
14545 			CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, uSrcMinID, bWordDict );
14546 			tMerger.TransferData<QWORDSRC> ( tSrcQword, tSrcReader.m_uWordID, tSrcReader.GetWord(), pSrcIndex, NULL, CSphVector<SphDocID_t>(), pGlobalStop, pLocalStop );
14547 			bSrcWord = tSrcReader.Read();
14548 
14549 		} else // merge documents and hits inside the word
14550 		{
14551 			assert ( iCmp==0 );
14552 
14553 			bool bHitless = !tDstReader.m_bHasHitlist;
14554 			if ( tDstReader.m_bHasHitlist!=tSrcReader.m_bHasHitlist )
14555 			{
14556 				iHitlistsDiscarded++;
14557 				bHitless = true;
14558 			}
14559 
14560 			CSphMerger::PrepareQword<QWORDDST> ( tDstQword, tDstReader, uDstMinID, bWordDict );
14561 			CSphMerger::PrepareQword<QWORDSRC> ( tSrcQword, tSrcReader, uSrcMinID, bWordDict );
14562 
14563 			CSphAggregateHit tHit;
14564 			tHit.m_uWordID = tDstReader.m_uWordID; // !COMMIT m_sKeyword anyone?
14565 			tHit.m_sKeyword = tDstReader.GetWord();
14566 			tHit.m_dFieldMask.UnsetAll();
14567 
14568 			bool bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter, dKillList );
14569 			bool bSrcDocs = true;
14570 
14571 			tSrcQword.GetNextDoc ( tMerger.AcquireInline() );
14572 			tSrcQword.SeekHitlist ( tSrcQword.m_iHitlistPos );
14573 
14574 			while ( bDstDocs || bSrcDocs )
14575 			{
14576 				if ( *pGlobalStop || *pLocalStop )
14577 					return false;
14578 
14579 				if ( !bSrcDocs || ( bDstDocs && tDstQword.m_tDoc.m_uDocID < tSrcQword.m_tDoc.m_uDocID ) )
14580 				{
14581 					// transfer hits from destination
14582 					if ( bHitless )
14583 					{
14584 						while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
14585 
14586 						tHit.m_uDocID = tDstQword.m_tDoc.m_uDocID - uMinID;
14587 						tHit.m_dFieldMask = tDstQword.m_dQwordFields;
14588 						tHit.SetAggrCount ( tDstQword.m_uMatchHits );
14589 						pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14590 					} else
14591 						tMerger.TransferHits ( tDstQword, tHit );
14592 					bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter, dKillList );
14593 
14594 				} else if ( !bDstDocs || ( bSrcDocs && tDstQword.m_tDoc.m_uDocID > tSrcQword.m_tDoc.m_uDocID ) )
14595 				{
14596 					// transfer hits from source
14597 					if ( bHitless )
14598 					{
14599 						while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
14600 
14601 						tHit.m_uDocID = tSrcQword.m_tDoc.m_uDocID - uMinID;
14602 						tHit.m_dFieldMask = tSrcQword.m_dQwordFields;
14603 						tHit.SetAggrCount ( tSrcQword.m_uMatchHits );
14604 						pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14605 					} else
14606 						tMerger.TransferHits ( tSrcQword, tHit );
14607 					bSrcDocs = tMerger.NextDocument ( tSrcQword, pSrcIndex, NULL, CSphVector<SphDocID_t>() );
14608 
14609 				} else
14610 				{
14611 					// merge hits inside the document
14612 					assert ( bDstDocs );
14613 					assert ( bSrcDocs );
14614 					assert ( tDstQword.m_tDoc.m_uDocID==tSrcQword.m_tDoc.m_uDocID );
14615 
14616 					tHit.m_uDocID = tDstQword.m_tDoc.m_uDocID - uMinID;
14617 
14618 					if ( bHitless )
14619 					{
14620 						while ( tDstQword.m_bHasHitlist && tDstQword.GetNextHit()!=EMPTY_HIT );
14621 						while ( tSrcQword.m_bHasHitlist && tSrcQword.GetNextHit()!=EMPTY_HIT );
14622 
14623 						for ( int i=0; i<FieldMask_t::SIZE; i++ )
14624 							tHit.m_dFieldMask[i] = tDstQword.m_dQwordFields[i] | tSrcQword.m_dQwordFields[i];
14625 						tHit.SetAggrCount ( tDstQword.m_uMatchHits + tSrcQword.m_uMatchHits );
14626 						pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14627 
14628 					} else
14629 					{
14630 						Hitpos_t uDstHit = tDstQword.GetNextHit();
14631 						Hitpos_t uSrcHit = tSrcQword.GetNextHit();
14632 
14633 						while ( uDstHit!=EMPTY_HIT || uSrcHit!=EMPTY_HIT )
14634 						{
14635 							if ( uSrcHit==EMPTY_HIT || ( uDstHit!=EMPTY_HIT && uDstHit<uSrcHit ) )
14636 							{
14637 								tHit.m_iWordPos = uDstHit;
14638 								pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14639 								uDstHit = tDstQword.GetNextHit();
14640 
14641 							} else if ( uDstHit==EMPTY_HIT || ( uSrcHit!=EMPTY_HIT && uSrcHit<uDstHit ) )
14642 							{
14643 								tHit.m_iWordPos = uSrcHit;
14644 								pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14645 								uSrcHit = tSrcQword.GetNextHit();
14646 
14647 							} else
14648 							{
14649 								assert ( uDstHit==uSrcHit );
14650 
14651 								tHit.m_iWordPos = uDstHit;
14652 								pHitBuilder->cidxHit ( &tHit, tMerger.GetInline() );
14653 
14654 								uDstHit = tDstQword.GetNextHit();
14655 								uSrcHit = tSrcQword.GetNextHit();
14656 							}
14657 						}
14658 					}
14659 
14660 					// next document
14661 					bDstDocs = tMerger.NextDocument ( tDstQword, pDstIndex, pFilter, dKillList );
14662 					bSrcDocs = tMerger.NextDocument ( tSrcQword, pSrcIndex, NULL, CSphVector<SphDocID_t>() );
14663 				}
14664 			}
14665 			// next word
14666 			bDstWord = tDstReader.Read();
14667 			bSrcWord = tSrcReader.Read();
14668 		}
14669 	}
14670 
14671 	tStat.m_iTotalDocuments += pSrcIndex->m_tStats.m_iTotalDocuments;
14672 	tStat.m_iTotalBytes += pSrcIndex->m_tStats.m_iTotalBytes;
14673 
14674 	tProgress.m_iWords += iWords;
14675 	tProgress.Show ( false );
14676 
14677 	if ( iHitlistsDiscarded )
14678 		sphWarning ( "discarded hitlists for %u words", iHitlistsDiscarded );
14679 
14680 	return true;
14681 }
14682 
14683 
Merge(CSphIndex * pSource,const CSphVector<CSphFilterSettings> & dFilters,bool bMergeKillLists)14684 bool CSphIndex_VLN::Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists )
14685 {
14686 	CSphString sWarning;
14687 	if ( !Prealloc ( false, false, sWarning ) || !Preread() )
14688 		return false;
14689 	if ( !pSource->Prealloc ( false, false, sWarning ) || !pSource->Preread() )
14690 	{
14691 		m_sLastError.SetSprintf ( "source index preload failed: %s", pSource->GetLastError().cstr() );
14692 		return false;
14693 	}
14694 
14695 	// create filters
14696 	CSphScopedPtr<ISphFilter> pFilter ( CreateMergeFilters ( dFilters, m_tSchema, m_tMva.GetWritePtr(), m_tString.GetWritePtr(), m_bArenaProhibit ) );
14697 	CSphVector<SphDocID_t> dKillList ( pSource->GetKillListSize()+2 );
14698 	for ( int i=0; i<dKillList.GetLength()-2; ++i )
14699 		dKillList [ i+1 ] = pSource->GetKillList()[i];
14700 	dKillList[0] = 0;
14701 	dKillList.Last() = DOCID_MAX;
14702 
14703 	bool bGlobalStop = false;
14704 	bool bLocalStop = false;
14705 	return CSphIndex_VLN::DoMerge ( this, (const CSphIndex_VLN *)pSource, bMergeKillLists, pFilter.Ptr(),
14706 									dKillList, m_sLastError, m_tProgress, &g_tThrottle, &bGlobalStop, &bLocalStop );
14707 }
14708 
DoMerge(const CSphIndex_VLN * pDstIndex,const CSphIndex_VLN * pSrcIndex,bool bMergeKillLists,ISphFilter * pFilter,const CSphVector<SphDocID_t> & dKillList,CSphString & sError,CSphIndexProgress & tProgress,ThrottleState_t * pThrottle,volatile bool * pGlobalStop,volatile bool * pLocalStop)14709 bool CSphIndex_VLN::DoMerge ( const CSphIndex_VLN * pDstIndex, const CSphIndex_VLN * pSrcIndex,
14710 							bool bMergeKillLists, ISphFilter * pFilter, const CSphVector<SphDocID_t> & dKillList
14711 							, CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle,
14712 							volatile bool * pGlobalStop, volatile bool * pLocalStop )
14713 {
14714 	assert ( pDstIndex && pSrcIndex );
14715 
14716 	const CSphSchema & tDstSchema = pDstIndex->m_tSchema;
14717 	const CSphSchema & tSrcSchema = pSrcIndex->m_tSchema;
14718 	if ( !tDstSchema.CompareTo ( tSrcSchema, sError ) )
14719 		return false;
14720 
14721 	if ( pDstIndex->m_tSettings.m_eHitless!=pSrcIndex->m_tSettings.m_eHitless )
14722 	{
14723 		sError = "hitless settings must be the same on merged indices";
14724 		return false;
14725 	}
14726 
14727 	// FIXME!
14728 	if ( pDstIndex->m_tSettings.m_eDocinfo!=pSrcIndex->m_tSettings.m_eDocinfo && !( pDstIndex->m_bIsEmpty || pSrcIndex->m_bIsEmpty ) )
14729 	{
14730 		sError.SetSprintf ( "docinfo storage on non-empty indexes must be the same (dst docinfo %d, empty %d, src docinfo %d, empty %d",
14731 			pDstIndex->m_tSettings.m_eDocinfo, pDstIndex->m_bIsEmpty, pSrcIndex->m_tSettings.m_eDocinfo, pSrcIndex->m_bIsEmpty );
14732 		return false;
14733 	}
14734 
14735 	if ( pDstIndex->m_pDict->GetSettings().m_bWordDict!=pSrcIndex->m_pDict->GetSettings().m_bWordDict )
14736 	{
14737 		sError.SetSprintf ( "dictionary types must be the same (dst dict=%s, src dict=%s )",
14738 			pDstIndex->m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc",
14739 			pSrcIndex->m_pDict->GetSettings().m_bWordDict ? "keywords" : "crc" );
14740 		return false;
14741 	}
14742 
14743 	BuildHeader_t tBuildHeader ( pDstIndex->m_tStats );
14744 
14745 	/////////////////////////////////////////
14746 	// merging attributes (.spa, .spm, .sps)
14747 	/////////////////////////////////////////
14748 
14749 	CSphWriter tSPMWriter, tSPSWriter;
14750 	tSPMWriter.SetThrottle ( pThrottle );
14751 	tSPSWriter.SetThrottle ( pThrottle );
14752 	if ( !tSPMWriter.OpenFile ( pDstIndex->GetIndexFileName("tmp.spm"), sError )
14753 		|| !tSPSWriter.OpenFile ( pDstIndex->GetIndexFileName("tmp.sps"), sError ) )
14754 	{
14755 		return false;
14756 	}
14757 	tSPSWriter.PutByte ( 0 ); // dummy byte, to reserve magic zero offset
14758 
14759 	/// merging
14760 	CSphVector<CSphAttrLocator> dMvaLocators;
14761 	CSphVector<CSphAttrLocator> dStringLocators;
14762 	for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
14763 	{
14764 		const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
14765 		if ( tInfo.m_eAttrType==SPH_ATTR_UINT32SET )
14766 			dMvaLocators.Add ( tInfo.m_tLocator );
14767 		if ( tInfo.m_eAttrType==SPH_ATTR_STRING || tInfo.m_eAttrType==SPH_ATTR_JSON )
14768 			dStringLocators.Add ( tInfo.m_tLocator );
14769 	}
14770 	for ( int i=0; i<tDstSchema.GetAttrsCount(); i++ )
14771 	{
14772 		const CSphColumnInfo & tInfo = tDstSchema.GetAttr(i);
14773 		if ( tInfo.m_eAttrType==SPH_ATTR_INT64SET )
14774 			dMvaLocators.Add ( tInfo.m_tLocator );
14775 	}
14776 
14777 	CSphVector<SphDocID_t> dPhantomKiller;
14778 
14779 	int64_t iTotalDocuments = 0;
14780 	bool bNeedInfinum = true;
14781 	// minimal docid-1 for merging
14782 	SphDocID_t uMergeInfinum = 0;
14783 
14784 	if ( pDstIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && pSrcIndex->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
14785 	{
14786 		int iStride = DOCINFO_IDSIZE + pDstIndex->m_tSchema.GetRowSize();
14787 		CSphFixedVector<CSphRowitem> dRow ( iStride );
14788 
14789 		CSphWriter wrRows;
14790 		wrRows.SetThrottle ( pThrottle );
14791 		if ( !wrRows.OpenFile ( pDstIndex->GetIndexFileName("tmp.spa"), sError ) )
14792 			return false;
14793 
14794 		int64_t iExpectedDocs = pDstIndex->m_tStats.m_iTotalDocuments + pSrcIndex->GetStats().m_iTotalDocuments;
14795 		AttrIndexBuilder_c tMinMax ( pDstIndex->m_tSchema );
14796 		int64_t iMinMaxSize = tMinMax.GetExpectedSize ( iExpectedDocs );
14797 		if ( iMinMaxSize>INT_MAX || iExpectedDocs>INT_MAX )
14798 		{
14799 			if ( iMinMaxSize>INT_MAX )
14800 				sError.SetSprintf ( "attribute files over 128 GB are not supported (projected_minmax_size=" INT64_FMT ")", iMinMaxSize );
14801 			else if ( iExpectedDocs>INT_MAX )
14802 				sError.SetSprintf ( "indexes over 2B docs are not supported (projected_docs=" INT64_FMT ")", iExpectedDocs );
14803 			return false;
14804 		}
14805 		CSphFixedVector<DWORD> dMinMaxBuffer ( (int)iMinMaxSize );
14806 		tMinMax.Prepare ( dMinMaxBuffer.Begin(), dMinMaxBuffer.Begin() + dMinMaxBuffer.GetLength() ); // FIXME!!! for over INT_MAX blocks
14807 
14808 		const DWORD * pSrcRow = pSrcIndex->m_tAttr.GetWritePtr(); // they *can* be null if the respective index is empty
14809 		const DWORD * pDstRow = pDstIndex->m_tAttr.GetWritePtr();
14810 
14811 		int64_t iSrcCount = 0;
14812 		int64_t iDstCount = 0;
14813 
14814 		int iKillListIdx = 0;
14815 
14816 		CSphMatch tMatch;
14817 		while ( iSrcCount < pSrcIndex->m_iDocinfo || iDstCount < pDstIndex->m_iDocinfo )
14818 		{
14819 			if ( *pGlobalStop || *pLocalStop )
14820 				return false;
14821 
14822 			SphDocID_t iDstDocID, iSrcDocID;
14823 
14824 			if ( iDstCount < pDstIndex->m_iDocinfo )
14825 			{
14826 				iDstDocID = DOCINFO2ID ( pDstRow );
14827 
14828 				// kill list filter goes first
14829 				while ( dKillList [ iKillListIdx ]<iDstDocID )
14830 					iKillListIdx++;
14831 				if ( dKillList [ iKillListIdx ]==iDstDocID )
14832 				{
14833 					pDstRow += iStride;
14834 					iDstCount++;
14835 					continue;
14836 				}
14837 
14838 				if ( pFilter )
14839 				{
14840 					tMatch.m_uDocID = iDstDocID;
14841 					tMatch.m_pStatic = DOCINFO2ATTRS ( pDstRow );
14842 					tMatch.m_pDynamic = NULL;
14843 					if ( !pFilter->Eval ( tMatch ) )
14844 					{
14845 						pDstRow += iStride;
14846 						iDstCount++;
14847 						continue;
14848 					}
14849 				}
14850 			} else
14851 				iDstDocID = 0;
14852 
14853 			if ( iSrcCount < pSrcIndex->m_iDocinfo )
14854 				iSrcDocID = DOCINFO2ID ( pSrcRow );
14855 			else
14856 				iSrcDocID = 0;
14857 
14858 			if ( ( iDstDocID && iDstDocID < iSrcDocID ) || ( iDstDocID && !iSrcDocID ) )
14859 			{
14860 				Verify ( tMinMax.Collect ( pDstRow, pDstIndex->m_tMva.GetWritePtr(), pDstIndex->m_tMva.GetNumEntries(), sError, true ) );
14861 
14862 				if ( dMvaLocators.GetLength() || dStringLocators.GetLength() )
14863 				{
14864 					memcpy ( dRow.Begin(), pDstRow, iStride * sizeof ( CSphRowitem ) );
14865 					CopyRowMVA ( pDstIndex->m_tMva.GetWritePtr(), dMvaLocators, iDstDocID, dRow.Begin(), tSPMWriter );
14866 					CopyRowString ( pDstIndex->m_tString.GetWritePtr(), dStringLocators, dRow.Begin(), tSPSWriter );
14867 					wrRows.PutBytes ( dRow.Begin(), sizeof(DWORD)*iStride );
14868 				} else
14869 				{
14870 					wrRows.PutBytes ( pDstRow, sizeof(DWORD)*iStride );
14871 				}
14872 
14873 				tBuildHeader.m_iMinMaxIndex += iStride;
14874 				pDstRow += iStride;
14875 				iDstCount++;
14876 				iTotalDocuments++;
14877 				if ( bNeedInfinum )
14878 				{
14879 					bNeedInfinum = false;
14880 					uMergeInfinum = iDstDocID - 1;
14881 				}
14882 
14883 			} else if ( iSrcDocID )
14884 			{
14885 				Verify ( tMinMax.Collect ( pSrcRow, pSrcIndex->m_tMva.GetWritePtr(), pSrcIndex->m_tMva.GetNumEntries(), sError, true ) );
14886 
14887 				if ( dMvaLocators.GetLength() || dStringLocators.GetLength() )
14888 				{
14889 					memcpy ( dRow.Begin(), pSrcRow, iStride * sizeof ( CSphRowitem ) );
14890 					CopyRowMVA ( pSrcIndex->m_tMva.GetWritePtr(), dMvaLocators, iSrcDocID, dRow.Begin(), tSPMWriter );
14891 					CopyRowString ( pSrcIndex->m_tString.GetWritePtr(), dStringLocators, dRow.Begin(), tSPSWriter );
14892 					wrRows.PutBytes ( dRow.Begin(), sizeof(DWORD)*iStride );
14893 				} else
14894 				{
14895 					wrRows.PutBytes ( pSrcRow, sizeof(DWORD)*iStride );
14896 				}
14897 
14898 				tBuildHeader.m_iMinMaxIndex += iStride;
14899 				pSrcRow += iStride;
14900 				iSrcCount++;
14901 				iTotalDocuments++;
14902 				if ( bNeedInfinum )
14903 				{
14904 					bNeedInfinum = false;
14905 					uMergeInfinum = iSrcDocID - 1;
14906 				}
14907 
14908 				if ( iDstDocID==iSrcDocID )
14909 				{
14910 					dPhantomKiller.Add ( iSrcDocID );
14911 					pDstRow += iStride;
14912 					iDstCount++;
14913 				}
14914 			}
14915 		}
14916 
14917 		if ( iTotalDocuments )
14918 		{
14919 			tMinMax.FinishCollect();
14920 			iMinMaxSize = tMinMax.GetActualSize() * sizeof(DWORD);
14921 			wrRows.PutBytes ( dMinMaxBuffer.Begin(), iMinMaxSize );
14922 		}
14923 		wrRows.CloseFile();
14924 		if ( wrRows.IsError() )
14925 			return false;
14926 
14927 	} else if ( pDstIndex->m_bIsEmpty || pSrcIndex->m_bIsEmpty )
14928 	{
14929 		// one of the indexes has no documents; copy the .spa file from the other one
14930 		CSphString sSrc = !pDstIndex->m_bIsEmpty ? pDstIndex->GetIndexFileName("spa") : pSrcIndex->GetIndexFileName("spa");
14931 		CSphString sDst = pDstIndex->GetIndexFileName("tmp.spa");
14932 
14933 		if ( !CopyFile ( sSrc.cstr(), sDst.cstr(), sError, pThrottle, pGlobalStop, pLocalStop ) )
14934 			return false;
14935 
14936 	} else
14937 	{
14938 		// storage is not extern; create dummy .spa file
14939 		CSphAutofile fdSpa ( pDstIndex->GetIndexFileName("tmp.spa"), SPH_O_NEW, sError );
14940 		fdSpa.Close();
14941 	}
14942 
14943 	if ( !CheckDocsCount ( iTotalDocuments, sError ) )
14944 		return false;
14945 
14946 	if ( tSPSWriter.GetPos()>SphOffset_t( U64C(1)<<32 ) )
14947 	{
14948 		sError.SetSprintf ( "resulting .sps file is over 4 GB" );
14949 		return false;
14950 	}
14951 
14952 	if ( tSPMWriter.GetPos()>SphOffset_t( U64C(4)<<32 ) )
14953 	{
14954 		sError.SetSprintf ( "resulting .spm file is over 16 GB" );
14955 		return false;
14956 	}
14957 
14958 	int iOldLen = dPhantomKiller.GetLength();
14959 	int iKillLen = dKillList.GetLength();
14960 	dPhantomKiller.Resize ( iOldLen+iKillLen );
14961 	memcpy ( dPhantomKiller.Begin()+iOldLen, dKillList.Begin(), sizeof(SphDocID_t)*iKillLen );
14962 	dPhantomKiller.Uniq();
14963 
14964 	CSphAutofile tTmpDict ( pDstIndex->GetIndexFileName("tmp8.spi"), SPH_O_NEW, sError, true );
14965 	CSphAutofile tDict ( pDstIndex->GetIndexFileName("tmp.spi"), SPH_O_NEW, sError );
14966 
14967 	if ( !sError.IsEmpty() || tTmpDict.GetFD()<0 || tDict.GetFD()<0 || *pGlobalStop || *pLocalStop )
14968 		return false;
14969 
14970 	CSphScopedPtr<CSphDict> pDict ( pDstIndex->m_pDict->Clone() );
14971 
14972 	int iHitBufferSize = 8 * 1024 * 1024;
14973 	CSphVector<SphWordID_t> dDummy;
14974 	CSphHitBuilder tHitBuilder ( pDstIndex->m_tSettings, dDummy, true, iHitBufferSize, pDict.Ptr(), &sError );
14975 	tHitBuilder.SetThrottle ( pThrottle );
14976 
14977 	CSphFixedVector<CSphRowitem> dMinRow ( pDstIndex->m_dMinRow.GetLength() );
14978 	memcpy ( dMinRow.Begin(), pDstIndex->m_dMinRow.Begin(), sizeof(CSphRowitem)*dMinRow.GetLength() );
14979 	// correct infinum might be already set during spa merging.
14980 	SphDocID_t uMinDocid = ( !uMergeInfinum ) ? Min ( pDstIndex->m_uMinDocid, pSrcIndex->m_uMinDocid ) : uMergeInfinum;
14981 	tBuildHeader.m_uMinDocid = uMinDocid;
14982 	tBuildHeader.m_pMinRow = dMinRow.Begin();
14983 
14984 	// FIXME? is this magic dict block constant any good?..
14985 	pDict->DictBegin ( tTmpDict, tDict, iHitBufferSize, pThrottle );
14986 
14987 	// merge dictionaries, doclists and hitlists
14988 	if ( pDict->GetSettings().m_bWordDict )
14989 	{
14990 		WITH_QWORD ( pDstIndex, false, QwordDst,
14991 			WITH_QWORD ( pSrcIndex, false, QwordSrc,
14992 		{
14993 			if ( !CSphIndex_VLN::MergeWords < QwordDst, QwordSrc > ( pDstIndex, pSrcIndex, pFilter, dPhantomKiller,
14994 																	uMinDocid, &tHitBuilder, sError, tBuildHeader,
14995 																	tProgress, pThrottle, pGlobalStop, pLocalStop ) )
14996 				return false;
14997 		} ) );
14998 	} else
14999 	{
15000 		WITH_QWORD ( pDstIndex, true, QwordDst,
15001 			WITH_QWORD ( pSrcIndex, true, QwordSrc,
15002 		{
15003 			if ( !CSphIndex_VLN::MergeWords < QwordDst, QwordSrc > ( pDstIndex, pSrcIndex, pFilter, dPhantomKiller
15004 																	, uMinDocid, &tHitBuilder, sError, tBuildHeader,
15005 																	tProgress,	pThrottle, pGlobalStop, pLocalStop ) )
15006 				return false;
15007 		} ) );
15008 	}
15009 
15010 	if ( iTotalDocuments )
15011 		tBuildHeader.m_iTotalDocuments = iTotalDocuments;
15012 
15013 	// merge kill-lists
15014 	CSphAutofile tKillList ( pDstIndex->GetIndexFileName("tmp.spk"), SPH_O_NEW, sError );
15015 	if ( tKillList.GetFD () < 0 )
15016 		return false;
15017 
15018 	if ( bMergeKillLists )
15019 	{
15020 		// merge spk
15021 		CSphVector<SphDocID_t> dKillList;
15022 		dKillList.Reserve ( pDstIndex->GetKillListSize()+pSrcIndex->GetKillListSize() );
15023 		for ( int i=0; i<pSrcIndex->GetKillListSize(); i++ ) dKillList.Add ( pSrcIndex->GetKillList()[i] );
15024 		for ( int i=0; i<pDstIndex->GetKillListSize(); i++ ) dKillList.Add ( pDstIndex->GetKillList()[i] );
15025 		dKillList.Uniq ();
15026 
15027 		tBuildHeader.m_uKillListSize = dKillList.GetLength ();
15028 
15029 		if ( *pGlobalStop || *pLocalStop )
15030 			return false;
15031 
15032 		if ( dKillList.GetLength() )
15033 		{
15034 			if ( !sphWriteThrottled ( tKillList.GetFD(), &dKillList[0], dKillList.GetLength()*sizeof(SphDocID_t), "kill_list", sError, pThrottle ) )
15035 				return false;
15036 		}
15037 	}
15038 
15039 	tKillList.Close ();
15040 
15041 	if ( *pGlobalStop || *pLocalStop )
15042 		return false;
15043 
15044 	// finalize
15045 	CSphAggregateHit tFlush;
15046 	tFlush.m_uDocID = 0;
15047 	tFlush.m_uWordID = 0;
15048 	tFlush.m_sKeyword = (BYTE*)""; // tricky: assertion in cidxHit calls strcmp on this in case of empty index!
15049 	tFlush.m_iWordPos = EMPTY_HIT;
15050 	tFlush.m_dFieldMask.UnsetAll();
15051 	tHitBuilder.cidxHit ( &tFlush, NULL );
15052 
15053 	if ( !tHitBuilder.cidxDone ( iHitBufferSize, pDstIndex->m_tSettings.m_iMinInfixLen,
15054 								pDstIndex->m_pTokenizer->GetMaxCodepointLength(), &tBuildHeader ) )
15055 		return false;
15056 
15057 	tBuildHeader.m_sHeaderExtension = "tmp.sph";
15058 	tBuildHeader.m_pThrottle = pThrottle;
15059 
15060 	pDstIndex->BuildDone ( tBuildHeader, sError ); // FIXME? is this magic dict block constant any good?..
15061 
15062 	// we're done
15063 	tProgress.Show ( true );
15064 
15065 	return true;
15066 }
15067 
15068 
sphMerge(const CSphIndex * pDst,const CSphIndex * pSrc,const CSphVector<SphDocID_t> & dKillList,CSphString & sError,CSphIndexProgress & tProgress,ThrottleState_t * pThrottle,volatile bool * pGlobalStop,volatile bool * pLocalStop)15069 bool sphMerge ( const CSphIndex * pDst, const CSphIndex * pSrc, const CSphVector<SphDocID_t> & dKillList,
15070 				CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle,
15071 				volatile bool * pGlobalStop, volatile bool * pLocalStop )
15072 {
15073 	const CSphIndex_VLN * pDstIndex = (const CSphIndex_VLN *)pDst;
15074 	const CSphIndex_VLN * pSrcIndex = (const CSphIndex_VLN *)pSrc;
15075 
15076 	return CSphIndex_VLN::DoMerge ( pDstIndex, pSrcIndex, false, NULL, dKillList, sError, tProgress, pThrottle, pGlobalStop, pLocalStop );
15077 }
15078 
15079 
15080 /////////////////////////////////////////////////////////////////////////////
15081 // THE SEARCHER
15082 /////////////////////////////////////////////////////////////////////////////
15083 
/// base-class stub; concrete dictionary wrappers are expected to override this
SphWordID_t CSphDictTraits::GetWordID ( BYTE * )
{
	assert ( 0 && "not implemented" );
	return 0;
}
15089 
15090 
/// map a keyword to its wordid, treating a trailing star as a prefix wildcard
/// (pre-v.8 star syntax); non-starred words get the tail magic marker appended
SphWordID_t CSphDictStar::GetWordID ( BYTE * pWord )
{
	char sBuf [ 16+3*SPH_MAX_WORD_LEN ];
	assert ( strlen ( (const char*)pWord ) < 16+3*SPH_MAX_WORD_LEN );

	// with stopwords_unstemmed, stopwords are checked before stemming
	if ( m_pDict->GetSettings().m_bStopwordsUnstemmed && m_pDict->IsStopWord ( pWord ) )
		return 0;

	m_pDict->ApplyStemmers ( pWord );

	int iLen = strlen ( (const char*)pWord );
	assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 1 );
	// stemmer might squeeze out the word
	// NOTE(review): iLen is computed from pWord *after* stemming, so "iLen && !pWord[0]"
	// can never be true (iLen==0 whenever pWord[0]=='\0'); looks like dead code - confirm intent
	if ( iLen && !pWord[0] )
		return 0;

	memcpy ( sBuf, pWord, iLen+1 ); // copies the trailing zero too

	if ( iLen )
	{
		if ( sBuf[iLen-1]=='*' )
		{
			// trailing star: prefix search request; just chop the star away
			iLen--;
			sBuf[iLen] = '\0';
		} else
		{
			// no star: exact word; append the tail magic marker
			sBuf[iLen] = MAGIC_WORD_TAIL;
			iLen++;
			sBuf[iLen] = '\0';
		}
	}

	return m_pDict->GetWordID ( (BYTE*)sBuf, iLen, !m_pDict->GetSettings().m_bStopwordsUnstemmed );
}
15125 
15126 
/// pass-through to the wrapped dictionary; no star mangling for non-stemmed lookups
SphWordID_t	CSphDictStar::GetWordIDNonStemmed ( BYTE * pWord )
{
	return m_pDict->GetWordIDNonStemmed ( pWord );
}
15131 
15132 
15133 //////////////////////////////////////////////////////////////////////////
15134 
/// star dictionary for index format v.8+;
/// flags record which wildcard modes (prefix/infix) the index was built with
CSphDictStarV8::CSphDictStarV8 ( CSphDict * pDict, bool bPrefixes, bool bInfixes )
	: CSphDictStar	( pDict )
	, m_bPrefixes	( bPrefixes )
	, m_bInfixes	( bInfixes )
{
}
15141 
15142 
/// v.8+ star dictionary: mangle a (possibly wildcarded) keyword according to
/// the index prefix/infix settings, then return the wordid of the mangled form
SphWordID_t	CSphDictStarV8::GetWordID ( BYTE * pWord )
{
	char sBuf [ 16+3*SPH_MAX_WORD_LEN ];

	int iLen = strlen ( (const char*)pWord );
	iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 ); // clamp to buffer capacity

	if ( !iLen )
		return 0;

	bool bHeadStar = ( pWord[0]=='*' );
	bool bTailStar = ( pWord[iLen-1]=='*' ) && ( iLen>1 );
	bool bMagic = ( pWord[0]<' ' ); // control-char prefix marks internal MAGIC_* keywords

	if ( !bHeadStar && !bTailStar && !bMagic )
	{
		// plain keyword: apply the normal stopword/stemming pipeline
		if ( m_pDict->GetSettings().m_bStopwordsUnstemmed && IsStopWord ( pWord ) )
			return 0;

		m_pDict->ApplyStemmers ( pWord );

		// stemmer might squeeze out the word
		if ( !pWord[0] )
			return 0;

		if ( !m_pDict->GetSettings().m_bStopwordsUnstemmed && IsStopWord ( pWord ) )
			return 0;
	}

	iLen = strlen ( (const char*)pWord ); // re-measure; stemming may have shortened the word
	assert ( iLen < 16+3*SPH_MAX_WORD_LEN - 2 );

	// nothing left, or a lone "*": no word to look up
	if ( !iLen || ( bHeadStar && iLen==1 ) )
		return 0;

	if ( bMagic ) // pass throu MAGIC_* words
	{
		memcpy ( sBuf, pWord, iLen );
		sBuf[iLen] = '\0';

	} else if ( m_bInfixes )
	{
		////////////////////////////////////
		// infix or mixed infix+prefix mode
		////////////////////////////////////

		// handle head star
		if ( bHeadStar )
		{
			memcpy ( sBuf, pWord+1, iLen-- ); // chops star, copies trailing zero, updates iLen
		} else
		{
			sBuf[0] = MAGIC_WORD_HEAD;
			memcpy ( sBuf+1, pWord, ++iLen ); // copies everything incl trailing zero, updates iLen
		}

		// handle tail star
		if ( bTailStar )
		{
			sBuf[--iLen] = '\0'; // got star, just chop it away
		} else
		{
			sBuf[iLen] = MAGIC_WORD_TAIL; // no star, add tail marker
			sBuf[++iLen] = '\0';
		}

	} else
	{
		////////////////////
		// prefix-only mode
		////////////////////

		assert ( m_bPrefixes );

		// always ignore head star in prefix mode
		if ( bHeadStar )
		{
			pWord++;
			iLen--;
		}

		// handle tail star
		if ( !bTailStar )
		{
			// exact word search request, always (ie. both in infix/prefix mode) mangles to "\1word\1" in v.8+
			sBuf[0] = MAGIC_WORD_HEAD;
			memcpy ( sBuf+1, pWord, iLen );
			sBuf[iLen+1] = MAGIC_WORD_TAIL;
			sBuf[iLen+2] = '\0';
			iLen += 2;

		} else
		{
			// prefix search request, mangles to word itself (just chop away the star)
			memcpy ( sBuf, pWord, iLen );
			sBuf[--iLen] = '\0';
		}
	}

	// calc id for mangled word
	return m_pDict->GetWordID ( (BYTE*)sBuf, iLen, !bHeadStar && !bTailStar );
}
15245 
15246 //////////////////////////////////////////////////////////////////////////
15247 
GetWordID(BYTE * pWord)15248 SphWordID_t CSphDictExact::GetWordID ( BYTE * pWord )
15249 {
15250 	int iLen = strlen ( (const char*)pWord );
15251 	iLen = Min ( iLen, 16+3*SPH_MAX_WORD_LEN - 1 );
15252 
15253 	if ( !iLen )
15254 		return 0;
15255 
15256 	if ( pWord[0]=='=' )
15257 		pWord[0] = MAGIC_WORD_HEAD_NONSTEMMED;
15258 
15259 	if ( pWord[0]<' ' )
15260 		return m_pDict->GetWordIDNonStemmed ( pWord );
15261 
15262 	return m_pDict->GetWordID ( pWord );
15263 }
15264 
15265 
15266 /////////////////////////////////////////////////////////////////////////////
15267 
sphGroupMatch(SphAttr_t iGroup,const SphAttr_t * pGroups,int iGroups)15268 inline bool sphGroupMatch ( SphAttr_t iGroup, const SphAttr_t * pGroups, int iGroups )
15269 {
15270 	if ( !pGroups ) return true;
15271 	const SphAttr_t * pA = pGroups;
15272 	const SphAttr_t * pB = pGroups+iGroups-1;
15273 	if ( iGroup==*pA || iGroup==*pB ) return true;
15274 	if ( iGroup<(*pA) || iGroup>(*pB) ) return false;
15275 
15276 	while ( pB-pA>1 )
15277 	{
15278 		const SphAttr_t * pM = pA + ((pB-pA)/2);
15279 		if ( iGroup==(*pM) )
15280 			return true;
15281 		if ( iGroup<(*pM) )
15282 			pB = pM;
15283 		else
15284 			pA = pM;
15285 	}
15286 	return false;
15287 }
15288 
15289 
/// early-reject hook: optionally resolve the match's attribute row,
/// run the filter calc expressions, and evaluate the filter chain;
/// returns true when the match must be rejected
bool CSphIndex_VLN::EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const
{
	// might be needed even when we do not have a filter
	if ( pCtx->m_bLookupFilter )
	{
		const CSphRowitem * pRow = FindDocinfo ( tMatch.m_uDocID );
		if ( !pRow && m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
		{
			// docid missing from attribute storage; count it and reject
			pCtx->m_iBadRows++;
			return true;
		}
		CopyDocinfo ( pCtx, tMatch, pRow );
	}
	pCtx->CalcFilter ( tMatch ); // FIXME!!! leak of filtered STRING_PTR

	// no filter installed means nothing gets rejected here
	return pCtx->m_pFilter ? !pCtx->m_pFilter->Eval ( tMatch ) : false;
}
15307 
15308 
/// raw pointer to this index's in-memory kill-list buffer
SphDocID_t * CSphIndex_VLN::GetKillList () const
{
	return m_pKillList.GetWritePtr ();
}
15313 
15314 
/// build a flat, docinfo-ordered list of all docids in this index;
/// on success *ppDocList is a new[]-allocated array (caller owns it) and *pCount its length
bool CSphIndex_VLN::BuildDocList ( SphAttr_t ** ppDocList, int64_t * pCount, CSphString * pError ) const
{
	assert ( ppDocList && pCount && pError );
	*ppDocList = NULL;
	*pCount = 0;
	// empty index: succeed with an empty result
	if ( !m_iDocinfo )
		return true;

	// new[] might fail on 32bit here
	// (the size_t round-trip detects a doc count that does not fit into size_t)
	int64_t iSizeMax = (size_t)m_iDocinfo;
	if ( iSizeMax!=m_iDocinfo )
	{
		pError->SetSprintf ( "doc-list build size_t overflow (docs count=" INT64_FMT ", size max=" INT64_FMT ")", m_iDocinfo, iSizeMax );
		return false;
	}

	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	SphAttr_t * pDst = new SphAttr_t [(size_t)m_iDocinfo];
	*ppDocList = pDst;
	*pCount = m_iDocinfo;

	// walk all attribute rows, extracting just the docid from each
	const CSphRowitem * pRow = m_tAttr.GetWritePtr();
	const CSphRowitem * pEnd = m_tAttr.GetWritePtr() + m_iDocinfo*iStride;
	while ( pRow<pEnd )
	{
		*pDst++ = DOCINFO2ID ( pRow );
		pRow += iStride;
	}

	return true;
}
15346 
/// overwrite this index's kill-list both on disk (.spk) and in memory,
/// then rewrite the header so the new kill-list size is persisted
bool CSphIndex_VLN::ReplaceKillList ( const SphDocID_t * pKillist, int iCount )
{
	// dump killlist
	CSphAutofile tKillList ( GetIndexFileName("spk"), SPH_O_NEW, m_sLastError );
	if ( tKillList.GetFD()<0 )
		return false;

	if ( !sphWriteThrottled ( tKillList.GetFD(), pKillist, iCount*sizeof(SphDocID_t), "kill list", m_sLastError, &g_tThrottle ) )
		return false;

	tKillList.Close ();

	// rebuild the header, carrying over current stats/dict state plus the new kill-list size
	BuildHeader_t tBuildHeader ( m_tStats );
	(DictHeader_t &)tBuildHeader = (DictHeader_t)m_tWordlist;
	tBuildHeader.m_sHeaderExtension = "sph";
	tBuildHeader.m_pThrottle = &g_tThrottle;
	tBuildHeader.m_uMinDocid = m_uMinDocid;
	tBuildHeader.m_uKillListSize = iCount;
	tBuildHeader.m_iMinMaxIndex = m_iMinMaxIndex;

	if ( !BuildDone ( tBuildHeader, m_sLastError ) )
		return false;

	// swap in the new in-memory copy
	m_pKillList.Reset ();
	m_uKillListSize = 0;
	if ( iCount )
	{
		if ( !m_pKillList.Alloc ( iCount, m_sLastError, m_sLastWarning ) )
			return false;

		memcpy ( m_pKillList.GetWritePtr(), pKillist, sizeof(SphDocID_t)*iCount );
		m_uKillListSize = iCount;
	}

	return true;
}
15383 
15384 
/// check whether a docid has an attribute row in this index
bool CSphIndex_VLN::HasDocid ( SphDocID_t uDocid ) const
{
	return FindDocinfo ( uDocid )!=NULL;
}
15389 
15390 
/// locate the attribute row for a docid using binary search over the
/// id-sorted docinfo pool, optionally narrowed by the docinfo hash table;
/// returns NULL when the docid is not present
const DWORD * CSphIndex_VLN::FindDocinfo ( SphDocID_t uDocID ) const
{
	if ( m_iDocinfo<=0 )
		return NULL;

	assert ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN );
	assert ( !m_tAttr.IsEmpty() );
	assert ( m_tSchema.GetAttrsCount() );

	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
	int64_t iStart = 0;
	int64_t iEnd = m_iDocinfo-1;

// helpers to address the i-th row and its docid within the flat pool
#define LOC_ROW(_index) &m_tAttr [ _index*iStride ]
#define LOC_ID(_index) DOCINFO2ID(LOC_ROW(_index))

	// when the hash table is present, narrow the search range to one bucket;
	// slot 0 holds the shift amount, bucket boundaries start at slot 1
	if ( m_pDocinfoHash.GetLengthBytes() )
	{
		SphDocID_t uFirst = LOC_ID(0);
		SphDocID_t uLast = LOC_ID(iEnd);
		if ( uDocID<uFirst || uDocID>uLast )
			return NULL;

		int64_t iHash = ( ( uDocID - uFirst ) >> m_pDocinfoHash[0] );
		if ( iHash > ( 1 << DOCINFO_HASH_BITS ) ) // possible in case of broken data, for instance
			return NULL;

		iStart = m_pDocinfoHash [ iHash+1 ];
		iEnd = m_pDocinfoHash [ iHash+2 ] - 1;
	}

	// check the range endpoints before bisecting
	if ( uDocID==LOC_ID(iStart) )
		return LOC_ROW(iStart);

	if ( uDocID==LOC_ID(iEnd) )
		return LOC_ROW(iEnd);

	while ( iEnd-iStart>1 )
	{
		// check if nothing found
		if ( uDocID<LOC_ID(iStart) || uDocID>LOC_ID(iEnd) )
			return NULL;
		assert ( uDocID > LOC_ID(iStart) );
		assert ( uDocID < LOC_ID(iEnd) );

		int64_t iMid = iStart + (iEnd-iStart)/2;
		if ( uDocID==LOC_ID(iMid) )
			return LOC_ROW(iMid);
		else if ( uDocID<LOC_ID(iMid) )
			iEnd = iMid;
		else
			iStart = iMid;
	}

#undef LOC_ID
#undef LOC_ROW

	return NULL;
}
15450 
/// attach a found attribute row to a match, applying per-query attribute overrides
void CSphIndex_VLN::CopyDocinfo ( const CSphQueryContext * pCtx, CSphMatch & tMatch, const DWORD * pFound ) const
{
	if ( !pFound )
		return;

	// setup static pointer
	assert ( DOCINFO2ID(pFound)==tMatch.m_uDocID );
	tMatch.m_pStatic = DOCINFO2ATTRS(pFound);

	// patch if necessary
	if ( pCtx->m_pOverrides )
		ARRAY_FOREACH ( i, (*pCtx->m_pOverrides) )
		{
			const CSphAttrOverride & tOverride = (*pCtx->m_pOverrides)[i]; // shortcut
			const CSphAttrOverride::IdValuePair_t * pEntry = tOverride.m_dValues.BinarySearch (
				bind ( &CSphAttrOverride::IdValuePair_t::m_uDocID ), tMatch.m_uDocID );
			// use the overridden value when this docid has one, else the stored attribute
			tMatch.SetAttr ( pCtx->m_dOverrideOut[i], pEntry
							? pEntry->m_uValue
							: sphGetRowAttr ( tMatch.m_pStatic, pCtx->m_dOverrideIn[i] ) );
		}
}
15472 
15473 
CalcContextItems(CSphMatch & tMatch,const CSphVector<CSphQueryContext::CalcItem_t> & dItems)15474 static inline void CalcContextItems ( CSphMatch & tMatch, const CSphVector<CSphQueryContext::CalcItem_t> & dItems )
15475 {
15476 	ARRAY_FOREACH ( i, dItems )
15477 	{
15478 		const CSphQueryContext::CalcItem_t & tCalc = dItems[i];
15479 		if ( tCalc.m_eType==SPH_ATTR_INTEGER )
15480 			tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->IntEval(tMatch) );
15481 		else if ( tCalc.m_eType==SPH_ATTR_BIGINT || tCalc.m_eType==SPH_ATTR_JSON_FIELD )
15482 			tMatch.SetAttr ( tCalc.m_tLoc, tCalc.m_pExpr->Int64Eval(tMatch) );
15483 		else if ( tCalc.m_eType==SPH_ATTR_STRINGPTR )
15484 		{
15485 			const BYTE * pStr = NULL;
15486 			tCalc.m_pExpr->StringEval ( tMatch, &pStr );
15487 			tMatch.SetAttr ( tCalc.m_tLoc, (SphAttr_t) pStr ); // FIXME! a potential leak of *previous* value?
15488 		} else if ( tCalc.m_eType==SPH_ATTR_FACTORS || tCalc.m_eType==SPH_ATTR_FACTORS_JSON )
15489 			tMatch.SetAttr ( tCalc.m_tLoc, (SphAttr_t)tCalc.m_pExpr->FactorEval(tMatch) );
15490 		else
15491 			tMatch.SetAttrFloat ( tCalc.m_tLoc, tCalc.m_pExpr->Eval(tMatch) );
15492 	}
15493 }
15494 
/// evaluate filter-stage calc expressions for a match
void CSphQueryContext::CalcFilter ( CSphMatch & tMatch ) const
{
	CalcContextItems ( tMatch, m_dCalcFilter );
}
15499 
15500 
/// evaluate sort-stage calc expressions for a match
void CSphQueryContext::CalcSort ( CSphMatch & tMatch ) const
{
	CalcContextItems ( tMatch, m_dCalcSort );
}
15505 
15506 
/// evaluate final-stage calc expressions for a match
void CSphQueryContext::CalcFinal ( CSphMatch & tMatch ) const
{
	CalcContextItems ( tMatch, m_dCalcFinal );
}
15511 
/// evaluate post-aggregate calc expressions for a match
void CSphQueryContext::CalcPostAggregate ( CSphMatch & tMatch ) const
{
	CalcContextItems ( tMatch, m_dCalcPostAggregate );
}
15516 
/// release per-match allocations made by calc expressions
/// (string results and packed-factor blobs stored in the match's dynamic row)
static inline void FreeStrItems ( CSphMatch & tMatch, const CSphVector<CSphQueryContext::CalcItem_t> & dItems )
{
	// no dynamic row means nothing was ever stored for this match
	if ( !tMatch.m_pDynamic )
		return;

	ARRAY_FOREACH ( i, dItems )
	{
		const CSphQueryContext::CalcItem_t & tCalc = dItems[i];
		switch ( tCalc.m_eType )
		{
		case SPH_ATTR_STRINGPTR:
			{
				// adopt the stored pointer into a temporary CSphString;
				// its destructor frees the buffer as the scope ends
				CSphString sStr;
				sStr.Adopt ( (char**) (tMatch.m_pDynamic+tCalc.m_tLoc.m_iBitOffset/ROWITEM_BITS));
			}
			break;

		case SPH_ATTR_FACTORS:
		case SPH_ATTR_FACTORS_JSON:
			{
				// packed factors are plain new[]-allocated blobs; free and clear the slot
				BYTE * pData = (BYTE *)tMatch.GetAttr ( tCalc.m_tLoc );
				delete [] pData;
				tMatch.SetAttr ( tCalc.m_tLoc, 0 );
			}
			break;
		default:
			break;
		}
	}
}
15547 
/// free string/factor allocations made by the filter-stage expressions
void CSphQueryContext::FreeStrFilter ( CSphMatch & tMatch ) const
{
	FreeStrItems ( tMatch, m_dCalcFilter );
}
15552 
15553 
/// free string/factor allocations made by the sort-stage expressions
void CSphQueryContext::FreeStrSort ( CSphMatch & tMatch ) const
{
	FreeStrItems ( tMatch, m_dCalcSort );
}
15558 
15559 
/// free string/factor allocations made by the final-stage expressions
void CSphQueryContext::FreeStrFinal ( CSphMatch & tMatch ) const
{
	FreeStrItems ( tMatch, m_dCalcFinal );
}
15564 
15565 
/// broadcast a command to every expression in the filter/sort/final calc lists
/// NOTE(review): m_dCalcPostAggregate expressions are not forwarded here - confirm that is intentional
void CSphQueryContext::ExprCommand ( ESphExprCommand eCmd, void * pArg )
{
	ARRAY_FOREACH ( i, m_dCalcFilter )
		m_dCalcFilter[i].m_pExpr->Command ( eCmd, pArg );
	ARRAY_FOREACH ( i, m_dCalcSort )
		m_dCalcSort[i].m_pExpr->Command ( eCmd, pArg );
	ARRAY_FOREACH ( i, m_dCalcFinal )
		m_dCalcFinal[i].m_pExpr->Command ( eCmd, pArg );
}
15575 
15576 
/// point all expressions and filters at a (possibly new) string attribute pool
void CSphQueryContext::SetStringPool ( const BYTE * pStrings )
{
	ExprCommand ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
	if ( m_pFilter )
		m_pFilter->SetStringStorage ( pStrings );
	if ( m_pWeightFilter )
		m_pWeightFilter->SetStringStorage ( pStrings );
}
15585 
15586 
/// point all expressions and filters at a (possibly new) MVA attribute pool
void CSphQueryContext::SetMVAPool ( const DWORD * pMva, bool bArenaProhibit )
{
	PoolPtrs_t tMva;
	tMva.m_pMva = pMva;
	tMva.m_bArenaProhibit = bArenaProhibit;
	ExprCommand ( SPH_EXPR_SET_MVA_POOL, &tMva );
	if ( m_pFilter )
		m_pFilter->SetMVAStorage ( pMva, bArenaProhibit );
	if ( m_pWeightFilter )
		m_pWeightFilter->SetMVAStorage ( pMva, bArenaProhibit );
}
15598 
15599 
15600 /// FIXME, perhaps
15601 /// this rather crappy helper class really serves exactly 1 (one) simple purpose
15602 ///
15603 /// it passes a sorting queue internals (namely, weight and float sortkey, if any,
15604 /// of the current-worst queue element) to the MIN_TOP_WORST() and MIN_TOP_SORTVAL()
15605 /// expression classes that expose those to the cruel outside world
15606 ///
15607 /// all the COM-like EXTRA_xxx message back and forth is needed because expressions
15608 /// are currently parsed and created earlier than the sorting queue
15609 ///
15610 /// that also is the reason why we mischievously return 0 instead of clearly failing
15611 /// with an error when the sortval is not a dynamic float; by the time we are parsing
15612 /// expressions, we do not *yet* know that; but by the time we create a sorting queue,
15613 /// we do not *want* to leak select expression checks into it
15614 ///
15615 /// alternatively, we probably want to refactor this and introduce Bind(), to parse
15616 /// expressions once, then bind them to actual searching contexts (aka index or segment,
15617 /// and ranker, and sorter, and whatever else might be referenced by the expressions)
struct ContextExtra : public ISphExtra
{
	ISphRanker * m_pRanker;			// fallback handler for messages this struct does not service
	ISphMatchSorter * m_pSorter;	// sorting queue whose worst-element data gets exposed

	virtual bool ExtraDataImpl ( ExtraData_e eData, void ** ppArg )
	{
		if ( eData==EXTRA_GET_QUEUE_WORST || eData==EXTRA_GET_QUEUE_SORTVAL )
		{
			// no sorter, or an empty queue: nothing to expose
			if ( !m_pSorter )
				return false;
			const CSphMatch * pWorst = m_pSorter->GetWorst();
			if ( !pWorst )
				return false;
			if ( eData==EXTRA_GET_QUEUE_WORST )
			{
				// hand out the current-worst queue element as-is
				*ppArg = (void*)pWorst;
				return true;
			} else
			{
				assert ( eData==EXTRA_GET_QUEUE_SORTVAL );
				// only expose the sort-key slot when it is exactly "ORDER BY float_expr":
				// a dynamic 32-bit float in keypart 0, docid as the only tiebreaker
				const CSphMatchComparatorState & tCmp = m_pSorter->GetState();
				if ( tCmp.m_eKeypart[0]==SPH_KEYPART_FLOAT && tCmp.m_tLocator[0].m_bDynamic
					&& tCmp.m_tLocator[0].m_iBitCount==32 && ( tCmp.m_tLocator[0].m_iBitOffset%32==0 )
					&& tCmp.m_eKeypart[1]==SPH_KEYPART_ID && tCmp.m_dAttrs[1]==-1 )
				{
					// report the rowitem index of the sort key
					*(int*)ppArg = tCmp.m_tLocator[0].m_iBitOffset/32;
					return true;
				} else
				{
					// min_top_sortval() only works with order by float_expr for now
					return false;
				}
			}
		}
		// everything else goes to the ranker
		return m_pRanker->ExtraData ( eData, ppArg );
	}
};
15656 
15657 
/// wire the ranker and sorter into the expressions via a temporary ContextExtra bridge
void CSphQueryContext::SetupExtraData ( ISphRanker * pRanker, ISphMatchSorter * pSorter )
{
	ContextExtra tExtra;
	tExtra.m_pRanker = pRanker;
	tExtra.m_pSorter = pSorter;
	ExprCommand ( SPH_EXPR_SET_EXTRA_DATA, &tExtra );
}
15665 
15666 
/// pump matches out of the ranker in batches, apply index weight and sort-stage
/// calcs, filter by weight, and push each surviving match into every sorter
void CSphIndex_VLN::MatchExtended ( CSphQueryContext * pCtx, const CSphQuery * pQuery, int iSorters, ISphMatchSorter ** ppSorters,
									ISphRanker * pRanker, int iTag, int iIndexWeight ) const
{
	CSphQueryProfile * pProfile = pCtx->m_pProfile;

	// cutoff<=0 means "no limit"; -1 never reaches zero when decremented
	int iCutoff = pQuery->m_iCutoff;
	if ( iCutoff<=0 )
		iCutoff = -1;

	// do searching
	CSphMatch * pMatch = pRanker->GetMatchesBuffer();
	for ( ;; )
	{
		// ranker does profile switches internally
		int iMatches = pRanker->GetMatches();
		if ( iMatches<=0 )
			break;

		if ( pProfile )
			pProfile->Switch ( SPH_QSTATE_SORT );
		for ( int i=0; i<iMatches; i++ )
		{
			if ( pCtx->m_bLookupSort )
			{
				// pull in the attribute row needed by the sorting expressions
				const CSphRowitem * pRow = FindDocinfo ( pMatch[i].m_uDocID );
				if ( !pRow && m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
				{
					// docid is not in attribute storage; count it and drop the match
					pCtx->m_iBadRows++;
					continue;
				}
				CopyDocinfo ( pCtx, pMatch[i], pRow );
			}

			pMatch[i].m_iWeight *= iIndexWeight;
			pCtx->CalcSort ( pMatch[i] );

			if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
			{
				// rejected by weight filter; release any sort-stage string allocations
				pCtx->FreeStrSort ( pMatch[i] );
				continue;
			}

			pMatch[i].m_iTag = iTag;

			bool bRand = false;
			bool bNewMatch = false;
			for ( int iSorter=0; iSorter<iSorters; iSorter++ )
			{
				// all non-random sorters are in the beginning,
				// so we can avoid the simple 'first-element' assertion
				if ( !bRand && ppSorters[iSorter]->m_bRandomize )
				{
					// first randomizing sorter: replace the weight with a random one, re-check the filter
					bRand = true;
					pMatch[i].m_iWeight = ( sphRand() & 0xffff ) * iIndexWeight;

					if ( pCtx->m_pWeightFilter && !pCtx->m_pWeightFilter->Eval ( pMatch[i] ) )
						break;
				}
				bNewMatch |= ppSorters[iSorter]->Push ( pMatch[i] );

				if ( pCtx->m_uPackedFactorFlags & SPH_FACTOR_ENABLE )
				{
					// tell the ranker which match got pushed/popped, for packedfactors() bookkeeping
					pRanker->ExtraData ( EXTRA_SET_MATCHPUSHED, (void**)&(ppSorters[iSorter]->m_iJustPushed) );
					pRanker->ExtraData ( EXTRA_SET_MATCHPOPPED, (void**)&(ppSorters[iSorter]->m_dJustPopped) );
				}
			}
			pCtx->FreeStrSort ( pMatch[i] );

			// count accepted matches toward the cutoff
			if ( bNewMatch )
				if ( --iCutoff==0 )
					break;
		}

		if ( iCutoff==0 )
			break;
	}

	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_UNKNOWN );
}
15747 
15748 //////////////////////////////////////////////////////////////////////////
15749 
15750 
/// final-stage match processor: pulls in docinfo (when an index is supplied),
/// runs the final calc expressions once per match, and stamps each match with a tag
struct SphFinalMatchCalc_t : ISphMatchProcessor, ISphNoncopyable
{
	const CSphIndex_VLN *		m_pDocinfoSrc;	// index to fetch attribute rows from (may be NULL)
	const CSphQueryContext &	m_tCtx;			// context holding the final calc expressions
	int64_t						m_iBadRows;		// counter of docids that had no attribute row
	int							m_iTag;			// tag to assign to processed matches

	SphFinalMatchCalc_t ( int iTag, const CSphIndex_VLN * pIndex, const CSphQueryContext & tCtx )
		: m_pDocinfoSrc ( pIndex )
		, m_tCtx ( tCtx )
		, m_iBadRows ( 0 )
		, m_iTag ( iTag )
	{ }

	virtual void Process ( CSphMatch * pMatch )
	{
		// a non-negative tag means this match was already processed
		if ( pMatch->m_iTag>=0 )
			return;

		if ( m_pDocinfoSrc )
		{
			const CSphRowitem * pRow = m_pDocinfoSrc->FindDocinfo ( pMatch->m_uDocID );
			if ( !pRow && m_pDocinfoSrc->m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
			{
				// missing row; count it, but still tag the match so it is not reprocessed
				m_iBadRows++;
				pMatch->m_iTag = m_iTag;
				return;
			}
			m_pDocinfoSrc->CopyDocinfo ( &m_tCtx, *pMatch, pRow );
		}

		m_tCtx.CalcFinal ( *pMatch );
		pMatch->m_iTag = m_iTag;
	}
};
15786 
15787 
/// run a full-scan (attribute-only, no text query) search over this index
/// @param pQuery    query settings; m_sQuery must be empty (asserted)
/// @param pResult   receives error/warning text, timing, stats, and pool pointers
/// @param iSorters  number of sorters in ppSorters
/// @param ppSorters sorters that collect the matches
/// @param tArgs     multi-query args (tag, index weight, kill-list, factor flags)
/// @return false on error (pResult->m_sError is set); true otherwise, including
///         the early-out cases (empty index, whole index rejected by filters)
bool CSphIndex_VLN::MultiScan ( const CSphQuery * pQuery, CSphQueryResult * pResult,
	int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const
{
	assert ( pQuery->m_sQuery.IsEmpty() );
	assert ( tArgs.m_iTag>=0 );

	// check if index is ready
	if ( !m_pPreread || !*m_pPreread )
	{
		pResult->m_sError = "index not preread";
		return false;
	}

	// check if index supports scans (scanning walks the extern docinfo storage)
	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_tSchema.GetAttrsCount() )
	{
		pResult->m_sError = "fullscan requires extern docinfo";
		return false;
	}

	// we count documents only (before filters)
	if ( pQuery->m_iMaxPredictedMsec )
		pResult->m_bHasPrediction = true;

	if ( tArgs.m_uPackedFactorFlags & SPH_FACTOR_ENABLE )
		pResult->m_sWarning.SetSprintf ( "packedfactors() will not work with a fullscan; you need to specify a query" );

	// check if index has data; an empty index is a successful no-op
	if ( m_bIsEmpty || m_iDocinfo<=0 || m_tAttr.IsEmpty() )
		return true;

	// start counting
	int64_t tmQueryStart = sphMicroTimer();

	// select the sorter with max schema
	// uses GetAttrsCount to get working facets (was GetRowSize)
	int iMaxSchemaSize = -1;
	int iMaxSchemaIndex = -1;
	for ( int i=0; i<iSorters; i++ )
		if ( ppSorters[i]->GetSchema().GetAttrsCount() > iMaxSchemaSize )
		{
			iMaxSchemaSize = ppSorters[i]->GetSchema().GetAttrsCount();
			iMaxSchemaIndex = i;
		}

	// setup calculations and result schema
	CSphQueryContext tCtx;
	if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, m_tMva.GetWritePtr(), m_bArenaProhibit, false ) )
		return false;

	// set string pool for string on_sort expression fix up
	tCtx.SetStringPool ( m_tString.GetWritePtr() );

	// setup filters
	if ( !tCtx.CreateFilters ( true, &pQuery->m_dFilters, ppSorters[iMaxSchemaIndex]->GetSchema(),
								m_tMva.GetWritePtr(), m_tString.GetWritePtr(), pResult->m_sError, pQuery->m_eCollation, m_bArenaProhibit, tArgs.m_dKillList ) )
		return false;

	// check if we can early reject the whole index
	// (evaluate the filter against the index-wide min/max block at the end of the min-max index)
	if ( tCtx.m_pFilter && m_iDocinfoIndex )
	{
		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
		DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ m_iDocinfoIndex*uStride*2 ] );
		DWORD * pMaxEntry = pMinEntry + uStride;

		if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
		{
			pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
			return true;
		}
	}

	// setup lookup
	tCtx.m_bLookupFilter = false;
	tCtx.m_bLookupSort = true;

	// setup sorters vs. MVA
	for ( int i=0; i<iSorters; i++ )
	{
		(ppSorters[i])->SetMVAPool ( m_tMva.GetWritePtr(), m_bArenaProhibit );
		(ppSorters[i])->SetStringPool ( m_tString.GetWritePtr() );
	}

	// setup overrides
	if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema, ppSorters[iMaxSchemaIndex]->GetSchema() ) )
		return false;

	// prepare to work them rows
	bool bRandomize = ppSorters[0]->m_bRandomize;

	CSphMatch tMatch;
	tMatch.Reset ( ppSorters[iMaxSchemaIndex]->GetSchema().GetDynamicSize() );
	tMatch.m_iWeight = tArgs.m_iIndexWeight;
	// -1 tag marks matches that still need final-stage calc (see SphFinalMatchCalc_t)
	tMatch.m_iTag = tCtx.m_dCalcFinal.GetLength() ? -1 : tArgs.m_iTag;

	if ( pResult->m_pProfile )
		pResult->m_pProfile->Switch ( SPH_QSTATE_FULLSCAN );

	// optimize direct lookups by id
	// run full scan with block and row filtering for everything else
	if ( pQuery->m_dFilters.GetLength()==1
		&& pQuery->m_dFilters[0].m_eType==SPH_FILTER_VALUES
		&& pQuery->m_dFilters[0].m_bExclude==false
		&& pQuery->m_dFilters[0].m_sAttrName=="@id"
		&& tArgs.m_dKillList.GetLength()==0 )
	{
		// run id lookups
		for ( int i=0; i<pQuery->m_dFilters[0].GetNumValues(); i++ )
		{
			pResult->m_tStats.m_iFetchedDocs++;
			SphDocID_t uDocid = (SphDocID_t) pQuery->m_dFilters[0].GetValue(i);
			const DWORD * pRow = FindDocinfo ( uDocid );

			if ( !pRow )
				continue;

			assert ( uDocid==DOCINFO2ID(pRow) );
			tMatch.m_uDocID = uDocid;
			CopyDocinfo ( &tCtx, tMatch, pRow );

			if ( bRandomize )
				tMatch.m_iWeight = ( sphRand() & 0xffff ) * tArgs.m_iIndexWeight;

			// submit match to sorters
			tCtx.CalcSort ( tMatch );

			for ( int iSorter=0; iSorter<iSorters; iSorter++ )
				ppSorters[iSorter]->Push ( tMatch );

			// stringptr expressions should be duplicated (or taken over) at this point
			tCtx.FreeStrSort ( tMatch );
		}
	} else
	{
		bool bReverse = pQuery->m_bReverseScan; // shortcut
		int iCutoff = ( pQuery->m_iCutoff<=0 ) ? -1 : pQuery->m_iCutoff;

		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
		// iterate min-max index blocks, forward or backward depending on the scan order
		int64_t iStart = bReverse ? m_iDocinfoIndex-1 : 0;
		int64_t iEnd = bReverse ? -1 : m_iDocinfoIndex;
		int64_t iStep = bReverse ? -1 : 1;
		for ( int64_t iIndexEntry=iStart; iIndexEntry!=iEnd; iIndexEntry+=iStep )
		{
			// block-level filtering: skip the whole block if its min/max entry fails the filter
			const DWORD * pMin = &m_pDocinfoIndex[ iIndexEntry*uStride*2 ];
			const DWORD * pMax = pMin + uStride;
			if ( tCtx.m_pFilter && !tCtx.m_pFilter->EvalBlock ( pMin, pMax ) )
				continue;

			// row-level filtering
			const DWORD * pBlockStart = m_tAttr.GetWritePtr() + ( iIndexEntry*uStride*DOCINFO_INDEX_FREQ );
			const DWORD * pBlockEnd = m_tAttr.GetWritePtr() + ( Min ( ( iIndexEntry+1 )*DOCINFO_INDEX_FREQ, m_iDocinfo )*uStride );
			if ( bReverse )
			{
				// walk the same block back-to-front; end pointer sits one row BEFORE the block
				pBlockStart = m_tAttr.GetWritePtr() + ( ( Min ( ( iIndexEntry+1 )*DOCINFO_INDEX_FREQ, m_iDocinfo ) - 1 ) * uStride );
				pBlockEnd = m_tAttr.GetWritePtr() + uStride*( iIndexEntry*DOCINFO_INDEX_FREQ-1 );
			}
			int iDocinfoStep = bReverse ? -(int)uStride : (int)uStride;

			if ( !tCtx.m_pOverrides && tCtx.m_pFilter && !pQuery->m_iCutoff && !tCtx.m_dCalcFilter.GetLength() && !tCtx.m_dCalcSort.GetLength() )
			{
				// kinda fastpath: no overrides, no cutoff, no filter/sort expressions to compute
				for ( const DWORD * pDocinfo=pBlockStart; pDocinfo!=pBlockEnd; pDocinfo+=iDocinfoStep )
				{
					pResult->m_tStats.m_iFetchedDocs++;
					tMatch.m_uDocID = DOCINFO2ID ( pDocinfo );
					tMatch.m_pStatic = DOCINFO2ATTRS ( pDocinfo );

					if ( tCtx.m_pFilter->Eval ( tMatch ) )
					{
						if ( bRandomize )
							tMatch.m_iWeight = ( sphRand() & 0xffff ) * tArgs.m_iIndexWeight;
						for ( int iSorter=0; iSorter<iSorters; iSorter++ )
							ppSorters[iSorter]->Push ( tMatch );
					}
					// stringptr expressions should be duplicated (or taken over) at this point
					tCtx.FreeStrFilter ( tMatch );
				}
			} else
			{
				// generic path
				for ( const DWORD * pDocinfo=pBlockStart; pDocinfo!=pBlockEnd; pDocinfo+=iDocinfoStep )
				{
					pResult->m_tStats.m_iFetchedDocs++;
					tMatch.m_uDocID = DOCINFO2ID ( pDocinfo );
					CopyDocinfo ( &tCtx, tMatch, pDocinfo );

					// early filter only (no late filters in full-scan because of no @weight)
					tCtx.CalcFilter ( tMatch );
					if ( tCtx.m_pFilter && !tCtx.m_pFilter->Eval ( tMatch ) )
					{
						tCtx.FreeStrFilter ( tMatch );
						continue;
					}

					if ( bRandomize )
						tMatch.m_iWeight = ( sphRand() & 0xffff ) * tArgs.m_iIndexWeight;

					// submit match to sorters
					tCtx.CalcSort ( tMatch );

					bool bNewMatch = false;
					for ( int iSorter=0; iSorter<iSorters; iSorter++ )
						bNewMatch |= ppSorters[iSorter]->Push ( tMatch );

					// stringptr expressions should be duplicated (or taken over) at this point
					tCtx.FreeStrFilter ( tMatch );
					tCtx.FreeStrSort ( tMatch );

					// handle cutoff: force the outer loop to terminate on its next increment
					if ( bNewMatch && --iCutoff==0 )
					{
						iIndexEntry = iEnd - iStep; // outer break
						break;
					}
				}
			}
		}
	}

	if ( pResult->m_pProfile )
		pResult->m_pProfile->Switch ( SPH_QSTATE_FINALIZE );

	// do final expression calculations
	if ( tCtx.m_dCalcFinal.GetLength() )
	{
		// docinfo source is NULL here: rows were already attached during the scan above
		SphFinalMatchCalc_t tFinal ( tArgs.m_iTag, NULL, tCtx );
		for ( int iSorter=0; iSorter<iSorters; iSorter++ )
		{
			ISphMatchSorter * pTop = ppSorters[iSorter];
			pTop->Finalize ( tFinal, false );
		}
		tCtx.m_iBadRows += tFinal.m_iBadRows;
	}

	// done
	pResult->m_pMva = m_tMva.GetWritePtr();
	pResult->m_pStrings = m_tString.GetWritePtr();
	pResult->m_bArenaProhibit = m_bArenaProhibit;
	pResult->m_iQueryTime += (int)( ( sphMicroTimer()-tmQueryStart )/1000 );
	pResult->m_iBadRows += tCtx.m_iBadRows;

	return true;
}
16032 
16033 //////////////////////////////////////////////////////////////////////////////
16034 
QwordSpawn(const XQKeyword_t & tWord) const16035 ISphQword * DiskIndexQwordSetup_c::QwordSpawn ( const XQKeyword_t & tWord ) const
16036 {
16037 	if ( !tWord.m_pPayload )
16038 	{
16039 		WITH_QWORD ( m_pIndex, false, Qword, return new Qword ( tWord.m_bExpanded, tWord.m_bExcluded ) );
16040 	} else
16041 	{
16042 		if ( m_pIndex->GetSettings().m_eHitFormat==SPH_HIT_FORMAT_INLINE )
16043 		{
16044 			return new DiskPayloadQword_c<true> ( (const DiskSubstringPayload_t *)tWord.m_pPayload, tWord.m_bExcluded, m_tDoclist, m_tHitlist, m_pProfile );
16045 		} else
16046 		{
16047 			return new DiskPayloadQword_c<false> ( (const DiskSubstringPayload_t *)tWord.m_pPayload, tWord.m_bExcluded, m_tDoclist, m_tHitlist, m_pProfile );
16048 		}
16049 	}
16050 	return NULL;
16051 }
16052 
16053 
QwordSetup(ISphQword * pWord) const16054 bool DiskIndexQwordSetup_c::QwordSetup ( ISphQword * pWord ) const
16055 {
16056 	DiskIndexQwordTraits_c * pMyWord = (DiskIndexQwordTraits_c*)pWord;
16057 
16058 	// setup attrs
16059 	pMyWord->m_tDoc.Reset ( m_iDynamicRowitems );
16060 	pMyWord->m_iMinID = m_uMinDocid;
16061 	pMyWord->m_tDoc.m_uDocID = m_uMinDocid;
16062 
16063 	return pMyWord->Setup ( this );
16064 }
16065 
16066 
/// look a keyword up in the on-disk wordlist and fill in the qword's state:
/// doc/hit counts, hitless flag, doclist readers, and (optionally) the skiplist
/// @return false if the word is not present, too short for prefix/infix rules,
///         or the index is empty
bool DiskIndexQwordSetup_c::Setup ( ISphQword * pWord ) const
{
	// there was a dynamic_cast here once but it's not necessary
	// maybe it worth to rewrite class hierarchy to avoid c-cast here?
	DiskIndexQwordTraits_c & tWord = *(DiskIndexQwordTraits_c*)pWord;

	// with inline docinfo, per-doc attrs live in the doclist and need fixup info
	if ( m_eDocinfo==SPH_DOCINFO_INLINE )
	{
		tWord.m_iInlineAttrs = m_iInlineRowitems;
		tWord.m_pInlineFixup = m_pMinRow;
	} else
	{
		tWord.m_iInlineAttrs = 0;
		tWord.m_pInlineFixup = NULL;
	}

	// setup stats
	tWord.m_iDocs = 0;
	tWord.m_iHits = 0;

	CSphIndex_VLN * pIndex = (CSphIndex_VLN *)m_pIndex;

	// !COMMIT FIXME!
	// the below stuff really belongs in wordlist
	// which in turn really belongs in dictreader
	// which in turn might or might not be a part of dict

	// binary search through checkpoints for a one whose range matches word ID
	assert ( pIndex->m_pPreread && *pIndex->m_pPreread );
	assert ( !pIndex->m_tWordlist.m_pBuf.IsEmpty() );

	// empty index?
	if ( !pIndex->m_tWordlist.m_dCheckpoints.GetLength() )
		return false;

	const char * sWord = tWord.m_sDictWord.cstr();
	const bool bWordDict = pIndex->m_pDict->GetSettings().m_bWordDict;
	int iWordLen = sWord ? strlen ( sWord ) : 0;
	// trailing star: prefix/infix wildcard expansion term
	if ( bWordDict && tWord.m_sWord.Ends("*") )
	{
		iWordLen = Max ( iWordLen-1, 0 );

		// might match either infix or prefix
		int iMinLen = Max ( pIndex->m_tSettings.m_iMinPrefixLen, pIndex->m_tSettings.m_iMinInfixLen );
		if ( pIndex->m_tSettings.m_iMinPrefixLen )
			iMinLen = Min ( iMinLen, pIndex->m_tSettings.m_iMinPrefixLen );
		if ( pIndex->m_tSettings.m_iMinInfixLen )
			iMinLen = Min ( iMinLen, pIndex->m_tSettings.m_iMinInfixLen );

		// bail out term shorter than prefix or infix allowed
		if ( iWordLen<iMinLen )
			return false;
	}

	// leading special symbols trimming
	if ( bWordDict && tWord.m_sDictWord.Begins("*") )
	{
		sWord++;
		iWordLen = Max ( iWordLen-1, 0 );
		// bail out term shorter than infix allowed
		if ( iWordLen<pIndex->m_tSettings.m_iMinInfixLen )
			return false;
	}

	const CSphWordlistCheckpoint * pCheckpoint = pIndex->m_tWordlist.FindCheckpoint ( sWord, iWordLen, tWord.m_uWordID, false );
	if ( !pCheckpoint )
		return false;

	// decode wordlist chunk
	const BYTE * pBuf = pIndex->m_tWordlist.AcquireDict ( pCheckpoint );
	assert ( pBuf );

	CSphDictEntry tRes;
	if ( bWordDict )
	{
		// keywords (crc-less) dictionary: linear scan inside the checkpoint block
		KeywordsBlockReader_c tCtx ( pBuf, m_pSkips!=NULL );
		while ( tCtx.UnpackWord() )
		{
			// block is sorted
			// so once keywords are greater than the reference word, no more matches
			assert ( tCtx.GetWordLen()>0 );
			int iCmp = sphDictCmpStrictly ( sWord, iWordLen, tCtx.GetWord(), tCtx.GetWordLen() );
			if ( iCmp<0 )
				return false;
			if ( iCmp==0 )
				break;
		}
		if ( tCtx.GetWordLen()<=0 )
			return false;
		tRes = tCtx;

	} else
	{
		// crc dictionary: lookup by word id
		if ( !pIndex->m_tWordlist.GetWord ( pBuf, tWord.m_uWordID, tRes ) )
			return false;
	}

	// NOTE: in SPH_HITLESS_SOME mode the high bit of m_iDocs flags a hitless word,
	// so the count must be masked and the hitlist flag derived from that bit
	const ESphHitless eMode = pIndex->m_tSettings.m_eHitless;
	tWord.m_iDocs = eMode==SPH_HITLESS_SOME ? ( tRes.m_iDocs & HITLESS_DOC_MASK ) : tRes.m_iDocs;
	tWord.m_iHits = tRes.m_iHits;
	tWord.m_bHasHitlist =
		( eMode==SPH_HITLESS_NONE ) ||
		( eMode==SPH_HITLESS_SOME && !( tRes.m_iDocs & HITLESS_DOC_FLAG ) );

	if ( m_bSetupReaders )
	{
		tWord.m_rdDoclist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
		tWord.m_rdDoclist.SetFile ( m_tDoclist );
		tWord.m_rdDoclist.m_pProfile = m_pProfile;
		tWord.m_rdDoclist.m_eProfileState = SPH_QSTATE_READ_DOCS;

		// read in skiplist
		// OPTIMIZE? maybe cache hot decompressed lists?
		// OPTIMIZE? maybe add an option to decompress on preload instead?
		if ( m_pSkips && tRes.m_iDocs>SPH_SKIPLIST_BLOCK )
		{
			const BYTE * pSkip = m_pSkips + tRes.m_iSkiplistOffset;

			// first skiplist entry is implicit (start of the doclist)
			tWord.m_dSkiplist.Add();
			tWord.m_dSkiplist.Last().m_iBaseDocid = 0;
			tWord.m_dSkiplist.Last().m_iOffset = tRes.m_iDoclistOffset;
			tWord.m_dSkiplist.Last().m_iBaseHitlistPos = 0;

			// remaining entries are delta-coded against the previous one
			for ( int i=1; i<( tWord.m_iDocs/SPH_SKIPLIST_BLOCK ); i++ )
			{
				SkiplistEntry_t & t = tWord.m_dSkiplist.Add();
				SkiplistEntry_t & p = tWord.m_dSkiplist [ tWord.m_dSkiplist.GetLength()-2 ];
				t.m_iBaseDocid = p.m_iBaseDocid + SPH_SKIPLIST_BLOCK + (SphDocID_t) sphUnzipOffset ( pSkip );
				t.m_iOffset = p.m_iOffset + 4*SPH_SKIPLIST_BLOCK + sphUnzipOffset ( pSkip );
				t.m_iBaseHitlistPos = p.m_iBaseHitlistPos + sphUnzipOffset ( pSkip );
			}
		}

		tWord.m_rdDoclist.SeekTo ( tRes.m_iDoclistOffset, tRes.m_iDoclistHint );

		tWord.m_rdHitlist.SetBuffers ( g_iReadBuffer, g_iReadUnhinted );
		tWord.m_rdHitlist.SetFile ( m_tHitlist );
		tWord.m_rdHitlist.m_pProfile = m_pProfile;
		tWord.m_rdHitlist.m_eProfileState = SPH_QSTATE_READ_HITS;
	}

	return true;
}
16210 
16211 //////////////////////////////////////////////////////////////////////////////
16212 
Lock()16213 bool CSphIndex_VLN::Lock ()
16214 {
16215 	CSphString sName = GetIndexFileName("spl");
16216 	sphLogDebug ( "Locking the index via file %s", sName.cstr() );
16217 
16218 	if ( m_iLockFD<0 )
16219 	{
16220 		m_iLockFD = ::open ( sName.cstr(), SPH_O_NEW, 0644 );
16221 		if ( m_iLockFD<0 )
16222 		{
16223 			m_sLastError.SetSprintf ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
16224 			sphLogDebug ( "failed to open %s: %s", sName.cstr(), strerror(errno) );
16225 			return false;
16226 		}
16227 	}
16228 
16229 	if ( !sphLockEx ( m_iLockFD, false ) )
16230 	{
16231 		m_sLastError.SetSprintf ( "failed to lock %s: %s", sName.cstr(), strerror(errno) );
16232 		::close ( m_iLockFD );
16233 		m_iLockFD = -1;
16234 		return false;
16235 	}
16236 	sphLogDebug ( "lock %s success", sName.cstr() );
16237 	return true;
16238 }
16239 
16240 
Unlock()16241 void CSphIndex_VLN::Unlock()
16242 {
16243 	CSphString sName = GetIndexFileName("spl");
16244 	sphLogDebug ( "Unlocking the index (lock %s)", sName.cstr() );
16245 	if ( m_iLockFD>=0 )
16246 	{
16247 		sphLogDebug ( "File ID ok, closing lock FD %d, unlinking %s", m_iLockFD, sName.cstr() );
16248 		sphLockUn ( m_iLockFD );
16249 		::close ( m_iLockFD );
16250 		::unlink ( sName.cstr() );
16251 		m_iLockFD = -1;
16252 	}
16253 }
16254 
16255 
Mlock()16256 bool CSphIndex_VLN::Mlock ()
16257 {
16258 	bool bRes = true;
16259 	bRes &= m_tWordlist.m_pBuf.Mlock ( "wordlist", m_sLastError );
16260 
16261 	if ( m_bOndiskAllAttr )
16262 		return bRes;
16263 
16264 	bRes &= m_dAttrShared.Mlock ( "docinfo", m_sLastError );
16265 
16266 	if ( !m_bOndiskPoolAttr )
16267 	{
16268 		bRes &= m_dMvaShared.Mlock ( "mva", m_sLastError );
16269 		bRes &= m_dStringShared.Mlock ( "strings", m_sLastError );
16270 	}
16271 	return bRes;
16272 }
16273 
16274 
/// tear down everything acquired at prealloc/preread time:
/// files, shared and mapped buffers, tokenizer/dict objects, and the MVA arena tag
void CSphIndex_VLN::Dealloc ()
{
	// no-op unless the index was actually preallocated
	if ( !m_bPreallocated )
		return;

	// close files and drop all storage buffers (shared and memory-mapped)
	m_tDoclistFile.Close ();
	m_tHitlistFile.Close ();
	m_pDocinfoHash.Reset ();
	m_dAttrShared.Reset ();
	m_dMvaShared.Reset ();
	m_dStringShared.Reset ();
	m_pKillList.Reset ();
	m_tWordlist.Reset ();
	m_pSkiplists.Reset ();
	m_dAttrMapped.Close();
	m_dMvaMapped.Close();
	m_dStringMapped.Close();

	m_iDocinfo = 0;
	m_iMinMaxIndex = 0;
	m_tSettings.m_eDocinfo = SPH_DOCINFO_NONE;

	// release owned helper objects
	m_bPreallocated = false;
	SafeDelete ( m_pFieldFilter );
	SafeDelete ( m_pQueryTokenizer );
	SafeDelete ( m_pTokenizer );
	SafeDelete ( m_pDict );

	// give back this index's slice of the global MVA arena
	if ( m_iIndexTag>=0 && g_pMvaArena )
		g_tMvaArena.TaggedFreeTag ( m_iIndexTag );
	m_iIndexTag = -1;

	m_pPreread = NULL;
	m_pAttrsStatus = NULL;

#ifndef NDEBUG
	m_dShared.Reset ();
#endif
}
16314 
16315 
SetEnableOndiskAttributes(bool bPool)16316 void CSphIndex_VLN::SetEnableOndiskAttributes ( bool bPool )
16317 {
16318 	if ( m_bPreallocated )
16319 		return;
16320 
16321 	m_bOndiskAllAttr = !bPool;
16322 	m_bOndiskPoolAttr = bPool;
16323 }
16324 
16325 
/// read the serialized index settings from a header stream
/// every field is gated on the on-disk format version (uVersion); the reads
/// MUST stay in this exact order, since the stream has no field markers
void LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion )
{
	if ( uVersion>=8 )
	{
		tSettings.m_iMinPrefixLen = tReader.GetDword ();
		tSettings.m_iMinInfixLen = tReader.GetDword ();

	} else if ( uVersion>=6 )
	{
		// v6..v7 stored a single length plus a prefixes-vs-infixes flag
		bool bPrefixesOnly = ( tReader.GetByte ()!=0 );
		tSettings.m_iMinPrefixLen = tReader.GetDword ();
		tSettings.m_iMinInfixLen = 0;
		if ( !bPrefixesOnly )
			Swap ( tSettings.m_iMinPrefixLen, tSettings.m_iMinInfixLen );
	}

	if ( uVersion>=38 )
		tSettings.m_iMaxSubstringLen = tReader.GetDword();

	if ( uVersion>=9 )
	{
		tSettings.m_bHtmlStrip = !!tReader.GetByte ();
		tSettings.m_sHtmlIndexAttrs = tReader.GetString ();
		tSettings.m_sHtmlRemoveElements = tReader.GetString ();
	}

	if ( uVersion>=12 )
		tSettings.m_bIndexExactWords = !!tReader.GetByte ();

	if ( uVersion>=18 )
		tSettings.m_eHitless = (ESphHitless)tReader.GetDword();

	if ( uVersion>=19 )
		tSettings.m_eHitFormat = (ESphHitFormat)tReader.GetDword();
	else // force plain format for old indices
		tSettings.m_eHitFormat = SPH_HIT_FORMAT_PLAIN;

	if ( uVersion>=21 )
		tSettings.m_bIndexSP = !!tReader.GetByte();

	if ( uVersion>=22 )
	{
		tSettings.m_sZones = tReader.GetString();
		// pre-v25 zone names were implicitly prefix-matched; emulate with a trailing star
		if ( uVersion<25 && !tSettings.m_sZones.IsEmpty() )
			tSettings.m_sZones.SetSprintf ( "%s*", tSettings.m_sZones.cstr() );
	}

	if ( uVersion>=23 )
	{
		tSettings.m_iBoundaryStep = (int)tReader.GetDword();
		tSettings.m_iStopwordStep = (int)tReader.GetDword();
	}

	if ( uVersion>=28 )
		tSettings.m_iOvershortStep = (int)tReader.GetDword();

	if ( uVersion>=30 )
		tSettings.m_iEmbeddedLimit = (int)tReader.GetDword();

	if ( uVersion>=32 )
	{
		tSettings.m_eBigramIndex = (ESphBigram)tReader.GetByte();
		tSettings.m_sBigramWords = tReader.GetString();
	}

	if ( uVersion>=35 )
		tSettings.m_bIndexFieldLens = ( tReader.GetByte()!=0 );

	if ( uVersion>=39 )
	{
		tSettings.m_eChineseRLP = (ESphRLPFilter)tReader.GetByte();
		tSettings.m_sRLPContext = tReader.GetString();
	}

	if ( uVersion>=41 )
		tSettings.m_sIndexTokenFilter = tReader.GetString();
}
16403 
16404 
/// parse the on-disk index header (.sph): magic, version, schema, dictionary
/// header, stats, settings, tokenizer/dict setup, and misc version-gated fields
/// @param sHeaderName    path of the header file to load
/// @param bStripPath     strip directory components from embedded file paths
/// @param tEmbeddedFiles receives exceptions/stopwords/wordforms embedded in the header
/// @param sWarning       receives non-fatal warnings (dupe attrs, lemmatizer mismatch, ...)
/// @return false with m_sLastError set on any parse/setup failure
bool CSphIndex_VLN::LoadHeader ( const char * sHeaderName, bool bStripPath, CSphEmbeddedFiles & tEmbeddedFiles, CSphString & sWarning )
{
	const int MAX_HEADER_SIZE = 32768;
	CSphFixedVector<BYTE> dCacheInfo ( MAX_HEADER_SIZE );

	CSphAutoreader rdInfo ( dCacheInfo.Begin(), MAX_HEADER_SIZE ); // to avoid mallocs
	if ( !rdInfo.Open ( sHeaderName, m_sLastError ) )
		return false;

	// magic header
	const char* sFmt = CheckFmtMagic ( rdInfo.GetDword () );
	if ( sFmt )
	{
		m_sLastError.SetSprintf ( sFmt, sHeaderName );
		return false;
	}

	// version
	m_uVersion = rdInfo.GetDword();
	if ( m_uVersion==0 || m_uVersion>INDEX_FORMAT_VERSION )
	{
		m_sLastError.SetSprintf ( "%s is v.%d, binary is v.%d", sHeaderName, m_uVersion, INDEX_FORMAT_VERSION );
		return false;
	}

	// bits (32 vs 64 bit docids/wordids the index was built with)
	m_bUse64 = false;
	if ( m_uVersion>=2 )
		m_bUse64 = ( rdInfo.GetDword ()!=0 );

	if ( m_bUse64!=USE_64BIT )
	{
#if USE_64BIT
		// TODO: may be do this param conditional and push it into the config?
		m_bId32to64 = true;

		if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
		{
			m_bOndiskAllAttr = false;
			m_bOndiskPoolAttr = false;
			sWarning.SetSprintf ( "can not convert attributes, ondisk_attrs disabled" );
		}
#else
		m_sLastError.SetSprintf ( "'%s' is id%d, and this binary is id%d",
			GetIndexFileName("sph").cstr(),
			m_bUse64 ? 64 : 32, USE_64BIT ? 64 : 32 );
		return false;
#endif
	}

	// FIXME!!! old index min-max precompute
	if ( m_uVersion<20 && ( m_bOndiskAllAttr || m_bOndiskPoolAttr ) )
	{
		m_bOndiskPoolAttr = false;
		m_bOndiskAllAttr = false;
	}

	// skiplists
	m_bHaveSkips = ( m_uVersion>=31 );

	// docinfo
	m_tSettings.m_eDocinfo = (ESphDocinfo) rdInfo.GetDword();

	// schema
	// 4th arg means that inline attributes need be dynamic in searching time too
	ReadSchema ( rdInfo, m_tSchema, m_uVersion, m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE );

	// check schema for dupes (warn only, not fatal)
	for ( int iAttr=1; iAttr<m_tSchema.GetAttrsCount(); iAttr++ )
	{
		const CSphColumnInfo & tCol = m_tSchema.GetAttr(iAttr);
		for ( int i=0; i<iAttr; i++ )
			if ( m_tSchema.GetAttr(i).m_sName==tCol.m_sName )
				sWarning.SetSprintf ( "duplicate attribute name: %s", tCol.m_sName.cstr() );
	}

	// in case of *fork rotation we reuse min match from 1st rotated index ( it could be less than my size and inline ( m_pDynamic ) )
	// min doc
	m_dMinRow.Reset ( m_tSchema.GetRowSize() );
	if ( m_uVersion>=2 )
		m_uMinDocid = (SphDocID_t) rdInfo.GetOffset (); // v2+; losing high bits when !USE_64 is intentional, check is performed on bUse64 above
	else
		m_uMinDocid = rdInfo.GetDword(); // v1
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
		rdInfo.GetBytes ( m_dMinRow.Begin(), sizeof(CSphRowitem)*m_tSchema.GetRowSize() );

	// dictionary header (wordlist checkpoints, infix blocks, etc)
	m_tWordlist.m_iDictCheckpointsOffset = rdInfo.GetOffset();
	m_tWordlist.m_iDictCheckpoints = rdInfo.GetDword();
	if ( m_uVersion>=27 )
	{
		m_tWordlist.m_iInfixCodepointBytes = rdInfo.GetByte();
		m_tWordlist.m_iInfixBlocksOffset = rdInfo.GetDword();
	}
	if ( m_uVersion>=34 )
		m_tWordlist.m_iInfixBlocksWordsSize = rdInfo.GetDword();

	m_tWordlist.m_dCheckpoints.Reset ( m_tWordlist.m_iDictCheckpoints );

	// index stats
	m_tStats.m_iTotalDocuments = rdInfo.GetDword ();
	m_tStats.m_iTotalBytes = rdInfo.GetOffset ();
	if ( m_uVersion>=40 )
		m_iTotalDups = rdInfo.GetDword();

	LoadIndexSettings ( m_tSettings, rdInfo, m_uVersion );
	if ( m_uVersion<9 )
		m_bStripperInited = false;

	if ( m_uVersion>=9 )
	{
		// tokenizer stuff
		CSphTokenizerSettings tSettings;
		if ( !LoadTokenizerSettings ( rdInfo, tSettings, tEmbeddedFiles, m_uVersion, m_sLastError ) )
			return false;

		if ( bStripPath )
			StripPath ( tSettings.m_sSynonymsFile );

		ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tSettings, &tEmbeddedFiles, m_sLastError );
		if ( !pTokenizer )
			return false;

		// dictionary stuff
		CSphDictSettings tDictSettings;
		LoadDictionarySettings ( rdInfo, tDictSettings, tEmbeddedFiles, m_uVersion, sWarning );
		// id32-built indices used crc32 wordids; keep that when loading into an id64 binary
		if ( m_bId32to64 )
			tDictSettings.m_bCrc32 = true;

		if ( bStripPath )
		{
			StripPath ( tDictSettings.m_sStopwords );
			ARRAY_FOREACH ( i, tDictSettings.m_dWordforms )
				StripPath ( tDictSettings.m_dWordforms[i] );
		}

		CSphDict * pDict = tDictSettings.m_bWordDict
			? sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, m_sIndexName.cstr(), m_sLastError )
			: sphCreateDictionaryCRC ( tDictSettings, &tEmbeddedFiles, pTokenizer, m_sIndexName.cstr(), m_sLastError );

		if ( !pDict )
			return false;

		if ( tDictSettings.m_sMorphFingerprint!=pDict->GetMorphDataFingerprint() )
			sWarning.SetSprintf ( "different lemmatizer dictionaries (index='%s', current='%s')",
				tDictSettings.m_sMorphFingerprint.cstr(),
				pDict->GetMorphDataFingerprint().cstr() );

		SetDictionary ( pDict );

		pTokenizer = ISphTokenizer::CreateMultiformFilter ( pTokenizer, pDict->GetMultiWordforms () );

#if USE_RLP
		pTokenizer = ISphTokenizer::CreateRLPFilter ( pTokenizer, m_tSettings.m_eChineseRLP!=SPH_RLP_NONE, g_sRLPRoot.cstr(),
			g_sRLPEnv.cstr(), m_tSettings.m_sRLPContext.cstr(), true, m_sLastError );

		if ( !pTokenizer )
			return false;
#endif

		SetTokenizer ( pTokenizer );
		SetupQueryTokenizer();

		// initialize AOT if needed
		m_tSettings.m_uAotFilterMask = sphParseMorphAot ( tDictSettings.m_sMorphology.cstr() );
	} else
	{
		if ( m_bId32to64 )
		{
			m_sLastError.SetSprintf ( "too old id32 index; can not be loaded by this id64 binary" );
			return false;
		}
	}

	if ( m_uVersion>=10 )
		m_uKillListSize = rdInfo.GetDword ();

	if ( m_uVersion>=33 )
		m_iMinMaxIndex = rdInfo.GetOffset ();
	else if ( m_uVersion>=20 )
		m_iMinMaxIndex = rdInfo.GetDword ();

	if ( m_uVersion>=28 )
	{
		CSphFieldFilterSettings tFieldFilterSettings;
		LoadFieldFilterSettings ( rdInfo, tFieldFilterSettings );
		if ( tFieldFilterSettings.m_dRegexps.GetLength() )
			SetFieldFilter ( sphCreateFieldFilter ( tFieldFilterSettings, sWarning ) );
	}

	if ( m_uVersion>=35 && m_tSettings.m_bIndexFieldLens )
		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
			m_dFieldLens[i] = rdInfo.GetOffset(); // FIXME? ideally 64bit even when off is 32bit..

	// post-load stuff.. for now, bigrams
	CSphIndexSettings & s = m_tSettings;
	if ( s.m_eBigramIndex!=SPH_BIGRAM_NONE && s.m_eBigramIndex!=SPH_BIGRAM_ALL )
	{
		// tokenize the stored bigram word list into a sorted lookup vector
		BYTE * pTok;
		m_pTokenizer->SetBuffer ( (BYTE*)s.m_sBigramWords.cstr(), s.m_sBigramWords.Length() );
		while ( ( pTok = m_pTokenizer->GetToken() )!=NULL )
			s.m_dBigramWords.Add() = (const char*)pTok;
		s.m_dBigramWords.Sort();
	}


	if ( rdInfo.GetErrorFlag() )
		m_sLastError.SetSprintf ( "%s: failed to parse header (unexpected eof)", sHeaderName );
	return !rdInfo.GetErrorFlag();
}
16615 
16616 
DebugDumpHeader(FILE * fp,const char * sHeaderName,bool bConfig)16617 void CSphIndex_VLN::DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig )
16618 {
16619 	CSphEmbeddedFiles tEmbeddedFiles;
16620 	CSphString sWarning;
16621 	if ( !LoadHeader ( sHeaderName, false, tEmbeddedFiles, sWarning ) )
16622 	{
16623 		fprintf ( fp, "FATAL: failed to load header: %s.\n", m_sLastError.cstr() );
16624 		return;
16625 	}
16626 
16627 	if ( !sWarning.IsEmpty () )
16628 		fprintf ( fp, "WARNING: %s\n", sWarning.cstr () );
16629 
16630 	///////////////////////////////////////////////
16631 	// print header in index config section format
16632 	///////////////////////////////////////////////
16633 
16634 	if ( bConfig )
16635 	{
16636 		fprintf ( fp, "\nsource $dump\n{\n" );
16637 
16638 		fprintf ( fp, "\tsql_query = SELECT id \\\n" );
16639 		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
16640 			fprintf ( fp, "\t, %s \\\n", m_tSchema.m_dFields[i].m_sName.cstr() );
16641 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
16642 		{
16643 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
16644 			fprintf ( fp, "\t, %s \\\n", tAttr.m_sName.cstr() );
16645 		}
16646 		fprintf ( fp, "\tFROM documents\n" );
16647 
16648 		if ( m_tSchema.GetAttrsCount() )
16649 			fprintf ( fp, "\n" );
16650 
16651 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
16652 		{
16653 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
16654 			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
16655 				fprintf ( fp, "\tsql_attr_multi = uint %s from field\n", tAttr.m_sName.cstr() );
16656 			else if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
16657 				fprintf ( fp, "\tsql_attr_multi = bigint %s from field\n", tAttr.m_sName.cstr() );
16658 			else if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.IsBitfield() )
16659 				fprintf ( fp, "\tsql_attr_uint = %s:%d\n", tAttr.m_sName.cstr(), tAttr.m_tLocator.m_iBitCount );
16660 			else if ( tAttr.m_eAttrType==SPH_ATTR_TOKENCOUNT )
16661 			{; // intendedly skip, as these are autogenerated by index_field_lengths=1
16662 			} else
16663 				fprintf ( fp, "\t%s = %s\n", sphTypeDirective ( tAttr.m_eAttrType ), tAttr.m_sName.cstr() );
16664 		}
16665 
16666 		fprintf ( fp, "}\n\nindex $dump\n{\n\tsource = $dump\n\tpath = $dump\n" );
16667 
16668 		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
16669 			fprintf ( fp, "\tdocinfo = inline\n" );
16670 		if ( m_tSettings.m_iMinPrefixLen )
16671 			fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinPrefixLen );
16672 		if ( m_tSettings.m_iMinInfixLen )
16673 			fprintf ( fp, "\tmin_prefix_len = %d\n", m_tSettings.m_iMinInfixLen );
16674 		if ( m_tSettings.m_iMaxSubstringLen )
16675 			fprintf ( fp, "\tmax_substring_len = %d\n", m_tSettings.m_iMaxSubstringLen );
16676 		if ( m_tSettings.m_bIndexExactWords )
16677 			fprintf ( fp, "\tindex_exact_words = %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
16678 		if ( m_tSettings.m_bHtmlStrip )
16679 			fprintf ( fp, "\thtml_strip = 1\n" );
16680 		if ( !m_tSettings.m_sHtmlIndexAttrs.IsEmpty() )
16681 			fprintf ( fp, "\thtml_index_attrs = %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
16682 		if ( !m_tSettings.m_sHtmlRemoveElements.IsEmpty() )
16683 			fprintf ( fp, "\thtml_remove_elements = %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
16684 		if ( m_tSettings.m_sZones.cstr() )
16685 			fprintf ( fp, "\tindex_zones = %s\n", m_tSettings.m_sZones.cstr() );
16686 		if ( m_tSettings.m_bIndexFieldLens )
16687 			fprintf ( fp, "\tindex_field_lengths = 1\n" );
16688 
16689 		if ( m_pTokenizer )
16690 		{
16691 			const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
16692 			fprintf ( fp, "\tcharset_type = %s\n", ( tSettings.m_iType==TOKENIZER_UTF8 || tSettings.m_iType==TOKENIZER_NGRAM )
16693 					? "utf-8"
16694 					: "unknown tokenizer (deprecated sbcs?)" );
16695 			fprintf ( fp, "\tcharset_table = %s\n", tSettings.m_sCaseFolding.cstr () );
16696 			if ( tSettings.m_iMinWordLen>1 )
16697 				fprintf ( fp, "\tmin_word_len = %d\n", tSettings.m_iMinWordLen );
16698 			if ( tSettings.m_iNgramLen && !tSettings.m_sNgramChars.IsEmpty() )
16699 				fprintf ( fp, "\tngram_len = %d\nngram_chars = %s\n",
16700 					tSettings.m_iNgramLen, tSettings.m_sNgramChars.cstr () );
16701 			if ( !tSettings.m_sSynonymsFile.IsEmpty() )
16702 				fprintf ( fp, "\texceptions = %s\n", tSettings.m_sSynonymsFile.cstr () );
16703 			if ( !tSettings.m_sBoundary.IsEmpty() )
16704 				fprintf ( fp, "\tphrase_boundary = %s\n", tSettings.m_sBoundary.cstr () );
16705 			if ( !tSettings.m_sIgnoreChars.IsEmpty() )
16706 				fprintf ( fp, "\tignore_chars = %s\n", tSettings.m_sIgnoreChars.cstr () );
16707 			if ( !tSettings.m_sBlendChars.IsEmpty() )
16708 				fprintf ( fp, "\tblend_chars = %s\n", tSettings.m_sBlendChars.cstr () );
16709 			if ( !tSettings.m_sBlendMode.IsEmpty() )
16710 				fprintf ( fp, "\tblend_mode = %s\n", tSettings.m_sBlendMode.cstr () );
16711 		}
16712 
16713 		if ( m_pDict )
16714 		{
16715 			const CSphDictSettings & tSettings = m_pDict->GetSettings ();
16716 			if ( tSettings.m_bWordDict )
16717 				fprintf ( fp, "\tdict = keywords\n" );
16718 			if ( !tSettings.m_sMorphology.IsEmpty() )
16719 				fprintf ( fp, "\tmorphology = %s\n", tSettings.m_sMorphology.cstr () );
16720 			if ( !tSettings.m_sStopwords.IsEmpty() )
16721 				fprintf ( fp, "\tstopwords = %s\n", tSettings.m_sStopwords.cstr () );
16722 			if ( tSettings.m_dWordforms.GetLength() )
16723 			{
16724 				fprintf ( fp, "\twordforms =" );
16725 				ARRAY_FOREACH ( i, tSettings.m_dWordforms )
16726 					fprintf ( fp, " %s", tSettings.m_dWordforms[i].cstr () );
16727 				fprintf ( fp, "\n" );
16728 			}
16729 			if ( tSettings.m_iMinStemmingLen>1 )
16730 				fprintf ( fp, "\tmin_stemming_len = %d\n", tSettings.m_iMinStemmingLen );
16731 		}
16732 
16733 		fprintf ( fp, "}\n" );
16734 		return;
16735 	}
16736 
16737 	///////////////////////////////////////////////
16738 	// print header and stats in "readable" format
16739 	///////////////////////////////////////////////
16740 
16741 	fprintf ( fp, "version: %d\n",			m_uVersion );
16742 	fprintf ( fp, "idbits: %d\n",			m_bUse64 ? 64 : 32 );
16743 	fprintf ( fp, "docinfo: " );
16744 	switch ( m_tSettings.m_eDocinfo )
16745 	{
16746 		case SPH_DOCINFO_NONE:		fprintf ( fp, "none\n" ); break;
16747 		case SPH_DOCINFO_INLINE:	fprintf ( fp, "inline\n" ); break;
16748 		case SPH_DOCINFO_EXTERN:	fprintf ( fp, "extern\n" ); break;
16749 		default:					fprintf ( fp, "unknown (value=%d)\n", m_tSettings.m_eDocinfo ); break;
16750 	}
16751 
16752 	fprintf ( fp, "fields: %d\n", m_tSchema.m_dFields.GetLength() );
16753 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
16754 		fprintf ( fp, "  field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
16755 
16756 	fprintf ( fp, "attrs: %d\n", m_tSchema.GetAttrsCount() );
16757 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
16758 	{
16759 		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
16760 		fprintf ( fp, "  attr %d: %s, %s", i, tAttr.m_sName.cstr(), sphTypeName ( tAttr.m_eAttrType ) );
16761 		if ( tAttr.m_eAttrType==SPH_ATTR_INTEGER && tAttr.m_tLocator.m_iBitCount!=32 )
16762 			fprintf ( fp, ", bits %d", tAttr.m_tLocator.m_iBitCount );
16763 		fprintf ( fp, ", bitoff %d\n", tAttr.m_tLocator.m_iBitOffset );
16764 	}
16765 
16766 	// skipped min doc, wordlist checkpoints
16767 	fprintf ( fp, "total-documents: " INT64_FMT "\n", m_tStats.m_iTotalDocuments );
16768 	fprintf ( fp, "total-bytes: " INT64_FMT "\n", int64_t(m_tStats.m_iTotalBytes) );
16769 	fprintf ( fp, "total-duplicates: %d\n", m_iTotalDups );
16770 
16771 	fprintf ( fp, "min-prefix-len: %d\n", m_tSettings.m_iMinPrefixLen );
16772 	fprintf ( fp, "min-infix-len: %d\n", m_tSettings.m_iMinInfixLen );
16773 	fprintf ( fp, "max-substring-len: %d\n", m_tSettings.m_iMaxSubstringLen );
16774 	fprintf ( fp, "exact-words: %d\n", m_tSettings.m_bIndexExactWords ? 1 : 0 );
16775 	fprintf ( fp, "html-strip: %d\n", m_tSettings.m_bHtmlStrip ? 1 : 0 );
16776 	fprintf ( fp, "html-index-attrs: %s\n", m_tSettings.m_sHtmlIndexAttrs.cstr () );
16777 	fprintf ( fp, "html-remove-elements: %s\n", m_tSettings.m_sHtmlRemoveElements.cstr () );
16778 	fprintf ( fp, "index-zones: %s\n", m_tSettings.m_sZones.cstr() );
16779 
16780 	if ( m_pTokenizer )
16781 	{
16782 		const CSphTokenizerSettings & tSettings = m_pTokenizer->GetSettings ();
16783 		fprintf ( fp, "tokenizer-type: %d\n", tSettings.m_iType );
16784 		fprintf ( fp, "tokenizer-case-folding: %s\n", tSettings.m_sCaseFolding.cstr () );
16785 		fprintf ( fp, "tokenizer-min-word-len: %d\n", tSettings.m_iMinWordLen );
16786 		fprintf ( fp, "tokenizer-ngram-chars: %s\n", tSettings.m_sNgramChars.cstr () );
16787 		fprintf ( fp, "tokenizer-ngram-len: %d\n", tSettings.m_iNgramLen );
16788 		fprintf ( fp, "tokenizer-exceptions: %s\n", tSettings.m_sSynonymsFile.cstr () );
16789 		fprintf ( fp, "tokenizer-phrase-boundary: %s\n", tSettings.m_sBoundary.cstr () );
16790 		fprintf ( fp, "tokenizer-ignore-chars: %s\n", tSettings.m_sIgnoreChars.cstr () );
16791 		fprintf ( fp, "tokenizer-blend-chars: %s\n", tSettings.m_sBlendChars.cstr () );
16792 		fprintf ( fp, "tokenizer-blend-mode: %s\n", tSettings.m_sBlendMode.cstr () );
16793 
16794 		fprintf ( fp, "tokenizer-exceptions: %s\n", tSettings.m_sSynonymsFile.cstr () );
16795 		fprintf ( fp, "dictionary-embedded-exceptions: %d\n", tEmbeddedFiles.m_bEmbeddedSynonyms ? 1 : 0 );
16796 		if ( tEmbeddedFiles.m_bEmbeddedSynonyms )
16797 		{
16798 			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dSynonyms )
16799 				fprintf ( fp, "\tdictionary-embedded-exception [%d]: %s\n", i, tEmbeddedFiles.m_dSynonyms[i].cstr () );
16800 		}
16801 	}
16802 
16803 	if ( m_pDict )
16804 	{
16805 		const CSphDictSettings & tSettings = m_pDict->GetSettings ();
16806 		fprintf ( fp, "dict: %s\n", tSettings.m_bWordDict ? "keywords" : "crc" );
16807 		fprintf ( fp, "dictionary-morphology: %s\n", tSettings.m_sMorphology.cstr () );
16808 
16809 		fprintf ( fp, "dictionary-stopwords-file: %s\n", tSettings.m_sStopwords.cstr () );
16810 		fprintf ( fp, "dictionary-embedded-stopwords: %d\n", tEmbeddedFiles.m_bEmbeddedStopwords ? 1 : 0 );
16811 		if ( tEmbeddedFiles.m_bEmbeddedStopwords )
16812 		{
16813 			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dStopwords )
16814 				fprintf ( fp, "\tdictionary-embedded-stopword [%d]: " DOCID_FMT "\n", i, tEmbeddedFiles.m_dStopwords[i] );
16815 		}
16816 
16817 		ARRAY_FOREACH ( i, tSettings.m_dWordforms )
16818 			fprintf ( fp, "dictionary-wordform-file [%d]: %s\n", i, tSettings.m_dWordforms[i].cstr () );
16819 
16820 		fprintf ( fp, "dictionary-embedded-wordforms: %d\n", tEmbeddedFiles.m_bEmbeddedWordforms ? 1 : 0 );
16821 		if ( tEmbeddedFiles.m_bEmbeddedWordforms )
16822 		{
16823 			ARRAY_FOREACH ( i, tEmbeddedFiles.m_dWordforms )
16824 				fprintf ( fp, "\tdictionary-embedded-wordform [%d]: %s\n", i, tEmbeddedFiles.m_dWordforms[i].cstr () );
16825 		}
16826 
16827 		fprintf ( fp, "min-stemming-len: %d\n", tSettings.m_iMinStemmingLen );
16828 	}
16829 
16830 	fprintf ( fp, "killlist-size: %u\n", m_uKillListSize );
16831 	fprintf ( fp, "min-max-index: " INT64_FMT "\n", m_iMinMaxIndex );
16832 
16833 	if ( m_pFieldFilter )
16834 	{
16835 		CSphFieldFilterSettings tSettings;
16836 		m_pFieldFilter->GetSettings ( tSettings );
16837 		ARRAY_FOREACH ( i, tSettings.m_dRegexps )
16838 			fprintf ( fp, "field-filter-regexp [%d]: %s\n", i, tSettings.m_dRegexps[i].cstr() );
16839 	}
16840 }
16841 
16842 
DebugDumpDocids(FILE * fp)16843 void CSphIndex_VLN::DebugDumpDocids ( FILE * fp )
16844 {
16845 	if ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN )
16846 	{
16847 		fprintf ( fp, "FATAL: docids dump only supported for docinfo=extern\n" );
16848 		return;
16849 	}
16850 
16851 	const int iRowStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
16852 
16853 	const int64_t iNumMinMaxRow = ( m_uVersion>=20 ) ? ( (m_iDocinfoIndex+1)*iRowStride*2 ) : 0;
16854 	const int64_t iNumRows = (m_tAttr.GetNumEntries()-iNumMinMaxRow) / iRowStride;
16855 
16856 	const int64_t iDocinfoSize = iRowStride*m_iDocinfo*sizeof(DWORD);
16857 	const int64_t iMinmaxSize = iNumMinMaxRow*sizeof(CSphRowitem);
16858 
16859 	fprintf ( fp, "docinfo-bytes: docinfo=" INT64_FMT ", min-max=" INT64_FMT ", total=" UINT64_FMT "\n"
16860 		, iDocinfoSize, iMinmaxSize, (uint64_t)m_tAttr.GetLengthBytes() );
16861 	fprintf ( fp, "docinfo-stride: %d\n", (int)(iRowStride*sizeof(DWORD)) );
16862 	fprintf ( fp, "docinfo-rows: " INT64_FMT "\n", iNumRows );
16863 
16864 	if ( !m_tAttr.GetNumEntries() )
16865 		return;
16866 
16867 	DWORD * pDocinfo = m_tAttr.GetWritePtr();
16868 	for ( int64_t iRow=0; iRow<iNumRows; iRow++, pDocinfo+=iRowStride )
16869 		printf ( INT64_FMT". id=" DOCID_FMT "\n", iRow+1, DOCINFO2ID ( pDocinfo ) );
16870 	printf ( "--- min-max=" INT64_FMT " ---\n", iNumMinMaxRow );
16871 	for ( int64_t iRow=0; iRow<(m_iDocinfoIndex+1)*2; iRow++, pDocinfo+=iRowStride )
16872 		printf ( "id=" DOCID_FMT "\n", DOCINFO2ID ( pDocinfo ) );
16873 }
16874 
16875 
/// dump the doclist/hitlist for a single keyword to fp
/// sKeyword is keyword text, or a numeric wordid when bID=true
/// WITH_QWORD dispatches to the DumpHitlist<> template instantiated
/// for this index's qword flavor
void CSphIndex_VLN::DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
{
	WITH_QWORD ( this, false, Qword, DumpHitlist<Qword> ( fp, sKeyword, bID ) );
}
16880 
16881 
/// dump every (docid, hit) pair for one keyword to fp; debug/diagnostic path
/// that terminates the process (sphDie) on any setup failure
/// sKeyword is tokenized and mapped via the dictionary, unless bID=true in
/// which case it is parsed as a raw numeric wordid
template < class Qword >
void CSphIndex_VLN::DumpHitlist ( FILE * fp, const char * sKeyword, bool bID )
{
	// get keyword id
	SphWordID_t uWordID = 0;
	BYTE * sTok = NULL;
	if ( !bID )
	{
		CSphString sBuf ( sKeyword );

		// run the keyword through the index tokenizer, then the dictionary
		m_pTokenizer->SetBuffer ( (BYTE*)sBuf.cstr(), strlen ( sBuf.cstr() ) );
		sTok = m_pTokenizer->GetToken();

		if ( !sTok )
			sphDie ( "keyword=%s, no token (too short?)", sKeyword );

		uWordID = m_pDict->GetWordID ( sTok );
		if ( !uWordID )
			sphDie ( "keyword=%s, tok=%s, no wordid (stopped?)", sKeyword, sTok );

		fprintf ( fp, "keyword=%s, tok=%s, wordid=" UINT64_FMT "\n", sKeyword, sTok, uint64_t(uWordID) );

	} else
	{
		// bID mode: sKeyword is a decimal wordid
		uWordID = (SphWordID_t) strtoull ( sKeyword, NULL, 10 );
		if ( !uWordID )
			sphDie ( "failed to convert keyword=%s to id (must be integer)", sKeyword );

		fprintf ( fp, "wordid=" UINT64_FMT "\n", uint64_t(uWordID) );
	}

	// open files
	// pre-v.3 indices keep hits inside the doclist (.spd), hence the fallback
	CSphAutofile tDoclist, tHitlist;
	if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
		sphDie ( "failed to open doclist: %s", m_sLastError.cstr() );

	if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
		sphDie ( "failed to open hitlist: %s", m_sLastError.cstr() );

	// aim
	DiskIndexQwordSetup_c tTermSetup ( tDoclist, tHitlist, m_pSkiplists.GetWritePtr(), NULL );
	tTermSetup.m_pDict = m_pDict;
	tTermSetup.m_pIndex = this;
	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
	tTermSetup.m_uMinDocid = m_uMinDocid;
	tTermSetup.m_pMinRow = m_dMinRow.Begin();
	tTermSetup.m_bSetupReaders = true;

	Qword tKeyword ( false, false );
	tKeyword.m_tDoc.m_uDocID = m_uMinDocid;
	tKeyword.m_uWordID = uWordID;
	tKeyword.m_sWord = sKeyword;
	tKeyword.m_sDictWord = (const char *)sTok;
	if ( !tTermSetup.QwordSetup ( &tKeyword ) )
		sphDie ( "failed to setup keyword" );

	int iSize = m_tSchema.GetRowSize();
	CSphVector<CSphRowitem> dAttrs ( iSize );

	// press play on tape
	// iterate all matching docs; docid 0 terminates the doclist
	for ( ;; )
	{
		tKeyword.GetNextDoc ( iSize ? &dAttrs[0] : NULL );
		if ( !tKeyword.m_tDoc.m_uDocID )
			break;
		tKeyword.SeekHitlist ( tKeyword.m_iHitlistPos );

		int iHits = 0;
		if ( tKeyword.m_bHasHitlist )
			for ( Hitpos_t uHit = tKeyword.GetNextHit(); uHit!=EMPTY_HIT; uHit = tKeyword.GetNextHit() )
			{
				fprintf ( fp, "doc=" DOCID_FMT ", hit=0x%08x\n", tKeyword.m_tDoc.m_uDocID, uHit ); // FIXME?
				iHits++;
			}

		if ( !iHits )
		{
			// no hitlist entries; report the raw position instead
			// (top bit of m_iHitlistPos is printed as the "inline" flag,
			// the remaining bits as the offset)
			uint64_t uOff = tKeyword.m_iHitlistPos;
			fprintf ( fp, "doc=" DOCID_FMT ", NO HITS, inline=%d, off=" UINT64_FMT "\n",
				tKeyword.m_tDoc.m_uDocID, (int)(uOff>>63), (uOff<<1)>>1 );
		}
	}
}
16965 
16966 
DebugDumpDict(FILE * fp)16967 void CSphIndex_VLN::DebugDumpDict ( FILE * fp )
16968 {
16969 	if ( !m_pDict->GetSettings().m_bWordDict )
16970 	{
16971 		fprintf ( fp, "sorry, DebugDumpDict() only supports dict=keywords for now\n" );
16972 		return;
16973 	}
16974 
16975 	fprintf ( fp, "keyword,docs,hits,offset\n" );
16976 	ARRAY_FOREACH ( i, m_tWordlist.m_dCheckpoints )
16977 	{
16978 		KeywordsBlockReader_c tCtx ( m_tWordlist.AcquireDict ( &m_tWordlist.m_dCheckpoints[i] ), m_bHaveSkips );
16979 		while ( tCtx.UnpackWord() )
16980 			fprintf ( fp, "%s,%d,%d," INT64_FMT "\n", tCtx.GetWord(), tCtx.m_iDocs, tCtx.m_iHits, int64_t(tCtx.m_iDoclistOffset) );
16981 	}
16982 }
16983 
16984 //////////////////////////////////////////////////////////////////////////
16985 
/// prealloc (but do not yet read) all index data structures: schema/header,
/// wordlist up to the dictionary checkpoints, docinfo (attributes), MVA pool,
/// string pool, kill-list and skiplists; the data itself is loaded later by
/// Preread(). returns false and sets m_sLastError on any failure.
/// bMlock requests mlock() on the shared buffers; bStripPath strips paths
/// from embedded file names in the header
bool CSphIndex_VLN::Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning )
{
	MEMORY ( MEM_INDEX_DISK );

	// reset
	Dealloc ();

	// always keep shared variables flag
	if ( m_dShared.IsEmpty() )
	{
		if ( !m_dShared.Alloc ( SPH_SHARED_VARS_COUNT, m_sLastError, sWarning ) )
			return false;
	}
	memset ( m_dShared.GetWritePtr(), 0, m_dShared.GetLengthBytes() );
	// first two slots of the shared block: preread-done flag, attrs-dirty status
	m_pPreread = m_dShared.GetWritePtr()+0;
	m_pAttrsStatus = m_dShared.GetWritePtr()+1;

	// set new locking flag
	m_tWordlist.m_pBuf.SetMlock ( bMlock );
	m_dAttrShared.SetMlock ( bMlock );
	m_dMvaShared.SetMlock ( bMlock );
	m_dStringShared.SetMlock ( bMlock );
	m_pKillList.SetMlock ( bMlock );
	m_pSkiplists.SetMlock ( bMlock );

	CSphEmbeddedFiles tEmbeddedFiles;

	// preload schema
	if ( !LoadHeader ( GetIndexFileName("sph").cstr(), bStripPath, tEmbeddedFiles, sWarning ) )
		return false;

	tEmbeddedFiles.Reset();

	// verify that data files are readable
	if ( !sphIsReadable ( GetIndexFileName("spd").cstr(), &m_sLastError ) )
		return false;

	// .spp (separate hitlist) only exists since format v.3
	if ( m_uVersion>=3 && !sphIsReadable ( GetIndexFileName("spp").cstr(), &m_sLastError ) )
		return false;

	if ( m_bHaveSkips && !sphIsReadable ( GetIndexFileName("spe").cstr(), &m_sLastError ) )
		return false;

	/////////////////////
	// prealloc wordlist
	/////////////////////

	// try to open wordlist file in all cases
	CSphAutofile tWordlist ( GetIndexFileName("spi"), SPH_O_READ, m_sLastError );
	if ( tWordlist.GetFD()<0 )
		return false;

	m_tWordlist.m_iSize = tWordlist.GetSize ( 1, true, m_sLastError );
	if ( m_tWordlist.m_iSize<0 )
		return false;

	// emptiness check: extern docinfo uses .spa size, otherwise wordlist size
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
	{
		CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
		if ( tDocinfo.GetFD()<0 )
			return false;

		m_bIsEmpty = ( tDocinfo.GetSize ( 0, false, m_sLastError )==0 );
	} else
		m_bIsEmpty = ( m_tWordlist.m_iSize<=1 );

	if ( ( m_tWordlist.m_iSize<=1 )!=( m_tWordlist.m_dCheckpoints.GetLength()==0 ) )
		sphWarning ( "wordlist size mismatch (size=" INT64_FMT ", checkpoints=%d)", m_tWordlist.m_iSize, m_tWordlist.m_dCheckpoints.GetLength() );

	// make sure checkpoints are loadable
	// pre-11 indices use different offset type (this is fixed up later during the loading)
	assert ( m_tWordlist.m_iDictCheckpointsOffset>0 );

	// prealloc wordlist upto checkpoints
	// (keyword blocks aka checkpoints, infix blocks etc will be loaded separately)
	if ( !m_bDebugCheck )
	{
		if ( !m_tWordlist.m_pBuf.Alloc ( m_tWordlist.m_iDictCheckpointsOffset, m_sLastError, sWarning ) )
			return false;
	}

	// preopen
	if ( m_bKeepFilesOpen )
	{
		if ( m_tDoclistFile.Open ( GetIndexFileName("spd"), SPH_O_READ, m_sLastError ) < 0 )
			return false;

		if ( m_tHitlistFile.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, m_sLastError ) < 0 )
			return false;
	}

	/////////////////////
	// prealloc docinfos
	/////////////////////

	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_bIsEmpty )
	{
		/////////////
		// attr data
		/////////////

		assert ( !( m_bOndiskAllAttr || m_bOndiskPoolAttr ) || ( m_uVersion>=20 && !m_bId32to64 ) );
		int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();

		if ( m_bOndiskAllAttr )
		{
			// ondisk_attrs=all: mmap .spa instead of allocating shared memory
			if ( !m_dAttrMapped.Setup ( GetIndexFileName("spa").cstr(), m_sLastError ) )
				return false;

			int64_t iDocinfoSize = m_dAttrMapped.GetLengthBytes();
			if ( iDocinfoSize<0 )
				return false;
			iDocinfoSize = iDocinfoSize / sizeof(DWORD);
			// m_iMinMaxIndex (when set) marks where the min-max rows start
			int64_t iRealDocinfoSize = m_iMinMaxIndex ? m_iMinMaxIndex : iDocinfoSize;
			m_iDocinfo = iRealDocinfoSize / iStride;

			if ( !CheckDocsCount ( m_iDocinfo, m_sLastError ) )
				return false;

			if ( iDocinfoSize < iRealDocinfoSize )
			{
				m_sLastError.SetSprintf ( "precomputed chunk size check mismatch" );
				sphLogDebug ( "precomputed chunk size check mismatch (size=" INT64_FMT ", real=" INT64_FMT ", min-max=" INT64_FMT ", count=" INT64_FMT ")",
					iDocinfoSize, iRealDocinfoSize, m_iMinMaxIndex, m_iDocinfo );
				return false;
			}

			// FIXME!!! preload min-max index
			m_iDocinfoIndex = ( ( iDocinfoSize - iRealDocinfoSize ) / iStride / 2 ) - 1;
			m_pDocinfoIndex = m_dAttrMapped.GetWritePtr() + m_iMinMaxIndex;

			m_tAttr.Set ( m_dAttrMapped.GetWritePtr(), m_dAttrMapped.GetNumEntries() );
		} else
		{
			// regular path: prealloc shared memory sized from the .spa file
			int iStride2 = iStride-1; // id64 - 1 DWORD = id32
			int iEntrySize = sizeof(DWORD)*iStride;

			CSphAutofile tDocinfo ( GetIndexFileName("spa"), SPH_O_READ, m_sLastError );
			if ( tDocinfo.GetFD()<0 )
				return false;

			int64_t iDocinfoSize = tDocinfo.GetSize ( iEntrySize, true, m_sLastError );
			if ( iDocinfoSize<0 )
				return false;
			iDocinfoSize = iDocinfoSize / sizeof(DWORD);
			int64_t iRealDocinfoSize = m_iMinMaxIndex ? m_iMinMaxIndex : iDocinfoSize;
			m_iDocinfo = iRealDocinfoSize / iStride;


			if ( m_bId32to64 )
			{
				// check also the case of id32 here, and correct m_iDocinfo for it
				m_iDocinfo = iRealDocinfoSize / iStride2;
				m_iMinMaxIndex = m_iMinMaxIndex / iStride2 * iStride;
			}

			if ( !CheckDocsCount ( m_iDocinfo, m_sLastError ) )
				return false;

			if ( m_uVersion < 20 )
			{
				// pre-v.20 indices have no stored min-max rows; reserve room
				// for them here, to be computed later by PrecomputeMinMax()
				if ( m_bId32to64 )
					iDocinfoSize = iDocinfoSize / iStride2 * iStride;
				m_iDocinfoIndex = ( m_iDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;

				// prealloc docinfo
				if ( !m_dAttrShared.Alloc ( iDocinfoSize + (m_iDocinfoIndex+1)*iStride*2 + ( m_bId32to64 ? m_iDocinfo : 0 ), m_sLastError, sWarning ) )
					return false;

				m_pDocinfoIndex = m_dAttrShared.GetWritePtr()+iDocinfoSize;
			} else
			{
				if ( iDocinfoSize < iRealDocinfoSize )
				{
					m_sLastError.SetSprintf ( "precomputed chunk size check mismatch" );
					sphLogDebug ( "precomputed chunk size check mismatch (size=" INT64_FMT ", real=" INT64_FMT ", min-max=" INT64_FMT ", count=" INT64_FMT ")",
						iDocinfoSize, iRealDocinfoSize, m_iMinMaxIndex, m_iDocinfo );
					return false;
				}

				m_iDocinfoIndex = ( ( iDocinfoSize - iRealDocinfoSize ) / (m_bId32to64?iStride2:iStride) / 2 ) - 1;

				// prealloc docinfo
				// extra room for the id32->id64 in-place widening done in Preread()
				if ( !m_dAttrShared.Alloc ( iDocinfoSize + ( m_bId32to64 ? ( m_iDocinfo + m_iDocinfoIndex*2 + 2 ) : 0 ), m_sLastError, sWarning ) )
					return false;

				m_pDocinfoIndex = m_dAttrShared.GetWritePtr() + m_iMinMaxIndex;
			}

			m_tAttr.Set ( m_dAttrShared.GetWritePtr(), m_dAttrShared.GetNumEntries() );
		}

		// prealloc docinfo hash but only if docinfo is big enough (in other words if hash is 8x+ less in size)
		if ( m_pDocinfoHash.IsEmpty() && m_dAttrShared.GetLengthBytes() > ( 32 << DOCINFO_HASH_BITS ) && !m_bDebugCheck )
			if ( !m_pDocinfoHash.Alloc ( ( 1 << DOCINFO_HASH_BITS )+4, m_sLastError, sWarning ) )
				return false;


		////////////
		// MVA data
		////////////

		if ( m_uVersion>=4 )
		{
			if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
			{
				// ondisk pools: mmap .spm
				if ( !m_dMvaMapped.Setup ( GetIndexFileName("spm").cstr(), m_sLastError ) )
					return false;

				m_tMva.Set ( m_dMvaMapped.GetWritePtr(), m_dMvaMapped.GetNumEntries() );
			} else
			{
				// if index is v4, .spm must always exist, even though length could be 0
				CSphAutofile fdMva ( GetIndexFileName("spm"), SPH_O_READ, m_sLastError );
				if ( fdMva.GetFD()<0 )
					return false;

				SphOffset_t iMvaSize = fdMva.GetSize ( 0, true, m_sLastError );
				if ( iMvaSize<0 )
					return false;

				// prealloc
				if ( iMvaSize>0 )
					if ( !m_dMvaShared.Alloc ( DWORD(iMvaSize/sizeof(DWORD)), m_sLastError, sWarning ) )
						return false;

				m_tMva.Set ( m_dMvaShared.GetWritePtr(), m_dMvaShared.GetNumEntries() );
			}

			// MVA updates use 32-bit offsets into the pool; disable them past INT_MAX entries
			if ( m_tMva.GetNumEntries()>INT_MAX )
			{
				m_bArenaProhibit = true;
				sphWarning ( "MVA update disabled (loaded MVA " INT64_FMT ", should be less %d)", m_tMva.GetNumEntries(), INT_MAX );
			}
		}

		///////////////
		// string data
		///////////////

		if ( m_uVersion>=17 )
		{
			if ( m_bOndiskAllAttr || m_bOndiskPoolAttr )
			{
				// ondisk pools: mmap .sps
				if ( !m_dStringMapped.Setup ( GetIndexFileName("sps").cstr(), m_sLastError ) )
					return false;

				m_tString.Set ( m_dStringMapped.GetWritePtr(), m_dStringMapped.GetNumEntries() );
			} else
			{
				CSphAutofile fdStrings ( GetIndexFileName("sps"), SPH_O_READ, m_sLastError );
				if ( fdStrings.GetFD()<0 )
					return false;

				SphOffset_t iStringsSize = fdStrings.GetSize ( 0, true, m_sLastError );
				if ( iStringsSize<0 )
					return false;

				// prealloc
				if ( iStringsSize>0 )
					if ( !m_dStringShared.Alloc ( DWORD(iStringsSize), m_sLastError, sWarning ) )
						return false;

				m_tString.Set ( m_dStringShared.GetWritePtr(), m_dStringShared.GetNumEntries() );
			}
		}
	}


	// prealloc killlist
	if ( m_uVersion>=10 )
	{
		CSphAutofile tKillList ( GetIndexFileName("spk"), SPH_O_READ, m_sLastError );
		if ( tKillList.GetFD()<0 )
			return false;

		SphOffset_t iSize = tKillList.GetSize ( 0, true, m_sLastError );
		if ( iSize<0 )
			return false;

		// cross-check header k-list entry count against the .spk file size
		// (id32 files store half-width docids, hence the /2)
		SphOffset_t uKlistBytes = m_uKillListSize*sizeof(SphDocID_t);
		if ( m_bId32to64 )
			uKlistBytes /= 2;

		if ( iSize!=uKlistBytes )
		{
			m_sLastError.SetSprintf ( "header k-list size does not match .spk size (klist=" INT64_FMT ", spk=" INT64_FMT ")",
				(int64_t)( uKlistBytes ),
				(int64_t) iSize );
			return false;
		}

		// double the slot count so the id32 data can be widened in place later
		if ( m_bId32to64 )
			m_uKillListSize *= 2;

		// prealloc
		if ( iSize>0 && !m_pKillList.Alloc ( m_uKillListSize, m_sLastError, sWarning ) )
			return false;
	}

	// prealloc skiplist
	if ( m_bHaveSkips && !m_bDebugCheck )
	{
		CSphAutofile fdSkips ( GetIndexFileName("spe"), SPH_O_READ, m_sLastError );
		if ( fdSkips.GetFD()<0 )
			return false;

		SphOffset_t iSize = fdSkips.GetSize ( 0, true, m_sLastError );
		if ( iSize<0 )
			return false;

		if ( iSize>0 && !m_pSkiplists.Alloc ( iSize, m_sLastError, sWarning ) )
			return false;
	}

	bool bWordDict = false;
	if ( m_pDict )
		bWordDict = m_pDict->GetSettings().m_bWordDict;

	// preload checkpoints (must be done here as they are not shared)
	if 	( !m_tWordlist.ReadCP ( tWordlist, m_uVersion, bWordDict, m_sLastError ) )
	{
		m_sLastError.SetSprintf ( "failed to read %s: %s", GetIndexFileName("spi").cstr(), m_sLastError.cstr () );
		return false;
	}

	// all done
	m_bPreallocated = true;
	m_iIndexTag = ++m_iIndexTagSeq;
	return true;
}
17317 
17318 
PrereadSharedBuffer(CSphSharedBuffer<T> & pBuffer,const char * sExt,int64_t iExpected,int64_t iOffset)17319 template < typename T > bool CSphIndex_VLN::PrereadSharedBuffer ( CSphSharedBuffer<T> & pBuffer,
17320 	const char * sExt, int64_t iExpected, int64_t iOffset )
17321 {
17322 	sphLogDebug ( "prereading .%s", sExt );
17323 
17324 	if ( !pBuffer.GetLengthBytes() )
17325 		return true;
17326 
17327 	CSphAutofile fdBuf ( GetIndexFileName(sExt), SPH_O_READ, m_sLastError );
17328 	if ( fdBuf.GetFD()<0 )
17329 		return false;
17330 
17331 	fdBuf.SetProgressCallback ( &m_tProgress );
17332 	if ( iExpected==0 )
17333 		iExpected = int64_t ( pBuffer.GetLengthBytes() ) - iOffset*sizeof(T);
17334 	return fdBuf.Read ( pBuffer.GetWritePtr() + iOffset, iExpected, m_sLastError );
17335 }
17336 
17337 
Preread()17338 bool CSphIndex_VLN::Preread ()
17339 {
17340 	MEMORY ( MEM_INDEX_DISK );
17341 
17342 	sphLogDebug ( "CSphIndex_VLN::Preread invoked" );
17343 	if ( !m_bPreallocated )
17344 	{
17345 		m_sLastError = "INTERNAL ERROR: not preallocated";
17346 		return false;
17347 	}
17348 	if ( !m_pPreread || *m_pPreread )
17349 	{
17350 		m_sLastError = "INTERNAL ERROR: already preread";
17351 		return false;
17352 	}
17353 
17354 	///////////////////
17355 	// read everything
17356 	///////////////////
17357 
17358 	m_tProgress.m_ePhase = CSphIndexProgress::PHASE_PREREAD;
17359 	m_tProgress.m_iBytes = 0;
17360 	m_tProgress.m_iBytesTotal = m_pKillList.GetLengthBytes() + m_pSkiplists.GetLengthBytes();
17361 	if ( !m_bOndiskAllAttr )
17362 		m_tProgress.m_iBytesTotal += m_tAttr.GetLengthBytes();
17363 	if ( !m_bOndiskAllAttr && !m_bOndiskPoolAttr )
17364 		m_tProgress.m_iBytesTotal += m_tMva.GetLengthBytes() + m_tString.GetLengthBytes();
17365 
17366 	m_tProgress.m_iBytesTotal += m_tWordlist.m_pBuf.GetLengthBytes();
17367 
17368 	int64_t iExpected = 0;
17369 	if ( m_uVersion<20 )
17370 	{
17371 		int iDwordsInID = m_bId32to64 ? 1 : DOCINFO_IDSIZE;
17372 		iExpected = m_iDocinfo * ( iDwordsInID + m_tSchema.GetRowSize() ) * sizeof(DWORD);
17373 	}
17374 
17375 	int64_t iOffset = 0;
17376 	if ( m_bId32to64 )
17377 		iOffset = m_iDocinfo + 2 + m_iDocinfoIndex*2;
17378 
17379 	if ( !m_bOndiskAllAttr )
17380 	{
17381 		if ( !PrereadSharedBuffer ( m_dAttrShared, "spa", iExpected, iOffset ) )
17382 				return false;
17383 	}
17384 	if ( !( m_bOndiskAllAttr || m_bOndiskPoolAttr ) )
17385 	{
17386 		if ( !PrereadSharedBuffer ( m_dMvaShared, "spm" ) )
17387 			return false;
17388 		if ( !PrereadSharedBuffer ( m_dStringShared, "sps" ) )
17389 			return false;
17390 	}
17391 
17392 	int iKillListOffset = m_bId32to64 ? m_pKillList.GetLengthBytes()/2/sizeof(SphDocID_t) : 0;
17393 	if ( !PrereadSharedBuffer ( m_pKillList, "spk", iKillListOffset, iKillListOffset ) )
17394 		return false;
17395 	if ( !m_bDebugCheck && !PrereadSharedBuffer ( m_pSkiplists, "spe" ) )
17396 		return false;
17397 
17398 #if PARANOID
17399 	for ( int i=1; i<(int)m_uKillListSize; i++ )
17400 		assert ( m_pKillList [ i-1 ]<m_pKillList[i] );
17401 #endif
17402 
17403 	// preload wordlist
17404 	// FIXME! OPTIMIZE! can skip checkpoints
17405 	sphLogDebug ( "Prereading .spi" );
17406 	if ( !PrereadSharedBuffer ( m_tWordlist.m_pBuf, "spi" ) )
17407 		return false;
17408 
17409 	m_tProgress.Show ( true );
17410 
17411 	//////////////////////
17412 	// precalc everything
17413 	//////////////////////
17414 
17415 	if ( m_pKillList.GetLengthBytes() && m_bId32to64 )
17416 	{
17417 		SphDocID_t * pDst = m_pKillList.GetWritePtr();
17418 		const DWORD * pSrc = reinterpret_cast<const DWORD*> ( pDst ) + m_uKillListSize;
17419 		for ( DWORD i=0; i<m_uKillListSize; ++i )
17420 			*pDst++ = *pSrc++;
17421 	}
17422 
17423 	// convert id32 to id64
17424 	if ( m_dAttrShared.GetLengthBytes() && m_bId32to64 ) // FIXME!!! can't 32->64 convert ondisk_attrs
17425 	{
17426 		DWORD * pTarget = m_dAttrShared.GetWritePtr();
17427 		const DWORD * pSource = pTarget + m_iDocinfo + 2 + m_iDocinfoIndex * 2;
17428 		int iStride = m_tSchema.GetRowSize();
17429 		SphDocID_t uDoc;
17430 		int64_t iLimit = m_iDocinfo + ( ( m_uVersion < 20 ) ? 0 : m_iDocinfoIndex * 2 + 2 );
17431 		for ( int64_t i=0; i<iLimit; i++ )
17432 		{
17433 			uDoc = *pSource; ///< wide id32 to id64
17434 			DOCINFOSETID ( pTarget, uDoc );
17435 			memcpy ( pTarget + DOCINFO_IDSIZE, pSource + 1, iStride * sizeof(DWORD) );
17436 			pSource += iStride+1;
17437 			pTarget += iStride+DOCINFO_IDSIZE;
17438 		}
17439 		sphWarning ( "id32 index loaded by id64 binary; attributes converted" );
17440 	}
17441 
17442 	// build attributes hash
17443 	if ( m_tAttr.GetLengthBytes() && m_pDocinfoHash.GetLengthBytes() && !m_bDebugCheck )
17444 	{
17445 		sphLogDebug ( "Hashing docinfo" );
17446 		assert ( CheckDocsCount ( m_iDocinfo, m_sLastError ) );
17447 		int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
17448 		SphDocID_t uFirst = DOCINFO2ID ( &m_tAttr[0] );
17449 		SphDocID_t uRange = DOCINFO2ID ( &m_tAttr[ ( m_iDocinfo-1)*iStride ] ) - uFirst;
17450 		DWORD iShift = 0;
17451 		while ( uRange>=( 1 << DOCINFO_HASH_BITS ) )
17452 		{
17453 			iShift++;
17454 			uRange >>= 1;
17455 		}
17456 
17457 		DWORD * pHash = m_pDocinfoHash.GetWritePtr();
17458 		*pHash++ = iShift;
17459 		*pHash = 0;
17460 		DWORD uLastHash = 0;
17461 
17462 		for ( int64_t i=1; i<m_iDocinfo; i++ )
17463 		{
17464 			assert ( DOCINFO2ID ( &m_tAttr[ i*iStride ] )>uFirst
17465 				&& DOCINFO2ID ( &m_tAttr[ ( i-1 )*iStride ] ) < DOCINFO2ID ( &m_tAttr[ i*iStride ] )
17466 				&& "descending document ID found" );
17467 			DWORD uHash = (DWORD)( ( DOCINFO2ID ( &m_tAttr[ i*iStride ] ) - uFirst ) >> iShift );
17468 			if ( uHash==uLastHash )
17469 				continue;
17470 
17471 			while ( uLastHash<uHash )
17472 				pHash [ ++uLastHash ] = (DWORD)i;
17473 
17474 			uLastHash = uHash;
17475 		}
17476 		pHash [ ++uLastHash ] = (DWORD)m_iDocinfo;
17477 	}
17478 
17479 	// persist MVA needs valid DocinfoHash
17480 	sphLogDebug ( "Prereading .mvp" );
17481 	if ( !LoadPersistentMVA ( m_sLastError ) )
17482 		return false;
17483 
17484 	// build "indexes" for full-scan
17485 	if ( m_uVersion < 20 && !PrecomputeMinMax() )
17486 		return false;
17487 
17488 	// paranoid MVA verification
17489 #if PARANOID
17490 	// find out what attrs are MVA
17491 	CSphVector<int> dMvaRowitem;
17492 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
17493 	{
17494 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
17495 		if ( tCol.m_eAttrType==SPH_ATTR_UINT32SET )
17496 			dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
17497 	}
17498 	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
17499 	{
17500 		const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
17501 		if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
17502 			dMvaRowitem.Add ( tCol.m_tLocator.m_iBitOffset/ROWITEM_BITS );
17503 	}
17504 
17505 	// for each docinfo entry, verify that MVA attrs point to right storage location
17506 	int iStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
17507 	for ( int64_t iDoc=0; iDoc<m_iDocinfo && dMvaRowitem.GetLength(); iDoc++ )
17508 	{
17509 		CSphRowitem * pRow = m_tAttr.GetWritePtr() + ( iDoc*iStride );
17510 		CSphRowitem * pAttrs = DOCINFO2ATTRS(pRow);
17511 		SphDocID_t uDocID = DOCINFO2ID(pRow);
17512 
17513 		DWORD uOff = pAttrs[ dMvaRowitem[0] ];
17514 		if ( !uOff )
17515 		{
17516 			// its either all or nothing
17517 			ARRAY_FOREACH ( i, dMvaRowitem )
17518 				assert ( pAttrs[ dMvaRowitem[i] ]==0 );
17519 		} else if ( !( uOff & MVA_ARENA_FLAG ) )
17520 		{
17521 			assert ( uDocID==DOCINFO2ID ( m_tMva.GetWritePtr() + uOff - DOCINFO_IDSIZE ) );
17522 
17523 			// walk the trail
17524 			ARRAY_FOREACH ( i, dMvaRowitem )
17525 			{
17526 				assert ( pAttrs[ dMvaRowitem[i] ]==uOff );
17527 				int iCount = m_tMva[uOff];
17528 				uOff += 1+iCount;
17529 			}
17530 		}
17531 	}
17532 #endif // PARANOID
17533 
17534 	*m_pPreread = 1;
17535 	sphLogDebug ( "Preread successfully finished" );
17536 	return true;
17537 }
17538 
17539 
// set the new on-disk base path (file name prefix) for this index;
// only updates the stored name, does not touch or move any files (see Rename())
void CSphIndex_VLN::SetBase ( const char * sNewBase )
{
	m_sFilename = sNewBase;
}
17544 
17545 
// rename all on-disk index files from the current base name to sNewBase
// on full success switches the stored base via SetBase() and returns true;
// on any failure attempts to rename already-moved files back (best-effort
// rollback), leaves m_sLastError set, and returns false
bool CSphIndex_VLN::Rename ( const char * sNewBase )
{
	if ( m_sFilename==sNewBase )
		return true;

	// try to rename everything
	char sFrom [ SPH_MAX_FILENAME_LEN ];
	char sTo [ SPH_MAX_FILENAME_LEN ];

	// +1 for ".spl"
	int iExtCount = sphGetExtCount() + 1;
	const char ** sExts = sphGetExts ( SPH_EXT_TYPE_LOC );
	DWORD uMask = 0; // bit N set => extension N was renamed; drives the rollback loop

	int iExt; // declared outside the loop: checked after it to detect an early break
	for ( iExt=0; iExt<iExtCount; iExt++ )
	{
		const char * sExt = sExts[iExt];
		// skip files that this index format version simply does not have
		if ( !strcmp ( sExt, ".spp" ) && m_uVersion<3 ) // .spp files are v3+
			continue;
		if ( !strcmp ( sExt, ".spm" ) && m_uVersion<4 ) // .spm files are v4+
			continue;
		if ( !strcmp ( sExt, ".spk" ) && m_uVersion<10 ) // .spk files are v10+
			continue;
		if ( !strcmp ( sExt, ".sps" ) && m_uVersion<17 ) // .sps files are v17+
			continue;
		if ( !strcmp ( sExt, ".spe" ) && m_uVersion<31 ) // .spe files are v31+
			continue;

#if !USE_WINDOWS
		if ( !strcmp ( sExt, ".spl" ) && m_iLockFD<0 ) // .spl files are locks
			continue;
#else
		// on Windows the held lock file is closed and unlinked instead of renamed
		if ( !strcmp ( sExt, ".spl" ) )
		{
			if ( m_iLockFD>=0 )
			{
				::close ( m_iLockFD );
				::unlink ( GetIndexFileName("spl").cstr() );
				sphLogDebug ( "lock %s unlinked, file with ID %d closed", GetIndexFileName("spl").cstr(), m_iLockFD );
				m_iLockFD = -1;
			}
			continue;
		}
#endif

		snprintf ( sFrom, sizeof(sFrom), "%s%s", m_sFilename.cstr(), sExt );
		snprintf ( sTo, sizeof(sTo), "%s%s", sNewBase, sExt );

#if USE_WINDOWS
		// Windows rename() fails when the target exists, so remove it first
		::unlink ( sTo );
		sphLogDebug ( "%s unlinked", sTo );
#endif

		if ( ::rename ( sFrom, sTo ) )
		{
			m_sLastError.SetSprintf ( "rename %s to %s failed: %s", sFrom, sTo, strerror(errno) );
			// this is no reason to fail if spl is missing, since it is only lock and no data.
			if ( strcmp ( sExt, ".spl" ) )
				break;
		}
		uMask |= ( 1UL << iExt );
	}

	// are we good?
	if ( iExt==iExtCount )
	{
		SetBase ( sNewBase );
		sphLogDebug ( "Base set to %s", sNewBase );
		return true;
	}

	// if there were errors, rollback
	for ( iExt=0; iExt<iExtCount; iExt++ )
	{
		if (!( uMask & ( 1UL << iExt ) ))
			continue;

		const char * sExt = sExts[iExt];
		snprintf ( sFrom, sizeof(sFrom), "%s%s", sNewBase, sExt );
		snprintf ( sTo, sizeof(sTo), "%s%s", m_sFilename.cstr(), sExt );
		if ( ::rename ( sFrom, sTo ) )
		{
			sphLogDebug ( "Rollback failure when renaming %s to %s", sFrom, sTo );
			// !COMMIT should handle rollback failures somehow
		}
	}
	return false;
}
17635 
17636 //////////////////////////////////////////////////////////////////////////
17637 
CSphQueryContext()17638 CSphQueryContext::CSphQueryContext ()
17639 {
17640 	m_iWeights = 0;
17641 	m_bLookupFilter = false;
17642 	m_bLookupSort = false;
17643 	m_uPackedFactorFlags = SPH_FACTOR_DISABLE;
17644 	m_pFilter = NULL;
17645 	m_pWeightFilter = NULL;
17646 	m_pIndexData = NULL;
17647 	m_pProfile = NULL;
17648 	m_pLocalDocs = NULL;
17649 	m_iTotalDocs = 0;
17650 	m_iBadRows = 0;
17651 }
17652 
CSphQueryContext::~CSphQueryContext ()
{
	// filters are owned by the context
	SafeDelete ( m_pFilter );
	SafeDelete ( m_pWeightFilter );

	// release the refcounted uservar value sets acquired in CreateFilters()
	ARRAY_FOREACH ( i, m_dUserVals )
		m_dUserVals[i]->Release();
}
17661 
BindWeights(const CSphQuery * pQuery,const CSphSchema & tSchema,CSphString & sWarning)17662 void CSphQueryContext::BindWeights ( const CSphQuery * pQuery, const CSphSchema & tSchema, CSphString & sWarning )
17663 {
17664 	const int MIN_WEIGHT = 1;
17665 	// const int HEAVY_FIELDS = 32;
17666 	const int HEAVY_FIELDS = SPH_MAX_FIELDS;
17667 
17668 	// defaults
17669 	m_iWeights = Min ( tSchema.m_dFields.GetLength(), HEAVY_FIELDS );
17670 	for ( int i=0; i<m_iWeights; i++ )
17671 		m_dWeights[i] = MIN_WEIGHT;
17672 
17673 	// name-bound weights
17674 	CSphString sFieldsNotFound;
17675 	if ( pQuery->m_dFieldWeights.GetLength() )
17676 	{
17677 		ARRAY_FOREACH ( i, pQuery->m_dFieldWeights )
17678 		{
17679 			int j = tSchema.GetFieldIndex ( pQuery->m_dFieldWeights[i].m_sName.cstr() );
17680 			if ( j<0 )
17681 			{
17682 				if ( sFieldsNotFound.IsEmpty() )
17683 					sFieldsNotFound = pQuery->m_dFieldWeights[i].m_sName;
17684 				else
17685 					sFieldsNotFound.SetSprintf ( "%s %s", sFieldsNotFound.cstr(), pQuery->m_dFieldWeights[i].m_sName.cstr() );
17686 			}
17687 
17688 			if ( j>=0 && j<HEAVY_FIELDS )
17689 				m_dWeights[j] = Max ( MIN_WEIGHT, pQuery->m_dFieldWeights[i].m_iValue );
17690 		}
17691 
17692 		if ( !sFieldsNotFound.IsEmpty() )
17693 			sWarning.SetSprintf ( "Fields specified in field_weights option not found: [%s]", sFieldsNotFound.cstr() );
17694 
17695 		return;
17696 	}
17697 
17698 	// order-bound weights
17699 	if ( pQuery->m_pWeights )
17700 	{
17701 		for ( int i=0; i<Min ( m_iWeights, pQuery->m_iWeights ); i++ )
17702 			m_dWeights[i] = Max ( MIN_WEIGHT, (int)pQuery->m_pWeights[i] );
17703 	}
17704 }
17705 
SortedVectorsContain(const CSphVector<int> & dVec1,const CSphVector<int> & dVec2)17706 static bool SortedVectorsContain ( const CSphVector<int> & dVec1, const CSphVector<int> & dVec2 )
17707 {
17708 	const int * pSrc1 = dVec1.Begin ();
17709 	const int * pEnd1 = pSrc1 + dVec1.GetLength ();
17710 	const int * pSrc2 = dVec2.Begin ();
17711 	const int * pEnd2 = pSrc2 + dVec2.GetLength ();
17712 
17713 	while ( pSrc1!=pEnd1 && pSrc2!=pEnd2 )
17714 	{
17715 		if ( *pSrc1==*pSrc2 )
17716 			return true;
17717 
17718 		if ( *pSrc1<*pSrc2 )
17719 			pSrc1++;
17720 		else
17721 			pSrc2++;
17722 	}
17723 
17724 	return false;
17725 }
17726 
// build the per-stage computed-expression lists (m_dCalcFilter / m_dCalcSort /
// m_dCalcFinal) from the incoming sorter schema, verifying it is compatible
// with the index schema; returns false and sets pResult->m_sError on mismatch
bool CSphQueryContext::SetupCalc ( CSphQueryResult * pResult, const ISphSchema & tInSchema,
									const CSphSchema & tSchema, const DWORD * pMvaPool, bool bArenaProhibit, bool bExtractPostAggr )
{
	m_dCalcFilter.Resize ( 0 );
	m_dCalcSort.Resize ( 0 );
	m_dCalcFinal.Resize ( 0 );

	// quickly verify that all my real attributes can be stashed there
	if ( tInSchema.GetAttrsCount() < tSchema.GetAttrsCount() )
	{
		pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (incount=%d, mycount=%d)",
			tInSchema.GetAttrsCount(), tSchema.GetAttrsCount() );
		return false;
	}

	bool bGotAggregate = false;

	// now match everyone
	for ( int iIn=0; iIn<tInSchema.GetAttrsCount(); iIn++ )
	{
		const CSphColumnInfo & tIn = tInSchema.GetAttr(iIn);
		bGotAggregate |= ( tIn.m_eAggrFunc!=SPH_AGGR_NONE );

		// route each incoming attribute by the stage at which it gets computed
		switch ( tIn.m_eStage )
		{
			case SPH_EVAL_STATIC:
			case SPH_EVAL_OVERRIDE:
			{
				// static attrs and overrides require nothing at query time;
				// debug builds only cross-check them against the index schema
				// this check may significantly slow down queries with huge schema attribute count
#ifndef NDEBUG
				const CSphColumnInfo * pMy = tSchema.GetAttr ( tIn.m_sName.cstr() );
				if ( !pMy )
				{
					pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema attr missing from index-schema (in=%s)",
						sphDumpAttr(tIn).cstr() );
					return false;
				}

				if ( tIn.m_eStage==SPH_EVAL_OVERRIDE )
				{
					// override; check for type/size match and dynamic part
					if ( tIn.m_eAttrType!=pMy->m_eAttrType
						|| tIn.m_tLocator.m_iBitCount!=pMy->m_tLocator.m_iBitCount
						|| !tIn.m_tLocator.m_bDynamic )
					{
						pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema override mismatch (in=%s, my=%s)",
							sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
						return false;
					}
				} else
				{
					// static; check for full match
					if (!( tIn==*pMy ))
					{
						pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema mismatch (in=%s, my=%s)",
							sphDumpAttr(tIn).cstr(), sphDumpAttr(*pMy).cstr() );
						return false;
					}
				}
#endif
				break;
			}

			case SPH_EVAL_PREFILTER:
			case SPH_EVAL_PRESORT:
			case SPH_EVAL_FINAL:
			{
				ISphExpr * pExpr = tIn.m_pExpr.Ptr();
				if ( !pExpr )
				{
					pResult->m_sError.SetSprintf ( "INTERNAL ERROR: incoming-schema expression missing evaluator (stage=%d, in=%s)",
						(int)tIn.m_eStage, sphDumpAttr(tIn).cstr() );
					return false;
				}

				// an expression that index/searcher should compute
				CalcItem_t tCalc;
				tCalc.m_eType = tIn.m_eAttrType;
				tCalc.m_tLoc = tIn.m_tLocator;
				tCalc.m_pExpr = pExpr;
				// hand the MVA pool to the expression before it is ever evaluated
				PoolPtrs_t tMva;
				tMva.m_pMva = pMvaPool;
				tMva.m_bArenaProhibit = bArenaProhibit;
				tCalc.m_pExpr->Command ( SPH_EXPR_SET_MVA_POOL, &tMva );

				// file the item under the list matching its evaluation stage
				switch ( tIn.m_eStage )
				{
					case SPH_EVAL_PREFILTER:	m_dCalcFilter.Add ( tCalc ); break;
					case SPH_EVAL_PRESORT:		m_dCalcSort.Add ( tCalc ); break;
					case SPH_EVAL_FINAL:		m_dCalcFinal.Add ( tCalc ); break;

					default:					break;
				}
				break;
			}

			case SPH_EVAL_SORTER:
				// sorter tells it will compute itself; so just skip it
			case SPH_EVAL_POSTLIMIT:
				break;

			default:
				pResult->m_sError.SetSprintf ( "INTERNAL ERROR: unhandled eval stage=%d", (int)tIn.m_eStage );
				return false;
		}
	}

	// move some items from final calc to post-aggrerate for RT index
	if ( bExtractPostAggr && bGotAggregate && m_dCalcFinal.GetLength () )
	{
		// collect incoming-schema indexes of all aggregate attributes
		CSphVector<int> dAggrs;
		for ( int i=0; i<tInSchema.GetAttrsCount (); i++ )
		{
			if ( tInSchema.GetAttr ( i ).m_eAggrFunc!=SPH_AGGR_NONE )
				dAggrs.Add ( i );
		}

		CSphVector<int> dCur;
		ARRAY_FOREACH ( i, m_dCalcFinal )
		{
			const CalcItem_t & tFinal = m_dCalcFinal[i];
			if ( !tFinal.m_pExpr )
				continue;

			// gather the columns this final expression depends on
			dCur.Resize ( 0 );
			tFinal.m_pExpr->Command ( SPH_EXPR_GET_DEPENDENT_COLS, &dCur );

			// handle chains of dependencies (e.g. SELECT 1+attr f1, f1-1 f2 ... )
			// note: dCur may be appended to while we walk it, unfolding the recursion
			ARRAY_FOREACH ( j, dCur )
			{
				const CSphColumnInfo & tCol = tInSchema.GetAttr ( dCur[j] );
				if ( tCol.m_pExpr.Ptr () )
					tCol.m_pExpr->Command ( SPH_EXPR_GET_DEPENDENT_COLS, &dCur );
			}
			dCur.Sort ();

			// if it depends on any aggregate, defer it to the post-aggregate stage
			if ( SortedVectorsContain ( dAggrs, dCur ) )
			{
				m_dCalcPostAggregate.Add ( tFinal );
				m_dCalcFinal.Remove ( i );
				i--;
			}
		}
	}

	// ok, we can emit matches in this schema (incoming for sorter, outgoing for index/searcher)
	return true;
}
17875 
17876 
IsStarDict() const17877 bool CSphIndex_VLN::IsStarDict () const
17878 {
17879 	return (
17880 		( m_uVersion>=7 && ( m_tSettings.m_iMinPrefixLen>0 || m_tSettings.m_iMinInfixLen>0 ) ) || // v.7 added mangling to infixes
17881 		( m_uVersion==6 && ( m_tSettings.m_iMinPrefixLen>0 ) ) ); // v.6 added mangling to prefixes
17882 }
17883 
17884 
SetupStarDict(CSphScopedPtr<CSphDict> & tContainer,CSphDict * pPrevDict) const17885 CSphDict * CSphIndex_VLN::SetupStarDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const
17886 {
17887 	// spawn wrapper, and put it in the box
17888 	// wrapper type depends on version; v.8 introduced new mangling rules
17889 	if ( !IsStarDict() )
17890 		return pPrevDict;
17891 	if ( m_uVersion>=8 )
17892 		tContainer = new CSphDictStarV8 ( pPrevDict, m_tSettings.m_iMinPrefixLen>0, m_tSettings.m_iMinInfixLen>0 );
17893 	else
17894 		tContainer = new CSphDictStar ( pPrevDict );
17895 
17896 	// FIXME? might wanna verify somehow that the tokenizer has '*' as a character
17897 	return tContainer.Ptr();
17898 }
17899 
17900 
SetupExactDict(CSphScopedPtr<CSphDict> & tContainer,CSphDict * pPrevDict) const17901 CSphDict * CSphIndex_VLN::SetupExactDict ( CSphScopedPtr<CSphDict> & tContainer, CSphDict * pPrevDict ) const
17902 {
17903 	if ( m_uVersion<12 || !m_tSettings.m_bIndexExactWords )
17904 		return pPrevDict;
17905 
17906 	tContainer = new CSphDictExact ( pPrevDict );
17907 	return tContainer.Ptr();
17908 }
17909 
17910 
// public keyword-enumeration entry point: tokenize szQuery and append one
// entry per keyword, optionally with per-keyword docs/hits stats
bool CSphIndex_VLN::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
	const char * szQuery, bool bGetStats, CSphString * pError ) const
{
	// WITH_QWORD dispatches to DoGetKeywords with the Qword flavor matching this index
	WITH_QWORD ( this, false, Qword, return DoGetKeywords<Qword> ( dKeywords, szQuery, bGetStats, false, pError ) );
	return false;
}
17917 
17918 
sphParseMorphAot(const char * sMorphology)17919 DWORD sphParseMorphAot ( const char * sMorphology )
17920 {
17921 	if ( !sMorphology || !*sMorphology )
17922 		return 0;
17923 
17924 	CSphVector<CSphString> dMorphs;
17925 	sphSplit ( dMorphs, sMorphology );
17926 
17927 	DWORD uAotFilterMask = 0;
17928 	for ( int j=0; j<AOT_LENGTH; ++j )
17929 	{
17930 		char buf_all[20];
17931 		sprintf ( buf_all, "lemmatize_%s_all", AOT_LANGUAGES[j] ); // NOLINT
17932 		ARRAY_FOREACH ( i, dMorphs )
17933 		{
17934 			if ( dMorphs[i]==buf_all )
17935 			{
17936 				uAotFilterMask |= (1UL) << j;
17937 				break;
17938 			}
17939 		}
17940 	}
17941 
17942 	return uAotFilterMask;
17943 }
17944 
17945 
ISphQueryFilter()17946 ISphQueryFilter::ISphQueryFilter ()
17947 {
17948 	m_pTokenizer = NULL;
17949 	m_pDict = NULL;
17950 	m_pSettings = NULL;
17951 }
17952 
17953 
// out-of-line empty dtor; nothing to release here, since the tokenizer, dict
// and settings pointers are borrowed (see the ctor), not owned
ISphQueryFilter::~ISphQueryFilter ()
{
}
17957 
17958 
// tokenize the buffer previously set on m_pTokenizer and append one entry per
// keyword via AddKeywordStats(); when an AOT lemmatizer is enabled, each
// original keyword is additionally expanded into its lemma forms and the
// original entry is replaced by the expansions
void ISphQueryFilter::GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords )
{
	assert ( m_pTokenizer && m_pDict && m_pSettings );

	BYTE sTokenized[3*SPH_MAX_WORD_LEN+4];
	BYTE * sWord;
	int iQpos = 1; // query positions are 1-based

	// FIXME!!! got rid of duplicated term stat and qword setup
	while ( ( sWord = m_pTokenizer->GetToken() )!=NULL )
	{
		// prefer the original multiform source text for display when available
		const BYTE * sMultiform = m_pTokenizer->GetTokenizedMultiform();
		strncpy ( (char *)sTokenized, sMultiform ? (const char*)sMultiform : (const char*)sWord, sizeof(sTokenized) );

		AddKeywordStats ( sWord, sTokenized, iQpos, dKeywords );

		// FIXME!!! handle consecutive blended wo blended parts
		// blended tokens share the position of the token that follows them
		if ( !m_pTokenizer->TokenIsBlended() )
			iQpos++;
	}


	// done unless some AOT lemmatizer language is enabled
	if ( !m_pSettings->m_uAotFilterMask )
		return;

	XQLimitSpec_t tSpec;
	BYTE sTmp[3*SPH_MAX_WORD_LEN+4];
	BYTE sTmp2[3*SPH_MAX_WORD_LEN+4];
	CSphVector<XQNode_t *> dChildren ( 64 );

	// only expand the originally tokenized entries; entries appended during
	// expansion (beyond iTokenizedTotal) are not expanded themselves
	int iTokenizedTotal = dKeywords.GetLength();
	for ( int iTokenized=0; iTokenized<iTokenizedTotal; iTokenized++ )
	{
		int iQpos = dKeywords[iTokenized].m_iQpos;
		// MUST copy as Dict::GetWordID changes word and might add symbols
		strncpy ( (char *)sTokenized, dKeywords[iTokenized].m_sNormalized.scstr(), sizeof(sTokenized) );
		int iPreAotCount = dKeywords.GetLength();

		// build a one-word query node and run the AOT transform over it
		XQNode_t tAotNode ( tSpec );
		tAotNode.m_dWords.Resize ( 1 );
		tAotNode.m_dWords.Begin()->m_sWord = (char *)sTokenized;
		TransformAotFilter ( &tAotNode, m_pDict->GetWordforms(), *m_pSettings );

		dChildren.Resize ( 0 );
		dChildren.Add ( &tAotNode );

		// recursion unfolded
		ARRAY_FOREACH ( iChild, dChildren )
		{
			// process all words at node
			ARRAY_FOREACH ( iAotKeyword, dChildren[iChild]->m_dWords )
			{
				// MUST copy as Dict::GetWordID changes word and might add symbols
				strncpy ( (char *)sTmp, dChildren[iChild]->m_dWords[iAotKeyword].m_sWord.scstr(), sizeof(sTmp) );
				// prevent use-after-free-bug due to vector grow: AddKeywordsStats() calls dKeywords.Add()
				strncpy ( (char *)sTmp2, dKeywords[iTokenized].m_sTokenized.scstr(), sizeof(sTmp2) );
				AddKeywordStats ( sTmp, sTmp2, iQpos, dKeywords );
			}

			// push all child nodes at node to process list
			const XQNode_t * pChild = dChildren[iChild];
			ARRAY_FOREACH ( iRec, pChild->m_dChildren )
				dChildren.Add ( pChild->m_dChildren[iRec] );
		}

		// remove (replace) original word in case of AOT taken place
		// (swap with the last appended expansion, then shrink by one)
		if ( iPreAotCount!=dKeywords.GetLength() )
		{
			::Swap ( dKeywords[iTokenized], dKeywords.Last() );
			dKeywords.Resize ( dKeywords.GetLength()-1 );
		}
	}

	// sort by qpos
	if ( dKeywords.GetLength()!=iTokenizedTotal )
		sphSort ( dKeywords.Begin(), dKeywords.GetLength(), bind ( &CSphKeywordInfo::m_iQpos ) );
}
18036 
18037 
// keyword filter that appends one CSphKeywordInfo per accepted word and,
// optionally, looks up per-term docs/hits stats through the supplied qword
// setup; used by DoGetKeywords() for both the plain and fill-only paths
struct CSphPlainQueryFilter : public ISphQueryFilter
{
	const ISphQwordSetup *	m_pTermSetup;	// borrowed; required when m_bGetStats is set
	ISphQword *				m_pQueryWord;	// borrowed scratch qword, reused across keywords
	bool					m_bGetStats;	// whether to fill m_iDocs/m_iHits

	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords )
	{
		assert ( !m_bGetStats || ( m_pTermSetup && m_pQueryWord ) );

		// normalize through the dict; a zero id means the word got filtered out
		// (note: GetWordID may modify sWord in place)
		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
		if ( !iWord )
			return;

		if ( m_bGetStats )
		{
			m_pQueryWord->Reset ();
			m_pQueryWord->m_sWord = (const char*)sWord;
			m_pQueryWord->m_sDictWord = (const char*)sWord;
			m_pQueryWord->m_uWordID = iWord;
			m_pTermSetup->QwordSetup ( m_pQueryWord );
		}

		CSphKeywordInfo & tInfo = dKeywords.Add();
		tInfo.m_sTokenized = (const char *)sTokenized;
		tInfo.m_sNormalized = (const char*)sWord;
		tInfo.m_iDocs = m_bGetStats ? m_pQueryWord->m_iDocs : 0;
		tInfo.m_iHits = m_bGetStats ? m_pQueryWord->m_iHits : 0;
		tInfo.m_iQpos = iQpos;

		// replace the magic non-stemmed marker byte with a user-visible '='
		// (in-place single-byte patch; string length stays unchanged)
		if ( tInfo.m_sNormalized.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
			*(char *)tInfo.m_sNormalized.cstr() = '=';
	}
};
18072 
18073 
// shared worker behind GetKeywords() and FillKeywords()
// bFillOnly=false: tokenize szQuery and append one entry per keyword
//                  (with docs/hits stats when bGetStats is set)
// bFillOnly=true:  dKeywords already holds tokenized entries; only accumulate
//                  docs/hits stats into them (note the += below)
template < class Qword >
bool CSphIndex_VLN::DoGetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords,
	const char * szQuery, bool bGetStats, bool bFillOnly, CSphString * pError ) const
{
	if ( !bFillOnly )
		dKeywords.Resize ( 0 );

	// index must be preread before touching the dictionary/wordlist
	if ( !m_pPreread || !*m_pPreread )
	{
		if ( pError )
			*pError = "index not preread";
		return false;
	}

	// short-cut if no query or keywords to fill
	if ( ( bFillOnly && !dKeywords.GetLength() ) || ( !bFillOnly && ( !szQuery || !szQuery[0] ) ) )
		return true;

	// TODO: in case of bFillOnly skip tokenizer cloning and setup
	CSphScopedPtr<ISphTokenizer> pTokenizer ( m_pTokenizer->Clone ( SPH_CLONE_INDEX ) ); // avoid race
	pTokenizer->EnableTokenizedMultiformTracking ();

	// need to support '*' and '=' but not the other specials
	// so m_pQueryTokenizer does not work for us, gotta clone and setup one manually
	if ( IsStarDict() )
		pTokenizer->AddPlainChar ( '*' );
	if ( m_tSettings.m_bIndexExactWords )
		pTokenizer->AddPlainChar ( '=' );

	// clone the dict if it carries mutable state, to avoid races as well
	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
	CSphDict * pDictBase = m_pDict;
	if ( pDictBase->HasState() )
		tDictCloned = pDictBase = pDictBase->Clone();

	// layer the wildcard ('*') and exact-form ('=') wrappers on top as needed
	CSphScopedPtr<CSphDict> tDict ( NULL );
	CSphDict * pDict = SetupStarDict ( tDict, pDictBase );

	CSphScopedPtr<CSphDict> tDict2 ( NULL );
	pDict = SetupExactDict ( tDict2, pDict );

	// FIXME!!! missed bigram, FieldFilter, add flags to fold blended parts, show expanded terms

	// prepare for setup
	CSphAutofile tDummy1, tDummy2;

	DiskIndexQwordSetup_c tTermSetup ( tDummy1, tDummy2, m_pSkiplists.GetWritePtr(), NULL );
	tTermSetup.m_pDict = pDict;
	tTermSetup.m_pIndex = this;
	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;

	Qword tQueryWord ( false, false );

	// the filter collects keyword entries (and stats) as tokens are produced
	CSphPlainQueryFilter tAotFilter;
	tAotFilter.m_pTokenizer = pTokenizer.Ptr();
	tAotFilter.m_pDict = pDict;
	tAotFilter.m_pSettings = &m_tSettings;
	tAotFilter.m_bGetStats = bGetStats;
	tAotFilter.m_pTermSetup = &tTermSetup;
	tAotFilter.m_pQueryWord = &tQueryWord;

	if ( !bFillOnly )
	{
		// fresh tokenization path
		pTokenizer->SetBuffer ( (const BYTE *)szQuery, strlen(szQuery) );

		tAotFilter.GetKeywords ( dKeywords );
	} else
	{
		// stats-only path: re-run each already-tokenized keyword through the dict
		BYTE sWord[MAX_KEYWORD_BYTES];

		ARRAY_FOREACH ( i, dKeywords )
		{
			CSphKeywordInfo & tInfo = dKeywords[i];
			// copy to a mutable buffer; GetWordID may modify the word in place
			int iLen = tInfo.m_sTokenized.Length();
			memcpy ( sWord, tInfo.m_sTokenized.cstr(), iLen );
			sWord[iLen] = '\0';

			SphWordID_t iWord = pDict->GetWordID ( sWord );
			if ( iWord )
			{
				tQueryWord.Reset ();
				tQueryWord.m_sWord = tInfo.m_sTokenized;
				tQueryWord.m_sDictWord = (const char*)sWord;
				tQueryWord.m_uWordID = iWord;
				tTermSetup.QwordSetup ( &tQueryWord );

				// accumulate (not overwrite) the existing counters
				tInfo.m_iDocs += tQueryWord.m_iDocs;
				tInfo.m_iHits += tQueryWord.m_iHits;
			}
		}
	}

	return true;
}
18167 
18168 
// accumulate per-keyword docs/hits stats into an already-tokenized keyword
// list (the bFillOnly=true flavor of DoGetKeywords)
bool CSphIndex_VLN::FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const
{
	// WITH_QWORD dispatches to DoGetKeywords with the Qword flavor matching this index
	WITH_QWORD ( this, false, Qword, return DoGetKeywords<Qword> ( dKeywords, NULL, true, true, NULL ) );
	return false;
}
18174 
18175 
18176 // fix MSVC 2005 fuckup, template DoGetKeywords() just above somehow resets forScope
18177 #if USE_WINDOWS
18178 #pragma conform(forScope,on)
18179 #endif
18180 
18181 
IsWeightColumn(const CSphString & sAttr,const ISphSchema & tSchema)18182 static bool IsWeightColumn ( const CSphString & sAttr, const ISphSchema & tSchema )
18183 {
18184 	if ( sAttr=="@weight" )
18185 		return true;
18186 
18187 	const CSphColumnInfo * pCol = tSchema.GetAttr ( sAttr.cstr() );
18188 	return ( pCol && pCol->m_bWeight );
18189 }
18190 
18191 
// build the attribute filter chain (m_pFilter) and the separate @weight filter
// chain (m_pWeightFilter) from the query settings, plus an optional kill-list
// filter; returns false and sets sError on any invalid filter definition
bool CSphQueryContext::CreateFilters ( bool bFullscan,
	const CSphVector<CSphFilterSettings> * pdFilters, const ISphSchema & tSchema,
	const DWORD * pMvaPool, const BYTE * pStrings, CSphString & sError, ESphCollation eCollation, bool bArenaProhibit,
	const KillListVector & dKillList )
{
	// nothing to filter on at all
	if ( !pdFilters && !dKillList.GetLength() )
		return true;

	if ( pdFilters )
	{
		ARRAY_FOREACH ( i, (*pdFilters) )
		{
			const CSphFilterSettings * pFilterSettings = pdFilters->Begin() + i;
			if ( pFilterSettings->m_sAttrName.IsEmpty() )
				continue;

			bool bWeight = IsWeightColumn ( pFilterSettings->m_sAttrName, tSchema );

			if ( bFullscan && bWeight )
				continue; // @weight is not available in fullscan mode

			// bind user variable local to that daemon
			CSphFilterSettings tUservar;
			if ( pFilterSettings->m_eType==SPH_FILTER_USERVAR )
			{
				// uservar filters must reference exactly one variable name
				const CSphString * sVar = pFilterSettings->m_dStrings.GetLength()==1 ? pFilterSettings->m_dStrings.Begin() : NULL;
				if ( !g_pUservarsHook || !sVar )
				{
					sError = "no global variables found";
					return false;
				}

				const UservarIntSet_c * pUservar = g_pUservarsHook ( *sVar );
				if ( !pUservar )
				{
					sError.SetSprintf ( "undefined global variable '%s'", sVar->cstr() );
					return false;
				}

				// keep a reference for the query lifetime; released in the dtor
				m_dUserVals.Add ( pUservar );
				// rewrite the uservar filter into a plain values filter over its set
				tUservar = *pFilterSettings;
				tUservar.m_eType = SPH_FILTER_VALUES;
				tUservar.SetExternalValues ( pUservar->Begin(), pUservar->GetLength() );
				pFilterSettings = &tUservar;
			}

			ISphFilter * pFilter = sphCreateFilter ( *pFilterSettings, tSchema, pMvaPool, pStrings, sError, eCollation, bArenaProhibit );
			if ( !pFilter )
				return false;

			// weight filters are chained separately from the attribute filters
			ISphFilter ** pGroup = bWeight ? &m_pWeightFilter : &m_pFilter;
			*pGroup = sphJoinFilters ( *pGroup, pFilter );
		}
	}

	// the kill-list filter joins the regular attribute chain
	if ( dKillList.GetLength() )
	{
		ISphFilter * pFilter = sphCreateFilter ( dKillList );
		if ( !pFilter )
			return false;

		m_pFilter = sphJoinFilters ( m_pFilter, pFilter );
	}

	if ( m_pFilter )
		m_pFilter = m_pFilter->Optimize();

	return true;
}
18261 
18262 
// setup per-query attribute overrides: validate each one against both the
// index schema (source) and the outgoing schema (destination), and cache the
// in/out attribute locators; returns false with pResult->m_sError on mismatch
bool CSphQueryContext::SetupOverrides ( const CSphQuery * pQuery, CSphQueryResult * pResult, const CSphSchema & tIndexSchema, const ISphSchema & tOutgoingSchema )
{
	m_pOverrides = NULL;
	m_dOverrideIn.Resize ( pQuery->m_dOverrides.GetLength() );
	m_dOverrideOut.Resize ( pQuery->m_dOverrides.GetLength() );

	ARRAY_FOREACH ( i, pQuery->m_dOverrides )
	{
		const char * sAttr = pQuery->m_dOverrides[i].m_sAttr.cstr(); // shortcut
		const CSphColumnInfo * pCol = tIndexSchema.GetAttr ( sAttr );
		if ( !pCol )
		{
			pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s'", sAttr );
			return false;
		}

		// override type must match the stored attribute type exactly
		if ( pCol->m_eAttrType!=pQuery->m_dOverrides[i].m_eAttrType )
		{
			pResult->m_sError.SetSprintf ( "attribute override: attribute '%s' type mismatch (index=%d, query=%d)",
				sAttr, pCol->m_eAttrType, pQuery->m_dOverrides[i].m_eAttrType );
			return false;
		}

		const CSphColumnInfo * pOutCol = tOutgoingSchema.GetAttr ( pQuery->m_dOverrides[i].m_sAttr.cstr() );
		if ( !pOutCol )
		{
			pResult->m_sError.SetSprintf ( "attribute override: unknown attribute name '%s' in outgoing schema", sAttr );
			return false;
		}

		// remember where to read from (index schema) and write to (outgoing schema)
		m_dOverrideIn[i] = pCol->m_tLocator;
		m_dOverrideOut[i] = pOutCol->m_tLocator;

#ifndef NDEBUG
		// check that the values are actually sorted
		const CSphVector<CSphAttrOverride::IdValuePair_t> & dValues = pQuery->m_dOverrides[i].m_dValues;
		for ( int j=1; j<dValues.GetLength(); j++ )
			assert ( dValues[j-1] < dValues[j] );
#endif
	}

	// only expose the overrides pointer when there is at least one of them
	if ( pQuery->m_dOverrides.GetLength() )
		m_pOverrides = &pQuery->m_dOverrides;
	return true;
}
18308 
// estimate the "height" of a query tree, used by sphCheckQueryHeight() to
// bound the stack space needed to evaluate it; leaves count by their word
// count, inner nodes combine the deepest child with its pending siblings
static int sphQueryHeightCalc ( const XQNode_t * pNode )
{
	if ( !pNode->m_dChildren.GetLength() )
	{
		// exception, pre-cached OR of tiny (rare) keywords is just one node
		if ( pNode->GetOp()==SPH_QUERY_OR )
		{
#ifndef NDEBUG
			// sanity checks
			// this node must be only created for a huge OR of tiny expansions
			assert ( pNode->m_dWords.GetLength() );
			ARRAY_FOREACH ( i, pNode->m_dWords )
			{
				assert ( pNode->m_dWords[i].m_iAtomPos==pNode->m_dWords[0].m_iAtomPos );
				assert ( pNode->m_dWords[i].m_bExpanded );
			}
#endif
			return 1;
		}
		return pNode->m_dWords.GetLength();
	}

	// BEFORE is special-cased to a constant height regardless of child count
	if ( pNode->GetOp()==SPH_QUERY_BEFORE )
		return 1;

	// pick the child maximizing (subtree height iBottom) plus (the number of
	// later siblings iTop still pending at this level while it is evaluated)
	int iMaxChild = 0;
	int iHeight = 0;
	ARRAY_FOREACH ( i, pNode->m_dChildren )
	{
		int iBottom = sphQueryHeightCalc ( pNode->m_dChildren[i] );
		int iTop = pNode->m_dChildren.GetLength()-i-1;
		if ( iBottom+iTop>=iMaxChild+iHeight )
		{
			iMaxChild = iBottom;
			iHeight = iTop;
		}
	}

	return iMaxChild+iHeight;
}
18349 
18350 #define SPH_EXTNODE_STACK_SIZE 160
18351 
sphCheckQueryHeight(const XQNode_t * pRoot,CSphString & sError)18352 bool sphCheckQueryHeight ( const XQNode_t * pRoot, CSphString & sError )
18353 {
18354 	int iHeight = 0;
18355 	if ( pRoot )
18356 		iHeight = sphQueryHeightCalc ( pRoot );
18357 
18358 	int64_t iQueryStack = sphGetStackUsed() + iHeight*SPH_EXTNODE_STACK_SIZE;
18359 	bool bValid = ( g_iThreadStackSize>=iQueryStack );
18360 	if ( !bValid )
18361 		sError.SetSprintf ( "query too complex, not enough stack (thread_stack=%dK or higher required)",
18362 			(int)( ( iQueryStack + 1024 - ( iQueryStack%1024 ) ) / 1024 ) );
18363 	return bValid;
18364 }
18365 
/// make a shallow keyword-node copy (spec and keyword list only;
/// children, parent links, and op are left at their defaults)
static XQNode_t * CloneKeyword ( const XQNode_t * pNode )
{
	assert ( pNode );
	XQNode_t * pClone = new XQNode_t ( pNode->m_dSpec );
	pClone->m_dWords = pNode->m_dWords;
	return pClone;
}
18374 
18375 
ExpandKeyword(XQNode_t * pNode,const CSphIndexSettings & tSettings)18376 static XQNode_t * ExpandKeyword ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
18377 {
18378 	assert ( pNode );
18379 
18380 	XQNode_t * pExpand = new XQNode_t ( pNode->m_dSpec );
18381 	pExpand->SetOp ( SPH_QUERY_OR, pNode );
18382 
18383 	if ( tSettings.m_iMinInfixLen>0 )
18384 	{
18385 		assert ( pNode->m_dChildren.GetLength()==0 );
18386 		assert ( pNode->m_dWords.GetLength()==1 );
18387 		XQNode_t * pInfix = CloneKeyword ( pNode );
18388 		pInfix->m_dWords[0].m_sWord.SetSprintf ( "*%s*", pNode->m_dWords[0].m_sWord.cstr() );
18389 		pInfix->m_pParent = pExpand;
18390 		pExpand->m_dChildren.Add ( pInfix );
18391 	} else if ( tSettings.m_iMinPrefixLen>0 )
18392 	{
18393 		assert ( pNode->m_dChildren.GetLength()==0 );
18394 		assert ( pNode->m_dWords.GetLength()==1 );
18395 		XQNode_t * pPrefix = CloneKeyword ( pNode );
18396 		pPrefix->m_dWords[0].m_sWord.SetSprintf ( "%s*", pNode->m_dWords[0].m_sWord.cstr() );
18397 		pPrefix->m_pParent = pExpand;
18398 		pExpand->m_dChildren.Add ( pPrefix );
18399 	}
18400 
18401 	if ( tSettings.m_bIndexExactWords )
18402 	{
18403 		assert ( pNode->m_dChildren.GetLength()==0 );
18404 		assert ( pNode->m_dWords.GetLength()==1 );
18405 		XQNode_t * pExact = CloneKeyword ( pNode );
18406 		pExact->m_dWords[0].m_sWord.SetSprintf ( "=%s", pNode->m_dWords[0].m_sWord.cstr() );
18407 		pExact->m_pParent = pExpand;
18408 		pExpand->m_dChildren.Add ( pExact );
18409 	}
18410 
18411 	return pExpand;
18412 }
18413 
sphQueryExpandKeywords(XQNode_t * pNode,const CSphIndexSettings & tSettings)18414 XQNode_t * sphQueryExpandKeywords ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
18415 {
18416 	// only if expansion makes sense at all
18417 	if ( tSettings.m_iMinInfixLen<=0 && tSettings.m_iMinPrefixLen<=0 && !tSettings.m_bIndexExactWords )
18418 		return pNode;
18419 
18420 	// process children for composite nodes
18421 	if ( pNode->m_dChildren.GetLength() )
18422 	{
18423 		ARRAY_FOREACH ( i, pNode->m_dChildren )
18424 		{
18425 			pNode->m_dChildren[i] = sphQueryExpandKeywords ( pNode->m_dChildren[i], tSettings );
18426 			pNode->m_dChildren[i]->m_pParent = pNode;
18427 		}
18428 		return pNode;
18429 	}
18430 
18431 	// if that's a phrase/proximity node, create a very special, magic phrase/proximity node
18432 	if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
18433 	{
18434 		assert ( pNode->m_dWords.GetLength()>1 );
18435 		ARRAY_FOREACH ( i, pNode->m_dWords )
18436 		{
18437 			XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
18438 			pWord->m_dWords.Add ( pNode->m_dWords[i] );
18439 			pNode->m_dChildren.Add ( ExpandKeyword ( pWord, tSettings ) );
18440 			pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
18441 			pNode->m_dChildren.Last()->m_pParent = pNode;
18442 		}
18443 		pNode->m_dWords.Reset();
18444 		pNode->m_bVirtuallyPlain = true;
18445 		return pNode;
18446 	}
18447 
18448 	// skip empty plain nodes
18449 	if ( pNode->m_dWords.GetLength()<=0 )
18450 		return pNode;
18451 
18452 	// process keywords for plain nodes
18453 	assert ( pNode->m_dWords.GetLength()==1 );
18454 
18455 	XQKeyword_t & tKeyword = pNode->m_dWords[0];
18456 	if ( tKeyword.m_sWord.Begins("=")
18457 		|| tKeyword.m_sWord.Begins("*")
18458 		|| tKeyword.m_sWord.Ends("*") )
18459 	{
18460 		return pNode;
18461 	}
18462 
18463 	// do the expansion
18464 	return ExpandKeyword ( pNode, tSettings );
18465 }
18466 
18467 
18468 // transform the "one two three"/1 quorum into one|two|three (~40% faster)
TransformQuorum(XQNode_t ** ppNode)18469 static void TransformQuorum ( XQNode_t ** ppNode )
18470 {
18471 	XQNode_t *& pNode = *ppNode;
18472 
18473 	// recurse non-quorum nodes
18474 	if ( pNode->GetOp()!=SPH_QUERY_QUORUM )
18475 	{
18476 		ARRAY_FOREACH ( i, pNode->m_dChildren )
18477 			TransformQuorum ( &pNode->m_dChildren[i] );
18478 		return;
18479 	}
18480 
18481 	// skip quorums with thresholds other than 1
18482 	if ( pNode->m_iOpArg!=1 )
18483 		return;
18484 
18485 	// transform quorums with a threshold of 1 only
18486 	assert ( pNode->GetOp()==SPH_QUERY_QUORUM && pNode->m_dChildren.GetLength()==0 );
18487 	CSphVector<XQNode_t*> dArgs;
18488 	ARRAY_FOREACH ( i, pNode->m_dWords )
18489 	{
18490 		XQNode_t * pAnd = new XQNode_t ( pNode->m_dSpec );
18491 		pAnd->m_dWords.Add ( pNode->m_dWords[i] );
18492 		dArgs.Add ( pAnd );
18493 	}
18494 	pNode->m_dWords.Reset();
18495 	pNode->SetOp ( SPH_QUERY_OR, dArgs );
18496 }
18497 
18498 
/// an inclusive [lo,hi] index range into the expansion list; used as a
/// work-stack entry while building a balanced tree in BuildExpandedTree()
struct BinaryNode_t
{
	int m_iLo;	// first expansion index of this range
	int m_iHi;	// last expansion index of this range (may be < m_iLo for an empty range)
};
18504 
/// rebuild pRoot as a balanced OR tree over all the expansions in dWordSrc;
/// each expansion inherits tRootWord's properties with just the word replaced
static void BuildExpandedTree ( const XQKeyword_t & tRootWord, ISphWordlist::Args_t & dWordSrc, XQNode_t * pRoot )
{
	assert ( dWordSrc.m_dExpanded.GetLength() );
	pRoot->m_dWords.Reset();

	// build a binary tree from all the other expansions
	CSphVector<BinaryNode_t> dNodes;
	dNodes.Reserve ( dWordSrc.m_dExpanded.GetLength() );

	XQNode_t * pCur = pRoot;

	// seed the work stack with the full [0, N-1] range
	dNodes.Add();
	dNodes.Last().m_iLo = 0;
	dNodes.Last().m_iHi = ( dWordSrc.m_dExpanded.GetLength()-1 );

	// iterative midpoint-split walk; every popped range emits its midpoint
	// expansion into the current node and pushes both halves for later
	while ( dNodes.GetLength() )
	{
		BinaryNode_t tNode = dNodes.Pop();
		if ( tNode.m_iHi<tNode.m_iLo )
		{
			// empty range means this subtree is finished; climb back up
			pCur = pCur->m_pParent;
			continue;
		}

		int iMid = ( tNode.m_iLo+tNode.m_iHi ) / 2;
		dNodes.Add ();
		dNodes.Last().m_iLo = tNode.m_iLo;
		dNodes.Last().m_iHi = iMid-1;
		dNodes.Add ();
		dNodes.Last().m_iLo = iMid+1;
		dNodes.Last().m_iHi = tNode.m_iHi;

		// if the current node already carries a keyword, push that keyword
		// down into a dedicated child so this node can become an inner OR
		if ( pCur->m_dWords.GetLength() )
		{
			assert ( pCur->m_dWords.GetLength()==1 );
			XQNode_t * pTerm = CloneKeyword ( pRoot );
			Swap ( pTerm->m_dWords, pCur->m_dWords );
			pCur->m_dChildren.Add ( pTerm );
			pTerm->m_pParent = pCur;
		}

		// attach the midpoint expansion as a new child and descend into it
		XQNode_t * pChild = CloneKeyword ( pRoot );
		pChild->m_dWords.Add ( tRootWord );
		pChild->m_dWords.Last().m_sWord = dWordSrc.GetWordExpanded ( iMid );
		pChild->m_dWords.Last().m_bExpanded = true;
		pChild->m_bNotWeighted = pRoot->m_bNotWeighted;

		pChild->m_pParent = pCur;
		pCur->m_dChildren.Add ( pChild );
		pCur->SetOp ( SPH_QUERY_OR );

		pCur = pChild;
	}
}
18559 
18560 
18561 /// do wildcard expansion for keywords dictionary
18562 /// (including prefix and infix expansion)
XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx )
{
	assert ( pNode );
	assert ( tCtx.m_pResult );

	// process children for composite nodes
	if ( pNode->m_dChildren.GetLength() )
	{
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			pNode->m_dChildren[i] = sphExpandXQNode ( pNode->m_dChildren[i], tCtx );
			pNode->m_dChildren[i]->m_pParent = pNode;
		}
		return pNode;
	}

	// if that's a phrase/proximity node, create a very special, magic phrase/proximity node
	// with one expanded subtree per keyword, and mark it "virtually plain"
	if ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM )
	{
		assert ( pNode->m_dWords.GetLength()>1 );
		ARRAY_FOREACH ( i, pNode->m_dWords )
		{
			XQNode_t * pWord = new XQNode_t ( pNode->m_dSpec );
			pWord->m_dWords.Add ( pNode->m_dWords[i] );
			pNode->m_dChildren.Add ( sphExpandXQNode ( pWord, tCtx ) );
			pNode->m_dChildren.Last()->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
			pNode->m_dChildren.Last()->m_pParent = pNode;

			// tricky part
			// current node may have field/zone limits attached
			// normally those get pushed down during query parsing
			// but here we create nodes manually and have to push down limits too
			pWord->CopySpecs ( pNode );
		}
		pNode->m_dWords.Reset();
		pNode->m_bVirtuallyPlain = true;
		return pNode;
	}

	// skip empty plain nodes
	if ( pNode->m_dWords.GetLength()<=0 )
		return pNode;

	// process keywords for plain nodes
	assert ( pNode->m_dChildren.GetLength()==0 );
	assert ( pNode->m_dWords.GetLength()==1 );

	// check the wildcards
	const char * sFull = pNode->m_dWords[0].m_sWord.cstr();
	const int iLen = strlen ( sFull );

	// count wildcard characters in the keyword
	int iWilds = 0;
	for ( const char * s = sFull; *s; s++ )
		if ( sphIsWild(*s) )
			iWilds++;

	// no wildcards, or just wildcards? do not expand
	if ( !iWilds || iWilds==iLen )
		return pNode;

	// term merging is only attempted when this node has no zone limits
	bool bUseTermMerge = ( tCtx.m_bMergeSingles && pNode->m_dSpec.m_dZones.GetLength()==0 );
	ISphWordlist::Args_t tWordlist ( bUseTermMerge, tCtx.m_iExpansionLimit, tCtx.m_bHasMorphology, tCtx.m_eHitless, tCtx.m_pIndexData );

	// prefix path: keyword does not start with a wildcard, or infixes are off
	if ( !sphIsWild(*sFull) || tCtx.m_iMinInfixLen==0 )
	{
		// do prefix expansion
		// remove exact form modifier, if any
		const char * sPrefix = sFull;
		if ( *sPrefix=='=' )
			sPrefix++;

		// skip leading wildcards
		// (in case we got here on non-infixed index path)
		const char * sWildcard = sPrefix;
		while ( sphIsWild ( *sPrefix ) )
		{
			sPrefix++;
			sWildcard++;
		}

		// compute non-wildcard prefix length
		int iPrefix = 0;
		for ( const char * s = sPrefix; *s && !sphIsWild(*s); s++ )
			iPrefix++;

		// do not expand prefixes under min length
		int iMinLen = Max ( tCtx.m_iMinPrefixLen, tCtx.m_iMinInfixLen );
		if ( iPrefix<iMinLen )
		{
			tCtx.m_pResult->m_sWarning.SetSprintf ( "Query word length is less than min prefix length. word: '%s' ", sFull );
			return pNode;
		}

		// prefix expansion should work on nonstemmed words only
		char sFixed [ MAX_KEYWORD_BYTES ];
		if ( tCtx.m_bHasMorphology )
		{
			// prepend the nonstemmed-form marker byte to the prefix
			sFixed[0] = MAGIC_WORD_HEAD_NONSTEMMED;
			memcpy ( sFixed+1, sPrefix, iPrefix );
			sPrefix = sFixed;
			iPrefix++;
		}

		tCtx.m_pWordlist->GetPrefixedWords ( sPrefix, iPrefix, sWildcard, tWordlist );

	} else
	{
		// do infix expansion
		assert ( sphIsWild(*sFull) );
		assert ( tCtx.m_iMinInfixLen>0 );

		// find the longest substring of non-wildcards
		const char * sMaxInfix = NULL;
		int iMaxInfix = 0;
		int iCur = 0;

		for ( const char * s = sFull; *s; s++ )
		{
			if ( sphIsWild(*s) )
			{
				iCur = 0;
			} else if ( ++iCur > iMaxInfix )
			{
				sMaxInfix = s-iCur+1;
				iMaxInfix = iCur;
			}
		}

		// do not expand infixes under min_infix_len
		if ( iMaxInfix < tCtx.m_iMinInfixLen )
		{
			tCtx.m_pResult->m_sWarning.SetSprintf ( "Query word length is less than min infix length. word: '%s' ", sFull );
			return pNode;
		}

		// ignore heading star
		tCtx.m_pWordlist->GetInfixedWords ( sMaxInfix, iMaxInfix, sFull, tWordlist );
	}

	// no real expansions?
	// mark source word as expanded to prevent warning on terms mismatch in statistics
	if ( !tWordlist.m_dExpanded.GetLength() && !tWordlist.m_pPayload )
	{
		tCtx.m_pResult->AddStat ( pNode->m_dWords.Begin()->m_sWord, 0, 0 );
		pNode->m_dWords.Begin()->m_bExpanded = true;
		return pNode;
	}

	// copy the original word (iirc it might get overwritten),
	const XQKeyword_t tRootWord = pNode->m_dWords[0];
	tCtx.m_pResult->AddStat ( tRootWord.m_sWord, tWordlist.m_iTotalDocs, tWordlist.m_iTotalHits );

	// and build a binary tree of all the expansions
	if ( tWordlist.m_dExpanded.GetLength() )
	{
		BuildExpandedTree ( tRootWord, tWordlist, pNode );
	}

	// some (or all) expansions may have been merged into a payload instead
	if ( tWordlist.m_pPayload )
	{
		// hand payload ownership over to the scoped holder
		ISphSubstringPayload * pPayload = tWordlist.m_pPayload;
		tWordlist.m_pPayload = NULL;
		tCtx.m_pPayloads->Add ( pPayload );

		if ( pNode->m_dWords.GetLength() )
		{
			// all expanded fit to single payload
			pNode->m_dWords.Begin()->m_bExpanded = true;
			pNode->m_dWords.Begin()->m_pPayload = pPayload;
		} else
		{
			// payload added to expanded binary tree
			assert ( pNode->GetOp()==SPH_QUERY_OR );
			assert ( pNode->m_dChildren.GetLength() );

			XQNode_t * pSubstringNode = new XQNode_t ( pNode->m_dSpec );
			pSubstringNode->SetOp ( SPH_QUERY_OR );

			XQKeyword_t tSubstringWord = tRootWord;
			tSubstringWord.m_bExpanded = true;
			tSubstringWord.m_pPayload = pPayload;
			pSubstringNode->m_dWords.Add ( tSubstringWord );

			pNode->m_dChildren.Add ( pSubstringNode );
			pSubstringNode->m_pParent = pNode;
		}
	}

	return pNode;
}
18753 
18754 
/// zero-initialize the expansion context; callers are expected to fill in
/// the wordlist, result, and limit members before passing it to sphExpandXQNode()
ExpansionContext_t::ExpansionContext_t()
	: m_pWordlist ( NULL )
	, m_pBuf ( NULL )
	, m_pResult ( NULL )
	, m_iMinPrefixLen ( 0 )
	, m_iMinInfixLen ( 0 )
	, m_iExpansionLimit ( 0 )
	, m_bHasMorphology ( false )
	, m_bMergeSingles ( false )
	, m_pPayloads ( NULL )
	, m_eHitless ( SPH_HITLESS_NONE )
	, m_pIndexData ( NULL )
{}
18768 
18769 
/// run wildcard (prefix/infix) expansion over the parsed query tree,
/// using this index's wordlist and settings; returns the new root
XQNode_t * CSphIndex_VLN::ExpandPrefix ( XQNode_t * pNode, CSphQueryResultMeta * pResult, CSphScopedPayload * pPayloads ) const
{
	// expansion needs a keywords dictionary and prefix and/or infix indexing enabled
	if ( !pNode || !m_pDict->GetSettings().m_bWordDict || ( m_tSettings.m_iMinPrefixLen<=0 && m_tSettings.m_iMinInfixLen<=0 ) )
		return pNode;

	// the wordlist must be preread before we can walk it
	assert ( m_pPreread && *m_pPreread );
	assert ( !m_tWordlist.m_pBuf.IsEmpty() );

	// package up everything sphExpandXQNode() needs from this index
	ExpansionContext_t tCtx;
	tCtx.m_pWordlist = &m_tWordlist;
	tCtx.m_pResult = pResult;
	tCtx.m_iMinPrefixLen = m_tSettings.m_iMinPrefixLen;
	tCtx.m_iMinInfixLen = m_tSettings.m_iMinInfixLen;
	tCtx.m_iExpansionLimit = m_iExpansionLimit;
	tCtx.m_bHasMorphology = m_pDict->HasMorphology();
	tCtx.m_bMergeSingles = m_tSettings.m_eDocinfo!=SPH_DOCINFO_INLINE;
	tCtx.m_pPayloads = pPayloads;
	tCtx.m_eHitless = m_tSettings.m_eHitless;

	pNode = sphExpandXQNode ( pNode, tCtx );
	pNode->Check ( true );

	return pNode;
}
18794 
18795 
18796 // transform the (A B) NEAR C into A NEAR B NEAR C
static void TransformNear ( XQNode_t ** ppNode )
{
	XQNode_t *& pNode = *ppNode;
	if ( pNode->GetOp()==SPH_QUERY_NEAR )
	{
		assert ( pNode->m_dWords.GetLength()==0 );
		CSphVector<XQNode_t*> dArgs;
		int iStartFrom;

		// transform all (A B C) NEAR D into A NEAR B NEAR C NEAR D
		// keep sweeping until a pass finds no more AND children to flatten
		do
		{
			dArgs.Reset();
			iStartFrom = 0;
			ARRAY_FOREACH ( i, pNode->m_dChildren )
			{
				XQNode_t * pChild = pNode->m_dChildren[i]; ///< shortcut
				if ( pChild->GetOp()==SPH_QUERY_AND && pChild->m_dChildren.GetLength()>0 )
				{
					// hoist the AND's children up into the NEAR node
					ARRAY_FOREACH ( j, pChild->m_dChildren )
					{
						if ( j==0 && iStartFrom==0 )
						{
							// we will remove the node anyway, so just replace it with 1-st child instead
							pNode->m_dChildren[i] = pChild->m_dChildren[j];
							pNode->m_dChildren[i]->m_pParent = pNode;
							iStartFrom = i+1;
						} else
						{
							// every other hoisted grandchild gets re-appended later
							dArgs.Add ( pChild->m_dChildren[j] );
						}
					}
					pChild->m_dChildren.Reset();
					SafeDelete ( pChild );
				} else if ( iStartFrom!=0 )
				{
					// a regular child past the first flattened AND;
					// collected so it can be re-appended after the hoisted nodes
					dArgs.Add ( pChild );
				}
			}

			// rebuild the child list: indexes [0, iStartFrom) stayed in place,
			// everything collected into dArgs goes after them
			if ( iStartFrom!=0 )
			{
				pNode->m_dChildren.Resize ( iStartFrom + dArgs.GetLength() );
				ARRAY_FOREACH ( i, dArgs )
				{
					pNode->m_dChildren [ i + iStartFrom ] = dArgs[i];
					pNode->m_dChildren [ i + iStartFrom ]->m_pParent = pNode;
				}
			}
		} while ( iStartFrom!=0 );
	}

	// recurse into the (possibly rebuilt) children
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		TransformNear ( &pNode->m_dChildren[i] );
}
18852 
18853 
18854 /// tag excluded keywords (rvals to operator NOT)
TagExcluded(XQNode_t * pNode,bool bNot)18855 static void TagExcluded ( XQNode_t * pNode, bool bNot )
18856 {
18857 	if ( pNode->GetOp()==SPH_QUERY_ANDNOT )
18858 	{
18859 		assert ( pNode->m_dChildren.GetLength()==2 );
18860 		assert ( pNode->m_dWords.GetLength()==0 );
18861 		TagExcluded ( pNode->m_dChildren[0], bNot );
18862 		TagExcluded ( pNode->m_dChildren[1], !bNot );
18863 
18864 	} else if ( pNode->m_dChildren.GetLength() )
18865 	{
18866 		// FIXME? check if this works okay with "virtually plain" stuff?
18867 		ARRAY_FOREACH ( i, pNode->m_dChildren )
18868 			TagExcluded ( pNode->m_dChildren[i], bNot );
18869 	} else
18870 	{
18871 		// tricky bit
18872 		// no assert on length here and that is intended
18873 		// we have fully empty nodes (0 children, 0 words) sometimes!
18874 		ARRAY_FOREACH ( i, pNode->m_dWords )
18875 			pNode->m_dWords[i].m_bExcluded = bNot;
18876 	}
18877 }
18878 
18879 
18880 /// optimize phrase queries if we have bigrams
static void TransformBigrams ( XQNode_t * pNode, const CSphIndexSettings & tSettings )
{
	assert ( tSettings.m_eBigramIndex!=SPH_BIGRAM_NONE );
	assert ( tSettings.m_eBigramIndex==SPH_BIGRAM_ALL || tSettings.m_dBigramWords.GetLength() );

	// only phrase operators can use bigrams; recurse into everything else
	if ( pNode->GetOp()!=SPH_QUERY_PHRASE )
	{
		ARRAY_FOREACH ( i, pNode->m_dChildren )
			TransformBigrams ( pNode->m_dChildren[i], tSettings );
		return;
	}

	// bit i set means m_dWords[i] got merged into its predecessor and must go
	CSphBitvec bmRemove;
	bmRemove.Init ( pNode->m_dWords.GetLength() );

	for ( int i=0; i<pNode->m_dWords.GetLength()-1; i++ )
	{
		// check whether this pair was indexed
		bool bBigram = false;
		switch ( tSettings.m_eBigramIndex )
		{
			case SPH_BIGRAM_NONE:
				break;
			case SPH_BIGRAM_ALL:
				bBigram = true;
				break;
			case SPH_BIGRAM_FIRSTFREQ:
				// only the first word needs to be a frequent one
				bBigram = tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i].m_sWord )!=NULL;
				break;
			case SPH_BIGRAM_BOTHFREQ:
				// both words need to be frequent ones
				bBigram =
					( tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i].m_sWord )!=NULL ) &&
					( tSettings.m_dBigramWords.BinarySearch ( pNode->m_dWords[i+1].m_sWord )!=NULL );
				break;
		}
		if ( !bBigram )
			continue;

		// replace the pair with a bigram keyword
		// FIXME!!! set phrase weight for this "word" here
		pNode->m_dWords[i].m_sWord.SetSprintf ( "%s%c%s",
			pNode->m_dWords[i].m_sWord.cstr(),
			MAGIC_WORD_BIGRAM,
			pNode->m_dWords[i+1].m_sWord.cstr() );

		// only mark for removal now, we will sweep later
		// so that [a b c] would convert to ["a b" "b c"], not just ["a b" c]
		bmRemove.BitClear ( i );
		bmRemove.BitSet ( i+1 );
	}

	// remove marked words
	int iOut = 0;
	ARRAY_FOREACH ( i, pNode->m_dWords )
		if ( !bmRemove.BitGet(i) )
			pNode->m_dWords[iOut++] = pNode->m_dWords[i];
	pNode->m_dWords.Resize ( iOut );

	// fixup nodes that are not real phrases any more
	if ( pNode->m_dWords.GetLength()==1 )
		pNode->SetOp ( SPH_QUERY_AND );
}
18943 
18944 
18945 /// create a node from a set of lemmas
18946 /// WARNING, tKeyword might or might not be pointing to pNode->m_dWords[0]
18947 /// Called from the daemon side (searchd) in time of query
TransformAotFilterKeyword(XQNode_t * pNode,const XQKeyword_t & tKeyword,const CSphWordforms * pWordforms,const CSphIndexSettings & tSettings)18948 static void TransformAotFilterKeyword ( XQNode_t * pNode, const XQKeyword_t & tKeyword, const CSphWordforms * pWordforms, const CSphIndexSettings & tSettings )
18949 {
18950 	assert ( pNode->m_dWords.GetLength()<=1 );
18951 	assert ( pNode->m_dChildren.GetLength()==0 );
18952 
18953 	XQNode_t * pExact = NULL;
18954 	if ( pWordforms )
18955 	{
18956 		// do a copy, because patching in place is not an option
18957 		// short => longlonglong wordform mapping would crash
18958 		// OPTIMIZE? forms that are not found will (?) get looked up again in the dict
18959 		char sBuf [ MAX_KEYWORD_BYTES ];
18960 		strncpy ( sBuf, tKeyword.m_sWord.cstr(), sizeof(sBuf) );
18961 		if ( pWordforms->ToNormalForm ( (BYTE*)sBuf, true ) )
18962 		{
18963 			if ( !pNode->m_dWords.GetLength() )
18964 				pNode->m_dWords.Add ( tKeyword );
18965 			pNode->m_dWords[0].m_sWord = sBuf;
18966 			pNode->m_dWords[0].m_bMorphed = true;
18967 			return;
18968 		}
18969 	}
18970 
18971 	CSphVector<CSphString> dLemmas;
18972 	DWORD uLangMask = tSettings.m_uAotFilterMask;
18973 	for ( int i=AOT_BEGIN; i<AOT_LENGTH; ++i )
18974 	{
18975 		if ( uLangMask & (1UL<<i) )
18976 		{
18977 			if ( i==AOT_RU )
18978 				sphAotLemmatizeRu ( dLemmas, (BYTE*)tKeyword.m_sWord.cstr() );
18979 			else if ( i==AOT_DE )
18980 				sphAotLemmatizeDe ( dLemmas, (BYTE*)tKeyword.m_sWord.cstr() );
18981 			else
18982 				sphAotLemmatize ( dLemmas, (BYTE*)tKeyword.m_sWord.cstr(), i );
18983 		}
18984 	}
18985 
18986 	// post-morph wordforms
18987 	if ( pWordforms && pWordforms->m_bHavePostMorphNF )
18988 	{
18989 		char sBuf [ MAX_KEYWORD_BYTES ];
18990 		ARRAY_FOREACH ( i, dLemmas )
18991 		{
18992 			strncpy ( sBuf, dLemmas[i].cstr(), sizeof(sBuf) );
18993 			if ( pWordforms->ToNormalForm ( (BYTE*)sBuf, false ) )
18994 				dLemmas[i] = sBuf;
18995 		}
18996 	}
18997 
18998 	if ( dLemmas.GetLength() && tSettings.m_bIndexExactWords )
18999 	{
19000 		pExact = CloneKeyword ( pNode );
19001 		if ( !pExact->m_dWords.GetLength() )
19002 			pExact->m_dWords.Add ( tKeyword );
19003 
19004 		pExact->m_dWords[0].m_sWord.SetSprintf ( "=%s", tKeyword.m_sWord.cstr() );
19005 		pExact->m_pParent = pNode;
19006 	}
19007 
19008 	if ( !pExact && dLemmas.GetLength()<=1 )
19009 	{
19010 		// zero or one lemmas, update node in-place
19011 		if ( !pNode->m_dWords.GetLength() )
19012 			pNode->m_dWords.Add ( tKeyword );
19013 		if ( dLemmas.GetLength() )
19014 		{
19015 			pNode->m_dWords[0].m_sWord = dLemmas[0];
19016 			pNode->m_dWords[0].m_bMorphed = true;
19017 		}
19018 	} else
19019 	{
19020 		// multiple lemmas, create an OR node
19021 		pNode->SetOp ( SPH_QUERY_OR );
19022 		ARRAY_FOREACH ( i, dLemmas )
19023 		{
19024 			pNode->m_dChildren.Add ( new XQNode_t ( pNode->m_dSpec ) );
19025 			pNode->m_dChildren.Last()->m_pParent = pNode;
19026 			XQKeyword_t & tLemma = pNode->m_dChildren.Last()->m_dWords.Add();
19027 			tLemma.m_sWord = dLemmas[i];
19028 			tLemma.m_iAtomPos = tKeyword.m_iAtomPos;
19029 			tLemma.m_bFieldStart = tKeyword.m_bFieldStart;
19030 			tLemma.m_bFieldEnd = tKeyword.m_bFieldEnd;
19031 			tLemma.m_bMorphed = true;
19032 		}
19033 		pNode->m_dWords.Reset();
19034 		if ( pExact )
19035 			pNode->m_dChildren.Add ( pExact );
19036 	}
19037 }
19038 
19039 
19040 /// AOT morph guesses transform
19041 /// replaces tokens with their respective morph guesses subtrees
19042 /// used in lemmatize_ru_all morphology processing mode that can generate multiple guesses
19043 /// in other modes, there is always exactly one morph guess, and the dictionary handles it
19044 /// Called from the daemon side (searchd)
TransformAotFilter(XQNode_t * pNode,const CSphWordforms * pWordforms,const CSphIndexSettings & tSettings)19045 void TransformAotFilter ( XQNode_t * pNode, const CSphWordforms * pWordforms, const CSphIndexSettings & tSettings )
19046 {
19047 	// case one, regular operator (and empty nodes)
19048 	ARRAY_FOREACH ( i, pNode->m_dChildren )
19049 		TransformAotFilter ( pNode->m_dChildren[i], pWordforms, tSettings );
19050 	if ( pNode->m_dChildren.GetLength() || pNode->m_dWords.GetLength()==0 )
19051 		return;
19052 
19053 	// case two, operator on a bag of words
19054 	// FIXME? check phrase vs expand_keywords vs lemmatize_ru_all?
19055 	if ( pNode->m_dWords.GetLength()
19056 		&& ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM ) )
19057 	{
19058 		assert ( pNode->m_dWords.GetLength() );
19059 
19060 		ARRAY_FOREACH ( i, pNode->m_dWords )
19061 		{
19062 			XQNode_t * pNew = new XQNode_t ( pNode->m_dSpec );
19063 			pNew->m_pParent = pNode;
19064 			pNew->m_iAtomPos = pNode->m_dWords[i].m_iAtomPos;
19065 			pNode->m_dChildren.Add ( pNew );
19066 			TransformAotFilterKeyword ( pNew, pNode->m_dWords[i], pWordforms, tSettings );
19067 		}
19068 
19069 		pNode->m_dWords.Reset();
19070 		pNode->m_bVirtuallyPlain = true;
19071 		return;
19072 	}
19073 
19074 	// case three, plain old single keyword
19075 	assert ( pNode->m_dWords.GetLength()==1 );
19076 	TransformAotFilterKeyword ( pNode, pNode->m_dWords[0], pWordforms, tSettings );
19077 }
19078 
19079 
/// apply the standard query tree normalizations in order: quorum rewrite,
/// NEAR flattening, bigram merging, NOT tagging, optional boolean simplification
void sphTransformExtendedQuery ( XQNode_t ** ppNode, const CSphIndexSettings & tSettings, bool bHasBooleanOptimization, const ISphKeywordsStat * pKeywords )
{
	// quorum with threshold 1 becomes a plain OR
	TransformQuorum ( ppNode );
	( *ppNode )->Check ( true );
	// flatten (A B) NEAR C into A NEAR B NEAR C
	TransformNear ( ppNode );
	( *ppNode )->Check ( true );
	// merge adjacent phrase keywords when the index stores bigrams
	if ( tSettings.m_eBigramIndex!=SPH_BIGRAM_NONE )
		TransformBigrams ( *ppNode, tSettings );
	// mark keywords that end up on the rhs of operator NOT
	TagExcluded ( *ppNode, false );
	( *ppNode )->Check ( true );

	// boolean optimization
	if ( bHasBooleanOptimization )
		sphOptimizeBoolean ( ppNode, pKeywords );
}
19095 
19096 
/// sorter ordering functor; false sorts before true, so sorting with this
/// puts non-randomizing sorters first and randomizing ones last
struct CmpPSortersByRandom_fn
{
	inline bool IsLess ( const ISphMatchSorter * a, const ISphMatchSorter * b ) const
	{
		assert ( a );
		assert ( b );
		return a->m_bRandomize < b->m_bRandomize;
	}
};
19106 
19107 
19108 /// one regular query vs many sorters
bool CSphIndex_VLN::MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult,
	int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const
{
	assert ( pQuery );
	CSphQueryProfile * pProfile = pResult->m_pProfile;

	MEMORY ( MEM_DISK_QUERY );

	// to avoid the checking of a ppSorters's element for NULL on every next step, just filter out all nulls right here
	CSphVector<ISphMatchSorter*> dSorters;
	dSorters.Reserve ( iSorters );
	for ( int i=0; i<iSorters; i++ )
		if ( ppSorters[i] )
			dSorters.Add ( ppSorters[i] );

	iSorters = dSorters.GetLength();

	// if we have anything to work with
	if ( iSorters==0 )
		return false;

	// non-random at the start, random at the end
	dSorters.Sort ( CmpPSortersByRandom_fn() );

	// fast path for scans
	if ( pQuery->m_sQuery.IsEmpty() )
		return MultiScan ( pQuery, pResult, iSorters, &dSorters[0], tArgs );

	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_DICT_SETUP );

	// clone the dictionary if it carries per-query state
	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
	CSphDict * pDictBase = m_pDict;
	if ( pDictBase->HasState() )
		tDictCloned = pDictBase = pDictBase->Clone();

	// layer the star (wildcard) and exact-form wrappers on top of the base dict
	CSphScopedPtr<CSphDict> tDict ( NULL );
	CSphDict * pDict = SetupStarDict ( tDict, pDictBase );

	CSphScopedPtr<CSphDict> tDict2 ( NULL );
	pDict = SetupExactDict ( tDict2, pDict );

	// run the raw query text through the field filter, if any
	CSphVector<BYTE> dFiltered;
	const BYTE * sModifiedQuery = (BYTE *)pQuery->m_sQuery.cstr();
	if ( m_pFieldFilter && m_pFieldFilter->Apply ( sModifiedQuery, 0, dFiltered ) )
		sModifiedQuery = dFiltered.Begin();

	// parse query
	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_PARSE );

	XQQuery_t tParsed;
	if ( !sphParseExtendedQuery ( tParsed, (const char*)sModifiedQuery, pQuery, m_pQueryTokenizer, &m_tSchema, pDict, m_tSettings ) )
	{
		// FIXME? might wanna reset profile to unknown state
		pResult->m_sError = tParsed.m_sParseError;
		return false;
	}
	if ( !tParsed.m_sParseWarning.IsEmpty() )
		pResult->m_sWarning = tParsed.m_sParseWarning;

	// transform query if needed (quorum transform, etc.)
	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_TRANSFORMS );
	sphTransformExtendedQuery ( &tParsed.m_pRoot, m_tSettings, pQuery->m_bSimplify, this );

	// optional wildcard expansion of every plain keyword
	if ( m_bExpandKeywords )
	{
		tParsed.m_pRoot = sphQueryExpandKeywords ( tParsed.m_pRoot, m_tSettings );
		tParsed.m_pRoot->Check ( true );
	}

	// this should be after keyword expansion
	if ( m_tSettings.m_uAotFilterMask )
		TransformAotFilter ( tParsed.m_pRoot, pDict->GetWordforms(), m_tSettings );

	// snapshot the current per-word stats so diffs can be detected later
	SphWordStatChecker_t tStatDiff;
	tStatDiff.Set ( pResult->m_hWordStats );

	// expanding prefix in word dictionary case
	CSphScopedPayload tPayloads;
	XQNode_t * pPrefixed = ExpandPrefix ( tParsed.m_pRoot, pResult, &tPayloads );
	if ( !pPrefixed )
		return false;
	tParsed.m_pRoot = pPrefixed;

	// make sure the (possibly expanded) tree still fits the thread stack
	if ( !sphCheckQueryHeight ( tParsed.m_pRoot, pResult->m_sError ) )
		return false;

	// flag common subtrees
	int iCommonSubtrees = 0;
	if ( m_iMaxCachedDocs && m_iMaxCachedHits )
		iCommonSubtrees = sphMarkCommonSubtrees ( 1, &tParsed );

	tParsed.m_bNeedSZlist = pQuery->m_bZSlist;

	// run the actual search over all the sorters at once
	CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
	bool bResult = ParsedMultiQuery ( pQuery, pResult, iSorters, &dSorters[0], tParsed, pDict, tArgs, &tNodeCache, tStatDiff );

	return bResult;
}
19210 
19211 
19212 /// many regular queries with one sorter attached to each query.
19213 /// returns true if at least one query succeeded. The failed queries indicated with pResult->m_iMultiplier==-1
bool CSphIndex_VLN::MultiQueryEx ( int iQueries, const CSphQuery * pQueries,
	CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const
{
	// ensure we have multiple queries
	assert ( ppResults );
	if ( iQueries==1 )
		return MultiQuery ( pQueries, ppResults[0], 1, ppSorters, tArgs );

	MEMORY ( MEM_DISK_QUERYEX );

	assert ( pQueries );
	assert ( ppSorters );

	// stateful dictionaries must not be shared across queries; clone if needed
	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
	CSphDict * pDictBase = m_pDict;
	if ( pDictBase->HasState() )
		tDictCloned = pDictBase = pDictBase->Clone();

	// wrap the dictionary for star (wildcard) and exact-form query syntax
	CSphScopedPtr<CSphDict> tDict ( NULL );
	CSphDict * pDict = SetupStarDict ( tDict, pDictBase );

	CSphScopedPtr<CSphDict> tDict2 ( NULL );
	pDict = SetupExactDict ( tDict2, pDict );

	// phase 1: parse and preprocess every query; per-query failures are
	// recorded as m_iMultiplier==-1 and do not abort the whole batch
	CSphFixedVector<XQQuery_t> dXQ ( iQueries );
	CSphFixedVector<SphWordStatChecker_t> dStatChecker ( iQueries );
	CSphScopedPayload tPayloads;
	bool bResult = false;		// true when at least one fulltext query parsed OK
	bool bResultScan = false;	// true when at least one fullscan query succeeded
	for ( int i=0; i<iQueries; i++ )
	{
		// nothing to do without a sorter
		if ( !ppSorters[i] )
		{
			ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
			continue;
		}

		// fast path for scans
		if ( pQueries[i].m_sQuery.IsEmpty() )
		{
			if ( MultiScan ( pQueries + i, ppResults[i], 1, &ppSorters[i], tArgs ) )
				bResultScan = true;
			else
				ppResults[i]->m_iMultiplier = -1; ///< show that this particular query failed
			continue;
		}

		ppResults[i]->m_tIOStats.Start();

		// parse query
		if ( sphParseExtendedQuery ( dXQ[i], pQueries[i].m_sQuery.cstr(), &(pQueries[i]), m_pQueryTokenizer, &m_tSchema, pDict, m_tSettings ) )
		{
			// transform query if needed (quorum transform, keyword expansion, etc.)
			sphTransformExtendedQuery ( &dXQ[i].m_pRoot, m_tSettings, pQueries[i].m_bSimplify, this );

			if ( m_bExpandKeywords )
			{
				dXQ[i].m_pRoot = sphQueryExpandKeywords ( dXQ[i].m_pRoot, m_tSettings );
				dXQ[i].m_pRoot->Check ( true );
			}

			// this should be after keyword expansion
			if ( m_tSettings.m_uAotFilterMask )
				TransformAotFilter ( dXQ[i].m_pRoot, pDict->GetWordforms(), m_tSettings );

			// snapshot current per-word stats so stage 2 can warn on divergence
			dStatChecker[i].Set ( ppResults[i]->m_hWordStats );

			// expanding prefix in word dictionary case
			XQNode_t * pPrefixed = ExpandPrefix ( dXQ[i].m_pRoot, ppResults[i], &tPayloads );
			if ( pPrefixed )
			{
				dXQ[i].m_pRoot = pPrefixed;

				if ( sphCheckQueryHeight ( dXQ[i].m_pRoot, ppResults[i]->m_sError ) )
				{
					bResult = true;
				} else
				{
					ppResults[i]->m_iMultiplier = -1;
					SafeDelete ( dXQ[i].m_pRoot );
				}
			} else
			{
				ppResults[i]->m_iMultiplier = -1;
				SafeDelete ( dXQ[i].m_pRoot );
			}
		} else
		{
			ppResults[i]->m_sError = dXQ[i].m_sParseError;
			ppResults[i]->m_iMultiplier = -1;
		}
		if ( !dXQ[i].m_sParseWarning.IsEmpty() )
			ppResults[i]->m_sWarning = dXQ[i].m_sParseWarning;

		ppResults[i]->m_tIOStats.Stop();
	}

	// phase 2: run the successfully parsed queries, sharing common subtrees
	// continue only if we have at least one non-failed
	if ( bResult )
	{
		int iCommonSubtrees = 0;
		if ( m_iMaxCachedDocs && m_iMaxCachedHits )
			iCommonSubtrees = sphMarkCommonSubtrees ( iQueries, &dXQ[0] );

		CSphQueryNodeCache tNodeCache ( iCommonSubtrees, m_iMaxCachedDocs, m_iMaxCachedHits );
		bResult = false;
		for ( int j=0; j<iQueries; j++ )
		{
			// fullscan case
			if ( pQueries[j].m_sQuery.IsEmpty() )
				continue;

			ppResults[j]->m_tIOStats.Start();

			if ( dXQ[j].m_pRoot && ppSorters[j]
					&& ParsedMultiQuery ( &pQueries[j], ppResults[j], 1, &ppSorters[j], dXQ[j], pDict, tArgs, &tNodeCache, dStatChecker[j] ) )
			{
				bResult = true;
				// multiplier reflects shared-subtree amortization across the batch
				ppResults[j]->m_iMultiplier = iCommonSubtrees ? iQueries : 1;
			} else
			{
				ppResults[j]->m_iMultiplier = -1;
			}

			ppResults[j]->m_tIOStats.Stop();
		}
	}

	return bResult | bResultScan;
}
19345 
/// run one already-parsed fulltext query (tXQ) against this index and feed
/// the given sorters; fills pResult with matches, stats, and pool pointers.
/// returns false on setup/runtime error (error text in pResult->m_sError),
/// true otherwise (including the empty-index and early-reject cases)
bool CSphIndex_VLN::ParsedMultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult,
	int iSorters, ISphMatchSorter ** ppSorters, const XQQuery_t & tXQ, CSphDict * pDict,
	const CSphMultiQueryArgs & tArgs, CSphQueryNodeCache * pNodeCache, const SphWordStatChecker_t & tStatDiff ) const
{
	assert ( pQuery );
	assert ( pResult );
	assert ( ppSorters );
	assert ( !pQuery->m_sQuery.IsEmpty() && pQuery->m_eMode!=SPH_MATCH_FULLSCAN ); // scans must go through MultiScan()
	assert ( tArgs.m_iTag>=0 );

	// start counting
	int64_t tmQueryStart = sphMicroTimer();

	CSphQueryProfile * pProfile = pResult->m_pProfile;
	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_INIT );

	///////////////////
	// setup searching
	///////////////////

	// non-ready index, empty response!
	if ( !m_pPreread || !*m_pPreread )
	{
		pResult->m_sError = "index not preread";
		return false;
	}

	// select the sorter with max schema
	// (the widest schema drives attribute calculation setup below)
	int iMaxSchemaSize = -1;
	int iMaxSchemaIndex = -1;
	for ( int i=0; i<iSorters; i++ )
		if ( ppSorters[i]->GetSchema().GetRowSize() > iMaxSchemaSize )
		{
			iMaxSchemaSize = ppSorters[i]->GetSchema().GetRowSize();
			iMaxSchemaIndex = i;
		}

	// setup calculations and result schema
	CSphQueryContext tCtx;
	tCtx.m_pProfile = pProfile;
	tCtx.m_pLocalDocs = tArgs.m_pLocalDocs;
	tCtx.m_iTotalDocs = tArgs.m_iTotalDocs;
	if ( !tCtx.SetupCalc ( pResult, ppSorters[iMaxSchemaIndex]->GetSchema(), m_tSchema, m_tMva.GetWritePtr(), m_bArenaProhibit, false ) )
		return false;

	// set string pool for string on_sort expression fix up
	tCtx.SetStringPool ( m_tString.GetWritePtr() );

	tCtx.m_uPackedFactorFlags = tArgs.m_uPackedFactorFlags;

	// open files
	CSphAutofile tDoclist, tHitlist;
	if ( !m_bKeepFilesOpen )
	{
		if ( pProfile )
			pProfile->Switch ( SPH_QSTATE_OPEN );

		if ( tDoclist.Open ( GetIndexFileName("spd"), SPH_O_READ, pResult->m_sError ) < 0 )
			return false;

		// pre-v3 indexes keep hits inside the doclist file
		if ( tHitlist.Open ( GetIndexFileName ( m_uVersion>=3 ? "spp" : "spd" ), SPH_O_READ, pResult->m_sError ) < 0 )
			return false;
	}

	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_INIT );

	// setup search terms
	DiskIndexQwordSetup_c tTermSetup ( m_bKeepFilesOpen ? m_tDoclistFile : tDoclist,
		m_bKeepFilesOpen ? m_tHitlistFile : tHitlist,
		m_pSkiplists.GetWritePtr(), pProfile );

	tTermSetup.m_pDict = pDict;
	tTermSetup.m_pIndex = this;
	tTermSetup.m_eDocinfo = m_tSettings.m_eDocinfo;
	tTermSetup.m_uMinDocid = m_uMinDocid;
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
	{
		tTermSetup.m_iInlineRowitems = m_tSchema.GetRowSize();
		tTermSetup.m_pMinRow = m_dMinRow.Begin();
	}
	tTermSetup.m_iDynamicRowitems = ppSorters[iMaxSchemaIndex]->GetSchema().GetDynamicSize();

	if ( pQuery->m_uMaxQueryMsec>0 )
		tTermSetup.m_iMaxTimer = sphMicroTimer() + pQuery->m_uMaxQueryMsec*1000; // max_query_time
	tTermSetup.m_pWarning = &pResult->m_sWarning;
	tTermSetup.m_bSetupReaders = true;
	tTermSetup.m_pCtx = &tCtx;
	tTermSetup.m_pNodeCache = pNodeCache;

	// setup prediction constrain
	CSphQueryStats tQueryStats;
	bool bCollectPredictionCounters = ( pQuery->m_iMaxPredictedMsec>0 );
	int64_t iNanoBudget = (int64_t)(pQuery->m_iMaxPredictedMsec) * 1000000; // from milliseconds to nanoseconds
	tQueryStats.m_pNanoBudget = &iNanoBudget;
	if ( bCollectPredictionCounters )
		tTermSetup.m_pStats = &tQueryStats;

	// bind weights
	tCtx.BindWeights ( pQuery, m_tSchema, pResult->m_sWarning );

	// setup query
	// must happen before index-level reject, in order to build proper keyword stats
	CSphScopedPtr<ISphRanker> pRanker ( sphCreateRanker ( tXQ, pQuery, pResult, tTermSetup, tCtx ) );
	if ( !pRanker.Ptr() )
		return false;

	// warn if per-word stats diverged from the earlier snapshot
	tStatDiff.DumpDiffer ( pResult->m_hWordStats, m_sIndexName.cstr(), pResult->m_sWarning );

	if ( ( tArgs.m_uPackedFactorFlags & SPH_FACTOR_ENABLE ) && pQuery->m_eRanker!=SPH_RANK_EXPR )
		pResult->m_sWarning.SetSprintf ( "packedfactors() and bm25f() requires using an expression ranker" );

	tCtx.SetupExtraData ( pRanker.Ptr(), iSorters==1 ? ppSorters[0] : NULL );

	// hand the ranker pointers into the shared MVA/string pools
	PoolPtrs_t tMva;
	tMva.m_pMva = m_tMva.GetWritePtr();
	tMva.m_bArenaProhibit = m_bArenaProhibit;
	pRanker->ExtraData ( EXTRA_SET_MVAPOOL, (void**)&tMva );
	pRanker->ExtraData ( EXTRA_SET_STRINGPOOL, (void**)m_tString.GetWritePtr() );

	int iMatchPoolSize = 0;
	for ( int i=0; i<iSorters; i++ )
		iMatchPoolSize += ppSorters[i]->m_iMatchCapacity;

	pRanker->ExtraData ( EXTRA_SET_POOL_CAPACITY, (void**)&iMatchPoolSize );

	// check for the possible integer overflow in m_dPool.Resize
	int64_t iPoolSize = 0;
	if ( pRanker->ExtraData ( EXTRA_GET_POOL_SIZE, (void**)&iPoolSize ) && iPoolSize>INT_MAX )
	{
		pResult->m_sError.SetSprintf ( "ranking factors pool too big (%d Mb), reduce max_matches", (int)( iPoolSize/1024/1024 ) );
		return false;
	}

	// empty index, empty response!
	if ( m_bIsEmpty )
		return true;
	assert ( m_tSettings.m_eDocinfo!=SPH_DOCINFO_EXTERN || !m_tAttr.IsEmpty() ); // check that docinfo is preloaded

	// setup filters
	if ( !tCtx.CreateFilters ( pQuery->m_sQuery.IsEmpty(), &pQuery->m_dFilters, ppSorters[iMaxSchemaIndex]->GetSchema(),
								m_tMva.GetWritePtr(), m_tString.GetWritePtr(), pResult->m_sError, pQuery->m_eCollation, m_bArenaProhibit, tArgs.m_dKillList ) )
		return false;

	// check if we can early reject the whole index
	// (evaluate the filter against the index-wide min/max docinfo block)
	if ( tCtx.m_pFilter && m_iDocinfoIndex )
	{
		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
		DWORD * pMinEntry = const_cast<DWORD*> ( &m_pDocinfoIndex [ m_iDocinfoIndex*uStride*2 ] );
		DWORD * pMaxEntry = pMinEntry + uStride;

		if ( !tCtx.m_pFilter->EvalBlock ( pMinEntry, pMaxEntry ) )
			return true;
	}

	// setup lookup
	// decide whether matches need their attribute rows fetched at filter and/or sort time
	tCtx.m_bLookupFilter = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN ) && pQuery->m_dFilters.GetLength();
	if ( tCtx.m_dCalcFilter.GetLength() || pQuery->m_eRanker==SPH_RANK_EXPR || pQuery->m_eRanker==SPH_RANK_EXPORT )
		tCtx.m_bLookupFilter = true; // suboptimal in case of attr-independent expressions, but we don't care

	tCtx.m_bLookupSort = false;
	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !tCtx.m_bLookupFilter )
		for ( int iSorter=0; iSorter<iSorters && !tCtx.m_bLookupSort; iSorter++ )
			if ( ppSorters[iSorter]->UsesAttrs() )
				tCtx.m_bLookupSort = true;
	if ( tCtx.m_dCalcSort.GetLength() )
		tCtx.m_bLookupSort = true; // suboptimal in case of attr-independent expressions, but we don't care

	// setup sorters vs. MVA
	for ( int i=0; i<iSorters; i++ )
	{
		(ppSorters[i])->SetMVAPool ( m_tMva.GetWritePtr(), m_bArenaProhibit );
		(ppSorters[i])->SetStringPool ( m_tString.GetWritePtr() );
	}

	// setup overrides
	if ( !tCtx.SetupOverrides ( pQuery, pResult, m_tSchema, ppSorters[iMaxSchemaIndex]->GetSchema() ) )
		return false;

	//////////////////////////////////////
	// find and weight matching documents
	//////////////////////////////////////

	// if no lookup happened during matching, a final pass must backfill rows;
	// such matches get tag -1 until SphFinalMatchCalc_t retags them
	bool bFinalLookup = !tCtx.m_bLookupFilter && !tCtx.m_bLookupSort;
	bool bFinalPass = bFinalLookup || tCtx.m_dCalcFinal.GetLength();
	int iMyTag = bFinalPass ? -1 : tArgs.m_iTag;

	switch ( pQuery->m_eMode )
	{
		case SPH_MATCH_ALL:
		case SPH_MATCH_PHRASE:
		case SPH_MATCH_ANY:
		case SPH_MATCH_EXTENDED:
		case SPH_MATCH_EXTENDED2:
		case SPH_MATCH_BOOLEAN:
			MatchExtended ( &tCtx, pQuery, iSorters, ppSorters, pRanker.Ptr(), iMyTag, tArgs.m_iIndexWeight );
			break;

		default:
			sphDie ( "INTERNAL ERROR: unknown matching mode (mode=%d)", pQuery->m_eMode );
	}

	////////////////////
	// cook result sets
	////////////////////

	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_FINALIZE );

	// adjust result sets
	if ( bFinalPass )
	{
		// GotUDF means promise to UDFs that final-stage calls will be evaluated
		// a) over the final, pre-limit result set
		// b) in the final result set order
		bool bGotUDF = false;
		ARRAY_FOREACH_COND ( i, tCtx.m_dCalcFinal, !bGotUDF )
			tCtx.m_dCalcFinal[i].m_pExpr->Command ( SPH_EXPR_GET_UDF, &bGotUDF );

		SphFinalMatchCalc_t tProcessor ( tArgs.m_iTag, bFinalLookup ? this : NULL, tCtx );
		for ( int iSorter=0; iSorter<iSorters; iSorter++ )
		{
			ISphMatchSorter * pTop = ppSorters[iSorter];
			pTop->Finalize ( tProcessor, bGotUDF );
		}
		pResult->m_iBadRows += tProcessor.m_iBadRows;
	}

	// mva and string pools ptrs
	pResult->m_pMva = m_tMva.GetWritePtr();
	pResult->m_pStrings = m_tString.GetWritePtr();
	pResult->m_bArenaProhibit = m_bArenaProhibit;
	pResult->m_iBadRows += tCtx.m_iBadRows;

	// query timer
	int64_t tmWall = sphMicroTimer() - tmQueryStart;
	pResult->m_iQueryTime += (int)( tmWall/1000 );

#if 0
	printf ( "qtm %d, %d, %d, %d, %d\n", int(tmWall), tQueryStats.m_iFetchedDocs,
		tQueryStats.m_iFetchedHits, tQueryStats.m_iSkips, ppSorters[0]->GetTotalCount() );
#endif

	if ( pProfile )
		pProfile->Switch ( SPH_QSTATE_UNKNOWN );

	if ( bCollectPredictionCounters )
	{
		pResult->m_tStats.m_iFetchedDocs += tQueryStats.m_iFetchedDocs;
		pResult->m_tStats.m_iFetchedHits += tQueryStats.m_iFetchedHits;
		pResult->m_tStats.m_iSkips += tQueryStats.m_iSkips;
		pResult->m_bHasPrediction = true;
	}

	return true;
}
19603 
19604 //////////////////////////////////////////////////////////////////////////
19605 // INDEX STATUS
19606 //////////////////////////////////////////////////////////////////////////
19607 
GetStatus(CSphIndexStatus * pRes) const19608 void CSphIndex_VLN::GetStatus ( CSphIndexStatus* pRes ) const
19609 {
19610 	assert ( pRes );
19611 	if ( !pRes )
19612 		return;
19613 	pRes->m_iRamUse = sizeof(CSphIndex_VLN)
19614 		+ m_dMinRow.GetSizeBytes()
19615 		+ m_dFieldLens.GetSizeBytes()
19616 
19617 		+ m_pDocinfoHash.GetLengthBytes()
19618 		+ m_tAttr.GetLengthBytes()
19619 		+ m_tMva.GetLengthBytes()
19620 		+ m_tString.GetLengthBytes()
19621 		+ m_tWordlist.m_pBuf.GetLengthBytes()
19622 		+ m_pKillList.GetLengthBytes()
19623 		+ m_pSkiplists.GetLengthBytes()
19624 		+ m_dShared.GetLengthBytes();
19625 
19626 	char sFile [ SPH_MAX_FILENAME_LEN ];
19627 	pRes->m_iDiskUse = 0;
19628 	for ( int i=0; i<sphGetExtCount ( m_uVersion ); i++ )
19629 	{
19630 		snprintf ( sFile, sizeof(sFile), "%s%s", m_sFilename.cstr(), sphGetExts ( SPH_EXT_TYPE_CUR, m_uVersion )[i] );
19631 		struct_stat st;
19632 		if ( stat ( sFile, &st )==0 )
19633 			pRes->m_iDiskUse += st.st_size;
19634 	}
19635 }
19636 
19637 //////////////////////////////////////////////////////////////////////////
19638 // INDEX CHECKING
19639 //////////////////////////////////////////////////////////////////////////
19640 
/// switch the index into debug-check mode (used by indextool --check);
/// ondisk attributes are disabled so DebugCheck() can read .spa/.sps directly
void CSphIndex_VLN::SetDebugCheck ()
{
	SetEnableOndiskAttributes ( false );
	m_bDebugCheck = true;
}
19646 
sphUnpackStrLength(CSphReader & tReader)19647 static int sphUnpackStrLength ( CSphReader & tReader )
19648 {
19649 	int v = tReader.GetByte();
19650 	if ( v & 0x80 )
19651 	{
19652 		if ( v & 0x40 )
19653 		{
19654 			v = ( int ( v & 0x3f )<<16 ) + ( int ( tReader.GetByte() )<<8 );
19655 			v += ( tReader.GetByte() ); // MUST be separate statement; cf. sequence point
19656 		} else
19657 		{
19658 			v = ( int ( v & 0x3f )<<8 ) + ( tReader.GetByte() );
19659 		}
19660 	}
19661 
19662 	return v;
19663 }
19664 
/// compact docid presence set built by scanning the attributes (.spa) reader;
/// used by DebugCheck to verify that doclist entries reference real rows.
/// picks the cheaper of two representations: a raw sorted docid array, or a
/// bitmap over the [min..max] docid range
class CSphDocidList
{
public:
	CSphDocidList ()
	{
		m_bRawID = true;
		// min>max marks the "empty" state until Init() runs
		m_iDocidMin = DOCID_MAX;
		m_iDocidMax = 0;
	}

	~CSphDocidList ()
	{}

	/// scan iRows rows of (docid + iRowSize attrs) from rdAttr and build the set;
	/// returns false (with sError set) only on allocation or read failure
	bool Init ( int iRowSize, int64_t iRows, CSphReader & rdAttr, CSphString & sError )
	{
		if ( !iRows )
			return true;

		int iSkip = sizeof ( CSphRowitem ) * iRowSize;
		CSphString sWarning;

		// peek at the first and last rows to learn the docid range,
		// then rewind for the full scan below
		rdAttr.SeekTo ( 0, sizeof ( CSphRowitem ) * ( DOCINFO_IDSIZE + iRowSize ) );
		m_iDocidMin = rdAttr.GetDocid ();
		rdAttr.SeekTo ( ( iRows-1 ) * sizeof ( CSphRowitem ) * ( DOCINFO_IDSIZE + iRowSize ), sizeof ( CSphRowitem ) * ( DOCINFO_IDSIZE + iRowSize ) );
		m_iDocidMax = rdAttr.GetDocid();
		rdAttr.SeekTo ( 0, sizeof ( CSphRowitem ) * ( DOCINFO_IDSIZE + iRowSize ) );

		// unordered docids; leave the set empty (HasDocid() range check rejects all)
		if ( m_iDocidMax<m_iDocidMin )
			return true;

		// choose representation by estimated size: raw array vs range bitmap
		// NOTE(review): uRawBufLenght is in bytes while uBitsBufLenght is a
		// DWORD count, so the comparison is heuristic rather than exact — verify
		uint64_t uRawBufLenght = sizeof(SphDocID_t) * iRows;
		uint64_t uBitsBufLenght = ( m_iDocidMax - m_iDocidMin ) / 32;
		if ( uRawBufLenght<uBitsBufLenght )
		{
			if ( !m_dDocid.Alloc ( iRows, sError, sWarning ) )
			{
				sError.SetSprintf ( "unable to allocate doc-id storage: %s", sError.cstr () );
				return false;
			}
		} else
		{
			if ( !m_dBits.Alloc ( ( uBitsBufLenght * sizeof(DWORD) )+1, sError, sWarning ) )
			{
				sError.SetSprintf ( "unable to allocate doc-id storage: %s", sError.cstr () );
				return false;
			}
			m_bRawID = false;
			memset ( m_dBits.GetWritePtr(), 0, m_dBits.GetLengthBytes() );
		}

		// full scan: record every docid, skipping each row's attribute payload
		for ( int64_t iRow=0; iRow<iRows && !rdAttr.GetErrorFlag (); iRow++ )
		{
			SphDocID_t uDocid = rdAttr.GetDocid ();
			rdAttr.SkipBytes ( iSkip );

			if ( uDocid<m_iDocidMin || uDocid>m_iDocidMax )
				continue;

			if ( m_bRawID )
				m_dDocid.GetWritePtr()[iRow] = uDocid;
			else
			{
				// bitmap mode: set bit (uDocid - min)
				SphDocID_t uIndex = uDocid - m_iDocidMin;
				DWORD uBit = 1UL<<(uIndex & 31);
				m_dBits.GetWritePtr()[uIndex>>5] |= uBit;
			}
		}

		if ( rdAttr.GetErrorFlag () )
		{
			sError.SetSprintf ( "unable to read attributes: %s", rdAttr.GetErrorMessage().cstr() );
			rdAttr.ResetError();
			return false;
		}

		return true;
	}

	/// test membership; raw mode binary-searches the (docid-ordered) array,
	/// bitmap mode tests the corresponding bit
	bool HasDocid ( SphDocID_t uDocid )
	{
		if ( uDocid<m_iDocidMin || uDocid>m_iDocidMax )
			return false;

		if ( m_bRawID )
		{
			return ( sphBinarySearch ( m_dDocid.GetWritePtr(), m_dDocid.GetWritePtr () + m_dDocid.GetNumEntries() - 1, uDocid )!=NULL );
		} else
		{
			SphDocID_t uIndex = uDocid - m_iDocidMin;
			DWORD uBit = 1UL<<( uIndex & 31 );

			return ( ( ( m_dBits.GetWritePtr()[uIndex>>5] & uBit ) )!=0 ); // NOLINT
		}
	}

private:
	CSphSharedBuffer<SphDocID_t> m_dDocid;	///< raw mode: docids in row order
	CSphSharedBuffer<DWORD> m_dBits;		///< bitmap mode: one bit per docid in [min..max]
	bool m_bRawID;							///< true = raw array, false = bitmap
	SphDocID_t m_iDocidMin;
	SphDocID_t m_iDocidMax;
};
19767 
19768 
19769 // no strnlen on some OSes (Mac OS)
19770 #if !HAVE_STRNLEN
/// bounded strlen fallback: returns the length of s, examining at most
/// iMaxLen bytes; a NULL pointer yields 0
size_t strnlen ( const char * s, size_t iMaxLen )
{
	if ( !s )
		return 0;

	size_t iRes = 0;
	// check the bound BEFORE dereferencing; the previous version evaluated
	// *s++ first and therefore read s[iMaxLen], one byte past the allowed
	// window (strnlen must examine at most iMaxLen characters)
	while ( iRes<iMaxLen && s[iRes] )
		++iRes;
	return iRes;
}
19781 #endif
19782 
19783 
// report one failed consistency check: prints "FAILED, <formatted message>"
// to fp and bumps the printed counter, but after FAILS_THRESH reports it
// suppresses further output (iFails keeps counting so totals stay accurate).
// _args must be a parenthesized fprintf argument list, e.g. (( fp, "fmt", x )).
// NOTE: expects fp, iFails, iFailsPrinted and FAILS_THRESH in scope at the
// expansion site (DebugCheck); comments can't go inside the macro body
// because line splicing happens before comment removal
#define LOC_FAIL(_args) \
	if ( ++iFails<=FAILS_THRESH ) \
	{ \
		fprintf ( fp, "FAILED, " ); \
		fprintf _args; \
		fprintf ( fp, "\n" ); \
		iFailsPrinted++; \
		\
		if ( iFails==FAILS_THRESH ) \
			fprintf ( fp, "(threshold reached; suppressing further output)\n" ); \
	}
19795 
19796 
DebugCheck(FILE * fp)19797 int CSphIndex_VLN::DebugCheck ( FILE * fp )
19798 {
19799 	int64_t tmCheck = sphMicroTimer();
19800 	int64_t iFails = 0;
19801 	int iFailsPrinted = 0;
19802 	const int FAILS_THRESH = 100;
19803 
19804 	// check if index is ready
19805 	if ( m_dShared.GetNumEntries()!=SPH_SHARED_VARS_COUNT || !m_pPreread || !*m_pPreread )
19806 		LOC_FAIL(( fp, "index not preread" ));
19807 
19808 	bool bProgress = isatty ( fileno ( fp ) )!=0;
19809 
19810 	//////////////
19811 	// open files
19812 	//////////////
19813 
19814 	CSphString sError;
19815 	CSphAutoreader rdDocs, rdHits;
19816 	CSphAutoreader rdDict;
19817 	CSphAutoreader rdSkips;
19818 	int64_t iSkiplistLen = 0;
19819 
19820 	if ( !rdDict.Open ( GetIndexFileName("spi").cstr(), sError ) )
19821 		LOC_FAIL(( fp, "unable to open dictionary: %s", sError.cstr() ));
19822 
19823 	if ( !rdDocs.Open ( GetIndexFileName("spd"), sError ) )
19824 		LOC_FAIL(( fp, "unable to open doclist: %s", sError.cstr() ));
19825 
19826 	if ( !rdHits.Open ( GetIndexFileName("spp"), sError ) )
19827 		LOC_FAIL(( fp, "unable to open hitlist: %s", sError.cstr() ));
19828 
19829 	if ( m_bHaveSkips )
19830 	{
19831 		if ( !rdSkips.Open ( GetIndexFileName ( "spe" ), sError ) )
19832 			LOC_FAIL ( ( fp, "unable to open skiplist: %s", sError.cstr () ) );
19833 		iSkiplistLen = rdSkips.GetFilesize();
19834 	}
19835 
19836 	CSphAutoreader rdAttr;
19837 	CSphAutoreader rdString;
19838 	CSphAutoreader rdMva;
19839 	int64_t iStrEnd = 0;
19840 	int64_t iMvaEnd = 0;
19841 
19842 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_tAttr.IsEmpty() )
19843 	{
19844 		fprintf ( fp, "checking rows...\n" );
19845 
19846 		if ( !rdAttr.Open ( GetIndexFileName("spa").cstr(), sError ) )
19847 			LOC_FAIL(( fp, "unable to open attributes: %s", sError.cstr() ));
19848 
19849 		if ( !rdString.Open ( GetIndexFileName("sps").cstr(), sError ) )
19850 			LOC_FAIL(( fp, "unable to open strings: %s", sError.cstr() ));
19851 
19852 		if ( !rdMva.Open ( GetIndexFileName("spm").cstr(), sError ) )
19853 			LOC_FAIL(( fp, "unable to open MVA: %s", sError.cstr() ));
19854 	}
19855 
19856 	CSphVector<SphWordID_t> dHitlessWords;
19857 	if ( !LoadHitlessWords ( dHitlessWords ) )
19858 		LOC_FAIL(( fp, "unable to load hitless words: %s", m_sLastError.cstr() ));
19859 
19860 	CSphSavedFile tStat;
19861 	const CSphTokenizerSettings & tTokenizerSettings = m_pTokenizer->GetSettings ();
19862 	if ( !tTokenizerSettings.m_sSynonymsFile.IsEmpty() && !GetFileStats ( tTokenizerSettings.m_sSynonymsFile.cstr(), tStat, &sError ) )
19863 		LOC_FAIL(( fp, "unable to open exceptions '%s': %s", tTokenizerSettings.m_sSynonymsFile.cstr(), sError.cstr() ));
19864 
19865 	const CSphDictSettings & tDictSettings = m_pDict->GetSettings ();
19866 	const char * pStop = tDictSettings.m_sStopwords.cstr();
19867 	for ( ;; )
19868 	{
19869 		// find next name start
19870 		while ( pStop && *pStop && isspace(*pStop) ) pStop++;
19871 		if ( !pStop || !*pStop ) break;
19872 
19873 		const char * sNameStart = pStop;
19874 
19875 		// find next name end
19876 		while ( *pStop && !isspace(*pStop) ) pStop++;
19877 
19878 		CSphString sStopFile;
19879 		sStopFile.SetBinary ( sNameStart, pStop-sNameStart );
19880 
19881 		if ( !GetFileStats ( sStopFile.cstr(), tStat, &sError ) )
19882 			LOC_FAIL(( fp, "unable to open stopwords '%s': %s", sStopFile.cstr(), sError.cstr() ));
19883 	}
19884 
19885 	if ( !tDictSettings.m_dWordforms.GetLength() )
19886 	{
19887 		ARRAY_FOREACH ( i, tDictSettings.m_dWordforms )
19888 		{
19889 			if ( !GetFileStats ( tDictSettings.m_dWordforms[i].cstr(), tStat, &sError ) )
19890 				LOC_FAIL(( fp, "unable to open wordforms '%s': %s", tDictSettings.m_dWordforms[i].cstr(), sError.cstr() ));
19891 		}
19892 	}
19893 
19894 	////////////////////
19895 	// check dictionary
19896 	////////////////////
19897 
19898 	fprintf ( fp, "checking dictionary...\n" );
19899 
19900 	SphWordID_t uWordid = 0;
19901 	int64_t iDoclistOffset = 0;
19902 	int iWordsTotal = 0;
19903 
19904 	char sWord[MAX_KEYWORD_BYTES], sLastWord[MAX_KEYWORD_BYTES];
19905 	memset ( sWord, 0, sizeof(sWord) );
19906 	memset ( sLastWord, 0, sizeof(sLastWord) );
19907 
19908 	const int iWordPerCP = m_uVersion>=21 ? SPH_WORDLIST_CHECKPOINT : 1024;
19909 	const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
19910 
19911 	CSphVector<CSphWordlistCheckpoint> dCheckpoints;
19912 	dCheckpoints.Reserve ( m_tWordlist.m_dCheckpoints.GetLength() );
19913 	CSphVector<char> dCheckpointWords;
19914 	dCheckpointWords.Reserve ( m_tWordlist.m_pWords.GetLength() );
19915 
19916 	if ( bWordDict && m_uVersion<21 )
19917 		LOC_FAIL(( fp, "dictionary needed index version not less then 21 (readed=%d)"
19918 			, m_uVersion ));
19919 
19920 	rdDict.GetByte();
19921 	int iLastSkipsOffset = 0;
19922 	SphOffset_t iWordsEnd = m_tWordlist.m_iWordsEnd;
19923 
19924 	while ( rdDict.GetPos()!=iWordsEnd && !m_bIsEmpty )
19925 	{
19926 		// sanity checks
19927 		if ( rdDict.GetPos()>=iWordsEnd )
19928 		{
19929 			LOC_FAIL(( fp, "reading past checkpoints" ));
19930 			break;
19931 		}
19932 
19933 		// store current entry pos (for checkpointing later), read next delta
19934 		const int64_t iDictPos = rdDict.GetPos();
19935 		SphWordID_t iDeltaWord = 0;
19936 		if ( bWordDict )
19937 		{
19938 			iDeltaWord = rdDict.GetByte();
19939 		} else
19940 		{
19941 			iDeltaWord = rdDict.UnzipWordid();
19942 		}
19943 
19944 		// checkpoint encountered, handle it
19945 		if ( !iDeltaWord )
19946 		{
19947 			rdDict.UnzipOffset();
19948 
19949 			if ( ( iWordsTotal%iWordPerCP )!=0 && rdDict.GetPos()!=iWordsEnd )
19950 				LOC_FAIL(( fp, "unexpected checkpoint (pos=" INT64_FMT ", word=%d, words=%d, expected=%d)",
19951 					iDictPos, iWordsTotal, ( iWordsTotal%iWordPerCP ), iWordPerCP ));
19952 
19953 			uWordid = 0;
19954 			iDoclistOffset = 0;
19955 			continue;
19956 		}
19957 
19958 		SphWordID_t uNewWordid = 0;
19959 		SphOffset_t iNewDoclistOffset = 0;
19960 		int iDocs = 0;
19961 		int iHits = 0;
19962 		bool bHitless = false;
19963 
19964 		if ( bWordDict )
19965 		{
19966 			// unpack next word
19967 			// must be in sync with DictEnd()!
19968 			BYTE uPack = (BYTE)iDeltaWord;
19969 			int iMatch, iDelta;
19970 			if ( uPack & 0x80 )
19971 			{
19972 				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
19973 				iMatch = uPack & 15;
19974 			} else
19975 			{
19976 				iDelta = uPack & 127;
19977 				iMatch = rdDict.GetByte();
19978 			}
19979 			const int iLastWordLen = strlen(sLastWord);
19980 			if ( iMatch+iDelta>=(int)sizeof(sLastWord)-1 || iMatch>iLastWordLen )
19981 			{
19982 				LOC_FAIL(( fp, "wrong word-delta (pos=" INT64_FMT ", word=%s, len=%d, begin=%d, delta=%d)",
19983 					iDictPos, sLastWord, iLastWordLen, iMatch, iDelta ));
19984 				rdDict.SkipBytes ( iDelta );
19985 			} else
19986 			{
19987 				rdDict.GetBytes ( sWord+iMatch, iDelta );
19988 				sWord [ iMatch+iDelta ] = '\0';
19989 			}
19990 
19991 			iNewDoclistOffset = rdDict.UnzipOffset();
19992 			iDocs = rdDict.UnzipInt();
19993 			iHits = rdDict.UnzipInt();
19994 			int iHint = 0;
19995 			if ( iDocs>=DOCLIST_HINT_THRESH )
19996 			{
19997 				iHint = rdDict.GetByte();
19998 			}
19999 			iHint = DoclistHintUnpack ( iDocs, (BYTE)iHint );
20000 
20001 			if ( m_tSettings.m_eHitless==SPH_HITLESS_SOME && ( iDocs & HITLESS_DOC_FLAG )!=0 )
20002 			{
20003 				iDocs = ( iDocs & HITLESS_DOC_MASK );
20004 				bHitless = true;
20005 			}
20006 
20007 			const int iNewWordLen = strlen(sWord);
20008 
20009 			if ( iNewWordLen==0 )
20010 				LOC_FAIL(( fp, "empty word in dictionary (pos=" INT64_FMT ")",
20011 					iDictPos ));
20012 
20013 			if ( iLastWordLen && iNewWordLen )
20014 				if ( sphDictCmpStrictly ( sWord, iNewWordLen, sLastWord, iLastWordLen )<=0 )
20015 					LOC_FAIL(( fp, "word order decreased (pos=" INT64_FMT ", word=%s, prev=%s)",
20016 						iDictPos, sLastWord, sWord ));
20017 
			// doclist-size hint must be non-negative
			if ( iHint<0 )
				LOC_FAIL(( fp, "invalid word hint (pos=" INT64_FMT ", word=%s, hint=%d)",
					iDictPos, sWord, iHint ));

			// every matched doc contributes at least one hit,
			// so docs>0, hits>0 and hits>=docs must all hold
			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
				LOC_FAIL(( fp, "invalid docs/hits (pos=" INT64_FMT ", word=%s, docs=" INT64_FMT ", hits=" INT64_FMT ")",
					(int64_t)iDictPos, sWord, (int64_t)iDocs, (int64_t)iHits ));

			// remember the current keyword for the next iteration's ordering check
			memcpy ( sLastWord, sWord, sizeof(sLastWord) );
		} else
		{
			// finish reading the entire entry (CRC dict format)
			uNewWordid = uWordid + iDeltaWord;
			iNewDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
			iDocs = rdDict.UnzipInt();
			iHits = rdDict.UnzipInt();
			bHitless = ( dHitlessWords.BinarySearch ( uNewWordid )!=NULL );
			// for hitless words, the docs count carries a flag bit; strip it
			if ( bHitless )
				iDocs = ( iDocs & HITLESS_DOC_MASK );

			// wordids are delta-encoded, so they must be strictly ascending
			if ( uNewWordid<=uWordid )
				LOC_FAIL(( fp, "wordid decreased (pos=" INT64_FMT ", wordid=" UINT64_FMT ", previd=" UINT64_FMT ")",
					(int64_t)iDictPos, (uint64_t)uNewWordid, (uint64_t)uWordid ));

			// doclist offsets are delta-encoded too and must grow monotonically
			if ( iNewDoclistOffset<=iDoclistOffset )
				LOC_FAIL(( fp, "doclist offset decreased (pos=" INT64_FMT ", wordid=" UINT64_FMT ")",
					(int64_t)iDictPos, (uint64_t)uNewWordid ));

			if ( iDocs<=0 || iHits<=0 || iHits<iDocs )
				LOC_FAIL(( fp, "invalid docs/hits (pos=" INT64_FMT ", word=" UINT64_FMT ", docs=" INT64_FMT ", hits=" INT64_FMT ", hitless=%s)",
					(int64_t)iDictPos, (uint64_t)uNewWordid, (int64_t)iDocs, (int64_t)iHits, ( bHitless?"true":"false" ) ));
		}

		// skiplist
		// words with more than one skiplist block worth of docs carry a
		// skiplist offset; for CRC dicts those offsets must be ascending
		if ( m_bHaveSkips && iDocs>SPH_SKIPLIST_BLOCK && !bHitless )
		{
			int iSkipsOffset = rdDict.UnzipInt();
			if ( !bWordDict && iSkipsOffset<iLastSkipsOffset )
				LOC_FAIL(( fp, "descending skiplist pos (last=%d, cur=%d, wordid=%llu)",
					iLastSkipsOffset, iSkipsOffset, UINT64 ( uNewWordid ) ));
			iLastSkipsOffset = iSkipsOffset;
		}

		// update stats, add checkpoint
		if ( ( iWordsTotal%iWordPerCP )==0 )
		{
			CSphWordlistCheckpoint & tCP = dCheckpoints.Add();
			tCP.m_iWordlistOffset = iDictPos;

			if ( bWordDict )
			{
				const int iLen = strlen ( sWord );
				char * sArenaWord = dCheckpointWords.AddN ( iLen + 1 );
				memcpy ( sArenaWord, sWord, iLen );
				sArenaWord[iLen] = '\0';
				// store an arena offset rather than a pointer: AddN() may
				// reallocate the arena and invalidate earlier pointers
				tCP.m_uWordID = sArenaWord - dCheckpointWords.Begin();
			} else
				tCP.m_uWordID = uNewWordid;
		}

		// TODO add back infix checking

		// carry current entry's state into the next iteration
		uWordid = uNewWordid;
		iDoclistOffset = iNewDoclistOffset;
		iWordsTotal++;
	}
20084 
	// check the checkpoints
	// the recalculated checkpoint count must match what the loader read
	if ( dCheckpoints.GetLength()!=m_tWordlist.m_dCheckpoints.GetLength() )
		LOC_FAIL(( fp, "checkpoint count mismatch (read=%d, calc=%d)",
			m_tWordlist.m_dCheckpoints.GetLength(), dCheckpoints.GetLength() ));

	// compare each recalculated checkpoint against the loaded one
	for ( int i=0; i < Min ( dCheckpoints.GetLength(), m_tWordlist.m_dCheckpoints.GetLength() ); i++ )
	{
		CSphWordlistCheckpoint tRefCP = dCheckpoints[i];
		const CSphWordlistCheckpoint & tCP = m_tWordlist.m_dCheckpoints[i];
		const int iLen = bWordDict ? strlen ( tCP.m_sWord ) : 0;
		// for keyword dicts, m_uWordID of a recalculated checkpoint holds an
		// offset into the checkpoint-words arena; resolve it into a pointer
		if ( bWordDict )
			tRefCP.m_sWord = dCheckpointWords.Begin() + tRefCP.m_uWordID;
		if ( bWordDict && ( tRefCP.m_sWord[0]=='\0' || tCP.m_sWord[0]=='\0' ) )
		{
			LOC_FAIL(( fp, "empty checkpoint %d (read_word=%s, read_len=%u, readpos=" INT64_FMT ", calc_word=%s, calc_len=%u, calcpos=" INT64_FMT ")",
				i, tCP.m_sWord, (DWORD)strlen ( tCP.m_sWord ), (int64_t)tCP.m_iWordlistOffset,
					tRefCP.m_sWord, (DWORD)strlen ( tRefCP.m_sWord ), (int64_t)tRefCP.m_iWordlistOffset ));

		} else if ( sphCheckpointCmpStrictly ( tCP.m_sWord, iLen, tCP.m_uWordID, bWordDict, tRefCP )
			|| tRefCP.m_iWordlistOffset!=tCP.m_iWordlistOffset )
		{
			if ( bWordDict )
			{
				LOC_FAIL(( fp, "checkpoint %d differs (read_word=%s, readpos=" INT64_FMT ", calc_word=%s, calcpos=" INT64_FMT ")",
					i,
					tCP.m_sWord,
					(int64_t)tCP.m_iWordlistOffset,
					tRefCP.m_sWord,
					(int64_t)tRefCP.m_iWordlistOffset ));
			} else
			{
				LOC_FAIL(( fp, "checkpoint %d differs (readid=" UINT64_FMT ", readpos=" INT64_FMT ", calcid=" UINT64_FMT ", calcpos=" INT64_FMT ")",
					i,
					(uint64_t)tCP.m_uWordID,
					(int64_t)tCP.m_iWordlistOffset,
					(uint64_t)tRefCP.m_uWordID,
					(int64_t)tRefCP.m_iWordlistOffset ));
			}
		}
	}

	// release the recalculated checkpoint storage
	dCheckpoints.Reset();
	dCheckpointWords.Reset();
20128 
20129 	///////////////////////
20130 	// check docs and hits
20131 	///////////////////////
20132 
20133 	fprintf ( fp, "checking data...\n" );
20134 
20135 	CSphScopedPtr<CSphDocidList> tDoclist ( new CSphDocidList );
20136 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !tDoclist->Init ( m_tSchema.GetRowSize (), m_iDocinfo, rdAttr, sError ) )
20137 		LOC_FAIL ( ( fp, "%s", sError.cstr () ) );
20138 
20139 	int64_t iDocsSize = rdDocs.GetFilesize();
20140 
20141 	rdDict.SeekTo ( 1, READ_NO_SIZE_HINT );
20142 	rdDocs.SeekTo ( 1, READ_NO_SIZE_HINT );
20143 	rdHits.SeekTo ( 1, READ_NO_SIZE_HINT );
20144 
20145 	uWordid = 0;
20146 	iDoclistOffset = 0;
20147 	int iDictDocs, iDictHits;
20148 	bool bHitless = false;
20149 
	// second pass: walk the dictionary again, decoding each entry and
	// cross-checking it against the doclist, hitlist and skiplist files
	int iWordsChecked = 0;
	while ( rdDict.GetPos()<iWordsEnd )
	{
		bHitless = false;
		SphWordID_t iDeltaWord = 0;
		// keyword dicts store a packed control byte; CRC dicts a zipped wordid delta
		if ( bWordDict )
		{
			iDeltaWord = rdDict.GetByte();
		} else
		{
			iDeltaWord = rdDict.UnzipWordid();
		}
		// zero delta marks a checkpoint boundary: skip its offset, reset deltas
		if ( !iDeltaWord )
		{
			rdDict.UnzipOffset();

			uWordid = 0;
			iDoclistOffset = 0;
			continue;
		}

		if ( bWordDict )
		{
			// unpack next word
			// must be in sync with DictEnd()!
			BYTE uPack = (BYTE)iDeltaWord;

			int iMatch, iDelta;
			if ( uPack & 0x80 )
			{
				// short form: suffix length and shared-prefix length in one byte
				iDelta = ( ( uPack>>4 ) & 7 ) + 1;
				iMatch = uPack & 15;
			} else
			{
				// long form: 7-bit suffix length, shared-prefix length in the next byte
				iDelta = uPack & 127;
				iMatch = rdDict.GetByte();
			}
			const int iLastWordLen = strlen(sWord);
			if ( iMatch+iDelta>=(int)sizeof(sWord)-1 || iMatch>iLastWordLen )
			{
				// entry does not fit the word buffer; skip its suffix bytes
				rdDict.SkipBytes ( iDelta );
			} else
			{
				// keep iMatch leading bytes of the previous word, append the suffix
				rdDict.GetBytes ( sWord+iMatch, iDelta );
				sWord [ iMatch+iDelta ] = '\0';
			}

			iDoclistOffset = rdDict.UnzipOffset();
			iDictDocs = rdDict.UnzipInt();
			iDictHits = rdDict.UnzipInt();
			// entries over the threshold carry an extra doclist-size hint byte
			if ( iDictDocs>=DOCLIST_HINT_THRESH )
				rdDict.GetByte();

			if ( m_tSettings.m_eHitless==SPH_HITLESS_SOME && ( iDictDocs & HITLESS_DOC_FLAG ) )
			{
				iDictDocs = ( iDictDocs & HITLESS_DOC_MASK );
				bHitless = true;
			}
		} else
		{
			// finish reading the entire entry (CRC dict format)
			uWordid = uWordid + iDeltaWord;
			bHitless = ( dHitlessWords.BinarySearch ( uWordid )!=NULL );
			iDoclistOffset = iDoclistOffset + rdDict.UnzipOffset();
			iDictDocs = rdDict.UnzipInt();
			if ( bHitless )
				iDictDocs = ( iDictDocs & HITLESS_DOC_MASK );
			iDictHits = rdDict.UnzipInt();
		}

		// FIXME? verify skiplist content too
		int iSkipsOffset = 0;
		if ( m_bHaveSkips && iDictDocs>SPH_SKIPLIST_BLOCK && !bHitless )
			iSkipsOffset = rdDict.UnzipInt();

		// check whether the offset is as expected
		if ( iDoclistOffset!=rdDocs.GetPos() )
		{
			if ( !bWordDict )
				LOC_FAIL(( fp, "unexpected doclist offset (wordid=" UINT64_FMT "(%s)(%d), dictpos=" INT64_FMT ", doclistpos=" INT64_FMT ")",
					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, (int64_t)rdDocs.GetPos() ));

			if ( iDoclistOffset>=iDocsSize || iDoclistOffset<0 )
			{
				// offset points outside the doclist file; cannot check this word at all
				LOC_FAIL(( fp, "unexpected doclist offset, off the file (wordid=" UINT64_FMT "(%s)(%d), dictpos=" INT64_FMT ", doclistsize=" INT64_FMT ")",
					(uint64_t)uWordid, sWord, iWordsChecked, iDoclistOffset, iDocsSize ));
				iWordsChecked++;
				continue;
			} else
				rdDocs.SeekTo ( iDoclistOffset, READ_NO_SIZE_HINT );
		}
20241 
		// create and manually setup doclist reader
		DiskIndexQwordTraits_c * pQword = NULL;
		DWORD uInlineHits = ( m_tSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE );
		DWORD uInlineDocinfo = ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE );
		// pick the qword template specialization matching the index format
		switch ( ( uInlineHits<<1 ) | uInlineDocinfo )
		{
		case 0: { typedef DiskIndexQword_c < false, false, false > T; pQword = new T ( false, false ); break; }
		case 1: { typedef DiskIndexQword_c < false, true, false > T; pQword = new T ( false, false ); break; }
		case 2: { typedef DiskIndexQword_c < true, false, false > T; pQword = new T ( false, false ); break; }
		case 3: { typedef DiskIndexQword_c < true, true, false > T; pQword = new T ( false, false ); break; }
		}
		if ( !pQword )
			sphDie ( "INTERNAL ERROR: impossible qword settings" );

		pQword->m_tDoc.Reset ( m_tSchema.GetDynamicSize() );
		pQword->m_iMinID = m_uMinDocid;
		pQword->m_tDoc.m_uDocID = m_uMinDocid;
		if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
		{
			pQword->m_iInlineAttrs = m_tSchema.GetDynamicSize();
			pQword->m_pInlineFixup = m_dMinRow.Begin();
		} else
		{
			pQword->m_iInlineAttrs = 0;
			pQword->m_pInlineFixup = NULL;
		}
		pQword->m_iDocs = 0;
		pQword->m_iHits = 0;
		// share this check's file descriptors, starting at the current positions
		pQword->m_rdDoclist.SetFile ( rdDocs.GetFD(), rdDocs.GetFilename().cstr() );
		pQword->m_rdDoclist.SeekTo ( rdDocs.GetPos(), READ_NO_SIZE_HINT );
		pQword->m_rdHitlist.SetFile ( rdHits.GetFD(), rdHits.GetFilename().cstr() );
		pQword->m_rdHitlist.SeekTo ( rdHits.GetPos(), READ_NO_SIZE_HINT );

		// scratch row for inline attributes, when the index stores them
		CSphRowitem * pInlineStorage = NULL;
		if ( pQword->m_iInlineAttrs )
			pInlineStorage = new CSphRowitem [ pQword->m_iInlineAttrs ];

		// loop the doclist
		SphDocID_t uLastDocid = 0;
		int iDoclistDocs = 0;
		int iDoclistHits = 0;
		int iHitlistHits = 0;

		bHitless |= ( m_tSettings.m_eHitless==SPH_HITLESS_ALL ||
			( m_tSettings.m_eHitless==SPH_HITLESS_SOME && dHitlessWords.BinarySearch ( uWordid ) ) );
		pQword->m_bHasHitlist = !bHitless;
20288 
		// skiplist entries expected for this word, accumulated while decoding
		// the doclist, to be compared against the stored skiplist afterwards
		CSphVector<SkiplistEntry_t> dDoclistSkips;
		for ( ;; )
		{
			// skiplist state is saved just *before* decoding those boundary entries
			if ( m_bHaveSkips && ( iDoclistDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
			{
				SkiplistEntry_t & tBlock = dDoclistSkips.Add();
				tBlock.m_iBaseDocid = pQword->m_tDoc.m_uDocID;
				tBlock.m_iOffset = pQword->m_rdDoclist.GetPos();
				tBlock.m_iBaseHitlistPos = pQword->m_uHitPosition;
			}

			// FIXME? this can fail on a broken entry (eg fieldid over 256)
			const CSphMatch & tDoc = pQword->GetNextDoc ( pInlineStorage );
			if ( !tDoc.m_uDocID )
				break;

			// checks!
			// every doclist docid must have a matching attribute row
			if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN )
			{
				SphDocID_t uDocID = tDoc.m_uDocID;
				if ( !tDoclist->HasDocid ( uDocID ) )
				{
					LOC_FAIL(( fp, "row not found (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ")",
						uint64_t(uWordid), sWord, tDoc.m_uDocID ));
				}
			}

			// docids within a doclist must be strictly ascending
			if ( tDoc.m_uDocID<=uLastDocid )
				LOC_FAIL(( fp, "docid decreased (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", lastid=" DOCID_FMT ")",
					uint64_t(uWordid), sWord, tDoc.m_uDocID, uLastDocid ));

			uLastDocid = tDoc.m_uDocID;
			iDoclistDocs++;
			iDoclistHits += pQword->m_uMatchHits;

			// check position in case of regular (not-inline) hit
			if (!( pQword->m_iHitlistPos>>63 ))
			{
				if ( !bWordDict && pQword->m_iHitlistPos!=pQword->m_rdHitlist.GetPos() )
					LOC_FAIL(( fp, "unexpected hitlist offset (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", expected=" INT64_FMT ", actual=" INT64_FMT ")",
						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID,
						(int64_t)pQword->m_iHitlistPos, (int64_t)pQword->m_rdHitlist.GetPos() ));
			}

			// aim
			pQword->SeekHitlist ( pQword->m_iHitlistPos );

			// loop the hitlist
			int iDocHits = 0;
			FieldMask_t dFieldMask;
			dFieldMask.UnsetAll();
			Hitpos_t uLastHit = EMPTY_HIT;

			while ( !bHitless )
			{
				Hitpos_t uHit = pQword->GetNextHit();
				if ( uHit==EMPTY_HIT )
					break;

				// hits must be sorted in ascending (field, position) order
				if ( !( uLastHit<uHit ) )
					LOC_FAIL(( fp, "hit entries sorting order decreased (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", hit=%u, last=%u)",
							(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, uHit, uLastHit ));

				if ( HITMAN::GetField ( uLastHit )==HITMAN::GetField ( uHit ) )
				{
					if ( !( HITMAN::GetPos ( uLastHit )<HITMAN::GetPos ( uHit ) ) )
						LOC_FAIL(( fp, "hit decreased (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", hit=%u, last=%u)",
								(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, HITMAN::GetPos ( uHit ), HITMAN::GetPos ( uLastHit ) ));
					// an end-of-field (tail) hit may only be the last one in its field
					if ( HITMAN::IsEnd ( uLastHit ) )
						LOC_FAIL(( fp, "multiple tail hits (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", hit=0x%x, last=0x%x)",
								(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, uHit, uLastHit ));
				} else
				{
					if ( !( HITMAN::GetField ( uLastHit )<HITMAN::GetField ( uHit ) ) )
						LOC_FAIL(( fp, "hit field decreased (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", hit field=%u, last field=%u)",
								(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, HITMAN::GetField ( uHit ), HITMAN::GetField ( uLastHit ) ));
				}

				uLastHit = uHit;

				// field id must fit both the hard limit and the actual schema
				int iField = HITMAN::GetField ( uHit );
				if ( iField<0 || iField>=SPH_MAX_FIELDS )
				{
					LOC_FAIL(( fp, "hit field out of bounds (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", field=%d)",
						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, iField ));

				} else if ( iField>=m_tSchema.m_dFields.GetLength() )
				{
					LOC_FAIL(( fp, "hit field out of schema (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", field=%d)",
						(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, iField ));
				} else
				{
					dFieldMask.Set(iField);
				}

				iDocHits++; // to check doclist entry
				iHitlistHits++; // to check dictionary entry
			}

			// check hit count
			if ( iDocHits!=(int)pQword->m_uMatchHits && !bHitless )
				LOC_FAIL(( fp, "doc hit count mismatch (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ", doclist=%d, hitlist=%d)",
					(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID, pQword->m_uMatchHits, iDocHits ));

			if ( GetMatchSchema().m_dFields.GetLength()>32 )
				pQword->CollectHitMask();

			// check the mask
			if ( memcmp ( dFieldMask.m_dMask, pQword->m_dQwordFields.m_dMask, sizeof(dFieldMask.m_dMask) ) && !bHitless )
				LOC_FAIL(( fp, "field mask mismatch (wordid=" UINT64_FMT "(%s), docid=" DOCID_FMT ")",
					(uint64_t)uWordid, sWord, pQword->m_tDoc.m_uDocID ));

			// update my hitlist reader
			rdHits.SeekTo ( pQword->m_rdHitlist.GetPos(), READ_NO_SIZE_HINT );
		}
20405 
		// do checks
		// dictionary doc/hit counters must agree with what was actually decoded
		if ( iDictDocs!=iDoclistDocs )
			LOC_FAIL(( fp, "doc count mismatch (wordid=" UINT64_FMT "(%s), dict=%d, doclist=%d, hitless=%s)",
				uint64_t(uWordid), sWord, iDictDocs, iDoclistDocs, ( bHitless?"true":"false" ) ));

		if ( ( iDictHits!=iDoclistHits || iDictHits!=iHitlistHits ) && !bHitless )
			LOC_FAIL(( fp, "hit count mismatch (wordid=" UINT64_FMT "(%s), dict=%d, doclist=%d, hitlist=%d)",
				uint64_t(uWordid), sWord, iDictHits, iDoclistHits, iHitlistHits ));

		// verify the stored skiplist against the entries accumulated above
		// (while-loop used as a breakable scope; the trailing break makes it run once)
		while ( m_bHaveSkips && iDoclistDocs>SPH_SKIPLIST_BLOCK && !bHitless )
		{
			if ( iSkipsOffset<=0 || iSkipsOffset>iSkiplistLen )
			{
				LOC_FAIL(( fp, "invalid skiplist offset (wordid=%llu(%s), off=%d, max=" INT64_FMT ")",
					UINT64 ( uWordid ), sWord, iSkipsOffset, iSkiplistLen ));
				break;
			}

			// boundary adjustment
			if ( ( iDoclistDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
				dDoclistSkips.Pop();

			SkiplistEntry_t t;
			t.m_iBaseDocid = m_uMinDocid;
			t.m_iOffset = iDoclistOffset;
			t.m_iBaseHitlistPos = 0;

			// hint is: dDoclistSkips * ZIPPED( sizeof(int64_t) * 3 ) == dDoclistSkips * 8
			rdSkips.SeekTo ( iSkipsOffset, dDoclistSkips.GetLength()*8 );
			// entry 0 is the implicit base; comparison starts at entry 1
			int i = 0;
			while ( ++i<dDoclistSkips.GetLength() )
			{
				const SkiplistEntry_t & r = dDoclistSkips[i];

				uint64_t uDocidDelta = rdSkips.UnzipOffset();
				uint64_t uOff = rdSkips.UnzipOffset();
				uint64_t uPosDelta = rdSkips.UnzipOffset();

				if ( rdSkips.GetErrorFlag() )
				{
					LOC_FAIL ( ( fp, "skiplist reading error (wordid=%llu(%s), exp=%d, got=%d, error='%s')",
						UINT64 ( uWordid ), sWord, i, dDoclistSkips.GetLength(), rdSkips.GetErrorMessage().cstr() ) );
					rdSkips.ResetError();
					break;
				}

				// stored values are deltas against the previous skiplist block
				t.m_iBaseDocid += SPH_SKIPLIST_BLOCK + (SphDocID_t)uDocidDelta;
				t.m_iOffset += 4*SPH_SKIPLIST_BLOCK + uOff;
				t.m_iBaseHitlistPos += uPosDelta;
				if ( t.m_iBaseDocid!=r.m_iBaseDocid
					|| t.m_iOffset!=r.m_iOffset ||
					t.m_iBaseHitlistPos!=r.m_iBaseHitlistPos )
				{
					LOC_FAIL(( fp, "skiplist entry %d mismatch (wordid=%llu(%s), exp={%llu, %llu, %llu}, got={%llu, %llu, %llu})",
						i, UINT64 ( uWordid ), sWord,
						UINT64 ( r.m_iBaseDocid ), UINT64 ( r.m_iOffset ), UINT64 ( r.m_iBaseHitlistPos ),
						UINT64 ( t.m_iBaseDocid ), UINT64 ( t.m_iOffset ), UINT64 ( t.m_iBaseHitlistPos ) ));
					break;
				}
			}
			break;
		}
20468 
		// move my reader instance forward too
		rdDocs.SeekTo ( pQword->m_rdDoclist.GetPos(), READ_NO_SIZE_HINT );

		// cleanup
		SafeDelete ( pInlineStorage );
		SafeDelete ( pQword );

		// progress bar
		if ( (++iWordsChecked)%1000==0 && bProgress )
		{
			fprintf ( fp, "%d/%d\r", iWordsChecked, iWordsTotal );
			fflush ( fp );
		}
	}
20483 
	// release the docid lookup before the attribute checks below
	tDoclist = NULL;
20485 
20486 	///////////////////////////
20487 	// check rows (attributes)
20488 	///////////////////////////
20489 
20490 	if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_EXTERN && !m_tAttr.IsEmpty() )
20491 	{
20492 		fprintf ( fp, "checking rows...\n" );
20493 
20494 		// sizes and counts
20495 		int64_t iRowsTotal = m_iDocinfo;
20496 		DWORD uStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
20497 
20498 		int64_t iAllRowsTotal = iRowsTotal;
20499 		iAllRowsTotal += (m_iDocinfoIndex+1)*2; // should had been fixed up to v.20 by the loader
20500 
20501 		if ( iAllRowsTotal*uStride!=(int64_t)m_tAttr.GetNumEntries() )
20502 			LOC_FAIL(( fp, "rowitems count mismatch (expected=" INT64_FMT ", loaded=" INT64_FMT ")",
20503 				iAllRowsTotal*uStride, (int64_t)m_tAttr.GetNumEntries() ));
20504 
20505 		iStrEnd = rdString.GetFilesize();
20506 		iMvaEnd = rdMva.GetFilesize();
20507 		CSphFixedVector<DWORD> dRow ( uStride );
20508 		CSphVector<DWORD> dMva;
20509 		rdAttr.SeekTo ( 0, sizeof ( dRow[0] ) * dRow.GetLength() );
20510 
20511 		// extract rowitem indexes for MVAs etc
20512 		// (ie. attr types that we can and will run additional checks on)
20513 		CSphVector<int> dMvaItems;
20514 		CSphVector<CSphAttrLocator> dFloatItems;
20515 		CSphVector<CSphAttrLocator> dStrItems;
20516 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
20517 		{
20518 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
20519 			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
20520 			{
20521 				if ( tAttr.m_tLocator.m_iBitCount!=ROWITEM_BITS )
20522 				{
20523 					LOC_FAIL(( fp, "unexpected MVA bitcount (attr=%d, expected=%d, got=%d)",
20524 						i, ROWITEM_BITS, tAttr.m_tLocator.m_iBitCount ));
20525 					continue;
20526 				}
20527 				if ( ( tAttr.m_tLocator.m_iBitOffset % ROWITEM_BITS )!=0 )
20528 				{
20529 					LOC_FAIL(( fp, "unaligned MVA bitoffset (attr=%d, bitoffset=%d)",
20530 						i, tAttr.m_tLocator.m_iBitOffset ));
20531 					continue;
20532 				}
20533 				if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
20534 				dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
20535 			} else if ( tAttr.m_eAttrType==SPH_ATTR_FLOAT )
20536 				dFloatItems.Add	( tAttr.m_tLocator );
20537 			else if ( tAttr.m_eAttrType==SPH_ATTR_STRING || tAttr.m_eAttrType==SPH_ATTR_JSON )
20538 				dStrItems.Add ( tAttr.m_tLocator );
20539 		}
20540 		int iMva64 = dMvaItems.GetLength();
20541 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
20542 		{
20543 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i);
20544 			if ( tAttr.m_eAttrType==SPH_ATTR_INT64SET )
20545 				dMvaItems.Add ( tAttr.m_tLocator.m_iBitOffset/ROWITEM_BITS );
20546 		}
20547 
		// walk string data, build a list of acceptable start offsets
		// must be sorted by construction
		CSphVector<DWORD> dStringOffsets;
		if ( m_tString.GetNumEntries()>1 )
		{
			rdString.SeekTo ( 1, READ_NO_SIZE_HINT );
			while ( rdString.GetPos()<iStrEnd )
			{
				int64_t iLastPos = rdString.GetPos();
				const int iLen = sphUnpackStrLength ( rdString );

				// 4 bytes must be enough to encode string length, hence pCur+4
				if ( rdString.GetPos()+iLen>iStrEnd || rdString.GetPos()>iLastPos+4 )
				{
					LOC_FAIL(( fp, "string length out of bounds (offset=" INT64_FMT ", len=%d)", iLastPos, iLen ));
					break;
				}

				// record this entry's start and advance over its payload
				dStringOffsets.Add ( (DWORD)iLastPos );
				rdString.SkipBytes ( iLen );
			}
		}
20570 
		// loop the rows
		int iOrphan = 0;
		SphDocID_t uLastID = 0;

		for ( int64_t iRow=0; iRow<iRowsTotal; iRow++ )
		{
			// fetch the row
			rdAttr.GetBytes ( dRow.Begin(), sizeof(dRow[0])*dRow.GetLength() );
			SphDocID_t uCurID = DOCINFO2ID ( dRow.Begin() );

			// check that ids are ascending
			bool bIsSpaValid = ( uLastID<uCurID );
			if ( !bIsSpaValid )
				LOC_FAIL(( fp, "docid decreased (row=" INT64_FMT ", id=" DOCID_FMT ", lastid=" DOCID_FMT ")",
					iRow, uCurID, uLastID ));

			uLastID = uCurID;

			///////////////////////////
			// check MVAs
			///////////////////////////

			if ( dMvaItems.GetLength() )
			{
				bool bMvaFix = false;
				DWORD uMvaSpaFixed = 0;
				const CSphRowitem * pAttrs = DOCINFO2ATTRS ( dRow.Begin() );
				bool bHasValues = false;
				bool bHasArena = false;
				// first scan: sanity-check the per-attribute offsets stored in the row
				ARRAY_FOREACH ( iItem, dMvaItems )
				{
					const DWORD uOffset = pAttrs[dMvaItems[iItem]];
					bHasValues |= ( uOffset!=0 );
					bool bArena = ( ( uOffset & MVA_ARENA_FLAG )!=0 ) && !m_bArenaProhibit;
					bHasArena |= bArena;

					if ( uOffset && !bArena && uOffset>=iMvaEnd )
					{
						bIsSpaValid = false;
						LOC_FAIL(( fp, "MVA index out of bounds (row=" INT64_FMT ", mvaattr=%d, docid=" DOCID_FMT ", index=%u)",
							iRow, iItem, uLastID, uOffset ));
					}

					if ( uOffset && !bArena && uOffset<iMvaEnd && !bMvaFix )
					{
						// expected record start: step back over the leading docid
						// (sizeof(SphDocID_t)/sizeof(DWORD) rowitems before the values)
						uMvaSpaFixed = uOffset - sizeof(SphDocID_t) / sizeof(DWORD);
						bMvaFix = true;
					}
				}

				// MVAs ptr recovery from previous errors only if current spa record is valid
				if ( rdMva.GetPos()!=SphOffset_t(sizeof(DWORD)*uMvaSpaFixed) && bIsSpaValid && bMvaFix )
					rdMva.SeekTo ( sizeof(DWORD)*uMvaSpaFixed, READ_NO_SIZE_HINT );

				bool bLastIDChecked = false;
				SphDocID_t uLastMvaID = 0;
				while ( rdMva.GetPos()<iMvaEnd )
				{
					// current row does not reference any MVA values
					// lets mark it as checked and bail
					if ( !bHasValues )
					{
						bLastIDChecked = true;
						break;
					}

					int64_t iLastPos = rdMva.GetPos();
					const SphDocID_t uMvaID = rdMva.GetDocid();
					// records past the current row's docid belong to later rows
					if ( uMvaID>uLastID )
						break;

					if ( bLastIDChecked && uLastID==uMvaID )
						LOC_FAIL(( fp, "duplicate docid found (row=" INT64_FMT ", docid expected=" DOCID_FMT ", got=" DOCID_FMT ", index=" INT64_FMT ")",
							iRow, uLastID, uMvaID, iLastPos ));

					if ( uMvaID<uLastMvaID )
						LOC_FAIL(( fp, "MVA docid decreased (row=" INT64_FMT ", spa docid=" DOCID_FMT ", last MVA docid=" DOCID_FMT ", MVA docid=" DOCID_FMT ", index=" INT64_FMT ")",
							iRow, uLastID, uLastMvaID, uMvaID, iLastPos ));

					bool bIsMvaCorrect = ( uLastMvaID<=uMvaID && uMvaID<=uLastID );
					uLastMvaID = uMvaID;
					bool bWasArena = false;

					// loop MVAs
					ARRAY_FOREACH_COND ( iItem, dMvaItems, bIsMvaCorrect )
					{
						const DWORD uSpaOffset = pAttrs[dMvaItems[iItem]];
						bool bArena = ( ( uSpaOffset & MVA_ARENA_FLAG )!=0 ) && !m_bArenaProhibit;
						bWasArena |= bArena;

						// zero offset means empty MVA in rt index
						if ( !uSpaOffset || bArena )
							continue;

						// re-aim the reader after skipping arena-backed items
						if ( bWasArena )
							rdMva.SeekTo ( sizeof(DWORD)*uSpaOffset, READ_NO_SIZE_HINT );
						bWasArena = false;

						// check offset (index)
						if ( uMvaID==uLastID && bIsSpaValid && rdMva.GetPos()!=SphOffset_t(sizeof(DWORD)*uSpaOffset) )
						{
							LOC_FAIL(( fp, "unexpected MVA docid (row=" INT64_FMT ", mvaattr=%d, docid expected=" DOCID_FMT ", got=" DOCID_FMT ", expected=" INT64_FMT ", got=%u)",
								iRow, iItem, uLastID, uMvaID, rdMva.GetPos()/sizeof(DWORD), uSpaOffset ));
							// it's unexpected but it's our best guess
							// but do fix up only once, to prevent infinite loop
							if ( !bLastIDChecked )
								rdMva.SeekTo ( sizeof(DWORD)*uSpaOffset, READ_NO_SIZE_HINT );
						}

						if ( rdMva.GetPos()>=iMvaEnd )
						{
							LOC_FAIL(( fp, "MVA index out of bounds (row=" INT64_FMT ", mvaattr=%d, docid expected=" DOCID_FMT ", got=" DOCID_FMT ", index=" INT64_FMT ")",
								iRow, iItem, uLastID, uMvaID, rdMva.GetPos()/sizeof(DWORD) ));
							bIsMvaCorrect = false;
							continue;
						}

						// check values
						DWORD uValues = rdMva.GetDword();

						if ( rdMva.GetPos()+SphOffset_t(sizeof(DWORD)*uValues)-1>=iMvaEnd )
						{
							LOC_FAIL(( fp, "MVA count out of bounds (row=" INT64_FMT ", mvaattr=%d, docid expected=" DOCID_FMT ", got=" DOCID_FMT ", count=%u)",
								iRow, iItem, uLastID, uMvaID, uValues ));
							rdMva.SeekTo ( rdMva.GetPos() + sizeof(DWORD)*uValues, READ_NO_SIZE_HINT );
							bIsMvaCorrect = false;
							continue;
						}
20699 
20700 						dMva.Resize ( uValues );
20701 						rdMva.GetBytes ( dMva.Begin(), sizeof(DWORD)*uValues );
20702 
20703 						// check that values are ascending
20704 						for ( DWORD uVal=(iItem>=iMva64 ? 2 : 1); uVal<uValues && bIsMvaCorrect; )
20705 						{
20706 							int64_t iPrev, iCur;
20707 							if ( iItem>=iMva64 )
20708 							{
20709 								iPrev = MVA_UPSIZE ( dMva.Begin() + uVal - 2 );
20710 								iCur = MVA_UPSIZE ( dMva.Begin() + uVal );
20711 								uVal += 2;
20712 							} else
20713 							{
20714 								iPrev = dMva[uVal-1];
20715 								iCur = dMva[uVal];
20716 								uVal++;
20717 							}
20718 
20719 							if ( iCur<=iPrev )
20720 							{
20721 								LOC_FAIL(( fp, "unsorted MVA values (row=" INT64_FMT ", mvaattr=%d, docid expected=" DOCID_FMT ", got=" DOCID_FMT ", val[%u]=%u, val[%u]=%u)",
20722 									iRow, iItem, uLastID, uMvaID, ( iItem>=iMva64 ? uVal-2 : uVal-1 ), (unsigned int)iPrev, uVal, (unsigned int)iCur ));
20723 								bIsMvaCorrect = false;
20724 							}
20725 
20726 							uVal += ( iItem>=iMva64 ? 2 : 1 );
20727 						}
20728 					}
20729 
20730 					if ( !bIsMvaCorrect )
20731 						break;
20732 
20733 					// orphan only ON no errors && ( not matched ids || ids matched multiply times )
20734 					if ( bIsMvaCorrect && ( uMvaID!=uLastID || ( uMvaID==uLastID && bLastIDChecked ) ) )
20735 						iOrphan++;
20736 
20737 					bLastIDChecked |= ( uLastID==uMvaID );
20738 				}
20739 
20740 				if ( !bLastIDChecked && bHasValues && !bHasArena )
20741 					LOC_FAIL(( fp, "missed or damaged MVA (row=" INT64_FMT ", docid expected=" DOCID_FMT ")",
20742 						iRow, uLastID ));
20743 			}
20744 
20745 			///////////////////////////
20746 			// check floats
20747 			///////////////////////////
20748 
20749 			ARRAY_FOREACH ( iItem, dFloatItems )
20750 			{
20751 				const CSphRowitem * pAttrs = DOCINFO2ATTRS ( dRow.Begin() );
20752 				const DWORD uValue = (DWORD)sphGetRowAttr ( pAttrs, dFloatItems[ iItem ] );
20753 				const DWORD uExp = ( uValue >> 23 ) & 0xff;
20754 				const DWORD uMantissa = uValue & 0x003fffff;
20755 
20756 				// check normalized
20757 				if ( uExp==0 && uMantissa!=0 )
20758 					LOC_FAIL(( fp, "float attribute value is unnormalized (row=" INT64_FMT ", attr=%d, id=" DOCID_FMT ", raw=0x%x, value=%f)",
20759 						iRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
20760 
20761 				// check +-inf
20762 				if ( uExp==0xff && uMantissa==0 )
20763 					LOC_FAIL(( fp, "float attribute is infinity (row=" INT64_FMT ", attr=%d, id=" DOCID_FMT ", raw=0x%x, value=%f)",
20764 						iRow, iItem, uLastID, uValue, sphDW2F ( uValue ) ));
20765 			}
20766 
20767 			/////////////////
20768 			// check strings
20769 			/////////////////
20770 
20771 			ARRAY_FOREACH ( iItem, dStrItems )
20772 			{
20773 				const CSphRowitem * pAttrs = DOCINFO2ATTRS ( dRow.Begin() );
20774 
20775 				const DWORD uOffset = (DWORD)sphGetRowAttr ( pAttrs, dStrItems[ iItem ] );
20776 				if ( uOffset>=iStrEnd )
20777 				{
20778 					LOC_FAIL(( fp, "string offset out of bounds (row=" INT64_FMT ", stringattr=%d, docid=" DOCID_FMT ", index=%u)",
20779 						iRow, iItem, uLastID, uOffset ));
20780 					continue;
20781 				}
20782 
20783 				if ( !uOffset )
20784 					continue;
20785 
20786 				rdString.SeekTo ( uOffset, READ_NO_SIZE_HINT );
20787 				const int iLen = sphUnpackStrLength ( rdString );
20788 
20789 				// check that length is sane
20790 				if ( rdString.GetPos()+iLen-1>=iStrEnd )
20791 				{
20792 					LOC_FAIL(( fp, "string length out of bounds (row=" INT64_FMT ", stringattr=%d, docid=" DOCID_FMT ", index=%u)",
20793 						iRow, iItem, uLastID, uOffset ));
20794 					continue;
20795 				}
20796 
20797 				// check that offset is one of the good ones
20798 				// (that is, that we don't point in the middle of some other data)
20799 				if ( !dStringOffsets.BinarySearch ( uOffset ) )
20800 				{
20801 					LOC_FAIL(( fp, "string offset is not a string start (row=" INT64_FMT ", stringattr=%d, docid=" DOCID_FMT ", offset=%u)",
20802 						iRow, iItem, uLastID, uOffset ));
20803 				}
20804 			}
20805 
			// progress bar
			if ( iRow%1000==0 && bProgress )
			{
				fprintf ( fp, INT64_FMT"/" INT64_FMT "\r", iRow, iRowsTotal );
				fflush ( fp );
			}
		}

		// orphans are MVA records not referenced by any row; warn but do not fail
		if ( iOrphan )
			fprintf ( fp, "WARNING: %d orphaned MVA entries were found\n", iOrphan );
20816 
20817 		///////////////////////////
20818 		// check blocks index
20819 		///////////////////////////
20820 
20821 		fprintf ( fp, "checking attribute blocks index...\n" );
20822 
20823 		// check size
20824 		const int64_t iTempDocinfoIndex = ( m_iDocinfo+DOCINFO_INDEX_FREQ-1 ) / DOCINFO_INDEX_FREQ;
20825 		if ( iTempDocinfoIndex!=m_iDocinfoIndex )
20826 			LOC_FAIL(( fp, "block count differs (expected=" INT64_FMT ", got=" INT64_FMT ")",
20827 				iTempDocinfoIndex, m_iDocinfoIndex ));
20828 
20829 		const DWORD uMinMaxStride = DOCINFO_IDSIZE + m_tSchema.GetRowSize();
20830 		const DWORD * pDocinfoIndexMax = m_pDocinfoIndex + ( m_iDocinfoIndex+1 )*uMinMaxStride*2;
20831 
20832 		rdAttr.SeekTo ( 0, sizeof ( dRow[0] ) * dRow.GetLength() );
20833 
		for ( int64_t iIndexEntry=0; iIndexEntry<m_iDocinfo; iIndexEntry++ )
		{
			const int64_t iBlock = iIndexEntry / DOCINFO_INDEX_FREQ;

			// we have to do some checks in border cases, for example: when move from 1st to 2nd block
			const int64_t iPrevEntryBlock = ( iIndexEntry-1 )/DOCINFO_INDEX_FREQ;
			const bool bIsBordersCheckTime = ( iPrevEntryBlock!=iBlock );

			rdAttr.GetBytes ( dRow.Begin(), sizeof(dRow[0]) * dRow.GetLength() );
			const SphDocID_t uDocID = DOCINFO2ID ( dRow.Begin() );

			// each block stores a min entry immediately followed by a max entry
			const DWORD * pMinEntry = m_pDocinfoIndex + iBlock * uMinMaxStride * 2;
			const DWORD * pMaxEntry = pMinEntry + uMinMaxStride;
			const DWORD * pMinAttrs = DOCINFO2ATTRS ( pMinEntry );
			const DWORD * pMaxAttrs = pMinAttrs + uMinMaxStride;

			// check docid vs global range
			if ( pMaxEntry+uMinMaxStride > pDocinfoIndexMax )
				LOC_FAIL(( fp, "unexpected block index end (row=" INT64_FMT ", docid=" DOCID_FMT ", block=" INT64_FMT ", max=" INT64_FMT ", cur=" INT64_FMT ")",
					iIndexEntry, uDocID, iBlock, int64_t ( pDocinfoIndexMax-m_pDocinfoIndex ), int64_t ( pMaxEntry+uMinMaxStride-m_pDocinfoIndex ) ));

			// check attribute location vs global range
			if ( pMaxAttrs+uMinMaxStride > pDocinfoIndexMax )
				LOC_FAIL(( fp, "attribute position out of blocks index (row=" INT64_FMT ", docid=" DOCID_FMT ", block=" INT64_FMT ", expected<" INT64_FMT ", got=" INT64_FMT ")",
					iIndexEntry, uDocID, iBlock, int64_t ( pDocinfoIndexMax-m_pDocinfoIndex ), int64_t ( pMaxAttrs+uMinMaxStride-m_pDocinfoIndex ) ));

			const SphDocID_t uMinDocID = DOCINFO2ID ( pMinEntry );
			const SphDocID_t uMaxDocID = DOCINFO2ID ( pMaxEntry );

			// checks is docid min max range valid
			if ( uMinDocID > uMaxDocID && bIsBordersCheckTime )
				LOC_FAIL(( fp, "invalid docid range (row=" INT64_FMT ", block=" INT64_FMT ", min=" DOCID_FMT ", max=" DOCID_FMT ")",
					iIndexEntry, iBlock, uMinDocID, uMaxDocID ));

			// checks docid vs blocks range
			if ( uDocID < uMinDocID || uDocID > uMaxDocID )
				LOC_FAIL(( fp, "unexpected docid range (row=" INT64_FMT ", docid=" DOCID_FMT ", block=" INT64_FMT ", min=" DOCID_FMT ", max=" DOCID_FMT ")",
					iIndexEntry, uDocID, iBlock, uMinDocID, uMaxDocID ));
20872 
20873 			bool bIsFirstMva = true;
20874 
20875 			// check values vs blocks range
20876 			const DWORD * pSpaRow = DOCINFO2ATTRS ( dRow.Begin() );
20877 			for ( int iItem=0; iItem<m_tSchema.GetAttrsCount(); iItem++ )
20878 			{
20879 				const CSphColumnInfo & tCol = m_tSchema.GetAttr(iItem);
20880 
20881 				switch ( tCol.m_eAttrType )
20882 				{
20883 				case SPH_ATTR_INTEGER:
20884 				case SPH_ATTR_TIMESTAMP:
20885 				case SPH_ATTR_BOOL:
20886 				case SPH_ATTR_BIGINT:
20887 					{
20888 						const SphAttr_t uVal = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
20889 						const SphAttr_t uMin = sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
20890 						const SphAttr_t uMax = sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
20891 
20892 						// checks is attribute min max range valid
20893 						if ( uMin > uMax && bIsBordersCheckTime )
20894 							LOC_FAIL(( fp, "invalid attribute range (row=" INT64_FMT ", block=" INT64_FMT ", min=" INT64_FMT ", max=" INT64_FMT ")",
20895 								iIndexEntry, iBlock, uMin, uMax ));
20896 
20897 						if ( uVal < uMin || uVal > uMax )
20898 							LOC_FAIL(( fp, "unexpected attribute value (row=" INT64_FMT ", attr=%u, docid=" DOCID_FMT ", block=" INT64_FMT ", value=0x" UINT64_FMT ", min=0x" UINT64_FMT ", max=0x" UINT64_FMT ")",
20899 								iIndexEntry, iItem, uDocID, iBlock, uint64_t(uVal), uint64_t(uMin), uint64_t(uMax) ));
20900 					}
20901 					break;
20902 
20903 				case SPH_ATTR_FLOAT:
20904 					{
20905 						const float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pSpaRow, tCol.m_tLocator ) );
20906 						const float fMin = sphDW2F ( (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator ) );
20907 						const float fMax = sphDW2F ( (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator ) );
20908 
20909 						// checks is attribute min max range valid
20910 						if ( fMin > fMax && bIsBordersCheckTime )
20911 							LOC_FAIL(( fp, "invalid attribute range (row=" INT64_FMT ", block=" INT64_FMT ", min=%f, max=%f)",
20912 								iIndexEntry, iBlock, fMin, fMax ));
20913 
20914 						if ( fVal < fMin || fVal > fMax )
20915 							LOC_FAIL(( fp, "unexpected attribute value (row=" INT64_FMT ", attr=%u, docid=" DOCID_FMT ", block=" INT64_FMT ", value=%f, min=%f, max=%f)",
20916 								iIndexEntry, iItem, uDocID, iBlock, fVal, fMin, fMax ));
20917 					}
20918 					break;
20919 
20920 				case SPH_ATTR_UINT32SET:
20921 					{
20922 						const DWORD uMin = (DWORD)sphGetRowAttr ( pMinAttrs, tCol.m_tLocator );
20923 						const DWORD uMax = (DWORD)sphGetRowAttr ( pMaxAttrs, tCol.m_tLocator );
20924 
20925 						// checks is MVA attribute min max range valid
20926 						if ( uMin > uMax && bIsBordersCheckTime && uMin!=0xffffffff && uMax!=0 )
20927 							LOC_FAIL(( fp, "invalid MVA range (row=" INT64_FMT ", block=" INT64_FMT ", min=0x%x, max=0x%x)",
20928 							iIndexEntry, iBlock, uMin, uMax ));
20929 
20930 						SphAttr_t uOff = sphGetRowAttr ( pSpaRow, tCol.m_tLocator );
20931 						if ( !uOff || ( uOff & MVA_ARENA_FLAG )!=0 )
20932 							break;
20933 
20934 						SphDocID_t uMvaDocID = 0;
20935 						if ( bIsFirstMva )
20936 						{
20937 							bIsFirstMva = false;
20938 							rdMva.SeekTo ( sizeof(DWORD) * uOff - sizeof(SphDocID_t), READ_NO_SIZE_HINT );
20939 							uMvaDocID = rdMva.GetDocid();
20940 						} else
20941 						{
20942 							rdMva.SeekTo ( sizeof(DWORD) * uOff, READ_NO_SIZE_HINT );
20943 						}
20944 
20945 						if ( uOff>=iMvaEnd )
20946 							break;
20947 
20948 						if ( uMvaDocID && uMvaDocID!=uDocID )
20949 						{
20950 							LOC_FAIL(( fp, "unexpected MVA docid (row=" INT64_FMT ", mvaattr=%d, expected=" DOCID_FMT ", got=" DOCID_FMT ", block=" INT64_FMT ", index=%u)",
20951 								iIndexEntry, iItem, uDocID, uMvaDocID, iBlock, (DWORD)uOff ));
20952 							break;
20953 						}
20954 
20955 						// check values
20956 						const DWORD uValues = rdMva.GetDword();
20957 						if ( uOff+uValues>iMvaEnd )
20958 							break;
20959 
20960 						dMva.Resize ( uValues );
20961 						rdMva.GetBytes ( dMva.Begin(), sizeof ( dMva[0] ) * uValues );
20962 
20963 						for ( DWORD iVal=0; iVal<uValues; iVal++ )
20964 						{
20965 							const DWORD uVal = dMva[iVal];
20966 							if ( uVal < uMin || uVal > uMax )
20967 								LOC_FAIL(( fp, "unexpected MVA value (row=" INT64_FMT ", attr=%u, docid=" DOCID_FMT ", block=" INT64_FMT ", index=%u, value=0x%x, min=0x%x, max=0x%x)",
20968 									iIndexEntry, iItem, uDocID, iBlock, iVal, (DWORD)uVal, (DWORD)uMin, (DWORD)uMax ));
20969 						}
20970 					}
20971 					break;
20972 
20973 				default:
20974 					break;
20975 				}
20976 			}
20977 
20978 			// progress bar
20979 			if ( iIndexEntry%1000==0 && bProgress )
20980 			{
20981 				fprintf ( fp, INT64_FMT"/" INT64_FMT "\r", iIndexEntry, m_iDocinfo );
20982 				fflush ( fp );
20983 			}
20984 		}
20985 	}
20986 
20987 	///////////////////////////
20988 	// check kill-list
20989 	///////////////////////////
20990 
20991 	fprintf ( fp, "checking kill-list...\n" );
20992 
20993 	// check size
20994 	if ( m_pKillList.GetNumEntries()!=m_uKillListSize )
20995 		LOC_FAIL(( fp, "kill-list size differs (expected=%d, got=" INT64_FMT ")",
20996 			m_uKillListSize, (int64_t)m_pKillList.GetNumEntries() ));
20997 
20998 	// check that ids are ascending
20999 	for ( DWORD uID=1; uID<m_pKillList.GetNumEntries(); uID++ )
21000 		if ( m_pKillList[uID]<=m_pKillList[uID-1] )
21001 			LOC_FAIL(( fp, "unsorted kill-list values (val[%d]=%d, val[%d]=%d)",
21002 				uID-1, (DWORD)m_pKillList[uID-1], uID, (DWORD)m_pKillList[uID] ));
21003 
21004 	///////////////////////////
21005 	// all finished
21006 	///////////////////////////
21007 
21008 	// well, no known kinds of failures, maybe some unknown ones
21009 	tmCheck = sphMicroTimer() - tmCheck;
21010 	if ( !iFails )
21011 		fprintf ( fp, "check passed" );
21012 	else if ( iFails!=iFailsPrinted )
21013 		fprintf ( fp, "check FAILED, %d of " INT64_FMT " failures reported", iFailsPrinted, iFails );
21014 	else
21015 		fprintf ( fp, "check FAILED, " INT64_FMT " failures reported", iFails );
21016 	fprintf ( fp, ", %d.%d sec elapsed\n", (int)(tmCheck/1000000), (int)((tmCheck/100000)%10) );
21017 
21018 	return (int)Min ( iFails, 255 ); // this is the exitcode; so cap it
21019 } // NOLINT function length
21020 
21021 
21022 //////////////////////////////////////////////////////////////////////////
21023 
21024 /// morphology
/// morphology
/// internal IDs of the morphology processors; InitMorph() maps config names to
/// these and AddMorph() stores them in m_dMorph for StemById() to dispatch on
enum
{
	SPH_MORPH_STEM_EN,				// English stemmer
	SPH_MORPH_STEM_RU_UTF8,			// Russian stemmer (UTF-8)
	SPH_MORPH_STEM_CZ,				// Czech stemmer
	SPH_MORPH_STEM_AR_UTF8,			// Arabic stemmer (UTF-8)
	SPH_MORPH_SOUNDEX,
	SPH_MORPH_METAPHONE_UTF8,
	// AOT lemmatizers, single-form flavor; RU shares the base slot
	SPH_MORPH_AOTLEMMER_BASE,
	SPH_MORPH_AOTLEMMER_RU_UTF8 = SPH_MORPH_AOTLEMMER_BASE,
	SPH_MORPH_AOTLEMMER_EN,
	SPH_MORPH_AOTLEMMER_DE_UTF8,
	// AOT lemmatizers, "all variants" flavor; RU shares the base slot
	SPH_MORPH_AOTLEMMER_BASE_ALL,
	SPH_MORPH_AOTLEMMER_RU_ALL = SPH_MORPH_AOTLEMMER_BASE_ALL,
	SPH_MORPH_AOTLEMMER_EN_ALL,
	SPH_MORPH_AOTLEMMER_DE_ALL,
	// libstemmer algorithms occupy a contiguous range of up to 64 IDs
	SPH_MORPH_LIBSTEMMER_FIRST,
	SPH_MORPH_LIBSTEMMER_LAST = SPH_MORPH_LIBSTEMMER_FIRST + 64
};
21044 
21045 
21046 /////////////////////////////////////////////////////////////////////////////
21047 // BASE DICTIONARY INTERFACE
21048 /////////////////////////////////////////////////////////////////////////////
21049 
// default no-op implementations of the dict-file serialization interface;
// disk-based dictionaries (CSphDiskDictTraits) override these
void CSphDict::DictBegin ( CSphAutofile &, CSphAutofile &, int, ThrottleState_t * )		{}
void CSphDict::DictEntry ( const CSphDictEntry & )										{}
void CSphDict::DictEndEntries ( SphOffset_t )											{}
bool CSphDict::DictEnd ( DictHeader_t *, int, CSphString &, ThrottleState_t * )			{ return true; }
// NOTE(review): the base implementation reports "error" by default — presumably so a
// dictionary that never implements serialization cannot silently pass checks; confirm
bool CSphDict::DictIsError () const														{ return true; }
21055 
21056 /////////////////////////////////////////////////////////////////////////////
21057 // CRC32/64 DICTIONARIES
21058 /////////////////////////////////////////////////////////////////////////////
21059 
/// template dictionary traits: stopwords, wordforms and morphology handling
/// shared by the concrete CRC/keyword dictionary implementations
struct CSphTemplateDictTraits : CSphDict
{
	CSphTemplateDictTraits ();
	virtual				~CSphTemplateDictTraits ();

	virtual void		LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer );
	virtual void		LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords );
	virtual void		WriteStopwords ( CSphWriter & tWriter );
	virtual bool		LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex );
	virtual void		WriteWordforms ( CSphWriter & tWriter );
	virtual const CSphWordforms *	GetWordforms() { return m_pWordforms; }
	virtual void		DisableWordforms() { m_bDisableWordforms = true; }
	virtual int			SetMorphology ( const char * szMorph, CSphString & sMessage );
	virtual bool		HasMorphology() const;
	virtual void		ApplyStemmers ( BYTE * pWord ) const;

	virtual void		Setup ( const CSphDictSettings & tSettings ) { m_tSettings = tSettings; }
	virtual const CSphDictSettings & GetSettings () const { return m_tSettings; }
	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_dSWFileInfos; }
	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_dWFFileInfos; }
	virtual const CSphMultiformContainer * GetMultiWordforms () const;
	virtual uint64_t	GetSettingsFNV () const;
	virtual void		SetApplyMorph ( bool bApply );
	static void			SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles );

protected:
	CSphVector < int >	m_dMorph;		///< active morphology processor IDs (SPH_MORPH_xxx), in application order
#if USE_LIBSTEMMER
	CSphVector < sb_stemmer * >	m_dStemmers;	///< libstemmer instances, kept parallel to m_dDescStemmers (asserted in CloneBase)
	CSphVector<CSphString> m_dDescStemmers;		///< libstemmer algorithm names, used to re-create stemmers when cloning
#endif

	int					m_iStopwords;	///< stopwords count
	SphWordID_t *		m_pStopwords;	///< stopwords ID list
	CSphFixedVector<SphWordID_t> m_dStopwordContainer;	// storage for stopword IDs; presumably m_pStopwords points here — confirm in LoadStopwords
	bool				m_bApplyMorph;	///< if false, ApplyStemmers() is a no-op

protected:
	int					ParseMorphology ( const char * szMorph, CSphString & sError );
	SphWordID_t			FilterStopword ( SphWordID_t uID ) const;	///< filter ID against stopwords list
	CSphDict *			CloneBase ( CSphTemplateDictTraits * pDict ) const;
	virtual bool		HasState () const;

	bool				m_bDisableWordforms;

private:
	CSphWordforms *				m_pWordforms;
	CSphVector<CSphSavedFile>	m_dSWFileInfos;
	CSphVector<CSphSavedFile>	m_dWFFileInfos;
	CSphDictSettings			m_tSettings;

	// process-wide registry of loaded wordform containers (shared across instances)
	static CSphVector<CSphWordforms*>		m_dWordformContainers;

	CSphWordforms *		GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex );
	CSphWordforms *		LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos, const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex );

	int					InitMorph ( const char * szMorph, int iLength, CSphString & sError );
	int					AddMorph ( int iMorph ); ///< helper that always returns ST_OK
	bool				StemById ( BYTE * pWord, int iStemmer ) const;
	void				AddWordform ( CSphWordforms * pContainer, char * sBuffer, int iLen, ISphTokenizer * pTokenizer, const char * szFile, const CSphVector<int> & dBlended, int iFileId );
};
21121 
// definition of the static wordform-container registry declared in CSphTemplateDictTraits
CSphVector<CSphWordforms*> CSphTemplateDictTraits::m_dWordformContainers;
21123 
21124 
21125 /// common CRC32/64 dictionary stuff
/// common CRC32/64 dictionary stuff
/// disk dictionary traits: adds the dict-file writing state and implements
/// the serialization half of CSphDict on top of CSphTemplateDictTraits
struct CSphDiskDictTraits : CSphTemplateDictTraits
{
						CSphDiskDictTraits ()
							: m_iEntries ( 0 )
							, m_iLastDoclistPos ( 0 )
							, m_iLastWordID ( 0 )
						{}
						virtual				~CSphDiskDictTraits () {}

	virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
	virtual void DictEntry ( const CSphDictEntry & tEntry );
	virtual void DictEndEntries ( SphOffset_t iDoclistOffset );
	virtual bool DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * );
	virtual bool DictIsError () const { return m_wrDict.IsError(); }

protected:

	CSphTightVector<CSphWordlistCheckpoint>	m_dCheckpoints;		///< checkpoint offsets
	CSphWriter			m_wrDict;			///< final dict file writer
	CSphString			m_sWriterError;		///< writer error message storage
	int					m_iEntries;			///< dictionary entries stored
	SphOffset_t			m_iLastDoclistPos;	///< doclist position of the last stored entry
	SphWordID_t			m_iLastWordID;		///< word ID of the last stored entry
};
21150 
21151 
/// word hashing engine; the template parameter selects between two hash widths
/// NOTE(review): presumably CRCALGO=true is CRC32 and false is a 64-bit hash,
/// matching the CSphDictCRC/CSphDictTemplate usage below — confirm at DoCrc definitions
template < bool CRCALGO >
struct CCRCEngine
{
	inline static SphWordID_t		DoCrc ( const BYTE * pWord );
	inline static SphWordID_t		DoCrc ( const BYTE * pWord, int iLen );
};
21158 
/// specialized CRC32/64 implementations
/// disk dictionary that hashes keywords via CCRCEngine<CRC32DICT>
template < bool CRC32DICT >
struct CSphDictCRC : public CSphDiskDictTraits, CCRCEngine<CRC32DICT>
{
	typedef CCRCEngine<CRC32DICT> tHASH;
	virtual SphWordID_t		GetWordID ( BYTE * pWord );
	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
	virtual SphWordID_t		GetWordIDWithMarkers ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDNonStemmed ( BYTE * pWord );
	virtual bool			IsStopWord ( const BYTE * pWord ) const;

	// clone carries over settings, stopwords, wordforms and morphology (see CloneBase)
	virtual CSphDict *		Clone () const { return CloneBase ( new CSphDictCRC<CRC32DICT>() ); }
};
21172 
/// template-index dictionary: keyword hashing without the disk serialization parts
struct CSphDictTemplate : public CSphTemplateDictTraits, CCRCEngine<false> // based on FNV64
{
	virtual SphWordID_t		GetWordID ( BYTE * pWord );
	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
	virtual SphWordID_t		GetWordIDWithMarkers ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDNonStemmed ( BYTE * pWord );
	virtual bool			IsStopWord ( const BYTE * pWord ) const;

	virtual CSphDict *		Clone () const { return CloneBase ( new CSphDictTemplate() ); }
};
21183 
21184 
21185 /////////////////////////////////////////////////////////////////////////////
21186 
/// FNV-1a 64-bit hash of a zero-terminated string, starting from the default seed
uint64_t sphFNV64 ( const void * s )
{
	return sphFNV64cont ( s, SPH_FNV64_SEED );
}
21191 
21192 
/// FNV-1a 64-bit hash over an explicit-length buffer, chained from uPrev;
/// a non-positive iLen leaves uPrev unchanged
uint64_t sphFNV64 ( const void * s, int iLen, uint64_t uPrev )
{
	uint64_t uHash = uPrev;
	const unsigned char * pCur = (const unsigned char *)s;
	const unsigned char * pEnd = pCur + iLen;
	while ( pCur<pEnd )
	{
		// FNV-1a step: xor in the octet...
		uHash ^= (uint64_t)( *pCur++ );

		// ...then multiply by the 64-bit FNV prime (0x100000001b3),
		// spelled out as shift-adds (gcc optimization)
		uHash += ( uHash<<1 ) + ( uHash<<4 ) + ( uHash<<5 ) + ( uHash<<7 ) + ( uHash<<8 ) + ( uHash<<40 );
	}
	return uHash;
}
21207 
21208 
/// FNV-1a 64-bit hash over a zero-terminated string, chained from uPrev;
/// a NULL pointer hashes to uPrev unchanged
uint64_t sphFNV64cont ( const void * s, uint64_t uPrev )
{
	const unsigned char * pCur = (const unsigned char *)s;
	if ( !pCur )
		return uPrev;

	uint64_t uHash = uPrev;
	for ( ; *pCur; pCur++ )
	{
		// FNV-1a step: xor in the octet, then multiply by the 64-bit FNV
		// prime (0x100000001b3), spelled out as shift-adds (gcc optimization)
		uHash ^= (uint64_t)( *pCur );
		uHash += ( uHash<<1 ) + ( uHash<<4 ) + ( uHash<<5 ) + ( uHash<<7 ) + ( uHash<<8 ) + ( uHash<<40 );
	}
	return uHash;
}
21226 
21227 /////////////////////////////////////////////////////////////////////////////
21228 
21229 extern DWORD g_dSphinxCRC32 [ 256 ];
21230 
sphCalcFileCRC32(const char * szFilename,DWORD & uCRC32)21231 bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 )
21232 {
21233 	uCRC32 = 0;
21234 
21235 	if ( !szFilename )
21236 		return false;
21237 
21238 	FILE * pFile = fopen ( szFilename, "rb" );
21239 	if ( !pFile )
21240 		return false;
21241 
21242 	DWORD crc = ~((DWORD)0);
21243 
21244 	const int BUFFER_SIZE = 131072;
21245 	static BYTE * pBuffer = NULL;
21246 	if ( !pBuffer )
21247 		pBuffer = new BYTE [ BUFFER_SIZE ];
21248 
21249 	int iBytesRead;
21250 	while ( ( iBytesRead = fread ( pBuffer, 1, BUFFER_SIZE, pFile ) )!=0 )
21251 	{
21252 		for ( int i=0; i<iBytesRead; i++ )
21253 			crc = (crc >> 8) ^ g_dSphinxCRC32 [ (crc ^ pBuffer[i]) & 0xff ];
21254 	}
21255 
21256 	fclose ( pFile );
21257 
21258 	uCRC32 = ~crc;
21259 	return true;
21260 }
21261 
21262 
GetFileStats(const char * szFilename,CSphSavedFile & tInfo,CSphString * pError)21263 static bool GetFileStats ( const char * szFilename, CSphSavedFile & tInfo, CSphString * pError )
21264 {
21265 	if ( !szFilename || !*szFilename )
21266 	{
21267 		memset ( &tInfo, 0, sizeof ( tInfo ) );
21268 		return true;
21269 	}
21270 
21271 	tInfo.m_sFilename = szFilename;
21272 
21273 	struct_stat tStat;
21274 	memset ( &tStat, 0, sizeof ( tStat ) );
21275 	if ( stat ( szFilename, &tStat ) < 0 )
21276 	{
21277 		if ( pError )
21278 			*pError = strerror ( errno );
21279 		memset ( &tStat, 0, sizeof ( tStat ) );
21280 		return false;
21281 	}
21282 
21283 	tInfo.m_uSize = tStat.st_size;
21284 	tInfo.m_uCTime = tStat.st_ctime;
21285 	tInfo.m_uMTime = tStat.st_mtime;
21286 
21287 	DWORD uCRC32 = 0;
21288 	if ( !sphCalcFileCRC32 ( szFilename, uCRC32 ) )
21289 		return false;
21290 
21291 	tInfo.m_uCRC32 = uCRC32;
21292 
21293 	return true;
21294 }
21295 
21296 /////////////////////////////////////////////////////////////////////////////
21297 
// construct an empty, unreferenced wordforms container
CSphWordforms::CSphWordforms()
	: m_iRefCount ( 0 )
	, m_uTokenizerFNV ( 0 )
	, m_bHavePostMorphNF ( false )
	, m_pMultiWordforms ( NULL )
{
}
21305 
21306 
// destroy the container; the multi-wordform hash and every form it holds
// are owned here and must be freed explicitly
CSphWordforms::~CSphWordforms()
{
	if ( m_pMultiWordforms )
	{
		m_pMultiWordforms->m_Hash.IterateStart ();
		while ( m_pMultiWordforms->m_Hash.IterateNext () )
		{
			CSphMultiforms * pWordforms = m_pMultiWordforms->m_Hash.IterateGet ();
			// each CSphMultiforms owns its individual forms
			ARRAY_FOREACH ( i, pWordforms->m_pForms )
				SafeDelete ( pWordforms->m_pForms[i] );

			SafeDelete ( pWordforms );
		}

		SafeDelete ( m_pMultiWordforms );
	}
}
21324 
21325 
IsEqual(const CSphVector<CSphSavedFile> & dFiles)21326 bool CSphWordforms::IsEqual ( const CSphVector<CSphSavedFile> & dFiles )
21327 {
21328 	if ( m_dFiles.GetLength()!=dFiles.GetLength() )
21329 		return false;
21330 
21331 	ARRAY_FOREACH ( i, m_dFiles )
21332 	{
21333 		const CSphSavedFile & tF1 = m_dFiles[i];
21334 		const CSphSavedFile & tF2 = dFiles[i];
21335 		if ( tF1.m_sFilename!=tF2.m_sFilename || tF1.m_uCRC32!=tF2.m_uCRC32 || tF1.m_uSize!=tF2.m_uSize ||
21336 			tF1.m_uCTime!=tF2.m_uCTime || tF1.m_uMTime!=tF2.m_uMTime )
21337 			return false;
21338 	}
21339 
21340 	return true;
21341 }
21342 
21343 
ToNormalForm(BYTE * pWord,bool bBefore) const21344 bool CSphWordforms::ToNormalForm ( BYTE * pWord, bool bBefore ) const
21345 {
21346 	int * pIndex = m_dHash ( (char *)pWord );
21347 	if ( !pIndex )
21348 		return false;
21349 
21350 	if ( *pIndex<0 || *pIndex>=m_dNormalForms.GetLength () )
21351 		return false;
21352 
21353 	if ( bBefore==m_dNormalForms[*pIndex].m_bAfterMorphology )
21354 		return false;
21355 
21356 	if ( m_dNormalForms[*pIndex].m_sWord.IsEmpty() )
21357 		return false;
21358 
21359 	strcpy ( (char *)pWord, m_dNormalForms[*pIndex].m_sWord.cstr() ); // NOLINT
21360 	return true;
21361 }
21362 
21363 /////////////////////////////////////////////////////////////////////////////
21364 
// construct with no stopwords, no wordforms, and morphology enabled
CSphTemplateDictTraits::CSphTemplateDictTraits ()
	: m_iStopwords	( 0 )
	, m_pStopwords	( NULL )
	, m_dStopwordContainer ( 0 )
	, m_bApplyMorph ( true )
	, m_bDisableWordforms ( false )
	, m_pWordforms	( NULL )
{
}
21374 
21375 
CSphTemplateDictTraits::~CSphTemplateDictTraits ()
{
#if USE_LIBSTEMMER
	// libstemmer instances are owned by this dictionary; free them
	ARRAY_FOREACH ( i, m_dStemmers )
		sb_stemmer_delete ( m_dStemmers[i] );
#endif

	// wordform containers are shared and refcounted, not deleted here;
	// presumably SweepWordformContainers() reclaims unreferenced ones — confirm
	if ( m_pWordforms )
		--m_pWordforms->m_iRefCount;
}
21386 
21387 
// filter a word ID against the stopwords list:
// returns 0 if uID is a stopword, or uID unchanged otherwise;
// relies on m_pStopwords being a sorted array of m_iStopwords IDs
SphWordID_t CSphTemplateDictTraits::FilterStopword ( SphWordID_t uID ) const
{
	if ( !m_iStopwords )
		return uID;

	// OPTIMIZE: binary search is not too good, could do some hashing instead
	SphWordID_t * pStart = m_pStopwords;
	SphWordID_t * pEnd = m_pStopwords + m_iStopwords - 1;
	do
	{
		// hit either boundary => stopword
		if ( uID==*pStart || uID==*pEnd )
			return 0;

		// outside the remaining [start,end] range => not a stopword
		if ( uID<*pStart || uID>*pEnd )
			return uID;

		SphWordID_t * pMid = pStart + (pEnd-pStart)/2;
		if ( uID==*pMid )
			return 0;

		// narrow the range; uID stays strictly between *pStart and *pEnd
		if ( uID<*pMid )
			pEnd = pMid;
		else
			pStart = pMid;
	} while ( pEnd-pStart>1 );

	return uID;
}
21416 
21417 
ParseMorphology(const char * sMorph,CSphString & sMessage)21418 int CSphTemplateDictTraits::ParseMorphology ( const char * sMorph, CSphString & sMessage )
21419 {
21420 	int iRes = ST_OK;
21421 	for ( const char * sStart=sMorph; ; )
21422 	{
21423 		while ( *sStart && ( sphIsSpace ( *sStart ) || *sStart==',' ) )
21424 			++sStart;
21425 		if ( !*sStart )
21426 			break;
21427 
21428 		const char * sWordStart = sStart;
21429 		while ( *sStart && !sphIsSpace ( *sStart ) && *sStart!=',' )
21430 			++sStart;
21431 
21432 		if ( sStart > sWordStart )
21433 		{
21434 			switch ( InitMorph ( sWordStart, sStart - sWordStart, sMessage ) )
21435 			{
21436 				case ST_ERROR:		return ST_ERROR;
21437 				case ST_WARNING:	iRes = ST_WARNING;
21438 				default:			break;
21439 			}
21440 		}
21441 	}
21442 	return iRes;
21443 }
21444 
21445 
// register a single morphology processor by its config name (szMorph, iLength
// bytes, not zero-terminated); returns ST_OK on success, ST_WARNING when the
// name is unknown (skipped), or ST_ERROR on a conflicting configuration
int CSphTemplateDictTraits::InitMorph ( const char * szMorph, int iLength, CSphString & sMessage )
{
	if ( iLength==0 )
		return ST_OK;

	if ( iLength==4 && !strncmp ( szMorph, "none", iLength ) )
		return ST_OK;

	// stemming and lemmatization of the same language are mutually exclusive
	if ( iLength==7 && !strncmp ( szMorph, "stem_en", iLength ) )
	{
		if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_EN ) )
		{
			sMessage.SetSprintf ( "stem_en and lemmatize_en clash" );
			return ST_ERROR;
		}

		if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_EN_ALL ) )
		{
			sMessage.SetSprintf ( "stem_en and lemmatize_en_all clash" );
			return ST_ERROR;
		}

		stem_en_init ();
		return AddMorph ( SPH_MORPH_STEM_EN );
	}

	if ( iLength==7 && !strncmp ( szMorph, "stem_ru", iLength ) )
	{
		if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_UTF8 ) )
		{
			sMessage.SetSprintf ( "stem_ru and lemmatize_ru clash" );
			return ST_ERROR;
		}

		if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_RU_ALL ) )
		{
			sMessage.SetSprintf ( "stem_ru and lemmatize_ru_all clash" );
			return ST_ERROR;
		}

		stem_ru_init ();
		return AddMorph ( SPH_MORPH_STEM_RU_UTF8 );
	}

	// AOT lemmatizers: try "lemmatize_XX" (12 chars) and "lemmatize_XX_all"
	// (16 chars) for every supported language code XX
	for ( int j=0; j<AOT_LENGTH; ++j )
	{
		char buf[20];
		char buf_all[20];
		sprintf ( buf, "lemmatize_%s", AOT_LANGUAGES[j] ); // NOLINT
		sprintf ( buf_all, "lemmatize_%s_all", AOT_LANGUAGES[j] ); // NOLINT

		if ( iLength==12 && !strncmp ( szMorph, buf, iLength ) )
		{
			if ( j==AOT_RU && m_dMorph.Contains ( SPH_MORPH_STEM_RU_UTF8 ) )
			{
				sMessage.SetSprintf ( "stem_ru and lemmatize_ru clash" );
				return ST_ERROR;
			}

			if ( j==AOT_EN && m_dMorph.Contains ( SPH_MORPH_STEM_EN ) )
			{
				sMessage.SetSprintf ( "stem_en and lemmatize_en clash" );
				return ST_ERROR;
			}

			// no clash test against a German stemmer since we don't have one

			if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_BASE_ALL+j ) )
			{
				sMessage.SetSprintf ( "%s and %s clash", buf, buf_all );
				return ST_ERROR;
			}

			// load the language's .pak dictionary from the lemmatizer base path
			CSphString sDictFile;
			sDictFile.SetSprintf ( "%s/%s.pak", g_sLemmatizerBase.cstr(), AOT_LANGUAGES[j] );
			if ( !sphAotInit ( sDictFile, sMessage, j ) )
				return ST_ERROR;

			// add manually instead of AddMorph(), because we need to update that fingerprint
			int iMorph = j + SPH_MORPH_AOTLEMMER_BASE;
			if ( j==AOT_RU )
				iMorph = SPH_MORPH_AOTLEMMER_RU_UTF8;
			else if ( j==AOT_DE )
				iMorph = SPH_MORPH_AOTLEMMER_DE_UTF8;

			if ( !m_dMorph.Contains ( iMorph ) )
			{
				// append "name:crc" of the loaded dictionary to the fingerprint,
				// ";"-separated when several lemmatizers are active
				if ( m_sMorphFingerprint.IsEmpty() )
					m_sMorphFingerprint.SetSprintf ( "%s:%08x"
						, sphAotDictinfo(j).m_sName.cstr()
						, sphAotDictinfo(j).m_iValue );
				else
					m_sMorphFingerprint.SetSprintf ( "%s;%s:%08x"
					, m_sMorphFingerprint.cstr()
					, sphAotDictinfo(j).m_sName.cstr()
					, sphAotDictinfo(j).m_iValue );
				m_dMorph.Add ( iMorph );
			}
			return ST_OK;
		}

		if ( iLength==16 && !strncmp ( szMorph, buf_all, iLength ) )
		{
			if ( j==AOT_RU && ( m_dMorph.Contains ( SPH_MORPH_STEM_RU_UTF8 ) ) )
			{
				sMessage.SetSprintf ( "stem_ru and lemmatize_ru_all clash" );
				return ST_ERROR;
			}

			if ( m_dMorph.Contains ( SPH_MORPH_AOTLEMMER_BASE+j ) )
			{
				sMessage.SetSprintf ( "%s and %s clash", buf, buf_all );
				return ST_ERROR;
			}

			CSphString sDictFile;
			sDictFile.SetSprintf ( "%s/%s.pak", g_sLemmatizerBase.cstr(), AOT_LANGUAGES[j] );
			if ( !sphAotInit ( sDictFile, sMessage, j ) )
				return ST_ERROR;

			return AddMorph ( SPH_MORPH_AOTLEMMER_BASE_ALL+j );
		}
	}

	if ( iLength==7 && !strncmp ( szMorph, "stem_cz", iLength ) )
	{
		stem_cz_init ();
		return AddMorph ( SPH_MORPH_STEM_CZ );
	}

	if ( iLength==7 && !strncmp ( szMorph, "stem_ar", iLength ) )
		return AddMorph ( SPH_MORPH_STEM_AR_UTF8 );

	if ( iLength==9 && !strncmp ( szMorph, "stem_enru", iLength ) )
	{
		stem_en_init ();
		stem_ru_init ();
		AddMorph ( SPH_MORPH_STEM_EN );
		return AddMorph ( SPH_MORPH_STEM_RU_UTF8 );
	}

	if ( iLength==7 && !strncmp ( szMorph, "soundex", iLength ) )
		return AddMorph ( SPH_MORPH_SOUNDEX );

	if ( iLength==9 && !strncmp ( szMorph, "metaphone", iLength ) )
		return AddMorph ( SPH_MORPH_METAPHONE_UTF8 );

#if USE_LIBSTEMMER
	// "libstemmer_<algo>": create a Snowball stemmer for the named algorithm
	const int LIBSTEMMER_LEN = 11;
	const int MAX_ALGO_LENGTH = 64;
	if ( iLength > LIBSTEMMER_LEN && iLength - LIBSTEMMER_LEN < MAX_ALGO_LENGTH && !strncmp ( szMorph, "libstemmer_", LIBSTEMMER_LEN ) )
	{
		CSphString sAlgo;
		sAlgo.SetBinary ( szMorph+LIBSTEMMER_LEN, iLength - LIBSTEMMER_LEN );

		sb_stemmer * pStemmer = NULL;

		pStemmer = sb_stemmer_new ( sAlgo.cstr(), "UTF_8" );

		if ( !pStemmer )
		{
			sMessage.SetSprintf ( "unknown stemmer libstemmer_%s; skipped", sAlgo.cstr() );
			return ST_WARNING;
		}

		// register the morph ID first, then deduplicate the stemmer instance
		AddMorph ( SPH_MORPH_LIBSTEMMER_FIRST + m_dStemmers.GetLength () );
		ARRAY_FOREACH ( i, m_dStemmers )
		{
			if ( m_dStemmers[i]==pStemmer )
			{
				sb_stemmer_delete ( pStemmer );
				return ST_OK;
			}
		}

		m_dStemmers.Add ( pStemmer );
		m_dDescStemmers.Add ( sAlgo );
		return ST_OK;
	}
#endif

	// RLP tokenization is configured elsewhere; accept the names as no-ops here
	if ( iLength==11 && !strncmp ( szMorph, "rlp_chinese", iLength ) )
		return ST_OK;

	if ( iLength==19 && !strncmp ( szMorph, "rlp_chinese_batched", iLength ) )
		return ST_OK;

	sMessage.SetBinary ( szMorph, iLength );
	sMessage.SetSprintf ( "unknown stemmer %s; skipped", sMessage.cstr() );
	return ST_WARNING;
}
21637 
21638 
AddMorph(int iMorph)21639 int CSphTemplateDictTraits::AddMorph ( int iMorph )
21640 {
21641 	if ( !m_dMorph.Contains ( iMorph ) )
21642 		m_dMorph.Add ( iMorph );
21643 	return ST_OK;
21644 }
21645 
21646 
21647 
ApplyStemmers(BYTE * pWord) const21648 void CSphTemplateDictTraits::ApplyStemmers ( BYTE * pWord ) const
21649 {
21650 	if ( !m_bApplyMorph )
21651 		return;
21652 
21653 	// try wordforms
21654 	if ( !m_bDisableWordforms && m_pWordforms && m_pWordforms->ToNormalForm ( pWord, true ) )
21655 		return;
21656 
21657 	// check length
21658 	if ( m_tSettings.m_iMinStemmingLen<=1 || sphUTF8Len ( (const char*)pWord )>=m_tSettings.m_iMinStemmingLen )
21659 	{
21660 		// try stemmers
21661 		ARRAY_FOREACH ( i, m_dMorph )
21662 			if ( StemById ( pWord, m_dMorph[i] ) )
21663 				break;
21664 	}
21665 
21666 	if ( !m_bDisableWordforms && m_pWordforms && m_pWordforms->m_bHavePostMorphNF )
21667 		m_pWordforms->ToNormalForm ( pWord, false );
21668 }
21669 
GetMultiWordforms() const21670 const CSphMultiformContainer * CSphTemplateDictTraits::GetMultiWordforms () const
21671 {
21672 	return m_pWordforms ? m_pWordforms->m_pMultiWordforms : NULL;
21673 }
21674 
GetSettingsFNV() const21675 uint64_t CSphTemplateDictTraits::GetSettingsFNV () const
21676 {
21677 	uint64_t uHash = (uint64_t)m_pWordforms;
21678 
21679 	if ( m_pStopwords )
21680 		uHash = sphFNV64 ( m_pStopwords, m_iStopwords*sizeof(*m_pStopwords), uHash );
21681 
21682 	uHash = sphFNV64 ( &m_tSettings.m_iMinStemmingLen, sizeof(m_tSettings.m_iMinStemmingLen), uHash );
21683 	DWORD uFlags = 0;
21684 	if ( m_tSettings.m_bWordDict )
21685 		uFlags |= 1<<0;
21686 	if ( m_tSettings.m_bCrc32 )
21687 		uFlags |= 1<<1;
21688 	if ( m_tSettings.m_bStopwordsUnstemmed )
21689 		uFlags |= 1<<2;
21690 	uHash = sphFNV64 ( &uFlags, sizeof(uFlags), uHash );
21691 
21692 	uHash = sphFNV64 ( m_dMorph.Begin(), m_dMorph.GetLength()*sizeof(m_dMorph[0]), uHash );
21693 #if USE_LIBSTEMMER
21694 	ARRAY_FOREACH ( i, m_dDescStemmers )
21695 		uHash = sphFNV64 ( m_dDescStemmers[i].cstr(), m_dDescStemmers[i].Length(), uHash );
21696 #endif
21697 
21698 	return uHash;
21699 }
21700 
SetApplyMorph(bool bApply)21701 void CSphTemplateDictTraits::SetApplyMorph ( bool bApply )
21702 {
21703 	if ( HasState() )
21704 		m_bApplyMorph = bApply;
21705 }
21706 
// Copy all shared dictionary state (settings, stopwords, wordforms, morphology)
// into a freshly created clone; returns the clone for caller convenience.
CSphDict * CSphTemplateDictTraits::CloneBase ( CSphTemplateDictTraits * pDict ) const
{
	assert ( pDict );
	pDict->m_tSettings = m_tSettings;
	pDict->m_iStopwords = m_iStopwords;
	pDict->m_pStopwords = m_pStopwords;
	pDict->m_dSWFileInfos = m_dSWFileInfos;
	pDict->m_dWFFileInfos = m_dWFFileInfos;
	// wordform containers are shared between clones and refcounted
	pDict->m_pWordforms = m_pWordforms;
	if ( m_pWordforms )
		m_pWordforms->m_iRefCount++;

	pDict->m_dMorph = m_dMorph;
#if USE_LIBSTEMMER
	// libstemmer handles are stateful and must not be shared between clones;
	// recreate fresh instances from the stored algorithm descriptions
	assert ( m_dDescStemmers.GetLength()==m_dStemmers.GetLength() );
	pDict->m_dDescStemmers = m_dDescStemmers;
	ARRAY_FOREACH ( i, m_dDescStemmers )
	{
		pDict->m_dStemmers.Add ( sb_stemmer_new ( m_dDescStemmers[i].cstr(), "UTF_8" ) );
		assert ( pDict->m_dStemmers.Last() );
	}
#endif

	pDict->m_bApplyMorph = m_bApplyMorph;

	return pDict;
}
21734 
HasState() const21735 bool CSphTemplateDictTraits::HasState() const
21736 {
21737 #if USE_RLP
21738 	return true;
21739 #else
21740 #if !USE_LIBSTEMMER
21741 	return false;
21742 #else
21743 	return ( m_dDescStemmers.GetLength()>0 );
21744 #endif
21745 #endif
21746 }
21747 
21748 /////////////////////////////////////////////////////////////////////////////
21749 
template<>
// 32-bit dictionaries hash a NUL-terminated word with CRC32
SphWordID_t CCRCEngine<true>::DoCrc ( const BYTE * pWord )
{
	return sphCRC32 ( pWord );
}
21755 
21756 
template<>
// 64-bit dictionaries hash a NUL-terminated word with FNV64
SphWordID_t CCRCEngine<false>::DoCrc ( const BYTE * pWord )
{
	return (SphWordID_t) sphFNV64 ( pWord );
}
21762 
21763 
template<>
// length-delimited variant: CRC32 over exactly iLen bytes
SphWordID_t CCRCEngine<true>::DoCrc ( const BYTE * pWord, int iLen )
{
	return sphCRC32 ( pWord, iLen );
}
21769 
21770 
template<>
// length-delimited variant: FNV64 over exactly iLen bytes
SphWordID_t CCRCEngine<false>::DoCrc ( const BYTE * pWord, int iLen )
{
	return (SphWordID_t) sphFNV64 ( pWord, iLen );
}
21776 
21777 
21778 template < bool CRC32DICT >
GetWordID(BYTE * pWord)21779 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( BYTE * pWord )
21780 {
21781 	// apply stopword filter before stemmers
21782 	if ( GetSettings().m_bStopwordsUnstemmed && !FilterStopword ( tHASH::DoCrc ( pWord ) ) )
21783 		return 0;
21784 
21785 	// skip stemmers for magic words
21786 	if ( pWord[0]>=0x20 )
21787 		ApplyStemmers ( pWord );
21788 
21789 	// stemmer might squeeze out the word
21790 	if ( !pWord[0] )
21791 		return 0;
21792 
21793 	return GetSettings().m_bStopwordsUnstemmed
21794 		? tHASH::DoCrc ( pWord )
21795 		: FilterStopword ( tHASH::DoCrc ( pWord ) );
21796 }
21797 
21798 
21799 template < bool CRC32DICT >
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)21800 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
21801 {
21802 	SphWordID_t uId = tHASH::DoCrc ( pWord, iLen );
21803 	return bFilterStops ? FilterStopword ( uId ) : uId;
21804 }
21805 
21806 
21807 template < bool CRC32DICT >
GetWordIDWithMarkers(BYTE * pWord)21808 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDWithMarkers ( BYTE * pWord )
21809 {
21810 	ApplyStemmers ( pWord + 1 );
21811 	SphWordID_t uWordId = tHASH::DoCrc ( pWord + 1 );
21812 	int iLength = strlen ( (const char *)(pWord + 1) );
21813 	pWord [iLength + 1] = MAGIC_WORD_TAIL;
21814 	pWord [iLength + 2] = '\0';
21815 	return FilterStopword ( uWordId ) ? tHASH::DoCrc ( pWord ) : 0;
21816 }
21817 
21818 
21819 template < bool CRC32DICT >
GetWordIDNonStemmed(BYTE * pWord)21820 SphWordID_t CSphDictCRC<CRC32DICT>::GetWordIDNonStemmed ( BYTE * pWord )
21821 {
21822 	SphWordID_t uWordId = tHASH::DoCrc ( pWord + 1 );
21823 	if ( !FilterStopword ( uWordId ) )
21824 		return 0;
21825 
21826 	return tHASH::DoCrc ( pWord );
21827 }
21828 
21829 
template < bool CRC32DICT >
// FilterStopword() maps stopword ids to 0, so 0 here means "is a stopword"
bool CSphDictCRC<CRC32DICT>::IsStopWord ( const BYTE * pWord ) const
{
	return FilterStopword ( tHASH::DoCrc ( pWord ) )==0;
}
21835 
21836 
21837 //////////////////////////////////////////////////////////////////////////
GetWordID(BYTE * pWord)21838 SphWordID_t CSphDictTemplate::GetWordID ( BYTE * pWord )
21839 {
21840 	// apply stopword filter before stemmers
21841 	if ( GetSettings().m_bStopwordsUnstemmed && !FilterStopword ( DoCrc ( pWord ) ) )
21842 		return 0;
21843 
21844 	// skip stemmers for magic words
21845 	if ( pWord[0]>=0x20 )
21846 		ApplyStemmers ( pWord );
21847 
21848 	return GetSettings().m_bStopwordsUnstemmed
21849 		? DoCrc ( pWord )
21850 		: FilterStopword ( DoCrc ( pWord ) );
21851 }
21852 
21853 
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)21854 SphWordID_t CSphDictTemplate::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
21855 {
21856 	SphWordID_t uId = DoCrc ( pWord, iLen );
21857 	return bFilterStops ? FilterStopword ( uId ) : uId;
21858 }
21859 
21860 
GetWordIDWithMarkers(BYTE * pWord)21861 SphWordID_t CSphDictTemplate::GetWordIDWithMarkers ( BYTE * pWord )
21862 {
21863 	ApplyStemmers ( pWord + 1 );
21864 	// stemmer might squeeze out the word
21865 	if ( !pWord[1] )
21866 		return 0;
21867 	SphWordID_t uWordId = DoCrc ( pWord + 1 );
21868 	int iLength = strlen ( (const char *)(pWord + 1) );
21869 	pWord [iLength + 1] = MAGIC_WORD_TAIL;
21870 	pWord [iLength + 2] = '\0';
21871 	return FilterStopword ( uWordId ) ? DoCrc ( pWord ) : 0;
21872 }
21873 
21874 
GetWordIDNonStemmed(BYTE * pWord)21875 SphWordID_t CSphDictTemplate::GetWordIDNonStemmed ( BYTE * pWord )
21876 {
21877 	SphWordID_t uWordId = DoCrc ( pWord + 1 );
21878 	if ( !FilterStopword ( uWordId ) )
21879 		return 0;
21880 
21881 	return DoCrc ( pWord );
21882 }
21883 
// FilterStopword() maps stopword ids to 0, so 0 here means "is a stopword"
bool CSphDictTemplate::IsStopWord ( const BYTE * pWord ) const
{
	return FilterStopword ( DoCrc ( pWord ) )==0;
}
21888 
21889 //////////////////////////////////////////////////////////////////////////
21890 
LoadStopwords(const char * sFiles,const ISphTokenizer * pTokenizer)21891 void CSphTemplateDictTraits::LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer )
21892 {
21893 	assert ( !m_pStopwords );
21894 	assert ( !m_iStopwords );
21895 
21896 	// tokenize file list
21897 	if ( !sFiles || !*sFiles )
21898 		return;
21899 
21900 	m_dSWFileInfos.Resize ( 0 );
21901 
21902 	CSphScopedPtr<ISphTokenizer> tTokenizer ( pTokenizer->Clone ( SPH_CLONE_INDEX ) );
21903 	CSphFixedVector<char> dList ( 1+strlen(sFiles) );
21904 	strcpy ( dList.Begin(), sFiles ); // NOLINT
21905 
21906 	char * pCur = dList.Begin();
21907 	char * sName = NULL;
21908 
21909 	CSphVector<SphWordID_t> dStop;
21910 
21911 	for ( ;; )
21912 	{
21913 		// find next name start
21914 		while ( *pCur && isspace(*pCur) ) pCur++;
21915 		if ( !*pCur ) break;
21916 		sName = pCur;
21917 
21918 		// find next name end
21919 		while ( *pCur && !isspace(*pCur) ) pCur++;
21920 		if ( *pCur ) *pCur++ = '\0';
21921 
21922 		BYTE * pBuffer = NULL;
21923 
21924 		CSphSavedFile tInfo;
21925 		tInfo.m_sFilename = sName;
21926 		GetFileStats ( sName, tInfo, NULL );
21927 		m_dSWFileInfos.Add ( tInfo );
21928 
21929 		// open file
21930 		struct_stat st;
21931 		if ( stat ( sName, &st )==0 )
21932 			pBuffer = new BYTE [(size_t)st.st_size];
21933 		else
21934 		{
21935 			sphWarn ( "stopwords: failed to get file size for '%s'", sName );
21936 			continue;
21937 		}
21938 
21939 		FILE * fp = fopen ( sName, "rb" );
21940 		if ( !fp )
21941 		{
21942 			sphWarn ( "failed to load stopwords from '%s'", sName );
21943 			SafeDeleteArray ( pBuffer );
21944 			continue;
21945 		}
21946 
21947 		// tokenize file
21948 		int iLength = (int)fread ( pBuffer, 1, (size_t)st.st_size, fp );
21949 
21950 		BYTE * pToken;
21951 		tTokenizer->SetBuffer ( pBuffer, iLength );
21952 		while ( ( pToken = tTokenizer->GetToken() )!=NULL )
21953 			if ( m_tSettings.m_bStopwordsUnstemmed )
21954 				dStop.Add ( GetWordIDNonStemmed ( pToken ) );
21955 			else
21956 				dStop.Add ( GetWordID ( pToken ) );
21957 
21958 		// close file
21959 		fclose ( fp );
21960 
21961 		SafeDeleteArray ( pBuffer );
21962 	}
21963 
21964 	// sort stopwords
21965 	dStop.Uniq();
21966 
21967 	// store IDs
21968 	if ( dStop.GetLength() )
21969 	{
21970 		m_dStopwordContainer.Reset ( dStop.GetLength() );
21971 		ARRAY_FOREACH ( i, dStop )
21972 			m_dStopwordContainer[i] = dStop[i];
21973 
21974 		m_iStopwords = m_dStopwordContainer.GetLength ();
21975 		m_pStopwords = m_dStopwordContainer.Begin();
21976 	}
21977 }
21978 
21979 
LoadStopwords(const CSphVector<SphWordID_t> & dStopwords)21980 void CSphTemplateDictTraits::LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords )
21981 {
21982 	m_dStopwordContainer.Reset ( dStopwords.GetLength() );
21983 	ARRAY_FOREACH ( i, dStopwords )
21984 		m_dStopwordContainer[i] = dStopwords[i];
21985 
21986 	m_iStopwords = m_dStopwordContainer.GetLength ();
21987 	m_pStopwords = m_dStopwordContainer.Begin();
21988 }
21989 
21990 
WriteStopwords(CSphWriter & tWriter)21991 void CSphTemplateDictTraits::WriteStopwords ( CSphWriter & tWriter )
21992 {
21993 	tWriter.PutDword ( (DWORD)m_iStopwords );
21994 	for ( int i = 0; i < m_iStopwords; i++ )
21995 		tWriter.ZipOffset ( m_pStopwords[i] );
21996 }
21997 
21998 
SweepWordformContainers(const CSphVector<CSphSavedFile> & dFiles)21999 void CSphTemplateDictTraits::SweepWordformContainers ( const CSphVector<CSphSavedFile> & dFiles )
22000 {
22001 	for ( int i = 0; i < m_dWordformContainers.GetLength (); )
22002 	{
22003 		CSphWordforms * WC = m_dWordformContainers[i];
22004 		if ( WC->m_iRefCount==0 && !WC->IsEqual ( dFiles ) )
22005 		{
22006 			delete WC;
22007 			m_dWordformContainers.Remove ( i );
22008 		} else
22009 			++i;
22010 	}
22011 }
22012 
22013 
static const int MAX_REPORT_LEN = 1024;

// Append sString (plus a separating space, or just a terminating NUL when
// bLast) to sReport, writing directly into the string's buffer.
// NOTE: relies on the caller having Reserve()'d at least MAX_REPORT_LEN
// bytes (see ConcatReportStrings); silently drops strings that would
// overflow that budget.
void AddStringToReport ( CSphString & sReport, const CSphString & sString, bool bLast )
{
	int iLen = sReport.Length();
	// +2 covers the worst case: separator space plus the trailing NUL
	if ( iLen + sString.Length() + 2 > MAX_REPORT_LEN )
		return;

	// cast away constness to append in place; capacity is guaranteed by callers
	char * szReport = (char *)sReport.cstr();
	strcat ( szReport, sString.cstr() );	// NOLINT
	iLen += sString.Length();
	if ( bLast )
		szReport[iLen] = '\0';
	else
	{
		// not the last piece: add a space separator, keep the buffer terminated
		szReport[iLen] = ' ';
		szReport[iLen+1] = '\0';
	}
}
22033 
22034 
ConcatReportStrings(const CSphTightVector<CSphString> & dStrings,CSphString & sReport)22035 void ConcatReportStrings ( const CSphTightVector<CSphString> & dStrings, CSphString & sReport )
22036 {
22037 	sReport.Reserve ( MAX_REPORT_LEN );
22038 	*(char *)sReport.cstr() = '\0';
22039 
22040 	ARRAY_FOREACH ( i, dStrings )
22041 		AddStringToReport ( sReport, dStrings[i], i==dStrings.GetLength()-1 );
22042 }
22043 
22044 
ConcatReportStrings(const CSphTightVector<CSphNormalForm> & dStrings,CSphString & sReport)22045 void ConcatReportStrings ( const CSphTightVector<CSphNormalForm> & dStrings, CSphString & sReport )
22046 {
22047 	sReport.Reserve ( MAX_REPORT_LEN );
22048 	*(char *)sReport.cstr() = '\0';
22049 
22050 	ARRAY_FOREACH ( i, dStrings )
22051 		AddStringToReport ( sReport, dStrings[i].m_sForm, i==dStrings.GetLength()-1 );
22052 }
22053 
22054 
// Find a cached wordform container matching the given file set, or load a
// fresh one. Containers are shared between indexes; a cached container built
// with different tokenizer settings triggers a warning and is NOT reused.
CSphWordforms * CSphTemplateDictTraits::GetWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos,
	const CSphVector<CSphString> * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex )
{
	uint64_t uTokenizerFNV = pTokenizer->GetSettingsFNV();
	ARRAY_FOREACH ( i, m_dWordformContainers )
		if ( m_dWordformContainers[i]->IsEqual ( dFileInfos ) )
		{
			CSphWordforms * pContainer = m_dWordformContainers[i];
			// reuse only when the tokenizer settings match, too
			if ( uTokenizerFNV==pContainer->m_uTokenizerFNV )
				return pContainer;

			// same files but different tokenizer: warn, then fall through
			// to loading a separate container below
			CSphTightVector<CSphString> dErrorReport;
			ARRAY_FOREACH ( j, dFileInfos )
				dErrorReport.Add ( dFileInfos[j].m_sFilename );

			CSphString sAllFiles;
			ConcatReportStrings ( dErrorReport, sAllFiles );
			sphWarning ( "index '%s': wordforms file '%s' is shared with index '%s', "
				"but tokenizer settings are different",
				sIndex, sAllFiles.cstr(), pContainer->m_sIndexName.cstr() );
		}

	// not cached (or incompatible): load from scratch and add to the cache
	CSphWordforms * pContainer = LoadWordformContainer ( dFileInfos, pEmbedded, pTokenizer, sIndex );
	if ( pContainer )
		m_dWordformContainers.Add ( pContainer );

	return pContainer;
}
22083 
22084 
22085 struct CmpMultiforms_fn
22086 {
IsLessCmpMultiforms_fn22087 	inline bool IsLess ( const CSphMultiform * pA, const CSphMultiform * pB ) const
22088 	{
22089 		assert ( pA && pB );
22090 		if ( pA->m_iFileId==pB->m_iFileId )
22091 			return pA->m_dTokens.GetLength() > pB->m_dTokens.GetLength();
22092 
22093 		return pA->m_iFileId > pB->m_iFileId;
22094 	}
22095 };
22096 
22097 
// Parse one line of a wordforms file and add the resulting mapping to
// pContainer. Handles simple mappings ("walks > walk"), multiforms
// ("core 2 duo > c2d"), multi-token destinations, '~' post-morphology
// forms, "=>" as an alternative separator, and '#' comments.
// iFileId==-1 means "loading embedded forms" (skips re-sorting).
void CSphTemplateDictTraits::AddWordform ( CSphWordforms * pContainer, char * sBuffer, int iLen,
	ISphTokenizer * pTokenizer, const char * szFile, const CSphVector<int> & dBlended, int iFileId )
{
	CSphVector<CSphString> dTokens;

	bool bSeparatorFound = false;
	bool bAfterMorphology = false;

	// parse the line
	pTokenizer->SetBuffer ( (BYTE*)sBuffer, iLen );

	bool bFirstToken = true;
	bool bStopwordsPresent = false;
	bool bCommentedWholeLine = false;

	// collect the source tokens up to the separator
	BYTE * pFrom = NULL;
	while ( ( pFrom = pTokenizer->GetToken () )!=NULL )
	{
		if ( *pFrom=='#' )
		{
			// comment; if it was the very first token, the whole line is a comment
			bCommentedWholeLine = bFirstToken;
			break;
		}

		// a leading '~' marks a post-morphology wordform
		if ( *pFrom=='~' && bFirstToken )
		{
			bAfterMorphology = true;
			bFirstToken = false;
			continue;
		}

		bFirstToken = false;

		if ( *pFrom=='>' )
		{
			bSeparatorFound = true;
			break;
		}

		// "=>" is an alternative separator spelling
		if ( *pFrom=='=' && *pTokenizer->GetBufferPtr()=='>' )
		{
			pTokenizer->GetToken();
			bSeparatorFound = true;
			break;
		}

		// source tokens that are stopwords (GetWordID returns 0) are dropped
		if ( GetWordID ( pFrom, strlen ( (const char*)pFrom ), true ) )
			dTokens.Add ( (const char*)pFrom );
		else
			bStopwordsPresent = true;
	}

	// validate what we parsed so far; each failure warns and drops the line
	if ( !dTokens.GetLength() )
	{
		if ( !bCommentedWholeLine )
			sphWarning ( "index '%s': all source tokens are stopwords (wordform='%s', file='%s'). IGNORED.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		return;
	}

	if ( !bSeparatorFound )
	{
		sphWarning ( "index '%s': no wordform separator found (wordform='%s', file='%s'). IGNORED.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		return;
	}

	BYTE * pTo = pTokenizer->GetToken ();
	if ( !pTo )
	{
		sphWarning ( "index '%s': no destination token found (wordform='%s', file='%s'). IGNORED.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		return;
	}

	if ( *pTo=='#' )
	{
		sphWarning ( "index '%s': misplaced comment (wordform='%s', file='%s'). IGNORED.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		return;
	}

	// collect the destination tokens; the first one is kept even if it is
	// a stopword (it may still be the only destination)
	CSphVector<CSphNormalForm> dDestTokens;
	bool bFirstDestIsStop = !GetWordID ( pTo, strlen ( (const char*)pTo ), true );
	CSphNormalForm & tForm = dDestTokens.Add();
	tForm.m_sForm = (const char *)pTo;
	tForm.m_iLengthCP = pTokenizer->GetLastTokenLen();

	// what if we have more than one word in the right part?
	const BYTE * pDestToken;
	while ( ( pDestToken = pTokenizer->GetToken() )!=NULL )
	{
		bool bStop = ( !GetWordID ( pDestToken, strlen ( (const char*)pDestToken ), true ) );
		if ( !bStop )
		{
			CSphNormalForm & tForm = dDestTokens.Add();
			tForm.m_sForm = (const char *)pDestToken;
			tForm.m_iLengthCP = pTokenizer->GetLastTokenLen();
		}

		bStopwordsPresent |= bStop;
	}

	// we can have wordforms with 1 destination token that is a stopword
	if ( dDestTokens.GetLength()>1 && bFirstDestIsStop )
		dDestTokens.Remove(0);

	if ( !dDestTokens.GetLength() )
	{
		sphWarning ( "index '%s': destination token is a stopword (wordform='%s', file='%s'). IGNORED.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		return;
	}

	if ( bStopwordsPresent )
		sphWarning ( "index '%s': wordform contains stopwords (wordform='%s'). Fix your wordforms file '%s'.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );

	// we disabled all blended, so we need to filter them manually
	bool bBlendedPresent = false;
	if ( dBlended.GetLength() )
		ARRAY_FOREACH ( i, dDestTokens )
		{
			int iCode;
			const BYTE * pBuf = (const BYTE *) dDestTokens[i].m_sForm.cstr();
			while ( ( iCode = sphUTF8Decode ( pBuf ) )>0 && !bBlendedPresent )
				bBlendedPresent = ( dBlended.BinarySearch ( iCode )!=NULL );
		}

	if ( bBlendedPresent )
		sphWarning ( "invalid mapping (destination contains blended characters) (wordform='%s'). Fix your wordforms file '%s'.", sBuffer, szFile );

	if ( bBlendedPresent && dDestTokens.GetLength()>1 )
	{
		sphWarning ( "blended characters are not allowed with multiple destination tokens (wordform='%s', file='%s'). IGNORED.", sBuffer, szFile );
		return;
	}

	// multiform case: more than one token on either side of the separator
	if ( dTokens.GetLength()>1 || dDestTokens.GetLength()>1 )
	{
		CSphMultiform * pMultiWordform = new CSphMultiform;
		pMultiWordform->m_iFileId = iFileId;
		pMultiWordform->m_dNormalForm.Resize ( dDestTokens.GetLength() );
		ARRAY_FOREACH ( i, dDestTokens )
			pMultiWordform->m_dNormalForm[i] = dDestTokens[i];

		// the first source token is the hash key; the rest go into the form
		for ( int i = 1; i < dTokens.GetLength(); i++ )
			pMultiWordform->m_dTokens.Add ( dTokens[i] );

		if ( !pContainer->m_pMultiWordforms )
			pContainer->m_pMultiWordforms = new CSphMultiformContainer;

		CSphMultiforms ** pWordforms = pContainer->m_pMultiWordforms->m_Hash ( dTokens[0] );
		if ( pWordforms )
		{
			// scan existing forms under this key for an exact token-sequence
			// duplicate; if found, override it in place
			ARRAY_FOREACH ( iMultiform, (*pWordforms)->m_pForms )
			{
				CSphMultiform * pStoredMF = (*pWordforms)->m_pForms[iMultiform];
				if ( pStoredMF->m_dTokens.GetLength()==pMultiWordform->m_dTokens.GetLength() )
				{
					bool bSameTokens = true;
					ARRAY_FOREACH_COND ( iToken, pStoredMF->m_dTokens, bSameTokens )
						if ( pStoredMF->m_dTokens[iToken]!=pMultiWordform->m_dTokens[iToken] )
							bSameTokens = false;

					if ( bSameTokens )
					{
						CSphString sStoredTokens, sStoredForms;
						ConcatReportStrings ( pStoredMF->m_dTokens, sStoredTokens );
						ConcatReportStrings ( pStoredMF->m_dNormalForm, sStoredForms );
						sphWarning ( "index '%s': duplicate wordform found - overridden ( current='%s', old='%s %s > %s' ). Fix your wordforms file '%s'.",
							pContainer->m_sIndexName.cstr(), sBuffer, dTokens[0].cstr(), sStoredTokens.cstr(), sStoredForms.cstr(), szFile );

						pStoredMF->m_dNormalForm.Resize ( pMultiWordform->m_dNormalForm.GetLength() );
						ARRAY_FOREACH ( iForm, pMultiWordform->m_dNormalForm )
							pStoredMF->m_dNormalForm[iForm] = pMultiWordform->m_dNormalForm[iForm];

						pStoredMF->m_iFileId = iFileId;

						SafeDelete ( pMultiWordform );
						break; // otherwise, we crash next turn
					}
				}
			}

			// not a duplicate: append as a new form under the same key
			if ( pMultiWordform )
			{
				(*pWordforms)->m_pForms.Add ( pMultiWordform );

				// sort forms by files and length
				// but do not sort if we're loading embedded
				if ( iFileId>=0 )
					(*pWordforms)->m_pForms.Sort ( CmpMultiforms_fn() );

				(*pWordforms)->m_iMinTokens = Min ( (*pWordforms)->m_iMinTokens, pMultiWordform->m_dTokens.GetLength () );
				(*pWordforms)->m_iMaxTokens = Max ( (*pWordforms)->m_iMaxTokens, pMultiWordform->m_dTokens.GetLength () );
				pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, (*pWordforms)->m_iMaxTokens );
			}
		} else
		{
			// first form under this key: create a fresh bucket
			CSphMultiforms * pNewWordforms = new CSphMultiforms;
			pNewWordforms->m_pForms.Add ( pMultiWordform );
			pNewWordforms->m_iMinTokens = pMultiWordform->m_dTokens.GetLength ();
			pNewWordforms->m_iMaxTokens = pMultiWordform->m_dTokens.GetLength ();
			pContainer->m_pMultiWordforms->m_iMaxTokens = Max ( pContainer->m_pMultiWordforms->m_iMaxTokens, pNewWordforms->m_iMaxTokens );
			pContainer->m_pMultiWordforms->m_Hash.Add ( pNewWordforms, dTokens[0] );
		}

		// let's add destination form to regular wordform to keep destination from being stemmed
		// FIXME!!! handle multiple destination tokens and ~flag for wordforms
		if ( !bAfterMorphology && dDestTokens.GetLength()==1 && !pContainer->m_dHash.Exists ( dDestTokens[0].m_sForm ) )
		{
			CSphStoredNF tForm;
			tForm.m_sWord = dDestTokens[0].m_sForm;
			tForm.m_bAfterMorphology = bAfterMorphology;
			pContainer->m_bHavePostMorphNF |= bAfterMorphology;
			if ( !pContainer->m_dNormalForms.GetLength()
				|| pContainer->m_dNormalForms.Last().m_sWord!=dDestTokens[0].m_sForm
				|| pContainer->m_dNormalForms.Last().m_bAfterMorphology!=bAfterMorphology )
				pContainer->m_dNormalForms.Add ( tForm );

			pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, dDestTokens[0].m_sForm );
		}
	} else
	{
		// simple 1:1 wordform
		if ( bAfterMorphology )
		{
			// post-morphology forms are keyed by the stemmed source token
			BYTE pBuf [16+3*SPH_MAX_WORD_LEN];
			memcpy ( pBuf, dTokens[0].cstr(), dTokens[0].Length()+1 );
			ApplyStemmers ( pBuf );
			dTokens[0] = (char *)pBuf;
		}

		// check wordform that source token is a new token or has same destination token
		int * pRefTo = pContainer->m_dHash ( dTokens[0] );
		assert ( !pRefTo || ( *pRefTo>=0 && *pRefTo<pContainer->m_dNormalForms.GetLength() ) );
		if ( pRefTo )
		{
			// replace with a new wordform
			if ( pContainer->m_dNormalForms[*pRefTo].m_sWord!=dDestTokens[0].m_sForm || pContainer->m_dNormalForms[*pRefTo].m_bAfterMorphology!=bAfterMorphology )
			{
				CSphStoredNF & tRefTo = pContainer->m_dNormalForms[*pRefTo];
				sphWarning ( "index '%s': duplicate wordform found - overridden ( current='%s', old='%s%s > %s' ). Fix your wordforms file '%s'.",
					pContainer->m_sIndexName.cstr(), sBuffer, tRefTo.m_bAfterMorphology ? "~" : "", dTokens[0].cstr(), tRefTo.m_sWord.cstr(), szFile );

				tRefTo.m_sWord = dDestTokens[0].m_sForm;
				tRefTo.m_bAfterMorphology = bAfterMorphology;
				pContainer->m_bHavePostMorphNF |= bAfterMorphology;
			} else
				sphWarning ( "index '%s': duplicate wordform found ( '%s' ). Fix your wordforms file '%s'.", pContainer->m_sIndexName.cstr(), sBuffer, szFile );
		} else
		{
			// brand new source token: store the normal form and hash it
			CSphStoredNF tForm;
			tForm.m_sWord = dDestTokens[0].m_sForm;
			tForm.m_bAfterMorphology = bAfterMorphology;
			pContainer->m_bHavePostMorphNF |= bAfterMorphology;
			if ( !pContainer->m_dNormalForms.GetLength()
				|| pContainer->m_dNormalForms.Last().m_sWord!=dDestTokens[0].m_sForm
				|| pContainer->m_dNormalForms.Last().m_bAfterMorphology!=bAfterMorphology)
				pContainer->m_dNormalForms.Add ( tForm );

			pContainer->m_dHash.Add ( pContainer->m_dNormalForms.GetLength()-1, dTokens[0] );
		}
	}
}
22357 
22358 
// Build a new wordform container, either from embedded index data
// (pEmbeddedWordforms) or by reading the listed files from disk.
// Returns NULL if a file cannot be opened.
CSphWordforms * CSphTemplateDictTraits::LoadWordformContainer ( const CSphVector<CSphSavedFile> & dFileInfos,
	const CSphVector<CSphString> * pEmbeddedWordforms, const ISphTokenizer * pTokenizer, const char * sIndex )
{
	// allocate it
	CSphWordforms * pContainer = new CSphWordforms();
	pContainer->m_dFiles = dFileInfos;
	pContainer->m_uTokenizerFNV = pTokenizer->GetSettingsFNV();
	pContainer->m_sIndexName = sIndex;

	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( pTokenizer->Clone ( SPH_CLONE_INDEX ) );
	const CSphTokenizerSettings & tSettings = pMyTokenizer->GetSettings();
	CSphVector<int> dBlended;

	// get a list of blend chars and set add them to the tokenizer as simple chars
	// (blending is disabled below; AddWordform() filters blended chars manually)
	if ( tSettings.m_sBlendChars.Length() )
	{
		// rebuild the case-folding charset with each blend char appended
		// as a regular character
		CSphVector<char> dNewCharset;
		dNewCharset.Resize ( tSettings.m_sCaseFolding.Length() );
		memcpy ( dNewCharset.Begin(), tSettings.m_sCaseFolding.cstr(), dNewCharset.GetLength() );

		CSphVector<CSphRemapRange> dRemaps;
		CSphCharsetDefinitionParser tParser;
		if ( tParser.Parse ( tSettings.m_sBlendChars.cstr(), dRemaps ) )
			ARRAY_FOREACH ( i, dRemaps )
				for ( int j = dRemaps[i].m_iStart; j<=dRemaps[i].m_iEnd; j++ )
				{
					dNewCharset.Add ( ',' );
					dNewCharset.Add ( ' ' );
					dNewCharset.Add ( char(j) );
					dBlended.Add ( j );
				}

		dNewCharset.Add(0);

		// sort dBlended for binary search
		dBlended.Sort ();

		CSphString sError;
		pMyTokenizer->SetCaseFolding ( dNewCharset.Begin(), sError );

		// disable blend chars
		pMyTokenizer->SetBlendChars ( NULL, sError );
	}

	// add wordform-specific specials
	pMyTokenizer->AddSpecials ( "#=>~" );

	if ( pEmbeddedWordforms )
	{
		// build a combined file-name string, used only for warning messages
		CSphTightVector<CSphString> dFilenames;
		dFilenames.Resize ( dFileInfos.GetLength() );
		ARRAY_FOREACH ( i, dFileInfos )
			dFilenames[i] = dFileInfos[i].m_sFilename;

		CSphString sAllFiles;
		ConcatReportStrings ( dFilenames, sAllFiles );

		// iFileId==-1 flags "embedded" to AddWordform()
		ARRAY_FOREACH ( i, (*pEmbeddedWordforms) )
			AddWordform ( pContainer, (char*)(*pEmbeddedWordforms)[i].cstr(),
				(*pEmbeddedWordforms)[i].Length(), pMyTokenizer.Ptr(), sAllFiles.cstr(), dBlended, -1 );
	} else
	{
		char sBuffer [ 6*SPH_MAX_WORD_LEN + 512 ]; // enough to hold 2 UTF-8 words, plus some whitespace overhead

		// read each file line by line and feed the lines to AddWordform()
		ARRAY_FOREACH ( i, dFileInfos )
		{
			CSphAutoreader rdWordforms;
			const char * szFile = dFileInfos[i].m_sFilename.cstr();
			CSphString sError;
			if ( !rdWordforms.Open ( szFile, sError ) )
			{
				sphWarning ( "index '%s': %s", sIndex, sError.cstr() );
				return NULL;
			}

			int iLen;
			while ( ( iLen = rdWordforms.GetLine ( sBuffer, sizeof(sBuffer) ) )>=0 )
				AddWordform ( pContainer, sBuffer, iLen, pMyTokenizer.Ptr(), szFile, dBlended, i );
		}
	}

	return pContainer;
}
22442 
22443 
LoadWordforms(const CSphVector<CSphString> & dFiles,const CSphEmbeddedFiles * pEmbedded,const ISphTokenizer * pTokenizer,const char * sIndex)22444 bool CSphTemplateDictTraits::LoadWordforms ( const CSphVector<CSphString> & dFiles,
22445 	const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex )
22446 {
22447 	if ( pEmbedded )
22448 	{
22449 		m_dWFFileInfos.Resize ( pEmbedded->m_dWordformFiles.GetLength() );
22450 		ARRAY_FOREACH ( i, m_dWFFileInfos )
22451 			m_dWFFileInfos[i] = pEmbedded->m_dWordformFiles[i];
22452 	} else
22453 	{
22454 		m_dWFFileInfos.Reserve ( dFiles.GetLength() );
22455 		CSphSavedFile tFile;
22456 		ARRAY_FOREACH ( i, dFiles )
22457 			if ( !dFiles[i].IsEmpty() )
22458 			{
22459 				if ( GetFileStats ( dFiles[i].cstr(), tFile, NULL ) )
22460 					m_dWFFileInfos.Add ( tFile );
22461 				else
22462 					sphWarning ( "index '%s': wordforms file '%s' not found", sIndex, dFiles[i].cstr() );
22463 			}
22464 	}
22465 
22466 	if ( !m_dWFFileInfos.GetLength() )
22467 		return false;
22468 
22469 	SweepWordformContainers ( m_dWFFileInfos );
22470 
22471 	m_pWordforms = GetWordformContainer ( m_dWFFileInfos, pEmbedded ? &(pEmbedded->m_dWordforms) : NULL, pTokenizer, sIndex );
22472 	if ( m_pWordforms )
22473 	{
22474 		m_pWordforms->m_iRefCount++;
22475 		if ( m_pWordforms->m_bHavePostMorphNF && !m_dMorph.GetLength() )
22476 			sphWarning ( "index '%s': wordforms contain post-morphology normal forms, but no morphology was specified", sIndex );
22477 	}
22478 
22479 	return !!m_pWordforms;
22480 }
22481 
22482 
// Serialize the loaded wordforms (simple forms plus multiforms) into the
// index header: a total line count followed by one "src > dst" line each.
void CSphTemplateDictTraits::WriteWordforms ( CSphWriter & tWriter )
{
	if ( !m_pWordforms )
	{
		// no wordforms: write a zero count and bail
		tWriter.PutDword(0);
		return;
	}

	// count multiform entries first, so the total can be written up front
	int nMultiforms = 0;
	if ( m_pWordforms->m_pMultiWordforms )
	{
		CSphMultiformContainer::CSphMultiformHash & tHash = m_pWordforms->m_pMultiWordforms->m_Hash;
		tHash.IterateStart();
		while ( tHash.IterateNext() )
		{
			CSphMultiforms * pMF = tHash.IterateGet();
			nMultiforms += pMF ? pMF->m_pForms.GetLength() : 0;
		}
	}

	tWriter.PutDword ( m_pWordforms->m_dHash.GetLength()+nMultiforms );

	// simple forms: "[~]src > dst" per line, '~' marking post-morphology forms
	m_pWordforms->m_dHash.IterateStart();
	while ( m_pWordforms->m_dHash.IterateNext() )
	{
		const CSphString & sKey = m_pWordforms->m_dHash.IterateGetKey();
		int iIndex = m_pWordforms->m_dHash.IterateGet();
		CSphString sLine;
		sLine.SetSprintf ( "%s%s > %s", m_pWordforms->m_dNormalForms[iIndex].m_bAfterMorphology ? "~" : "",
			sKey.cstr(), m_pWordforms->m_dNormalForms[iIndex].m_sWord.cstr() );
		tWriter.PutString ( sLine );
	}

	// multiforms: "key tok1 tok2 > form1 form2" per stored form
	if ( m_pWordforms->m_pMultiWordforms )
	{
		CSphMultiformContainer::CSphMultiformHash & tHash = m_pWordforms->m_pMultiWordforms->m_Hash;
		tHash.IterateStart();
		while ( tHash.IterateNext() )
		{
			const CSphString & sKey = tHash.IterateGetKey();
			CSphMultiforms * pMF = tHash.IterateGet();
			if ( !pMF )
				continue;

			ARRAY_FOREACH ( i, pMF->m_pForms )
			{
				CSphString sLine, sTokens, sForms;
				ConcatReportStrings ( pMF->m_pForms[i]->m_dTokens, sTokens );
				ConcatReportStrings ( pMF->m_pForms[i]->m_dNormalForm, sForms );

				sLine.SetSprintf ( "%s %s > %s", sKey.cstr(), sTokens.cstr(), sForms.cstr() );
				tWriter.PutString ( sLine );
			}
		}
	}
}
22538 
22539 
SetMorphology(const char * szMorph,CSphString & sMessage)22540 int CSphTemplateDictTraits::SetMorphology ( const char * szMorph, CSphString & sMessage )
22541 {
22542 	m_dMorph.Reset ();
22543 #if USE_LIBSTEMMER
22544 	ARRAY_FOREACH ( i, m_dStemmers )
22545 		sb_stemmer_delete ( m_dStemmers[i] );
22546 	m_dStemmers.Reset ();
22547 #endif
22548 
22549 	if ( !szMorph )
22550 		return ST_OK;
22551 
22552 	CSphString sOption = szMorph;
22553 	sOption.ToLower ();
22554 
22555 	CSphString sError;
22556 	int iRes = ParseMorphology ( sOption.cstr(), sMessage );
22557 	if ( iRes==ST_WARNING && sMessage.IsEmpty() )
22558 		sMessage.SetSprintf ( "invalid morphology option %s; skipped", sOption.cstr() );
22559 	return iRes;
22560 }
22561 
22562 
HasMorphology() const22563 bool CSphTemplateDictTraits::HasMorphology() const
22564 {
22565 	return ( m_dMorph.GetLength()>0 );
22566 }
22567 
22568 
/// common id-based stemmer
/// applies the stemmer selected by iStemmer to pWord in-place;
/// returns true iff the word was actually changed by stemming
bool CSphTemplateDictTraits::StemById ( BYTE * pWord, int iStemmer ) const
{
	char szBuf [ MAX_KEYWORD_BYTES ];

	// safe quick strncpy without (!) padding and with a side of strlen
	char * p = szBuf;
	char * pMax = szBuf + sizeof(szBuf) - 1;
	BYTE * pLastSBS = NULL; // last single-byte (<0x80, i.e. ASCII) char seen, if any
	while ( *pWord && p<pMax )
	{
		pLastSBS = ( *pWord )<0x80 ? pWord : pLastSBS;
		*p++ = *pWord++;
	}
	int iLen = p - szBuf;
	*p = '\0';
	pWord -= iLen; // rewind; szBuf keeps the pre-stemming copy for the final comparison

	switch ( iStemmer )
	{
	case SPH_MORPH_STEM_EN:
		stem_en ( pWord, iLen );
		break;

	case SPH_MORPH_STEM_RU_UTF8:
		// skip stemming in case of SBC at the end of the word
		if ( pLastSBS && ( pLastSBS-pWord+1 )>=iLen )
			break;

		// stem only UTF8 tail
		if ( !pLastSBS )
		{
			stem_ru_utf8 ( (WORD*)pWord );
		} else
		{
			stem_ru_utf8 ( (WORD *)( pLastSBS+1 ) );
		}
		break;

	case SPH_MORPH_STEM_CZ:
		stem_cz ( pWord );
		break;

	case SPH_MORPH_STEM_AR_UTF8:
		stem_ar_utf8 ( pWord );
		break;

	case SPH_MORPH_SOUNDEX:
		stem_soundex ( pWord );
		break;

	case SPH_MORPH_METAPHONE_UTF8:
		stem_dmetaphone ( pWord );
		break;

	case SPH_MORPH_AOTLEMMER_RU_UTF8:
		sphAotLemmatizeRuUTF8 ( pWord );
		break;

	case SPH_MORPH_AOTLEMMER_EN:
		sphAotLemmatize ( pWord, AOT_EN );
		break;

	case SPH_MORPH_AOTLEMMER_DE_UTF8:
		sphAotLemmatizeDeUTF8 ( pWord );
		break;

	case SPH_MORPH_AOTLEMMER_RU_ALL:
	case SPH_MORPH_AOTLEMMER_EN_ALL:
	case SPH_MORPH_AOTLEMMER_DE_ALL:
		// do the real work somewhere else
		// this is mostly for warning suppressing and making some features like
		// index_exact_words=1 vs expand_keywords=1 work
		break;

	default:
#if USE_LIBSTEMMER
		if ( iStemmer>=SPH_MORPH_LIBSTEMMER_FIRST && iStemmer<SPH_MORPH_LIBSTEMMER_LAST )
		{
			sb_stemmer * pStemmer = m_dStemmers [iStemmer - SPH_MORPH_LIBSTEMMER_FIRST];
			assert ( pStemmer );

			const sb_symbol * sStemmed = sb_stemmer_stem ( pStemmer, (sb_symbol*)pWord, strlen ( (const char*)pWord ) );
			int iLen = sb_stemmer_length ( pStemmer ); // NOTE: deliberately shadows the outer iLen

			// copy the stemmed form back over pWord and re-terminate it
			memcpy ( pWord, sStemmed, iLen );
			pWord[iLen] = '\0';
		} else
			return false; // unknown stemmer id

	break;
#else
		return false;
#endif
	}

	// changed iff the in-place result differs from the saved original
	return strcmp ( (char *)pWord, szBuf )!=0;
}
22667 
DictBegin(CSphAutofile &,CSphAutofile & tDict,int,ThrottleState_t * pThrottle)22668 void CSphDiskDictTraits::DictBegin ( CSphAutofile & , CSphAutofile & tDict, int, ThrottleState_t * pThrottle )
22669 {
22670 	m_wrDict.CloseFile ();
22671 	m_wrDict.SetFile ( tDict, NULL, m_sWriterError );
22672 	m_wrDict.SetThrottle ( pThrottle );
22673 	m_wrDict.PutByte ( 1 );
22674 }
22675 
DictEnd(DictHeader_t * pHeader,int,CSphString & sError,ThrottleState_t *)22676 bool CSphDiskDictTraits::DictEnd ( DictHeader_t * pHeader, int, CSphString & sError, ThrottleState_t * )
22677 {
22678 	// flush wordlist checkpoints
22679 	pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos();
22680 	pHeader->m_iDictCheckpoints = m_dCheckpoints.GetLength();
22681 	ARRAY_FOREACH ( i, m_dCheckpoints )
22682 	{
22683 		assert ( m_dCheckpoints[i].m_iWordlistOffset );
22684 		m_wrDict.PutOffset ( m_dCheckpoints[i].m_uWordID );
22685 		m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );
22686 	}
22687 
22688 	// done
22689 	m_wrDict.CloseFile ();
22690 	if ( m_wrDict.IsError() )
22691 		sError = m_sWriterError;
22692 	return !m_wrDict.IsError();
22693 }
22694 
/// store one dictionary entry; wordids and doclist offsets are delta-coded
/// against the previous entry, with deltas restarted at every checkpoint
void CSphDiskDictTraits::DictEntry ( const CSphDictEntry & tEntry )
{
	// insert wordlist checkpoint
	if ( ( m_iEntries % SPH_WORDLIST_CHECKPOINT )==0 )
	{
		if ( m_iEntries ) // but not the 1st entry
		{
			assert ( tEntry.m_iDoclistOffset > m_iLastDoclistPos );
			m_wrDict.ZipInt ( 0 ); // indicate checkpoint
			m_wrDict.ZipOffset ( tEntry.m_iDoclistOffset - m_iLastDoclistPos ); // store last length
		}

		// restart delta coding, once per SPH_WORDLIST_CHECKPOINT entries
		m_iLastWordID = 0;
		m_iLastDoclistPos = 0;

		// begin new wordlist entry
		assert ( m_wrDict.GetPos()<=UINT_MAX );

		// remember this entry's wordid and file position for the checkpoint table
		CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add();
		tCheckpoint.m_uWordID = tEntry.m_uWordID;
		tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();
	}

	// write wordid and doclist position as deltas against the previous entry
	assert ( tEntry.m_iDoclistOffset>m_iLastDoclistPos );
	m_wrDict.ZipOffset ( tEntry.m_uWordID - m_iLastWordID ); // FIXME! slow with 32bit wordids
	m_wrDict.ZipOffset ( tEntry.m_iDoclistOffset - m_iLastDoclistPos );

	m_iLastWordID = tEntry.m_uWordID;
	m_iLastDoclistPos = tEntry.m_iDoclistOffset;

	// doc and hit counts must both be non-zero for a valid entry
	assert ( tEntry.m_iDocs );
	assert ( tEntry.m_iHits );
	m_wrDict.ZipInt ( tEntry.m_iDocs );
	m_wrDict.ZipInt ( tEntry.m_iHits );

	// write skiplist location info, if any
	if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
		m_wrDict.ZipOffset ( tEntry.m_iSkiplistOffset );

	m_iEntries++;
}
22737 
DictEndEntries(SphOffset_t iDoclistOffset)22738 void CSphDiskDictTraits::DictEndEntries ( SphOffset_t iDoclistOffset )
22739 {
22740 	assert ( iDoclistOffset>=m_iLastDoclistPos );
22741 	m_wrDict.ZipInt ( 0 ); // indicate checkpoint
22742 	m_wrDict.ZipOffset ( iDoclistOffset - m_iLastDoclistPos ); // store last doclist length
22743 }
22744 
22745 //////////////////////////////////////////////////////////////////////////
22746 // KEYWORDS STORING DICTIONARY, INFIX HASH BUILDER
22747 //////////////////////////////////////////////////////////////////////////
22748 
/// fixed-size infix key: SIZE dwords treated as a raw byte buffer
/// (2 dwords for the sbcs path, 3 or 5 dwords for utf-8; see sphCreateInfixBuilder)
template < int SIZE >
struct Infix_t
{
	DWORD m_Data[SIZE];

#ifndef NDEBUG
	// debug-only guard byte placed right past m_Data, zeroed on construction
	BYTE m_TrailingZero;

	Infix_t ()
		: m_TrailingZero ( 0 )
	{}
#endif

	// zero-fill the key bytes
	void Reset ()
	{
		for ( int i=0; i<SIZE; i++ )
			m_Data[i] = 0;
	}

	// specialized per SIZE below (unrolled comparisons)
	bool operator == ( const Infix_t<SIZE> & rhs ) const;
};
22770 
22771 
22772 template<>
operator ==(const Infix_t<2> & rhs) const22773 bool Infix_t<2>::operator == ( const Infix_t<2> & rhs ) const
22774 {
22775 	return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1];
22776 }
22777 
22778 
22779 template<>
operator ==(const Infix_t<3> & rhs) const22780 bool Infix_t<3>::operator == ( const Infix_t<3> & rhs ) const
22781 {
22782 	return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1] && m_Data[2]==rhs.m_Data[2];
22783 }
22784 
22785 
22786 template<>
operator ==(const Infix_t<5> & rhs) const22787 bool Infix_t<5>::operator == ( const Infix_t<5> & rhs ) const
22788 {
22789 	return m_Data[0]==rhs.m_Data[0] && m_Data[1]==rhs.m_Data[1] && m_Data[2]==rhs.m_Data[2]
22790 		&& m_Data[3]==rhs.m_Data[3] && m_Data[4]==rhs.m_Data[4];
22791 }
22792 
22793 
/// compact vector of DWORD values (the infix builder stores checkpoint ids in it)
/// starts as up to 4 values packed into a static dword array and transparently
/// switches to a heap-allocated array when the 5th value arrives;
/// m_dData[0] aliases m_iDynLen, so bit 31 of the first dword is the "dynamic"
/// flag, and in static mode bits 24..30 of m_dData[0] hold the length —
/// which limits static values to 24 bits (reads mask with 0xffffff)
/// NOTE(review): no copy-ctor/assignment; copying a dynamic instance would
/// double-free m_pDynData — containers appear to rely on Swap() instead; confirm
struct InfixIntvec_t
{
public:
	// do not change the order of fields in this union - it matters a lot
	union
	{
		DWORD			m_dData[4];		// static storage; [0] packs flag+length+first value
		struct
		{
			int				m_iDynLen;		// bit 31 = dynamic flag, low bits = used length
			int				m_iDynLimit;	// allocated capacity of m_pDynData
			DWORD *			m_pDynData;		// heap-allocated value array
		};
	};

public:
	InfixIntvec_t()
	{
		m_dData[0] = 0;
		m_dData[1] = 0;
		m_dData[2] = 0;
		m_dData[3] = 0;
	}

	~InfixIntvec_t()
	{
		if ( IsDynamic() )
			SafeDeleteArray ( m_pDynData );
	}

	bool IsDynamic() const
	{
		return ( m_dData[0] & 0x80000000UL )!=0;
	}

	/// append a value; a value equal to the current last one is silently dropped
	void Add ( DWORD uVal )
	{
		if ( !m_dData[0] )
		{
			// empty
			m_dData[0] = uVal | ( 1UL<<24 );

		} else if ( !IsDynamic() )
		{
			// 1..4 static entries
			int iLen = m_dData[0] >> 24;
			DWORD uLast = m_dData [ iLen-1 ] & 0xffffffUL;

			// redundant
			if ( uVal==uLast )
				return;

			// grow static part
			if ( iLen<4 )
			{
				m_dData[iLen] = uVal;
				m_dData[0] = ( m_dData[0] & 0xffffffUL ) | ( ++iLen<<24 );
				return;
			}

			// dynamize
			DWORD * pDyn = new DWORD[16];
			pDyn[0] = m_dData[0] & 0xffffffUL;
			pDyn[1] = m_dData[1];
			pDyn[2] = m_dData[2];
			pDyn[3] = m_dData[3];
			pDyn[4] = uVal;
			m_iDynLen = 0x80000005UL; // dynamic flag, len=5
			m_iDynLimit = 16; // limit=16
			m_pDynData = pDyn;

		} else
		{
			// N dynamic entries
			int iLen = m_iDynLen & 0xffffffUL;
			if ( uVal==m_pDynData[iLen-1] )
				return;
			if ( iLen>=m_iDynLimit )
			{
				// grow the dynamic storage twofold
				m_iDynLimit *= 2;
				DWORD * pNew = new DWORD [ m_iDynLimit ];
				for ( int i=0; i<iLen; i++ )
					pNew[i] = m_pDynData[i];
				SafeDeleteArray ( m_pDynData );
				m_pDynData = pNew;
			}

			m_pDynData[iLen] = uVal;
			m_iDynLen++;
		}
	}

	bool operator == ( const InfixIntvec_t & rhs ) const
	{
		// check dynflag, length, maybe first element
		if ( m_dData[0]!=rhs.m_dData[0] )
			return false;

		// check static data
		if ( !IsDynamic() )
		{
			for ( int i=1; i<(int)(m_dData[0]>>24); i++ )
				if ( m_dData[i]!=rhs.m_dData[i] )
					return false;
			return true;
		}

		// check dynamic data
		const DWORD * a = m_pDynData;
		const DWORD * b = rhs.m_pDynData;
		const DWORD * m = a + ( m_iDynLen & 0xffffffUL );
		while ( a<m )
			if ( *a++!=*b++ )
				return false;
		return true;
	}

public:
	int GetLength() const
	{
		if ( !IsDynamic() )
			return m_dData[0] >> 24;
		return m_iDynLen & 0xffffffUL;
	}

	/// read value at iIndex; static values are masked down to their 24 payload bits
	DWORD operator[] ( int iIndex )const
	{
		if ( !IsDynamic() )
			return m_dData[iIndex] & 0xffffffUL;
		return m_pDynData[iIndex];
	}
};
22926 
22927 
Swap(InfixIntvec_t & a,InfixIntvec_t & b)22928 void Swap ( InfixIntvec_t & a, InfixIntvec_t & b )
22929 {
22930 	::Swap ( a.m_dData[0], b.m_dData[0] );
22931 	::Swap ( a.m_dData[1], b.m_dData[1] );
22932 	::Swap ( a.m_dData[2], b.m_dData[2] );
22933 	::Swap ( a.m_dData[3], b.m_dData[3] );
22934 }
22935 
22936 
/// one entry of the infix hash: key, checkpoint list, and chain link
template < int SIZE >
struct InfixHashEntry_t
{
	Infix_t<SIZE>	m_tKey;		///< key, owned by the hash
	InfixIntvec_t	m_tValue;	///< data, owned by the hash
	int				m_iNext;	///< next entry in hash arena (0 terminates the chain)
};
22944 
22945 
/// infix hash builder: collects unique infixes together with the checkpoint
/// ids they occur in, then dumps them to the dictionary via SaveEntries()
template < int SIZE >
class InfixBuilder_c : public ISphInfixBuilder
{
protected:
	static const int							LENGTH = 1048576;	///< hash bucket count (power of two, used as a mask)

protected:
	int											m_dHash [ LENGTH ];		///< all the hash entries
	CSphSwapVector < InfixHashEntry_t<SIZE> >	m_dArena;				///< entry storage; chains are indexes into this arena
	CSphVector<InfixBlock_t>					m_dBlocks;				///< per-block info, filled by SaveEntries()
	CSphTightVector<BYTE>						m_dBlocksWords;			///< first keyword of each block, zero-terminated

public:
					InfixBuilder_c();
	virtual void	AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint, bool bHasMorphology );
	virtual void	SaveEntries ( CSphWriter & wrDict );
	virtual int64_t	SaveEntryBlocks ( CSphWriter & wrDict );
	virtual int		GetBlocksWordsSize () const { return m_dBlocksWords.GetLength(); }

protected:
	/// add new entry
	void AddEntry ( const Infix_t<SIZE> & tKey, DWORD uHash, int iCheckpoint )
	{
		uHash &= ( LENGTH-1 ); // mask the hash down to the bucket count

		// allocate a new arena slot and prepend it to the bucket chain
		int iEntry = m_dArena.GetLength();
		InfixHashEntry_t<SIZE> & tNew = m_dArena.Add();
		tNew.m_tKey = tKey;
		tNew.m_tValue.m_dData[0] = 0x1000000UL | iCheckpoint; // len=1, data=iCheckpoint
		tNew.m_iNext = m_dHash[uHash];
		m_dHash[uHash] = iEntry;
	}

	/// get value pointer by key
	/// on a hit, the entry is moved to the front of its chain (mtf)
	InfixIntvec_t * LookupEntry ( const Infix_t<SIZE> & tKey, DWORD uHash )
	{
		uHash &= ( LENGTH-1 );
		int iEntry = m_dHash [ uHash ];
		int iiEntry = 0; // previous entry in the chain (0 means "none")

		while ( iEntry )
		{
			if ( m_dArena[iEntry].m_tKey==tKey )
			{
				// mtf it, if needed
				if ( iiEntry )
				{
					m_dArena[iiEntry].m_iNext = m_dArena[iEntry].m_iNext;
					m_dArena[iEntry].m_iNext = m_dHash[uHash];
					m_dHash[uHash] = iEntry;
				}
				return &m_dArena[iEntry].m_tValue;
			}
			iiEntry = iEntry;
			iEntry = m_dArena[iEntry].m_iNext;
		}
		return NULL;
	}
};
23005 
23006 
23007 template < int SIZE >
InfixBuilder_c()23008 InfixBuilder_c<SIZE>::InfixBuilder_c()
23009 {
23010 	// init the hash
23011 	for ( int i=0; i<LENGTH; i++ )
23012 		m_dHash[i] = 0;
23013 	m_dArena.Reserve ( 1048576 );
23014 	m_dArena.Resize ( 1 ); // 0 is a reserved index
23015 }
23016 
23017 
/// single-byte case, 2-dword infixes
/// registers every 2..6-byte infix of pWord under checkpoint iCheckpoint
template<>
void InfixBuilder_c<2>::AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint, bool bHasMorphology )
{
	// with morphology enabled, only non-stemmed keyword copies are indexed for infixes
	if ( bHasMorphology && *pWord!=MAGIC_WORD_HEAD_NONSTEMMED )
		return;

	if ( *pWord<0x20 ) // skip heading magic chars, like NONSTEMMED maker
	{
		pWord++;
		iWordLength--;
	}

	// generate all infixes of length 2..6 at every start position
	Infix_t<2> sKey;
	for ( int p=0; p<=iWordLength-2; p++ )
	{
		sKey.Reset();

		BYTE * pKey = (BYTE*)sKey.m_Data;
		const BYTE * s = pWord + p;
		const BYTE * sMax = s + Min ( 6, iWordLength-p );

		// incremental table-driven (crc32-style) hash over the infix bytes
		DWORD uHash = 0xffffffUL ^ g_dSphinxCRC32 [ 0xff ^ *s ];
		*pKey++ = *s++; // copy first infix byte

		while ( s<sMax )
		{
			uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
			*pKey++ = *s++; // copy another infix byte

			// extend an existing checkpoint list, or start a new one
			InfixIntvec_t * pVal = LookupEntry ( sKey, uHash );
			if ( pVal )
				pVal->Add ( iCheckpoint );
			else
				AddEntry ( sKey, uHash, iCheckpoint );
		}
	}
}
23056 
23057 
/// UTF-8 case, 3/5-dword infixes
/// registers every 2..6-codepoint infix of pWord under checkpoint iCheckpoint
template < int SIZE >
void InfixBuilder_c<SIZE>::AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint, bool bHasMorphology )
{
	// with morphology enabled, only non-stemmed keyword copies are indexed for infixes
	if ( bHasMorphology && *pWord!=MAGIC_WORD_HEAD_NONSTEMMED )
		return;

	if ( *pWord<0x20 ) // skip heading magic chars, like NONSTEMMED maker
	{
		pWord++;
		iWordLength--;
	}

	int iCodes = 0; // codepoints in current word
	BYTE dBytes[SPH_MAX_WORD_LEN+1]; // byte offset for each codepoints

	// build an offsets table into the bytestring
	dBytes[0] = 0;
	for ( const BYTE * p = (const BYTE*)pWord; p<pWord+iWordLength && iCodes<SPH_MAX_WORD_LEN; )
	{
		// derive the UTF-8 sequence length from the high bits of the lead byte
		int iLen = 0;
		BYTE uVal = *p;
		while ( uVal & 0x80 )
		{
			uVal <<= 1;
			iLen++;
		}
		if ( !iLen )
			iLen = 1;

		// skip word with large codepoints
		if ( iLen>SIZE )
			return;

		assert ( iLen>=1 && iLen<=4 );
		p += iLen;

		dBytes[iCodes+1] = dBytes[iCodes] + (BYTE)iLen;
		iCodes++;
	}
	assert ( pWord[dBytes[iCodes]]==0 || iCodes==SPH_MAX_WORD_LEN );

	// generate infixes
	Infix_t<SIZE> sKey;
	for ( int p=0; p<=iCodes-2; p++ )
	{
		sKey.Reset();
		BYTE * pKey = (BYTE*)sKey.m_Data;

		const BYTE * s = pWord + dBytes[p];
		const BYTE * sMax = pWord + dBytes[ p+Min ( 6, iCodes-p ) ];

		// copy first infix codepoint
		// NOTE: the continuation-byte loops rely on the word being zero-terminated
		// (see the assert above), so they stop at the terminator at the latest
		DWORD uHash = 0xffffffffUL;
		do
		{
			uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
			*pKey++ = *s++;
		} while ( ( *s & 0xC0 )==0x80 );

		while ( s<sMax )
		{
			// copy next infix codepoint
			do
			{
				uHash = (uHash >> 8) ^ g_dSphinxCRC32 [ (uHash ^ *s) & 0xff ];
				*pKey++ = *s++;
			} while ( ( *s & 0xC0 )==0x80 );

			// extend an existing checkpoint list, or start a new one
			InfixIntvec_t * pVal = LookupEntry ( sKey, uHash );
			if ( pVal )
				pVal->Add ( iCheckpoint );
			else
				AddEntry ( sKey, uHash, iCheckpoint );
		}
	}
}
23135 
23136 
/// ordering functor for infix hash entries:
/// compares arena indexes by raw (bytewise) key content
template < int SIZE >
struct InfixHashCmp_fn
{
	InfixHashEntry_t<SIZE> * m_pBase;	// arena base; not owned

	explicit InfixHashCmp_fn ( InfixHashEntry_t<SIZE> * pBase )
		: m_pBase ( pBase )
	{}

	bool IsLess ( int a, int b ) const
	{
		// strncmp stops at the first NUL, so shorter keys sort before their extensions
		return strncmp ( (const char*)m_pBase[a].m_tKey.m_Data, (const char*)m_pBase[b].m_tKey.m_Data, sizeof(DWORD)*SIZE )<0;
	}
};
23151 
23152 
ZippedIntSize(DWORD v)23153 static inline int ZippedIntSize ( DWORD v )
23154 {
23155 	if ( v < (1UL<<7) )
23156 		return 1;
23157 	if ( v < (1UL<<14) )
23158 		return 2;
23159 	if ( v < (1UL<<21) )
23160 		return 3;
23161 	if ( v < (1UL<<28) )
23162 		return 4;
23163 	return 5;
23164 }
23165 
23166 
23167 static const char * g_sTagInfixEntries = "infix-entries";
23168 
/// sort the collected infix entries and dump them into the dictionary,
/// prefix-compressed within blocks of INFIX_BLOCK_SIZE entries; also records
/// the first keyword and file offset of every block in m_dBlocks/m_dBlocksWords
template < int SIZE >
void InfixBuilder_c<SIZE>::SaveEntries ( CSphWriter & wrDict )
{
	// intentionally local to this function
	// we mark the block end with an editcode of 0
	const int INFIX_BLOCK_SIZE = 64;

	wrDict.PutBytes ( g_sTagInfixEntries, strlen ( g_sTagInfixEntries ) );

	// sort arena entries by raw key bytes (slot 0 is reserved, hence the -1)
	CSphVector<int> dIndex;
	dIndex.Resize ( m_dArena.GetLength()-1 );
	for ( int i=0; i<m_dArena.GetLength()-1; i++ )
		dIndex[i] = i+1;

	InfixHashCmp_fn<SIZE> fnCmp ( m_dArena.Begin() );
	dIndex.Sort ( fnCmp );

	m_dBlocksWords.Reserve ( m_dArena.GetLength()/INFIX_BLOCK_SIZE*sizeof(DWORD)*SIZE );
	int iBlock = 0;
	int iPrevKey = -1;
	ARRAY_FOREACH ( iIndex, dIndex )
	{
		InfixIntvec_t & dData = m_dArena[dIndex[iIndex]].m_tValue;
		const BYTE * sKey = (const BYTE*) m_dArena[dIndex[iIndex]].m_tKey.m_Data;
		int iChars = ( SIZE==2 )
			? strnlen ( (const char*)sKey, sizeof(DWORD)*SIZE )
			: sphUTF8Len ( (const char*)sKey, sizeof(DWORD)*SIZE );
		assert ( iChars>=2 && iChars<int(1 + sizeof ( Infix_t<SIZE> ) ) );

		// keep track of N-infix blocks
		int iAppendBytes = strnlen ( (const char*)sKey, sizeof(DWORD)*SIZE );
		if ( !iBlock )
		{
			// a new block starts here: remember its first key and file offset
			int iOff = m_dBlocksWords.GetLength();
			m_dBlocksWords.Resize ( iOff+iAppendBytes+1 );

			InfixBlock_t & tBlock = m_dBlocks.Add();
			tBlock.m_iInfixOffset = iOff;
			tBlock.m_iOffset = (DWORD)wrDict.GetPos();

			memcpy ( m_dBlocksWords.Begin()+iOff, sKey, iAppendBytes );
			m_dBlocksWords[iOff+iAppendBytes] = '\0';
		}

		// compute max common prefix
		// edit_code = ( num_keep_chars<<4 ) + num_append_chars
		int iEditCode = iChars;
		if ( iPrevKey>=0 )
		{
			const BYTE * sPrev = (const BYTE*) m_dArena[dIndex[iPrevKey]].m_tKey.m_Data;
			const BYTE * sCur = (const BYTE*) sKey;
			const BYTE * sMax = sCur + iAppendBytes;

			int iKeepChars = 0;
			if_const ( SIZE==2 )
			{
				// SBCS path
				while ( sCur<sMax && *sCur && *sCur==*sPrev )
				{
					sCur++;
					sPrev++;
				}
				iKeepChars = (int)( sCur- ( const BYTE* ) sKey );

				assert ( iKeepChars>=0 && iKeepChars<16 );
				assert ( iChars-iKeepChars>=0 );
				assert ( iChars-iKeepChars<16 );

				iEditCode = ( iKeepChars<<4 ) + ( iChars-iKeepChars );
				iAppendBytes = ( iChars-iKeepChars );
				sKey = sCur;

			} else
			{
				// UTF-8 path
				const BYTE * sKeyMax = sCur; // track max matching sPrev prefix in [sKey,sKeyMax)
				while ( sCur<sMax && *sCur && *sCur==*sPrev )
				{
					// current byte matches, move the pointer
					sCur++;
					sPrev++;

					// tricky bit
					// if the next (!) byte is a valid UTF-8 char start (or eof!)
					// then we just matched not just a byte, but a full char
					// so bump the matching prefix boundary and length
					if ( sCur>=sMax || ( *sCur & 0xC0 )!=0x80 )
					{
						sKeyMax = sCur;
						iKeepChars++;
					}
				}

				assert ( iKeepChars>=0 && iKeepChars<16 );
				assert ( iChars-iKeepChars>=0 );
				assert ( iChars-iKeepChars<16 );

				iEditCode = ( iKeepChars<<4 ) + ( iChars-iKeepChars );
				iAppendBytes -= (int)( sKeyMax-sKey );
				sKey = sKeyMax;
			}
		}

		// write edit code, postfix
		wrDict.PutByte ( iEditCode );
		wrDict.PutBytes ( sKey, iAppendBytes );

		// compute data length (checkpoint ids are stored as deltas)
		int iDataLen = ZippedIntSize ( dData[0] );
		for ( int j=1; j<dData.GetLength(); j++ )
			iDataLen += ZippedIntSize ( dData[j] - dData[j-1] );

		// write data length, data
		wrDict.ZipInt ( iDataLen );
		wrDict.ZipInt ( dData[0] );
		for ( int j=1; j<dData.GetLength(); j++ )
			wrDict.ZipInt ( dData[j] - dData[j-1] );

		// mark block end, restart deltas
		iPrevKey = iIndex;
		if ( ++iBlock==INFIX_BLOCK_SIZE )
		{
			iBlock = 0;
			iPrevKey = -1;
			wrDict.PutByte ( 0 );
		}
	}

	// put end marker
	if ( iBlock )
		wrDict.PutByte ( 0 );

	// resolve stored infix offsets into pointers into m_dBlocksWords
	const char * pBlockWords = (const char *)m_dBlocksWords.Begin();
	ARRAY_FOREACH ( i, m_dBlocks )
		m_dBlocks[i].m_sInfix = pBlockWords+m_dBlocks[i].m_iInfixOffset;

	if ( wrDict.GetPos()>UINT_MAX ) // FIXME!!! change to int64
		sphDie ( "INTERNAL ERROR: dictionary size " INT64_FMT " overflow at infix save", wrDict.GetPos() );
}
23308 
23309 
23310 static const char * g_sTagInfixBlocks = "infix-blocks";
23311 
23312 template < int SIZE >
SaveEntryBlocks(CSphWriter & wrDict)23313 int64_t InfixBuilder_c<SIZE>::SaveEntryBlocks ( CSphWriter & wrDict )
23314 {
23315 	// save the blocks
23316 	wrDict.PutBytes ( g_sTagInfixBlocks, strlen ( g_sTagInfixBlocks ) );
23317 
23318 	SphOffset_t iInfixBlocksOffset = wrDict.GetPos();
23319 	assert ( iInfixBlocksOffset<=INT_MAX );
23320 
23321 	wrDict.ZipInt ( m_dBlocks.GetLength() );
23322 	ARRAY_FOREACH ( i, m_dBlocks )
23323 	{
23324 		int iBytes = strlen ( m_dBlocks[i].m_sInfix );
23325 		wrDict.PutByte ( iBytes );
23326 		wrDict.PutBytes ( m_dBlocks[i].m_sInfix, iBytes );
23327 		wrDict.ZipInt ( m_dBlocks[i].m_iOffset ); // maybe delta these on top?
23328 	}
23329 
23330 	return iInfixBlocksOffset;
23331 }
23332 
23333 
sphCreateInfixBuilder(int iCodepointBytes,CSphString * pError)23334 ISphInfixBuilder * sphCreateInfixBuilder ( int iCodepointBytes, CSphString * pError )
23335 {
23336 	assert ( pError );
23337 	*pError = CSphString();
23338 	switch ( iCodepointBytes )
23339 	{
23340 	case 0:		return NULL;
23341 	case 1:		return new InfixBuilder_c<2>(); // upto 6x1 bytes, 2 dwords, sbcs
23342 	case 2:		return new InfixBuilder_c<3>(); // upto 6x2 bytes, 3 dwords, utf-8
23343 	case 3:		return new InfixBuilder_c<5>(); // upto 6x3 bytes, 5 dwords, utf-8
23344 	default:	pError->SetSprintf ( "unhandled max infix codepoint size %d", iCodepointBytes ); return NULL;
23345 	}
23346 }
23347 
23348 //////////////////////////////////////////////////////////////////////////
23349 // KEYWORDS STORING DICTIONARY
23350 //////////////////////////////////////////////////////////////////////////
23351 
/// keywords-storing dictionary: keeps keywords verbatim (the crc is only an
/// internal id while indexing hitblocks), spilling sorted dict blocks to a
/// temp file and merging them in DictEnd()
class CSphDictKeywords : public CSphDictCRC<true>
{
private:
	static const int				SLOTS			= 65536;	///< hash bucket count
	static const int				ENTRY_CHUNK		= 65536;	///< hash entries allocated per chunk
	static const int				KEYWORD_CHUNK	= 1048576;	///< bytes of keyword storage per chunk
	static const int				DICT_CHUNK		= 65536;	///< dict entries per chunk

public:
	// OPTIMIZE? change pointers to 8:24 locators to save RAM on x64 gear?
	struct HitblockKeyword_t
	{
		SphWordID_t					m_uWordid;			// locally unique word id (crc value, adjusted in case of collision)
		HitblockKeyword_t *			m_pNextHash;		// next hashed entry
		char *						m_pKeyword;			// keyword
	};

	// record of a crc collision: the hash entry and the crc it originally had
	struct HitblockException_t
	{
		HitblockKeyword_t *			m_pEntry;			// hash entry
		SphWordID_t					m_uCRC;				// original unadjusted crc

		bool operator < ( const HitblockException_t & rhs ) const
		{
			return m_pEntry->m_uWordid < rhs.m_pEntry->m_uWordid;
		}
	};

	// one final dictionary entry, collected for sorting before the disk write
	struct DictKeyword_t
	{
		char *						m_sKeyword;
		SphOffset_t					m_uOff;
		int							m_iDocs;
		int							m_iHits;
		BYTE						m_uHint;
		int							m_iSkiplistPos;		///< position in .spe file; not exactly likely to hit 2B
	};

	// location of one sorted dict block inside the temp file
	struct DictBlock_t
	{
		SphOffset_t					m_iPos;
		int							m_iLen;
	};

private:
	HitblockKeyword_t *				m_dHash [ SLOTS ];	///< hash by wordid (!)
	CSphVector<HitblockException_t>	m_dExceptions;		///< crc collision exceptions, see HitblockPatch()

	bool							m_bHitblock;		///< should we store words on GetWordID or not
	int								m_iMemUse;			///< current memory use by all the chunks
	int								m_iDictLimit;		///< allowed memory limit for dict block collection

	CSphVector<HitblockKeyword_t*>	m_dEntryChunks;		///< hash chunks, only used when indexing hitblocks
	HitblockKeyword_t *				m_pEntryChunk;		///< current hash chunk write position
	int								m_iEntryChunkFree;	///< entries left in the current hash chunk

	CSphVector<BYTE*>				m_dKeywordChunks;	///< keyword storage
	BYTE *							m_pKeywordChunk;	///< current keyword chunk write position
	int								m_iKeywordChunkFree;	///< bytes left in the current keyword chunk

	CSphVector<DictKeyword_t*>		m_dDictChunks;		///< dict entry chunks, only used when sorting final dict
	DictKeyword_t *					m_pDictChunk;		///< current dict chunk write position
	int								m_iDictChunkFree;	///< entries left in the current dict chunk

	int								m_iTmpFD;			///< temp dict file descriptor
	CSphWriter						m_wrTmpDict;		///< temp dict writer
	CSphVector<DictBlock_t>			m_dDictBlocks;		///< on-disk locations of dict entry blocks

	char							m_sClippedWord[MAX_KEYWORD_BYTES]; ///< keyword storage for clipped word

private:
	SphWordID_t						HitblockGetID ( const char * pWord, int iLen, SphWordID_t uCRC );
	HitblockKeyword_t *				HitblockAddKeyword ( DWORD uHash, const char * pWord, int iLen, SphWordID_t uID );

public:
	explicit				CSphDictKeywords ();
	virtual					~CSphDictKeywords ();

	virtual void			HitblockBegin () { m_bHitblock = true; }
	virtual void			HitblockPatch ( CSphWordHit * pHits, int iHits ) const;
	virtual const char *	HitblockGetKeyword ( SphWordID_t uWordID );
	virtual int				HitblockGetMemUse () { return m_iMemUse; }
	virtual void			HitblockReset ();

	virtual void			DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
	virtual void			DictEntry ( const CSphDictEntry & tEntry );
	virtual void			DictEndEntries ( SphOffset_t ) {}
	virtual bool			DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle );

	virtual SphWordID_t		GetWordID ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDWithMarkers ( BYTE * pWord );
	virtual SphWordID_t		GetWordIDNonStemmed ( BYTE * pWord );
	virtual SphWordID_t		GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops );
	virtual CSphDict *		Clone () const { return CloneBase ( new CSphDictKeywords() ); }

private:
	void					DictFlush ();
};
23450 
23451 //////////////////////////////////////////////////////////////////////////
23452 
/// start with all chunk allocators empty and the wordid hash zeroed
CSphDictKeywords::CSphDictKeywords ()
	: m_bHitblock ( false )
	, m_iMemUse ( 0 )
	, m_iDictLimit ( 0 )
	, m_pEntryChunk ( NULL )
	, m_iEntryChunkFree ( 0 )
	, m_pKeywordChunk ( NULL )
	, m_iKeywordChunkFree ( 0 )
	, m_pDictChunk ( NULL )
	, m_iDictChunkFree ( 0 )
{
	memset ( m_dHash, 0, sizeof(m_dHash) );
}
23466 
CSphDictKeywords::~CSphDictKeywords ()
{
	// frees all hitblock chunks and resets the hash
	HitblockReset();
}
23471 
HitblockReset()23472 void CSphDictKeywords::HitblockReset()
23473 {
23474 	m_dExceptions.Resize ( 0 );
23475 
23476 	ARRAY_FOREACH ( i, m_dEntryChunks )
23477 		SafeDeleteArray ( m_dEntryChunks[i] );
23478 	m_dEntryChunks.Resize ( 0 );
23479 	m_pEntryChunk = NULL;
23480 	m_iEntryChunkFree = 0;
23481 
23482 	ARRAY_FOREACH ( i, m_dKeywordChunks )
23483 		SafeDeleteArray ( m_dKeywordChunks[i] );
23484 	m_dKeywordChunks.Resize ( 0 );
23485 	m_pKeywordChunk = NULL;
23486 	m_iKeywordChunkFree = 0;
23487 
23488 	m_iMemUse = 0;
23489 
23490 	memset ( m_dHash, 0, sizeof(m_dHash) );
23491 }
23492 
/// allocate a new hash entry for the given keyword and link it into bucket uHash;
/// entries and keyword bytes are carved from append-only chunk allocators
CSphDictKeywords::HitblockKeyword_t * CSphDictKeywords::HitblockAddKeyword ( DWORD uHash, const char * sWord, int iLen, SphWordID_t uID )
{
	assert ( iLen<MAX_KEYWORD_BYTES );

	// alloc entry
	if ( !m_iEntryChunkFree )
	{
		m_pEntryChunk = new HitblockKeyword_t [ ENTRY_CHUNK ];
		m_iEntryChunkFree = ENTRY_CHUNK;
		m_dEntryChunks.Add ( m_pEntryChunk );
		m_iMemUse += sizeof(HitblockKeyword_t)*ENTRY_CHUNK;
	}
	HitblockKeyword_t * pEntry = m_pEntryChunk++;
	m_iEntryChunkFree--;

	// alloc keyword
	iLen++; // account for the trailing zero
	if ( m_iKeywordChunkFree < iLen )
	{
		// NOTE: the unused tail of the previous chunk is simply abandoned
		m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
		m_iKeywordChunkFree = KEYWORD_CHUNK;
		m_dKeywordChunks.Add ( m_pKeywordChunk );
		m_iMemUse += KEYWORD_CHUNK;
	}

	// fill it
	// copies iLen+1 bytes of sWord (assumed zero-terminated — TODO confirm
	// against callers), then enforces the terminator in our own storage
	memcpy ( m_pKeywordChunk, sWord, iLen );
	m_pKeywordChunk[iLen-1] = '\0';
	pEntry->m_pKeyword = (char*)m_pKeywordChunk;
	pEntry->m_uWordid = uID;
	m_pKeywordChunk += iLen;
	m_iKeywordChunkFree -= iLen;

	// mtf it
	pEntry->m_pNextHash = m_dHash [ uHash ];
	m_dHash [ uHash ] = pEntry;

	return pEntry;
}
23532 
/// resolve a keyword to its wordid in hitblock (collision-aware) mode
/// normally wordid==crc; when distinct keywords share a crc, each colliding
/// keyword gets a unique wordid (crc, crc+1, ...) and all of them are recorded
/// in m_dExceptions so HitblockPatch() can later reorder their hits
SphWordID_t CSphDictKeywords::HitblockGetID ( const char * sWord, int iLen, SphWordID_t uCRC )
{
	if ( iLen>=MAX_KEYWORD_BYTES-4 ) // fix of very long word (zones)
	{
		memcpy ( m_sClippedWord, sWord, MAX_KEYWORD_BYTES-4 );
		memset ( m_sClippedWord+MAX_KEYWORD_BYTES-4, 0, 4 );

		CSphString sOrig;
		sOrig.SetBinary ( sWord, iLen );
		sphWarn ( "word overrun buffer, clipped!!!\n"
			"clipped (len=%d, word='%s')\noriginal (len=%d, word='%s')",
			MAX_KEYWORD_BYTES-4, m_sClippedWord, iLen, sOrig.cstr() );

		// proceed with the clipped word; crc must be recomputed to match
		sWord = m_sClippedWord;
		iLen = MAX_KEYWORD_BYTES-4;
		uCRC = sphCRC32 ( m_sClippedWord, MAX_KEYWORD_BYTES-4 );
	}

	// is this a known one? find it
	// OPTIMIZE? in theory we could use something faster than crc32; but quick lookup3 test did not show any improvements
	const DWORD uHash = (DWORD)( uCRC % SLOTS );

	HitblockKeyword_t * pEntry = m_dHash [ uHash ];
	HitblockKeyword_t ** ppEntry = &m_dHash [ uHash ];
	while ( pEntry )
	{
		// check crc
		if ( pEntry->m_uWordid!=uCRC )
		{
			// crc mismatch, try next entry
			ppEntry = &pEntry->m_pNextHash;
			pEntry = pEntry->m_pNextHash;
			continue;
		}

		// crc matches, check keyword
		int iWordLen = iLen;
		const char * a = pEntry->m_pKeyword;
		const char * b = sWord;
		while ( *a==*b && iWordLen-- )
		{
			if ( !*a || !iWordLen )
			{
				// known word, mtf it, and return id
				(*ppEntry) = pEntry->m_pNextHash;
				pEntry->m_pNextHash = m_dHash [ uHash ];
				m_dHash [ uHash ] = pEntry;
				return pEntry->m_uWordid;
			}
			a++;
			b++;
		}

		// collision detected!
		// our crc is taken as a wordid, but keyword does not match
		// welcome to the land of very tricky magic
		//
		// pEntry might either be a known exception, or a regular keyword
		// sWord might either be a known exception, or a new one
		// if they are not known, they needed to be added as exceptions now
		//
		// in case sWord is new, we need to assign a new unique wordid
		// for that, we keep incrementing the crc until it is unique
		// a starting point for wordid search loop would be handy
		//
		// let's scan the exceptions vector and work on all this
		//
		// NOTE, beware of the order, it is wordid asc, which does NOT guarantee crc asc
		// example, assume crc(w1)==X, crc(w2)==X+1, crc(w3)==X (collides with w1)
		// wordids will be X, X+1, X+2 but crcs will be X, X+1, X
		//
		// OPTIMIZE, might make sense to use binary search
		// OPTIMIZE, add early out somehow
		SphWordID_t uWordid = uCRC + 1;
		const int iExcLen = m_dExceptions.GetLength();
		int iExc = m_dExceptions.GetLength();
		ARRAY_FOREACH ( i, m_dExceptions )
		{
			const HitblockKeyword_t * pExcWord = m_dExceptions[i].m_pEntry;

			// incoming word is a known exception? just return the pre-assigned wordid
			if ( m_dExceptions[i].m_uCRC==uCRC && strncmp ( pExcWord->m_pKeyword, sWord, iLen )==0 )
				return pExcWord->m_uWordid;

			// incoming word collided into a known exception? clear the matched entry; no need to re-add it (see below)
			if ( pExcWord==pEntry )
				pEntry = NULL;

			// find first exception with wordid greater or equal to our candidate
			if ( pExcWord->m_uWordid>=uWordid && iExc==iExcLen )
				iExc = i;
		}

		// okay, this is a new collision
		// if entry was a regular word, we have to add it
		if ( pEntry )
		{
			m_dExceptions.Add();
			m_dExceptions.Last().m_pEntry = pEntry;
			m_dExceptions.Last().m_uCRC = uCRC;
		}

		// need to assign a new unique wordid now
		// keep scanning both exceptions and keywords for collisions
		for ( ;; )
		{
			// iExc must be either the first exception greater or equal to current candidate, or out of bounds
			assert ( iExc==iExcLen || m_dExceptions[iExc].m_pEntry->m_uWordid>=uWordid );
			assert ( iExc==0 || m_dExceptions[iExc-1].m_pEntry->m_uWordid<uWordid );

			// candidate collides with a known exception? increment it, and keep looking
			if ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid==uWordid )
			{
				uWordid++;
				while ( iExc<iExcLen && m_dExceptions[iExc].m_pEntry->m_uWordid<uWordid )
					iExc++;
				continue;
			}

			// candidate collides with a keyword? must be a regular one; add it as an exception, and keep looking
			HitblockKeyword_t * pCheck = m_dHash [ (DWORD)( uWordid % SLOTS ) ];
			while ( pCheck )
			{
				if ( pCheck->m_uWordid==uWordid )
					break;
				pCheck = pCheck->m_pNextHash;
			}

			// no collisions; we've found our unique wordid!
			if ( !pCheck )
				break;

			// got a collision; add it
			HitblockException_t & tColl = m_dExceptions.Add();
			tColl.m_pEntry = pCheck;
			tColl.m_uCRC = pCheck->m_uWordid; // not a known exception; hence, wordid must equal crc
			// and keep looking
			uWordid++;
			continue;
		}

		// and finally, we have that precious new wordid
		// so hash our new unique under its new unique adjusted wordid
		pEntry = HitblockAddKeyword ( (DWORD)( uWordid % SLOTS ), sWord, iLen, uWordid );

		// add it as a collision too
		m_dExceptions.Add();
		m_dExceptions.Last().m_pEntry = pEntry;
		m_dExceptions.Last().m_uCRC = uCRC;

		// keep exceptions list sorted by wordid
		m_dExceptions.Sort();

		return pEntry->m_uWordid;
	}

	// new keyword with unique crc
	pEntry = HitblockAddKeyword ( uHash, sWord, iLen, uCRC );
	return pEntry->m_uWordid;
}
23694 
/// dict keyword entry tagged with the temp-dict block it was read from
/// (used by the k-way merge in CSphDictKeywords::DictEnd)
struct DictKeywordTagged_t : public CSphDictKeywords::DictKeyword_t
{
	int m_iBlock;	// source block (bin) index
};
23699 
23700 struct DictKeywordTaggedCmp_fn
23701 {
IsLessDictKeywordTaggedCmp_fn23702 	static inline bool IsLess ( const DictKeywordTagged_t & a, const DictKeywordTagged_t & b )
23703 	{
23704 		return strcmp ( a.m_sKeyword, b.m_sKeyword ) < 0;
23705 	}
23706 };
23707 
/// read one serialized dict entry from a temp-dict sort block into tEntry
/// pKeyword is a caller-owned buffer (MAX_KEYWORD_BYTES) receiving the text;
/// on read failure the bin's error flag stays raised and tEntry is undefined
static void DictReadEntry ( CSphBin * pBin, DictKeywordTagged_t & tEntry, BYTE * pKeyword )
{
	int iKeywordLen = pBin->ReadByte ();
	if ( iKeywordLen<0 )
	{
		// early eof or read error; flag must be raised
		assert ( pBin->IsError() );
		return;
	}

	assert ( iKeywordLen>0 && iKeywordLen<MAX_KEYWORD_BYTES-1 );
	// NOTE(review): ReadBytes() appears to return a status enum elsewhere;
	// confirm its error values are actually negative, or this check never fires
	if ( pBin->ReadBytes ( pKeyword, iKeywordLen )<0 )
	{
		assert ( pBin->IsError() );
		return;
	}
	pKeyword[iKeywordLen] = '\0';

	// fixed part: keyword, doclist offset, docs, hits, doclist size hint
	tEntry.m_sKeyword = (char*)pKeyword;
	tEntry.m_uOff = pBin->UnzipOffset();
	tEntry.m_iDocs = pBin->UnzipInt();
	tEntry.m_iHits = pBin->UnzipInt();
	tEntry.m_uHint = (BYTE) pBin->ReadByte();
	// skiplist position is only serialized for keywords that have a skiplist
	if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
		tEntry.m_iSkiplistPos = pBin->UnzipInt();
	else
		tEntry.m_iSkiplistPos = 0;
}
23736 
/// start a dictionary building session
/// tTempDict receives sorted keyword blocks (see DictFlush); tDict is the
/// final dictionary file; iDictLimit caps in-RAM accumulation before a flush
void CSphDictKeywords::DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle )
{
	m_iTmpFD = tTempDict.GetFD();
	m_wrTmpDict.CloseFile ();
	m_wrTmpDict.SetFile ( tTempDict, NULL, m_sWriterError );
	m_wrTmpDict.SetThrottle ( pThrottle );

	m_wrDict.CloseFile ();
	m_wrDict.SetFile ( tDict, NULL, m_sWriterError );
	m_wrDict.SetThrottle ( pThrottle );
	m_wrDict.PutByte ( 1 ); // leading byte; presumably a format marker — confirm against the reader side

	m_iDictLimit = Max ( iDictLimit, KEYWORD_CHUNK + DICT_CHUNK*(int)sizeof(DictKeyword_t) ); // can't use less than 1 chunk
}
23751 
23752 
/// finalize the dictionary: k-way merge all temp-dict blocks into the final
/// dict file, emitting entries, checkpoints, and (optional) infix index data
/// returns false and sets sError on any read or write failure
bool CSphDictKeywords::DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle )
{
	DictFlush ();
	m_wrTmpDict.CloseFile (); // tricky: file is not owned, so it won't get closed, and iTmpFD won't get invalidated

	if ( !m_dDictBlocks.GetLength() )
		m_wrDict.CloseFile();

	if ( m_wrTmpDict.IsError() || m_wrDict.IsError() )
	{
		sError.SetSprintf ( "dictionary write error (out of space?)" );
		return false;
	}

	// empty dict: emit a zero-checkpoints header and we are done
	if ( !m_dDictBlocks.GetLength() )
	{
		pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos ();
		pHeader->m_iDictCheckpoints = 0;
		return true;
	}

	// infix builder, if needed
	ISphInfixBuilder * pInfixer = sphCreateInfixBuilder ( pHeader->m_iInfixCodepointBytes, &sError );
	if ( !sError.IsEmpty() )
	{
		SafeDelete ( pInfixer );
		return false;
	}

	// initialize readers
	CSphVector<CSphBin*> dBins ( m_dDictBlocks.GetLength() );

	int iMaxBlock = 0;
	ARRAY_FOREACH ( i, m_dDictBlocks )
		iMaxBlock = Max ( iMaxBlock, m_dDictBlocks[i].m_iLen );

	iMemLimit = Max ( iMemLimit, iMaxBlock*m_dDictBlocks.GetLength() );
	int iBinSize = CSphBin::CalcBinSize ( iMemLimit, m_dDictBlocks.GetLength(), "sort_dict" );

	SphOffset_t iSharedOffset = -1;
	ARRAY_FOREACH ( i, m_dDictBlocks )
	{
		dBins[i] = new CSphBin();
		dBins[i]->m_iFileLeft = m_dDictBlocks[i].m_iLen;
		dBins[i]->m_iFilePos = m_dDictBlocks[i].m_iPos;
		dBins[i]->Init ( m_iTmpFD, &iSharedOffset, iBinSize );
		dBins[i]->SetThrottle ( pThrottle );
	}

	// keywords storage (one MAX_KEYWORD_BYTES slot per bin)
	BYTE * pKeywords = new BYTE [ MAX_KEYWORD_BYTES*dBins.GetLength() ];

	#define LOC_CLEANUP() \
		{ \
			ARRAY_FOREACH ( iIdx, dBins ) \
				SafeDelete ( dBins[iIdx] ); \
			SafeDeleteArray ( pKeywords ); \
			SafeDelete ( pInfixer ); \
		}

	// do the sort
	CSphQueue < DictKeywordTagged_t, DictKeywordTaggedCmp_fn > qWords ( dBins.GetLength() );
	DictKeywordTagged_t tEntry;

	// prime the merge queue with the first entry of every block
	ARRAY_FOREACH ( i, dBins )
	{
		DictReadEntry ( dBins[i], tEntry, pKeywords + i*MAX_KEYWORD_BYTES );
		if ( dBins[i]->IsError() )
		{
			sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", i, dBins.GetLength() );
			LOC_CLEANUP();
			return false;
		}

		tEntry.m_iBlock = i;
		qWords.Push ( tEntry );
	}

	bool bHasMorphology = HasMorphology();
	CSphKeywordDeltaWriter tLastKeyword;
	int iWords = 0;
	while ( qWords.GetLength() )
	{
		const DictKeywordTagged_t & tWord = qWords.Root();
		const int iLen = strlen ( tWord.m_sKeyword ); // OPTIMIZE?

		// store checkpoints as needed
		if ( ( iWords % SPH_WORDLIST_CHECKPOINT )==0 )
		{
			// emit a checkpoint, unless we're at the very dict beginning
			if ( iWords )
			{
				m_wrDict.ZipInt ( 0 );
				m_wrDict.ZipInt ( 0 );
			}

			BYTE * sClone = new BYTE [ iLen+1 ]; // OPTIMIZE? pool these?
			memcpy ( sClone, tWord.m_sKeyword, iLen+1 );
			sClone[iLen] = '\0';

			CSphWordlistCheckpoint & tCheckpoint = m_dCheckpoints.Add ();
			tCheckpoint.m_sWord = (char*) sClone;
			tCheckpoint.m_iWordlistOffset = m_wrDict.GetPos();

			tLastKeyword.Reset();
		}
		iWords++;

		// write final dict entry
		assert ( iLen );
		assert ( tWord.m_uOff );
		assert ( tWord.m_iDocs );
		assert ( tWord.m_iHits );

		tLastKeyword.PutDelta ( m_wrDict, (const BYTE *)tWord.m_sKeyword, iLen );
		m_wrDict.ZipOffset ( tWord.m_uOff );
		m_wrDict.ZipInt ( tWord.m_iDocs );
		m_wrDict.ZipInt ( tWord.m_iHits );
		if ( tWord.m_uHint )
			m_wrDict.PutByte ( tWord.m_uHint );
		if ( tWord.m_iDocs > SPH_SKIPLIST_BLOCK )
			m_wrDict.ZipInt ( tWord.m_iSkiplistPos );

		// build infixes
		if ( pInfixer )
			pInfixer->AddWord ( (const BYTE*)tWord.m_sKeyword, iLen, m_dCheckpoints.GetLength(), bHasMorphology );

		// next: refill the queue from the bin we just consumed from
		int iBin = tWord.m_iBlock;
		qWords.Pop ();

		if ( !dBins[iBin]->IsDone() )
		{
			DictReadEntry ( dBins[iBin], tEntry, pKeywords + iBin*MAX_KEYWORD_BYTES );
			if ( dBins[iBin]->IsError() )
			{
				sError.SetSprintf ( "entry read error in dictionary sort (bin %d of %d)", iBin, dBins.GetLength() );
				LOC_CLEANUP();
				return false;
			}

			tEntry.m_iBlock = iBin;
			qWords.Push ( tEntry );
		}
	}

	// end of dictionary block
	m_wrDict.ZipInt ( 0 );
	m_wrDict.ZipInt ( 0 );

	// flush infix hash entries, if any
	if ( pInfixer )
		pInfixer->SaveEntries ( m_wrDict );

	// flush wordlist checkpoints (blocks)
	pHeader->m_iDictCheckpointsOffset = m_wrDict.GetPos();
	pHeader->m_iDictCheckpoints = m_dCheckpoints.GetLength();

	ARRAY_FOREACH ( i, m_dCheckpoints )
	{
		const int iLen = strlen ( m_dCheckpoints[i].m_sWord );

		assert ( m_dCheckpoints[i].m_iWordlistOffset>0 );
		assert ( iLen>0 && iLen<MAX_KEYWORD_BYTES );

		m_wrDict.PutDword ( iLen );
		m_wrDict.PutBytes ( m_dCheckpoints[i].m_sWord, iLen );
		m_wrDict.PutOffset ( m_dCheckpoints[i].m_iWordlistOffset );

		SafeDeleteArray ( m_dCheckpoints[i].m_sWord );
	}

	// flush infix hash blocks
	if ( pInfixer )
	{
		pHeader->m_iInfixBlocksOffset = pInfixer->SaveEntryBlocks ( m_wrDict );
		pHeader->m_iInfixBlocksWordsSize = pInfixer->GetBlocksWordsSize();
		if ( pHeader->m_iInfixBlocksOffset>UINT_MAX ) // FIXME!!! change to int64
			sphDie ( "INTERNAL ERROR: dictionary size " INT64_FMT " overflow at dictend save", pHeader->m_iInfixBlocksOffset );
	}

	// flush header
	// mostly for debugging convenience
	// primary storage is in the index wide header
	m_wrDict.PutBytes ( "dict-header", 11 );
	m_wrDict.ZipInt ( pHeader->m_iDictCheckpoints );
	m_wrDict.ZipOffset ( pHeader->m_iDictCheckpointsOffset );
	m_wrDict.ZipInt ( pHeader->m_iInfixCodepointBytes );
	m_wrDict.ZipInt ( (DWORD)pHeader->m_iInfixBlocksOffset );

	// about it
	LOC_CLEANUP();
	#undef LOC_CLEANUP

	m_wrDict.CloseFile ();
	if ( m_wrDict.IsError() )
		sError.SetSprintf ( "dictionary write error (out of space?)" );
	return !m_wrDict.IsError();
}
23952 
23953 struct DictKeywordCmp_fn
23954 {
IsLessDictKeywordCmp_fn23955 	inline bool IsLess ( CSphDictKeywords::DictKeyword_t * a, CSphDictKeywords::DictKeyword_t * b ) const
23956 	{
23957 		return strcmp ( a->m_sKeyword, b->m_sKeyword ) < 0;
23958 	}
23959 };
23960 
/// sort the accumulated in-RAM dict entries by keyword and append them to the
/// temp dict file as one block (recorded in m_dDictBlocks for DictEnd);
/// releases all dict and keyword chunk pools afterwards; noop when empty
void CSphDictKeywords::DictFlush ()
{
	if ( !m_dDictChunks.GetLength() )
		return;
	assert ( m_dDictChunks.GetLength() && m_dKeywordChunks.GetLength() );

	// sort em
	int iTotalWords = m_dDictChunks.GetLength()*DICT_CHUNK - m_iDictChunkFree;
	CSphVector<DictKeyword_t*> dWords ( iTotalWords );

	int iIdx = 0;
	ARRAY_FOREACH ( i, m_dDictChunks )
	{
		int iWords = DICT_CHUNK;
		if ( i==m_dDictChunks.GetLength()-1 )
			iWords -= m_iDictChunkFree; // last chunk may be partially filled

		DictKeyword_t * pWord = m_dDictChunks[i];
		for ( int j=0; j<iWords; j++ )
			dWords[iIdx++] = pWord++;
	}

	dWords.Sort ( DictKeywordCmp_fn() );

	// write em
	DictBlock_t & tBlock = m_dDictBlocks.Add();
	tBlock.m_iPos = m_wrTmpDict.GetPos ();

	// serialization format mirrors DictReadEntry
	ARRAY_FOREACH ( i, dWords )
	{
		const DictKeyword_t * pWord = dWords[i];
		int iLen = strlen ( pWord->m_sKeyword );
		m_wrTmpDict.PutByte ( iLen );
		m_wrTmpDict.PutBytes ( pWord->m_sKeyword, iLen );
		m_wrTmpDict.ZipOffset ( pWord->m_uOff );
		m_wrTmpDict.ZipInt ( pWord->m_iDocs );
		m_wrTmpDict.ZipInt ( pWord->m_iHits );
		m_wrTmpDict.PutByte ( pWord->m_uHint );
		assert ( ( pWord->m_iDocs > SPH_SKIPLIST_BLOCK )==( pWord->m_iSkiplistPos!=0 ) );
		if ( pWord->m_iDocs > SPH_SKIPLIST_BLOCK )
			m_wrTmpDict.ZipInt ( pWord->m_iSkiplistPos );
	}

	tBlock.m_iLen = (int)( m_wrTmpDict.GetPos() - tBlock.m_iPos );

	// clean up buffers
	ARRAY_FOREACH ( i, m_dDictChunks )
		SafeDeleteArray ( m_dDictChunks[i] );
	m_dDictChunks.Resize ( 0 );
	m_pDictChunk = NULL;
	m_iDictChunkFree = 0;

	ARRAY_FOREACH ( i, m_dKeywordChunks )
		SafeDeleteArray ( m_dKeywordChunks[i] );
	m_dKeywordChunks.Resize ( 0 );
	m_pKeywordChunk = NULL;
	m_iKeywordChunkFree = 0;

	m_iMemUse = 0;
}
24021 
/// accumulate one final dictionary entry in RAM
/// automatically triggers DictFlush() when adding the entry or its keyword
/// text would exceed the memory budget (m_iDictLimit)
void CSphDictKeywords::DictEntry ( const CSphDictEntry & tEntry )
{
	// they say, this might just happen during merge
	// FIXME! can we make merge avoid sending such keywords to dict and assert here?
	if ( !tEntry.m_iDocs )
		return;

	assert ( tEntry.m_iHits );
	assert ( tEntry.m_iDoclistLength>0 );

	DictKeyword_t * pWord = NULL;
	int iLen = strlen ( (char*)tEntry.m_sKeyword ) + 1; // including the trailing zero

	// loop so that a flush inside the keyword-chunk branch can restart the
	// allocation from scratch ("continue" re-runs both alloc checks)
	for ( ;; )
	{
		// alloc dict entry
		if ( !m_iDictChunkFree )
		{
			if ( m_iDictLimit && ( m_iMemUse + (int)sizeof(DictKeyword_t)*DICT_CHUNK )>m_iDictLimit )
				DictFlush ();

			m_pDictChunk = new DictKeyword_t [ DICT_CHUNK ];
			m_iDictChunkFree = DICT_CHUNK;
			m_dDictChunks.Add ( m_pDictChunk );
			m_iMemUse += sizeof(DictKeyword_t)*DICT_CHUNK;
		}

		// alloc keyword
		if ( m_iKeywordChunkFree < iLen )
		{
			if ( m_iDictLimit && ( m_iMemUse + KEYWORD_CHUNK )>m_iDictLimit )
			{
				DictFlush ();
				continue; // because we just flushed pWord
			}

			m_pKeywordChunk = new BYTE [ KEYWORD_CHUNK ];
			m_iKeywordChunkFree = KEYWORD_CHUNK;
			m_dKeywordChunks.Add ( m_pKeywordChunk );
			m_iMemUse += KEYWORD_CHUNK;
		}
		// aw kay
		break;
	}

	// grab an entry slot and copy the keyword text into the pool
	pWord = m_pDictChunk++;
	m_iDictChunkFree--;
	pWord->m_sKeyword = (char*)m_pKeywordChunk;
	memcpy ( m_pKeywordChunk, tEntry.m_sKeyword, iLen );
	m_pKeywordChunk[iLen-1] = '\0';
	m_pKeywordChunk += iLen;
	m_iKeywordChunkFree -= iLen;

	pWord->m_uOff = tEntry.m_iDoclistOffset;
	pWord->m_iDocs = tEntry.m_iDocs;
	pWord->m_iHits = tEntry.m_iHits;
	pWord->m_uHint = sphDoclistHintPack ( tEntry.m_iDocs, tEntry.m_iDoclistLength );
	pWord->m_iSkiplistPos = 0;
	if ( tEntry.m_iDocs > SPH_SKIPLIST_BLOCK )
		pWord->m_iSkiplistPos = (int)( tEntry.m_iSkiplistOffset );
}
24083 
GetWordID(BYTE * pWord)24084 SphWordID_t CSphDictKeywords::GetWordID ( BYTE * pWord )
24085 {
24086 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord );
24087 	if ( !uCRC || !m_bHitblock )
24088 		return uCRC;
24089 
24090 	int iLen = strlen ( (const char *)pWord );
24091 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
24092 }
24093 
GetWordIDWithMarkers(BYTE * pWord)24094 SphWordID_t CSphDictKeywords::GetWordIDWithMarkers ( BYTE * pWord )
24095 {
24096 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDWithMarkers ( pWord );
24097 	if ( !uCRC || !m_bHitblock )
24098 		return uCRC;
24099 
24100 	int iLen = strlen ( (const char *)pWord );
24101 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
24102 }
24103 
GetWordIDNonStemmed(BYTE * pWord)24104 SphWordID_t CSphDictKeywords::GetWordIDNonStemmed ( BYTE * pWord )
24105 {
24106 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordIDNonStemmed ( pWord );
24107 	if ( !uCRC || !m_bHitblock )
24108 		return uCRC;
24109 
24110 	int iLen = strlen ( (const char *)pWord );
24111 	return HitblockGetID ( (const char *)pWord, iLen, uCRC );
24112 }
24113 
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)24114 SphWordID_t CSphDictKeywords::GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
24115 {
24116 	SphWordID_t uCRC = CSphDictCRC<true>::GetWordID ( pWord, iLen, bFilterStops );
24117 	if ( !uCRC || !m_bHitblock )
24118 		return uCRC;
24119 
24120 	return HitblockGetID ( (const char *)pWord, iLen, uCRC ); // !COMMIT would break, we kind of strcmp inside; but must never get called?
24121 }
24122 
24123 /// binary search for the first hit with wordid greater than or equal to reference
FindFirstGte(CSphWordHit * pHits,int iHits,SphWordID_t uID)24124 static CSphWordHit * FindFirstGte ( CSphWordHit * pHits, int iHits, SphWordID_t uID )
24125 {
24126 	if ( pHits->m_uWordID==uID )
24127 		return pHits;
24128 
24129 	CSphWordHit * pL = pHits;
24130 	CSphWordHit * pR = pHits + iHits - 1;
24131 	if ( pL->m_uWordID > uID || pR->m_uWordID < uID )
24132 		return NULL;
24133 
24134 	while ( pR-pL!=1 )
24135 	{
24136 		CSphWordHit * pM = pL + ( pR-pL )/2;
24137 		if ( pM->m_uWordID < uID )
24138 			pL = pM;
24139 		else
24140 			pR = pM;
24141 	}
24142 
24143 	assert ( pR-pL==1 );
24144 	assert ( pL->m_uWordID<uID );
24145 	assert ( pR->m_uWordID>=uID );
24146 	return pR;
24147 }
24148 
24149 /// full crc and keyword check
FullIsLess(const CSphDictKeywords::HitblockException_t & a,const CSphDictKeywords::HitblockException_t & b)24150 static inline bool FullIsLess ( const CSphDictKeywords::HitblockException_t & a, const CSphDictKeywords::HitblockException_t & b )
24151 {
24152 	if ( a.m_uCRC!=b.m_uCRC )
24153 		return a.m_uCRC < b.m_uCRC;
24154 	return strcmp ( a.m_pEntry->m_pKeyword, b.m_pEntry->m_pKeyword ) < 0;
24155 }
24156 
/// sort functor to compute collided hits reordering
/// compares indices into an exceptions span by the full (crc, keyword) order
struct HitblockPatchSort_fn
{
	const CSphDictKeywords::HitblockException_t * m_pExc;	// base of the exception span being sorted

	explicit HitblockPatchSort_fn ( const CSphDictKeywords::HitblockException_t * pExc )
		: m_pExc ( pExc )
	{}

	bool IsLess ( int a, int b ) const
	{
		return FullIsLess ( m_pExc[a], m_pExc[b] );
	}
};
24171 
/// do hit block patching magic
/// reorder the hit spans of crc-colliding keywords from the "wordid asc" order
/// they were emitted in to the final "crc asc, keyword asc" dictionary order
void CSphDictKeywords::HitblockPatch ( CSphWordHit * pHits, int iHits ) const
{
	if ( !pHits || iHits<=0 )
		return;

	const CSphVector<HitblockException_t> & dExc = m_dExceptions; // shortcut
	CSphVector<CSphWordHit*> dChunk;

	// reorder hit chunks for exceptions (aka crc collisions)
	for ( int iFirst = 0; iFirst < dExc.GetLength()-1; )
	{
		// find next span of collisions, iFirst inclusive, iMax exclusive ie. [iFirst,iMax)
		// (note that exceptions array is always sorted)
		SphWordID_t uFirstWordid = dExc[iFirst].m_pEntry->m_uWordid;
		assert ( dExc[iFirst].m_uCRC==uFirstWordid );

		int iMax = iFirst+1;
		SphWordID_t uSpan = uFirstWordid+1;
		while ( iMax < dExc.GetLength() && dExc[iMax].m_pEntry->m_uWordid==uSpan )
		{
			iMax++;
			uSpan++;
		}

		// check whether they are in proper order already
		bool bSorted = true;
		for ( int i=iFirst; i<iMax-1 && bSorted; i++ )
			if ( FullIsLess ( dExc[i+1], dExc[i] ) )
				bSorted = false;

		// order is ok; skip this span
		if ( bSorted )
		{
			iFirst = iMax;
			continue;
		}

		// we need to fix up these collision hits
		// convert them from arbitrary "wordid asc" to strict "crc asc, keyword asc" order
		// lets begin with looking up hit chunks for every wordid
		dChunk.Resize ( iMax-iFirst+1 );

		// find the end
		dChunk.Last() = FindFirstGte ( pHits, iHits, uFirstWordid+iMax-iFirst );
		if ( !dChunk.Last() )
		{
			assert ( iMax==dExc.GetLength() && pHits[iHits-1].m_uWordID==uFirstWordid+iMax-1-iFirst );
			dChunk.Last() = pHits+iHits;
		}

		// find the start
		dChunk[0] = FindFirstGte ( pHits, dChunk.Last()-pHits, uFirstWordid );
		assert ( dChunk[0] && dChunk[0]->m_uWordID==uFirstWordid );

		// find the chunk starts
		for ( int i=1; i<dChunk.GetLength()-1; i++ )
		{
			dChunk[i] = FindFirstGte ( dChunk[i-1], dChunk.Last()-dChunk[i-1], uFirstWordid+i );
			assert ( dChunk[i] && dChunk[i]->m_uWordID==uFirstWordid+i );
		}

		CSphWordHit * pTemp;
		if ( iMax-iFirst==2 )
		{
			// most frequent case, just two collisions
			// OPTIMIZE? allocate buffer for the smaller chunk, not just first chunk
			pTemp = new CSphWordHit [ dChunk[1]-dChunk[0] ];
			memcpy ( pTemp, dChunk[0], ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
			memmove ( dChunk[0], dChunk[1], ( dChunk[2]-dChunk[1] )*sizeof(CSphWordHit) );
			memcpy ( dChunk[0] + ( dChunk[2]-dChunk[1] ), pTemp, ( dChunk[1]-dChunk[0] )*sizeof(CSphWordHit) );
		} else
		{
			// generic case, more than two
			CSphVector<int> dReorder ( iMax-iFirst );
			ARRAY_FOREACH ( i, dReorder )
				dReorder[i] = i;

			HitblockPatchSort_fn fnSort ( &dExc[iFirst] );
			dReorder.Sort ( fnSort );

			// OPTIMIZE? could skip heading and trailing blocks that are already in position
			pTemp = new CSphWordHit [ dChunk.Last()-dChunk[0] ];
			CSphWordHit * pOut = pTemp;

			// gather chunks into the temp buffer in the final order
			ARRAY_FOREACH ( i, dReorder )
			{
				int iChunk = dReorder[i];
				int iChunkHits = dChunk[iChunk+1] - dChunk[iChunk];
				memcpy ( pOut, dChunk[iChunk], iChunkHits*sizeof(CSphWordHit) );
				pOut += iChunkHits;
			}

			assert ( ( pOut-pTemp )==( dChunk.Last()-dChunk[0] ) );
			memcpy ( dChunk[0], pTemp, ( dChunk.Last()-dChunk[0] )*sizeof(CSphWordHit) );
		}

		// patching done
		SafeDeleteArray ( pTemp );
		iFirst = iMax;
	}
}
24274 
HitblockGetKeyword(SphWordID_t uWordID)24275 const char * CSphDictKeywords::HitblockGetKeyword ( SphWordID_t uWordID )
24276 {
24277 	const DWORD uHash = (DWORD)( uWordID % SLOTS );
24278 
24279 	HitblockKeyword_t * pEntry = m_dHash [ uHash ];
24280 	while ( pEntry )
24281 	{
24282 		// check crc
24283 		if ( pEntry->m_uWordid!=uWordID )
24284 		{
24285 			// crc mismatch, try next entry
24286 			pEntry = pEntry->m_pNextHash;
24287 			continue;
24288 		}
24289 
24290 		return pEntry->m_pKeyword;
24291 	}
24292 
24293 	assert ( m_dExceptions.GetLength() );
24294 	ARRAY_FOREACH ( i, m_dExceptions )
24295 		if ( m_dExceptions[i].m_pEntry->m_uWordid==uWordID )
24296 			return m_dExceptions[i].m_pEntry->m_pKeyword;
24297 
24298 	sphWarning ( "hash missing value in operator [] (wordid=" INT64_FMT ", hash=%d)", (int64_t)uWordID, uHash );
24299 	assert ( 0 && "hash missing value in operator []" );
24300 	return "\31oops";
24301 }
24302 
24303 //////////////////////////////////////////////////////////////////////////
24304 // KEYWORDS STORING DICTIONARY
24305 //////////////////////////////////////////////////////////////////////////
24306 
24307 class CRtDictKeywords : public ISphRtDictWraper
24308 {
24309 private:
24310 	CSphDict *				m_pBase;
24311 	SmallStringHash_T<int>	m_hKeywords;
24312 	CSphVector<BYTE>		m_dPackedKeywords;
24313 
24314 	CSphString				m_sWarning;
24315 	int						m_iKeywordsOverrun;
24316 
24317 public:
	/// wrap a base dictionary; the ids this wrapper hands out are offsets
	/// into the packed keywords blob rather than crc hashes
	explicit CRtDictKeywords ( CSphDict * pBase )
		: m_pBase ( pBase )
		, m_iKeywordsOverrun ( 0 )
	{
		m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
	}
	// NOTE(review): m_pBase is never deleted here, so ownership appears to
	// stay with the caller — confirm its lifetime against the call sites
	virtual ~CRtDictKeywords() {}
24325 
GetWordID(BYTE * pWord)24326 	virtual SphWordID_t GetWordID ( BYTE * pWord )
24327 	{
24328 		SphWordID_t uCRC = m_pBase->GetWordID ( pWord );
24329 		if ( uCRC )
24330 			return AddKeyword ( pWord );
24331 		else
24332 			return 0;
24333 	}
24334 
GetWordIDWithMarkers(BYTE * pWord)24335 	virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord )
24336 	{
24337 		SphWordID_t uCRC = m_pBase->GetWordIDWithMarkers ( pWord );
24338 		if ( uCRC )
24339 			return AddKeyword ( pWord );
24340 		else
24341 			return 0;
24342 	}
24343 
GetWordIDNonStemmed(BYTE * pWord)24344 	virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord )
24345 	{
24346 		SphWordID_t uCRC = m_pBase->GetWordIDNonStemmed ( pWord );
24347 		if ( uCRC )
24348 			return AddKeyword ( pWord );
24349 		else
24350 			return 0;
24351 	}
24352 
GetWordID(const BYTE * pWord,int iLen,bool bFilterStops)24353 	virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops )
24354 	{
24355 		SphWordID_t uCRC = m_pBase->GetWordID ( pWord, iLen, bFilterStops );
24356 		if ( uCRC )
24357 			return AddKeyword ( pWord );
24358 		else
24359 			return 0;
24360 	}
24361 
	// expose the packed blob (1-byte length followed by raw keyword bytes per entry)
	virtual const BYTE * GetPackedKeywords () { return m_dPackedKeywords.Begin(); }
	// total size of the packed keywords blob, in bytes (including the sentinel byte)
	virtual int GetPackedLen () { return m_dPackedKeywords.GetLength(); }
ResetKeywords()24364 	virtual void ResetKeywords()
24365 	{
24366 		m_dPackedKeywords.Resize ( 0 );
24367 		m_dPackedKeywords.Add ( 0 ); // avoid zero offset at all costs
24368 		m_hKeywords.Reset();
24369 	}
24370 
	/// intern the keyword into the packed blob and return its offset there;
	/// that offset is what this wrapper uses as the wordid (never 0, thanks
	/// to the sentinel byte added in the constructor)
	SphWordID_t AddKeyword ( const BYTE * pWord )
	{
		CSphString sWord;
		int iLen = strlen ( (const char *)pWord );
		// stemmer might squeeze out the word
		if ( !iLen )
			return 0;

		// fix of very long word (zones)
		if ( iLen>=( SPH_MAX_WORD_LEN*3 ) )
		{
			int iClippedLen = SPH_MAX_WORD_LEN*3;
			sWord.SetBinary ( (const char *)pWord, iClippedLen );
			// NOTE(review): these branches look inverted — the FIRST overrun
			// (m_iKeywordsOverrun==0) produces the ", clipped=..." fragment
			// while later ones get the full message, and SetSprintf replaces
			// rather than appends; confirm the intended warning wording
			if ( m_iKeywordsOverrun )
			{
				m_sWarning.SetSprintf ( "word overrun buffer, clipped!!! clipped='%s', length=%d(%d)", sWord.cstr(), iClippedLen, iLen );
			} else
			{
				m_sWarning.SetSprintf ( ", clipped='%s', length=%d(%d)", sWord.cstr(), iClippedLen, iLen );
			}
			iLen = iClippedLen;
			m_iKeywordsOverrun++;
		} else
		{
			sWord.SetBinary ( (const char *)pWord, iLen );
		}

		// already interned? reuse the existing offset
		int * pOff = m_hKeywords ( sWord );
		if ( pOff )
		{
			return *pOff;
		}

		// append as (1-byte length, raw bytes) and remember the offset
		int iOff = m_dPackedKeywords.GetLength();
		m_dPackedKeywords.Resize ( iOff+iLen+1 );
		m_dPackedKeywords[iOff] = (BYTE)( iLen & 0xFF );
		memcpy ( m_dPackedKeywords.Begin()+iOff+1, pWord, iLen );

		m_hKeywords.Add ( iOff, sWord );

		return iOff;
	}
24413 
	// the rest of the CSphDict interface is forwarded verbatim to the wrapped dictionary
	virtual void LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) { m_pBase->LoadStopwords ( sFiles, pTokenizer ); }
	virtual void LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) { m_pBase->LoadStopwords ( dStopwords ); }
	virtual void WriteStopwords ( CSphWriter & tWriter ) { m_pBase->WriteStopwords ( tWriter ); }
	virtual bool LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pBase->LoadWordforms ( dFiles, pEmbedded, pTokenizer, sIndex ); }
	virtual void WriteWordforms ( CSphWriter & tWriter ) { m_pBase->WriteWordforms ( tWriter ); }
	virtual int SetMorphology ( const char * szMorph, CSphString & sMessage ) { return m_pBase->SetMorphology ( szMorph, sMessage ); }
	virtual void Setup ( const CSphDictSettings & tSettings ) { m_pBase->Setup ( tSettings ); }
	virtual const CSphDictSettings & GetSettings () const { return m_pBase->GetSettings(); }
	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pBase->GetStopwordsFileInfos(); }
	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pBase->GetWordformsFileInfos(); }
	virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pBase->GetMultiWordforms(); }
	virtual bool IsStopWord ( const BYTE * pWord ) const { return m_pBase->IsStopWord ( pWord ); }
	// warning handling is local to the wrapper: the clipping warning is only
	// reported while the overrun counter is non-zero
	virtual const char * GetLastWarning() const { return m_iKeywordsOverrun ? m_sWarning.cstr() : NULL; }
	virtual void ResetWarning () { m_iKeywordsOverrun = 0; }
	virtual uint64_t GetSettingsFNV () const { return m_pBase->GetSettingsFNV(); }
	virtual void SetApplyMorph ( bool bApply ) { m_pBase->SetApplyMorph ( bApply ); }
24430 };
24431 
sphCreateRtKeywordsDictionaryWrapper(CSphDict * pBase)24432 ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase )
24433 {
24434 	return new CRtDictKeywords ( pBase );
24435 }
24436 
24437 
24438 //////////////////////////////////////////////////////////////////////////
24439 // DICTIONARY FACTORIES
24440 //////////////////////////////////////////////////////////////////////////
24441 
SetupDictionary(CSphDict * pDict,const CSphDictSettings & tSettings,const CSphEmbeddedFiles * pFiles,const ISphTokenizer * pTokenizer,const char * sIndex,CSphString & sError)24442 static CSphDict * SetupDictionary ( CSphDict * pDict, const CSphDictSettings & tSettings,
24443 	const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
24444 	CSphString & sError )
24445 {
24446 	assert ( pTokenizer );
24447 	assert ( pDict );
24448 
24449 	pDict->Setup ( tSettings );
24450 	int iRet = pDict->SetMorphology ( tSettings.m_sMorphology.cstr (), sError );
24451 	if ( iRet==CSphDict::ST_ERROR )
24452 	{
24453 		SafeDelete ( pDict );
24454 		return NULL;
24455 	}
24456 
24457 	if ( pFiles && pFiles->m_bEmbeddedStopwords )
24458 		pDict->LoadStopwords ( pFiles->m_dStopwords );
24459 	else
24460 		pDict->LoadStopwords ( tSettings.m_sStopwords.cstr (), pTokenizer );
24461 
24462 	pDict->LoadWordforms ( tSettings.m_dWordforms, pFiles && pFiles->m_bEmbeddedWordforms ? pFiles : NULL, pTokenizer, sIndex );
24463 
24464 	return pDict;
24465 }
24466 
sphCreateDictionaryTemplate(const CSphDictSettings & tSettings,const CSphEmbeddedFiles * pFiles,const ISphTokenizer * pTokenizer,const char * sIndex,CSphString & sError)24467 CSphDict * sphCreateDictionaryTemplate ( const CSphDictSettings & tSettings,
24468 									const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
24469 									CSphString & sError )
24470 {
24471 	CSphDict * pDict = new CSphDictTemplate();
24472 	if ( !pDict )
24473 		return NULL;
24474 	return SetupDictionary ( pDict, tSettings, pFiles, pTokenizer, sIndex, sError );
24475 }
24476 
24477 
sphCreateDictionaryCRC(const CSphDictSettings & tSettings,const CSphEmbeddedFiles * pFiles,const ISphTokenizer * pTokenizer,const char * sIndex,CSphString & sError)24478 CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings,
24479 	const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
24480 	CSphString & sError )
24481 {
24482 	CSphDict * pDict = NULL;
24483 	if ( tSettings.m_bCrc32 )
24484 		pDict = new CSphDictCRC<true> ();
24485 	else
24486 		pDict = new CSphDictCRC<false> ();
24487 	if ( !pDict )
24488 		return NULL;
24489 	return SetupDictionary ( pDict, tSettings, pFiles, pTokenizer, sIndex, sError );
24490 }
24491 
24492 
sphCreateDictionaryKeywords(const CSphDictSettings & tSettings,const CSphEmbeddedFiles * pFiles,const ISphTokenizer * pTokenizer,const char * sIndex,CSphString & sError)24493 CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings,
24494 	const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex,
24495 	CSphString & sError )
24496 {
24497 	CSphDict * pDict = new CSphDictKeywords();
24498 	return SetupDictionary ( pDict, tSettings, pFiles, pTokenizer, sIndex, sError );
24499 }
24500 
24501 
// free all cached wordform containers at shutdown;
// sweeping against an empty file list discards every cached container
void sphShutdownWordforms ()
{
	CSphVector<CSphSavedFile> dEmptyFiles;
	CSphDiskDictTraits::SweepWordformContainers ( dEmptyFiles );
}
24507 
24508 /////////////////////////////////////////////////////////////////////////////
24509 // HTML STRIPPER
24510 /////////////////////////////////////////////////////////////////////////////
24511 
sphIsTag(int c)24512 static inline int sphIsTag ( int c )
24513 {
24514 	return sphIsAlpha(c) || c=='.' || c==':';
24515 }
24516 
// true for characters that may open an HTML tag name:
// ASCII letters of either case, underscore, dot and colon
static inline int sphIsTagStart ( int c )
{
	if ( c=='_' || c=='.' || c==':' )
		return 1;
	return ( ( c>='a' && c<='z' ) || ( c>='A' && c<='Z' ) ) ? 1 : 0;
}
24521 
CSphHTMLStripper(bool bDefaultTags)24522 CSphHTMLStripper::CSphHTMLStripper ( bool bDefaultTags )
24523 {
24524 	if ( bDefaultTags )
24525 	{
24526 		// known inline tags
24527 		const char * dKnown[] =
24528 		{
24529 			"a", "b", "i", "s", "u",
24530 			"basefont", "big", "em", "font", "img",
24531 			"label", "small", "span", "strike", "strong",
24532 			"sub\0", "sup\0", // fix gcc 3.4.3 on solaris10 compiler bug
24533 			"tt"
24534 		};
24535 
24536 		m_dTags.Resize ( sizeof(dKnown)/sizeof(dKnown[0]) );
24537 		ARRAY_FOREACH ( i, m_dTags )
24538 		{
24539 			m_dTags[i].m_sTag = dKnown[i];
24540 			m_dTags[i].m_iTagLen = strlen ( dKnown[i] );
24541 			m_dTags[i].m_bInline = true;
24542 		}
24543 	}
24544 
24545 	UpdateTags ();
24546 }
24547 
24548 
GetCharIndex(int iCh) const24549 int CSphHTMLStripper::GetCharIndex ( int iCh ) const
24550 {
24551 	if ( iCh>='a' && iCh<='z' ) return iCh-'a';
24552 	if ( iCh>='A' && iCh<='Z' ) return iCh-'A';
24553 	if ( iCh=='_' ) return 26;
24554 	if ( iCh==':' ) return 27;
24555 	return -1;
24556 }
24557 
24558 
UpdateTags()24559 void CSphHTMLStripper::UpdateTags ()
24560 {
24561 	m_dTags.Sort ();
24562 
24563 	for ( int i=0; i<MAX_CHAR_INDEX; i++ )
24564 	{
24565 		m_dStart[i] = INT_MAX;
24566 		m_dEnd[i] = -1;
24567 	}
24568 
24569 	ARRAY_FOREACH ( i, m_dTags )
24570 	{
24571 		int iIdx = GetCharIndex ( m_dTags[i].m_sTag.cstr()[0] );
24572 		if ( iIdx<0 )
24573 			continue;
24574 
24575 		m_dStart[iIdx] = Min ( m_dStart[iIdx], i );
24576 		m_dEnd[iIdx] = Max ( m_dEnd[iIdx], i );
24577 	}
24578 }
24579 
24580 
// Parse the html_index_attrs style config string and mark the listed tag
// attributes for indexing. The expected grammar is roughly:
//   tag = attr [, attr ...] [; tag = attr ...]
// with arbitrary whitespace between tokens. Unknown tags are added to the tag
// list. Returns false and fills sError on a syntax error; an empty or NULL
// config is a successful no-op.
bool CSphHTMLStripper::SetIndexedAttrs ( const char * sConfig, CSphString & sError )
{
	if ( !sConfig || !*sConfig )
		return true;

	char sTag[256], sAttr[256];

	const char * p = sConfig, * s;
	#define LOC_ERROR(_msg,_pos) { sError.SetSprintf ( "SetIndexedAttrs(): %s near '%s'", _msg, _pos ); return false; }

	while ( *p )
	{
		// skip spaces
		while ( *p && isspace(*p) ) p++;
		if ( !*p ) break;

		// check tag name
		s = p; while ( sphIsTag(*p) ) p++;
		if ( s==p ) LOC_ERROR ( "invalid character in tag name", s );

		// get tag name
		if ( p-s>=(int)sizeof(sTag) ) LOC_ERROR ( "tag name too long", s );
		strncpy ( sTag, s, p-s );
		sTag[p-s] = '\0';

		// skip spaces
		while ( *p && isspace(*p) ) p++;
		if ( *p++!='=' ) LOC_ERROR ( "'=' expected", p-1 );

		// add indexed tag entry, if not there yet
		strlwr ( sTag );

		// linear scan is fine here; tag lists are short and this runs at setup only
		int iIndexTag = -1;
		ARRAY_FOREACH ( i, m_dTags )
			if ( m_dTags[i].m_sTag==sTag )
		{
			iIndexTag = i;
			break;
		}
		if ( iIndexTag<0 )
		{
			m_dTags.Add();
			m_dTags.Last().m_sTag = sTag;
			m_dTags.Last().m_iTagLen = strlen ( sTag );
			iIndexTag = m_dTags.GetLength()-1;
		}

		m_dTags[iIndexTag].m_bIndexAttrs = true;
		CSphVector<CSphString> & dAttrs = m_dTags[iIndexTag].m_dAttrs;

		// scan attributes
		while ( *p )
		{
			// skip spaces
			while ( *p && isspace(*p) ) p++;
			if ( !*p ) break;

			// check attr name
			s = p; while ( sphIsTag(*p) ) p++;
			if ( s==p ) LOC_ERROR ( "invalid character in attribute name", s );

			// get attr name
			if ( p-s>=(int)sizeof(sAttr) ) LOC_ERROR ( "attribute name too long", s );
			strncpy ( sAttr, s, p-s );
			sAttr[p-s] = '\0';

			// add attr, if not there yet
			int iAttr;
			for ( iAttr=0; iAttr<dAttrs.GetLength(); iAttr++ )
				if ( dAttrs[iAttr]==sAttr )
					break;

			if ( iAttr==dAttrs.GetLength() )
				dAttrs.Add ( sAttr );

			// skip spaces
			while ( *p && isspace(*p) ) p++;
			if ( !*p ) break;

			// check if there's next attr or tag
			if ( *p==',' ) { p++; continue; } // next attr
			if ( *p==';' ) { p++; break; } // next tag
			LOC_ERROR ( "',' or ';' or end of line expected", p );
		}
	}

	#undef LOC_ERROR

	// rebuild the first-character lookup buckets to include any new tags
	UpdateTags ();
	return true;
}
24672 
24673 
// Parse the html_remove_elements style config string: a list of tag names
// separated by any non-tag characters. Each listed tag is flagged so that its
// entire contents get removed while stripping; unknown tags are registered
// first. Never fails (the error-string argument is unused); an empty or NULL
// config is a no-op.
bool CSphHTMLStripper::SetRemovedElements ( const char * sConfig, CSphString & )
{
	if ( !sConfig || !*sConfig )
		return true;

	const char * p = sConfig;
	while ( *p )
	{
		// skip separators
		while ( *p && !sphIsTag(*p) ) p++;
		if ( !*p ) break;

		// get tag name
		const char * s = p;
		while ( sphIsTag(*p) ) p++;

		CSphString sTag;
		sTag.SetBinary ( s, p-s );
		sTag.ToLower ();

		// mark it
		int iTag;
		for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
			if ( m_dTags[iTag].m_sTag==sTag )
		{
			m_dTags[iTag].m_bRemove = true;
			break;
		}

		// not known yet; register the tag and flag it for removal
		if ( iTag==m_dTags.GetLength() )
		{
			m_dTags.Add();
			m_dTags.Last().m_sTag = sTag;
			m_dTags.Last().m_iTagLen = strlen ( sTag.cstr() );
			m_dTags.Last().m_bRemove = true;
		}
	}

	// rebuild the first-character lookup buckets to include any new tags
	UpdateTags ();
	return true;
}
24715 
24716 
EnableParagraphs()24717 void CSphHTMLStripper::EnableParagraphs ()
24718 {
24719 	// known block-level elements
24720 	const char * dBlock[] = { "address", "blockquote", "caption", "center",
24721 		"dd", "div", "dl", "dt", "h1", "h2", "h3", "h4", "h5", "li", "menu",
24722 		"ol", "p", "pre", "table", "tbody", "td", "tfoot", "th", "thead",
24723 		"tr", "ul", NULL };
24724 
24725 	for ( int iBlock=0; dBlock[iBlock]; iBlock++ )
24726 	{
24727 		const char * sTag = dBlock[iBlock];
24728 
24729 		// mark if known already
24730 		int iTag;
24731 		for ( iTag=0; iTag<m_dTags.GetLength(); iTag++ )
24732 			if ( m_dTags[iTag].m_sTag==sTag )
24733 		{
24734 			m_dTags[iTag].m_bPara = true;
24735 			break;
24736 		}
24737 
24738 		// add if not known yet
24739 		if ( iTag==m_dTags.GetLength() )
24740 		{
24741 			StripperTag_t& dTag = m_dTags.Add();
24742 			dTag.m_sTag = sTag;
24743 			dTag.m_iTagLen = strlen(sTag);
24744 			dTag.m_bPara = true;
24745 		}
24746 	}
24747 
24748 	UpdateTags ();
24749 }
24750 
24751 
// Parse the index_zones config directive and register zone tags.
// Grammar: index_zones = {tagname | prefix*} [, ...]
// A trailing '*' makes the entry a prefix match. Returns false and fills
// sError on a syntax error; an empty or NULL config is a successful no-op.
bool CSphHTMLStripper::SetZones ( const char * sZones, CSphString & sError )
{
	// yet another mini parser!
	// index_zones = {tagname | prefix*} [, ...]
	if ( !sZones || !*sZones )
		return true;

	const char * s = sZones;
	while ( *s )
	{
		// skip spaces
		while ( sphIsSpace(*s) )
			s++;
		if ( !*s )
			break;

		// expect ident
		if ( !sphIsTagStart(*s) )
		{
			sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
			return false;
		}

		// get ident (either tagname or prefix*)
		const char * sTag = s;
		while ( sphIsTag(*s) )
			s++;

		const char * sTagEnd = s;
		bool bPrefix = false;
		if ( *s=='*' )
		{
			s++;
			bPrefix = true;
		}

		// skip spaces
		while ( sphIsSpace(*s) )
			s++;

		// expect eof or comma after ident
		if ( *s && *s!=',' )
		{
			sError.SetSprintf ( "unexpected char near '%s' in index_zones", s );
			return false;
		}
		if ( *s==',' )
			s++;

		// got valid entry, handle it
		CSphHTMLStripper::StripperTag_t & tTag = m_dTags.Add();
		tTag.m_sTag.SetBinary ( sTag, sTagEnd-sTag );
		tTag.m_iTagLen = (int)( sTagEnd-sTag );
		tTag.m_bZone = true;
		tTag.m_bZonePrefix = bPrefix;
	}

	// rebuild the first-character lookup buckets to include the new zone tags
	UpdateTags ();
	return true;
}
24812 
24813 
// Skip over a quoted HTML attribute value. p must point at the opening quote
// (apostrophe or double quote); returns the position just past the matching
// closing quote. If no closing quote turns up within 512 bytes, falls back to
// the first '>' or '\r' seen (the probable end of the tag), or to wherever
// scanning stopped.
const BYTE * SkipQuoted ( const BYTE * p )
{
	BYTE cQuote = *p++; // either apostrophe or quote
	const BYTE * pLimit = p + 511; // 512 bytes total should be enough for a reasonable HTML attribute value, right?!
	const BYTE * pGuess = NULL; // (most) probable end location in case we don't find a matching quote

	for ( ; p<pLimit && *p && *p!=cQuote; p++ )
		if ( !pGuess && ( *p=='>' || *p=='\r' ) )
			pGuess = p;

	if ( *p==cQuote )
		return p+1;

	return pGuess ? pGuess : p;
}
24836 
24837 
// a single named HTML entity: its name (without the '&' and ';' delimiters)
// and the Unicode code point it decodes to
struct HtmlEntity_t
{
	const char *	m_sName;
	int				m_iCode;
};
24843 
24844 
// Perfect-hash function for the known HTML entity names (gperf-generated;
// do not hand-edit the table). Combines the length with per-character weights
// from asso_values; names that are not in the set hash to values whose
// lengthtable/wordlist entries will fail the check in HtmlEntityLookup().
static inline DWORD HtmlEntityHash ( const BYTE * str, int len )
{
	static const unsigned short asso_values[] =
	{
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 4,
		6, 22, 1, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 170, 48, 0, 5, 44,
		0, 10, 10, 86, 421, 7, 0, 1, 42, 93,
		41, 421, 0, 5, 8, 14, 421, 421, 5, 11,
		8, 421, 421, 421, 421, 421, 421, 1, 25, 27,
		9, 2, 113, 82, 14, 3, 179, 1, 81, 91,
		12, 0, 1, 180, 56, 17, 5, 31, 60, 7,
		3, 161, 2, 3, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421, 421, 421, 421,
		421, 421, 421, 421, 421, 421, 421
	};

	int hval = len;
	// NOTE: the case fallthrough below is intentional (standard gperf output);
	// longer names accumulate weights for characters 4, 2, 1 and 0
	switch ( hval )
	{
		default:	hval += asso_values [ str[4] ];
		case 4:
		case 3:		hval += asso_values [ str[2] ];
		case 2:		hval += asso_values [ str[1]+1 ];
		case 1:		hval += asso_values [ str[0] ];
					break;
	}
	// the last character always contributes
	return hval + asso_values [ str[len-1] ];
}
24889 
24890 
// Look up a named HTML entity (name without '&' and ';') and return its
// Unicode code point, or 0 when the name is unknown. This is gperf-generated
// perfect-hash lookup code: HtmlEntityHash() maps the name to a slot, then the
// candidate's length and bytes are verified. Do not hand-edit the tables.
// NOTE(review): "nbsp" deliberately maps to a plain space (32), not U+00A0 —
// presumably so it tokenizes as a separator; confirm before "fixing".
static inline int HtmlEntityLookup ( const BYTE * str, int len )
{
	// expected name length for each hash slot; 0 marks an empty slot
	static const unsigned char lengthtable[] =
	{
		0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 3,
		4, 3, 3, 5, 3, 6, 5, 5, 3, 4, 4, 5, 3, 4,
		4, 0, 5, 4, 5, 6, 5, 6, 4, 5, 3, 3, 5, 0,
		0, 0, 0, 6, 0, 5, 5, 0, 5, 6, 6, 3, 0, 3,
		5, 3, 0, 6, 0, 4, 3, 6, 3, 6, 6, 6, 6, 5,
		5, 5, 5, 5, 5, 2, 6, 4, 0, 6, 3, 3, 3, 0,
		4, 5, 4, 4, 4, 3, 7, 4, 3, 6, 2, 3, 6, 4,
		3, 6, 5, 6, 5, 5, 4, 2, 0, 0, 4, 6, 8, 0,
		0, 0, 5, 5, 0, 6, 6, 2, 2, 4, 4, 6, 6, 4,
		4, 5, 6, 2, 3, 4, 6, 5, 0, 2, 0, 0, 6, 6,
		6, 6, 6, 4, 6, 5, 0, 6, 4, 5, 4, 6, 6, 0,
		0, 4, 6, 5, 6, 0, 6, 4, 5, 6, 5, 6, 4, 0,
		3, 6, 0, 4, 4, 4, 5, 4, 6, 0, 4, 4, 6, 5,
		6, 7, 2, 2, 6, 2, 5, 2, 5, 0, 0, 0, 4, 4,
		2, 4, 2, 2, 4, 0, 4, 4, 4, 5, 5, 0, 3, 7,
		5, 0, 5, 6, 5, 0, 6, 0, 6, 0, 4, 6, 4, 6,
		6, 2, 6, 0, 5, 5, 4, 6, 6, 0, 5, 6, 4, 4,
		4, 4, 0, 5, 0, 5, 0, 4, 5, 4, 0, 4, 4, 4,
		0, 0, 0, 4, 0, 0, 0, 5, 6, 5, 3, 0, 0, 6,
		5, 4, 5, 5, 5, 5, 0, 5, 5, 0, 5, 0, 0, 0,
		4, 6, 0, 3, 0, 5, 5, 0, 0, 3, 6, 5, 0, 4,
		0, 0, 0, 0, 5, 7, 5, 3, 5, 3, 0, 0, 6, 0,
		6, 0, 0, 7, 0, 0, 5, 0, 5, 0, 0, 0, 0, 5,
		4, 0, 0, 0, 0, 0, 7, 4, 0, 0, 3, 0, 0, 0,
		3, 0, 6, 0, 0, 7, 5, 5, 0, 3, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 5,
		5, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0,
		4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		5
	};

	// entity name + code point for each hash slot; {"", 0} marks an empty slot
	static const struct HtmlEntity_t wordlist[] =
	{
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"Rho", 929},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"Chi", 935},
		{"phi", 966},
		{"iota", 953},
		{"psi", 968},
		{"int", 8747},
		{"theta", 952},
		{"amp", 38},
		{"there4", 8756},
		{"Theta", 920},
		{"omega", 969},
		{"and", 8743},
		{"prop", 8733},
		{"ensp", 8194},
		{"image", 8465},
		{"not", 172},
		{"isin", 8712},
		{"sdot", 8901},
		{"", 0},
		{"prime", 8242},
		{"prod", 8719},
		{"trade", 8482},
		{"Scaron", 352},
		{"kappa", 954},
		{"thinsp", 8201},
		{"emsp", 8195},
		{"thorn", 254},
		{"eta", 951},
		{"chi", 967},
		{"Kappa", 922},
		{"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"scaron", 353},
		{"", 0},
		{"notin", 8713},
		{"ndash", 8211},
		{"", 0},
		{"acute", 180},
		{"otilde", 245},
		{"atilde", 227},
		{"Phi", 934},
		{"", 0},
		{"Psi", 936},
		{"pound", 163},
		{"cap", 8745},
		{"", 0},
		{"otimes", 8855},
		{"", 0},
		{"nbsp", 32},
		{"rho", 961},
		{"ntilde", 241},
		{"eth", 240},
		{"oacute", 243},
		{"aacute", 225},
		{"eacute", 233},
		{"iacute", 237},
		{"nabla", 8711},
		{"Prime", 8243},
		{"ocirc", 244},
		{"acirc", 226},
		{"ecirc", 234},
		{"icirc", 238},
		{"or", 8744},
		{"Yacute", 221},
		{"nsub", 8836},
		{"", 0},
		{"Uacute", 218},
		{"Eta", 919},
		{"ETH", 208},
		{"sup", 8835},
		{"", 0},
		{"supe", 8839},
		{"Ucirc", 219},
		{"sup1", 185},
		{"para", 182},
		{"sup2", 178},
		{"loz", 9674},
		{"omicron", 959},
		{"part", 8706},
		{"cup", 8746},
		{"Ntilde", 209},
		{"Mu", 924},
		{"tau", 964},
		{"uacute", 250},
		{"Iota", 921},
		{"Tau", 932},
		{"rsaquo", 8250},
		{"alpha", 945},
		{"Ccedil", 199},
		{"ucirc", 251},
		{"oline", 8254},
		{"sup3", 179},
		{"nu", 957},
		{"", 0}, {"", 0},
		{"sube", 8838},
		{"Eacute", 201},
		{"thetasym", 977},
		{"", 0}, {"", 0}, {"", 0},
		{"Omega", 937},
		{"Ecirc", 202},
		{"", 0},
		{"lowast", 8727},
		{"iquest", 191},
		{"lt", 60},
		{"gt", 62},
		{"ordm", 186},
		{"euro", 8364},
		{"oslash", 248},
		{"lsaquo", 8249},
		{"zeta", 950},
		{"cong", 8773},
		{"mdash", 8212},
		{"ccedil", 231},
		{"ne", 8800},
		{"sub", 8834},
		{"Zeta", 918},
		{"Lambda", 923},
		{"Gamma", 915},
		{"", 0},
		{"Nu", 925},
		{"", 0}, {"", 0},
		{"ograve", 242},
		{"agrave", 224},
		{"egrave", 232},
		{"igrave", 236},
		{"frac14", 188},
		{"ordf", 170},
		{"Otilde", 213},
		{"infin", 8734},
		{"", 0},
		{"frac12", 189},
		{"beta", 946},
		{"radic", 8730},
		{"darr", 8595},
		{"Iacute", 205},
		{"Ugrave", 217},
		{"", 0}, {"", 0},
		{"harr", 8596},
		{"hearts", 9829},
		{"Icirc", 206},
		{"Oacute", 211},
		{"", 0},
		{"frac34", 190},
		{"cent", 162},
		{"crarr", 8629},
		{"curren", 164},
		{"Ocirc", 212},
		{"brvbar", 166},
		{"sect", 167},
		{"", 0},
		{"ang", 8736},
		{"ugrave", 249},
		{"", 0},
		{"Beta", 914},
		{"uarr", 8593},
		{"dArr", 8659},
		{"asymp", 8776},
		{"perp", 8869},
		{"Dagger", 8225},
		{"", 0},
		{"hArr", 8660},
		{"rang", 9002},
		{"dagger", 8224},
		{"exist", 8707},
		{"Egrave", 200},
		{"Omicron", 927},
		{"mu", 956},
		{"pi", 960},
		{"weierp", 8472},
		{"xi", 958},
		{"clubs", 9827},
		{"Xi", 926},
		{"aring", 229},
		{"", 0}, {"", 0}, {"", 0},
		{"copy", 169},
		{"uArr", 8657},
		{"ni", 8715},
		{"rarr", 8594},
		{"le", 8804},
		{"ge", 8805},
		{"zwnj", 8204},
		{"", 0},
		{"apos", 39},
		{"macr", 175},
		{"lang", 9001},
		{"gamma", 947},
		{"Delta", 916},
		{"", 0},
		{"uml", 168},
		{"alefsym", 8501},
		{"delta", 948},
		{"", 0},
		{"bdquo", 8222},
		{"lambda", 955},
		{"equiv", 8801},
		{"", 0},
		{"Oslash", 216},
		{"", 0},
		{"hellip", 8230},
		{"", 0},
		{"rArr", 8658},
		{"Atilde", 195},
		{"larr", 8592},
		{"spades", 9824},
		{"Igrave", 204},
		{"Pi", 928},
		{"yacute", 253},
		{"", 0},
		{"diams", 9830},
		{"sbquo", 8218},
		{"fnof", 402},
		{"Ograve", 210},
		{"plusmn", 177},
		{"", 0},
		{"rceil", 8969},
		{"Aacute", 193},
		{"ouml", 246},
		{"auml", 228},
		{"euml", 235},
		{"iuml", 239},
		{"", 0},
		{"Acirc", 194},
		{"", 0},
		{"rdquo", 8221},
		{"", 0},
		{"lArr", 8656},
		{"rsquo", 8217},
		{"Yuml", 376},
		{"", 0},
		{"quot", 34},
		{"Uuml", 220},
		{"bull", 8226},
		{"", 0}, {"", 0}, {"", 0},
		{"real", 8476},
		{"", 0}, {"", 0}, {"", 0},
		{"lceil", 8968},
		{"permil", 8240},
		{"upsih", 978},
		{"sum", 8721},
		{"", 0}, {"", 0},
		{"divide", 247},
		{"raquo", 187},
		{"uuml", 252},
		{"ldquo", 8220},
		{"Alpha", 913},
		{"szlig", 223},
		{"lsquo", 8216},
		{"", 0},
		{"Sigma", 931},
		{"tilde", 732},
		{"", 0},
		{"THORN", 222},
		{"", 0}, {"", 0}, {"", 0},
		{"Euml", 203},
		{"rfloor", 8971},
		{"", 0},
		{"lrm", 8206},
		{"", 0},
		{"sigma", 963},
		{"iexcl", 161},
		{"", 0}, {"", 0},
		{"deg", 176},
		{"middot", 183},
		{"laquo", 171},
		{"", 0},
		{"circ", 710},
		{"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"frasl", 8260},
		{"epsilon", 949},
		{"oplus", 8853},
		{"yen", 165},
		{"micro", 181},
		{"piv", 982},
		{"", 0}, {"", 0},
		{"lfloor", 8970},
		{"", 0},
		{"Agrave", 192},
		{"", 0}, {"", 0},
		{"Upsilon", 933},
		{"", 0}, {"", 0},
		{"times", 215},
		{"", 0},
		{"cedil", 184},
		{"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"minus", 8722},
		{"Iuml", 207},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"upsilon", 965},
		{"Ouml", 214},
		{"", 0}, {"", 0},
		{"rlm", 8207},
		{"", 0}, {"", 0}, {"", 0},
		{"reg", 174},
		{"", 0},
		{"forall", 8704},
		{"", 0}, {"", 0},
		{"Epsilon", 917},
		{"empty", 8709},
		{"OElig", 338},
		{"", 0},
		{"shy", 173},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"Aring", 197},
		{"", 0}, {"", 0}, {"", 0},
		{"oelig", 339},
		{"aelig", 230},
		{"", 0},
		{"zwj", 8205},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"sim", 8764},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"yuml", 255},
		{"sigmaf", 962},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"Auml", 196},
		{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"", 0}, {"", 0}, {"", 0}, {"", 0},
		{"AElig", 198}
	};

	const int MIN_WORD_LENGTH		= 2;
	const int MAX_WORD_LENGTH		= 8;
	const int MAX_HASH_VALUE		= 420;

	// hash the candidate, then verify length and bytes before trusting the slot
	if ( len<=MAX_WORD_LENGTH && len>=MIN_WORD_LENGTH )
	{
		int key = HtmlEntityHash ( str, len );
		if ( key<=MAX_HASH_VALUE && key>=0 )
			if ( len==lengthtable[key] )
		{
			const char * s = wordlist[key].m_sName;
			if ( *str==*s && !memcmp ( str+1, s+1, len-1 ) )
				return wordlist[key].m_iCode;
		}
	}
	return 0;
}
25271 
25272 
Strip(BYTE * sData) const25273 void CSphHTMLStripper::Strip ( BYTE * sData ) const
25274 {
25275 	const BYTE * s = sData;
25276 	BYTE * d = sData;
25277 	for ( ;; )
25278 	{
25279 		/////////////////////////////////////
25280 		// scan until eof, or tag, or entity
25281 		/////////////////////////////////////
25282 
25283 		while ( *s && *s!='<' && *s!='&' )
25284 		{
25285 			if ( *s>=0x20 )
25286 				*d++ = *s;
25287 			else
25288 				*d++ = ' ';
25289 			s++;
25290 		}
25291 		if ( !*s )
25292 			break;
25293 
25294 		/////////////////
25295 		// handle entity
25296 		/////////////////
25297 
25298 		if ( *s=='&' )
25299 		{
25300 			if ( s[1]=='#' )
25301 			{
25302 				// handle "&#number;" and "&#xnumber;" forms
25303 				DWORD uCode = 0;
25304 				s += 2;
25305 
25306 				bool bHex = ( *s && ( *s=='x' || *s=='X') );
25307 				if ( !bHex )
25308 				{
25309 					while ( isdigit(*s) )
25310 						uCode = uCode*10 + (*s++) - '0';
25311 				} else
25312 				{
25313 					s++;
25314 					while ( *s )
25315 					{
25316 						if ( isdigit(*s) )
25317 							uCode = uCode*16 + (*s++) - '0';
25318 						else if ( *s>=0x41 && *s<=0x46 )
25319 							uCode = uCode*16 + (*s++) - 'A' + 0xA;
25320 						else if ( *s>=0x61 && *s<=0x66 )
25321 							uCode = uCode*16 + (*s++) - 'a' + 0xA;
25322 						else
25323 							break;
25324 					}
25325 				}
25326 
25327 				uCode = uCode % 0x110000; // there is no unicode code-points bigger than this value
25328 
25329 				if ( uCode<=0x1f || *s!=';' ) // 0-31 are reserved codes
25330 					continue;
25331 
25332 				d += sphUTF8Encode ( d, (int)uCode );
25333 				s++;
25334 
25335 			} else
25336 			{
25337 				// skip until ';' or max length
25338 				if ( ( s[1]>='a' && s[1]<='z' ) || ( s[1]>='A' && s[1]<='Z' ) )
25339 				{
25340 					const int MAX_ENTITY_LEN = 8;
25341 					const BYTE * sStart = s+1;
25342 					while ( *s && *s!=';' && s-sStart<=MAX_ENTITY_LEN )
25343 						s++;
25344 
25345 					if ( *s==';' )
25346 					{
25347 						int iCode = HtmlEntityLookup ( sStart, (int)(s-sStart) );
25348 						if ( iCode>0 )
25349 						{
25350 							// this is a known entity; encode it
25351 							d += sphUTF8Encode ( d, iCode );
25352 							s++;
25353 							continue;
25354 						}
25355 					}
25356 
25357 					// rollback
25358 					s = sStart-1;
25359 				}
25360 
25361 				// if we're here, it's not an entity; pass the leading ampersand and rescan
25362 				*d++ = *s++;
25363 			}
25364 			continue;
25365 		}
25366 
25367 		//////////////
25368 		// handle tag
25369 		//////////////
25370 
25371 		assert ( *s=='<' );
25372 		if ( GetCharIndex(s[1])<0 )
25373 		{
25374 			if ( s[1]=='/' )
25375 			{
25376 				// check if it's valid closing tag
25377 				if ( GetCharIndex(s[2])<0 )
25378 				{
25379 					*d++ = *s++;
25380 					continue;
25381 				}
25382 
25383 			} else if ( s[1]=='!' )
25384 			{
25385 				if ( s[2]=='-' && s[3]=='-' )
25386 				{
25387 					// it's valid comment; scan until comment end
25388 					s += 4; // skip opening '<!--'
25389 					while ( *s )
25390 					{
25391 						if ( s[0]=='-' && s[1]=='-' && s[2]=='>' )
25392 							break;
25393 						s++;
25394 					}
25395 					if ( !*s )
25396 						break;
25397 					s += 3; // skip closing '-->'
25398 					continue;
25399 
25400 				} else if ( isalpha(s[2]) )
25401 				{
25402 					// it's <!doctype> style PI; scan until PI end
25403 					s += 2;
25404 					while ( *s && *s!='>' )
25405 					{
25406 						if ( *s=='\'' || *s=='"' )
25407 						{
25408 							s = SkipQuoted ( s );
25409 							while ( isspace(*s) ) s++;
25410 						} else
25411 						{
25412 							s++;
25413 						}
25414 					}
25415 					if ( *s=='>' )
25416 						s++;
25417 					continue;
25418 
25419 				} else
25420 				{
25421 					// it's something malformed; just ignore
25422 					*d++ = *s++;
25423 					continue;
25424 				}
25425 
25426 			} else if ( s[1]=='?' )
25427 			{
25428 				// scan until PI end
25429 				s += 2; // skip opening '<?'
25430 				while ( *s )
25431 				{
25432 					if ( s[0]=='?' && s[1]=='>' )
25433 						break;
25434 					s++;
25435 				}
25436 				if ( !*s )
25437 					break;
25438 				s += 2; // skip closing '?>'
25439 				continue;
25440 
25441 			} else
25442 			{
25443 				// simply malformed
25444 				*d++ = *s++;
25445 				continue;
25446 			}
25447 		}
25448 		s++; // skip '<'
25449 
25450 		//////////////////////////////////////
25451 		// lookup this tag in known tags list
25452 		//////////////////////////////////////
25453 
25454 		const StripperTag_t * pTag = NULL;
25455 		int iZoneNameLen = 0;
25456 		const BYTE * sZoneName = NULL;
25457 		s = FindTag ( s, &pTag, &sZoneName, &iZoneNameLen );
25458 
25459 		/////////////////////////////////////
25460 		// process tag contents
25461 		// index attributes if needed
25462 		// gracefully handle malformed stuff
25463 		/////////////////////////////////////
25464 
25465 #define LOC_SKIP_SPACES() { while ( sphIsSpace(*s) ) s++; if ( !*s || *s=='>' ) break; }
25466 
25467 		bool bIndexAttrs = ( pTag && pTag->m_bIndexAttrs );
25468 		while ( *s && *s!='>' )
25469 		{
25470 			LOC_SKIP_SPACES();
25471 			if ( sphIsTagStart(*s) )
25472 			{
25473 				// skip attribute name while it's valid
25474 				const BYTE * sAttr = s;
25475 				while ( sphIsTag(*s) )
25476 					s++;
25477 
25478 				// blanks or a value after a valid attribute name?
25479 				if ( sphIsSpace(*s) || *s=='=' )
25480 				{
25481 					const int iAttrLen = (int)( s - sAttr );
25482 					LOC_SKIP_SPACES();
25483 
25484 					// a valid name but w/o a value; keep scanning
25485 					if ( *s!='=' )
25486 						continue;
25487 
25488 					// got value!
25489 					s++;
25490 					LOC_SKIP_SPACES();
25491 
25492 					// check attribute name
25493 					// OPTIMIZE! remove linear search
25494 					int iAttr = -1;
25495 					if ( bIndexAttrs )
25496 					{
25497 						for ( iAttr=0; iAttr<pTag->m_dAttrs.GetLength(); iAttr++ )
25498 						{
25499 							int iLen = strlen ( pTag->m_dAttrs[iAttr].cstr() );
25500 							if ( iLen==iAttrLen && !strncasecmp ( pTag->m_dAttrs[iAttr].cstr(), (const char*)sAttr, iLen ) )
25501 								break;
25502 						}
25503 						if ( iAttr==pTag->m_dAttrs.GetLength() )
25504 							iAttr = -1;
25505 					}
25506 
25507 					// process the value
25508 					const BYTE * sVal = s;
25509 					if ( *s=='\'' || *s=='"' )
25510 					{
25511 						// skip quoted value until a matching quote
25512 						s = SkipQuoted ( s );
25513 					} else
25514 					{
25515 						// skip unquoted value until tag end or whitespace
25516 						while ( *s && *s!='>' && !sphIsSpace(*s) )
25517 							s++;
25518 					}
25519 
25520 					// if this one is to be indexed, copy it
25521 					if ( iAttr>=0 )
25522 					{
25523 						const BYTE * sMax = s;
25524 						if ( *sVal=='\'' || *sVal=='"' )
25525 						{
25526 							if ( sMax[-1]==sVal[0] )
25527 								sMax--;
25528 							sVal++;
25529 						}
25530 						while ( sVal<sMax )
25531 							*d++ = *sVal++;
25532 						*d++ = ' ';
25533 					}
25534 
25535 					// handled the value; keep scanning
25536 					continue;
25537 				}
25538 
25539 				// nope, got an invalid character in the sequence (or maybe eof)
25540 				// fall through to an invalid name handler
25541 			}
25542 
25543 			// keep skipping until tag end or whitespace
25544 			while ( *s && *s!='>' && !sphIsSpace(*s) )
25545 				s++;
25546 		}
25547 
25548 #undef LOC_SKIP_SPACES
25549 
25550 		// skip closing angle bracket, if any
25551 		if ( *s )
25552 			s++;
25553 
25554 		// unknown tag is done; others might require a bit more work
25555 		if ( !pTag )
25556 		{
25557 			*d++ = ' '; // unknown tags are *not* inline by default
25558 			continue;
25559 		}
25560 
25561 		// handle zones
25562 		if ( pTag->m_bZone )
25563 		{
25564 			// should be at tag's end
25565 			assert ( s[0]=='\0' || s[-1]=='>' );
25566 
25567 			// emit secret codes
25568 			*d++ = MAGIC_CODE_ZONE;
25569 			for ( int i=0; i<iZoneNameLen; i++ )
25570 				*d++ = (BYTE) tolower ( sZoneName[i] );
25571 			if ( *d )
25572 				*d++ = MAGIC_CODE_ZONE;
25573 
25574 			if ( !*s )
25575 				break;
25576 			continue;
25577 		}
25578 
25579 		// handle paragraph boundaries
25580 		if ( pTag->m_bPara )
25581 		{
25582 			*d++ = MAGIC_CODE_PARAGRAPH;
25583 			continue;
25584 		}
25585 
25586 		// in all cases, the tag must be fully processed at this point
25587 		// not a remove-tag? we're done
25588 		if ( !pTag->m_bRemove )
25589 		{
25590 			if ( !pTag->m_bInline )
25591 				*d++ = ' ';
25592 			continue;
25593 		}
25594 
25595 		// sudden eof? bail out
25596 		if ( !*s )
25597 			break;
25598 
25599 		// must be a proper remove-tag end, then
25600 		assert ( pTag->m_bRemove && s[-1]=='>' );
25601 
25602 		// short-form? we're done
25603 		if ( s[-2]=='/' )
25604 			continue;
25605 
25606 		// skip everything until the closing tag
25607 		// FIXME! should we handle insane cases with quoted closing tag within tag?
25608 		for ( ;; )
25609 		{
25610 			while ( *s && ( s[0]!='<' || s[1]!='/' ) ) s++;
25611 			if ( !*s ) break;
25612 
25613 			s += 2; // skip </
25614 			if ( strncasecmp ( pTag->m_sTag.cstr(), (const char*)s, pTag->m_iTagLen )!=0 ) continue;
25615 			if ( !sphIsTag ( s[pTag->m_iTagLen] ) )
25616 			{
25617 				s += pTag->m_iTagLen; // skip tag
25618 				if ( *s=='>' ) s++;
25619 				break;
25620 			}
25621 		}
25622 
25623 		if ( !pTag->m_bInline ) *d++ = ' ';
25624 	}
25625 	*d++ = '\0';
25626 
25627 	// space, paragraph sequences elimination pass
25628 	s = sData;
25629 	d = sData;
25630 	bool bSpaceOut = false;
25631 	bool bParaOut = false;
25632 	bool bZoneOut = false;
25633 	while ( const char c = *s++ )
25634 	{
25635 		assert ( d<=s-1 );
25636 
25637 		// handle different character classes
25638 		if ( sphIsSpace(c) )
25639 		{
25640 			// handle whitespace, skip dupes
25641 			if ( !bSpaceOut )
25642 				*d++ = ' ';
25643 
25644 			bSpaceOut = true;
25645 			continue;
25646 
25647 		} else if ( c==MAGIC_CODE_PARAGRAPH )
25648 		{
25649 			// handle paragraph marker, skip dupes
25650 			if ( !bParaOut && !bZoneOut )
25651 			{
25652 				*d++ = c;
25653 				bParaOut = true;
25654 			}
25655 
25656 			bSpaceOut = true;
25657 			continue;
25658 
25659 		} else if ( c==MAGIC_CODE_ZONE )
25660 		{
25661 			// zone marker
25662 			// rewind preceding paragraph, if any, it is redundant
25663 			if ( bParaOut )
25664 			{
25665 				assert ( d>sData && d[-1]==MAGIC_CODE_PARAGRAPH );
25666 				d--;
25667 			}
25668 
25669 			// copy \4zoneid\4
25670 			*d++ = c;
25671 			while ( *s && *s!=MAGIC_CODE_ZONE )
25672 				*d++ = *s++;
25673 
25674 			if ( *s )
25675 				*d++ = *s++;
25676 
25677 			// update state
25678 			// no spaces paragraphs allowed
25679 			bSpaceOut = bZoneOut = true;
25680 			bParaOut = false;
25681 			continue;
25682 
25683 		} else
25684 		{
25685 			*d++ = c;
25686 			bSpaceOut = bParaOut = bZoneOut = false;
25687 		}
25688 	}
25689 	*d++ = '\0';
25690 }
25691 
const BYTE * CSphHTMLStripper::FindTag ( const BYTE * sSrc, const StripperTag_t ** ppTag,
	const BYTE ** ppZoneName, int * pZoneNameLen ) const
{
	// Look up the tag name at sSrc (optionally prefixed with '/') in the sorted
	// known-tags list. On a match, *ppTag points at the matching entry and the
	// returned pointer is advanced past the tag name; on no match, sSrc is
	// returned unchanged. For zone tags, *ppZoneName/*pZoneNameLen describe the
	// zone name span (which includes the leading '/' of a closing tag).
	assert ( sSrc && ppTag && ppZoneName && pZoneNameLen );
	assert ( sSrc[0]!='/' || sSrc[1]!='\0' );

	// skip the closing-tag slash, if any; matching is done on the bare name
	const BYTE * sTagName = ( sSrc[0]=='/' ) ? sSrc+1 : sSrc;

	*ppZoneName = sSrc;
	*pZoneNameLen = 0;

	// the first character selects a precomputed [m_dStart..m_dEnd] slice of m_dTags
	int iIdx = GetCharIndex ( sTagName[0] );
	assert ( iIdx>=0 && iIdx<MAX_CHAR_INDEX );

	if ( m_dEnd[iIdx]>=0 )
	{
		int iStart = m_dStart[iIdx];
		int iEnd = m_dEnd[iIdx];

		for ( int i=iStart; i<=iEnd; i++ )
		{
			int iLen = m_dTags[i].m_iTagLen;
			int iCmp = strncasecmp ( m_dTags[i].m_sTag.cstr(), (const char*)sTagName, iLen );

			// the tags are sorted; so if current candidate is already greater, rest can be skipped
			if ( iCmp>0 )
				break;

			// do we have a match?
			if ( iCmp==0 )
			{
				// got exact match?
				// (next char must not be a tag-name char, or we only matched a prefix)
				if ( !sphIsTag ( sTagName[iLen] ) )
				{
					*ppTag = m_dTags.Begin() + i;
					sSrc = sTagName + iLen; // skip tag name
					if ( m_dTags[i].m_bZone )
						*pZoneNameLen = sSrc - *ppZoneName;
					break;
				}

				// got wildcard match?
				// zone-prefix entries match any tag that merely starts with the stored name
				if ( m_dTags[i].m_bZonePrefix )
				{
					*ppTag = m_dTags.Begin() + i;
					sSrc = sTagName + iLen;
					while ( sphIsTag(*sSrc) )
						sSrc++;
					*pZoneNameLen = sSrc - *ppZoneName;
					break;
				}
			}
		}
	}

	return sSrc;
}
25749 
IsValidTagStart(int iCh) const25750 bool CSphHTMLStripper::IsValidTagStart ( int iCh ) const
25751 {
25752 	int i = GetCharIndex ( iCh );
25753 	return ( i>=0 && i<MAX_CHAR_INDEX );
25754 }
25755 
25756 //////////////////////////////////////////////////////////////////////////
25757 #if USE_RE2
/// field filter that runs a list of RE2 "from => to" replacements over field text
class CSphFieldRegExps : public ISphFieldFilter
{
public:
	virtual					~CSphFieldRegExps ();

	/// apply all regexps to the field; returns the new length with the result
	/// in dStorage, or 0 when nothing was replaced (caller keeps the original)
	virtual	int				Apply ( const BYTE * sField, int iLength, CSphVector<BYTE> & dStorage );
	/// serialize the regexps back into their "from => to" config form
	virtual	void			GetSettings ( CSphFieldFilterSettings & tSettings ) const;

	/// parse and compile one "from => to" mapping; false (with sError set) on failure
	bool					AddRegExp ( const char * sRegExp, CSphString & sError );

private:
	struct RegExp_t
	{
		CSphString	m_sFrom;	// pattern text, as given in config
		CSphString	m_sTo;		// replacement text

		RE2 *		m_pRE2;		// compiled pattern; owned, freed in the dtor
	};

	CSphVector<RegExp_t>	m_dRegexps;
};
25779 
25780 
~CSphFieldRegExps()25781 CSphFieldRegExps::~CSphFieldRegExps ()
25782 {
25783 	ARRAY_FOREACH ( i, m_dRegexps )
25784 		SafeDelete ( m_dRegexps[i].m_pRE2 );
25785 }
25786 
int CSphFieldRegExps::Apply ( const BYTE * sField, int iLength, CSphVector<BYTE> & dStorage )
{
	// Run every configured regexp replacement over the field text.
	// Returns the new length with the result stored in dStorage, or 0 when the
	// field is empty or no regexp matched (the caller then keeps the original).
	dStorage.Resize ( 0 );
	if ( !sField || !*sField )
		return 0;

	bool bReplaced = false;
	// iLength==0 means "NUL-terminated"; otherwise take exactly iLength bytes
	std::string sRe2 = ( iLength ? std::string ( (char *) sField, iLength ) : (char *) sField );
	ARRAY_FOREACH ( i, m_dRegexps )
	{
		assert ( m_dRegexps[i].m_pRE2 );
		bReplaced |= ( RE2::GlobalReplace ( &sRe2, *m_dRegexps[i].m_pRE2, m_dRegexps[i].m_sTo.cstr() )>0 );
	}

	if ( !bReplaced )
		return 0;

	int iDstLen = sRe2.length();
	dStorage.Resize ( iDstLen+4 ); // string SAFETY_GAP
	// strncpy also zero-fills the 4 gap bytes past the copied data
	strncpy ( (char *)dStorage.Begin(), sRe2.c_str(), dStorage.GetLength() );
	return iDstLen;
}
25809 
GetSettings(CSphFieldFilterSettings & tSettings) const25810 void CSphFieldRegExps::GetSettings ( CSphFieldFilterSettings & tSettings ) const
25811 {
25812 	tSettings.m_dRegexps.Resize ( m_dRegexps.GetLength() );
25813 	ARRAY_FOREACH ( i, m_dRegexps )
25814 		tSettings.m_dRegexps[i].SetSprintf ( "%s => %s", m_dRegexps[i].m_sFrom.cstr(), m_dRegexps[i].m_sTo.cstr() );
25815 }
25816 
AddRegExp(const char * sRegExp,CSphString & sError)25817 bool CSphFieldRegExps::AddRegExp ( const char * sRegExp, CSphString & sError )
25818 {
25819 	const char sSplitter [] = "=>";
25820 	const char * sSplit = strstr ( sRegExp, sSplitter );
25821 	if ( !sSplit )
25822 	{
25823 		sError = "mapping token (=>) not found";
25824 		return false;
25825 	} else if ( strstr ( sSplit + strlen ( sSplitter ), sSplitter ) )
25826 	{
25827 		sError = "mapping token (=>) found more than once";
25828 		return false;
25829 	}
25830 
25831 	m_dRegexps.Resize ( m_dRegexps.GetLength () + 1 );
25832 	RegExp_t & tRegExp = m_dRegexps.Last();
25833 	tRegExp.m_sFrom.SetBinary ( sRegExp, sSplit-sRegExp );
25834 	tRegExp.m_sTo = sSplit + strlen ( sSplitter );
25835 	tRegExp.m_sFrom.Trim();
25836 	tRegExp.m_sTo.Trim();
25837 
25838 	RE2::Options tOptions;
25839 	tOptions.set_utf8 ( true );
25840 	tRegExp.m_pRE2 = new RE2 ( tRegExp.m_sFrom.cstr(), tOptions );
25841 
25842 	std::string sRE2Error;
25843 	if ( !tRegExp.m_pRE2->CheckRewriteString ( tRegExp.m_sTo.cstr(), &sRE2Error ) )
25844 	{
25845 		sError.SetSprintf ( "\"%s => %s\" is not a valid mapping: %s", tRegExp.m_sFrom.cstr(), tRegExp.m_sTo.cstr(), sRE2Error.c_str() );
25846 		SafeDelete ( tRegExp.m_pRE2 );
25847 		m_dRegexps.Remove ( m_dRegexps.GetLength() - 1 );
25848 		return false;
25849 	}
25850 
25851 	return true;
25852 }
25853 #endif
25854 
25855 #if USE_RE2
ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings & tFilterSettings, CSphString & sError )
{
	// build a regexp field filter from the given settings
	// NOTE(review): AddRegExp's return value is ignored here, so a bad mapping
	// only leaves a message in sError while the filter is returned anyway;
	// confirm that callers inspect sError after this call
	CSphFieldRegExps * pFilter = new CSphFieldRegExps();
	ARRAY_FOREACH ( i, tFilterSettings.m_dRegexps )
		pFilter->AddRegExp ( tFilterSettings.m_dRegexps[i].cstr(), sError );

	return pFilter;
}
25864 #else
ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings &, CSphString & )
{
	// built without RE2: regexp field filtering is unavailable
	return NULL;
}
25869 #endif
25870 
25871 
25872 /////////////////////////////////////////////////////////////////////////////
25873 // GENERIC SOURCE
25874 /////////////////////////////////////////////////////////////////////////////
25875 
CSphSourceSettings::CSphSourceSettings ()
	: m_iMinPrefixLen ( 0 )			// prefix indexing disabled by default
	, m_iMinInfixLen ( 0 )			// infix indexing disabled by default
	, m_iMaxSubstringLen ( 0 )		// no cap on indexed substring length
	, m_iBoundaryStep ( 0 )
	, m_bIndexExactWords ( false )
	, m_iOvershortStep ( 1 )
	, m_iStopwordStep ( 1 )
	, m_bIndexSP ( false )			// no sentence/paragraph indexing
	, m_bIndexFieldLens ( false )	// no per-field token-count attributes
{}
25887 
25888 
GetWordpart(const char * sField,bool bWordDict)25889 ESphWordpart CSphSourceSettings::GetWordpart ( const char * sField, bool bWordDict )
25890 {
25891 	if ( bWordDict )
25892 		return SPH_WORDPART_WHOLE;
25893 
25894 	bool bPrefix = ( m_iMinPrefixLen>0 ) && ( m_dPrefixFields.GetLength()==0 || m_dPrefixFields.Contains ( sField ) );
25895 	bool bInfix = ( m_iMinInfixLen>0 ) && ( m_dInfixFields.GetLength()==0 || m_dInfixFields.Contains ( sField ) );
25896 
25897 	assert ( !( bPrefix && bInfix ) ); // no field must be marked both prefix and infix
25898 	if ( bPrefix )
25899 		return SPH_WORDPART_PREFIX;
25900 	if ( bInfix )
25901 		return SPH_WORDPART_INFIX;
25902 	return SPH_WORDPART_WHOLE;
25903 }
25904 
25905 //////////////////////////////////////////////////////////////////////////
25906 
CSphSource::CSphSource ( const char * sName )
	: m_pTokenizer ( NULL )
	, m_pDict ( NULL )
	, m_pFieldFilter ( NULL )
	, m_tSchema ( sName )
	, m_pStripper ( NULL )	// HTML stripper is created on demand in SetStripHTML()
	, m_iNullIds ( 0 )		// documents rejected for docid==0
	, m_iMaxIds ( 0 )		// documents rejected for docid==DOCID_MAX
{
}
25917 
25918 
CSphSource::~CSphSource()
{
	// the stripper is the only member this base class owns;
	// tokenizer, dict, and field filter are managed by their providers
	SafeDelete ( m_pStripper );
}
25923 
25924 
void CSphSource::SetDict ( CSphDict * pDict )
{
	// borrowed pointer; ~CSphSource() does not delete it
	assert ( pDict );
	m_pDict = pDict;
}
25930 
25931 
const CSphSourceStats & CSphSource::GetStats ()
{
	// counters accumulate across IterateDocument() calls
	return m_tStats;
}
25936 
25937 
SetStripHTML(const char * sExtractAttrs,const char * sRemoveElements,bool bDetectParagraphs,const char * sZones,CSphString & sError)25938 bool CSphSource::SetStripHTML ( const char * sExtractAttrs, const char * sRemoveElements,
25939 	bool bDetectParagraphs, const char * sZones, CSphString & sError )
25940 {
25941 	if ( !m_pStripper )
25942 		m_pStripper = new CSphHTMLStripper ( true );
25943 
25944 	if ( !m_pStripper->SetIndexedAttrs ( sExtractAttrs, sError ) )
25945 		return false;
25946 
25947 	if ( !m_pStripper->SetRemovedElements ( sRemoveElements, sError ) )
25948 		return false;
25949 
25950 	if ( bDetectParagraphs )
25951 		m_pStripper->EnableParagraphs ();
25952 
25953 	if ( !m_pStripper->SetZones ( sZones, sError ) )
25954 		return false;
25955 
25956 	return true;
25957 }
25958 
25959 
void CSphSource::SetFieldFilter ( ISphFieldFilter * pFilter )
{
	// borrowed pointer; may be NULL to disable field filtering
	m_pFieldFilter = pFilter;
}
25964 
void CSphSource::SetTokenizer ( ISphTokenizer * pTokenizer )
{
	// borrowed pointer; a tokenizer is mandatory (IterateDocument asserts on it)
	assert ( pTokenizer );
	m_pTokenizer = pTokenizer;
}
25970 
25971 
UpdateSchema(CSphSchema * pInfo,CSphString & sError)25972 bool CSphSource::UpdateSchema ( CSphSchema * pInfo, CSphString & sError )
25973 {
25974 	assert ( pInfo );
25975 
25976 	// fill it
25977 	if ( pInfo->m_dFields.GetLength()==0 && pInfo->GetAttrsCount()==0 )
25978 	{
25979 		*pInfo = m_tSchema;
25980 		return true;
25981 	}
25982 
25983 	// check it
25984 	return m_tSchema.CompareTo ( *pInfo, sError );
25985 }
25986 
25987 
void CSphSource::Setup ( const CSphSourceSettings & tSettings )
{
	// copy settings in, clamping numeric values to their legal ranges
	m_iMinPrefixLen = Max ( tSettings.m_iMinPrefixLen, 0 );
	m_iMinInfixLen = Max ( tSettings.m_iMinInfixLen, 0 );
	m_iMaxSubstringLen = Max ( tSettings.m_iMaxSubstringLen, 0 );
	m_iBoundaryStep = Max ( tSettings.m_iBoundaryStep, -1 );
	m_bIndexExactWords = tSettings.m_bIndexExactWords;
	m_iOvershortStep = Min ( Max ( tSettings.m_iOvershortStep, 0 ), 1 );	// clamped to 0 or 1
	m_iStopwordStep = Min ( Max ( tSettings.m_iStopwordStep, 0 ), 1 );		// clamped to 0 or 1
	m_bIndexSP = tSettings.m_bIndexSP;
	m_dPrefixFields = tSettings.m_dPrefixFields;
	m_dInfixFields = tSettings.m_dInfixFields;
	m_bIndexFieldLens = tSettings.m_bIndexFieldLens;
}
26002 
26003 
VerifyID(SphDocID_t uID)26004 SphDocID_t CSphSource::VerifyID ( SphDocID_t uID )
26005 {
26006 	if ( uID==0 )
26007 	{
26008 		m_iNullIds++;
26009 		return 0;
26010 	}
26011 
26012 	if ( uID==DOCID_MAX )
26013 	{
26014 		m_iMaxIds++;
26015 		return 0;
26016 	}
26017 
26018 	return uID;
26019 }
26020 
26021 
ISphHits * CSphSource::IterateJoinedHits ( CSphString & )
{
	// default implementation for sources without joined fields:
	// signal eof via docid==0 and hand back an empty, shared hit list
	static ISphHits dDummy;
	m_tDocInfo.m_uDocID = 0; // pretend that's an eof
	return &dDummy;
}
26028 
26029 /////////////////////////////////////////////////////////////////////////////
26030 // DOCUMENT SOURCE
26031 /////////////////////////////////////////////////////////////////////////////
26032 
/// print sLine to fp as a single-quoted string, backslash-escaping
/// tabs, single quotes, and backslashes; NULL/empty prints as ''
static void FormatEscaped ( FILE * fp, const char * sLine )
{
	// empty or missing value prints as a bare pair of quotes
	if ( !sLine || !*sLine )
	{
		fprintf ( fp, "''" );
		return;
	}

	// pass one, compute the escaped length (specials gain a backslash, plus 2 quotes)
	const int iLen = strlen ( sLine );
	int iOut = 2;
	for ( int i=0; i<iLen; i++ )
	{
		const char c = sLine[i];
		iOut += ( c=='\t' || c=='\'' || c=='\\' ) ? 2 : 1;
	}

	// use the stack buffer when the result fits, else fall back to the heap
	char sMinibuffer[8192];
	char * sMaxibuffer = NULL;
	char * sBuffer = sMinibuffer;
	if ( iOut>(int)sizeof(sMinibuffer) )
	{
		sMaxibuffer = new char [ iOut+4 ]; // small safety gap
		sBuffer = sMaxibuffer;
	}

	// pass two, emit the quoted, escaped copy
	char * sOut = sBuffer;
	*sOut++ = '\'';
	for ( int i=0; i<iLen; i++ )
	{
		const char c = sLine[i];
		if ( c=='\t' || c=='\'' || c=='\\' )
			*sOut++ = '\\';
		*sOut++ = c;
	}
	*sOut++ = '\'';

	// print!
	assert ( sOut==sBuffer+iOut );
	fwrite ( sBuffer, 1, iOut, fp );

	// no-op when we used the stack buffer
	delete [] sMaxibuffer;
}
26092 
CSphSource_Document::CSphBuildHitsState_t::CSphBuildHitsState_t ()
{
	// start from a clean per-document state
	Reset();
}
26097 
CSphSource_Document::CSphBuildHitsState_t::~CSphBuildHitsState_t ()
{
	// Reset() also frees the temporary filtered-field buffers
	Reset();
}
26102 
void CSphSource_Document::CSphBuildHitsState_t::Reset ()
{
	// drop all per-document hit-building state
	m_bProcessingHits = false;
	m_bDocumentDone = false;
	m_dFields = NULL;
	m_iStartPos = 0;
	m_iHitPos = 0;
	m_iField = 0;
	m_iStartField = 0;
	m_iEndField = 0;
	m_iBuildLastStep = 1;

	// free the per-field copies made when a field filter modified the text
	ARRAY_FOREACH ( i, m_dTmpFieldStorage )
		SafeDeleteArray ( m_dTmpFieldStorage[i] );

	m_dTmpFieldStorage.Resize ( 0 );
	m_dTmpFieldPtrs.Resize ( 0 );
	m_dFiltered.Resize( 0 );
}
26122 
CSphSource_Document::CSphSource_Document ( const char * sName )
	: CSphSource ( sName )
	, m_pReadFileBuffer ( NULL )
	, m_iReadFileBufferSize ( 256 * 1024 )		// initial file-field read buffer; grown on demand
	, m_iMaxFileBufferSize ( 2 * 1024 * 1024 )	// hard cap for a single file field
	, m_eOnFileFieldError ( FFE_IGNORE_FIELD )
	, m_fpDumpRows ( NULL )
	, m_iPlainFieldsLength ( 0 )
	, m_pFieldLengthAttrs ( NULL )
	, m_bIdsSorted ( false )
	, m_iMaxHits ( MAX_SOURCE_HITS )
{
}
26136 
26137 
bool CSphSource_Document::IterateDocument ( CSphString & sError )
{
	// Fetch the next document from the concrete source and prepare per-document
	// state for hit building. Returns true with m_tDocInfo.m_uDocID==0 on eof,
	// true with a nonzero docid on success, or false (with sError) on error.
	assert ( m_pTokenizer );
	assert ( !m_tState.m_bProcessingHits );

	m_tHits.m_dData.Resize ( 0 );

	m_tState.Reset();
	m_tState.m_iEndField = m_iPlainFieldsLength;

	// pre-size the per-field buffers used to hold filtered field copies
	if ( m_pFieldFilter )
	{
		m_tState.m_dTmpFieldPtrs.Resize ( m_tState.m_iEndField );
		m_tState.m_dTmpFieldStorage.Resize ( m_tState.m_iEndField );

		ARRAY_FOREACH ( i, m_tState.m_dTmpFieldPtrs )
		{
			m_tState.m_dTmpFieldPtrs[i] = NULL;
			m_tState.m_dTmpFieldStorage[i] = NULL;
		}
	}

	m_dMva.Resize ( 1 ); // must not have zero offset

	// fetch next document
	for ( ;; )
	{
		m_tState.m_dFields = NextDocument ( sError );
		if ( m_tDocInfo.m_uDocID==0 )
			return true;
		// moved that here as docid==0 means eof for regular query
		// but joined might produce doc with docid==0 and breaks delta packing
		if ( HasJoinedFields() )
			m_dAllIds.Add ( m_tDocInfo.m_uDocID );

		if ( !m_tState.m_dFields )
			return false;

		// tricky bit
		// we can only skip document indexing from here, IterateHits() is too late
		// so in case the user chose to skip documents with file field problems
		// we need to check for those here
		if ( m_eOnFileFieldError==FFE_SKIP_DOCUMENT || m_eOnFileFieldError==FFE_FAIL_INDEX )
		{
			bool bOk = true;
			for ( int iField=0; iField<m_tState.m_iEndField && bOk; iField++ )
			{
				const BYTE * sFilename = m_tState.m_dFields[iField];
				if ( m_tSchema.m_dFields[iField].m_bFilename )
					bOk &= CheckFileField ( sFilename );

				if ( !bOk && m_eOnFileFieldError==FFE_FAIL_INDEX )
				{
					sError.SetSprintf ( "error reading file field data (docid=" DOCID_FMT ", filename=%s)",
						m_tDocInfo.m_uDocID, sFilename );
					return false;
				}
			}
			if ( !bOk && m_eOnFileFieldError==FFE_SKIP_DOCUMENT )
				continue;
		}

		// run the field filter; file fields pass through untouched (they hold filenames)
		if ( m_pFieldFilter )
		{
			bool bHaveModifiedFields = false;
			for ( int iField=0; iField<m_tState.m_iEndField; iField++ )
			{
				if ( m_tSchema.m_dFields[iField].m_bFilename )
				{
					m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dFields[iField];
					continue;
				}

				CSphVector<BYTE> dFiltered;
				if ( m_pFieldFilter->Apply ( m_tState.m_dFields[iField], 0, dFiltered ) )
				{
					// filter changed the text; take ownership of the filtered copy
					m_tState.m_dTmpFieldStorage[iField] = dFiltered.LeakData();
					m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dTmpFieldStorage[iField];
					bHaveModifiedFields = true;
				} else
				{
					m_tState.m_dTmpFieldPtrs[iField] = m_tState.m_dFields[iField];
				}
			}

			if ( bHaveModifiedFields )
				m_tState.m_dFields = (BYTE **)&( m_tState.m_dTmpFieldPtrs[0] );
		}

		// we're good
		break;
	}

	m_tStats.m_iTotalDocuments++;
	return true;
}
26234 
26235 
ISphHits * CSphSource_Document::IterateHits ( CSphString & sError )
{
	// returns NULL once the current document is fully tokenized,
	// otherwise the next (possibly partial) batch of hits for it
	if ( m_tState.m_bDocumentDone )
		return NULL;

	m_tHits.m_dData.Resize ( 0 );

	BuildHits ( sError, false );

	return &m_tHits;
}
26247 
26248 
bool CSphSource_Document::CheckFileField ( const BYTE * sField )
{
	// Verify that a file-field path can be opened and that the file fits the
	// configured buffer cap; emits a warning and returns false otherwise.
	CSphAutofile tFileSource;
	CSphString sError;

	if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
	{
		sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_uDocID, sError.cstr() );
		return false;
	}

	// same +16 headroom check that LoadFileField() applies before reading
	int64_t iFileSize = tFileSource.GetSize();
	if ( iFileSize+16 > m_iMaxFileBufferSize )
	{
		sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size=" INT64_FMT ", max_file_field_buffer=%d)",
			m_tDocInfo.m_uDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
		return false;
	}

	return true;
}
26270 
26271 
/// returns file size on success, and replaces *ppField with a pointer to data
/// returns -1 on failure (and emits a warning)
int CSphSource_Document::LoadFileField ( BYTE ** ppField, CSphString & sError )
{
	CSphAutofile tFileSource;

	// *ppField holds the file name on entry
	BYTE * sField = *ppField;
	if ( tFileSource.Open ( (const char *)sField, SPH_O_READ, sError )==-1 )
	{
		sphWarning ( "docid=" DOCID_FMT ": %s", m_tDocInfo.m_uDocID, sError.cstr() );
		return -1;
	}

	// enforce the same size cap that CheckFileField() checks
	int64_t iFileSize = tFileSource.GetSize();
	if ( iFileSize+16 > m_iMaxFileBufferSize )
	{
		sphWarning ( "docid=" DOCID_FMT ": file '%s' too big for a field (size=" INT64_FMT ", max_file_field_buffer=%d)",
			m_tDocInfo.m_uDocID, (const char *)sField, iFileSize, m_iMaxFileBufferSize );
		return -1;
	}

	int iFieldBytes = (int)iFileSize;
	if ( !iFieldBytes )
		return 0;

	// grow the shared read buffer (rounded up to a power of two) if needed
	int iBufSize = Max ( m_iReadFileBufferSize, 1 << sphLog2 ( iFieldBytes+15 ) );
	if ( m_iReadFileBufferSize < iBufSize )
		SafeDeleteArray ( m_pReadFileBuffer );

	if ( !m_pReadFileBuffer )
	{
		m_pReadFileBuffer = new char [ iBufSize ];
		m_iReadFileBufferSize = iBufSize;
	}

	if ( !tFileSource.Read ( m_pReadFileBuffer, iFieldBytes, sError ) )
	{
		sphWarning ( "docid=" DOCID_FMT ": read failed: %s", m_tDocInfo.m_uDocID, sError.cstr() );
		return -1;
	}

	// NUL-terminate; the buffer sizing above leaves room for it
	m_pReadFileBuffer[iFieldBytes] = '\0';

	// hand the caller the shared buffer (valid until the next LoadFileField call)
	*ppField = (BYTE*)m_pReadFileBuffer;
	return iFieldBytes;
}
26318 
26319 
AddFieldLens(CSphSchema & tSchema,bool bDynamic,CSphString & sError)26320 bool AddFieldLens ( CSphSchema & tSchema, bool bDynamic, CSphString & sError )
26321 {
26322 	ARRAY_FOREACH ( i, tSchema.m_dFields )
26323 	{
26324 		CSphColumnInfo tCol;
26325 		tCol.m_sName.SetSprintf ( "%s_len", tSchema.m_dFields[i].m_sName.cstr() );
26326 
26327 		int iGot = tSchema.GetAttrIndex ( tCol.m_sName.cstr() );
26328 		if ( iGot>=0 )
26329 		{
26330 			if ( tSchema.GetAttr(iGot).m_eAttrType==SPH_ATTR_TOKENCOUNT )
26331 			{
26332 				// looks like we already added these
26333 				assert ( tSchema.GetAttr(iGot).m_sName==tCol.m_sName );
26334 				return true;
26335 			}
26336 
26337 			sError.SetSprintf ( "attribute %s conflicts with index_field_lengths=1; remove it", tCol.m_sName.cstr() );
26338 			return false;
26339 		}
26340 
26341 		tCol.m_eAttrType = SPH_ATTR_TOKENCOUNT;
26342 		tSchema.AddAttr ( tCol, bDynamic ); // everything's dynamic at indexing time
26343 	}
26344 	return true;
26345 }
26346 
26347 
bool CSphSource_Document::AddAutoAttrs ( CSphString & sError )
{
	// auto-computed length attributes
	// (one TOKENCOUNT attribute per field, when index_field_lengths is enabled)
	if ( m_bIndexFieldLens )
		return AddFieldLens ( m_tSchema, true, sError );
	return true;
}
26355 
26356 
void CSphSource_Document::AllocDocinfo()
{
	// tricky bit
	// with in-config schema, attr storage gets allocated in Setup() when source is initially created
	// so when this AddAutoAttrs() additionally changes the count, we have to change the number of attributes
	// but Reset() prohibits that, because that is usually a programming mistake, hence the Swap() dance
	CSphMatch tNew;
	tNew.Reset ( m_tSchema.GetRowSize() );
	Swap ( m_tDocInfo, tNew );

	m_dStrAttrs.Resize ( m_tSchema.GetAttrsCount() );

	// cache a raw pointer into the dynamic row where the field-length attrs live;
	// they are expected to be the trailing block of attributes (asserted below)
	if ( m_bIndexFieldLens && m_tSchema.GetAttrsCount() && m_tSchema.m_dFields.GetLength() )
	{
		int iFirst = m_tSchema.GetAttrsCount() - m_tSchema.m_dFields.GetLength();
		assert ( m_tSchema.GetAttr ( iFirst ).m_eAttrType==SPH_ATTR_TOKENCOUNT );
		assert ( m_tSchema.GetAttr ( iFirst+m_tSchema.m_dFields.GetLength()-1 ).m_eAttrType==SPH_ATTR_TOKENCOUNT );

		m_pFieldLengthAttrs = m_tDocInfo.m_pDynamic + ( m_tSchema.GetAttr ( iFirst ).m_tLocator.m_iBitOffset / 32 );
	}
}
26378 
26379 //////////////////////////////////////////////////////////////////////////
26380 // HIT GENERATORS
26381 //////////////////////////////////////////////////////////////////////////
26382 
bool CSphSource_Document::BuildZoneHits ( SphDocID_t uDocid, BYTE * sWord )
{
	// If sWord starts with one of the in-band magic markers (sentence,
	// paragraph, zone), emit the corresponding magic-word hits and return true;
	// return false for ordinary tokens so the caller indexes them normally.
	if ( *sWord==MAGIC_CODE_SENTENCE || *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
	{
		// every boundary marker implies at least a sentence break
		m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_SENTENCE ), m_tState.m_iHitPos );

		if ( *sWord==MAGIC_CODE_PARAGRAPH || *sWord==MAGIC_CODE_ZONE )
			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( (BYTE*)MAGIC_WORD_PARAGRAPH ), m_tState.m_iHitPos );

		if ( *sWord==MAGIC_CODE_ZONE )
		{
			// the zone name follows in the raw token buffer, terminated by another zone marker
			BYTE * pZone = (BYTE*) m_pTokenizer->GetBufferPtr();
			BYTE * pEnd = pZone;
			while ( *pEnd && *pEnd!=MAGIC_CODE_ZONE )
			{
				pEnd++;
			}

			if ( *pEnd && *pEnd==MAGIC_CODE_ZONE )
			{
				// NUL-terminate the name in place, emit it as a hit
				// (pZone-1 includes the byte preceding the buffer ptr),
				// then resume tokenizing right past the closing marker
				*pEnd = '\0';
				m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( pZone-1 ), m_tState.m_iHitPos );
				m_pTokenizer->SetBufferPtr ( (const char*) pEnd+1 );
			}
		}

		m_tState.m_iBuildLastStep = 1;
		return true;
	}
	return false;
}
26414 
26415 
26416 // track blended start and reset on not blended token
TrackBlendedStart(const ISphTokenizer * pTokenizer,int iBlendedHitsStart,int iHitsCount)26417 static int TrackBlendedStart ( const ISphTokenizer * pTokenizer, int iBlendedHitsStart, int iHitsCount )
26418 {
26419 	iBlendedHitsStart = ( ( pTokenizer->TokenIsBlended() || pTokenizer->TokenIsBlendedPart() ) ? iBlendedHitsStart : -1 );
26420 	if ( pTokenizer->TokenIsBlended() )
26421 		iBlendedHitsStart = iHitsCount;
26422 
26423 	return iBlendedHitsStart;
26424 }
26425 
26426 
26427 #define BUILD_SUBSTRING_HITS_COUNT 4
26428 
/// tokenize the current field and emit prefix/infix hits for every token
/// (used for fields whose wordpart setting is prefix or infix, not whole-word);
/// bPayload - field carries its own positions, so do not advance the hit position here;
/// bSkipEndMarker - suppress FIELDEND_MASK tagging of the trailing hits
void CSphSource_Document::BuildSubstringHits ( SphDocID_t uDocid, bool bPayload, ESphWordpart eWordpart, bool bSkipEndMarker )
{
	bool bPrefixField = ( eWordpart==SPH_WORDPART_PREFIX );
	bool bInfixMode = m_iMinInfixLen > 0;

	int iMinInfixLen = bPrefixField ? m_iMinPrefixLen : m_iMinInfixLen;
	if ( !m_tState.m_bProcessingHits )
		m_tState.m_iBuildLastStep = 1;

	BYTE * sWord = NULL;
	BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];

	// upper bound on how many hits one token can generate;
	// used below to stop before the m_iMaxHits hit buffer would overflow
	int iIterHitCount = BUILD_SUBSTRING_HITS_COUNT;
	if ( bPrefixField )
		iIterHitCount += SPH_MAX_WORD_LEN - m_iMinPrefixLen;
	else
		iIterHitCount += ( ( m_iMinInfixLen+SPH_MAX_WORD_LEN ) * ( SPH_MAX_WORD_LEN-m_iMinInfixLen ) / 2 );

	// FIELDEND_MASK at blended token stream should be set for HEAD token too
	int iBlendedHitsStart = -1;

	// index all infixes
	while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+iIterHitCount<m_iMaxHits )
		&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
	{
		int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );

		if ( !bPayload )
		{
			// advance the in-field position (payload fields carry their own positions)
			HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
			if ( m_pTokenizer->GetBoundary() )
				HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
			m_tState.m_iBuildLastStep = 1;
		}

		// sentence/paragraph/zone boundary tokens are handled separately
		if ( BuildZoneHits ( uDocid, sWord ) )
			continue;

		int iLen = m_pTokenizer->GetLastTokenLen ();

		// always index full word (with magic head/tail marker(s))
		int iBytes = strlen ( (const char*)sWord );
		memcpy ( sBuf + 1, sWord, iBytes );
		sBuf[iBytes+1] = '\0';

		SphWordID_t uExactWordid = 0;
		if ( m_bIndexExactWords )
		{
			// exact ("=word") form: unstemmed word behind the non-stemmed head marker
			sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
			uExactWordid = m_pDict->GetWordIDNonStemmed ( sBuf );
		}

		sBuf[0] = MAGIC_WORD_HEAD;

		// stemmed word w/markers
		SphWordID_t iWord = m_pDict->GetWordIDWithMarkers ( sBuf );
		if ( !iWord )
		{
			// stopword; apply the configured position step and skip it
			m_tState.m_iBuildLastStep = m_iStopwordStep;
			continue;
		}

		if ( m_bIndexExactWords )
			m_tHits.AddHit ( uDocid, uExactWordid, m_tState.m_iHitPos );
		iBlendedHitsStart = iLastBlendedStart;
		m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
		// blended tokens do not advance the position; their parts share it
		m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;

		// restore stemmed word; drop the trailing marker byte
		// (it is re-added as MAGIC_WORD_TAIL below)
		int iStemmedLen = strlen ( ( const char *)sBuf );
		sBuf [iStemmedLen - 1] = '\0';

		// stemmed word w/o markers (only when stemming actually changed the word)
		if ( strcmp ( (const char *)sBuf + 1, (const char *)sWord ) )
			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sBuf + 1, iStemmedLen - 2, true ), m_tState.m_iHitPos );

		// restore word
		memcpy ( sBuf + 1, sWord, iBytes );
		sBuf[iBytes+1] = MAGIC_WORD_TAIL;
		sBuf[iBytes+2] = '\0';

		// if there are no infixes, that's it
		if ( iMinInfixLen > iLen )
		{
			// index full word
			m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sWord ), m_tState.m_iHitPos );
			continue;
		}

		// process all infixes
		int iMaxStart = bPrefixField ? 0 : ( iLen - iMinInfixLen );

		BYTE * sInfix = sBuf + 1;

		for ( int iStart=0; iStart<=iMaxStart; iStart++ )
		{
			// position sInfixEnd past the shortest (iMinInfixLen codepoints) infix
			BYTE * sInfixEnd = sInfix;
			for ( int i = 0; i < iMinInfixLen; i++ )
				sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );

			int iMaxSubLen = ( iLen-iStart );
			if ( m_iMaxSubstringLen )
				iMaxSubLen = Min ( m_iMaxSubstringLen, iMaxSubLen );

			// emit every substring starting at iStart, growing one codepoint at a time
			for ( int i=iMinInfixLen; i<=iMaxSubLen; i++ )
			{
				m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix, false ), m_tState.m_iHitPos );

				// word start: add magic head
				if ( bInfixMode && iStart==0 )
					m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix - 1, sInfixEnd-sInfix + 1, false ), m_tState.m_iHitPos );

				// word end: add magic tail
				if ( bInfixMode && i==iLen-iStart )
					m_tHits.AddHit ( uDocid, m_pDict->GetWordID ( sInfix, sInfixEnd-sInfix+1, false ), m_tState.m_iHitPos );

				sInfixEnd += m_pTokenizer->GetCodepointLength ( *sInfixEnd );
			}

			sInfix += m_pTokenizer->GetCodepointLength ( *sInfix );
		}
	}

	// NULL token means the field was fully tokenized; otherwise we stopped on a full hit buffer
	// and the caller must flush and re-enter
	m_tState.m_bProcessingHits = ( sWord!=NULL );

	// mark trailing hits
	// and compute fields lengths
	if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
	{
		CSphWordHit * pTail = const_cast < CSphWordHit * > ( m_tHits.Last() );

		if ( m_pFieldLengthAttrs )
			m_pFieldLengthAttrs [ HITMAN::GetField ( pTail->m_uWordPos ) ] = HITMAN::GetPos ( pTail->m_uWordPos );

		Hitpos_t uEndPos = pTail->m_uWordPos;
		if ( iBlendedHitsStart>=0 )
		{
			assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
			// the end marker must also cover the head of a trailing blended token run
			Hitpos_t uBlendedPos = ( m_tHits.First() + iBlendedHitsStart )->m_uWordPos;
			uEndPos = Min ( uEndPos, uBlendedPos );
		}

		// set end marker for all tail hits
		const CSphWordHit * pStart = m_tHits.First();
		while ( pStart<=pTail && uEndPos<=pTail->m_uWordPos )
		{
			HITMAN::SetEndMarker ( &pTail->m_uWordPos );
			pTail--;
		}
	}
}
26580 
26581 
26582 #define BUILD_REGULAR_HITS_COUNT 6
26583 
/// tokenize the current field and emit one hit per token (whole-word indexing);
/// bPayload - field carries its own positions, so do not advance the hit position here;
/// bSkipEndMarker - suppress FIELDEND_MASK tagging of the trailing hits
void CSphSource_Document::BuildRegularHits ( SphDocID_t uDocid, bool bPayload, bool bSkipEndMarker )
{
	bool bWordDict = m_pDict->GetSettings().m_bWordDict;
	// non-keyword (CRC) dict with prefixes/infixes configured also needs
	// the marker-wrapped whole word emitted per token (see below)
	bool bGlobalPartialMatch = !bWordDict && ( m_iMinPrefixLen > 0 || m_iMinInfixLen > 0 );

	if ( !m_tState.m_bProcessingHits )
		m_tState.m_iBuildLastStep = 1;

	BYTE * sWord = NULL;
	BYTE sBuf [ 16+3*SPH_MAX_WORD_LEN ];

	// FIELDEND_MASK at last token stream should be set for HEAD token too
	int iBlendedHitsStart = -1;

	// index words only
	while ( ( m_iMaxHits==0 || m_tHits.m_dData.GetLength()+BUILD_REGULAR_HITS_COUNT<m_iMaxHits )
		&& ( sWord = m_pTokenizer->GetToken() )!=NULL )
	{
		m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );

		int iLastBlendedStart = TrackBlendedStart ( m_pTokenizer, iBlendedHitsStart, m_tHits.Length() );

		if ( !bPayload )
		{
			// advance the in-field position (payload fields carry their own positions)
			HITMAN::AddPos ( &m_tState.m_iHitPos, m_tState.m_iBuildLastStep + m_pTokenizer->GetOvershortCount()*m_iOvershortStep );
			if ( m_pTokenizer->GetBoundary() )
				HITMAN::AddPos ( &m_tState.m_iHitPos, m_iBoundaryStep );
		}

		// sentence/paragraph/zone boundary tokens are handled separately
		if ( BuildZoneHits ( uDocid, sWord ) )
			continue;

		if ( bGlobalPartialMatch )
		{
			// emit the word wrapped in head/tail markers
			int iBytes = strlen ( (const char*)sWord );
			memcpy ( sBuf + 1, sWord, iBytes );
			sBuf[0] = MAGIC_WORD_HEAD;
			sBuf[iBytes+1] = '\0';
			m_tHits.AddHit ( uDocid, m_pDict->GetWordIDWithMarkers ( sBuf ), m_tState.m_iHitPos );
		}

		ESphTokenMorph eMorph = m_pTokenizer->GetTokenMorph();
		if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
		{
			// prepare the exact ("=word") form buffer; it is hashed below
			int iBytes = strlen ( (const char*)sWord );
			memcpy ( sBuf + 1, sWord, iBytes );
			sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
			sBuf[iBytes+1] = '\0';
		}

		if ( m_bIndexExactWords && eMorph==SPH_TOKEN_MORPH_ORIGINAL )
		{
			// can not use GetWordID here due to exception vs missed hit, ie
			// stemmed sWord hasn't got added to hit stream but might be added as exception to dictionary
			// that causes error at hit sorting phase \ dictionary HitblockPatch
			if ( !m_pDict->GetSettings().m_bStopwordsUnstemmed )
				m_pDict->ApplyStemmers ( sWord );

			if ( !m_pDict->IsStopWord ( sWord ) )
				m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );

			m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
			continue;
		}

		SphWordID_t iWord = m_pDict->GetWordID ( sWord );
		if ( iWord )
		{
#if 0
			if ( HITMAN::GetPos ( m_tState.m_iHitPos )==1 )
				printf ( "\n" );
			printf ( "doc %d. pos %d. %s\n", uDocid, HITMAN::GetPos ( m_tState.m_iHitPos ), sWord );
#endif
			iBlendedHitsStart = iLastBlendedStart;
			// blended tokens do not advance the position; their parts share it
			m_tState.m_iBuildLastStep = m_pTokenizer->TokenIsBlended() ? 0 : 1;
			m_tHits.AddHit ( uDocid, iWord, m_tState.m_iHitPos );
			if ( m_bIndexExactWords && eMorph!=SPH_TOKEN_MORPH_GUESS )
				m_tHits.AddHit ( uDocid, m_pDict->GetWordIDNonStemmed ( sBuf ), m_tState.m_iHitPos );
		} else
			m_tState.m_iBuildLastStep = m_iStopwordStep; // stopword; apply the configured position step
	}

	// NULL token means the field was fully tokenized; otherwise we stopped on a full hit buffer
	// and the caller must flush and re-enter
	m_tState.m_bProcessingHits = ( sWord!=NULL );

	// mark trailing hit
	// and compute field lengths
	if ( !bSkipEndMarker && !m_tState.m_bProcessingHits && m_tHits.Length() )
	{
		CSphWordHit * pTail = const_cast < CSphWordHit * > ( m_tHits.Last() );

		if ( m_pFieldLengthAttrs )
			m_pFieldLengthAttrs [ HITMAN::GetField ( pTail->m_uWordPos ) ] = HITMAN::GetPos ( pTail->m_uWordPos );

		Hitpos_t uEndPos = pTail->m_uWordPos;
		if ( iBlendedHitsStart>=0 )
		{
			assert ( iBlendedHitsStart>=0 && iBlendedHitsStart<m_tHits.Length() );
			// the end marker must also cover the head of a trailing blended token run
			Hitpos_t uBlendedPos = ( m_tHits.First() + iBlendedHitsStart )->m_uWordPos;
			uEndPos = Min ( uEndPos, uBlendedPos );
		}

		// set end marker for all tail hits
		const CSphWordHit * pStart = m_tHits.First();
		while ( pStart<=pTail && uEndPos<=pTail->m_uWordPos )
		{
			HITMAN::SetEndMarker ( &pTail->m_uWordPos );
			pTail--;
		}
	}
}
26694 
26695 
/// walk the document fields and generate hits for each; resumable - when the
/// hit buffer fills up mid-field, m_tState.m_bProcessingHits stays set and the
/// next call picks up the same field where tokenizing stopped
void CSphSource_Document::BuildHits ( CSphString & sError, bool bSkipEndMarker )
{
	SphDocID_t uDocid = m_tDocInfo.m_uDocID;

	for ( ; m_tState.m_iField<m_tState.m_iEndField; m_tState.m_iField++ )
	{
		if ( !m_tState.m_bProcessingHits )
		{
			// get that field
			BYTE * sField = m_tState.m_dFields[m_tState.m_iField-m_tState.m_iStartField];
			if ( !sField || !(*sField) )
				continue;

			// load files
			int iFieldBytes;
			const BYTE * sTextToIndex;
			if ( m_tSchema.m_dFields[m_tState.m_iField].m_bFilename )
			{
				// the field value is a file name; index the file contents instead
				LoadFileField ( &sField, sError );
				sTextToIndex = sField;
				iFieldBytes = (int) strlen ( (char*)sField );
				if ( m_pFieldFilter && iFieldBytes )
				{
					// run the field filter; zero result means "keep the original text"
					m_tState.m_dFiltered.Resize ( 0 );
					int iFiltered = m_pFieldFilter->Apply ( sTextToIndex, iFieldBytes, m_tState.m_dFiltered );
					if ( iFiltered )
					{
						sTextToIndex = m_tState.m_dFiltered.Begin();
						iFieldBytes = iFiltered;
					}
				}
			} else
			{
				iFieldBytes = (int) strlen ( (char*)sField );
				sTextToIndex = sField;
			}

			if ( iFieldBytes<=0 )
				continue;

			// strip html
			if ( m_pStripper )
			{
				// note the const cast: stripping modifies the buffer sTextToIndex points at
				m_pStripper->Strip ( (BYTE*)sTextToIndex );
				iFieldBytes = (int) strlen ( (char*)sTextToIndex );
			}

			// tokenize and build hits
			m_tStats.m_iTotalBytes += iFieldBytes;

			m_pTokenizer->BeginField ( m_tState.m_iField );
			m_pTokenizer->SetBuffer ( (BYTE*)sTextToIndex, iFieldBytes );

			m_tState.m_iHitPos = HITMAN::Create ( m_tState.m_iField, m_tState.m_iStartPos );
		}

		const CSphColumnInfo & tField = m_tSchema.m_dFields[m_tState.m_iField];

		// pick the hit generator by the field's wordpart setting
		if ( tField.m_eWordpart!=SPH_WORDPART_WHOLE )
			BuildSubstringHits ( uDocid, tField.m_bPayload, tField.m_eWordpart, bSkipEndMarker );
		else
			BuildRegularHits ( uDocid, tField.m_bPayload, bSkipEndMarker );

		m_pDict->SetApplyMorph ( true );

		// hit buffer filled mid-field; caller must flush and call again
		if ( m_tState.m_bProcessingHits )
			break;
	}

	m_tState.m_bDocumentDone = !m_tState.m_bProcessingHits;
}
26767 
26768 //////////////////////////////////////////////////////////////////////////
26769 
IterateFieldMVAStart(int iAttr)26770 SphRange_t CSphSource_Document::IterateFieldMVAStart ( int iAttr )
26771 {
26772 	SphRange_t tRange;
26773 	tRange.m_iStart = tRange.m_iLength = 0;
26774 
26775 	if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
26776 		return tRange;
26777 
26778 	const CSphColumnInfo & tMva = m_tSchema.GetAttr ( iAttr );
26779 	int uOff = MVA_DOWNSIZE ( m_tDocInfo.GetAttr ( tMva.m_tLocator ) );
26780 	if ( !uOff )
26781 		return tRange;
26782 
26783 	int iCount = m_dMva[uOff];
26784 	assert ( iCount );
26785 
26786 	tRange.m_iStart = uOff+1;
26787 	tRange.m_iLength = iCount;
26788 
26789 	return tRange;
26790 }
26791 
26792 
sphAddMva64(CSphVector<DWORD> & dStorage,int64_t iVal)26793 static int sphAddMva64 ( CSphVector<DWORD> & dStorage, int64_t iVal )
26794 {
26795 	int uOff = dStorage.GetLength();
26796 	dStorage.Resize ( uOff+2 );
26797 	dStorage[uOff] = MVA_DOWNSIZE ( iVal );
26798 	dStorage[uOff+1] = MVA_DOWNSIZE ( ( iVal>>32 ) & 0xffffffff );
26799 	return uOff;
26800 }
26801 
26802 
ParseFieldMVA(CSphVector<DWORD> & dMva,const char * szValue,bool bMva64) const26803 int CSphSource_Document::ParseFieldMVA ( CSphVector < DWORD > & dMva, const char * szValue, bool bMva64 ) const
26804 {
26805 	if ( !szValue )
26806 		return 0;
26807 
26808 	const char * pPtr = szValue;
26809 	const char * pDigit = NULL;
26810 	const int MAX_NUMBER_LEN = 64;
26811 	char szBuf [MAX_NUMBER_LEN];
26812 
26813 	assert ( dMva.GetLength() ); // must not have zero offset
26814 	int uOff = dMva.GetLength();
26815 	dMva.Add ( 0 ); // reserve value for count
26816 
26817 	while ( *pPtr )
26818 	{
26819 		if ( ( *pPtr>='0' && *pPtr<='9' ) || ( bMva64 && *pPtr=='-' ) )
26820 		{
26821 			if ( !pDigit )
26822 				pDigit = pPtr;
26823 		} else
26824 		{
26825 			if ( pDigit )
26826 			{
26827 				if ( pPtr - pDigit < MAX_NUMBER_LEN )
26828 				{
26829 					strncpy ( szBuf, pDigit, pPtr - pDigit );
26830 					szBuf [pPtr - pDigit] = '\0';
26831 					if ( !bMva64 )
26832 						dMva.Add ( sphToDword ( szBuf ) );
26833 					else
26834 						sphAddMva64 ( dMva, sphToInt64 ( szBuf ) );
26835 				}
26836 
26837 				pDigit = NULL;
26838 			}
26839 		}
26840 
26841 		pPtr++;
26842 	}
26843 
26844 	if ( pDigit )
26845 	{
26846 		if ( !bMva64 )
26847 			dMva.Add ( sphToDword ( pDigit ) );
26848 		else
26849 			sphAddMva64 ( dMva, sphToInt64 ( pDigit ) );
26850 	}
26851 
26852 	int iCount = dMva.GetLength()-uOff-1;
26853 	if ( !iCount )
26854 	{
26855 		dMva.Pop(); // remove reserved value for count in case of 0 MVAs
26856 		return 0;
26857 	} else
26858 	{
26859 		dMva[uOff] = iCount;
26860 		return uOff; // return offset to ( count, [value] )
26861 	}
26862 }
26863 
26864 /////////////////////////////////////////////////////////////////////////////
26865 // GENERIC SQL SOURCE
26866 /////////////////////////////////////////////////////////////////////////////
26867 
/// defaults for the generic SQL source parameters:
/// 1024-row range steps, no throttling, no file buffer limit override,
/// ignore unreadable file fields, port 0
CSphSourceParams_SQL::CSphSourceParams_SQL ()
	: m_iRangeStep ( 1024 )
	, m_iRefRangeStep ( 1024 )
	, m_bPrintQueries ( false )
	, m_iRangedThrottle ( 0 )
	, m_iMaxFileBufferSize ( 0 )
	, m_eOnFileFieldError ( FFE_IGNORE_FIELD )
	, m_iPort ( 0 )
{
}
26878 
26879 
/// macro names substituted into ranged queries (see SubstituteParams / RunQueryStep)
const char * const CSphSource_SQL::MACRO_VALUES [ CSphSource_SQL::MACRO_COUNT ] =
{
	"$start",
	"$end"
};
26885 
26886 
/// construct a named, disconnected SQL source with zeroed document id bounds
/// and no unpacking/joined-field state
CSphSource_SQL::CSphSource_SQL ( const char * sName )
	: CSphSource_Document	( sName )
	, m_bSqlConnected		( false )
	, m_uMinID				( 0 )
	, m_uMaxID				( 0 )
	, m_uCurrentID			( 0 )
	, m_uMaxFetchedID		( 0 )
	, m_iMultiAttr			( -1 )
	, m_iSqlFields			( 0 )
	, m_bCanUnpack			( false )
	, m_bUnpackFailed		( false )
	, m_bUnpackOverflow		( false )
	, m_iJoinedHitField		( -1 )
	, m_iJoinedHitID		( 0 )
	, m_iJoinedHitPos		( 0 )
{
}
26904 
26905 
Setup(const CSphSourceParams_SQL & tParams)26906 bool CSphSource_SQL::Setup ( const CSphSourceParams_SQL & tParams )
26907 {
26908 	// checks
26909 	assert ( !tParams.m_sQuery.IsEmpty() );
26910 
26911 	m_tParams = tParams;
26912 
26913 	// defaults
26914 	#define LOC_FIX_NULL(_arg) if ( !m_tParams._arg.cstr() ) m_tParams._arg = "";
26915 	LOC_FIX_NULL ( m_sHost );
26916 	LOC_FIX_NULL ( m_sUser );
26917 	LOC_FIX_NULL ( m_sPass );
26918 	LOC_FIX_NULL ( m_sDB );
26919 	#undef LOC_FIX_NULL
26920 
26921 	#define LOC_FIX_QARRAY(_arg) \
26922 		ARRAY_FOREACH ( i, m_tParams._arg ) \
26923 			if ( m_tParams._arg[i].IsEmpty() ) \
26924 				m_tParams._arg.Remove ( i-- );
26925 	LOC_FIX_QARRAY ( m_dQueryPre );
26926 	LOC_FIX_QARRAY ( m_dQueryPost );
26927 	LOC_FIX_QARRAY ( m_dQueryPostIndex );
26928 	#undef LOC_FIX_QARRAY
26929 
26930 	// build and store default DSN for error reporting
26931 	char sBuf [ 1024 ];
26932 	snprintf ( sBuf, sizeof(sBuf), "sql://%s:***@%s:%d/%s",
26933 		m_tParams.m_sUser.cstr(), m_tParams.m_sHost.cstr(),
26934 		m_tParams.m_iPort, m_tParams.m_sDB.cstr() );
26935 	m_sSqlDSN = sBuf;
26936 
26937 	if ( m_tParams.m_iMaxFileBufferSize > 0 )
26938 		m_iMaxFileBufferSize = m_tParams.m_iMaxFileBufferSize;
26939 	m_eOnFileFieldError = m_tParams.m_eOnFileFieldError;
26940 
26941 	return true;
26942 }
26943 
/// expand $macros in sQuery with the matching values;
/// returns a new[]-allocated string the caller must delete[]
const char * SubstituteParams ( const char * sQuery, const char * const * dMacroses, const char ** dValues, int iMcount )
{
	// OPTIMIZE? things can be precalculated

	// pass 1: measure the expanded length
	int iResLen = 0;
	for ( const char * s = sQuery; *s; )
	{
		int iMacro = -1;
		if ( *s=='$' )
			for ( int i=0; i<iMcount && iMacro<0; i++ )
				if ( strncmp ( dMacroses[i], s, strlen ( dMacroses[i] ) )==0 )
					iMacro = i;

		if ( iMacro>=0 )
		{
			s += strlen ( dMacroses[iMacro] );
			iResLen += strlen ( dValues[iMacro] );
		} else
		{
			s++;
			iResLen++;
		}
	}
	iResLen++; // trailing zero

	// pass 2: interpolate into the result buffer
	char * sRes = new char [ iResLen ];
	char * sOut = sRes;
	for ( const char * s = sQuery; *s; )
	{
		int iMacro = -1;
		if ( *s=='$' )
			for ( int i=0; i<iMcount && iMacro<0; i++ )
				if ( strncmp ( dMacroses[i], s, strlen ( dMacroses[i] ) )==0 )
					iMacro = i;

		if ( iMacro>=0 )
		{
			strcpy ( sOut, dValues[iMacro] ); // NOLINT
			s += strlen ( dMacroses[iMacro] );
			sOut += strlen ( dValues[iMacro] );
		} else
			*sOut++ = *s++;
	}
	*sOut++ = '\0';
	assert ( sOut-sRes==iResLen );
	return sRes;
}
26997 
26998 
/// interpolate $start/$end into the ranged query and run the next range step;
/// returns false with an EMPTY sError when ranging is disabled or the range is
/// exhausted (normal termination), and false with sError set on query failure
bool CSphSource_SQL::RunQueryStep ( const char * sQuery, CSphString & sError )
{
	sError = "";

	if ( m_tParams.m_iRangeStep<=0 )
		return false;
	if ( m_uCurrentID>m_uMaxID )
		return false;

	static const int iBufSize = 32;
	const char * sRes = NULL;

	// optional throttling between ranged query steps
	sphSleepMsec ( m_tParams.m_iRangedThrottle );

	//////////////////////////////////////////////
	// range query with $start/$end interpolation
	//////////////////////////////////////////////

	assert ( m_uMinID>0 );
	assert ( m_uMaxID>0 );
	assert ( m_uMinID<=m_uMaxID );
	assert ( sQuery );

	// format [current, current+step-1] bounds, clamped to the max document id
	char sValues [ MACRO_COUNT ] [ iBufSize ];
	const char * pValues [ MACRO_COUNT ];
	SphDocID_t uNextID = Min ( m_uCurrentID + (SphDocID_t)m_tParams.m_iRangeStep - 1, m_uMaxID );
	snprintf ( sValues[0], iBufSize, DOCID_FMT, m_uCurrentID );
	snprintf ( sValues[1], iBufSize, DOCID_FMT, uNextID );
	pValues[0] = sValues[0];
	pValues[1] = sValues[1];
	// publish the current range to the indexer progress globals
	g_iIndexerCurrentRangeMin = m_uCurrentID;
	g_iIndexerCurrentRangeMax = uNextID;
	m_uCurrentID = 1 + uNextID;

	sRes = SubstituteParams ( sQuery, MACRO_VALUES, pValues, MACRO_COUNT );

	// run query
	SqlDismissResult ();
	bool bRes = SqlQuery ( sRes );

	if ( !bRes )
		sError.SetSprintf ( "sql_range_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );

	// SubstituteParams() returned a new[]-allocated string; free it
	SafeDeleteArray ( sRes );
	return bRes;
}
27045 
/// run the external connect hook; only reports whether the command could be started
static bool HookConnect ( const char* szCommand )
{
	FILE * pPipe = popen ( szCommand, "r" );
	if ( pPipe )
	{
		pclose ( pPipe );
		return true;
	}
	return false;
}
27054 
/// skip leading whitespace in [pBuf,pBufEnd); never reads past pBufEnd
inline static const char* skipspace ( const char* pBuf, const char* pBufEnd )
{
	assert ( pBuf );
	assert ( pBufEnd );

	// cast to unsigned char: passing a plain (possibly negative) char
	// to isspace() is undefined behavior per the C standard
	while ( (pBuf<pBufEnd) && isspace ( (unsigned char)*pBuf ) )
		++pBuf;
	return pBuf;
}
27064 
/// parse an unsigned decimal number from [pBuf,pBufEnd) into *pRes;
/// returns the position right after the last digit consumed
inline static const char* scannumber ( const char* pBuf, const char* pBufEnd, SphDocID_t* pRes )
{
	assert ( pBuf );
	assert ( pBufEnd );
	assert ( pRes );

	if ( pBuf<pBufEnd )
	{
		*pRes = 0;
		// FIXME! could check for overflow
		// bounds check BEFORE dereferencing (the original order read one byte
		// past pBufEnd); also cast to unsigned char, since isdigit() on a
		// negative char is undefined behavior per the C standard
		while ( pBuf<pBufEnd && isdigit ( (unsigned char)*pBuf ) )
			(*pRes) = 10*(*pRes) + (int)( (*pBuf++)-'0' );
	}
	return pBuf;
}
27080 
HookQueryRange(const char * szCommand,SphDocID_t * pMin,SphDocID_t * pMax)27081 static bool HookQueryRange ( const char* szCommand, SphDocID_t* pMin, SphDocID_t* pMax )
27082 {
27083 	FILE * pPipe = popen ( szCommand, "r" );
27084 	if ( !pPipe )
27085 		return false;
27086 
27087 	const int MAX_BUF_SIZE = 1024;
27088 	char dBuf [MAX_BUF_SIZE];
27089 	int iRead = (int)fread ( dBuf, 1, MAX_BUF_SIZE, pPipe );
27090 	pclose ( pPipe );
27091 	const char* pStart = dBuf;
27092 	const char* pEnd = pStart + iRead;
27093 	// leading whitespace and 1-st number
27094 	pStart = skipspace ( pStart, pEnd );
27095 	pStart = scannumber ( pStart, pEnd, pMin );
27096 	// whitespace and 2-nd number
27097 	pStart = skipspace ( pStart, pEnd );
27098 	scannumber ( pStart, pEnd, pMax );
27099 	return true;
27100 }
27101 
HookPostIndex(const char * szCommand,SphDocID_t uLastIndexed)27102 static bool HookPostIndex ( const char* szCommand, SphDocID_t uLastIndexed )
27103 {
27104 	const char * sMacro = "$maxid";
27105 	char sValue[32];
27106 	const char* pValue = sValue;
27107 	snprintf ( sValue, sizeof(sValue), DOCID_FMT, uLastIndexed );
27108 
27109 	const char * pCmd = SubstituteParams ( szCommand, &sMacro, &pValue, 1 );
27110 
27111 	FILE * pPipe = popen ( pCmd, "r" );
27112 	SafeDeleteArray ( pCmd );
27113 	if ( !pPipe )
27114 		return false;
27115 	pclose ( pPipe );
27116 	return true;
27117 }
27118 
27119 /// connect to SQL server
Connect(CSphString & sError)27120 bool CSphSource_SQL::Connect ( CSphString & sError )
27121 {
27122 	// do not connect twice
27123 	if ( m_bSqlConnected )
27124 		return true;
27125 
27126 	// try to connect
27127 	if ( !SqlConnect() )
27128 	{
27129 		sError.SetSprintf ( "sql_connect: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
27130 		return false;
27131 	}
27132 
27133 	m_tHits.m_dData.Reserve ( m_iMaxHits );
27134 
27135 	// all good
27136 	m_bSqlConnected = true;
27137 	if ( !m_tParams.m_sHookConnect.IsEmpty() && !HookConnect ( m_tParams.m_sHookConnect.cstr() ) )
27138 	{
27139 		sError.SetSprintf ( "hook_connect: runtime error %s when running external hook", strerror(errno) );
27140 		return false;
27141 	}
27142 	return true;
27143 }
27144 
27145 
27146 #define LOC_ERROR(_msg,_arg)			{ sError.SetSprintf ( _msg, _arg ); return false; }
27147 #define LOC_ERROR2(_msg,_arg,_arg2)		{ sError.SetSprintf ( _msg, _arg, _arg2 ); return false; }
27148 
27149 /// setup them ranges (called both for document range-queries and MVA range-queries)
bool CSphSource_SQL::SetupRanges ( const char * sRangeQuery, const char * sQuery, const char * sPrefix, CSphString & sError, ERangesReason iReason )
{
	// check step
	if ( m_tParams.m_iRangeStep<=0 )
		LOC_ERROR ( "sql_range_step=" INT64_FMT ": must be non-zero positive", m_tParams.m_iRangeStep );

	if ( m_tParams.m_iRangeStep<128 )
		sphWarn ( "sql_range_step=" INT64_FMT ": too small; might hurt indexing performance!", m_tParams.m_iRangeStep );

	// check query for macros; a ranged fetch query must reference both $start and $end
	for ( int i=0; i<MACRO_COUNT; i++ )
		if ( !strstr ( sQuery, MACRO_VALUES[i] ) )
			LOC_ERROR2 ( "%s: macro '%s' not found in match fetch query", sPrefix, MACRO_VALUES[i] );

	// run query
	if ( !SqlQuery ( sRangeQuery ) )
	{
		sError.SetSprintf ( "%s: range-query failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
		return false;
	}

	// fetch min/max
	int iCols = SqlNumFields ();
	if ( iCols!=2 )
		LOC_ERROR2 ( "%s: expected 2 columns (min_id/max_id), got %d", sPrefix, iCols );

	if ( !SqlFetchRow() )
	{
		sError.SetSprintf ( "%s: range-query fetch failed: %s (DSN=%s)", sPrefix, SqlError(), m_sSqlDSN.cstr() );
		return false;
	}

	if ( ( SqlColumn(0)==NULL || !SqlColumn(0)[0] ) && ( SqlColumn(1)==NULL || !SqlColumn(1)[0] ) )
	{
		// the source seems to be empty; workaround
		m_uMinID = 1;
		m_uMaxID = 1;

	} else
	{
		// get and check min/max id
		// NOTE(review): if exactly one of the two columns is NULL, sphToDocid()
		// receives a NULL pointer here - verify that it tolerates NULL input
		const char * sCol0 = SqlColumn(0);
		const char * sCol1 = SqlColumn(1);
		m_uMinID = sphToDocid ( sCol0 );
		m_uMaxID = sphToDocid ( sCol1 );
		if ( !sCol0 ) sCol0 = "(null)";
		if ( !sCol1 ) sCol1 = "(null)";

		if ( m_uMinID<=0 )
			LOC_ERROR ( "sql_query_range: min_id='%s': must be positive 32/64-bit unsigned integer", sCol0 );
		if ( m_uMaxID<=0 )
			LOC_ERROR ( "sql_query_range: max_id='%s': must be positive 32/64-bit unsigned integer", sCol1 );
		if ( m_uMinID>m_uMaxID )
			LOC_ERROR2 ( "sql_query_range: min_id='%s', max_id='%s': min_id must be less than max_id", sCol0, sCol1 );
	}

	SqlDismissResult ();

	// for document ranges, optionally let an external hook override the id bounds
	if ( iReason==SRE_DOCS && ( !m_tParams.m_sHookQueryRange.IsEmpty() ) )
	{
		if ( !HookQueryRange ( m_tParams.m_sHookQueryRange.cstr(), &m_uMinID, &m_uMaxID ) )
			LOC_ERROR ( "hook_query_range: runtime error %s when running external hook", strerror(errno) );
		if ( m_uMinID<=0 )
			LOC_ERROR ( "hook_query_range: min_id=" DOCID_FMT ": must be positive 32/64-bit unsigned integer", m_uMinID );
		if ( m_uMaxID<=0 )
			LOC_ERROR ( "hook_query_range: max_id=" DOCID_FMT ": must be positive 32/64-bit unsigned integer", m_uMaxID );
		if ( m_uMinID>m_uMaxID )
			LOC_ERROR2 ( "hook_query_range: min_id=" DOCID_FMT ", max_id=" DOCID_FMT ": min_id must be less than max_id", m_uMinID, m_uMaxID );
	}

	return true;
}
27222 
27223 
27224 /// issue main rows fetch query
IterateStart(CSphString & sError)27225 bool CSphSource_SQL::IterateStart ( CSphString & sError )
27226 {
27227 	assert ( m_bSqlConnected );
27228 
27229 	m_iNullIds = false;
27230 	m_iMaxIds = false;
27231 
27232 	// run pre-queries
27233 	ARRAY_FOREACH ( i, m_tParams.m_dQueryPre )
27234 	{
27235 		if ( !SqlQuery ( m_tParams.m_dQueryPre[i].cstr() ) )
27236 		{
27237 			sError.SetSprintf ( "sql_query_pre[%d]: %s (DSN=%s)", i, SqlError(), m_sSqlDSN.cstr() );
27238 			SqlDisconnect ();
27239 			return false;
27240 		}
27241 		SqlDismissResult ();
27242 	}
27243 
27244 	for ( ;; )
27245 	{
27246 		m_tParams.m_iRangeStep = 0;
27247 
27248 		// issue first fetch query
27249 		if ( !m_tParams.m_sQueryRange.IsEmpty() )
27250 		{
27251 			m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;
27252 			// run range-query; setup ranges
27253 			if ( !SetupRanges ( m_tParams.m_sQueryRange.cstr(), m_tParams.m_sQuery.cstr(), "sql_query_range: ", sError, SRE_DOCS ) )
27254 				return false;
27255 
27256 			// issue query
27257 			m_uCurrentID = m_uMinID;
27258 			if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
27259 				return false;
27260 		} else
27261 		{
27262 			// normal query; just issue
27263 			if ( !SqlQuery ( m_tParams.m_sQuery.cstr() ) )
27264 			{
27265 				sError.SetSprintf ( "sql_query: %s (DSN=%s)", SqlError(), m_sSqlDSN.cstr() );
27266 				return false;
27267 			}
27268 		}
27269 		break;
27270 	}
27271 
27272 	// some post-query setup
27273 	m_tSchema.Reset();
27274 
27275 	for ( int i=0; i<SPH_MAX_FIELDS; i++ )
27276 		m_dUnpack[i] = SPH_UNPACK_NONE;
27277 
27278 	m_iSqlFields = SqlNumFields(); // for rowdump
27279 	int iCols = SqlNumFields() - 1; // skip column 0, which must be the id
27280 
27281 	CSphVector<bool> dFound;
27282 	dFound.Resize ( m_tParams.m_dAttrs.GetLength() );
27283 	ARRAY_FOREACH ( i, dFound )
27284 		dFound[i] = false;
27285 
27286 	const bool bWordDict = m_pDict->GetSettings().m_bWordDict;
27287 
27288 	// map plain attrs from SQL
27289 	for ( int i=0; i<iCols; i++ )
27290 	{
27291 		const char * sName = SqlFieldName ( i+1 );
27292 		if ( !sName )
27293 			LOC_ERROR ( "column number %d has no name", i+1 );
27294 
27295 		CSphColumnInfo tCol ( sName );
27296 		ARRAY_FOREACH ( j, m_tParams.m_dAttrs )
27297 			if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dAttrs[j].m_sName.cstr() ) )
27298 		{
27299 			const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[j];
27300 
27301 			tCol.m_eAttrType = tAttr.m_eAttrType;
27302 			assert ( tCol.m_eAttrType!=SPH_ATTR_NONE );
27303 
27304 			if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
27305 				LOC_ERROR ( "multi-valued attribute '%s' of wrong source-type found in query; must be 'field'", tAttr.m_sName.cstr() );
27306 
27307 			tCol = tAttr;
27308 			dFound[j] = true;
27309 			break;
27310 		}
27311 
27312 		ARRAY_FOREACH ( j, m_tParams.m_dFileFields )
27313 		{
27314 			if ( !strcasecmp ( tCol.m_sName.cstr(), m_tParams.m_dFileFields[j].cstr() ) )
27315 				tCol.m_bFilename = true;
27316 		}
27317 
27318 		tCol.m_iIndex = i+1;
27319 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
27320 
27321 		if ( tCol.m_eAttrType==SPH_ATTR_NONE || tCol.m_bIndexed )
27322 		{
27323 			m_tSchema.m_dFields.Add ( tCol );
27324 			ARRAY_FOREACH ( k, m_tParams.m_dUnpack )
27325 			{
27326 				CSphUnpackInfo & tUnpack = m_tParams.m_dUnpack[k];
27327 				if ( tUnpack.m_sName==tCol.m_sName )
27328 				{
27329 					if ( !m_bCanUnpack )
27330 					{
27331 						sError.SetSprintf ( "this source does not support column unpacking" );
27332 						return false;
27333 					}
27334 					int iIndex = m_tSchema.m_dFields.GetLength() - 1;
27335 					if ( iIndex < SPH_MAX_FIELDS )
27336 					{
27337 						m_dUnpack[iIndex] = tUnpack.m_eFormat;
27338 						m_dUnpackBuffers[iIndex].Resize ( SPH_UNPACK_BUFFER_SIZE );
27339 					}
27340 					break;
27341 				}
27342 			}
27343 		}
27344 
27345 		if ( tCol.m_eAttrType!=SPH_ATTR_NONE )
27346 		{
27347 			if ( CSphSchema::IsReserved ( tCol.m_sName.cstr() ) )
27348 				LOC_ERROR ( "%s is not a valid attribute name", tCol.m_sName.cstr() );
27349 
27350 			m_tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
27351 		}
27352 	}
27353 
27354 	// map multi-valued attrs
27355 	ARRAY_FOREACH ( i, m_tParams.m_dAttrs )
27356 	{
27357 		const CSphColumnInfo & tAttr = m_tParams.m_dAttrs[i];
27358 		if ( ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) && tAttr.m_eSrc!=SPH_ATTRSRC_FIELD )
27359 		{
27360 			CSphColumnInfo tMva;
27361 			tMva = tAttr;
27362 			tMva.m_iIndex = m_tSchema.GetAttrsCount();
27363 
27364 			if ( CSphSchema::IsReserved ( tMva.m_sName.cstr() ) )
27365 				LOC_ERROR ( "%s is not a valid attribute name", tMva.m_sName.cstr() );
27366 
27367 			m_tSchema.AddAttr ( tMva, true ); // all attributes are dynamic at indexing time
27368 			dFound[i] = true;
27369 		}
27370 	}
27371 
27372 	// warn if some attrs went unmapped
27373 	ARRAY_FOREACH ( i, dFound )
27374 		if ( !dFound[i] )
27375 			sphWarn ( "attribute '%s' not found - IGNORING", m_tParams.m_dAttrs[i].m_sName.cstr() );
27376 
27377 	// joined fields
27378 	m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength();
27379 
27380 	ARRAY_FOREACH ( i, m_tParams.m_dJoinedFields )
27381 	{
27382 		CSphColumnInfo tCol;
27383 		tCol.m_iIndex = -1;
27384 		tCol.m_sName = m_tParams.m_dJoinedFields[i].m_sName;
27385 		tCol.m_sQuery = m_tParams.m_dJoinedFields[i].m_sQuery;
27386 		tCol.m_bPayload = m_tParams.m_dJoinedFields[i].m_bPayload;
27387 		tCol.m_eSrc = m_tParams.m_dJoinedFields[i].m_sRanged.IsEmpty() ? SPH_ATTRSRC_QUERY : SPH_ATTRSRC_RANGEDQUERY;
27388 		tCol.m_sQueryRange = m_tParams.m_dJoinedFields[i].m_sRanged;
27389 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
27390 		m_tSchema.m_dFields.Add ( tCol );
27391 	}
27392 
27393 	// auto-computed length attributes
27394 	if ( !AddAutoAttrs ( sError ) )
27395 		return false;
27396 
27397 	// alloc storage
27398 	AllocDocinfo();
27399 
27400 	// check it
27401 	if ( m_tSchema.m_dFields.GetLength()>SPH_MAX_FIELDS )
27402 		LOC_ERROR2 ( "too many fields (fields=%d, max=%d)",
27403 			m_tSchema.m_dFields.GetLength(), SPH_MAX_FIELDS );
27404 
27405 	// log it
27406 	if ( m_fpDumpRows )
27407 	{
27408 		const char * sTable = m_tSchema.m_sName.cstr();
27409 
27410 		time_t iNow = time ( NULL );
27411 		fprintf ( m_fpDumpRows, "#\n# === source %s ts %d\n# %s#\n", sTable, (int)iNow, ctime ( &iNow ) );
27412 		ARRAY_FOREACH ( i, m_tSchema.m_dFields )
27413 			fprintf ( m_fpDumpRows, "# field %d: %s\n", i, m_tSchema.m_dFields[i].m_sName.cstr() );
27414 
27415 		for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
27416 		{
27417 			const CSphColumnInfo & tCol = m_tSchema.GetAttr(i);
27418 			fprintf ( m_fpDumpRows, "# %s = %s # attr %d\n", sphTypeDirective ( tCol.m_eAttrType ), tCol.m_sName.cstr(), i );
27419 		}
27420 
27421 		fprintf ( m_fpDumpRows, "#\n\nDROP TABLE IF EXISTS rows_%s;\nCREATE TABLE rows_%s (\n  id VARCHAR(32) NOT NULL,\n",
27422 			sTable, sTable );
27423 		for ( int i=1; i<m_iSqlFields; i++ )
27424 			fprintf ( m_fpDumpRows, "  %s VARCHAR(4096) NOT NULL,\n", SqlFieldName(i) );
27425 		fprintf ( m_fpDumpRows, "  KEY(id) );\n\n" );
27426 	}
27427 
27428 	return true;
27429 }
27430 
27431 #undef LOC_ERROR
27432 #undef LOC_ERROR2
27433 #undef LOC_SQL_ERROR
27434 
27435 
Disconnect()27436 void CSphSource_SQL::Disconnect ()
27437 {
27438 	SafeDeleteArray ( m_pReadFileBuffer );
27439 	m_tHits.m_dData.Reset();
27440 
27441 	if ( m_iNullIds )
27442 		sphWarn ( "source %s: skipped %d document(s) with zero/NULL ids", m_tSchema.m_sName.cstr(), m_iNullIds );
27443 
27444 	if ( m_iMaxIds )
27445 		sphWarn ( "source %s: skipped %d document(s) with DOCID_MAX ids", m_tSchema.m_sName.cstr(), m_iMaxIds );
27446 
27447 	m_iNullIds = 0;
27448 	m_iMaxIds = 0;
27449 
27450 	if ( m_bSqlConnected )
27451 		SqlDisconnect ();
27452 	m_bSqlConnected = false;
27453 }
27454 
27455 
// Fetch the next document row from the SQL result set and split it into
// fields and attributes.
// Returns m_dFields (per-field text pointers) on success, or NULL on both
// eof and error; the two are distinguished via m_tDocInfo.m_uDocID
// (0 = legal eof, non-zero with sError filled = error).
BYTE ** CSphSource_SQL::NextDocument ( CSphString & sError )
{
	assert ( m_bSqlConnected );

	// get next non-zero-id row
	do
	{
		// try to get next row
		bool bGotRow = SqlFetchRow ();

		// when the party's over...
		while ( !bGotRow )
		{
			// is that an error?
			if ( SqlIsError() )
			{
				sError.SetSprintf ( "sql_fetch_row: %s", SqlError() );
				m_tDocInfo.m_uDocID = 1; // 0 means legal eof
				return NULL;
			}

			// maybe we can do next step yet? (ranged queries run in several steps)
			if ( !RunQueryStep ( m_tParams.m_sQuery.cstr(), sError ) )
			{
				// if there's a message, there's an error
				// otherwise, we're just over
				if ( !sError.IsEmpty() )
				{
					m_tDocInfo.m_uDocID = 1; // 0 means legal eof
					return NULL;
				}

			} else
			{
				// step went fine; try to fetch
				bGotRow = SqlFetchRow ();
				continue;
			}

			SqlDismissResult ();

			// ok, we're over; run the configured post-queries (best-effort, warn on failure)
			ARRAY_FOREACH ( i, m_tParams.m_dQueryPost )
			{
				if ( !SqlQuery ( m_tParams.m_dQueryPost[i].cstr() ) )
				{
					sphWarn ( "sql_query_post[%d]: error=%s, query=%s",
						i, SqlError(), m_tParams.m_dQueryPost[i].cstr() );
					break;
				}
				SqlDismissResult ();
			}

			m_tDocInfo.m_uDocID = 0; // 0 means legal eof
			return NULL;
		}

		// get him! (VerifyID maps zero/NULL and DOCID_MAX ids to 0, which makes us loop)
		m_tDocInfo.m_uDocID = VerifyID ( sphToDocid ( SqlColumn(0) ) );
		m_uMaxFetchedID = Max ( m_uMaxFetchedID, m_tDocInfo.m_uDocID );
	} while ( !m_tDocInfo.m_uDocID );

	// cleanup attrs; zero out the dynamic row before filling it
	for ( int i=0; i<m_tSchema.GetRowSize(); i++ )
		m_tDocInfo.m_pDynamic[i] = 0;

	// split columns into fields and attrs
	for ( int i=0; i<m_iPlainFieldsLength; i++ )
	{
		// get that field; unpack compressed columns first when configured
		#if USE_ZLIB
		if ( m_dUnpack[i]!=SPH_UNPACK_NONE )
		{
			m_dFields[i] = (BYTE*) SqlUnpackColumn ( i, m_dUnpack[i] );
			continue;
		}
		#endif
		m_dFields[i] = (BYTE*) SqlColumn ( m_tSchema.m_dFields[i].m_iIndex );
	}

	// convert and store attribute values into the document row
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		const CSphColumnInfo & tAttr = m_tSchema.GetAttr(i); // shortcut

		if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
		{
			// MVAs sourced from a field are parsed inline; query-sourced ones store 0 here
			int uOff = 0;
			if ( tAttr.m_eSrc==SPH_ATTRSRC_FIELD )
			{
				uOff = ParseFieldMVA ( m_dMva, SqlColumn ( tAttr.m_iIndex ), tAttr.m_eAttrType==SPH_ATTR_INT64SET );
			}
			m_tDocInfo.SetAttr ( tAttr.m_tLocator, uOff );
			continue;
		}

		switch ( tAttr.m_eAttrType )
		{
			case SPH_ATTR_STRING:
			case SPH_ATTR_JSON:
				// memorize string, fixup NULLs
				m_dStrAttrs[i] = SqlColumn ( tAttr.m_iIndex );
				if ( !m_dStrAttrs[i].cstr() )
					m_dStrAttrs[i] = "";

				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
				break;

			case SPH_ATTR_FLOAT:
				m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;

			case SPH_ATTR_BIGINT:
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;

			case SPH_ATTR_TOKENCOUNT:
				// reset, and the value will be filled by IterateHits()
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
				break;

			default:
				// just store as uint by default
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( SqlColumn ( tAttr.m_iIndex ) ) ); // FIXME? report conversion errors maybe?
				break;
		}
	}

	// log it (rowdump mode: emit the raw row as an INSERT statement)
	if ( m_fpDumpRows )
	{
		fprintf ( m_fpDumpRows, "INSERT INTO rows_%s VALUES (", m_tSchema.m_sName.cstr() );
		for ( int i=0; i<m_iSqlFields; i++ )
		{
			if ( i )
				fprintf ( m_fpDumpRows, ", " );
			FormatEscaped ( m_fpDumpRows, SqlColumn(i) );
		}
		fprintf ( m_fpDumpRows, ");\n" );
	}

	return m_dFields;
}
27598 
27599 
// Run the configured sql_query_post_index statements and/or the post-index
// hook after indexing is complete. Reconnects to the database for the
// duration of the post-index queries; all failures are warnings, not errors.
void CSphSource_SQL::PostIndex ()
{
	// nothing configured? bail out early
	if ( ( !m_tParams.m_dQueryPostIndex.GetLength() ) && m_tParams.m_sHookPostIndex.IsEmpty() )
		return;

	assert ( !m_bSqlConnected );

	const char * sSqlError = NULL;
	if ( m_tParams.m_dQueryPostIndex.GetLength() )
	{
// record the failing stage and bail out of the enclosing loop
#define LOC_SQL_ERROR(_msg) { sSqlError = _msg; break; }

		// single-pass for(;;) so that LOC_SQL_ERROR's break exits the whole sequence
		for ( ;; )
		{
			if ( !SqlConnect () )
				LOC_SQL_ERROR ( "mysql_real_connect" );

			ARRAY_FOREACH ( i, m_tParams.m_dQueryPostIndex )
			{
				// substitute $maxid with the biggest document id actually fetched
				char * sQuery = sphStrMacro ( m_tParams.m_dQueryPostIndex[i].cstr(), "$maxid", m_uMaxFetchedID );
				bool bRes = SqlQuery ( sQuery );
				delete [] sQuery;

				if ( !bRes )
					LOC_SQL_ERROR ( "sql_query_post_index" );

				SqlDismissResult ();
			}

			break;
		}

		if ( sSqlError )
			sphWarn ( "%s: %s (DSN=%s)", sSqlError, SqlError(), m_sSqlDSN.cstr() );

#undef LOC_SQL_ERROR

		SqlDisconnect ();
	}
	// external hook runs regardless of how the post-index queries went
	if ( !m_tParams.m_sHookPostIndex.IsEmpty() && !HookPostIndex ( m_tParams.m_sHookPostIndex.cstr(), m_uMaxFetchedID ) )
	{
		sphWarn ( "hook_post_index: runtime error %s when running external hook", strerror(errno) );
	}
}
27644 
27645 
// Begin iterating a query-sourced MVA attribute: fire its (possibly ranged)
// SQL query and validate the result shape. Returns false for field-sourced
// MVAs and non-MVA attributes (nothing to iterate), or on query failure
// (with sError filled).
bool CSphSource_SQL::IterateMultivaluedStart ( int iAttr, CSphString & sError )
{
	if ( iAttr<0 || iAttr>=m_tSchema.GetAttrsCount() )
		return false;

	m_iMultiAttr = iAttr;
	const CSphColumnInfo & tAttr = m_tSchema.GetAttr(iAttr);

	// only MVA attributes are iterated this way
	if ( !(tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET ) )
		return false;

	CSphString sPrefix;
	switch ( tAttr.m_eSrc )
	{
	case SPH_ATTRSRC_FIELD:
		// field-sourced MVAs are parsed inline during NextDocument(), not here
		return false;

	case SPH_ATTRSRC_QUERY:
		// run simple query
		if ( !SqlQuery ( tAttr.m_sQuery.cstr() ) )
		{
			sError.SetSprintf ( "multi-valued attr '%s' query failed: %s", tAttr.m_sName.cstr(), SqlError() );
			return false;
		}
		break;

	case SPH_ATTRSRC_RANGEDQUERY:
			m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;

			// setup ranges
			sPrefix.SetSprintf ( "multi-valued attr '%s' ranged query: ", tAttr.m_sName.cstr() );
			if ( !SetupRanges ( tAttr.m_sQueryRange.cstr(), tAttr.m_sQuery.cstr(), sPrefix.cstr(), sError, SRE_MVA ) )
				return false;

			// run first step (in order to report errors)
			m_uCurrentID = m_uMinID;
			if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sError ) )
				return false;

			break;

	default:
		sError.SetSprintf ( "INTERNAL ERROR: unknown multi-valued attr source type %d", tAttr.m_eSrc );
		return false;
	}

	// check fields count; the query must return exactly (docid, value) pairs
	if ( SqlNumFields()!=2 )
	{
		sError.SetSprintf ( "multi-valued attr '%s' query returned %d fields (expected 2)", tAttr.m_sName.cstr(), SqlNumFields() );
		SqlDismissResult ();
		return false;
	}
	return true;
}
27701 
27702 
IterateMultivaluedNext()27703 bool CSphSource_SQL::IterateMultivaluedNext ()
27704 {
27705 	const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( m_iMultiAttr );
27706 
27707 	assert ( m_bSqlConnected );
27708 	assert ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET );
27709 
27710 	// fetch next row
27711 	bool bGotRow = SqlFetchRow ();
27712 	while ( !bGotRow )
27713 	{
27714 		if ( SqlIsError() )
27715 			sphDie ( "sql_fetch_row: %s", SqlError() ); // FIXME! this should be reported
27716 
27717 		if ( tAttr.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
27718 		{
27719 			SqlDismissResult();
27720 			return false;
27721 		}
27722 
27723 		CSphString sTmp;
27724 		if ( !RunQueryStep ( tAttr.m_sQuery.cstr(), sTmp ) ) // FIXME! this should be reported
27725 			return false;
27726 
27727 		bGotRow = SqlFetchRow ();
27728 		continue;
27729 	}
27730 
27731 	// return that tuple or offset to storage for MVA64 value
27732 	m_tDocInfo.m_uDocID = sphToDocid ( SqlColumn(0) );
27733 	m_dMva.Resize ( 0 );
27734 	if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET )
27735 		m_dMva.Add ( sphToDword ( SqlColumn(1) ) );
27736 	else
27737 		sphAddMva64 ( m_dMva, sphToInt64 ( SqlColumn(1) ) );
27738 
27739 	return true;
27740 }
27741 
27742 
IterateKillListStart(CSphString & sError)27743 bool CSphSource_SQL::IterateKillListStart ( CSphString & sError )
27744 {
27745 	if ( m_tParams.m_sQueryKilllist.IsEmpty () )
27746 		return false;
27747 
27748 	if ( !SqlQuery ( m_tParams.m_sQueryKilllist.cstr () ) )
27749 	{
27750 		sError.SetSprintf ( "killlist query failed: %s", SqlError() );
27751 		return false;
27752 	}
27753 
27754 	return true;
27755 }
27756 
27757 
IterateKillListNext(SphDocID_t & uDocId)27758 bool CSphSource_SQL::IterateKillListNext ( SphDocID_t & uDocId )
27759 {
27760 	if ( SqlFetchRow () )
27761 		uDocId = sphToDocid ( SqlColumn(0) );
27762 	else
27763 	{
27764 		if ( SqlIsError() )
27765 			sphDie ( "sql_query_killlist: %s", SqlError() ); // FIXME! this should be reported
27766 		else
27767 		{
27768 			SqlDismissResult ();
27769 			return false;
27770 		}
27771 	}
27772 
27773 	return true;
27774 }
27775 
27776 
ReportUnpackError(int iIndex,int iError)27777 void CSphSource_SQL::ReportUnpackError ( int iIndex, int iError )
27778 {
27779 	if ( !m_bUnpackFailed )
27780 	{
27781 		m_bUnpackFailed = true;
27782 		sphWarn ( "failed to unpack column '%s', error=%d, docid=" DOCID_FMT, SqlFieldName(iIndex), iError, m_tDocInfo.m_uDocID );
27783 	}
27784 }
27785 
27786 
27787 #if !USE_ZLIB
27788 
// No zlib support compiled in: "unpacking" degrades to returning the raw column text.
const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat )
{
	return SqlColumn ( m_tSchema.m_dFields[iFieldIndex].m_iIndex );
}
27793 
27794 #else
27795 
// Decompress a packed column value into the per-field unpack buffer and
// return a pointer to the NUL-terminated plain text, or NULL on failure
// (failures are reported once via warnings, not errors).
// Supported formats: MySQL COMPRESS() payloads and raw zlib streams.
const char * CSphSource_SQL::SqlUnpackColumn ( int iFieldIndex, ESphUnpackFormat eFormat )
{
	int iIndex = m_tSchema.m_dFields[iFieldIndex].m_iIndex;
	const char * pData = SqlColumn(iIndex);

	if ( pData==NULL )
		return NULL;

	int iPackedLen = SqlColumnLength(iIndex);
	if ( iPackedLen<=0 )
		return NULL;


	// per-field buffer, pre-sized to SPH_UNPACK_BUFFER_SIZE at setup
	CSphVector<char> & tBuffer = m_dUnpackBuffers[iFieldIndex];
	switch ( eFormat )
	{
		case SPH_UNPACK_MYSQL_COMPRESS:
		{
			// MySQL COMPRESS() format: 4-byte little-endian uncompressed length,
			// then a zlib stream; anything shorter cannot be valid
			if ( iPackedLen<=4 )
			{
				if ( !m_bUnpackFailed )
				{
					m_bUnpackFailed = true;
					sphWarn ( "failed to unpack '%s', invalid column size (size=%d), "
						"docid=" DOCID_FMT, SqlFieldName(iIndex), iPackedLen, m_tDocInfo.m_uDocID );
				}
				return NULL;
			}

			// decode the length header; top 2 bits are masked off
			unsigned long uSize = 0;
			for ( int i=0; i<4; i++ )
				uSize += ((unsigned long)((BYTE)pData[i])) << ( 8*i );
			uSize &= 0x3FFFFFFF;

			// enforce the configured memory limit before allocating
			if ( uSize > m_tParams.m_uUnpackMemoryLimit )
			{
				if ( !m_bUnpackOverflow )
				{
					m_bUnpackOverflow = true;
					sphWarn ( "failed to unpack '%s', column size limit exceeded (size=%d),"
						" docid=" DOCID_FMT, SqlFieldName(iIndex), (int)uSize, m_tDocInfo.m_uDocID );
				}
				return NULL;
			}

			// one-shot decompress; +1 byte for the terminating NUL
			int iResult;
			tBuffer.Resize ( uSize + 1 );
			unsigned long uLen = iPackedLen-4;
			iResult = uncompress ( (Bytef *)tBuffer.Begin(), &uSize, (Bytef *)pData + 4, uLen );
			if ( iResult==Z_OK )
			{
				tBuffer[uSize] = 0;
				return &tBuffer[0];
			} else
				ReportUnpackError ( iIndex, iResult );
			return NULL;
		}

		case SPH_UNPACK_ZLIB:
		{
			// raw zlib stream with unknown uncompressed size;
			// inflate incrementally, doubling the buffer as needed
			char * sResult = 0;
			int iBufferOffset = 0;
			int iResult;

			z_stream tStream;
			tStream.zalloc = Z_NULL;
			tStream.zfree = Z_NULL;
			tStream.opaque = Z_NULL;
			tStream.avail_in = iPackedLen;
			tStream.next_in = (Bytef *)SqlColumn(iIndex);

			iResult = inflateInit ( &tStream );
			if ( iResult!=Z_OK )
				return NULL;

			for ( ;; )
			{
				// leave one byte of headroom for the terminating NUL
				tStream.next_out = (Bytef *)&tBuffer[iBufferOffset];
				tStream.avail_out = tBuffer.GetLength() - iBufferOffset - 1;

				iResult = inflate ( &tStream, Z_NO_FLUSH );
				if ( iResult==Z_STREAM_END )
				{
					tBuffer [ tStream.total_out ] = 0;
					sResult = &tBuffer[0];
					break;
				} else if ( iResult==Z_OK )
				{
					// output space exhausted; grow and continue where we left off
					assert ( tStream.avail_out==0 );

					tBuffer.Resize ( tBuffer.GetLength()*2 );
					iBufferOffset = tStream.total_out;
				} else
				{
					ReportUnpackError ( iIndex, iResult );
					break;
				}
			}

			inflateEnd ( &tStream );
			return sResult;
		}

		case SPH_UNPACK_NONE:
			return pData;
	}
	return NULL;
}
27904 #endif // USE_ZLIB
27905 
27906 
// Produce hits for the joined fields, one batch per call.
// Walks all joined fields starting at m_iPlainFieldsLength, running each
// field's (possibly ranged) query and building hits for every returned row
// whose docid was seen in the main query. Returns the accumulated hits
// (docid 0 signals eof), or NULL on error with sError filled.
ISphHits * CSphSource_SQL::IterateJoinedHits ( CSphString & sError )
{
	// iterating of joined hits happens after iterating hits from main query
	// so we may be sure at this moment no new IDs will be put in m_dAllIds
	if ( !m_bIdsSorted )
	{
		m_dAllIds.Uniq();
		m_bIdsSorted = true;
	}
	m_tHits.m_dData.Resize ( 0 );

	// eof check
	if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
	{
		m_tDocInfo.m_uDocID = 0;
		return &m_tHits;
	}

	bool bProcessingRanged = true;

	// my fetch loop
	while ( m_iJoinedHitField<m_tSchema.m_dFields.GetLength() )
	{
		// either resume hit-building for a partially processed row, or fetch a new one
		if ( m_tState.m_bProcessingHits || SqlFetchRow() )
		{
			// next row
			m_tDocInfo.m_uDocID = sphToDocid ( SqlColumn(0) ); // FIXME! handle conversion errors and zero/max values?

			// lets skip joined document totally if there was no such document ID returned by main query
			if ( !m_dAllIds.BinarySearch ( m_tDocInfo.m_uDocID ) )
				continue;

			// field start? restart ids
			if ( !m_iJoinedHitID )
				m_iJoinedHitID = m_tDocInfo.m_uDocID;

			// docid asc requirement violated? report an error
			if ( m_iJoinedHitID>m_tDocInfo.m_uDocID )
			{
				sError.SetSprintf ( "joined field '%s': query MUST return document IDs in ASC order",
					m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr() );
				return NULL;
			}

			// next document? update tracker, reset position
			if ( m_iJoinedHitID<m_tDocInfo.m_uDocID )
			{
				m_iJoinedHitID = m_tDocInfo.m_uDocID;
				m_iJoinedHitPos = 0;
			}

			// fresh row (not a resumed one)? (re)initialize the hit-building state
			if ( !m_tState.m_bProcessingHits )
			{
				m_tState = CSphBuildHitsState_t();
				m_tState.m_iField = m_iJoinedHitField;
				m_tState.m_iStartField = m_iJoinedHitField;
				m_tState.m_iEndField = m_iJoinedHitField+1;

				// payload fields carry an explicit position in column 2
				if ( m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload )
					m_tState.m_iStartPos = sphToDword ( SqlColumn(2) );
				else
					m_tState.m_iStartPos = m_iJoinedHitPos;
			}

			// build those hits
			BYTE * dText[] = { (BYTE *)SqlColumn(1) };
			m_tState.m_dFields = dText;

			BuildHits ( sError, true );

			// update current position
			if ( !m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload && !m_tState.m_bProcessingHits && m_tHits.Length() )
				m_iJoinedHitPos = HITMAN::GetPos ( m_tHits.Last()->m_uWordPos );

			// hits buffer full; return this batch and resume on the next call
			if ( m_tState.m_bProcessingHits )
				break;
		} else if ( SqlIsError() )
		{
			// error while fetching row
			sError = SqlError();
			return NULL;

		} else
		{
			// no more rows for the current query; advance to the next range step or field
			int iLastField = m_iJoinedHitField;
			bool bRanged = ( m_iJoinedHitField>=m_iPlainFieldsLength && m_iJoinedHitField<m_tSchema.m_dFields.GetLength()
				&& m_tSchema.m_dFields[m_iJoinedHitField].m_eSrc==SPH_ATTRSRC_RANGEDQUERY );

			// current field is over, continue to next field
			if ( m_iJoinedHitField<0 )
				m_iJoinedHitField = m_iPlainFieldsLength;
			else if ( !bRanged || !bProcessingRanged )
				m_iJoinedHitField++;

			// eof check
			if ( m_iJoinedHitField>=m_tSchema.m_dFields.GetLength() )
			{
				m_tDocInfo.m_uDocID = ( m_tHits.Length() ? 1 : 0 ); // to eof or not to eof
				return &m_tHits;
			}

			SqlDismissResult ();

			bProcessingRanged = false;
			bool bCheckNumFields = true;
			CSphColumnInfo & tJoined = m_tSchema.m_dFields[m_iJoinedHitField];

			// start fetching next field
			if ( tJoined.m_eSrc!=SPH_ATTRSRC_RANGEDQUERY )
			{
				if ( !SqlQuery ( tJoined.m_sQuery.cstr() ) )
				{
					sError = SqlError();
					return NULL;
				}
			} else
			{
				m_tParams.m_iRangeStep = m_tParams.m_iRefRangeStep;

				// setup ranges for next field
				if ( iLastField!=m_iJoinedHitField )
				{
					CSphString sPrefix;
					sPrefix.SetSprintf ( "joined field '%s' ranged query: ", tJoined.m_sName.cstr() );
					if ( !SetupRanges ( tJoined.m_sQueryRange.cstr(), tJoined.m_sQuery.cstr(), sPrefix.cstr(), sError, SRE_JOINEDHITS ) )
						return NULL;

					m_uCurrentID = m_uMinID;
				}

				// run first step (in order to report errors)
				bool bRes = RunQueryStep ( tJoined.m_sQuery.cstr(), sError );
				bProcessingRanged = bRes; // select next documents in range or loop once to process next field
				bCheckNumFields = bRes;

				if ( !sError.IsEmpty() )
					return NULL;
			}

			// validate the result shape: (docid, text) plus a position column for payload fields
			const int iExpected = m_tSchema.m_dFields[m_iJoinedHitField].m_bPayload ? 3 : 2;
			if ( bCheckNumFields && SqlNumFields()!=iExpected )
			{
				const char * sName = m_tSchema.m_dFields[m_iJoinedHitField].m_sName.cstr();
				sError.SetSprintf ( "joined field '%s': query MUST return exactly %d columns, got %d", sName, iExpected, SqlNumFields() );
				return NULL;
			}

			m_iJoinedHitID = 0;
			m_iJoinedHitPos = 0;
		}
	}

	return &m_tHits;
}
28061 
28062 /////////////////////////////////////////////////////////////////////////////
28063 // MYSQL SOURCE
28064 /////////////////////////////////////////////////////////////////////////////
28065 #if USE_MYSQL
28066 #if DL_MYSQL
28067 
// shared library to load the MySQL client from at runtime (DL_MYSQL mode)
#ifndef MYSQL_LIB
#define MYSQL_LIB "libmysqlclient.so"
#endif

// number of mysql_* entry points we resolve dynamically
#define MYSQL_NUM_FUNCS (15)

#if defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) || defined(__GNUC__)

// use the non-standard compiler extension __typeof__;
// it allows declaring a pointer to each function without
// duplicating the function's declaration from mysql.h
typedef __typeof__ ( mysql_free_result ) *xmysql_free_result;
typedef __typeof__ ( mysql_next_result ) *xmysql_next_result;
typedef __typeof__ ( mysql_use_result ) *xmysql_use_result;
typedef __typeof__ ( mysql_num_rows ) *xmysql_num_rows;
typedef __typeof__ ( mysql_query ) *xmysql_query;
typedef __typeof__ ( mysql_errno ) *xmysql_errno;
typedef __typeof__ ( mysql_error ) *xmysql_error;
typedef __typeof__ ( mysql_init ) *xmysql_init;
typedef __typeof__ ( mysql_ssl_set ) *xmysql_ssl_set;
typedef __typeof__ ( mysql_real_connect ) *xmysql_real_connect;
typedef __typeof__ ( mysql_close ) *xmysql_close;
typedef __typeof__ ( mysql_num_fields ) *xmysql_num_fields;
typedef __typeof__ ( mysql_fetch_row ) *xmysql_fetch_row;
typedef __typeof__ ( mysql_fetch_fields ) *xmysql_fetch_fields;
typedef __typeof__ ( mysql_fetch_lengths ) *xmysql_fetch_lengths;

#else // compilers that are not known to support __typeof__

// declarations below are directly copy-pasted from mysql.h,
// and then (*x...) is placed around the function names.
// In most cases this code will not be used, and the declarations
// from the previous block will be used instead.
#warning Be sure that the mysql function signatures are the same \
as in mysql.h. Correct the code below if this is not so.

typedef void STDCALL (*xmysql_free_result)(MYSQL_RES *result); //NOLINT
typedef int STDCALL (*xmysql_next_result)(MYSQL *mysql); //NOLINT
typedef MYSQL_RES * STDCALL (*xmysql_use_result)(MYSQL *mysql); //NOLINT
typedef my_ulonglong STDCALL (*xmysql_num_rows)(MYSQL_RES *res); //NOLINT
typedef int     STDCALL (*xmysql_query)(MYSQL *mysql, const char *q); //NOLINT
typedef unsigned int STDCALL (*xmysql_errno)(MYSQL *mysql); //NOLINT
typedef const char * STDCALL (*xmysql_error)(MYSQL *mysql); //NOLINT
typedef MYSQL *		STDCALL (*xmysql_init)(MYSQL *mysql); //NOLINT
typedef my_bool		STDCALL (*xmysql_ssl_set)(MYSQL *mysql, const char *key, //NOLINT
				      const char *cert, const char *ca, //NOLINT
				      const char *capath, const char *cipher); //NOLINT
typedef MYSQL *     STDCALL (*xmysql_real_connect)(MYSQL *mysql, const char *host, //NOLINT
                      const char *user, //NOLINT
                      const char *passwd, //NOLINT
                      const char *db, //NOLINT
                      unsigned int port, //NOLINT
                      const char *unix_socket, //NOLINT
                      unsigned long clientflag); //NOLINT
typedef void STDCALL (*xmysql_close)(MYSQL *sock); //NOLINT
typedef unsigned int STDCALL (*xmysql_num_fields)(MYSQL_RES *res); //NOLINT
typedef MYSQL_ROW   STDCALL (*xmysql_fetch_row)(MYSQL_RES *result); //NOLINT
typedef MYSQL_FIELD * STDCALL (*xmysql_fetch_fields)(MYSQL_RES *res); //NOLINT
typedef unsigned long * STDCALL (*xmysql_fetch_lengths)(MYSQL_RES *result); //NOLINT
#endif
28127 
// Loads libmysqlclient at runtime and resolves the mysql_* entry points
// into the static function pointers below. Until Init() succeeds, every
// pointer targets Stub(), which only logs an error.
class CMysql : public CSphDynamicLibrary
{
	// symbol names and the matching pointer slots, kept in the same order
	static const char* sFuncs[MYSQL_NUM_FUNCS];
	static void** pFuncs[MYSQL_NUM_FUNCS];

public:
	// load the shared library and resolve all symbols; false on any failure
	bool Init()
	{
		if ( !CSphDynamicLibrary::Init ( MYSQL_LIB, true ) )
			return false;
		if ( !LoadSymbols ( sFuncs, pFuncs, MYSQL_NUM_FUNCS ) )
			return false;
		return true;
	}
	// placeholder target for unresolved function pointers; just logs
	static void 	STDCALL	Stub()
	{
		sphLogDebug ( "Error! Mysql func is null!" );
	}
	static xmysql_free_result m_pmysql_free_result;
	static xmysql_next_result m_pmysql_next_result;
	static xmysql_use_result m_pmysql_use_result;
	static xmysql_num_rows m_pmysql_num_rows;
	static xmysql_query m_pmysql_query;
	static xmysql_errno m_pmysql_errno;
	static xmysql_error m_pmysql_error;
	static xmysql_init m_pmysql_init;
	static xmysql_ssl_set m_pmysql_ssl_set;
	static xmysql_real_connect m_pmysql_real_connect;
	static xmysql_close m_pmysql_close;
	static xmysql_num_fields m_pmysql_num_fields;
	static xmysql_fetch_row m_pmysql_fetch_row;
	static xmysql_fetch_fields m_pmysql_fetch_fields;
	static xmysql_fetch_lengths m_pmysql_fetch_lengths;
};
28162 
28163 #define sph_mysql_free_result (*CMysql::m_pmysql_free_result)
28164 #define sph_mysql_next_result (*CMysql::m_pmysql_next_result)
28165 #define sph_mysql_use_result (*CMysql::m_pmysql_use_result)
28166 #define sph_mysql_num_rows (*CMysql::m_pmysql_num_rows)
28167 #define sph_mysql_query (*CMysql::m_pmysql_query)
28168 #define sph_mysql_errno (*CMysql::m_pmysql_errno)
28169 #define sph_mysql_error (*CMysql::m_pmysql_error)
28170 #define sph_mysql_init (*CMysql::m_pmysql_init)
28171 #define sph_mysql_ssl_set (*CMysql::m_pmysql_ssl_set)
28172 #define sph_mysql_real_connect (*CMysql::m_pmysql_real_connect)
28173 #define sph_mysql_close (*CMysql::m_pmysql_close)
28174 #define sph_mysql_num_fields (*CMysql::m_pmysql_num_fields)
28175 #define sph_mysql_fetch_row (*CMysql::m_pmysql_fetch_row)
28176 #define sph_mysql_fetch_fields (*CMysql::m_pmysql_fetch_fields)
28177 #define sph_mysql_fetch_lengths (*CMysql::m_pmysql_fetch_lengths)
28178 
28179 const char* CMysql::sFuncs[MYSQL_NUM_FUNCS] = {"mysql_free_result", "mysql_next_result",
28180 	"mysql_use_result", "mysql_num_rows", "mysql_query", "mysql_errno",
28181 	"mysql_error", "mysql_init", "mysql_ssl_set", "mysql_real_connect",
28182 	"mysql_close", "mysql_num_fields", "mysql_fetch_row", "mysql_fetch_fields",
28183 	"mysql_fetch_lengths"};
28184 void** CMysql::pFuncs[] = {(void**)&m_pmysql_free_result, (void**)&m_pmysql_next_result,
28185 	(void**)&m_pmysql_use_result, (void**)&m_pmysql_num_rows, (void**)&m_pmysql_query,
28186 	(void**)&m_pmysql_errno, (void**)&m_pmysql_error, (void**)&m_pmysql_init,
28187 	(void**)&m_pmysql_ssl_set, (void**)&m_pmysql_real_connect, (void**)&m_pmysql_close,
28188 	(void**)&m_pmysql_num_fields, (void**)&m_pmysql_fetch_row, (void**)&m_pmysql_fetch_fields,
28189 	(void**)&m_pmysql_fetch_lengths};
28190 
28191 xmysql_free_result CMysql::m_pmysql_free_result = (xmysql_free_result)CMysql::Stub;
28192 xmysql_next_result CMysql::m_pmysql_next_result = (xmysql_next_result)CMysql::Stub;
28193 xmysql_use_result CMysql::m_pmysql_use_result = (xmysql_use_result)CMysql::Stub;
28194 xmysql_num_rows CMysql::m_pmysql_num_rows = (xmysql_num_rows)CMysql::Stub;
28195 xmysql_query CMysql::m_pmysql_query = (xmysql_query)CMysql::Stub;
28196 xmysql_errno CMysql::m_pmysql_errno = (xmysql_errno)CMysql::Stub;
28197 xmysql_error CMysql::m_pmysql_error = (xmysql_error)CMysql::Stub;
28198 xmysql_init CMysql::m_pmysql_init = (xmysql_init)CMysql::Stub;
28199 xmysql_ssl_set CMysql::m_pmysql_ssl_set = (xmysql_ssl_set)CMysql::Stub;
28200 xmysql_real_connect CMysql::m_pmysql_real_connect = (xmysql_real_connect)CMysql::Stub;
28201 xmysql_close CMysql::m_pmysql_close = (xmysql_close)CMysql::Stub;
28202 xmysql_num_fields CMysql::m_pmysql_num_fields = (xmysql_num_fields)CMysql::Stub;
28203 xmysql_fetch_row CMysql::m_pmysql_fetch_row = (xmysql_fetch_row)CMysql::Stub;
28204 xmysql_fetch_fields CMysql::m_pmysql_fetch_fields = (xmysql_fetch_fields)CMysql::Stub;
28205 xmysql_fetch_lengths CMysql::m_pmysql_fetch_lengths = (xmysql_fetch_lengths)CMysql::Stub;
28206 
// process-wide loader instance ("MysqlHoder" spelling is historical; kept as-is since it is a visible name)
CMysql MysqlHoder;

/// load libmysqlclient at runtime and bind all symbols; false when the library is unavailable
bool InitDynamicMysql()
{
	return MysqlHoder.Init();
}
28213 
28214 #else // !DL_MYSQL
28215 
28216 #define sph_mysql_free_result mysql_free_result
28217 #define sph_mysql_next_result mysql_next_result
28218 #define sph_mysql_use_result mysql_use_result
28219 #define sph_mysql_num_rows mysql_num_rows
28220 #define sph_mysql_query mysql_query
28221 #define sph_mysql_errno mysql_errno
28222 #define sph_mysql_error mysql_error
28223 #define sph_mysql_init mysql_init
28224 #define sph_mysql_ssl_set mysql_ssl_set
28225 #define sph_mysql_real_connect mysql_real_connect
28226 #define sph_mysql_close mysql_close
28227 #define sph_mysql_num_fields mysql_num_fields
28228 #define sph_mysql_fetch_row mysql_fetch_row
28229 #define sph_mysql_fetch_fields mysql_fetch_fields
28230 #define sph_mysql_fetch_lengths mysql_fetch_lengths
28231 #define InitDynamicMysql() (true)
28232 
28233 #endif // DL_MYSQL
28234 
CSphSourceParams_MySQL::CSphSourceParams_MySQL ()
	: m_iFlags ( 0 )
{
	m_iPort = 3306; // default MySQL server port
}
28240 
28241 
/// MySQL-backed SQL source; result/field/row state starts empty until SqlQuery()
CSphSource_MySQL::CSphSource_MySQL ( const char * sName )
	: CSphSource_SQL	( sName )
	, m_pMysqlResult	( NULL )
	, m_pMysqlFields	( NULL )
	, m_tMysqlRow		( NULL )
	, m_pMysqlLengths	( NULL )
{
	m_bCanUnpack = true;	// base-class flag: this source can handle packed (compressed) columns
}
28251 
28252 
/// free the current result set and drain any follow-up result sets from the connection,
/// then reset the lazily-fetched per-result metadata
void CSphSource_MySQL::SqlDismissResult ()
{
	if ( !m_pMysqlResult )
		return;

	while ( m_pMysqlResult )
	{
		sph_mysql_free_result ( m_pMysqlResult );
		m_pMysqlResult = NULL;

		// stored procedures might return multiple result sets
		// FIXME? we might want to index all of them
		// but for now, let's simply dismiss additional result sets
		if ( sph_mysql_next_result ( &m_tMysqlDriver )==0 )
		{
			m_pMysqlResult = sph_mysql_use_result ( &m_tMysqlDriver );

			// warn only once per process about dismissed non-empty result sets
			static bool bOnce = false;
			if ( !bOnce && m_pMysqlResult && sph_mysql_num_rows ( m_pMysqlResult ) )
			{
				sphWarn ( "indexing of multiple result sets is not supported yet; some results sets were dismissed!" );
				bOnce = true;
			}
		}
	}

	// these pointers belong to the freed result set
	m_pMysqlFields = NULL;
	m_pMysqlLengths = NULL;
}
28282 
28283 
SqlQuery(const char * sQuery)28284 bool CSphSource_MySQL::SqlQuery ( const char * sQuery )
28285 {
28286 	if ( sph_mysql_query ( &m_tMysqlDriver, sQuery ) )
28287 	{
28288 		if ( m_tParams.m_bPrintQueries )
28289 			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
28290 		return false;
28291 	}
28292 	if ( m_tParams.m_bPrintQueries )
28293 		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
28294 
28295 	m_pMysqlResult = sph_mysql_use_result ( &m_tMysqlDriver );
28296 	m_pMysqlFields = NULL;
28297 	return true;
28298 }
28299 
28300 
SqlIsError()28301 bool CSphSource_MySQL::SqlIsError ()
28302 {
28303 	return sph_mysql_errno ( &m_tMysqlDriver )!=0;
28304 }
28305 
28306 
SqlError()28307 const char * CSphSource_MySQL::SqlError ()
28308 {
28309 	return sph_mysql_error ( &m_tMysqlDriver );
28310 }
28311 
28312 
/// establish a MySQL connection using the stored host/port/credentials
bool CSphSource_MySQL::SqlConnect ()
{
	// with DL_MYSQL this loads the client library at runtime;
	// in the static build InitDynamicMysql() is defined as constant true
	if_const ( !InitDynamicMysql() )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL (NO MYSQL CLIENT LIB)\n" );
		return false;
	}

	sph_mysql_init ( &m_tMysqlDriver );
	// configure SSL only when at least one of key/cert/CA was given
	if ( !m_sSslKey.IsEmpty() || !m_sSslCert.IsEmpty() || !m_sSslCA.IsEmpty() )
		sph_mysql_ssl_set ( &m_tMysqlDriver, m_sSslKey.cstr(), m_sSslCert.cstr(), m_sSslCA.cstr(), NULL, NULL );

	m_iMysqlConnectFlags |= CLIENT_MULTI_RESULTS; // we now know how to handle this
	bool bRes = ( NULL!=sph_mysql_real_connect ( &m_tMysqlDriver,
		m_tParams.m_sHost.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr(),
		m_tParams.m_sDB.cstr(), m_tParams.m_iPort, m_sMysqlUsock.cstr(), m_iMysqlConnectFlags ) );
	if ( m_tParams.m_bPrintQueries )
		fprintf ( stdout, bRes ? "SQL-CONNECT: ok\n" : "SQL-CONNECT: FAIL\n" );
	return bRes;
}
28334 
28335 
SqlDisconnect()28336 void CSphSource_MySQL::SqlDisconnect ()
28337 {
28338 	if ( m_tParams.m_bPrintQueries )
28339 		fprintf ( stdout, "SQL-DISCONNECT\n" );
28340 
28341 	sph_mysql_close ( &m_tMysqlDriver );
28342 }
28343 
28344 
SqlNumFields()28345 int CSphSource_MySQL::SqlNumFields ()
28346 {
28347 	if ( !m_pMysqlResult )
28348 		return -1;
28349 
28350 	return sph_mysql_num_fields ( m_pMysqlResult );
28351 }
28352 
28353 
SqlFetchRow()28354 bool CSphSource_MySQL::SqlFetchRow ()
28355 {
28356 	if ( !m_pMysqlResult )
28357 		return false;
28358 
28359 	m_tMysqlRow = sph_mysql_fetch_row ( m_pMysqlResult );
28360 	return m_tMysqlRow!=NULL;
28361 }
28362 
28363 
SqlColumn(int iIndex)28364 const char * CSphSource_MySQL::SqlColumn ( int iIndex )
28365 {
28366 	if ( !m_pMysqlResult )
28367 		return NULL;
28368 
28369 	return m_tMysqlRow[iIndex];
28370 }
28371 
28372 
SqlFieldName(int iIndex)28373 const char * CSphSource_MySQL::SqlFieldName ( int iIndex )
28374 {
28375 	if ( !m_pMysqlResult )
28376 		return NULL;
28377 
28378 	if ( !m_pMysqlFields )
28379 		m_pMysqlFields = sph_mysql_fetch_fields ( m_pMysqlResult );
28380 
28381 	return m_pMysqlFields[iIndex].name;
28382 }
28383 
28384 
SqlColumnLength(int iIndex)28385 DWORD CSphSource_MySQL::SqlColumnLength ( int iIndex )
28386 {
28387 	if ( !m_pMysqlResult )
28388 		return 0;
28389 
28390 	if ( !m_pMysqlLengths )
28391 		m_pMysqlLengths = sph_mysql_fetch_lengths ( m_pMysqlResult );
28392 
28393 	return m_pMysqlLengths[iIndex];
28394 }
28395 
28396 
/// memorize MySQL-specific connection settings on top of the generic SQL setup
bool CSphSource_MySQL::Setup ( const CSphSourceParams_MySQL & tParams )
{
	if ( !CSphSource_SQL::Setup ( tParams ) )
		return false;

	m_sMysqlUsock = tParams.m_sUsock;
	m_iMysqlConnectFlags = tParams.m_iFlags;
	m_sSslKey = tParams.m_sSslKey;
	m_sSslCert = tParams.m_sSslCert;
	m_sSslCA = tParams.m_sSslCA;

	// build and store DSN for error reporting
	// NOTE(review): the +3 presumably skips a leading "sql" prefix in the base-built DSN — confirm against CSphSource_SQL::Setup
	char sBuf [ 1024 ];
	snprintf ( sBuf, sizeof(sBuf), "mysql%s", m_sSqlDSN.cstr()+3 );
	m_sSqlDSN = sBuf;

	return true;
}
28415 
28416 #endif // USE_MYSQL
28417 
28418 /////////////////////////////////////////////////////////////////////////////
28419 // PGSQL SOURCE
28420 /////////////////////////////////////////////////////////////////////////////
28421 
28422 #if USE_PGSQL
28423 #if DL_PGSQL
28424 #define POSGRESQL_LIB "libpq.so"
28425 #define POSTRESQL_NUM_FUNCS (12)
28426 
28427 #if defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) || defined(__GNUC__)
28428 
28429 // use non-standard compiler extension __typeof__
28430 // it allow to declare pointer to the function without using it's declaration
28431 typedef __typeof__ ( PQgetvalue ) *xPQgetvalue;
28432 typedef __typeof__ ( PQclear ) *xPQclear;
28433 typedef __typeof__ ( PQsetdbLogin ) *xPQsetdbLogin;
28434 typedef __typeof__ ( PQstatus ) *xPQstatus;
28435 typedef __typeof__ ( PQsetClientEncoding ) *xPQsetClientEncoding;
28436 typedef __typeof__ ( PQexec ) *xPQexec;
28437 typedef __typeof__ ( PQresultStatus ) *xPQresultStatus;
28438 typedef __typeof__ ( PQntuples ) *xPQntuples;
28439 typedef __typeof__ ( PQfname ) *xPQfname;
28440 typedef __typeof__ ( PQnfields ) *xPQnfields;
28441 typedef __typeof__ ( PQfinish ) *xPQfinish;
28442 typedef __typeof__ ( PQerrorMessage ) *xPQerrorMessage;
28443 
28444 #else // compilers which are not known about __typeof__ support
28445 // declarations below are directly copy-pasted from libpq-fe.h,
28446 // and then (*x...) is placed around the function names.
28447 // In mostly cases this code will not be used, and the declarations
28448 // from previous block will be used instead.
28449 #warning Be sure that the posgresql function signatures are the same \
28450 as in libpq-fe.h. Correct the code below if this is not so.
28451 
28452 typedef char* (*xPQgetvalue)(const PGresult *res, int tup_num, int field_num); //NOLINT
28453 typedef void (*xPQclear)(PGresult *res); //NOLINT
28454 typedef PGconn *(*xPQsetdbLogin)(const char *pghost, const char *pgport, //NOLINT
28455 			 const char *pgoptions, const char *pgtty, //NOLINT
28456 			 const char *dbName, //NOLINT
28457 			 const char *login, const char *pwd); //NOLINT
28458 typedef ConnStatusType (*xPQstatus)(const PGconn *conn); //NOLINT
28459 typedef int	(*xPQsetClientEncoding)(PGconn *conn, const char *encoding); //NOLINT
28460 typedef PGresult *(*xPQexec)(PGconn *conn, const char *query); //NOLINT
28461 typedef ExecStatusType (*xPQresultStatus)(const PGresult *res); //NOLINT
28462 typedef int	(*xPQntuples)(const PGresult *res); //NOLINT
28463 typedef char *(*xPQfname)(const PGresult *res, int field_num); //NOLINT
28464 typedef int	(*xPQnfields)(const PGresult *res); //NOLINT
28465 typedef void (*xPQfinish)(PGconn *conn); //NOLINT
28466 typedef char *(*xPQerrorMessage)(const PGconn *conn); //NOLINT
28467 #endif
28468 
/// dynamic wrapper over libpq: resolves the needed PQ* symbols at runtime
/// ("Posgresql" spelling is historical and kept for consistency with existing names)
class CPosgresql : public CSphDynamicLibrary
{
	static const char* sFuncs[POSTRESQL_NUM_FUNCS];	// symbol names to resolve
	static void** pFuncs[POSTRESQL_NUM_FUNCS];		// where resolved pointers get stored

public:
	/// load the shared library and bind all symbols; false if either step fails
	bool Init()
	{
		if ( !CSphDynamicLibrary::Init ( POSGRESQL_LIB, true ) )
			return false;
		if ( !LoadSymbols ( sFuncs, pFuncs, POSTRESQL_NUM_FUNCS ) )
			return false;
		return true;
	}
	/// initial target of all the pointers below, so a call before Init() at least logs
	static void 	Stub()
	{
		sphLogDebug ( "Error! Posgresql func is null!" );
	}

	static xPQgetvalue m_pPQgetvalue;
	static xPQclear m_pPQclear;
	static xPQsetdbLogin m_pPQsetdbLogin;
	static xPQstatus m_pPQstatus;
	static xPQsetClientEncoding m_pPQsetClientEncoding;
	static xPQexec m_pPQexec;
	static xPQresultStatus m_pPQresultStatus;
	static xPQntuples m_pPQntuples;
	static xPQfname m_pPQfname;
	static xPQnfields m_pPQnfields;
	static xPQfinish m_pPQfinish;
	static xPQerrorMessage m_pPQerrorMessage;
};
28501 
28502 #define sph_PQgetvalue (*CPosgresql::m_pPQgetvalue)
28503 #define sph_PQclear (*CPosgresql::m_pPQclear)
28504 #define sph_PQsetdbLogin (*CPosgresql::m_pPQsetdbLogin)
28505 #define sph_PQstatus (*CPosgresql::m_pPQstatus)
28506 #define sph_PQsetClientEncoding (*CPosgresql::m_pPQsetClientEncoding)
28507 #define sph_PQexec (*CPosgresql::m_pPQexec)
28508 #define sph_PQresultStatus (*CPosgresql::m_pPQresultStatus)
28509 #define sph_PQntuples (*CPosgresql::m_pPQntuples)
28510 #define sph_PQfname (*CPosgresql::m_pPQfname)
28511 #define sph_PQnfields (*CPosgresql::m_pPQnfields)
28512 #define sph_PQfinish (*CPosgresql::m_pPQfinish)
28513 #define sph_PQerrorMessage (*CPosgresql::m_pPQerrorMessage)
28514 
// symbol names and their destinations; the two arrays must stay in the same order
const char* CPosgresql::sFuncs[POSTRESQL_NUM_FUNCS] = {"PQgetvalue", "PQclear",
	"PQsetdbLogin", "PQstatus", "PQsetClientEncoding", "PQexec",
	"PQresultStatus", "PQntuples", "PQfname", "PQnfields",
	"PQfinish", "PQerrorMessage" };
void** CPosgresql::pFuncs[] = {(void**)&m_pPQgetvalue, (void**)&m_pPQclear,
	(void**)&m_pPQsetdbLogin, (void**)&m_pPQstatus, (void**)&m_pPQsetClientEncoding,
	(void**)&m_pPQexec, (void**)&m_pPQresultStatus, (void**)&m_pPQntuples,
	(void**)&m_pPQfname, (void**)&m_pPQnfields, (void**)&m_pPQfinish,
	(void**)&m_pPQerrorMessage};
28524 
// all pointers start at the logging Stub until Init() resolves the real libpq symbols
xPQgetvalue CPosgresql::m_pPQgetvalue = (xPQgetvalue)CPosgresql::Stub;
xPQclear CPosgresql::m_pPQclear = (xPQclear)CPosgresql::Stub;
xPQsetdbLogin CPosgresql::m_pPQsetdbLogin = (xPQsetdbLogin)CPosgresql::Stub;
xPQstatus CPosgresql::m_pPQstatus = (xPQstatus)CPosgresql::Stub;
xPQsetClientEncoding CPosgresql::m_pPQsetClientEncoding = (xPQsetClientEncoding)CPosgresql::Stub;
xPQexec CPosgresql::m_pPQexec = (xPQexec)CPosgresql::Stub;
xPQresultStatus CPosgresql::m_pPQresultStatus = (xPQresultStatus)CPosgresql::Stub;
xPQntuples CPosgresql::m_pPQntuples = (xPQntuples)CPosgresql::Stub;
xPQfname CPosgresql::m_pPQfname = (xPQfname)CPosgresql::Stub;
xPQnfields CPosgresql::m_pPQnfields = (xPQnfields)CPosgresql::Stub;
xPQfinish CPosgresql::m_pPQfinish = (xPQfinish)CPosgresql::Stub;
xPQerrorMessage CPosgresql::m_pPQerrorMessage = (xPQerrorMessage)CPosgresql::Stub;
28537 
// process-wide loader instance
CPosgresql MyPosgreSqlHolder;

/// load libpq at runtime and bind all symbols; false when the library is unavailable
bool InitDynamicPosgresql()
{
	return MyPosgreSqlHolder.Init();
}
28544 
28545 #else // !DL_PGSQL
28546 
28547 #define sph_PQgetvalue PQgetvalue
28548 #define sph_PQclear PQclear
28549 #define sph_PQsetdbLogin PQsetdbLogin
28550 #define sph_PQstatus PQstatus
28551 #define sph_PQsetClientEncoding PQsetClientEncoding
28552 #define sph_PQexec PQexec
28553 #define sph_PQresultStatus PQresultStatus
28554 #define sph_PQntuples PQntuples
28555 #define sph_PQfname PQfname
28556 #define sph_PQnfields PQnfields
28557 #define sph_PQfinish PQfinish
28558 #define sph_PQerrorMessage PQerrorMessage
28559 #define InitDynamicPosgresql() (true)
28560 
28561 #endif // DL_PGSQL
28562 
CSphSourceParams_PgSQL::CSphSourceParams_PgSQL ()
{
	m_iRangeStep = 1024;	// default step for ranged queries
	m_iPort = 5432;			// default PostgreSQL server port
}
28568 
28569 
/// PostgreSQL-backed SQL source; result set and row cursor start empty
CSphSource_PgSQL::CSphSource_PgSQL ( const char * sName )
	: CSphSource_SQL	( sName )
	, m_pPgResult		( NULL )
	, m_iPgRows			( 0 )
	, m_iPgRow			( 0 )
{
}
28577 
28578 
/// libpq fetches results whole in SqlQuery(), so "error" here means the row
/// cursor stopped before the total row count; a failed query leaves the
/// cursor at (-1, 0), which also reads as an error
bool CSphSource_PgSQL::SqlIsError ()
{
	return ( m_iPgRow<m_iPgRows ); // if we're over, it's just last row
}
28583 
28584 
SqlError()28585 const char * CSphSource_PgSQL::SqlError ()
28586 {
28587 	return sph_PQerrorMessage ( m_tPgDriver );
28588 }
28589 
28590 
Setup(const CSphSourceParams_PgSQL & tParams)28591 bool CSphSource_PgSQL::Setup ( const CSphSourceParams_PgSQL & tParams )
28592 {
28593 	// checks
28594 	CSphSource_SQL::Setup ( tParams );
28595 
28596 	m_sPgClientEncoding = tParams.m_sClientEncoding;
28597 	if ( !m_sPgClientEncoding.cstr() )
28598 		m_sPgClientEncoding = "";
28599 
28600 	// build and store DSN for error reporting
28601 	char sBuf [ 1024 ];
28602 	snprintf ( sBuf, sizeof(sBuf), "pgsql%s", m_sSqlDSN.cstr()+3 );
28603 	m_sSqlDSN = sBuf;
28604 
28605 	return true;
28606 }
28607 
28608 
/// start iteration, then build the per-column boolean map so SqlColumn()
/// can remap pgsql 't' values to "1" for boolean attributes
bool CSphSource_PgSQL::IterateStart ( CSphString & sError )
{
	bool bResult = CSphSource_SQL::IterateStart ( sError );
	if ( !bResult )
		return false;

	// find the highest column index used by either attributes or fields
	int iMaxIndex = 0;
	for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
		iMaxIndex = Max ( iMaxIndex, m_tSchema.GetAttr(i).m_iIndex );

	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
		iMaxIndex = Max ( iMaxIndex, m_tSchema.m_dFields[i].m_iIndex );

	// default every column to "not boolean" ...
	m_dIsColumnBool.Resize ( iMaxIndex + 1 );
	ARRAY_FOREACH ( i, m_dIsColumnBool )
		m_dIsColumnBool[i] = false;

	// ... then flag the columns backing SPH_ATTR_BOOL attributes
	for ( int i = 0; i < m_tSchema.GetAttrsCount(); i++ )
		m_dIsColumnBool [ m_tSchema.GetAttr(i).m_iIndex ] = ( m_tSchema.GetAttr(i).m_eAttrType==SPH_ATTR_BOOL );

	return true;
}
28631 
28632 
/// establish a pgsql connection and apply the configured client encoding
bool CSphSource_PgSQL::SqlConnect ()
{
	// with DL_PGSQL this loads libpq at runtime; static build is constant true
	if ( !InitDynamicPosgresql() )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL (NO POSGRES CLIENT LIB)\n" );
		return false;
	}

	// libpq takes the port as a string
	char sPort[64];
	snprintf ( sPort, sizeof(sPort), "%d", m_tParams.m_iPort );
	m_tPgDriver = sph_PQsetdbLogin ( m_tParams.m_sHost.cstr(), sPort, NULL, NULL,
		m_tParams.m_sDB.cstr(), m_tParams.m_sUser.cstr(), m_tParams.m_sPass.cstr() );

	if ( sph_PQstatus ( m_tPgDriver )==CONNECTION_BAD )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
		return false;
	}

	// set client encoding
	if ( !m_sPgClientEncoding.IsEmpty() )
		if ( -1==sph_PQsetClientEncoding ( m_tPgDriver, m_sPgClientEncoding.cstr() ) )
	{
		SqlDisconnect ();
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
		return false;
	}

	if ( m_tParams.m_bPrintQueries )
		fprintf ( stdout, "SQL-CONNECT: ok\n" );
	return true;
}
28668 
28669 
SqlDisconnect()28670 void CSphSource_PgSQL::SqlDisconnect ()
28671 {
28672 	if ( m_tParams.m_bPrintQueries )
28673 		fprintf ( stdout, "SQL-DISCONNECT\n" );
28674 
28675 	sph_PQfinish ( m_tPgDriver );
28676 }
28677 
28678 
SqlQuery(const char * sQuery)28679 bool CSphSource_PgSQL::SqlQuery ( const char * sQuery )
28680 {
28681 	m_iPgRow = -1;
28682 	m_iPgRows = 0;
28683 
28684 	m_pPgResult = sph_PQexec ( m_tPgDriver, sQuery );
28685 
28686 	ExecStatusType eRes = sph_PQresultStatus ( m_pPgResult );
28687 	if ( ( eRes!=PGRES_COMMAND_OK ) && ( eRes!=PGRES_TUPLES_OK ) )
28688 	{
28689 		if ( m_tParams.m_bPrintQueries )
28690 			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
28691 		return false;
28692 	}
28693 	if ( m_tParams.m_bPrintQueries )
28694 		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );
28695 
28696 	m_iPgRows = sph_PQntuples ( m_pPgResult );
28697 	return true;
28698 }
28699 
28700 
SqlDismissResult()28701 void CSphSource_PgSQL::SqlDismissResult ()
28702 {
28703 	if ( !m_pPgResult )
28704 		return;
28705 
28706 	sph_PQclear ( m_pPgResult );
28707 	m_pPgResult = NULL;
28708 }
28709 
28710 
SqlNumFields()28711 int CSphSource_PgSQL::SqlNumFields ()
28712 {
28713 	if ( !m_pPgResult )
28714 		return -1;
28715 
28716 	return sph_PQnfields ( m_pPgResult );
28717 }
28718 
28719 
/// value of the given column in the current row; boolean columns (flagged in
/// IterateStart) holding a lone 't' are remapped to "1" so they index as numbers
const char * CSphSource_PgSQL::SqlColumn ( int iIndex )
{
	if ( !m_pPgResult )
		return NULL;

	const char * szValue = sph_PQgetvalue ( m_pPgResult, m_iPgRow, iIndex );
	if ( m_dIsColumnBool.GetLength() && m_dIsColumnBool[iIndex] && szValue[0]=='t' && !szValue[1] )
		return "1";

	return szValue;
}
28731 
28732 
SqlFieldName(int iIndex)28733 const char * CSphSource_PgSQL::SqlFieldName ( int iIndex )
28734 {
28735 	if ( !m_pPgResult )
28736 		return NULL;
28737 
28738 	return sph_PQfname ( m_pPgResult, iIndex );
28739 }
28740 
28741 
SqlFetchRow()28742 bool CSphSource_PgSQL::SqlFetchRow ()
28743 {
28744 	if ( !m_pPgResult )
28745 		return false;
28746 	return ( ++m_iPgRow<m_iPgRows );
28747 }
28748 
28749 
/// column byte lengths are not tracked for pgsql; 0 is presumably treated as
/// "unknown, use strlen" by the caller — TODO confirm against CSphSource_SQL
DWORD CSphSource_PgSQL::SqlColumnLength ( int )
{
	return 0;
}
28754 
28755 #endif // USE_PGSQL
28756 
28757 /////////////////////////////////////////////////////////////////////////////
28758 // XMLPIPE (v2)
28759 /////////////////////////////////////////////////////////////////////////////
28760 
/// CRTP helper with shared schema-from-config parsing for document sources;
/// T must provide DecorateMessage() and GetWordpart() (see CSphSource_XMLPipe2)
template < typename T >
struct CSphSchemaConfigurator
{
	/// parse a linked list of "name[:bitcount]" declarations into attributes of
	/// the given type; returns false (with sError set) on a reserved name
	bool ConfigureAttrs ( const CSphVariant * pHead, ESphAttr eAttrType, CSphSchema & tSchema, CSphString & sError ) const
	{
		for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
		{
			CSphColumnInfo tCol ( pCur->strval().cstr(), eAttrType );
			// an optional ":N" suffix carries a bitcount; chop it off the name in place
			char * pColon = strchr ( const_cast<char*> ( tCol.m_sName.cstr() ), ':' );
			if ( pColon )
			{
				*pColon = '\0';

				if ( eAttrType==SPH_ATTR_INTEGER )
				{
					int iBits = strtol ( pColon+1, NULL, 10 );
					if ( iBits<=0 || iBits>ROWITEM_BITS )
					{
						sphWarn ( "%s", ((T*)this)->DecorateMessage ( "attribute '%s': invalid bitcount=%d (bitcount ignored)", tCol.m_sName.cstr(), iBits ) );
						iBits = -1;	// -1 presumably restores the default width — see CSphAttrLocator
					}

					tCol.m_tLocator.m_iBitCount = iBits;
				} else
				{
					sphWarn ( "%s", ((T*)this)->DecorateMessage ( "attribute '%s': bitcount is only supported for integer types", tCol.m_sName.cstr() ) );
				}
			}

			tCol.m_iIndex = tSchema.GetAttrsCount ();

			// MVA attributes take their values from a field-like source
			if ( eAttrType==SPH_ATTR_UINT32SET || eAttrType==SPH_ATTR_INT64SET )
			{
				tCol.m_eAttrType = eAttrType;
				tCol.m_eSrc = SPH_ATTRSRC_FIELD;
			}

			if ( CSphSchema::IsReserved ( tCol.m_sName.cstr() ) )
			{
				sError.SetSprintf ( "%s is not a valid attribute name", tCol.m_sName.cstr() );
				return false;
			}

			tSchema.AddAttr ( tCol, true ); // all attributes are dynamic at indexing time
		}

		return true;
	}

	/// append each listed name as a full-text field, warning on duplicates
	void ConfigureFields ( const CSphVariant * pHead, bool bWordDict, CSphSchema & tSchema ) const
	{
		for ( const CSphVariant * pCur = pHead; pCur; pCur= pCur->m_pNext )
		{
			const char * sFieldName = pCur->strval().cstr();

			// linear scan is fine here; field counts are small at config time
			bool bFound = false;
			for ( int i = 0; i < tSchema.m_dFields.GetLength () && !bFound; i++ )
				bFound = ( tSchema.m_dFields[i].m_sName==sFieldName );

			if ( bFound )
				sphWarn ( "%s", ((T*)this)->DecorateMessage ( "duplicate field '%s'", sFieldName ) );
			else
				AddFieldToSchema ( sFieldName, bWordDict, tSchema );
		}
	}

	/// append one full-text field, resolving its wordpart setting via the derived class
	void AddFieldToSchema ( const char * sFieldName, bool bWordDict, CSphSchema & tSchema ) const
	{
		CSphColumnInfo tCol ( sFieldName );
		tCol.m_eWordpart = ((T*)this)->GetWordpart ( tCol.m_sName.cstr(), bWordDict );
		tSchema.m_dFields.Add ( tCol );
	}
};
28834 
28835 
SourceCheckSchema(const CSphSchema & tSchema,CSphString & sError)28836 static bool SourceCheckSchema ( const CSphSchema & tSchema, CSphString & sError )
28837 {
28838 	SmallStringHash_T<int> hAttrs;
28839 	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
28840 	{
28841 		const CSphColumnInfo & tAttr = tSchema.GetAttr ( i );
28842 		bool bUniq = hAttrs.Add ( 1, tAttr.m_sName );
28843 
28844 		if ( !bUniq )
28845 		{
28846 			sError.SetSprintf ( "attribute %s declared multiple times", tAttr.m_sName.cstr() );
28847 			return false;
28848 		}
28849 	}
28850 
28851 	return true;
28852 }
28853 
28854 
28855 #if USE_LIBEXPAT
28856 #if DL_EXPAT
28857 #ifndef EXPAT_LIB
28858 #define EXPAT_LIB "libexpat.so"
28859 #endif
28860 #define EXPAT_NUM_FUNCS (11)
28861 
28862 #if defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) || defined(__GNUC__)
28863 
28864 // use non-standard compiler extension __typeof__
28865 // it allow to declare pointer to the function without using it's declaration
28866 typedef __typeof__ ( XML_ParserFree ) *xXML_ParserFree;
28867 typedef __typeof__ ( XML_Parse ) *xXML_Parse;
28868 typedef __typeof__ ( XML_GetCurrentColumnNumber ) *xXML_GetCurrentColumnNumber;
28869 typedef __typeof__ ( XML_GetCurrentLineNumber ) *xXML_GetCurrentLineNumber;
28870 typedef __typeof__ ( XML_GetErrorCode ) *xXML_GetErrorCode;
28871 typedef __typeof__ ( XML_ErrorString ) *xXML_ErrorString;
28872 typedef __typeof__ ( XML_ParserCreate ) *xXML_ParserCreate;
28873 typedef __typeof__ ( XML_SetUserData ) *xXML_SetUserData;
28874 typedef __typeof__ ( XML_SetElementHandler ) *xXML_SetElementHandler;
28875 typedef __typeof__ ( XML_SetCharacterDataHandler ) *xXML_SetCharacterDataHandler;
28876 typedef __typeof__ ( XML_SetUnknownEncodingHandler ) *xXML_SetUnknownEncodingHandler;
28877 
28878 #else // compilers which are not known about __typeof__ support
28879 // declarations below are directly copy-pasted from expat.h,
28880 // and then (*x...) is placed around the function names.
28881 // In mostly cases this code will not be used, and the declarations
28882 // from previous block will be used instead.
28883 #warning Be sure that the expat function signatures are the same \
28884 as in expat.h. Correct the code below if this is not so.
28885 typedef XMLPARSEAPI(void) (*xXML_ParserFree)(XML_Parser); //NOLINT
28886 typedef XMLPARSEAPI(enum XML_Status) (*xXML_Parse)(XML_Parser, const char *, int, int); //NOLINT
28887 typedef XMLPARSEAPI(XML_Size) (*xXML_GetCurrentColumnNumber)(XML_Parser); //NOLINT
28888 typedef XMLPARSEAPI(XML_Size) (*xXML_GetCurrentLineNumber)(XML_Parser); //NOLINT
28889 typedef XMLPARSEAPI(enum XML_Error) (*xXML_GetErrorCode)(XML_Parser); //NOLINT
28890 typedef XMLPARSEAPI(const XML_LChar *) (*xXML_ErrorString)(enum XML_Error code); //NOLINT
28891 typedef XMLPARSEAPI(XML_Parser) (*xXML_ParserCreate)(const XML_Char *encoding); //NOLINT
28892 typedef XMLPARSEAPI(void) (*xXML_SetUserData)(XML_Parser, void *); //NOLINT
28893 typedef XMLPARSEAPI(void) (*xXML_SetElementHandler)(XML_Parser, //NOLINT
28894                       XML_StartElementHandler, //NOLINT
28895                       XML_EndElementHandler); //NOLINT
28896 typedef XMLPARSEAPI(void) (*xXML_SetCharacterDataHandler)(XML_Parser, //NOLINT
28897                             XML_CharacterDataHandler); //NOLINT
28898 typedef XMLPARSEAPI(void) (*xXML_SetUnknownEncodingHandler)(XML_Parser, //NOLINT
28899                               XML_UnknownEncodingHandler, //NOLINT
28900                               void *); //NOLINT
28901 #endif
28902 
/// dynamic wrapper over expat: resolves the needed XML_* symbols at runtime
class CExpat : public CSphDynamicLibrary
{
	static const char* sFuncs[EXPAT_NUM_FUNCS];	// symbol names to resolve
	static void** pFuncs[EXPAT_NUM_FUNCS];		// where resolved pointers get stored

public:
	/// load the shared library and bind all symbols; false if either step fails
	bool Init()
	{
		if ( !CSphDynamicLibrary::Init ( EXPAT_LIB, true ) )
			return false;
		if ( !LoadSymbols ( sFuncs, pFuncs, EXPAT_NUM_FUNCS ) )
			return false;
		return true;
	}
	/// initial target of all the pointers below, so a call before Init() at least logs
	static void 	Stub()
	{
		sphLogDebug ( "Error! Expat func is null!" );
	}

	static xXML_ParserFree m_pXML_ParserFree;
	static xXML_Parse m_pXML_Parse;
	static xXML_GetCurrentColumnNumber m_pXML_GetCurrentColumnNumber;
	static xXML_GetCurrentLineNumber m_pXML_GetCurrentLineNumber;
	static xXML_GetErrorCode m_pXML_GetErrorCode;
	static xXML_ErrorString m_pXML_ErrorString;
	static xXML_ParserCreate m_pXML_ParserCreate;
	static xXML_SetUserData m_pXML_SetUserData;
	static xXML_SetElementHandler m_pXML_SetElementHandler;
	static xXML_SetCharacterDataHandler m_pXML_SetCharacterDataHandler;
	static xXML_SetUnknownEncodingHandler m_pXML_SetUnknownEncodingHandler;
};
28934 
28935 #define sph_XML_ParserFree (*CExpat::m_pXML_ParserFree)
28936 #define sph_XML_Parse (*CExpat::m_pXML_Parse)
28937 #define sph_XML_GetCurrentColumnNumber (*CExpat::m_pXML_GetCurrentColumnNumber)
28938 #define sph_XML_GetCurrentLineNumber (*CExpat::m_pXML_GetCurrentLineNumber)
28939 #define sph_XML_GetErrorCode (*CExpat::m_pXML_GetErrorCode)
28940 #define sph_XML_ErrorString (*CExpat::m_pXML_ErrorString)
28941 #define sph_XML_ParserCreate (*CExpat::m_pXML_ParserCreate)
28942 #define sph_XML_SetUserData (*CExpat::m_pXML_SetUserData)
28943 #define sph_XML_SetElementHandler (*CExpat::m_pXML_SetElementHandler)
28944 #define sph_XML_SetCharacterDataHandler (*CExpat::m_pXML_SetCharacterDataHandler)
28945 #define sph_XML_SetUnknownEncodingHandler (*CExpat::m_pXML_SetUnknownEncodingHandler)
28946 
// symbol names and their destinations; the two arrays must stay in the same order
const char* CExpat::sFuncs[] = {"XML_ParserFree", "XML_Parse",
	"XML_GetCurrentColumnNumber", "XML_GetCurrentLineNumber", "XML_GetErrorCode", "XML_ErrorString",
	"XML_ParserCreate", "XML_SetUserData", "XML_SetElementHandler", "XML_SetCharacterDataHandler",
	"XML_SetUnknownEncodingHandler" };
void** CExpat::pFuncs[] = {(void**)&m_pXML_ParserFree, (void**)&m_pXML_Parse,
	(void**)&m_pXML_GetCurrentColumnNumber, (void**)&m_pXML_GetCurrentLineNumber,
	(void**)&m_pXML_GetErrorCode, (void**)&m_pXML_ErrorString,
	(void**)&m_pXML_ParserCreate, (void**)&m_pXML_SetUserData,
	(void**)&m_pXML_SetElementHandler, (void**)&m_pXML_SetCharacterDataHandler,
	(void**)&m_pXML_SetUnknownEncodingHandler};
28957 
28958 
// all pointers start at the logging Stub until Init() resolves the real expat symbols
xXML_ParserFree CExpat::m_pXML_ParserFree = (xXML_ParserFree)CExpat::Stub;
xXML_Parse CExpat::m_pXML_Parse = (xXML_Parse)CExpat::Stub;
xXML_GetCurrentColumnNumber CExpat::m_pXML_GetCurrentColumnNumber = (xXML_GetCurrentColumnNumber)CExpat::Stub;
xXML_GetCurrentLineNumber CExpat::m_pXML_GetCurrentLineNumber = (xXML_GetCurrentLineNumber)CExpat::Stub;
xXML_GetErrorCode CExpat::m_pXML_GetErrorCode = (xXML_GetErrorCode)CExpat::Stub;
xXML_ErrorString CExpat::m_pXML_ErrorString = (xXML_ErrorString)CExpat::Stub;
xXML_ParserCreate CExpat::m_pXML_ParserCreate = (xXML_ParserCreate)CExpat::Stub;
xXML_SetUserData CExpat::m_pXML_SetUserData = (xXML_SetUserData)CExpat::Stub;
xXML_SetElementHandler CExpat::m_pXML_SetElementHandler = (xXML_SetElementHandler)CExpat::Stub;
xXML_SetCharacterDataHandler CExpat::m_pXML_SetCharacterDataHandler = (xXML_SetCharacterDataHandler)CExpat::Stub;
xXML_SetUnknownEncodingHandler CExpat::m_pXML_SetUnknownEncodingHandler = (xXML_SetUnknownEncodingHandler)CExpat::Stub;
28970 
// process-wide loader instance
CExpat MyExpatHolder;

/// load libexpat at runtime and bind all symbols; false when the library is unavailable
bool InitDynamicExpat()
{
	return MyExpatHolder.Init();
}
28977 
28978 #else // !DL_EXPAT
28979 
28980 #define sph_XML_ParserFree XML_ParserFree
28981 #define sph_XML_Parse XML_Parse
28982 #define sph_XML_GetCurrentColumnNumber XML_GetCurrentColumnNumber
28983 #define sph_XML_GetCurrentLineNumber XML_GetCurrentLineNumber
28984 #define sph_XML_GetErrorCode XML_GetErrorCode
28985 #define sph_XML_ErrorString XML_ErrorString
28986 #define sph_XML_ParserCreate XML_ParserCreate
28987 #define sph_XML_SetUserData XML_SetUserData
28988 #define sph_XML_SetElementHandler XML_SetElementHandler
28989 #define sph_XML_SetCharacterDataHandler XML_SetCharacterDataHandler
28990 #define sph_XML_SetUnknownEncodingHandler XML_SetUnknownEncodingHandler
28991 #define InitDynamicExpat() (true)
28992 
28993 #endif // DL_EXPAT
28994 
28995 /// XML pipe source implementation (v2)
/// XML pipe source implementation (v2)
/// reads a <sphinx:docset> stream from a pipe, parses it with expat,
/// and feeds parsed documents/attributes/kill-list entries to the indexer
class CSphSource_XMLPipe2 : public CSphSource_Document, public CSphSchemaConfigurator<CSphSource_XMLPipe2>
{
public:
	explicit			CSphSource_XMLPipe2 ( const char * sName );
					~CSphSource_XMLPipe2 ();

	bool			Setup ( int iFieldBufferMax, bool bFixupUTF8, FILE * pPipe, const CSphConfigSection & hSource, CSphString & sError );			///< memorize the command
	virtual bool	Connect ( CSphString & sError );			///< run the command and open the pipe
	virtual void	Disconnect ();								///< close the pipe

	virtual bool	IterateStart ( CSphString & ) { m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength(); return true; }	///< Connect() starts getting documents automatically, so this one is empty
	virtual BYTE **	NextDocument ( CSphString & sError );			///< parse incoming chunk and emit some hits

	virtual bool	HasAttrsConfigured ()							{ return true; }	///< xmlpipe always has some attrs for now
	virtual bool	IterateMultivaluedStart ( int, CSphString & )	{ return false; }	///< MVAs come from fields, not from a separate query
	virtual bool	IterateMultivaluedNext ()						{ return false; }
	virtual bool	IterateKillListStart ( CSphString & );
	virtual bool	IterateKillListNext ( SphDocID_t & uDocId );

	// expat callback entry points (invoked via the static xml* trampolines)
	void			StartElement ( const char * szName, const char ** pAttrs );
	void			EndElement ( const char * pName );
	void			Characters ( const char * pCharacters, int iLen );

	void			Error ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
	const char *	DecorateMessage ( const char * sTemplate, ... ) const __attribute__ ( ( format ( printf, 2, 3 ) ) );
	const char *	DecorateMessageVA ( const char * sTemplate, va_list ap ) const;

private:
	/// a single parsed <sphinx:document>
	struct Document_t
	{
		SphDocID_t					m_uDocID;
		CSphVector < CSphVector<BYTE> >	m_dFields;	///< one raw text buffer per full-text field
		CSphVector<CSphString>		m_dAttrs;		///< one string value per attribute
	};

	Document_t *				m_pCurDocument;		///< document currently being filled by the parser
	CSphVector<Document_t *>	m_dParsedDocuments;	///< fully parsed documents, waiting for NextDocument()

	FILE *			m_pPipe;			///< incoming stream
	CSphString		m_sError;			///< first error reported via Error()
	CSphVector<CSphString> m_dDefaultAttrs;	///< per-attribute default values (from schema "default" attr)
	CSphVector<CSphString> m_dInvalid;	///< unknown tags we already warned about
	CSphVector<CSphString> m_dWarned;	///< overlong fields/attrs we already warned about
	int				m_iElementDepth;	///< nesting depth inside an ignored subtree

	BYTE *			m_pBuffer;			///< raw read buffer for the pipe
	int				m_iBufferSize;

	CSphVector<BYTE*>m_dFieldPtrs;		///< per-field pointers returned by NextDocument()
	bool			m_bRemoveParsed;	///< drop m_dParsedDocuments[0] on the next NextDocument() call

	// parser state flags
	bool			m_bInDocset;
	bool			m_bInSchema;
	bool			m_bInDocument;
	bool			m_bInKillList;
	bool			m_bInId;
	bool			m_bInIgnoredTag;
	bool			m_bFirstTagAfterDocset;

	int				m_iKillListIterator;
	CSphVector < SphDocID_t > m_dKillList;

	int				m_iMVA;
	int				m_iMVAIterator;
	CSphVector < CSphVector <DWORD> > m_dFieldMVAs;
	CSphVector < int > m_dAttrToMVA;	///< attr index -> field-MVA index, or -1

	int				m_iCurField;		///< field currently accumulating characters, or -1
	int				m_iCurAttr;			///< attribute currently accumulating characters, or -1

	XML_Parser		m_pParser;

	int				m_iFieldBufferMax;
	BYTE * 			m_pFieldBuffer;		///< scratch buffer for the current field/attr text
	int				m_iFieldBufferLen;

	bool			m_bFixupUTF8;		///< whether to replace invalid utf-8 codepoints with spaces
	int				m_iReparseStart;	///< utf-8 fixerupper might need to postpone a few bytes, starting at this offset
	int				m_iReparseLen;		///< and this much bytes (under 4)

	void			UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment );

	bool			ParseNextChunk ( int iBufferLen, CSphString & sError );

	/// report a misplaced <sphinx:document> tag
	void DocumentError ( const char * sWhere )
	{
		Error ( "malformed source, <sphinx:document> found inside %s", sWhere );

		// Ideally I'd like to display a notice on the next line that
		// would say where exactly it's allowed. E.g.:
		//
		// <sphinx:document> must be contained in <sphinx:docset>
	}
};
29090 
29091 
29092 // callbacks
xmlStartElement(void * user_data,const XML_Char * name,const XML_Char ** attrs)29093 static void XMLCALL xmlStartElement ( void * user_data, const XML_Char * name, const XML_Char ** attrs )
29094 {
29095 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
29096 	pSource->StartElement ( name, attrs );
29097 }
29098 
29099 
xmlEndElement(void * user_data,const XML_Char * name)29100 static void XMLCALL xmlEndElement ( void * user_data, const XML_Char * name )
29101 {
29102 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
29103 	pSource->EndElement ( name );
29104 }
29105 
29106 
xmlCharacters(void * user_data,const XML_Char * ch,int len)29107 static void XMLCALL xmlCharacters ( void * user_data, const XML_Char * ch, int len )
29108 {
29109 	CSphSource_XMLPipe2 * pSource = (CSphSource_XMLPipe2 *) user_data;
29110 	pSource->Characters ( ch, len );
29111 }
29112 
#if USE_LIBICONV
/// expat unknown-encoding callback: use iconv to build a single-byte to
/// UTF-16 codepoint map for the encoding declared in the XML prologue
/// @return XML_STATUS_OK on success, XML_STATUS_ERROR if the encoding is unknown to iconv
static int XMLCALL xmlUnknownEncoding ( void *, const XML_Char * name, XML_Encoding * info )
{
	iconv_t pDesc = iconv_open ( "UTF-16", name );

	// NOTE: iconv_open() signals failure with (iconv_t)-1, not NULL;
	// the old "!pDesc" check never fired and let a bad descriptor through
	if ( pDesc==(iconv_t)-1 )
		return XML_STATUS_ERROR;

	for ( size_t i = 0; i < 256; i++ )
	{
		char cIn = (char) i;
		char dOut[4];
		memset ( dOut, 0, sizeof ( dOut ) );
#if ICONV_INBUF_CONST
		const char * pInbuf = &cIn;
#else
		char * pInbuf = &cIn;
#endif
		char * pOutbuf = dOut;
		size_t iInBytesLeft = 1;
		size_t iOutBytesLeft = 4;

		// NOTE(review): "UTF-16" without explicit endianness may emit a BOM
		// on some iconv implementations, which would corrupt the map;
		// behavior kept as-is, but worth confirming against "UTF-16BE"
		if ( iconv ( pDesc, &pInbuf, &iInBytesLeft, &pOutbuf, &iOutBytesLeft )!=size_t(-1) )
			info->map[i] = int ( BYTE ( dOut[0] ) ) << 8 | int ( BYTE ( dOut[1] ) );
		else
			info->map[i] = 0; // unmappable byte
	}

	iconv_close ( pDesc );

	return XML_STATUS_OK;
}
#endif
29145 
CSphSource_XMLPipe2(const char * sName)29146 CSphSource_XMLPipe2::CSphSource_XMLPipe2 ( const char * sName )
29147 	: CSphSource_Document ( sName )
29148 	, m_pCurDocument	( NULL )
29149 	, m_pPipe			( NULL )
29150 	, m_iElementDepth	( 0 )
29151 	, m_pBuffer			( NULL )
29152 	, m_iBufferSize		( 1048576 )
29153 	, m_bRemoveParsed	( false )
29154 	, m_bInDocset		( false )
29155 	, m_bInSchema		( false )
29156 	, m_bInDocument		( false )
29157 	, m_bInKillList		( false )
29158 	, m_bInId			( false )
29159 	, m_bInIgnoredTag	( false )
29160 	, m_bFirstTagAfterDocset	( false )
29161 	, m_iKillListIterator		( 0 )
29162 	, m_iMVA			( 0 )
29163 	, m_iMVAIterator	( 0 )
29164 	, m_iCurField		( -1 )
29165 	, m_iCurAttr		( -1 )
29166 	, m_pParser			( NULL )
29167 	, m_iFieldBufferMax	( 65536 )
29168 	, m_pFieldBuffer	( NULL )
29169 	, m_iFieldBufferLen	( 0 )
29170 	, m_bFixupUTF8		( false )
29171 	, m_iReparseStart	( 0 )
29172 	, m_iReparseLen		( 0 )
29173 {
29174 }
29175 
29176 
~CSphSource_XMLPipe2()29177 CSphSource_XMLPipe2::~CSphSource_XMLPipe2 ()
29178 {
29179 	Disconnect ();
29180 	SafeDeleteArray ( m_pBuffer );
29181 	SafeDeleteArray ( m_pFieldBuffer );
29182 	ARRAY_FOREACH ( i, m_dParsedDocuments )
29183 		SafeDelete ( m_dParsedDocuments[i] );
29184 }
29185 
29186 
Disconnect()29187 void CSphSource_XMLPipe2::Disconnect ()
29188 {
29189 	if ( m_pPipe )
29190 	{
29191 		pclose ( m_pPipe );
29192 		m_pPipe = NULL;
29193 	}
29194 
29195 	if ( m_pParser )
29196 	{
29197 		sph_XML_ParserFree ( m_pParser );
29198 		m_pParser = NULL;
29199 	}
29200 
29201 	m_tHits.m_dData.Reset();
29202 }
29203 
29204 
Error(const char * sTemplate,...)29205 void CSphSource_XMLPipe2::Error ( const char * sTemplate, ... )
29206 {
29207 	if ( !m_sError.IsEmpty() )
29208 		return;
29209 
29210 	va_list ap;
29211 	va_start ( ap, sTemplate );
29212 	m_sError = DecorateMessageVA ( sTemplate, ap );
29213 	va_end ( ap );
29214 }
29215 
29216 
DecorateMessage(const char * sTemplate,...) const29217 const char * CSphSource_XMLPipe2::DecorateMessage ( const char * sTemplate, ... ) const
29218 {
29219 	va_list ap;
29220 	va_start ( ap, sTemplate );
29221 	const char * sRes = DecorateMessageVA ( sTemplate, ap );
29222 	va_end ( ap );
29223 	return sRes;
29224 }
29225 
29226 
DecorateMessageVA(const char * sTemplate,va_list ap) const29227 const char * CSphSource_XMLPipe2::DecorateMessageVA ( const char * sTemplate, va_list ap ) const
29228 {
29229 	static char sBuf[1024];
29230 
29231 	snprintf ( sBuf, sizeof(sBuf), "source '%s': ", m_tSchema.m_sName.cstr() );
29232 	int iBufLen = strlen ( sBuf );
29233 	int iLeft = sizeof(sBuf) - iBufLen;
29234 	char * szBufStart = sBuf + iBufLen;
29235 
29236 	vsnprintf ( szBufStart, iLeft, sTemplate, ap );
29237 	iBufLen = strlen ( sBuf );
29238 	iLeft = sizeof(sBuf) - iBufLen;
29239 	szBufStart = sBuf + iBufLen;
29240 
29241 	if ( m_pParser )
29242 	{
29243 		SphDocID_t uFailedID = 0;
29244 		if ( m_dParsedDocuments.GetLength() )
29245 			uFailedID = m_dParsedDocuments.Last()->m_uDocID;
29246 
29247 		snprintf ( szBufStart, iLeft, " (line=%d, pos=%d, docid=" DOCID_FMT ")",
29248 			(int)sph_XML_GetCurrentLineNumber ( m_pParser ), (int)sph_XML_GetCurrentColumnNumber ( m_pParser ),
29249 			uFailedID );
29250 	}
29251 
29252 	return sBuf;
29253 }
29254 
29255 
Setup(int iFieldBufferMax,bool bFixupUTF8,FILE * pPipe,const CSphConfigSection & hSource,CSphString & sError)29256 bool CSphSource_XMLPipe2::Setup ( int iFieldBufferMax, bool bFixupUTF8, FILE * pPipe, const CSphConfigSection & hSource, CSphString & sError )
29257 {
29258 	assert ( !m_pBuffer && !m_pFieldBuffer );
29259 
29260 	m_pBuffer = new BYTE [m_iBufferSize];
29261 	m_iFieldBufferMax = Max ( iFieldBufferMax, 65536 );
29262 	m_pFieldBuffer = new BYTE [ m_iFieldBufferMax ];
29263 	m_bFixupUTF8 = bFixupUTF8;
29264 	m_pPipe = pPipe;
29265 	m_tSchema.Reset ();
29266 	bool bWordDict = ( m_pDict && m_pDict->GetSettings().m_bWordDict );
29267 	bool bOk = true;
29268 
29269 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_uint"),		SPH_ATTR_INTEGER,	m_tSchema, sError );
29270 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_timestamp"),	SPH_ATTR_TIMESTAMP,	m_tSchema, sError );
29271 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_bool"),		SPH_ATTR_BOOL,		m_tSchema, sError );
29272 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_float"),		SPH_ATTR_FLOAT,		m_tSchema, sError );
29273 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_bigint"),		SPH_ATTR_BIGINT,	m_tSchema, sError );
29274 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_multi"),		SPH_ATTR_UINT32SET,	m_tSchema, sError );
29275 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_multi_64"),	SPH_ATTR_INT64SET,	m_tSchema, sError );
29276 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_string"),		SPH_ATTR_STRING,	m_tSchema, sError );
29277 	bOk &= ConfigureAttrs ( hSource("xmlpipe_attr_json"),		SPH_ATTR_JSON,		m_tSchema, sError );
29278 
29279 	bOk &= ConfigureAttrs ( hSource("xmlpipe_field_string"),	SPH_ATTR_STRING,	m_tSchema, sError );
29280 
29281 	if ( !bOk )
29282 		return false;
29283 
29284 	if ( !SourceCheckSchema ( m_tSchema, sError ) )
29285 		return false;
29286 
29287 	ConfigureFields ( hSource("xmlpipe_field"), bWordDict, m_tSchema );
29288 	ConfigureFields ( hSource("xmlpipe_field_string"), bWordDict, m_tSchema );
29289 
29290 	AllocDocinfo();
29291 	return true;
29292 }
29293 
29294 
Connect(CSphString & sError)29295 bool CSphSource_XMLPipe2::Connect ( CSphString & sError )
29296 {
29297 	assert ( m_pBuffer && m_pFieldBuffer );
29298 
29299 	if_const ( !InitDynamicExpat() )
29300 	{
29301 		sError.SetSprintf ( "xmlpipe: failed to load libexpat library" );
29302 		return false;
29303 	}
29304 
29305 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
29306 	{
29307 		CSphColumnInfo & tCol = m_tSchema.m_dFields[i];
29308 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), m_pDict && m_pDict->GetSettings().m_bWordDict );
29309 	}
29310 
29311 	if ( !AddAutoAttrs ( sError ) )
29312 		return false;
29313 	AllocDocinfo();
29314 
29315 	m_pParser = sph_XML_ParserCreate(NULL);
29316 	if ( !m_pParser )
29317 	{
29318 		sError.SetSprintf ( "xmlpipe: failed to create XML parser" );
29319 		return false;
29320 	}
29321 
29322 	sph_XML_SetUserData ( m_pParser, this );
29323 	sph_XML_SetElementHandler ( m_pParser, xmlStartElement, xmlEndElement );
29324 	sph_XML_SetCharacterDataHandler ( m_pParser, xmlCharacters );
29325 
29326 #if USE_LIBICONV
29327 	sph_XML_SetUnknownEncodingHandler ( m_pParser, xmlUnknownEncoding, NULL );
29328 #endif
29329 
29330 	m_dKillList.Reserve ( 1024 );
29331 	m_dKillList.Resize ( 0 );
29332 
29333 	m_bRemoveParsed = false;
29334 	m_bInDocset = false;
29335 	m_bInSchema = false;
29336 	m_bInDocument = false;
29337 	m_bInKillList = false;
29338 	m_bInId = false;
29339 	m_bFirstTagAfterDocset = false;
29340 	m_iCurField = -1;
29341 	m_iCurAttr = -1;
29342 	m_iElementDepth = 0;
29343 
29344 	m_dParsedDocuments.Reset ();
29345 	m_dDefaultAttrs.Reset ();
29346 	m_dInvalid.Reset ();
29347 	m_dWarned.Reset ();
29348 
29349 	m_dParsedDocuments.Reserve ( 1024 );
29350 	m_dParsedDocuments.Resize ( 0 );
29351 
29352 	m_iKillListIterator = 0;
29353 
29354 	m_iMVA = 0;
29355 	m_iMVAIterator = 0;
29356 
29357 	m_sError = "";
29358 
29359 	int iBytesRead = fread ( m_pBuffer, 1, m_iBufferSize, m_pPipe );
29360 
29361 	if ( !ParseNextChunk ( iBytesRead, sError ) )
29362 		return false;
29363 
29364 	m_dAttrToMVA.Resize ( 0 );
29365 
29366 	int iFieldMVA = 0;
29367 	for ( int i = 0; i < m_tSchema.GetAttrsCount (); i++ )
29368 	{
29369 		const CSphColumnInfo & tCol = m_tSchema.GetAttr ( i );
29370 		if ( ( tCol.m_eAttrType==SPH_ATTR_UINT32SET || tCol.m_eAttrType==SPH_ATTR_INT64SET ) && tCol.m_eSrc==SPH_ATTRSRC_FIELD )
29371 			m_dAttrToMVA.Add ( iFieldMVA++ );
29372 		else
29373 			m_dAttrToMVA.Add ( -1 );
29374 	}
29375 
29376 	m_dFieldMVAs.Resize ( iFieldMVA );
29377 	ARRAY_FOREACH ( i, m_dFieldMVAs )
29378 		m_dFieldMVAs[i].Reserve ( 16 );
29379 
29380 	m_tHits.m_dData.Reserve ( m_iMaxHits );
29381 
29382 	return true;
29383 }
29384 
29385 
ParseNextChunk(int iBufferLen,CSphString & sError)29386 bool CSphSource_XMLPipe2::ParseNextChunk ( int iBufferLen, CSphString & sError )
29387 {
29388 	if ( !iBufferLen )
29389 		return true;
29390 
29391 	bool bLast = ( iBufferLen!=m_iBufferSize );
29392 
29393 	m_iReparseLen = 0;
29394 	if ( m_bFixupUTF8 )
29395 	{
29396 		BYTE * p = m_pBuffer;
29397 		BYTE * pMax = m_pBuffer + iBufferLen;
29398 
29399 		while ( p<pMax )
29400 		{
29401 			BYTE v = *p;
29402 
29403 			// fix control codes
29404 			if ( v<0x20 && v!=0x0D && v!=0x0A )
29405 			{
29406 				*p++ = ' ';
29407 				continue;
29408 			}
29409 
29410 			// accept ascii7 codes
29411 			if ( v<128 )
29412 			{
29413 				p++;
29414 				continue;
29415 			}
29416 
29417 			// remove invalid start bytes
29418 			if ( v<0xC2 )
29419 			{
29420 				*p++ = ' ';
29421 				continue;
29422 			}
29423 
29424 			// get and check byte count
29425 			int iBytes = 0;
29426 			while ( v & 0x80 )
29427 			{
29428 				iBytes++;
29429 				v <<= 1;
29430 			}
29431 			if ( iBytes<2 || iBytes>3 )
29432 			{
29433 				*p++ = ' ';
29434 				continue;
29435 			}
29436 
29437 			// if we're on a boundary, save these few bytes for the future
29438 			if ( p+iBytes>pMax )
29439 			{
29440 				m_iReparseStart = (int)(p-m_pBuffer);
29441 				m_iReparseLen = (int)(pMax-p);
29442 				iBufferLen -= m_iReparseLen;
29443 				break;
29444 			}
29445 
29446 			// otherwise (not a boundary), check them all
29447 			int i = 1;
29448 			int iVal = ( v >> iBytes );
29449 			for ( ; i<iBytes; i++ )
29450 			{
29451 				if ( ( p[i] & 0xC0 )!=0x80 )
29452 					break;
29453 				iVal = ( iVal<<6 ) + ( p[i] & 0x3f );
29454 			}
29455 
29456 			if ( i!=iBytes // remove invalid sequences
29457 				|| ( iVal>=0xd800 && iVal<=0xdfff ) // and utf-16 surrogate pairs
29458 				|| ( iBytes==3 && iVal<0x800 ) // and overlong 3-byte codes
29459 				|| ( iVal>=0xfff0 && iVal<=0xffff ) ) // and kinda-valid specials expat chokes on anyway
29460 			{
29461 				iBytes = i;
29462 				for ( i=0; i<iBytes; i++ )
29463 					p[i] = ' ';
29464 			}
29465 
29466 			// only move forward by the amount of succesfully processed bytes!
29467 			p += i;
29468 		}
29469 	}
29470 
29471 	if ( sph_XML_Parse ( m_pParser, (const char*) m_pBuffer, iBufferLen, bLast )!=XML_STATUS_OK )
29472 	{
29473 		SphDocID_t uFailedID = 0;
29474 		if ( m_dParsedDocuments.GetLength() )
29475 			uFailedID = m_dParsedDocuments.Last()->m_uDocID;
29476 
29477 		sError.SetSprintf ( "source '%s': XML parse error: %s (line=%d, pos=%d, docid=" DOCID_FMT ")",
29478 			m_tSchema.m_sName.cstr(), sph_XML_ErrorString ( sph_XML_GetErrorCode ( m_pParser ) ),
29479 			(int)sph_XML_GetCurrentLineNumber ( m_pParser ), (int)sph_XML_GetCurrentColumnNumber ( m_pParser ),
29480 			uFailedID );
29481 		m_tDocInfo.m_uDocID = 1;
29482 		return false;
29483 	}
29484 
29485 	if ( !m_sError.IsEmpty () )
29486 	{
29487 		sError = m_sError;
29488 		m_tDocInfo.m_uDocID = 1;
29489 		return false;
29490 	}
29491 
29492 	return true;
29493 }
29494 
29495 
NextDocument(CSphString & sError)29496 BYTE **	CSphSource_XMLPipe2::NextDocument ( CSphString & sError )
29497 {
29498 	assert ( m_pBuffer && m_pFieldBuffer );
29499 
29500 	if ( m_bRemoveParsed )
29501 	{
29502 		SafeDelete ( m_dParsedDocuments[0] );
29503 		m_dParsedDocuments.RemoveFast ( 0 );
29504 		m_bRemoveParsed = false;
29505 	}
29506 
29507 	int iReadResult = 0;
29508 
29509 	while ( m_dParsedDocuments.GetLength()==0 )
29510 	{
29511 		// saved bytes to the front!
29512 		if ( m_iReparseLen )
29513 			memmove ( m_pBuffer, m_pBuffer+m_iReparseStart, m_iReparseLen );
29514 
29515 		// read more data
29516 		iReadResult = fread ( m_pBuffer+m_iReparseLen, 1, m_iBufferSize-m_iReparseLen, m_pPipe );
29517 		if ( iReadResult==0 )
29518 			break;
29519 
29520 		// and parse it
29521 		if ( !ParseNextChunk ( iReadResult+m_iReparseLen, sError ) )
29522 			return NULL;
29523 	}
29524 
29525 	while ( m_dParsedDocuments.GetLength()!=0 )
29526 	{
29527 		Document_t * pDocument = m_dParsedDocuments[0];
29528 		int nAttrs = m_tSchema.GetAttrsCount ();
29529 
29530 		// docid
29531 		m_tDocInfo.m_uDocID = VerifyID ( pDocument->m_uDocID );
29532 		if ( m_tDocInfo.m_uDocID==0 )
29533 		{
29534 			SafeDelete ( m_dParsedDocuments[0] );
29535 			m_dParsedDocuments.RemoveFast ( 0 );
29536 			continue;
29537 		}
29538 
29539 		int iFieldLenAttr = nAttrs;
29540 		if ( m_bIndexFieldLens )
29541 			iFieldLenAttr = nAttrs - m_tSchema.m_dFields.GetLength();
29542 
29543 		// attributes
29544 		for ( int i = 0; i < nAttrs; i++ )
29545 		{
29546 			const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( i );
29547 
29548 			// reset, and the value will be filled by IterateHits()
29549 			if ( i>=iFieldLenAttr )
29550 			{
29551 				assert ( tAttr.m_eAttrType==SPH_ATTR_TOKENCOUNT );
29552 				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
29553 				continue;
29554 			}
29555 
29556 			const CSphString & sAttrValue = pDocument->m_dAttrs[i].IsEmpty () && m_dDefaultAttrs.GetLength ()
29557 				? m_dDefaultAttrs[i]
29558 				: pDocument->m_dAttrs[i];
29559 
29560 			if ( tAttr.m_eAttrType==SPH_ATTR_UINT32SET || tAttr.m_eAttrType==SPH_ATTR_INT64SET )
29561 			{
29562 				m_tDocInfo.SetAttr ( tAttr.m_tLocator, ParseFieldMVA ( m_dMva, sAttrValue.cstr (), tAttr.m_eAttrType==SPH_ATTR_INT64SET ) );
29563 				continue;
29564 			}
29565 
29566 			switch ( tAttr.m_eAttrType )
29567 			{
29568 				case SPH_ATTR_STRING:
29569 				case SPH_ATTR_JSON:
29570 					m_dStrAttrs[i] = sAttrValue.cstr ();
29571 					if ( !m_dStrAttrs[i].cstr() )
29572 						m_dStrAttrs[i] = "";
29573 
29574 					m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
29575 					break;
29576 
29577 				case SPH_ATTR_FLOAT:
29578 					m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( sAttrValue.cstr () ) );
29579 					break;
29580 
29581 				case SPH_ATTR_BIGINT:
29582 					m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( sAttrValue.cstr () ) );
29583 					break;
29584 
29585 				default:
29586 					m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( sAttrValue.cstr () ) );
29587 					break;
29588 			}
29589 		}
29590 
29591 		m_bRemoveParsed = true;
29592 
29593 		int nFields = m_tSchema.m_dFields.GetLength ();
29594 		if ( !nFields )
29595 		{
29596 			m_tDocInfo.m_uDocID = 0;
29597 			return NULL;
29598 		}
29599 
29600 		m_dFieldPtrs.Resize ( nFields );
29601 		for ( int i = 0; i < nFields; ++i )
29602 			m_dFieldPtrs[i] = pDocument->m_dFields[i].Begin();
29603 
29604 		return (BYTE **)&( m_dFieldPtrs[0] );
29605 	}
29606 
29607 	if ( !iReadResult )
29608 		m_tDocInfo.m_uDocID = 0;
29609 
29610 	return NULL;
29611 }
29612 
29613 
IterateKillListStart(CSphString &)29614 bool CSphSource_XMLPipe2::IterateKillListStart ( CSphString & )
29615 {
29616 	m_iKillListIterator = 0;
29617 	return true;
29618 }
29619 
29620 
IterateKillListNext(SphDocID_t & uDocId)29621 bool CSphSource_XMLPipe2::IterateKillListNext ( SphDocID_t & uDocId )
29622 {
29623 	if ( m_iKillListIterator>=m_dKillList.GetLength () )
29624 		return false;
29625 
29626 	uDocId = m_dKillList [ m_iKillListIterator++ ];
29627 	return true;
29628 }
29629 
/// known xmlpipe2 control tags
enum EXMLElem
{
	ELEM_DOCSET,	///< <sphinx:docset>
	ELEM_SCHEMA,	///< <sphinx:schema>
	ELEM_FIELD,		///< <sphinx:field>
	ELEM_ATTR,		///< <sphinx:attr>
	ELEM_DOCUMENT,	///< <sphinx:document>
	ELEM_KLIST,		///< <sphinx:killlist>
	ELEM_NONE		///< anything else (document fields/attrs, ignored tags)
};

/// map a tag name to one of the known control elements;
/// everything that is not an exact "sphinx:*" control tag maps to ELEM_NONE
static EXMLElem LookupElement ( const char * szName )
{
	// cheap reject for the common case (field and attribute names)
	if ( szName[0]!='s' )
		return ELEM_NONE;

	if ( !strcmp ( szName, "sphinx:docset" ) )		return ELEM_DOCSET;
	if ( !strcmp ( szName, "sphinx:schema" ) )		return ELEM_SCHEMA;
	if ( !strcmp ( szName, "sphinx:field" ) )		return ELEM_FIELD;
	if ( !strcmp ( szName, "sphinx:attr" ) )		return ELEM_ATTR;
	if ( !strcmp ( szName, "sphinx:document" ) )	return ELEM_DOCUMENT;
	if ( !strcmp ( szName, "sphinx:killlist" ) )	return ELEM_KLIST;

	return ELEM_NONE;
}
29663 
StartElement(const char * szName,const char ** pAttrs)29664 void CSphSource_XMLPipe2::StartElement ( const char * szName, const char ** pAttrs )
29665 {
29666 	EXMLElem ePos = LookupElement ( szName );
29667 
29668 	switch ( ePos )
29669 	{
29670 	case ELEM_DOCSET:
29671 		m_bInDocset = true;
29672 		m_bFirstTagAfterDocset = true;
29673 		return;
29674 
29675 	case ELEM_SCHEMA:
29676 	{
29677 		if ( !m_bInDocset || !m_bFirstTagAfterDocset )
29678 		{
29679 			Error ( "<sphinx:schema> is allowed immediately after <sphinx:docset> only" );
29680 			return;
29681 		}
29682 
29683 		if ( m_tSchema.m_dFields.GetLength () > 0 || m_tSchema.GetAttrsCount () > 0 )
29684 		{
29685 			sphWarn ( "%s", DecorateMessage ( "both embedded and configured schemas found; using embedded" ) );
29686 			m_tSchema.Reset ();
29687 			CSphMatch tDocInfo;
29688 			Swap ( m_tDocInfo, tDocInfo );
29689 		}
29690 
29691 		m_bFirstTagAfterDocset = false;
29692 		m_bInSchema = true;
29693 	}
29694 	return;
29695 
29696 	case ELEM_FIELD:
29697 	{
29698 		if ( !m_bInDocset || !m_bInSchema )
29699 		{
29700 			Error ( "<sphinx:field> is allowed inside <sphinx:schema> only" );
29701 			return;
29702 		}
29703 
29704 		const char ** dAttrs = pAttrs;
29705 		CSphColumnInfo Info;
29706 		CSphString sDefault;
29707 		bool bIsAttr = false;
29708 		bool bWordDict = ( m_pDict && m_pDict->GetSettings().m_bWordDict );
29709 
29710 		while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] )
29711 		{
29712 			if ( !strcmp ( *dAttrs, "name" ) )
29713 			{
29714 				AddFieldToSchema ( dAttrs[1], bWordDict, m_tSchema );
29715 				Info.m_sName = dAttrs[1];
29716 			} else if ( !strcmp ( *dAttrs, "attr" ) )
29717 			{
29718 				bIsAttr = true;
29719 				if ( !strcmp ( dAttrs[1], "string" ) )
29720 					Info.m_eAttrType = SPH_ATTR_STRING;
29721 				else if ( !strcmp ( dAttrs[1], "json" ) )
29722 					Info.m_eAttrType = SPH_ATTR_JSON;
29723 
29724 			} else if ( !strcmp ( *dAttrs, "default" ) )
29725 				sDefault = dAttrs[1];
29726 
29727 			dAttrs += 2;
29728 		}
29729 
29730 		if ( bIsAttr )
29731 		{
29732 			if ( CSphSchema::IsReserved ( Info.m_sName.cstr() ) )
29733 			{
29734 				Error ( "%s is not a valid attribute name", Info.m_sName.cstr() );
29735 				return;
29736 			}
29737 
29738 			Info.m_iIndex = m_tSchema.GetAttrsCount ();
29739 			m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
29740 			m_dDefaultAttrs.Add ( sDefault );
29741 		}
29742 	}
29743 	return;
29744 
29745 	case ELEM_ATTR:
29746 	{
29747 		if ( !m_bInDocset || !m_bInSchema )
29748 		{
29749 			Error ( "<sphinx:attr> is allowed inside <sphinx:schema> only" );
29750 			return;
29751 		}
29752 
29753 		bool bError = false;
29754 		CSphString sDefault;
29755 
29756 		CSphColumnInfo Info;
29757 		Info.m_eAttrType = SPH_ATTR_INTEGER;
29758 
29759 		const char ** dAttrs = pAttrs;
29760 
29761 		while ( dAttrs[0] && dAttrs[1] && dAttrs[0][0] && dAttrs[1][0] && !bError )
29762 		{
29763 			if ( !strcmp ( *dAttrs, "name" ) )
29764 				Info.m_sName = dAttrs[1];
29765 			else if ( !strcmp ( *dAttrs, "bits" ) )
29766 				Info.m_tLocator.m_iBitCount = strtol ( dAttrs[1], NULL, 10 );
29767 			else if ( !strcmp ( *dAttrs, "default" ) )
29768 				sDefault = dAttrs[1];
29769 			else if ( !strcmp ( *dAttrs, "type" ) )
29770 			{
29771 				const char * szType = dAttrs[1];
29772 				if ( !strcmp ( szType, "int" ) )				Info.m_eAttrType = SPH_ATTR_INTEGER;
29773 				else if ( !strcmp ( szType, "timestamp" ) )		Info.m_eAttrType = SPH_ATTR_TIMESTAMP;
29774 				else if ( !strcmp ( szType, "bool" ) )			Info.m_eAttrType = SPH_ATTR_BOOL;
29775 				else if ( !strcmp ( szType, "float" ) )			Info.m_eAttrType = SPH_ATTR_FLOAT;
29776 				else if ( !strcmp ( szType, "bigint" ) )		Info.m_eAttrType = SPH_ATTR_BIGINT;
29777 				else if ( !strcmp ( szType, "string" ) )		Info.m_eAttrType = SPH_ATTR_STRING;
29778 				else if ( !strcmp ( szType, "json" ) )			Info.m_eAttrType = SPH_ATTR_JSON;
29779 				else if ( !strcmp ( szType, "multi" ) )
29780 				{
29781 					Info.m_eAttrType = SPH_ATTR_UINT32SET;
29782 					Info.m_eSrc = SPH_ATTRSRC_FIELD;
29783 				} else if ( !strcmp ( szType, "multi_64" ) )
29784 				{
29785 					Info.m_eAttrType = SPH_ATTR_INT64SET;
29786 					Info.m_eSrc = SPH_ATTRSRC_FIELD;
29787 				} else
29788 				{
29789 					Error ( "unknown column type '%s'", szType );
29790 					bError = true;
29791 				}
29792 			}
29793 
29794 			dAttrs += 2;
29795 		}
29796 
29797 		if ( !bError )
29798 		{
29799 			if ( CSphSchema::IsReserved ( Info.m_sName.cstr() ) )
29800 			{
29801 				Error ( "%s is not a valid attribute name", Info.m_sName.cstr() );
29802 				return;
29803 			}
29804 
29805 			Info.m_iIndex = m_tSchema.GetAttrsCount ();
29806 			m_tSchema.AddAttr ( Info, true ); // all attributes are dynamic at indexing time
29807 			m_dDefaultAttrs.Add ( sDefault );
29808 		}
29809 	}
29810 	return;
29811 
29812 	case ELEM_DOCUMENT:
29813 	{
29814 		if ( !m_bInDocset || m_bInSchema )
29815 			return DocumentError ( "<sphinx:schema>" );
29816 
29817 		if ( m_bInKillList )
29818 			return DocumentError ( "<sphinx:killlist>" );
29819 
29820 		if ( m_bInDocument )
29821 			return DocumentError ( "<sphinx:document>" );
29822 
29823 		if ( m_tSchema.m_dFields.GetLength()==0 && m_tSchema.GetAttrsCount()==0 )
29824 		{
29825 			Error ( "no schema configured, and no embedded schema found" );
29826 			return;
29827 		}
29828 
29829 		m_bInDocument = true;
29830 
29831 		assert ( !m_pCurDocument );
29832 		m_pCurDocument = new Document_t;
29833 
29834 		m_pCurDocument->m_uDocID = 0;
29835 		m_pCurDocument->m_dFields.Resize ( m_tSchema.m_dFields.GetLength () );
29836 		// for safety
29837 		ARRAY_FOREACH ( i, m_pCurDocument->m_dFields )
29838 			m_pCurDocument->m_dFields[i].Add ( '\0' );
29839 		m_pCurDocument->m_dAttrs.Resize ( m_tSchema.GetAttrsCount () );
29840 
29841 		if ( pAttrs[0] && pAttrs[1] && pAttrs[0][0] && pAttrs[1][0] )
29842 			if ( !strcmp ( pAttrs[0], "id" ) )
29843 				m_pCurDocument->m_uDocID = sphToDocid ( pAttrs[1] );
29844 
29845 		if ( m_pCurDocument->m_uDocID==0 )
29846 			Error ( "attribute 'id' required in <sphinx:document>" );
29847 	}
29848 	return;
29849 
29850 	case ELEM_KLIST:
29851 	{
29852 		if ( !m_bInDocset || m_bInDocument || m_bInSchema )
29853 		{
29854 			Error ( "<sphinx:killlist> is not allowed inside <sphinx:schema> or <sphinx:document>" );
29855 			return;
29856 		}
29857 
29858 		m_bInKillList = true;
29859 	}
29860 	return;
29861 
29862 	case ELEM_NONE: break; // avoid warning
29863 	}
29864 
29865 	if ( m_bInKillList )
29866 	{
29867 		if ( m_bInId )
29868 		{
29869 			m_iElementDepth++;
29870 			return;
29871 		}
29872 
29873 		if ( strcmp ( szName, "id" ) )
29874 		{
29875 			Error ( "only 'id' is allowed inside <sphinx:killlist>" );
29876 			return;
29877 		}
29878 
29879 		m_bInId = true;
29880 
29881 	} else if ( m_bInDocument )
29882 	{
29883 		if ( m_iCurField!=-1 || m_iCurAttr!=-1 )
29884 		{
29885 			m_iElementDepth++;
29886 			return;
29887 		}
29888 
29889 		for ( int i = 0; i < m_tSchema.m_dFields.GetLength () && m_iCurField==-1; i++ )
29890 			if ( m_tSchema.m_dFields[i].m_sName==szName )
29891 				m_iCurField = i;
29892 
29893 		m_iCurAttr = m_tSchema.GetAttrIndex ( szName );
29894 
29895 		if ( m_iCurAttr!=-1 || m_iCurField!=-1 )
29896 			return;
29897 
29898 		m_bInIgnoredTag = true;
29899 
29900 		bool bInvalidFound = false;
29901 		for ( int i = 0; i < m_dInvalid.GetLength () && !bInvalidFound; i++ )
29902 			bInvalidFound = m_dInvalid[i]==szName;
29903 
29904 		if ( !bInvalidFound )
29905 		{
29906 			sphWarn ( "%s", DecorateMessage ( "unknown field/attribute '%s'; ignored", szName ) );
29907 			m_dInvalid.Add ( szName );
29908 		}
29909 	}
29910 }
29911 
29912 
EndElement(const char * szName)29913 void CSphSource_XMLPipe2::EndElement ( const char * szName )
29914 {
29915 	m_bInIgnoredTag = false;
29916 
29917 	EXMLElem ePos = LookupElement ( szName );
29918 
29919 	switch ( ePos )
29920 	{
29921 	case ELEM_DOCSET:
29922 		m_bInDocset = false;
29923 		return;
29924 
29925 	case ELEM_SCHEMA:
29926 		m_bInSchema = false;
29927 		AddAutoAttrs ( m_sError );
29928 		AllocDocinfo();
29929 		return;
29930 
29931 	case ELEM_DOCUMENT:
29932 		m_bInDocument = false;
29933 		if ( m_pCurDocument )
29934 			m_dParsedDocuments.Add ( m_pCurDocument );
29935 		m_pCurDocument = NULL;
29936 		return;
29937 
29938 	case ELEM_KLIST:
29939 		m_bInKillList = false;
29940 		return;
29941 
29942 	case ELEM_FIELD: // avoid warnings
29943 	case ELEM_ATTR:
29944 	case ELEM_NONE: break;
29945 	}
29946 
29947 	if ( m_bInKillList )
29948 	{
29949 		if ( m_iElementDepth!=0 )
29950 		{
29951 			m_iElementDepth--;
29952 			return;
29953 		}
29954 
29955 		if ( m_bInId )
29956 		{
29957 			m_pFieldBuffer [ Min ( m_iFieldBufferLen, m_iFieldBufferMax-1 ) ] = '\0';
29958 			m_dKillList.Add ( sphToDocid ( (const char *)m_pFieldBuffer ) );
29959 			m_iFieldBufferLen = 0;
29960 			m_bInId = false;
29961 		}
29962 
29963 	} else if ( m_bInDocument && ( m_iCurAttr!=-1 || m_iCurField!=-1 ) )
29964 	{
29965 		if ( m_iElementDepth!=0 )
29966 		{
29967 			m_iElementDepth--;
29968 			return;
29969 		}
29970 
29971 		if ( m_iCurField!=-1 )
29972 		{
29973 			assert ( m_pCurDocument );
29974 			CSphVector<BYTE> & dBuf = m_pCurDocument->m_dFields [ m_iCurField ];
29975 
29976 			dBuf.Last() = ' ';
29977 			dBuf.Reserve ( dBuf.GetLength() + m_iFieldBufferLen + 6 ); // 6 is a safety gap
29978 			memcpy ( dBuf.Begin()+dBuf.GetLength(), m_pFieldBuffer, m_iFieldBufferLen );
29979 			dBuf.Resize ( dBuf.GetLength()+m_iFieldBufferLen );
29980 			dBuf.Add ( '\0' );
29981 		}
29982 		if ( m_iCurAttr!=-1 )
29983 		{
29984 			assert ( m_pCurDocument );
29985 			if ( !m_pCurDocument->m_dAttrs [ m_iCurAttr ].IsEmpty () )
29986 				sphWarn ( "duplicate attribute node <%s> - using first value", m_tSchema.GetAttr ( m_iCurAttr ).m_sName.cstr() );
29987 			else
29988 				m_pCurDocument->m_dAttrs [ m_iCurAttr ].SetBinary ( (char*)m_pFieldBuffer, m_iFieldBufferLen );
29989 		}
29990 
29991 		m_iFieldBufferLen = 0;
29992 
29993 		m_iCurAttr = -1;
29994 		m_iCurField = -1;
29995 	}
29996 }
29997 
29998 
UnexpectedCharaters(const char * pCharacters,int iLen,const char * szComment)29999 void CSphSource_XMLPipe2::UnexpectedCharaters ( const char * pCharacters, int iLen, const char * szComment )
30000 {
30001 	const int MAX_WARNING_LENGTH = 64;
30002 
30003 	bool bSpaces = true;
30004 	for ( int i = 0; i < iLen && bSpaces; i++ )
30005 		if ( !sphIsSpace ( pCharacters[i] ) )
30006 			bSpaces = false;
30007 
30008 	if ( !bSpaces )
30009 	{
30010 		CSphString sWarning;
30011 		sWarning.SetBinary ( pCharacters, Min ( iLen, MAX_WARNING_LENGTH ) );
30012 		sphWarn ( "source '%s': unexpected string '%s' (line=%d, pos=%d) %s",
30013 			m_tSchema.m_sName.cstr(), sWarning.cstr (),
30014 			(int)sph_XML_GetCurrentLineNumber ( m_pParser ), (int)sph_XML_GetCurrentColumnNumber ( m_pParser ), szComment );
30015 	}
30016 }
30017 
30018 
/// expat character-data callback: accumulate incoming text into the current
/// field/attribute buffer; pCharacters is NOT null-terminated, iLen is its length
void CSphSource_XMLPipe2::Characters ( const char * pCharacters, int iLen )
{
	// text inside an unrecognized tag is silently dropped
	if ( m_bInIgnoredTag )
		return;

	// warn (but keep parsing) when text appears where no element expects it
	if ( !m_bInDocset )
	{
		UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:docset>" );
		return;
	}

	if ( !m_bInSchema && !m_bInDocument && !m_bInKillList )
	{
		UnexpectedCharaters ( pCharacters, iLen, "outside of <sphinx:schema> and <sphinx:document>" );
		return;
	}

	if ( m_iCurAttr==-1 && m_iCurField==-1 && !m_bInKillList )
	{
		UnexpectedCharaters ( pCharacters, iLen, m_bInDocument ? "inside <sphinx:document>" : ( m_bInSchema ? "inside <sphinx:schema>" : "" ) );
		return;
	}

	if ( iLen + m_iFieldBufferLen < m_iFieldBufferMax )
	{
		// room left in the accumulation buffer; append this chunk
		memcpy ( m_pFieldBuffer + m_iFieldBufferLen, pCharacters, iLen );
		m_iFieldBufferLen += iLen;

	} else
	{
		// buffer full: drop the excess and warn once per field/attribute name
		// NOTE(review): in the kill-list case both m_iCurField and m_iCurAttr can be -1,
		// which would make GetAttr(-1) invalid, and m_pCurDocument may be NULL below -
		// presumably kill-list ids never overflow the buffer; verify against the element handlers
		const CSphString & sName = ( m_iCurField!=-1 ) ? m_tSchema.m_dFields[m_iCurField].m_sName : m_tSchema.GetAttr ( m_iCurAttr ).m_sName;

		bool bWarned = false;
		for ( int i = 0; i < m_dWarned.GetLength () && !bWarned; i++ )
			bWarned = m_dWarned[i]==sName;

		if ( !bWarned )
		{
			sphWarn ( "source '%s': field/attribute '%s' length exceeds max length (line=%d, pos=%d, docid=" DOCID_FMT ")",
				m_tSchema.m_sName.cstr(), sName.cstr(),
				(int)sph_XML_GetCurrentLineNumber ( m_pParser ), (int)sph_XML_GetCurrentColumnNumber ( m_pParser ),
				m_pCurDocument->m_uDocID );

			m_dWarned.Add ( sName );
		}
	}
}
30066 
/// factory: create and set up an xmlpipe2 source reading from pPipe
/// returns NULL on setup failure (SafeDelete zeroes the pointer); sError gets the reason
CSphSource * sphCreateSourceXmlpipe2 ( const CSphConfigSection * pSource, FILE * pPipe,
	const char * szSourceName, int iMaxFieldLen, bool RLPARG(bProxy), CSphString & sError )
{
	CSphSource_XMLPipe2 * pXMLPipe;
	// optional on-the-fly UTF-8 fixup, off by default
	bool bUTF8 = pSource->GetInt ( "xmlpipe_fixup_utf8", 0 )!=0;

#if USE_RLP
	// with RLP enabled, optionally wrap the source into a text-preprocessing proxy
	if ( bProxy )
		pXMLPipe = new CSphSource_Proxy<CSphSource_XMLPipe2> ( szSourceName );
	else
#endif
		pXMLPipe = new CSphSource_XMLPipe2 ( szSourceName );

	if ( !pXMLPipe->Setup ( iMaxFieldLen, bUTF8, pPipe, *pSource, sError ) )
		SafeDelete ( pXMLPipe );

	return pXMLPipe;
}
30085 
30086 #endif
30087 
30088 
30089 #if USE_ODBC
30090 #if DL_UNIXODBC
30091 // ODBC lib might be libodbc.so or libiodbc.so
30092 #define ODBC_NUM_FUNCS (13)
30093 
30094 #if defined(__INTEL_COMPILER) || defined(__ICL) || defined(__ICC) || defined(__ECC) || defined(__GNUC__)
30095 
30096 // use non-standard compiler extension __typeof__
30097 // it allow to declare pointer to the function without using it's declaration
30098 typedef __typeof__ ( SQLFreeHandle ) *xSQLFreeHandle;
30099 typedef __typeof__ ( SQLDisconnect ) *xSQLDisconnect;
30100 typedef __typeof__ ( SQLCloseCursor ) *xSQLCloseCursor;
30101 typedef __typeof__ ( SQLGetDiagRec ) *xSQLGetDiagRec;
30102 typedef __typeof__ ( SQLSetEnvAttr ) *xSQLSetEnvAttr;
30103 typedef __typeof__ ( SQLAllocHandle ) *xSQLAllocHandle;
30104 typedef __typeof__ ( SQLFetch ) *xSQLFetch;
30105 typedef __typeof__ ( SQLExecDirect ) *xSQLExecDirect;
30106 typedef __typeof__ ( SQLNumResultCols ) *xSQLNumResultCols;
30107 typedef __typeof__ ( SQLDescribeCol ) *xSQLDescribeCol;
30108 typedef __typeof__ ( SQLBindCol ) *xSQLBindCol;
30109 typedef __typeof__ ( SQLDrivers ) *xSQLDrivers;
30110 typedef __typeof__ ( SQLDriverConnect ) *xSQLDriverConnect;
30111 
30112 #else // compilers which are not known about __typeof__ support
30113 // declarations below are directly copy-pasted from expat.h,
30114 // and then (*x...) is placed around the function names.
30115 // In mostly cases this code will not be used, and the declarations
30116 // from previous block will be used instead.
30117 #warning Be sure that the unixodbc function signatures are the same \
30118 as in sql.h and sqlext.h Correct the code below if this is not so.
30119 typedef SQLRETURN  SQL_API (*xSQLFreeHandle)(SQLSMALLINT HandleType, SQLHANDLE Handle) //NOLINT
30120 typedef SQLRETURN  SQL_API (*xSQLDisconnect)(SQLHDBC ConnectionHandle); //NOLINT
30121 typedef SQLRETURN  SQL_API (*xSQLCloseCursor)(SQLHSTMT StatementHandle); //NOLINT
30122 typedef SQLRETURN  SQL_API (*xSQLGetDiagRec)(SQLSMALLINT HandleType, SQLHANDLE Handle, //NOLINT
30123                                      SQLSMALLINT RecNumber, SQLCHAR *Sqlstate, //NOLINT
30124                                      SQLINTEGER *NativeError, SQLCHAR *MessageText, //NOLINT
30125                                      SQLSMALLINT BufferLength, SQLSMALLINT *TextLength); //NOLINT
30126 typedef SQLRETURN  SQL_API (*xSQLSetEnvAttr)(SQLHENV EnvironmentHandle, //NOLINT
30127                                      SQLINTEGER Attribute, SQLPOINTER Value, //NOLINT
30128                                      SQLINTEGER StringLength); //NOLINT
30129 typedef SQLRETURN  SQL_API (*xSQLAllocHandle)(SQLSMALLINT HandleType, //NOLINT
30130                                       SQLHANDLE InputHandle, SQLHANDLE *OutputHandle); //NOLINT
30131 typedef SQLRETURN  SQL_API (*xSQLFetch)(SQLHSTMT StatementHandle); //NOLINT
30132 typedef SQLRETURN  SQL_API (*xSQLExecDirect)(SQLHSTMT StatementHandle, //NOLINT
30133                                      SQLCHAR *StatementText, SQLINTEGER TextLength); //NOLINT
30134 typedef SQLRETURN  SQL_API (*xSQLNumResultCols)(SQLHSTMT StatementHandle, //NOLINT
30135                                         SQLSMALLINT *ColumnCount); //NOLINT
30136 typedef SQLRETURN  SQL_API (*xSQLDescribeCol)(SQLHSTMT StatementHandle, //NOLINT
30137                                       SQLUSMALLINT ColumnNumber, SQLCHAR *ColumnName, //NOLINT
30138                                       SQLSMALLINT BufferLength, SQLSMALLINT *NameLength, //NOLINT
30139                                       SQLSMALLINT *DataType, SQLULEN *ColumnSize, //NOLINT
30140                                       SQLSMALLINT *DecimalDigits, SQLSMALLINT *Nullable); //NOLINT
30141 typedef SQLRETURN  SQL_API (*xSQLBindCol)(SQLHSTMT StatementHandle, //NOLINT
30142                                   SQLUSMALLINT ColumnNumber, SQLSMALLINT TargetType, //NOLINT
30143                                   SQLPOINTER TargetValue, SQLLEN BufferLength, //NOLINT
30144                                   SQLLEN *StrLen_or_Ind); //NOLINT
30145 // these two from sqlext.h
30146 typedef SQLRETURN SQL_API (*xSQLDrivers)( //NOLINT
30147     SQLHENV            henv, //NOLINT
30148     SQLUSMALLINT       fDirection, //NOLINT
30149     SQLCHAR 		  *szDriverDesc, //NOLINT
30150     SQLSMALLINT        cbDriverDescMax, //NOLINT
30151     SQLSMALLINT 	  *pcbDriverDesc, //NOLINT
30152     SQLCHAR 		  *szDriverAttributes, //NOLINT
30153     SQLSMALLINT        cbDrvrAttrMax, //NOLINT
30154     SQLSMALLINT 	  *pcbDrvrAttr); //NOLINT
30155 typedef SQLRETURN SQL_API (*xSQLDriverConnect)( //NOLINT
30156     SQLHDBC            hdbc, //NOLINT
30157     SQLHWND            hwnd, //NOLINT
30158     SQLCHAR 		  *szConnStrIn, //NOLINT
30159     SQLSMALLINT        cbConnStrIn, //NOLINT
30160     SQLCHAR           *szConnStrOut, //NOLINT
30161     SQLSMALLINT        cbConnStrOutMax, //NOLINT
30162     SQLSMALLINT 	  *pcbConnStrOut, //NOLINT
30163     SQLUSMALLINT       fDriverCompletion); //NOLINT
30164 #endif
30165 
/// runtime loader for the ODBC client library (libodbc.so or libiodbc.so);
/// resolves the required entry points dynamically so the binary carries
/// no link-time dependency on a specific ODBC implementation
class CODBC : public CSphDynamicLibrary
{
	static const char* sFuncs[ODBC_NUM_FUNCS];	// symbol names to resolve
	static void** pFuncs[ODBC_NUM_FUNCS];		// destinations for the resolved pointers (same order as sFuncs)

public:
	/// try libodbc.so first, fall back to libiodbc.so, then resolve all symbols
	bool Init()
	{
		if ( ( !CSphDynamicLibrary::Init ( "libodbc.so", true ) )
			&& ( !CSphDynamicLibrary::Init ( "libiodbc.so", true ) ) )
				return false;
		if ( !LoadSymbols ( sFuncs, pFuncs, ODBC_NUM_FUNCS ) )
			return false;
		return true;
	}
	/// initial target for all function pointers; only logs that the lib was not loaded
	static void 	Stub()
	{
		sphLogDebug ( "Error! Odbc func is null!" );
	}

static xSQLFreeHandle m_pSQLFreeHandle;
static xSQLDisconnect m_pSQLDisconnect;
static xSQLCloseCursor m_pSQLCloseCursor;
static xSQLGetDiagRec m_pSQLGetDiagRec;
static xSQLSetEnvAttr m_pSQLSetEnvAttr;
static xSQLAllocHandle m_pSQLAllocHandle;
static xSQLFetch m_pSQLFetch;
static xSQLExecDirect m_pSQLExecDirect;
static xSQLNumResultCols m_pSQLNumResultCols;
static xSQLDescribeCol m_pSQLDescribeCol;
static xSQLBindCol m_pSQLBindCol;
static xSQLDrivers m_pSQLDrivers;
static xSQLDriverConnect m_pSQLDriverConnect;
};
30200 
// route every sph_SQL* call through the dynamically resolved function pointers
#define sph_SQLFreeHandle (*CODBC::m_pSQLFreeHandle)
#define sph_SQLDisconnect (*CODBC::m_pSQLDisconnect)
#define sph_SQLCloseCursor (*CODBC::m_pSQLCloseCursor)
#define sph_SQLGetDiagRec (*CODBC::m_pSQLGetDiagRec)
#define sph_SQLSetEnvAttr (*CODBC::m_pSQLSetEnvAttr)
#define sph_SQLAllocHandle (*CODBC::m_pSQLAllocHandle)
#define sph_SQLFetch (*CODBC::m_pSQLFetch)
#define sph_SQLExecDirect (*CODBC::m_pSQLExecDirect)
#define sph_SQLNumResultCols (*CODBC::m_pSQLNumResultCols)
#define sph_SQLDescribeCol (*CODBC::m_pSQLDescribeCol)
#define sph_SQLBindCol (*CODBC::m_pSQLBindCol)
#define sph_SQLDrivers (*CODBC::m_pSQLDrivers)
#define sph_SQLDriverConnect (*CODBC::m_pSQLDriverConnect)
30214 
// symbol names and destination pointers; the two arrays MUST stay in the same order,
// since LoadSymbols() pairs them up element by element
const char* CODBC::sFuncs[] = {"SQLFreeHandle", "SQLDisconnect",
	"SQLCloseCursor", "SQLGetDiagRec", "SQLSetEnvAttr", "SQLAllocHandle",
	"SQLFetch", "SQLExecDirect", "SQLNumResultCols", "SQLDescribeCol",
	"SQLBindCol", "SQLDrivers", "SQLDriverConnect" };
void** CODBC::pFuncs[] = {(void**)&m_pSQLFreeHandle, (void**)&m_pSQLDisconnect,
	(void**)&m_pSQLCloseCursor, (void**)&m_pSQLGetDiagRec, (void**)&m_pSQLSetEnvAttr,
	(void**)&m_pSQLAllocHandle, (void**)&m_pSQLFetch, (void**)&m_pSQLExecDirect,
	(void**)&m_pSQLNumResultCols, (void**)&m_pSQLDescribeCol, (void**)&m_pSQLBindCol,
	(void**)&m_pSQLDrivers, (void**)&m_pSQLDriverConnect };

// every pointer starts out aimed at Stub() so a call made before Init() logs
// instead of jumping through NULL
// NOTE(review): invoking Stub() through a mismatched signature is technically UB - confirm intent
xSQLFreeHandle CODBC::m_pSQLFreeHandle = (xSQLFreeHandle)CODBC::Stub;
xSQLDisconnect CODBC::m_pSQLDisconnect = (xSQLDisconnect)CODBC::Stub;
xSQLCloseCursor CODBC::m_pSQLCloseCursor = (xSQLCloseCursor)CODBC::Stub;
xSQLGetDiagRec CODBC::m_pSQLGetDiagRec = (xSQLGetDiagRec)CODBC::Stub;
xSQLSetEnvAttr CODBC::m_pSQLSetEnvAttr = (xSQLSetEnvAttr)CODBC::Stub;
xSQLAllocHandle CODBC::m_pSQLAllocHandle = (xSQLAllocHandle)CODBC::Stub;
xSQLFetch CODBC::m_pSQLFetch = (xSQLFetch)CODBC::Stub;
xSQLExecDirect CODBC::m_pSQLExecDirect = (xSQLExecDirect)CODBC::Stub;
xSQLNumResultCols CODBC::m_pSQLNumResultCols = (xSQLNumResultCols)CODBC::Stub;
xSQLDescribeCol CODBC::m_pSQLDescribeCol = (xSQLDescribeCol)CODBC::Stub;
xSQLBindCol CODBC::m_pSQLBindCol = (xSQLBindCol)CODBC::Stub;
xSQLDrivers CODBC::m_pSQLDrivers = (xSQLDrivers)CODBC::Stub;
xSQLDriverConnect CODBC::m_pSQLDriverConnect = (xSQLDriverConnect)CODBC::Stub;

// process-wide holder for the dynamically loaded ODBC client library
CODBC MyOdbcHolder;

/// load the ODBC client library once; false if neither libodbc nor libiodbc loaded
bool InitDynamicOdbc()
{
	return MyOdbcHolder.Init();
}
30245 
30246 #else // !DL_UNIXODBC
30247 
30248 #define sph_SQLFreeHandle SQLFreeHandle
30249 #define sph_SQLDisconnect SQLDisconnect
30250 #define sph_SQLCloseCursor SQLCloseCursor
30251 #define sph_SQLGetDiagRec SQLGetDiagRec
30252 #define sph_SQLSetEnvAttr SQLSetEnvAttr
30253 #define sph_SQLAllocHandle SQLAllocHandle
30254 #define sph_SQLFetch SQLFetch
30255 #define sph_SQLExecDirect SQLExecDirect
30256 #define sph_SQLNumResultCols SQLNumResultCols
30257 #define sph_SQLDescribeCol SQLDescribeCol
30258 #define sph_SQLBindCol SQLBindCol
30259 #define sph_SQLDrivers SQLDrivers
30260 #define sph_SQLDriverConnect SQLDriverConnect
30261 #define InitDynamicOdbc() (true)
30262 
30263 #endif // DL_UNIXODBC
30264 
30265 
/// default ODBC source params: Windows integrated authentication disabled
CSphSourceParams_ODBC::CSphSourceParams_ODBC ()
	: m_bWinAuth	( false )
{
}
30270 
30271 
/// ODBC source ctor: all three ODBC handles (environment, connection, statement)
/// start out NULL and are allocated lazily by SqlConnect()/SqlQuery()
CSphSource_ODBC::CSphSource_ODBC ( const char * sName )
	: CSphSource_SQL	( sName )
	, m_bWinAuth		( false )
	, m_bUnicode		( false )
	, m_hEnv			( NULL )
	, m_hDBC			( NULL )
	, m_hStmt			( NULL )
	, m_nResultCols		( 0 )
{
}
30282 
30283 
SqlDismissResult()30284 void CSphSource_ODBC::SqlDismissResult ()
30285 {
30286 	if ( m_hStmt )
30287 	{
30288 		sph_SQLCloseCursor ( m_hStmt );
30289 		sph_SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );
30290 		m_hStmt = NULL;
30291 	}
30292 }
30293 
30294 
30295 #define MS_SQL_BUFFER_GAP 16
30296 
30297 
/// execute a query and bind a fetch buffer to every result column
/// allocates a fresh statement handle, runs SQLExecDirect, then describes
/// each column to size its buffer; returns false on any ODBC error
bool CSphSource_ODBC::SqlQuery ( const char * sQuery )
{
	if ( sph_SQLAllocHandle ( SQL_HANDLE_STMT, m_hDBC, &m_hStmt )==SQL_ERROR )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-QUERY: %s: FAIL (SQLAllocHandle failed)\n", sQuery );
		return false;
	}

	if ( sph_SQLExecDirect ( m_hStmt, (SQLCHAR *)sQuery, SQL_NTS )==SQL_ERROR )
	{
		GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-QUERY: %s: FAIL\n", sQuery );
		return false;
	}
	if ( m_tParams.m_bPrintQueries )
		fprintf ( stdout, "SQL-QUERY: %s: ok\n", sQuery );

	SQLSMALLINT nCols = 0;
	m_nResultCols = 0;
	if ( sph_SQLNumResultCols ( m_hStmt, &nCols )==SQL_ERROR )
		return false;

	m_nResultCols = nCols;

	const int MAX_NAME_LEN = 512;
	char szColumnName[MAX_NAME_LEN];

	m_dColumns.Resize ( m_nResultCols );
	int iTotalBuffer = 0;
	ARRAY_FOREACH ( i, m_dColumns )
	{
		QueryColumn_t & tCol = m_dColumns[i];

		// ODBC column indices are 1-based
		SQLULEN uColSize = 0;
		SQLSMALLINT iNameLen = 0;
		SQLSMALLINT iDataType = 0;
		if ( sph_SQLDescribeCol ( m_hStmt, (SQLUSMALLINT)(i+1), (SQLCHAR*)szColumnName,
			MAX_NAME_LEN, &iNameLen, &iDataType, &uColSize, NULL, NULL )==SQL_ERROR )
				return false;

		tCol.m_sName = szColumnName;
		tCol.m_sName.ToLower();

		// deduce buffer size
		// use a small buffer by default, and a bigger one for varchars
		int iBuffLen = DEFAULT_COL_SIZE;
		if ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR|| iDataType==SQL_VARCHAR )
			iBuffLen = VARCHAR_COL_SIZE;

		if ( m_hColBuffers ( tCol.m_sName ) )
			iBuffLen = m_hColBuffers [ tCol.m_sName ]; // got explicit user override
		else if ( uColSize )
			iBuffLen = Min ( uColSize+1, (SQLULEN) MAX_COL_SIZE ); // got data from driver

		// the extra gap past the nominal buffer keeps the Windows UCS-2
		// conversion in SqlFetchRow() within bounds (see MS_SQL_BUFFER_GAP)
		tCol.m_dContents.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
		tCol.m_dRaw.Resize ( iBuffLen + MS_SQL_BUFFER_GAP );
		tCol.m_iInd = 0;
		tCol.m_iBufferSize = iBuffLen;
		// wide columns are fetched raw and converted to UTF-8 at row-fetch time
		tCol.m_bUCS2 = m_bUnicode && ( iDataType==SQL_WCHAR || iDataType==SQL_WVARCHAR || iDataType==SQL_WLONGVARCHAR );
		tCol.m_bTruncated = false;
		iTotalBuffer += iBuffLen;

		if ( sph_SQLBindCol ( m_hStmt, (SQLUSMALLINT)(i+1),
			tCol.m_bUCS2 ? SQL_UNICODE : SQL_C_CHAR,
			tCol.m_bUCS2 ? tCol.m_dRaw.Begin() : tCol.m_dContents.Begin(),
			iBuffLen, &(tCol.m_iInd) )==SQL_ERROR )
				return false;
	}

	if ( iTotalBuffer>WARN_ROW_SIZE )
		sphWarn ( "row buffer is over %d bytes; consider revising sql_column_buffers", iTotalBuffer );

	return true;
}
30374 
30375 
/// true when a previous ODBC call stored an error message (see GetSqlError)
bool CSphSource_ODBC::SqlIsError ()
{
	return !m_sError.IsEmpty ();
}
30380 
30381 
/// last error message captured by GetSqlError(); empty string when none
const char * CSphSource_ODBC::SqlError ()
{
	return m_sError.cstr();
}
30386 
30387 
/// open an ODBC connection: load the client lib (when built with DL_UNIXODBC),
/// allocate environment + connection handles, and connect via the DSN string
/// returns false on any failure; progress is echoed when sql_query_printqueries is set
bool CSphSource_ODBC::SqlConnect ()
{
	if_const ( !InitDynamicOdbc() )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL (NO ODBC CLIENT LIB)\n" );
		return false;
	}

	if ( sph_SQLAllocHandle ( SQL_HANDLE_ENV, NULL, &m_hEnv )==SQL_ERROR )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
		return false;
	}

	// request ODBC 3.x behavior before allocating the connection handle
	sph_SQLSetEnvAttr ( m_hEnv, SQL_ATTR_ODBC_VERSION, (void*) SQL_OV_ODBC3, SQL_IS_INTEGER );

	if ( sph_SQLAllocHandle ( SQL_HANDLE_DBC, m_hEnv, &m_hDBC )==SQL_ERROR )
	{
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
		return false;
	}

	// give subclasses (eg. MSSQL) a chance to build their DSN string
	OdbcPostConnect ();

	char szOutConn [2048];
	SQLSMALLINT iOutConn = 0;
	if ( sph_SQLDriverConnect ( m_hDBC, NULL, (SQLTCHAR*) m_sOdbcDSN.cstr(), SQL_NTS,
		(SQLCHAR*)szOutConn, sizeof(szOutConn), &iOutConn, SQL_DRIVER_NOPROMPT )==SQL_ERROR )
	{
		GetSqlError ( SQL_HANDLE_DBC, m_hDBC );
		if ( m_tParams.m_bPrintQueries )
			fprintf ( stdout, "SQL-CONNECT: FAIL\n" );
		return false;
	}

	if ( m_tParams.m_bPrintQueries )
		fprintf ( stdout, "SQL-CONNECT: ok\n" );
	return true;
}
30430 
30431 
/// tear down the ODBC connection; handles are freed in reverse order of
/// allocation (statement, then connection, then environment)
void CSphSource_ODBC::SqlDisconnect ()
{
	if ( m_tParams.m_bPrintQueries )
		fprintf ( stdout, "SQL-DISCONNECT\n" );

	if ( m_hStmt!=NULL )
		sph_SQLFreeHandle ( SQL_HANDLE_STMT, m_hStmt );

	if ( m_hDBC )
	{
		sph_SQLDisconnect ( m_hDBC );
		sph_SQLFreeHandle ( SQL_HANDLE_DBC, m_hDBC );
	}

	if ( m_hEnv )
		sph_SQLFreeHandle ( SQL_HANDLE_ENV, m_hEnv );
}
30449 
30450 
SqlNumFields()30451 int CSphSource_ODBC::SqlNumFields ()
30452 {
30453 	if ( !m_hStmt )
30454 		return -1;
30455 
30456 	return m_nResultCols;
30457 }
30458 
30459 
/// fetch the next row into the per-column buffers bound by SqlQuery()
/// returns false at end of data or on error (error text stored via GetSqlError)
bool CSphSource_ODBC::SqlFetchRow ()
{
	if ( !m_hStmt )
		return false;

	SQLRETURN iRet = sph_SQLFetch ( m_hStmt );
	if ( iRet==SQL_ERROR || iRet==SQL_INVALID_HANDLE || iRet==SQL_NO_DATA )
	{
		GetSqlError ( SQL_HANDLE_STMT, m_hStmt );
		return false;
	}

	// post-process every column: NULLs become empty strings, wide data gets
	// converted, and oversized values are truncated with a one-time warning
	ARRAY_FOREACH ( i, m_dColumns )
	{
		QueryColumn_t & tCol = m_dColumns[i];
		switch ( tCol.m_iInd )
		{
			case SQL_NULL_DATA:
				tCol.m_dContents[0] = '\0';
				break;

			default:
#if USE_WINDOWS // FIXME! support UCS-2 columns on Unix too
				if ( tCol.m_bUCS2 )
				{
					// WideCharToMultiByte should get NULL terminated string
					memset ( tCol.m_dRaw.Begin()+tCol.m_iBufferSize, 0, MS_SQL_BUFFER_GAP );

					int iConv = WideCharToMultiByte ( CP_UTF8, 0, LPCWSTR ( tCol.m_dRaw.Begin() ), tCol.m_iInd/sizeof(WCHAR),
						LPSTR ( tCol.m_dContents.Begin() ), tCol.m_iBufferSize-1, NULL, NULL );

					// on overflow, keep as much as fits and terminate at the buffer edge
					if ( iConv==0 )
						if ( GetLastError()==ERROR_INSUFFICIENT_BUFFER )
							iConv = tCol.m_iBufferSize-1;

					tCol.m_dContents[iConv] = '\0';

				} else
#endif
				{
					if ( tCol.m_iInd>=0 && tCol.m_iInd<tCol.m_iBufferSize )
					{
						// data fetched ok; add trailing zero
						tCol.m_dContents[tCol.m_iInd] = '\0';

					} else if ( tCol.m_iInd>=tCol.m_iBufferSize && !tCol.m_bTruncated )
					{
						// out of buffer; warn about that (once)
						tCol.m_bTruncated = true;
						sphWarn ( "'%s' column truncated (buffer=%d, got=%d); consider revising sql_column_buffers",
							tCol.m_sName.cstr(), tCol.m_iBufferSize-1, (int) tCol.m_iInd );
					}
				}
			break;
		}
	}

	return iRet!=SQL_NO_DATA;
}
30519 
30520 
SqlColumn(int iIndex)30521 const char * CSphSource_ODBC::SqlColumn ( int iIndex )
30522 {
30523 	if ( !m_hStmt )
30524 		return NULL;
30525 
30526 	return &(m_dColumns [iIndex].m_dContents[0]);
30527 }
30528 
30529 
/// lower-cased name of the given result column (as captured by SqlQuery)
const char * CSphSource_ODBC::SqlFieldName ( int iIndex )
{
	return m_dColumns[iIndex].m_sName.cstr();
}
30534 
30535 
/// column length reporting is not implemented for ODBC; always 0
DWORD CSphSource_ODBC::SqlColumnLength ( int )
{
	return 0;
}
30540 
30541 
/// apply ODBC-specific settings on top of the generic SQL source setup;
/// parses the sql_column_buffers spec ("name=size[K|M], ...") into m_hColBuffers
/// returns false (with m_sError set) on any syntax error in the spec
bool CSphSource_ODBC::Setup ( const CSphSourceParams_ODBC & tParams )
{
	if ( !CSphSource_SQL::Setup ( tParams ) )
		return false;

	// parse column buffers spec, if any
	if ( !tParams.m_sColBuffers.IsEmpty() )
	{
		const char * p = tParams.m_sColBuffers.cstr();
		while ( *p )
		{
			// skip space
			while ( sphIsSpace(*p) )
				p++;

			// expect eof or ident
			if ( !*p )
				break;
			if ( !sphIsAlpha(*p) )
			{
				m_sError.SetSprintf ( "identifier expected in sql_column_buffers near '%s'", p );
				return false;
			}

			// get ident
			CSphString sCol;
			const char * pIdent = p;
			while ( sphIsAlpha(*p) )
				p++;
			sCol.SetBinary ( pIdent, p-pIdent );

			// skip space
			while ( sphIsSpace(*p) )
				p++;

			// expect assignment
			if ( *p!='=' )
			{
				m_sError.SetSprintf ( "'=' expected in sql_column_buffers near '%s'", p );
				return false;
			}
			p++;

			// skip space
			while ( sphIsSpace(*p) )
				p++;

			// expect number
			if (!( *p>='0' && *p<='9' ))
			{
				m_sError.SetSprintf ( "number expected in sql_column_buffers near '%s'", p );
				return false;
			}

			// get value
			// NOTE(review): no overflow guard here; absurdly large sizes wrap an int silently
			int iSize = 0;
			while ( *p>='0' && *p<='9' )
			{
				iSize = 10*iSize + ( *p-'0' );
				p++;
			}
			// optional K/M suffix (upper-case only)
			if ( *p=='K' )
			{
				iSize *= 1024;
				p++;
			} else if ( *p=='M' )
			{
				iSize *= 1048576;
				p++;
			}

			// hash value
			sCol.ToLower();
			m_hColBuffers.Add ( iSize, sCol );

			// skip space
			while ( sphIsSpace(*p) )
				p++;

			// expect eof or comma
			if ( !*p )
				break;
			if ( *p!=',' )
			{
				m_sError.SetSprintf ( "comma expected in sql_column_buffers near '%s'", p );
				return false;
			}
			p++;
		}
	}

	// ODBC specific params
	m_sOdbcDSN = tParams.m_sOdbcDSN;
	m_bWinAuth = tParams.m_bWinAuth;

	// build and store DSN for error reporting
	// NOTE(review): the +3 assumes m_sSqlDSN begins with a "sql" prefix set by the base class - verify
	char sBuf [ 1024 ];
	snprintf ( sBuf, sizeof(sBuf), "odbc%s", m_sSqlDSN.cstr()+3 );
	m_sSqlDSN = sBuf;

	return true;
}
30644 
30645 
GetSqlError(SQLSMALLINT iHandleType,SQLHANDLE hHandle)30646 void CSphSource_ODBC::GetSqlError ( SQLSMALLINT iHandleType, SQLHANDLE hHandle )
30647 {
30648 	if ( !hHandle )
30649 	{
30650 		m_sError.SetSprintf ( "invalid handle" );
30651 		return;
30652 	}
30653 
30654 	char szState[16] = "";
30655 	char szMessageText[1024] = "";
30656 	SQLINTEGER iError;
30657 	SQLSMALLINT iLen;
30658 	sph_SQLGetDiagRec ( iHandleType, hHandle, 1, (SQLCHAR*)szState, &iError, (SQLCHAR*)szMessageText, 1024, &iLen );
30659 	m_sError = szMessageText;
30660 }
30661 
30662 //////////////////////////////////////////////////////////////////////////
30663 
/// build an MSSQL DSN string unless the user supplied one explicitly;
/// probes installed drivers for a native client, falling back to "SQL Server"
void CSphSource_MSSQL::OdbcPostConnect ()
{
	// explicit odbc_dsn wins; nothing to construct
	if ( !m_sOdbcDSN.IsEmpty() )
		return;

	const int MAX_LEN = 1024;
	char szDriver[MAX_LEN];
	char szDriverAttrs[MAX_LEN];
	SQLSMALLINT iDescLen = 0;
	SQLSMALLINT iAttrLen = 0;
	SQLSMALLINT iDir = SQL_FETCH_FIRST;

	// enumerate installed ODBC drivers, stop at the first native client
	CSphString sDriver;
	for ( ;; )
	{
		SQLRETURN iRet = sph_SQLDrivers ( m_hEnv, iDir, (SQLCHAR*)szDriver, MAX_LEN, &iDescLen, (SQLCHAR*)szDriverAttrs, MAX_LEN, &iAttrLen );
		if ( iRet==SQL_NO_DATA )
			break;

		iDir = SQL_FETCH_NEXT;
		if ( !strcmp ( szDriver, "SQL Native Client" )
			|| !strncmp ( szDriver, "SQL Server Native Client", strlen("SQL Server Native Client") ) )
		{
			sDriver = szDriver;
			break;
		}
	}

	if ( sDriver.IsEmpty() )
		sDriver = "SQL Server";

	// three DSN flavors: trusted connection, trusted with credentials, plain credentials
	if ( m_bWinAuth && m_tParams.m_sUser.IsEmpty () )
	{
		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};Database={%s};Trusted_Connection=yes",
			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sDB.cstr () );

	} else if ( m_bWinAuth )
	{
		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s};Trusted_Connection=yes",
			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
	} else
	{
		m_sOdbcDSN.SetSprintf ( "DRIVER={%s};SERVER={%s};UID={%s};PWD={%s};Database={%s}",
			sDriver.cstr (), m_tParams.m_sHost.cstr (), m_tParams.m_sUser.cstr (), m_tParams.m_sPass.cstr (), m_tParams.m_sDB.cstr () );
	}
}
30710 
30711 #endif
30712 
30713 
/// maps one incoming TSV/CSV column onto a schema slot;
/// exactly one of the two indices is valid, the other stays -1
struct RemapXSV_t
{
	int m_iAttr;	// target attribute index, or -1
	int m_iField;	// target full-text field index, or -1
};
30719 
30720 
/// common base for the TSV and CSV pipe sources: owns the input pipe, the read
/// buffer, and the column-to-schema remapping; subclasses supply the actual
/// column splitting (SplitColumns) and schema setup (SetupSchema)
class CSphSource_BaseSV : public CSphSource_Document, public CSphSchemaConfigurator<CSphSource_BaseSV>
{
public:
	explicit		CSphSource_BaseSV ( const char * sName );
	virtual			~CSphSource_BaseSV ();

	virtual bool	Connect ( CSphString & sError );				///< run the command and open the pipe
	virtual void	Disconnect ();									///< close the pipe
	const char *	DecorateMessage ( const char * sTemplate, ... ) const __attribute__ ( ( format ( printf, 2, 3 ) ) );

	virtual bool	IterateStart ( CSphString & );					///< Connect() starts getting documents automatically, so this one is empty
	virtual BYTE **	NextDocument ( CSphString & );					///< parse incoming chunk and emit some hits

	virtual bool	HasAttrsConfigured ()							{ return ( m_tSchema.GetAttrsCount()>0 ); }
	// xSV sources carry no MVAs or kill-lists; these iterators are all no-ops
	virtual bool	IterateMultivaluedStart ( int, CSphString & )	{ return false; }
	virtual bool	IterateMultivaluedNext ()						{ return false; }
	virtual bool	IterateKillListStart ( CSphString & )			{ return false; }
	virtual bool	IterateKillListNext ( SphDocID_t & )			{ return false; }

	bool			Setup ( const CSphConfigSection & hSource, FILE * pPipe, CSphString & sError );

protected:
	enum ESphParseResult
	{
		PARSING_FAILED,
		GOT_DOCUMENT,
		DATA_OVER
	};

	BYTE **					ReportDocumentError();
	virtual bool			SetupSchema ( const CSphConfigSection & hSource, bool bWordDict, CSphString & sError ) = 0;
	virtual ESphParseResult	SplitColumns ( CSphString & ) = 0;

	CSphVector<BYTE>			m_dBuf;				///< read/accumulation buffer
	CSphFixedVector<char>		m_dError;			///< scratch buffer for DecorateMessage()
	CSphFixedVector<int>		m_dColumnsLen;		///< per-column byte lengths of the current row
	CSphFixedVector<RemapXSV_t>	m_dRemap;			///< incoming column order -> schema slots

	// output
	CSphFixedVector<BYTE *>		m_dFields;			///< field pointers handed to the indexer

	FILE *						m_pFP;
	int							m_iDataStart;		///< where the next line to parse starts in m_dBuf
	int							m_iDocStart;		///< where the last parsed document stats in m_dBuf
	int							m_iBufUsed;			///< bytes [0,m_iBufUsed) are actually currently used; the rest of m_dBuf is free
	int							m_iLine;			///< current input line, for error reporting
	int							m_iAutoCount;
};
30769 
30770 
/// tab-separated-values pipe source; only the column splitter differs from the base
class CSphSource_TSV : public CSphSource_BaseSV
{
public:
	explicit				CSphSource_TSV ( const char * sName ) : CSphSource_BaseSV ( sName ) {}
	virtual ESphParseResult	SplitColumns ( CSphString & sError );					///< parse incoming chunk and emit some hits
	virtual bool			SetupSchema ( const CSphConfigSection & hSource, bool bWordDict, CSphString & sError );
};
30778 
30779 
/// comma-separated-values pipe source with a configurable single-byte delimiter
class CSphSource_CSV : public CSphSource_BaseSV
{
public:
	explicit				CSphSource_CSV ( const char * sName, const char * sDelimiter = NULL );
	virtual ESphParseResult	SplitColumns ( CSphString & sError );					///< parse incoming chunk and emit some hits
	virtual bool			SetupSchema ( const CSphConfigSection & hSource, bool bWordDict, CSphString & sError );
	void					SetDelimiter ( const char * sDelimiter );

private:
	BYTE			m_iDelimiter;	// single delimiter byte (first char of the configured string)
};
30791 
30792 
sphCreateSourceTSVpipe(const CSphConfigSection * pSource,FILE * pPipe,const char * sSourceName,bool RLPARG (bProxy))30793 CSphSource * sphCreateSourceTSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool RLPARG(bProxy) )
30794 {
30795 	CSphSource_TSV * pTSV = NULL;
30796 #if USE_RLP
30797 	if ( bProxy )
30798 		pTSV = new CSphSource_Proxy<CSphSource_TSV> ( sSourceName );
30799 	else
30800 #endif
30801 		pTSV = new CSphSource_TSV ( sSourceName );
30802 
30803 	CSphString sError;
30804 	if ( !pTSV->Setup ( *pSource, pPipe, sError ) )
30805 	{
30806 		SafeDelete ( pTSV );
30807 		fprintf ( stdout, "ERROR: tsvpipe: %s", sError.cstr() );
30808 	}
30809 
30810 	return pTSV;
30811 }
30812 
30813 
sphCreateSourceCSVpipe(const CSphConfigSection * pSource,FILE * pPipe,const char * sSourceName,bool RLPARG (bProxy))30814 CSphSource * sphCreateSourceCSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool RLPARG(bProxy) )
30815 {
30816 	CSphSource_CSV * pCSV = NULL;
30817 	const char * sDelimiter = pSource->GetStr ( "csvpipe_delimiter", "" );
30818 
30819 #if USE_RLP
30820 	if ( bProxy )
30821 	{
30822 		pCSV = new CSphSource_Proxy<CSphSource_CSV> ( sSourceName );
30823 		pCSV->SetDelimiter ( sDelimiter );
30824 	} else
30825 #endif
30826 		pCSV = new CSphSource_CSV ( sSourceName, sDelimiter );
30827 
30828 	CSphString sError;
30829 	if ( !pCSV->Setup ( *pSource, pPipe, sError ) )
30830 	{
30831 		SafeDelete ( pCSV );
30832 		fprintf ( stdout, "ERROR: csvpipe: %s", sError.cstr() );
30833 	}
30834 
30835 	return pCSV;
30836 }
30837 
30838 
CSphSource_BaseSV(const char * sName)30839 CSphSource_BaseSV::CSphSource_BaseSV ( const char * sName )
30840 	: CSphSource_Document ( sName )
30841 	, m_dError ( 1024 )
30842 	, m_dColumnsLen ( 0 )
30843 	, m_dRemap ( 0 )
30844 	, m_dFields ( 0 )
30845 	, m_iAutoCount ( 0 )
30846 {
30847 	m_iDataStart = 0;
30848 	m_iBufUsed = 0;
30849 }
30850 
30851 
/// dtor closes the pipe via Disconnect()
/// NOTE(review): this runs even if Setup() never did - relies on m_pFP being in a sane state
CSphSource_BaseSV::~CSphSource_BaseSV ()
{
	Disconnect();
}
30856 
/// remap entry extended with the config declaration order tag,
/// used by Setup() to restore the user's declared column order
struct SortedRemapXSV_t : public RemapXSV_t
{
	int m_iTag;		// declaration order from the config (CSphVariant::m_iTag)
};
30861 
30862 
// bind the already-opened pipe, build the index schema from config, and
// precompute m_dRemap: the mapping from incoming column positions to
// schema fields / attributes (slot 0 is always the doc_id)
bool CSphSource_BaseSV::Setup ( const CSphConfigSection & hSource, FILE * pPipe, CSphString & sError )
{
	m_pFP = pPipe;
	m_tSchema.Reset ();
	bool bWordDict = ( m_pDict && m_pDict->GetSettings().m_bWordDict );

	// let the derived class (TSV/CSV) declare its fields and attributes
	if ( !SetupSchema ( hSource, bWordDict, sError ) )
		return false;

	if ( !SourceCheckSchema ( m_tSchema, sError ) )
		return false;

	m_dFields.Reset ( m_tSchema.m_dFields.GetLength() );

	// build hash from schema names
	SmallStringHash_T<SortedRemapXSV_t> hSchema;
	SortedRemapXSV_t tElem;
	tElem.m_iTag = -1;
	tElem.m_iAttr = -1;
	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
	{
		tElem.m_iField = i;
		hSchema.Add ( tElem, m_tSchema.m_dFields[i].m_sName );
	}
	tElem.m_iField = -1;
	for ( int i=0; i<m_tSchema.GetAttrsCount(); i++ )
	{
		// a name may be both a field and an attribute (field-string case);
		// then the existing hash entry just gets its attribute index set
		RemapXSV_t * pRemap = hSchema ( m_tSchema.GetAttr ( i ).m_sName );
		if ( pRemap )
		{
			pRemap->m_iAttr = i;
		} else
		{
			tElem.m_iAttr = i;
			hSchema.Add ( tElem, m_tSchema.GetAttr ( i ).m_sName );
		}
	}

	// restore order for declared columns
	CSphString sColumn;
	hSource.IterateStart();
	while ( hSource.IterateNext() )
	{
		const CSphVariant * pVal = &hSource.IterateGet();
		while ( pVal )
		{
			sColumn = pVal->strval();
			// uint attribute might have bit count that should by cut off from name
			const char * pColon = strchr ( sColumn.cstr(), ':' );
			if ( pColon )
			{
				int iColon = pColon-sColumn.cstr();
				CSphString sTmp;
				sTmp.SetBinary ( sColumn.cstr(), iColon );
				sColumn.Swap ( sTmp );
			}

			// let's handle different char cases
			sColumn.ToLower();

			// stamp the config declaration order onto the matching schema entry
			SortedRemapXSV_t * pColumn = hSchema ( sColumn );
			assert ( !pColumn || pColumn->m_iAttr>=0 || pColumn->m_iField>=0 );
			assert ( !pColumn || pColumn->m_iTag==-1 );
			if ( pColumn )
				pColumn->m_iTag = pVal->m_iTag;

			pVal = pVal->m_pNext;
		}
	}

	// fields + attributes + id - auto-generated
	m_dColumnsLen.Reset ( hSchema.GetLength() + 1 );
	m_dRemap.Reset ( hSchema.GetLength() + 1 );
	CSphFixedVector<SortedRemapXSV_t> dColumnsSorted ( hSchema.GetLength() );

	hSchema.IterateStart();
	for ( int i=0; hSchema.IterateNext(); i++ )
	{
		assert ( hSchema.IterateGet().m_iTag>=0 );
		dColumnsSorted[i] = hSchema.IterateGet();
	}

	// order columns by their position in the config
	sphSort ( dColumnsSorted.Begin(), dColumnsSorted.GetLength(), bind ( &SortedRemapXSV_t::m_iTag ) );

	// set remap incoming columns to fields \ attributes
	// doc_id dummy filler
	m_dRemap[0].m_iAttr = 0;
	m_dRemap[0].m_iField = 0;

	ARRAY_FOREACH ( i, dColumnsSorted )
	{
		assert ( !i || dColumnsSorted[i-1].m_iTag<dColumnsSorted[i].m_iTag ); // no duplicates allowed
		m_dRemap[i+1] = dColumnsSorted[i];
	}

	return true;
}
30960 
30961 
Connect(CSphString & sError)30962 bool CSphSource_BaseSV::Connect ( CSphString & sError )
30963 {
30964 	bool bWordDict = ( m_pDict && m_pDict->GetSettings().m_bWordDict );
30965 	ARRAY_FOREACH ( i, m_tSchema.m_dFields )
30966 	{
30967 		CSphColumnInfo & tCol = m_tSchema.m_dFields[i];
30968 		tCol.m_eWordpart = GetWordpart ( tCol.m_sName.cstr(), bWordDict );
30969 	}
30970 
30971 	int iAttrs = m_tSchema.GetAttrsCount();
30972 	if ( !AddAutoAttrs ( sError ) )
30973 		return false;
30974 
30975 	m_iAutoCount = m_tSchema.GetAttrsCount() - iAttrs;
30976 
30977 	AllocDocinfo();
30978 
30979 	m_tHits.m_dData.Reserve ( m_iMaxHits );
30980 	m_dBuf.Resize ( DEFAULT_READ_BUFFER );
30981 	m_dMva.Reserve ( 512 );
30982 
30983 	return true;
30984 }
30985 
30986 
// close the input pipe (if still open) and release accumulated hit storage
void CSphSource_BaseSV::Disconnect()
{
	if ( m_pFP )
	{
		fclose ( m_pFP );
		m_pFP = NULL;	// guard against double-close
	}
	m_tHits.m_dData.Reset();
}
30996 
30997 
DecorateMessage(const char * sTemplate,...) const30998 const char * CSphSource_BaseSV::DecorateMessage ( const char * sTemplate, ... ) const
30999 {
31000 	va_list ap;
31001 	va_start ( ap, sTemplate );
31002 	vsnprintf ( m_dError.Begin(), m_dError.GetLength(), sTemplate, ap );
31003 	va_end ( ap );
31004 	return m_dError.Begin();
31005 }
31006 
// UTF-8 byte order mark; IterateStart() blanks it with spaces if the input starts with it
static const BYTE g_dBOM[] = { 0xEF, 0xBB, 0xBF };
31008 
IterateStart(CSphString & sError)31009 bool CSphSource_BaseSV::IterateStart ( CSphString & sError )
31010 {
31011 	if ( !m_tSchema.m_dFields.GetLength() )
31012 	{
31013 		sError.SetSprintf ( "No fields in schema - will not index" );
31014 		return false;
31015 	}
31016 
31017 	m_iLine = 0;
31018 	m_iDataStart = 0;
31019 
31020 	// initial buffer update
31021 	m_iBufUsed = fread ( m_dBuf.Begin(), 1, m_dBuf.GetLength(), m_pFP );
31022 	if ( !m_iBufUsed )
31023 	{
31024 		sError.SetSprintf ( "source '%s': read error '%s'", m_tSchema.m_sName.cstr(), strerror(errno) );
31025 		return false;
31026 	}
31027 
31028 	m_iPlainFieldsLength = m_tSchema.m_dFields.GetLength();
31029 
31030 	// space out BOM like xml-pipe does
31031 	if ( m_iBufUsed>(int)sizeof(g_dBOM) && memcmp ( m_dBuf.Begin(), g_dBOM, sizeof ( g_dBOM ) )==0 )
31032 		memset ( m_dBuf.Begin(), ' ', sizeof(g_dBOM) );
31033 	return true;
31034 }
31035 
ReportDocumentError()31036 BYTE ** CSphSource_BaseSV::ReportDocumentError ()
31037 {
31038 	m_tDocInfo.m_uDocID = 1; // 0 means legal eof
31039 	m_iDataStart = 0;
31040 	m_iBufUsed = 0;
31041 	return NULL;
31042 }
31043 
31044 
// pull the next document out of the stream: split it into columns, parse the
// doc_id, convert attribute columns per their schema type, and point m_dFields
// at the in-buffer field values; returns NULL on EOF (docid 0) or error (docid 1)
BYTE **	CSphSource_BaseSV::NextDocument ( CSphString & sError )
{
	ESphParseResult eRes = SplitColumns ( sError );
	if ( eRes==PARSING_FAILED )
		return ReportDocumentError();
	else if ( eRes==DATA_OVER )
		return NULL;

	assert ( eRes==GOT_DOCUMENT );

	// check doc_id
	if ( !m_dColumnsLen[0] )
	{
		sError.SetSprintf ( "source '%s': no doc_id found (line=%d)", m_tSchema.m_sName.cstr(), m_iLine );
		return ReportDocumentError();
	}

	// parse doc_id
	m_tDocInfo.m_uDocID = sphToDocid ( (const char *)&m_dBuf[m_iDocStart] );

	// check doc_id
	if ( m_tDocInfo.m_uDocID==0 )
	{
		sError.SetSprintf ( "source '%s': invalid doc_id found (line=%d)", m_tSchema.m_sName.cstr(), m_iLine );
		return ReportDocumentError();
	}

	// parse column data
	// columns are NUL-terminated in place by SplitColumns(), so walking the
	// recorded lengths (+1 for each terminator) yields each value's offset
	int iOff = m_iDocStart + m_dColumnsLen[0] + 1; // skip docid and its trailing zero
	int iColumns = m_dRemap.GetLength();
	for ( int iCol=1; iCol<iColumns; iCol++ )
	{
		// if+if for field-string attribute case
		const RemapXSV_t & tRemap = m_dRemap[iCol];

		// field column
		if ( tRemap.m_iField!=-1 )
			m_dFields[tRemap.m_iField] = m_dBuf.Begin() + iOff;

		// attribute column
		if ( tRemap.m_iAttr!=-1 )
		{
			const CSphColumnInfo & tAttr = m_tSchema.GetAttr ( tRemap.m_iAttr );
			const char * sVal = (const char *)m_dBuf.Begin() + iOff;

			// convert the raw column text into the attribute's storage type
			switch ( tAttr.m_eAttrType )
			{
			case SPH_ATTR_STRING:
			case SPH_ATTR_JSON:
				// strings are stashed aside; the in-row slot is zeroed for now
				m_dStrAttrs[tRemap.m_iAttr] = sVal;
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
				break;

			case SPH_ATTR_FLOAT:
				m_tDocInfo.SetAttrFloat ( tAttr.m_tLocator, sphToFloat ( sVal ) );
				break;

			case SPH_ATTR_BIGINT:
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToInt64 ( sVal ) );
				break;

			case SPH_ATTR_UINT32SET:
			case SPH_ATTR_INT64SET:
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, ParseFieldMVA ( m_dMva, sVal, ( tAttr.m_eAttrType==SPH_ATTR_INT64SET ) ) );
				break;

			case SPH_ATTR_TOKENCOUNT:
				// computed later during indexing; zero placeholder for now
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, 0 );
				break;

			default:
				m_tDocInfo.SetAttr ( tAttr.m_tLocator, sphToDword ( sVal ) );
				break;
			}
		}

		iOff += m_dColumnsLen[iCol] + 1; // length of value plus null-terminator
	}

	m_iLine++;
	return m_dFields.Begin();
}
31127 
31128 
// split the next buffered line into NUL-terminated TSV columns, refilling
// m_dBuf from the pipe (compacting or growing it) until a full document,
// a NUL, or EOF is seen; per-column byte lengths land in m_dColumnsLen
CSphSource_BaseSV::ESphParseResult CSphSource_TSV::SplitColumns ( CSphString & sError )
{
	int iColumns = m_dRemap.GetLength();
	int iCol = 0;
	int iColumnStart = m_iDataStart;
	BYTE * pData = m_dBuf.Begin() + m_iDataStart;
	const BYTE * pEnd = m_dBuf.Begin() + m_iBufUsed;
	m_iDocStart = m_iDataStart;

	for ( ;; )
	{
		if ( iCol>=iColumns )
		{
			// NOTE(review): the message prints declared=iColumns+m_iAutoCount while
			// the check itself is against iColumns — verify the intended reporting
			sError.SetSprintf ( "source '%s': too many columns found (found=%d, declared=%d, line=%d, docid=" DOCID_FMT ")",
				m_tSchema.m_sName.cstr(), iCol, iColumns+m_iAutoCount, m_iLine, m_tDocInfo.m_uDocID );
			return CSphSource_BaseSV::PARSING_FAILED;
		}

		// move to next control symbol
		while ( pData<pEnd && *pData && *pData!='\t' && *pData!='\r' && *pData!='\n' )
			pData++;

		if ( pData<pEnd )
		{
			assert ( *pData=='\t' || !*pData || *pData=='\r' || *pData=='\n' );
			bool bNull = !*pData;
			bool bEOL = ( *pData=='\r' || *pData=='\n' );

			// close off the current column: record its length and NUL-terminate in place
			int iLen = pData - m_dBuf.Begin() - iColumnStart;
			assert ( iLen>=0 );
			m_dColumnsLen[iCol] = iLen;
			*pData++ = '\0';
			iCol++;

			if ( bNull )
			{
				// null terminated string found
				m_iDataStart = m_iBufUsed = 0;
				break;
			} else if ( bEOL )
			{
				// end of document found
				// skip all EOL characters
				while ( pData<pEnd && *pData && ( *pData=='\r' || *pData=='\n' ) )
					pData++;
				break;
			}

			// column separator found
			iColumnStart = pData - m_dBuf.Begin();
			continue;
		}

		// ran out of buffered bytes: remember the parse position before refilling
		int iOff = pData - m_dBuf.Begin();

		// if there is space at the start, move data around
		// if not, resize the buffer
		if ( m_iDataStart>0 )
		{
			memmove ( m_dBuf.Begin(), m_dBuf.Begin() + m_iDataStart, m_iBufUsed - m_iDataStart );
			m_iBufUsed -= m_iDataStart;
			iOff -= m_iDataStart;
			iColumnStart -= m_iDataStart;
			m_iDataStart = 0;
			m_iDocStart = 0;
		} else if ( m_iBufUsed==m_dBuf.GetLength() )
		{
			m_dBuf.Resize ( m_dBuf.GetLength()*2 );
		}

		// do read
		int iGot = fread ( m_dBuf.Begin() + m_iBufUsed, 1, m_dBuf.GetLength() - m_iBufUsed, m_pFP );
		if ( !iGot )
		{
			if ( !iCol )
			{
				// normal file termination - no pending columns and documents
				m_iDataStart = m_iBufUsed = 0;
				m_tDocInfo.m_uDocID = 0;
				return CSphSource_BaseSV::DATA_OVER;
			}

			// error in case no data left in middle of data stream
			sError.SetSprintf ( "source '%s': read error '%s' (line=%d, docid=" DOCID_FMT ")",
				m_tSchema.m_sName.cstr(), strerror(errno), m_iLine, m_tDocInfo.m_uDocID );
			return CSphSource_BaseSV::PARSING_FAILED;
		}
		m_iBufUsed += iGot;

		// restored pointers after buffer resize
		pData = m_dBuf.Begin() + iOff;
		pEnd = m_dBuf.Begin() + m_iBufUsed;
	}

	// all columns presence check
	if ( iCol!=iColumns )
	{
		sError.SetSprintf ( "source '%s': not all columns found (found=%d, total=%d, line=%d, docid=" DOCID_FMT ")",
			m_tSchema.m_sName.cstr(), iCol, iColumns, m_iLine, m_tDocInfo.m_uDocID );
		return CSphSource_BaseSV::PARSING_FAILED;
	}

	// tail data
	assert ( pData<=pEnd );
	m_iDataStart = pData - m_dBuf.Begin();
	return CSphSource_BaseSV::GOT_DOCUMENT;
}
31236 
31237 
// populate the schema from tsvpipe_* config keys; attributes are declared
// first, then fields (tsvpipe_field_string names appear in both lists,
// producing a full-text field and a string attribute with the same name)
bool CSphSource_TSV::SetupSchema ( const CSphConfigSection & hSource, bool bWordDict, CSphString & sError )
{
	// accumulate success across all declarations so every config error is reported
	bool bOk = true;
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_uint"),		SPH_ATTR_INTEGER,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_timestamp"),	SPH_ATTR_TIMESTAMP,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_bool"),		SPH_ATTR_BOOL,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_float"),		SPH_ATTR_FLOAT,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_bigint"),		SPH_ATTR_BIGINT,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_multi"),		SPH_ATTR_UINT32SET,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_multi_64"),	SPH_ATTR_INT64SET,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_string"),		SPH_ATTR_STRING,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_attr_json"),		SPH_ATTR_JSON,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("tsvpipe_field_string"),	SPH_ATTR_STRING,	m_tSchema, sError );

	if ( !bOk )
		return false;

	ConfigureFields ( hSource("tsvpipe_field"), bWordDict, m_tSchema );
	ConfigureFields ( hSource("tsvpipe_field_string"), bWordDict, m_tSchema );

	return true;
}
31260 
31261 
// create a CSV source; the delimiter defaults to ',' unless a non-empty
// sDelimiter overrides it (only its first character is used, see SetDelimiter)
CSphSource_CSV::CSphSource_CSV ( const char * sName, const char * sDelimiter )
	: CSphSource_BaseSV ( sName )
{
	m_iDelimiter = BYTE ( ',' );
	SetDelimiter ( sDelimiter );
}
31268 
31269 
// split the next buffered line into NUL-terminated CSV columns, handling
// backslash escapes and RFC-style double-quoting ("" inside quotes = literal
// quote); unescaping is done in place with a read cursor (s) running ahead
// of a write cursor (d); the buffer is refilled/grown as in the TSV variant
CSphSource_BaseSV::ESphParseResult CSphSource_CSV::SplitColumns ( CSphString & sError )
{
	int iColumns = m_dRemap.GetLength();
	int iCol = 0;
	int iColumnStart = m_iDataStart;
	int iQuoteCount = 0;	// odd count means we are currently inside quotes
	int iQuoteStart = -1;	// offset of the last quote seen (detects "" pairs)
	int	iEscapeStart = -1;	// offset of a pending backslash (detects escaped char)
	const BYTE * s = m_dBuf.Begin() + m_iDataStart; // parse this line
	BYTE * d = m_dBuf.Begin() + m_iDataStart; // do parsing in place
	const BYTE * pEnd = m_dBuf.Begin() + m_iBufUsed; // until we reach the end of current buffer
	m_iDocStart = m_iDataStart;

	for ( ;; )
	{
		assert ( d<=s );

		// move to next control symbol
		while ( s<pEnd && *s && *s!=m_iDelimiter && *s!='"' && *s!='\\' && *s!='\r' && *s!='\n' )
			*d++ = *s++;

		if ( s<pEnd )
		{
			assert ( !*s || *s==m_iDelimiter || *s=='"' || *s=='\\' || *s=='\r' || *s=='\n' );
			bool bNull = !*s;
			bool bEOL = ( *s=='\r' || *s=='\n' );
#ifndef NDEBUG
			bool bDelimiter = ( *s==m_iDelimiter );
#endif
			bool bQuot = ( *s=='"' );
			bool bEscape = ( *s=='\\' );
			int iOff = s - m_dBuf.Begin();
			bool bEscaped = ( iEscapeStart>=0 && iEscapeStart+1==iOff );

			if ( bEscape || bEscaped ) // not quoted escape symbol
			{
				if ( bEscaped ) // next to escape symbol proceed as regular
				{
					*d++ = *s++;
				} else // escape just started
				{
					// the backslash itself is dropped; only its offset is kept
					iEscapeStart = iOff;
					s++;
				}
				continue;
			}

			// [ " ... " ]
			// [ " ... "" ... " ]
			// [ " ... """ ]
			// [ " ... """" ... " ]
			if ( bQuot || ( iQuoteCount%2 )==1 ) // quotation
			{
				// order of operations with quotation counter and offset matter
				if ( bQuot )
					iQuoteCount++;

				// any symbol inside quotation proceed as regular
				// but not quotation itself
				// but quoted quotation proceed as regular symbol or escaped quotation
				bool bOdd = ( ( iQuoteCount%2 )==1 );
				// regular symbol inside quotation or quoted quotation or escaped quotation
				if ( bOdd && ( !bQuot || ( iQuoteStart!=-1 && iQuoteStart+1==iOff ) ) )
					*d++ = *s;
				s++;

				if ( bQuot )
					iQuoteStart = iOff;
				continue;
			}

			// close off the current column: record its length and NUL-terminate in place
			int iLen = d - m_dBuf.Begin() - iColumnStart;
			assert ( iLen>=0 );
			m_dColumnsLen[iCol] = iLen;
			*d++ = '\0';
			s++;
			iCol++;

			if ( bNull ) // null terminated string found
			{
				m_iDataStart = m_iBufUsed = 0;
				break;
			} else if ( bEOL ) // end of document found
			{
				// skip all EOL characters
				while ( s<pEnd && *s && ( *s=='\r' || *s=='\n' ) )
					s++;
				break;
			}

			assert ( bDelimiter );
			// column separator found
			iColumnStart = d - m_dBuf.Begin();
			continue;
		}

		/////////////////////
		// read in more data
		/////////////////////

		int iDstOff = s - m_dBuf.Begin();
		int iSrcOff = d - m_dBuf.Begin();

		// if there is space at the start, move data around
		// if not, resize the buffer
		if ( m_iDataStart>0 )
		{
			memmove ( m_dBuf.Begin(), m_dBuf.Begin() + m_iDataStart, m_iBufUsed - m_iDataStart );
			m_iBufUsed -= m_iDataStart;
			// rebase every buffer-relative offset after the shift
			iDstOff -= m_iDataStart;
			iSrcOff -= m_iDataStart;
			iColumnStart -= m_iDataStart;
			iQuoteStart -= m_iDataStart;
			iEscapeStart -= m_iDataStart;
			m_iDataStart = 0;
			m_iDocStart = 0;
		} else if ( m_iBufUsed==m_dBuf.GetLength() )
		{
			m_dBuf.Resize ( m_dBuf.GetLength()*2 );
		}

		// do read
		int iGot = fread ( m_dBuf.Begin() + m_iBufUsed, 1, m_dBuf.GetLength() - m_iBufUsed, m_pFP );
		if ( !iGot )
		{
			if ( !iCol )
			{
				// normal file termination - no pending columns and documents
				m_iDataStart = m_iBufUsed = 0;
				m_tDocInfo.m_uDocID = 0;
				return CSphSource_BaseSV::DATA_OVER;
			}

			// error in case no data left in middle of data stream
			sError.SetSprintf ( "source '%s': read error '%s' (line=%d, docid=" DOCID_FMT ")",
				m_tSchema.m_sName.cstr(), strerror(errno), m_iLine, m_tDocInfo.m_uDocID );
			return CSphSource_BaseSV::PARSING_FAILED;
		}
		m_iBufUsed += iGot;

		// restore pointers because of the resize
		s = m_dBuf.Begin() + iDstOff;
		d = m_dBuf.Begin() + iSrcOff;
		pEnd = m_dBuf.Begin() + m_iBufUsed;
	}

	// all columns presence check
	if ( iCol!=iColumns )
	{
		sError.SetSprintf ( "source '%s': not all columns found (found=%d, total=%d, line=%d, docid=" DOCID_FMT ")",
			m_tSchema.m_sName.cstr(), iCol, iColumns, m_iLine, m_tDocInfo.m_uDocID );
		return CSphSource_BaseSV::PARSING_FAILED;
	}

	// tail data
	assert ( s<=pEnd );
	m_iDataStart = s - m_dBuf.Begin();
	return CSphSource_BaseSV::GOT_DOCUMENT;
}
31429 
31430 
// populate the schema from csvpipe_* config keys; attributes are declared
// first, then fields (csvpipe_field_string names appear in both lists,
// producing a full-text field and a string attribute with the same name)
bool CSphSource_CSV::SetupSchema ( const CSphConfigSection & hSource, bool bWordDict, CSphString & sError )
{
	// accumulate success across all declarations so every config error is reported
	bool bOk = true;

	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_uint"),		SPH_ATTR_INTEGER,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_timestamp"),	SPH_ATTR_TIMESTAMP,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_bool"),		SPH_ATTR_BOOL,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_float"),		SPH_ATTR_FLOAT,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_bigint"),		SPH_ATTR_BIGINT,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_multi"),		SPH_ATTR_UINT32SET,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_multi_64"),	SPH_ATTR_INT64SET,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_string"),		SPH_ATTR_STRING,	m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_attr_json"),		SPH_ATTR_JSON,		m_tSchema, sError );
	bOk &= ConfigureAttrs ( hSource("csvpipe_field_string"),	SPH_ATTR_STRING,	m_tSchema, sError );

	if ( !bOk )
		return false;

	ConfigureFields ( hSource("csvpipe_field"), bWordDict, m_tSchema );
	ConfigureFields ( hSource("csvpipe_field_string"), bWordDict, m_tSchema );

	return true;
}
31454 
31455 
SetDelimiter(const char * sDelimiter)31456 void CSphSource_CSV::SetDelimiter ( const char * sDelimiter )
31457 {
31458 	if ( sDelimiter && *sDelimiter )
31459 		m_iDelimiter = *sDelimiter;
31460 }
31461 
31462 
31463 /////////////////////////////////////////////////////////////////////////////
31464 
31465 
// publish global JSON parsing options (stored in sphinxjson globals):
// strict parsing, automatic number conversion, and key-name lowercasing
void sphSetJsonOptions ( bool bStrict, bool bAutoconvNumbers, bool bKeynamesToLowercase )
{
	g_bJsonStrict = bStrict;
	g_bJsonAutoconvNumbers = bAutoconvNumbers;
	g_bJsonKeynamesToLowercase = bKeynamesToLowercase;
}
31472 
31473 
// compute a*100/b as a float with three decimal digits of precision,
// using fixed-point integer math; a zero total counts as fully (100%) done
static inline float GetPercent ( int64_t a, int64_t b )
{
	if ( !b )
		return 100.0f;

	// scale by 100*1000 first, divide back by 1000 after the integer division
	const int64_t iScaled = a*100000/b;
	return float(iScaled)/1000;
}
31482 
31483 
// format a human-readable progress line for the current indexing phase;
// returns a pointer to a function-local static buffer, so the result is
// overwritten by the next call (and the function is not thread-safe)
const char * CSphIndexProgress::BuildMessage() const
{
	static char sBuf[256];
	switch ( m_ePhase )
	{
		case PHASE_COLLECT:
			snprintf ( sBuf, sizeof(sBuf), "collected " INT64_FMT " docs, %.1f MB", m_iDocuments,
				float(m_iBytes)/1000000.0f );
			break;

		case PHASE_SORT:
			snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mhits, %.1f%% done", float(m_iHits)/1000000,
				GetPercent ( m_iHits, m_iHitsTotal ) );
			break;

		case PHASE_COLLECT_MVA:
			snprintf ( sBuf, sizeof(sBuf), "collected " INT64_FMT " attr values", m_iAttrs );
			break;

		case PHASE_SORT_MVA:
			snprintf ( sBuf, sizeof(sBuf), "sorted %.1f Mvalues, %.1f%% done", float(m_iAttrs)/1000000,
				GetPercent ( m_iAttrs, m_iAttrsTotal ) );
			break;

		case PHASE_MERGE:
			snprintf ( sBuf, sizeof(sBuf), "merged %.1f Kwords", float(m_iWords)/1000 );
			break;

		case PHASE_PREREAD:
			snprintf ( sBuf, sizeof(sBuf), "read %.1f of %.1f MB, %.1f%% done",
				float(m_iBytes)/1000000.0f, float(m_iBytesTotal)/1000000.0f,
				GetPercent ( m_iBytes, m_iBytesTotal ) );
			break;

		case PHASE_PRECOMPUTE:
			// m_iDone is in tenths of a percent; print as XX.Y%
			snprintf ( sBuf, sizeof(sBuf), "indexing attributes, %d.%d%% done", m_iDone/10, m_iDone%10 );
			break;

		default:
			assert ( 0 && "internal error: unhandled progress phase" );
			snprintf ( sBuf, sizeof(sBuf), "(progress-phase-%d)", m_ePhase );
			break;
	}

	// snprintf truncation safety net: guarantee termination on every CRT
	sBuf[sizeof(sBuf)-1] = '\0';
	return sBuf;
}
31531 
31532 
Show(bool bPhaseEnd) const31533 void CSphIndexProgress::Show ( bool bPhaseEnd ) const
31534 {
31535 	if ( m_fnProgress )
31536 		m_fnProgress ( this, bPhaseEnd );
31537 }
31538 
31539 
31540 /////////////////////////////////////////////////////////////////////////////
31541 
// compare two dictionary keywords by their common prefix only; keywords that
// agree on the shared prefix compare as equal regardless of length
// (sign of the result follows strncmp semantics)
int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
{
	assert ( pStr1 && pStr2 );
	assert ( iLen1 && iLen2 );
	const int iPrefixLen = ( iLen1<iLen2 ) ? iLen1 : iLen2;
	return strncmp ( pStr1, pStr2, iPrefixLen );
}
31549 
// strict dictionary keyword comparison: common prefix first, and when the
// prefixes match, the shorter keyword sorts before the longer one
int sphDictCmpStrictly ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 )
{
	assert ( pStr1 && pStr2 );
	assert ( iLen1 && iLen2 );
	const int iPrefixLen = ( iLen1<iLen2 ) ? iLen1 : iLen2;
	const int iRes = strncmp ( pStr1, pStr2, iPrefixLen );
	if ( iRes )
		return iRes;
	return iLen1-iLen2;	// equal prefixes: break the tie by length
}
31558 
31559 
// construct an empty wordlist; all offsets/sizes zeroed, no storage owned yet
CWordlist::CWordlist ()
	: m_dCheckpoints ( 0 )
	, m_dInfixBlocks ( 0 )
	, m_pWords ( 0 )
{
	m_iDictCheckpointsOffset = 0;
	m_iSize = 0;
	m_iMaxChunk = 0;
	m_bWordDict = false;			// keyword-dict mode is set later by ReadCP()
	m_pInfixBlocksWords = NULL;		// owned array, released in Reset()
}
31571 
CWordlist::~CWordlist ()
{
	// release file handle, buffers, and checkpoint/infix storage
	Reset();
}
31576 
// release everything the wordlist owns: the dictionary file, the mapped/read
// buffer, checkpoint storage, and the leaked-in infix words array
void CWordlist::Reset ()
{
	m_tFile.Close ();
	m_pBuf.Reset ();

	m_dCheckpoints.Reset ( 0 );
	m_pWords.Reset ( 0 );
	SafeDeleteArray ( m_pInfixBlocksWords );	// taken over from a LeakData() in ReadCP()
}
31586 
// load dictionary checkpoints (and infix blocks, if present) from the index
// dictionary file; checkpoint payload layout differs by dictionary mode
// (keywords vs crc) and by index format version; also computes the wordlist
// end offset and the maximum checkpoint-to-checkpoint chunk size
bool CWordlist::ReadCP ( CSphAutofile & tFile, DWORD uVersion, bool bWordDict, CSphString & sError )
{
	assert ( ( uVersion>=21 && bWordDict ) || !bWordDict );
	assert ( m_iDictCheckpointsOffset>0 );
	assert ( m_iSize-m_iDictCheckpointsOffset<UINT_MAX );

	m_bHaveSkips = ( uVersion>=31 );

	////////////////////////////
	// preload word checkpoints
	////////////////////////////

	// checkpoints run from m_iDictCheckpointsOffset to either the infix
	// blocks tag or the end of the dictionary
	int iCheckpointOnlySize = (int)(m_iSize-m_iDictCheckpointsOffset);
	if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
		iCheckpointOnlySize = (int)(m_iInfixBlocksOffset - strlen ( g_sTagInfixBlocks ) - m_iDictCheckpointsOffset);

	CSphReader tReader;
	tReader.SetFile ( tFile );
	tReader.SeekTo ( m_iDictCheckpointsOffset, iCheckpointOnlySize );

	m_bWordDict = bWordDict;

	if ( m_bWordDict )
	{
		// keywords mode: each checkpoint stores (len, word bytes, offset);
		// all words go into one arena, sized as payload minus the fixed
		// per-checkpoint fields plus one NUL terminator per word
		int iArenaSize = iCheckpointOnlySize
			- (sizeof(DWORD)+sizeof(SphOffset_t))*m_dCheckpoints.GetLength()
			+ sizeof(BYTE)*m_dCheckpoints.GetLength();
		assert ( iArenaSize>=0 );
		m_pWords.Reset ( iArenaSize );

		BYTE * pWord = m_pWords.Begin();
		ARRAY_FOREACH ( i, m_dCheckpoints )
		{
			m_dCheckpoints[i].m_sWord = (char *)pWord;

			const int iLen = tReader.GetDword();
			assert ( iLen>0 );
			assert ( iLen + 1 + ( pWord - m_pWords.Begin() )<=iArenaSize );
			tReader.GetBytes ( pWord, iLen );
			pWord[iLen] = '\0';
			pWord += iLen+1;

			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
		}
	} else if ( uVersion>=11 )
	{
		// read v.14 checkpoints
		ARRAY_FOREACH ( i, m_dCheckpoints )
		{
			m_dCheckpoints[i].m_uWordID = (SphWordID_t)tReader.GetOffset();
			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetOffset();
		}
	} else
	{
		// convert v.10 checkpoints
		ARRAY_FOREACH ( i, m_dCheckpoints )
		{
#if USE_64BIT
			m_dCheckpoints[i].m_uWordID = tReader.GetOffset();
#else
			m_dCheckpoints[i].m_uWordID = tReader.GetDword();
#endif
			m_dCheckpoints[i].m_iWordlistOffset = tReader.GetDword();
		}
	}

	////////////////////////
	// preload infix blocks
	////////////////////////

	if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
	{
		// reading to vector as old version doesn't store total infix words length
		CSphTightVector<BYTE> dInfixWords;
		dInfixWords.Reserve ( (int)m_iInfixBlocksWordsSize );

		tReader.SeekTo ( m_iInfixBlocksOffset, (int)(m_iSize-m_iInfixBlocksOffset) );
		m_dInfixBlocks.Resize ( tReader.UnzipInt() );
		ARRAY_FOREACH ( i, m_dInfixBlocks )
		{
			int iBytes = tReader.UnzipInt();

			int iOff = dInfixWords.GetLength();
			m_dInfixBlocks[i].m_iInfixOffset = iOff;
			dInfixWords.Resize ( iOff+iBytes+1 );

			tReader.GetBytes ( dInfixWords.Begin()+iOff, iBytes );
			dInfixWords[iOff+iBytes] = '\0';

			m_dInfixBlocks[i].m_iOffset = tReader.UnzipInt();
		}

		// fix-up offset to pointer
		// ownership of the leaked buffer transfers to m_pInfixBlocksWords (freed in Reset)
		m_pInfixBlocksWords = dInfixWords.LeakData();
		ARRAY_FOREACH ( i, m_dInfixBlocks )
			m_dInfixBlocks[i].m_sInfix = (const char *)m_pInfixBlocksWords + m_dInfixBlocks[i].m_iInfixOffset;
	}

	// FIXME!!! store and load that explicitly
	// set wordlist end
	m_iWordsEnd = m_iDictCheckpointsOffset;
	if ( m_iInfixCodepointBytes && m_iInfixBlocksOffset )
	{
		if ( m_dInfixBlocks.GetLength() )
			m_iWordsEnd = m_dInfixBlocks.Begin()->m_iOffset - strlen ( g_sTagInfixEntries );
		else
			m_iWordsEnd -= strlen ( g_sTagInfixEntries );
	}

	// TODO: count m_dInfixBlocks too while make on_disk_dict work with dict=keywords + infix
	// the largest gap between adjacent checkpoints (plus the final tail) bounds
	// how much needs to be read to unpack any single dictionary block
	SphOffset_t uMaxChunk = 0;
	if ( m_dCheckpoints.GetLength() )
	{
		uMaxChunk = m_iWordsEnd - m_dCheckpoints.Last().m_iWordlistOffset;
		SphOffset_t uPrev = m_dCheckpoints.Begin()->m_iWordlistOffset;
		for ( int i=1; i<m_dCheckpoints.GetLength(); i++ )
		{
			SphOffset_t uOff = m_dCheckpoints[i].m_iWordlistOffset;
			uMaxChunk = Max ( uMaxChunk, uOff-uPrev );
			uPrev = uOff;
		}
	}
	assert ( uMaxChunk<UINT_MAX );
	m_iMaxChunk = (int)uMaxChunk;


	////////
	// done
	////////

	if ( tReader.GetErrorFlag() )
		sError = tReader.GetErrorMessage();
	return !tReader.GetErrorFlag();
}
31721 
// locate the checkpoint whose dictionary block may contain the given word;
// thin wrapper over sphSearchCheckpoint() bound to this wordlist's checkpoints
const CSphWordlistCheckpoint * CWordlist::FindCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID, bool bStarMode ) const
{
	return sphSearchCheckpoint ( sWord, iWordLen, iWordID, bStarMode, m_bWordDict, m_dCheckpoints.Begin(), &m_dCheckpoints.Last() );
}
31726 
31727 
// start reading a dict=keywords dictionary block at pBuf; m_sKeyword aliases
// the inline word buffer, which UnpackWord() rebuilds incrementally
// (prefix-compressed against the previously unpacked word)
KeywordsBlockReader_c::KeywordsBlockReader_c ( const BYTE * pBuf, bool bSkips )
{
	m_pBuf = pBuf;
	m_sWord[0] = '\0';
	m_iLen = 0;
	m_bHaveSkips = bSkips;	// index format has per-word skiplist offsets
	m_sKeyword = m_sWord;
}
31736 
31737 
// unpack the next prefix-compressed keyword entry from the current block;
// fills m_sWord/m_iLen plus doclist offset, doc/hit counts, hint, and
// skiplist offset; returns false at end of block (zero pack byte)
bool KeywordsBlockReader_c::UnpackWord()
{
	if ( !m_pBuf )
		return false;

	// unpack next word
	// must be in sync with DictEnd()!
	BYTE uPack = *m_pBuf++;
	if ( !uPack )
	{
		// ok, this block is over
		m_pBuf = NULL;
		m_iLen = 0;
		return false;
	}

	// pack byte layout: high bit set = one-byte form with delta in bits 4..6
	// (biased by one) and match length in the low 4 bits; high bit clear =
	// delta in the low 7 bits, match length in the following byte
	int iMatch, iDelta;
	if ( uPack & 0x80 )
	{
		iDelta = ( ( uPack>>4 ) & 7 ) + 1;
		iMatch = uPack & 15;
	} else
	{
		iDelta = uPack & 127;
		iMatch = *m_pBuf++;
	}

	assert ( iMatch+iDelta<(int)sizeof(m_sWord)-1 );
	assert ( iMatch<=(int)strlen ( (char *)m_sWord ) );

	// keep the matching prefix of the previous word, append the delta bytes
	memcpy ( m_sWord + iMatch, m_pBuf, iDelta );
	m_pBuf += iDelta;

	m_iLen = iMatch + iDelta;
	m_sWord[m_iLen] = '\0';

	m_iDoclistOffset = sphUnzipOffset ( m_pBuf );
	m_iDocs = sphUnzipInt ( m_pBuf );
	m_iHits = sphUnzipInt ( m_pBuf );
	// an explicit hint byte is only stored for sufficiently large doclists
	m_uHint = ( m_iDocs>=DOCLIST_HINT_THRESH ) ? *m_pBuf++ : 0;
	m_iDoclistHint = DoclistHintUnpack ( m_iDocs, m_uHint );
	if ( m_bHaveSkips && ( m_iDocs > SPH_SKIPLIST_BLOCK ) )
		m_iSkiplistOffset = sphUnzipInt ( m_pBuf );
	else
		m_iSkiplistOffset = 0;

	assert ( m_iLen>0 );
	return true;
}
31787 
31788 
// scan one crc-dict wordlist chunk for the given word ID, unpacking the
// delta-encoded (wordid, offset, docs, hits[, skiplist]) entries in order;
// fills tWord and returns true on a hit, false if the chunk ends or the
// sorted IDs overshoot the target
bool CWordlist::GetWord ( const BYTE * pBuf, SphWordID_t iWordID, CSphDictEntry & tWord ) const
{
	SphWordID_t iLastID = 0;
	SphOffset_t uLastOff = 0;

	for ( ;; )
	{
		// unpack next word ID
		const SphWordID_t iDeltaWord = sphUnzipWordid ( pBuf ); // FIXME! slow with 32bit wordids

		if ( iDeltaWord==0 ) // wordlist chunk is over
			return false;

		iLastID += iDeltaWord;

		// list is sorted, so if there was no match, there's no such word
		if ( iLastID>iWordID )
			return false;

		// unpack next offset
		const SphOffset_t iDeltaOffset = sphUnzipOffset ( pBuf );
		uLastOff += iDeltaOffset;

		// unpack doc/hit count
		const int iDocs = sphUnzipInt ( pBuf );
		const int iHits = sphUnzipInt ( pBuf );
		SphOffset_t iSkiplistPos = 0;
		if ( m_bHaveSkips && ( iDocs > SPH_SKIPLIST_BLOCK ) )
			iSkiplistPos = sphUnzipOffset ( pBuf );

		assert ( iDeltaOffset );
		assert ( iDocs );
		assert ( iHits );

		// it matches?!
		if ( iLastID==iWordID )
		{
			sphUnzipWordid ( pBuf ); // might be 0 at checkpoint
			const SphOffset_t iDoclistLen = sphUnzipOffset ( pBuf );

			tWord.m_iDoclistOffset = uLastOff;
			tWord.m_iDocs = iDocs;
			tWord.m_iHits = iHits;
			tWord.m_iDoclistHint = (int)iDoclistLen;	// exact doclist length doubles as the read hint
			tWord.m_iSkiplistOffset = iSkiplistPos;
			return true;
		}
	}
}
31838 
AcquireDict(const CSphWordlistCheckpoint * pCheckpoint) const31839 const BYTE * CWordlist::AcquireDict ( const CSphWordlistCheckpoint * pCheckpoint ) const
31840 {
31841 	assert ( pCheckpoint );
31842 	assert ( m_dCheckpoints.GetLength() );
31843 	assert ( pCheckpoint>=m_dCheckpoints.Begin() && pCheckpoint<=&m_dCheckpoints.Last() );
31844 	assert ( pCheckpoint->m_iWordlistOffset>0 && pCheckpoint->m_iWordlistOffset<=m_iSize );
31845 	assert ( m_pBuf.IsEmpty() || pCheckpoint->m_iWordlistOffset<(int64_t)m_pBuf.GetLengthBytes() );
31846 	assert ( !m_pBuf.IsEmpty() );
31847 
31848 	return m_pBuf.GetWritePtr()+pCheckpoint->m_iWordlistOffset;
31849 }
31850 
31851 
Args_t(bool bPayload,int iExpansionLimit,bool bHasMorphology,ESphHitless eHitless,const void * pIndexData)31852 ISphWordlist::Args_t::Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology, ESphHitless eHitless, const void * pIndexData )
31853 	: m_bPayload ( bPayload )
31854 	, m_iExpansionLimit ( iExpansionLimit )
31855 	, m_bHasMorphology ( bHasMorphology )
31856 	, m_eHitless ( eHitless )
31857 	, m_pIndexData ( pIndexData )
31858 {
31859 	m_sBuf.Reserve ( 2048 * SPH_MAX_WORD_LEN * 3 );
31860 	m_dExpanded.Reserve ( 2048 );
31861 	m_pPayload = NULL;
31862 	m_iTotalDocs = 0;
31863 	m_iTotalHits = 0;
31864 }
31865 
31866 
/// release the owned payload object, if any (SafeDelete is NULL-safe)
ISphWordlist::Args_t::~Args_t ()
{
	SafeDelete ( m_pPayload );
}
31871 
31872 
AddExpanded(const BYTE * sName,int iLen,int iDocs,int iHits)31873 void ISphWordlist::Args_t::AddExpanded ( const BYTE * sName, int iLen, int iDocs, int iHits )
31874 {
31875 	SphExpanded_t & tExpanded = m_dExpanded.Add();
31876 	tExpanded.m_iDocs = iDocs;
31877 	tExpanded.m_iHits = iHits;
31878 	int iOff = m_sBuf.GetLength();
31879 	tExpanded.m_iNameOff = iOff;
31880 
31881 	m_sBuf.Resize ( iOff + iLen + 1 );
31882 	memcpy ( m_sBuf.Begin()+iOff, sName, iLen );
31883 	m_sBuf[iOff+iLen] = '\0';
31884 }
31885 
31886 
GetWordExpanded(int iIndex) const31887 const char * ISphWordlist::Args_t::GetWordExpanded ( int iIndex ) const
31888 {
31889 	assert ( m_dExpanded[iIndex].m_iNameOff<m_sBuf.GetLength() );
31890 	return (const char *)m_sBuf.Begin() + m_dExpanded[iIndex].m_iNameOff;
31891 }
31892 
31893 
/// regular (non-payload) wildcard expansion candidate
/// the keyword text itself lives, length-prefixed, in a shared buffer
struct DiskExpandedEntry_t
{
	int		m_iNameOff;	///< offset of the length-prefixed name in the shared word buffer
	int		m_iDocs;	///< document count from the dict entry
	int		m_iHits;	///< hit count from the dict entry
};
31900 
/// payload-mode wildcard expansion candidate
/// instead of the keyword text, only the doclist coordinates are kept
struct DiskExpandedPayload_t
{
	int			m_iDocs;		///< document count from the dict entry
	int			m_iHits;		///< hit count from the dict entry
	uint64_t	m_uDoclistOff;	///< doclist offset from the dict entry
	int			m_iDoclistHint;	///< doclist length hint from the dict entry
};
31908 
31909 
/// accumulates dict entries matched by a wildcard and converts them into the
/// result format ISphWordlist::Args_t expects; each entry either becomes a
/// regular expansion (keyword name + stats) or, when payloads are enabled and
/// the entry qualifies, a direct doclist reference
struct DictEntryDiskPayload_t
{
	explicit DictEntryDiskPayload_t ( bool bPayload, ESphHitless eHitless )
	{
		m_bPayload = bPayload;
		m_eHitless = eHitless;
		if ( bPayload )
			m_dWordPayload.Reserve ( 1000 );

		m_dWordExpand.Reserve ( 1000 );
		m_dWordBuf.Reserve ( 8096 ); // NOTE(review): 8096 looks like a typo for 8192, but it is only a reserve hint
	}

	/// add one matched dict entry; routes it to the name-expansion list or,
	/// for qualifying entries in payload mode, to the doclist-payload list
	void Add ( const CSphDictEntry & tWord, int iWordLen )
	{
		if ( !m_bPayload || !sphIsExpandedPayload ( tWord.m_iDocs, tWord.m_iHits ) ||
			m_eHitless==SPH_HITLESS_ALL || ( m_eHitless==SPH_HITLESS_SOME && ( tWord.m_iDocs & HITLESS_DOC_FLAG )!=0 ) ) // FIXME!!! do we need hitless=some as payloads?
		{
			DiskExpandedEntry_t & tExpand = m_dWordExpand.Add();

			// the keyword is stored as a pascal string: length byte, then chars
			int iOff = m_dWordBuf.GetLength();
			tExpand.m_iNameOff = iOff;
			tExpand.m_iDocs = tWord.m_iDocs;
			tExpand.m_iHits = tWord.m_iHits;
			m_dWordBuf.Resize ( iOff + iWordLen + 1 );
			memcpy ( m_dWordBuf.Begin() + iOff + 1, tWord.m_sKeyword, iWordLen );
			m_dWordBuf[iOff] = (BYTE)iWordLen;

		} else
		{
			// payload path: no name needed, just the doclist coordinates
			DiskExpandedPayload_t & tExpand = m_dWordPayload.Add();
			tExpand.m_iDocs = tWord.m_iDocs;
			tExpand.m_iHits = tWord.m_iHits;
			tExpand.m_uDoclistOff = tWord.m_iDoclistOffset;
			tExpand.m_iDoclistHint = tWord.m_iDoclistHint;
		}
	}

	/// move the collected entries into tArgs, applying the expansion limit
	/// and summing the total docs/hits over everything that survived the clip
	void Convert ( ISphWordlist::Args_t & tArgs )
	{
		if ( !m_dWordExpand.GetLength() && !m_dWordPayload.GetLength() )
			return;

		int iTotalDocs = 0;
		int iTotalHits = 0;
		if ( m_dWordExpand.GetLength() )
		{
			LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordExpand );

			const BYTE * sBase = m_dWordBuf.Begin();
			ARRAY_FOREACH ( i, m_dWordExpand )
			{
				const DiskExpandedEntry_t & tCur = m_dWordExpand[i];
				int iDocs = tCur.m_iDocs;

				// strip the hitless flag bit out of the docs counter if set
				if ( m_eHitless==SPH_HITLESS_SOME )
					iDocs = ( tCur.m_iDocs & HITLESS_DOC_MASK );

				// sBase[m_iNameOff] is the pascal-string length byte
				tArgs.AddExpanded ( sBase + tCur.m_iNameOff + 1, sBase[tCur.m_iNameOff], iDocs, tCur.m_iHits );

				iTotalDocs += iDocs;
				iTotalHits += tCur.m_iHits;
			}
		}

		if ( m_dWordPayload.GetLength() )
		{
			LimitExpanded ( tArgs.m_iExpansionLimit, m_dWordPayload );

			DiskSubstringPayload_t * pPayload = new DiskSubstringPayload_t ( m_dWordPayload.GetLength() );
			// sorting by ascending doc-list offset gives some (15%) speed-up too
			sphSort ( m_dWordPayload.Begin(), m_dWordPayload.GetLength(), bind ( &DiskExpandedPayload_t::m_uDoclistOff ) );

			ARRAY_FOREACH ( i, m_dWordPayload )
			{
				const DiskExpandedPayload_t & tCur = m_dWordPayload[i];
				assert ( m_eHitless==SPH_HITLESS_NONE || ( m_eHitless==SPH_HITLESS_SOME && ( tCur.m_iDocs & HITLESS_DOC_FLAG )==0 ) );

				iTotalDocs += tCur.m_iDocs;
				iTotalHits += tCur.m_iHits;
				pPayload->m_dDoclist[i].m_uOff = tCur.m_uDoclistOff;
				pPayload->m_dDoclist[i].m_iLen = tCur.m_iDoclistHint;
			}

			pPayload->m_iTotalDocs = iTotalDocs;
			pPayload->m_iTotalHits = iTotalHits;
			tArgs.m_pPayload = pPayload; // tArgs owns the payload from here on (freed in ~Args_t)
		}
		tArgs.m_iTotalDocs = iTotalDocs;
		tArgs.m_iTotalHits = iTotalHits;
	}

	// sort expansions by frequency desc
	// clip the less frequent ones if needed, as they are likely misspellings
	template < typename T >
	void LimitExpanded ( int iExpansionLimit, CSphVector<T> & dVec ) const
	{
		if ( !iExpansionLimit || dVec.GetLength()<=iExpansionLimit )
			return;

		sphSort ( dVec.Begin(), dVec.GetLength(), ExpandedOrderDesc_T<T>() );
		dVec.Resize ( iExpansionLimit );
	}

	bool								m_bPayload;		///< collect doclist payloads instead of keyword names where possible?
	ESphHitless							m_eHitless;		///< index hitless mode
	CSphVector<DiskExpandedEntry_t>		m_dWordExpand;	///< regular expansion candidates
	CSphVector<DiskExpandedPayload_t>	m_dWordPayload;	///< payload (doclist) candidates
	CSphVector<BYTE>					m_dWordBuf;		///< shared pascal-string keyword storage
};
32020 
32021 
/// collect all dict keywords matching a prefix wildcard
/// starts at the checkpoint covering sSubstring and scans sequential dict
/// blocks until keywords (or the next checkpoint) exceed the prefix
void CWordlist::GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
{
	assert ( sSubstring && *sSubstring && iSubLen>0 );

	// empty index?
	if ( !m_dCheckpoints.GetLength() )
		return;

	DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );

	// pre-decode the wildcard into wide chars once, if it is UTF-8
	int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
	int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;

	const CSphWordlistCheckpoint * pCheckpoint = FindCheckpoint ( sSubstring, iSubLen, 0, true );
	const int iSkipMagic = ( BYTE(*sSubstring)<0x20 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker
	while ( pCheckpoint )
	{
		// decode wordlist chunk
		KeywordsBlockReader_c tDictReader ( AcquireDict ( pCheckpoint ), m_bHaveSkips );
		while ( tDictReader.UnpackWord() )
		{
			// block is sorted
			// so once keywords are greater than the prefix, no more matches
			int iCmp = sphDictCmp ( sSubstring, iSubLen, (const char *)tDictReader.m_sKeyword, tDictReader.GetWordLen() );
			if ( iCmp<0 )
				break;

			// does it match the prefix *and* the entire wildcard?
			if ( iCmp==0 && sphWildcardMatch ( (const char *)tDictReader.m_sKeyword + iSkipMagic, sWildcard, pWildcard ) )
				tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
		}

		// advance to the next checkpoint while it can still contain the prefix
		pCheckpoint++;
		if ( pCheckpoint > &m_dCheckpoints.Last() )
			break;

		if ( sphDictCmp ( sSubstring, iSubLen, pCheckpoint->m_sWord, strlen ( pCheckpoint->m_sWord ) )<0 )
			break;
	}

	tDict2Payload.Convert ( tArgs );
}
32064 
operator <(const InfixBlock_t & a,const char * b)32065 bool operator < ( const InfixBlock_t & a, const char * b )
32066 {
32067 	return strcmp ( a.m_sInfix, b )<0;
32068 }
32069 
operator ==(const InfixBlock_t & a,const char * b)32070 bool operator == ( const InfixBlock_t & a, const char * b )
32071 {
32072 	return strcmp ( a.m_sInfix, b )==0;
32073 }
32074 
operator <(const char * a,const InfixBlock_t & b)32075 bool operator < ( const char * a, const InfixBlock_t & b )
32076 {
32077 	return strcmp ( a, b.m_sInfix )<0;
32078 }
32079 
32080 
/// find the infix hash entry for sInfix and decode its checkpoint list
/// pInfixes is the in-memory infix hash; dInfixBlocks maps block keys to
/// offsets within it; returns false if the infix is not present
bool sphLookupInfixCheckpoints ( const char * sInfix, int iBytes, const BYTE * pInfixes, const CSphVector<InfixBlock_t> & dInfixBlocks, int iInfixCodepointBytes, CSphVector<int> & dCheckpoints )
{
	assert ( pInfixes );
	dCheckpoints.Resize ( 0 );

	// NUL-terminate a local copy for the string-keyed block lookup
	char dInfixBuf[3*SPH_MAX_WORD_LEN+4];
	memcpy ( dInfixBuf, sInfix, iBytes );
	dInfixBuf[iBytes] = '\0';

	// lookup block
	int iBlock = FindSpan ( dInfixBlocks, dInfixBuf );
	if ( iBlock<0 )
		return false;
	const BYTE * pBlock = pInfixes + dInfixBlocks[iBlock].m_iOffset;

	// decode block and check for exact infix match
	// block entry is { byte edit_code, byte[] key_append, zint data_len, zint data_deltas[] }
	// zero edit_code marks block end
	BYTE sKey[32];
	for ( ;; )
	{
		// unpack next key
		int iCode = *pBlock++;
		if ( !iCode )
			break;

		// edit code packs two nibbles: high = chars kept from the previous
		// key, low = chars appended from the block data
		BYTE * pOut = sKey;
		if ( iInfixCodepointBytes==1 )
		{
			pOut = sKey + ( iCode>>4 );
			iCode &= 15;
			while ( iCode-- )
				*pOut++ = *pBlock++;
		} else
		{
			// UTF-8 keys: the "keep" count is in codepoints, so walk them
			// NOTE(review): sKey persists across loop iterations, so the kept
			// prefix is read from the previously decoded key; this presumably
			// relies on the first entry of a block having a zero keep count
			int iKeep = ( iCode>>4 );
			while ( iKeep-- )
				pOut += sphUtf8CharBytes ( *pOut );
			assert ( pOut-sKey<=(int)sizeof(sKey) );
			iCode &= 15;
			while ( iCode-- )
			{
				int i = sphUtf8CharBytes ( *pBlock );
				while ( i-- )
					*pOut++ = *pBlock++;
			}
			assert ( pOut-sKey<=(int)sizeof(sKey) );
		}
		assert ( pOut-sKey<(int)sizeof(sKey) );
#ifndef NDEBUG
		*pOut = '\0'; // handy for debugging, but not used for real matching
#endif

		if ( pOut==sKey+iBytes && memcmp ( sKey, dInfixBuf, iBytes )==0 )
		{
			// found you! decompress the data
			// checkpoint ids are stored as zipped deltas
			int iLast = 0;
			int iPackedLen = sphUnzipInt ( pBlock );
			const BYTE * pMax = pBlock + iPackedLen;
			while ( pBlock<pMax )
			{
				iLast += sphUnzipInt ( pBlock );
				dCheckpoints.Add ( iLast );
			}
			return true;
		}

		// no match; skip this entry's data and move to the next key
		int iSkip = sphUnzipInt ( pBlock );
		pBlock += iSkip;
	}
	return false;
}
32153 
32154 
32155 // calculate length, upto iInfixCodepointBytes chars from infix start
sphGetInfixLength(const char * sInfix,int iBytes,int iInfixCodepointBytes)32156 int sphGetInfixLength ( const char * sInfix, int iBytes, int iInfixCodepointBytes )
32157 {
32158 	int iBytes1 = Min ( 6, iBytes );
32159 	if ( iInfixCodepointBytes!=1 )
32160 	{
32161 		int iCharsLeft = 6;
32162 		const char * s = sInfix;
32163 		const char * sMax = sInfix + iBytes;
32164 		while ( iCharsLeft-- && s<sMax )
32165 			s += sphUtf8CharBytes(*s);
32166 		iBytes1 = (int)( s - sInfix );
32167 	}
32168 
32169 	return iBytes1;
32170 }
32171 
32172 
/// collect all dict keywords matching an infix wildcard
/// looks up candidate checkpoints via the infix hash, then scans each
/// referenced dict block and wildcard-matches every keyword in it
void CWordlist::GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const
{
	// dict must be of keywords type, and fully cached
	// mmap()ed in the worst case, should we ever banish it to disk again
	if ( m_pBuf.IsEmpty() || !m_dCheckpoints.GetLength() )
		return;

	// extract key1, upto 6 chars from infix start
	int iBytes1 = sphGetInfixLength ( sSubstring, iSubLen, m_iInfixCodepointBytes );

	// lookup key1
	// OPTIMIZE? maybe lookup key2 and reduce checkpoint set size, if possible?
	CSphVector<int> dPoints;
	if ( !sphLookupInfixCheckpoints ( sSubstring, iBytes1, m_pBuf.GetWritePtr(), m_dInfixBlocks, m_iInfixCodepointBytes, dPoints ) )
		return;

	DictEntryDiskPayload_t tDict2Payload ( tArgs.m_bPayload, tArgs.m_eHitless );
	const int iSkipMagic = ( tArgs.m_bHasMorphology ? 1 : 0 ); // whether to skip heading magic chars in the prefix, like NONSTEMMED maker

	// pre-decode the wildcard into wide chars once, if it is UTF-8
	int dWildcard [ SPH_MAX_WORD_LEN + 1 ];
	int * pWildcard = ( sphIsUTF8 ( sWildcard ) && sphUTF8ToWideChar ( sWildcard, dWildcard, SPH_MAX_WORD_LEN ) ) ? dWildcard : NULL;

	// walk those checkpoints, check all their words
	ARRAY_FOREACH ( i, dPoints )
	{
		// OPTIMIZE? add a quicker path than a generic wildcard for "*infix*" case?
		// checkpoint ids from the infix hash are 1-based, hence the -1
		KeywordsBlockReader_c tDictReader ( m_pBuf.GetWritePtr() + m_dCheckpoints[dPoints[i]-1].m_iWordlistOffset, m_bHaveSkips );
		while ( tDictReader.UnpackWord() )
		{
			// stemmed terms should not match suffixes
			if ( tArgs.m_bHasMorphology && *tDictReader.m_sKeyword!=MAGIC_WORD_HEAD_NONSTEMMED )
				continue;

			if ( sphWildcardMatch ( (const char *)tDictReader.m_sKeyword+iSkipMagic, sWildcard, pWildcard ) )
				tDict2Payload.Add ( tDictReader, tDictReader.GetWordLen() );
		}
	}

	tDict2Payload.Convert ( tArgs );
}
32213 
32214 
32215 // all indexes should produce same terms for same query
Set(const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat)32216 void SphWordStatChecker_t::Set ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat )
32217 {
32218 	m_dSrcWords.Reserve ( hStat.GetLength() );
32219 	hStat.IterateStart();
32220 	while ( hStat.IterateNext() )
32221 	{
32222 		m_dSrcWords.Add ( sphFNV64 ( hStat.IterateGetKey().cstr() ) );
32223 	}
32224 	m_dSrcWords.Sort();
32225 }
32226 
32227 
DumpDiffer(const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat,const char * sIndex,CSphString & sWarning) const32228 void SphWordStatChecker_t::DumpDiffer ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat, const char * sIndex, CSphString & sWarning ) const
32229 {
32230 	if ( !m_dSrcWords.GetLength() )
32231 		return;
32232 
32233 	CSphStringBuilder tWarningBuilder;
32234 	hStat.IterateStart();
32235 	while ( hStat.IterateNext() )
32236 	{
32237 		uint64_t uHash = sphFNV64 ( hStat.IterateGetKey().cstr() );
32238 		if ( !m_dSrcWords.BinarySearch ( uHash ) )
32239 		{
32240 			if ( !tWarningBuilder.Length() )
32241 			{
32242 				if ( sIndex )
32243 					tWarningBuilder.Appendf ( "index '%s': ", sIndex );
32244 
32245 				tWarningBuilder.Appendf ( "query word(s) mismatch: %s", hStat.IterateGetKey().cstr() );
32246 			} else
32247 			{
32248 				tWarningBuilder.Appendf ( ", %s", hStat.IterateGetKey().cstr() );
32249 			}
32250 		}
32251 	}
32252 
32253 	if ( tWarningBuilder.Length() )
32254 		sWarning = tWarningBuilder.cstr();
32255 }
32256 
32257 //////////////////////////////////////////////////////////////////////////
32258 // CSphQueryResultMeta
32259 //////////////////////////////////////////////////////////////////////////
32260 
/// zero out all timing, match-count, and agent statistics;
/// multiplier starts at 1 so unscaled results stay unscaled
CSphQueryResultMeta::CSphQueryResultMeta ()
	: m_iQueryTime ( 0 )
	, m_iRealQueryTime ( 0 )
	, m_iCpuTime ( 0 )
	, m_iMultiplier ( 1 )
	, m_iMatches ( 0 )
	, m_iTotalMatches ( 0 )
	, m_iAgentCpuTime ( 0 )
	, m_iPredictedTime ( 0 )
	, m_iAgentPredictedTime ( 0 )
	, m_iAgentFetchedDocs ( 0 )
	, m_iAgentFetchedHits ( 0 )
	, m_iAgentFetchedSkips ( 0 )
	, m_bHasPrediction ( false )
	, m_iBadRows ( 0 )
{
}
32278 
32279 
AddStat(const CSphString & sWord,int64_t iDocs,int64_t iHits)32280 void CSphQueryResultMeta::AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits )
32281 {
32282 	CSphString sFixed;
32283 	const CSphString * pFixed = &sWord;
32284 	if ( sWord.cstr()[0]==MAGIC_WORD_HEAD )
32285 	{
32286 		sFixed = sWord;
32287 		*(char *)( sFixed.cstr() ) = '*';
32288 		pFixed = &sFixed;
32289 	} else if ( sWord.cstr()[0]==MAGIC_WORD_HEAD_NONSTEMMED )
32290 	{
32291 		sFixed = sWord;
32292 		*(char *)( sFixed.cstr() ) = '=';
32293 		pFixed = &sFixed;
32294 	} else
32295 	{
32296 		const char * p = strchr ( sWord.cstr(), MAGIC_WORD_BIGRAM );
32297 		if ( p )
32298 		{
32299 			sFixed.SetSprintf ( "\"%s\"", sWord.cstr() );
32300 			*( (char*)sFixed.cstr() + ( p - sWord.cstr() ) + 1 ) = ' ';
32301 			pFixed = &sFixed;
32302 		}
32303 	}
32304 
32305 	WordStat_t & tStats = m_hWordStats.AddUnique ( *pFixed );
32306 	tStats.m_iDocs += iDocs;
32307 	tStats.m_iHits += iHits;
32308 }
32309 
32310 
32311 //////////////////////////////////////////////////////////////////////////
32312 // CONVERSION TOOLS HELPERS
32313 //////////////////////////////////////////////////////////////////////////
32314 
CopyBytes(CSphWriter & wrTo,CSphReader & rdFrom,int iBytes)32315 static void CopyBytes ( CSphWriter & wrTo, CSphReader & rdFrom, int iBytes )
32316 {
32317 	const int BUFSIZE = 65536;
32318 	BYTE * pBuf = new BYTE [ BUFSIZE ];
32319 
32320 	int iCopied = 0;
32321 	while ( iCopied < iBytes )
32322 	{
32323 		int iToCopy = Min ( iBytes - iCopied, BUFSIZE );
32324 		rdFrom.GetBytes ( pBuf, iToCopy );
32325 		wrTo.PutBytes ( pBuf, iToCopy );
32326 		iCopied += iToCopy;
32327 	}
32328 
32329 	SafeDeleteArray ( pBuf );
32330 }
32331 
32332 
32333 /// post-conversion chores
32334 /// rename the files, show elapsed time
FinalizeUpgrade(const char ** sRenames,const char * sBanner,const char * sPath,int64_t tmStart)32335 static void FinalizeUpgrade ( const char ** sRenames, const char * sBanner, const char * sPath, int64_t tmStart )
32336 {
32337 	while ( *sRenames )
32338 	{
32339 		CSphString sFrom, sTo;
32340 		sFrom.SetSprintf ( "%s%s", sPath, sRenames[0] );
32341 		sTo.SetSprintf ( "%s%s", sPath, sRenames[1] );
32342 		sRenames += 2;
32343 
32344 		if ( ::rename ( sFrom.cstr(), sTo.cstr() ) )
32345 			sphDie ( "%s: rename %s to %s failed: %s\n", sBanner,
32346 			sFrom.cstr(), sTo.cstr(), strerror(errno) );
32347 	}
32348 
32349 	// all done! yay
32350 	int64_t tmWall = sphMicroTimer() - tmStart;
32351 	fprintf ( stdout, "%s: elapsed %d.%d sec\n", sBanner,
32352 		(int)(tmWall/1000000), (int)((tmWall/100000)%10) );
32353 	fprintf ( stdout, "%s: done!\n", sBanner );
32354 }
32355 
32356 //////////////////////////////////////////////////////////////////////////
32357 // V.26 TO V.27 CONVERSION TOOL, INFIX BUILDER
32358 //////////////////////////////////////////////////////////////////////////
32359 
sphDictBuildInfixes(const char * sPath)32360 void sphDictBuildInfixes ( const char * sPath )
32361 {
32362 	CSphString sFilename, sError;
32363 	int64_t tmStart = sphMicroTimer();
32364 
32365 	if_const ( INDEX_FORMAT_VERSION!=27 )
32366 		sphDie ( "infix upgrade: only works in v.27 builds for now; get an older indextool or contact support" );
32367 
32368 	//////////////////////////////////////////////////
32369 	// load (interesting parts from) the index header
32370 	//////////////////////////////////////////////////
32371 
32372 	CSphAutoreader rdHeader;
32373 	sFilename.SetSprintf ( "%s.sph", sPath );
32374 	if ( !rdHeader.Open ( sFilename.cstr(), sError ) )
32375 		sphDie ( "infix upgrade: %s", sError.cstr() );
32376 
32377 	// version
32378 	DWORD uHeader = rdHeader.GetDword ();
32379 	DWORD uVersion = rdHeader.GetDword();
32380 	bool bUse64 = ( rdHeader.GetDword()!=0 );
32381 	ESphDocinfo eDocinfo = (ESphDocinfo) rdHeader.GetDword();
32382 
32383 	if ( uHeader!=INDEX_MAGIC_HEADER )
32384 		sphDie ( "infix upgrade: invalid header file" );
32385 	if ( uVersion<21 || uVersion>26 )
32386 		sphDie ( "infix upgrade: got v.%d header, v.21 to v.26 required", uVersion );
32387 	if ( eDocinfo==SPH_DOCINFO_INLINE )
32388 		sphDie ( "infix upgrade: docinfo=inline is not supported" );
32389 
32390 	CSphSchema tSchema;
32391 	DictHeader_t tDictHeader;
32392 	CSphSourceStats tStats;
32393 	CSphIndexSettings tIndexSettings;
32394 	CSphTokenizerSettings tTokenizerSettings;
32395 	CSphDictSettings tDictSettings;
32396 	CSphEmbeddedFiles tEmbeddedFiles;
32397 
32398 	ReadSchema ( rdHeader, tSchema, uVersion, eDocinfo==SPH_DOCINFO_INLINE );
32399 	SphOffset_t iMinDocid = rdHeader.GetOffset();
32400 	tDictHeader.m_iDictCheckpointsOffset = rdHeader.GetOffset ();
32401 	tDictHeader.m_iDictCheckpoints = rdHeader.GetDword ();
32402 	tDictHeader.m_iInfixCodepointBytes = 0;
32403 	tDictHeader.m_iInfixBlocksOffset = 0;
32404 	tDictHeader.m_iInfixBlocksWordsSize = 0;
32405 	tStats.m_iTotalDocuments = rdHeader.GetDword ();
32406 	tStats.m_iTotalBytes = rdHeader.GetOffset ();
32407 	LoadIndexSettings ( tIndexSettings, rdHeader, uVersion );
32408 	if ( !LoadTokenizerSettings ( rdHeader, tTokenizerSettings, tEmbeddedFiles, uVersion, sError ) )
32409 		sphDie ( "infix updrade: failed to load tokenizer settings: '%s'", sError.cstr() );
32410 	LoadDictionarySettings ( rdHeader, tDictSettings, tEmbeddedFiles, uVersion, sError );
32411 	int iKillListSize = rdHeader.GetDword();
32412 	DWORD uMinMaxIndex = rdHeader.GetDword();
32413 
32414 	if ( rdHeader.GetErrorFlag() )
32415 		sphDie ( "infix upgrade: failed to parse header" );
32416 	rdHeader.Close();
32417 
32418 	////////////////////
32419 	// generate infixes
32420 	////////////////////
32421 
32422 	if ( !tDictSettings.m_bWordDict )
32423 		sphDie ( "infix upgrade: dict=keywords required" );
32424 
32425 	tIndexSettings.m_iMinPrefixLen = 0;
32426 	tIndexSettings.m_iMinInfixLen = 2;
32427 
32428 	ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tTokenizerSettings, &tEmbeddedFiles, sError );
32429 	if ( !pTokenizer )
32430 		sphDie ( "infix upgrade: %s", sError.cstr() );
32431 
32432 	tDictHeader.m_iInfixCodepointBytes = pTokenizer->GetMaxCodepointLength();
32433 	ISphInfixBuilder * pInfixer = sphCreateInfixBuilder ( tDictHeader.m_iInfixCodepointBytes, &sError );
32434 	if ( !pInfixer )
32435 		sphDie ( "infix upgrade: %s", sError.cstr() );
32436 
32437 	bool bHasMorphology = !tDictSettings.m_sMorphology.IsEmpty();
32438 	// scan all dict entries, generate infixes
32439 	// (in a separate block, so that tDictReader gets destroyed, and file closed)
32440 	{
32441 		CSphDictReader tDictReader;
32442 		if ( !tDictReader.Setup ( sFilename.SetSprintf ( "%s.spi", sPath ),
32443 			tDictHeader.m_iDictCheckpointsOffset, tIndexSettings.m_eHitless, sError, true, &g_tThrottle, uVersion>=31 ) )
32444 				sphDie ( "infix upgrade: %s", sError.cstr() );
32445 		while ( tDictReader.Read() )
32446 		{
32447 			const BYTE * sWord = tDictReader.GetWord();
32448 			int iLen = strlen ( (const char *)sWord );
32449 			pInfixer->AddWord ( sWord, iLen, tDictReader.GetCheckpoint(), bHasMorphology );
32450 		}
32451 	}
32452 
32453 	/////////////////////////////
32454 	// write new dictionary file
32455 	/////////////////////////////
32456 
32457 	// ready to party
32458 	// open all the cans!
32459 	CSphAutofile tDict;
32460 	tDict.Open ( sFilename, SPH_O_READ, sError );
32461 
32462 	CSphReader rdDict;
32463 	rdDict.SetFile ( tDict );
32464 	rdDict.SeekTo ( 0, READ_NO_SIZE_HINT );
32465 
32466 	CSphWriter wrDict;
32467 	sFilename.SetSprintf ( "%s.spi.upgrade", sPath );
32468 	if ( !wrDict.OpenFile ( sFilename, sError ) )
32469 		sphDie ( "infix upgrade: failed to open %s", sFilename.cstr() );
32470 
32471 	// copy the keyword entries until checkpoints
32472 	CopyBytes ( wrDict, rdDict, (int)tDictHeader.m_iDictCheckpointsOffset );
32473 
32474 	// write newly generated infix hash entries
32475 	pInfixer->SaveEntries ( wrDict );
32476 
32477 	// copy checkpoints
32478 	int iCheckpointsSize = (int)( tDict.GetSize() - tDictHeader.m_iDictCheckpointsOffset );
32479 	tDictHeader.m_iDictCheckpointsOffset = wrDict.GetPos();
32480 	CopyBytes ( wrDict, rdDict, iCheckpointsSize );
32481 
32482 	// write newly generated infix hash blocks
32483 	tDictHeader.m_iInfixBlocksOffset = pInfixer->SaveEntryBlocks ( wrDict );
32484 	tDictHeader.m_iInfixBlocksWordsSize = pInfixer->GetBlocksWordsSize();
32485 	if ( tDictHeader.m_iInfixBlocksOffset>UINT_MAX ) // FIXME!!! change to int64
32486 		sphDie ( "INTERNAL ERROR: dictionary size " INT64_FMT " overflow at build infixes save", tDictHeader.m_iInfixBlocksOffset );
32487 
32488 
32489 	// flush header
32490 	// mostly for debugging convenience
32491 	// primary storage is in the index wide header
32492 	wrDict.PutBytes ( "dict-header", 11 );
32493 	wrDict.ZipInt ( tDictHeader.m_iDictCheckpoints );
32494 	wrDict.ZipOffset ( tDictHeader.m_iDictCheckpointsOffset );
32495 	wrDict.ZipInt ( tDictHeader.m_iInfixCodepointBytes );
32496 	wrDict.ZipInt ( (DWORD)tDictHeader.m_iInfixBlocksOffset );
32497 
32498 	wrDict.CloseFile ();
32499 	if ( wrDict.IsError() )
32500 		sphDie ( "infix upgrade: dictionary write error (out of space?)" );
32501 
32502 	if ( rdDict.GetErrorFlag() )
32503 		sphDie ( "infix upgrade: dictionary read error" );
32504 	tDict.Close();
32505 
32506 	////////////////////
32507 	// write new header
32508 	////////////////////
32509 
32510 	assert ( tDictSettings.m_bWordDict );
32511 	CSphDict * pDict = sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError );
32512 	if ( !pDict )
32513 		sphDie ( "infix upgrade: %s", sError.cstr() );
32514 
32515 	CSphWriter wrHeader;
32516 	sFilename.SetSprintf ( "%s.sph.upgrade", sPath );
32517 	if ( !wrHeader.OpenFile ( sFilename, sError ) )
32518 		sphDie ( "infix upgrade: %s", sError.cstr() );
32519 
32520 	wrHeader.PutDword ( INDEX_MAGIC_HEADER );
32521 	wrHeader.PutDword ( INDEX_FORMAT_VERSION );
32522 	wrHeader.PutDword ( bUse64 );
32523 	wrHeader.PutDword ( eDocinfo );
32524 	WriteSchema ( wrHeader, tSchema );
32525 	wrHeader.PutOffset ( iMinDocid );
32526 	wrHeader.PutOffset ( tDictHeader.m_iDictCheckpointsOffset );
32527 	wrHeader.PutDword ( tDictHeader.m_iDictCheckpoints );
32528 	wrHeader.PutByte ( tDictHeader.m_iInfixCodepointBytes );
32529 	wrHeader.PutDword ( (DWORD)tDictHeader.m_iInfixBlocksOffset );
32530 	wrHeader.PutDword ( tDictHeader.m_iInfixBlocksWordsSize );
32531 	wrHeader.PutDword ( (DWORD)tStats.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
32532 	wrHeader.PutOffset ( tStats.m_iTotalBytes );
32533 	SaveIndexSettings ( wrHeader, tIndexSettings );
32534 	SaveTokenizerSettings ( wrHeader, pTokenizer, tIndexSettings.m_iEmbeddedLimit );
32535 	SaveDictionarySettings ( wrHeader, pDict, false, tIndexSettings.m_iEmbeddedLimit );
32536 	wrHeader.PutDword ( iKillListSize );
32537 	wrHeader.PutDword ( uMinMaxIndex );
32538 	wrHeader.PutDword ( 0 ); // no field filter
32539 
32540 	wrHeader.CloseFile ();
32541 	if ( wrHeader.IsError() )
32542 		sphDie ( "infix upgrade: header write error (out of space?)" );
32543 
32544 	// all done!
32545 	const char * sRenames[] = {
32546 		".sph", ".sph.bak",
32547 		".spi", ".spi.bak",
32548 		".sph.upgrade", ".sph",
32549 		".spi.upgrade", ".spi",
32550 		NULL };
32551 	FinalizeUpgrade ( sRenames, "infix upgrade", sPath, tmStart );
32552 }
32553 
32554 //////////////////////////////////////////////////////////////////////////
32555 // V.12 TO V.31 CONVERSION TOOL, SKIPLIST BUILDER
32556 //////////////////////////////////////////////////////////////////////////
32557 
/// per-dict-entry bookkeeping record used by the skiplist upgrade tool
struct EntrySkips_t
{
	DWORD			m_uEntry;		///< sequential index in dict
	SphOffset_t		m_iDoclist;		///< doclist offset from dict
	int				m_iSkiplist;	///< generated skiplist offset
};
32564 
sphDictBuildSkiplists(const char * sPath)32565 void sphDictBuildSkiplists ( const char * sPath )
32566 {
32567 	CSphString sFilename, sError;
32568 	int64_t tmStart = sphMicroTimer();
32569 
32570 	if_const ( INDEX_FORMAT_VERSION<31 || INDEX_FORMAT_VERSION>35 )
32571 		sphDie ( "skiplists upgrade: ony works in v.31 to v.35 builds for now; get an older indextool or contact support" );
32572 
32573 	// load (interesting parts from) the index header
32574 	CSphAutoreader rdHeader;
32575 	sFilename.SetSprintf ( "%s.sph", sPath );
32576 	if ( !rdHeader.Open ( sFilename.cstr(), sError ) )
32577 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
32578 
32579 	// version
32580 	DWORD uHeader = rdHeader.GetDword ();
32581 	DWORD uVersion = rdHeader.GetDword();
32582 	bool bUse64 = ( rdHeader.GetDword()!=0 );
32583 	bool bConvertCheckpoints = ( uVersion<=21 );
32584 	ESphDocinfo eDocinfo = (ESphDocinfo) rdHeader.GetDword();
32585 	const DWORD uLowestVersion = 12;
32586 
32587 	if ( bUse64!=USE_64BIT )
32588 		sphDie ( "skiplists upgrade: USE_64BIT differs, index %s, binary %s",
32589 			bUse64 ? "enabled" : "disabled", USE_64BIT ? "enabled" : "disabled" );
32590 	if ( uHeader!=INDEX_MAGIC_HEADER )
32591 		sphDie ( "skiplists upgrade: invalid header file" );
32592 	if ( uVersion<uLowestVersion )
32593 		sphDie ( "skiplists upgrade: got v.%d header, v.%d to v.30 required", uVersion, uLowestVersion );
32594 	if ( eDocinfo==SPH_DOCINFO_INLINE )
32595 		sphDie ( "skiplists upgrade: docinfo=inline is not supported yet" );
32596 
32597 	CSphSchema tSchema;
32598 	DictHeader_t tDictHeader;
32599 	CSphSourceStats tStats;
32600 	CSphIndexSettings tIndexSettings;
32601 	CSphTokenizerSettings tTokenizerSettings;
32602 	CSphDictSettings tDictSettings;
32603 	CSphEmbeddedFiles tEmbeddedFiles;
32604 
32605 	ReadSchema ( rdHeader, tSchema, uVersion, eDocinfo==SPH_DOCINFO_INLINE );
32606 	SphOffset_t iMinDocid = rdHeader.GetOffset();
32607 	tDictHeader.m_iDictCheckpointsOffset = rdHeader.GetOffset ();
32608 	tDictHeader.m_iDictCheckpoints = rdHeader.GetDword ();
32609 	tDictHeader.m_iInfixCodepointBytes = 0;
32610 	tDictHeader.m_iInfixBlocksOffset = 0;
32611 	if ( uVersion>=27 )
32612 	{
32613 		tDictHeader.m_iInfixCodepointBytes = rdHeader.GetByte();
32614 		tDictHeader.m_iInfixBlocksOffset = rdHeader.GetDword();
32615 	}
32616 	if ( uVersion>=34 )
32617 		tDictHeader.m_iInfixBlocksWordsSize = rdHeader.GetDword();
32618 
32619 	tStats.m_iTotalDocuments = rdHeader.GetDword ();
32620 	tStats.m_iTotalBytes = rdHeader.GetOffset ();
32621 	LoadIndexSettings ( tIndexSettings, rdHeader, uVersion );
32622 	if ( !LoadTokenizerSettings ( rdHeader, tTokenizerSettings, tEmbeddedFiles, uVersion, sError ) )
32623 		sphDie ( "skiplists upgrade: failed to load tokenizer settings: '%s'", sError.cstr() );
32624 	LoadDictionarySettings ( rdHeader, tDictSettings, tEmbeddedFiles, uVersion, sError );
32625 	int iKillListSize = rdHeader.GetDword();
32626 
32627 	SphOffset_t uMinMaxIndex = 0;
32628 	if ( uVersion>=33 )
32629 		uMinMaxIndex = rdHeader.GetOffset ();
32630 	else if ( uVersion>=20 )
32631 		uMinMaxIndex = rdHeader.GetDword ();
32632 
32633 	ISphFieldFilter * pFieldFilter = NULL;
32634 	if ( uVersion>=28 )
32635 	{
32636 		CSphFieldFilterSettings tFieldFilterSettings;
32637 		LoadFieldFilterSettings ( rdHeader, tFieldFilterSettings );
32638 		if ( tFieldFilterSettings.m_dRegexps.GetLength() )
32639 			pFieldFilter = sphCreateFieldFilter ( tFieldFilterSettings, sError );
32640 	}
32641 
32642 	CSphFixedVector<uint64_t> dFieldLens ( tSchema.m_dFields.GetLength() );
32643 	if ( uVersion>=35 && tIndexSettings.m_bIndexFieldLens )
32644 		ARRAY_FOREACH ( i, tSchema.m_dFields )
32645 			dFieldLens[i] = rdHeader.GetOffset(); // FIXME? ideally 64bit even when off is 32bit..
32646 
32647 	if ( rdHeader.GetErrorFlag() )
32648 		sphDie ( "skiplists upgrade: failed to parse header" );
32649 	rdHeader.Close();
32650 
32651 	//////////////////////
32652 	// generate skiplists
32653 	//////////////////////
32654 
32655 	// keywords on disk might be in a different order than dictionary
32656 	// and random accesses on a plain disk would be extremely slow
32657 	// so we load the dictionary, sort by doclist offset
32658 	// then we walk doclists, generate skiplists, sort back by entry number
32659 	// then walk the disk dictionary again, lookup skiplist offset, and patch
32660 
32661 	// load the dictionary
32662 	CSphVector<EntrySkips_t> dSkips;
32663 	const bool bWordDict = tDictSettings.m_bWordDict;
32664 
32665 	CSphAutoreader rdDict;
32666 	if ( !rdDict.Open ( sFilename.SetSprintf ( "%s.spi", sPath ), sError ) )
32667 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
32668 
32669 	// compute actual keyword data length
32670 	SphOffset_t iWordsEnd = tDictHeader.m_iDictCheckpointsOffset;
32671 	if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
32672 	{
32673 		rdDict.SeekTo ( tDictHeader.m_iInfixBlocksOffset, 32 ); // need just 1 entry, 32 bytes should be ok
32674 		rdDict.UnzipInt(); // skip block count
32675 		int iInfixLen = rdDict.GetByte();
32676 		rdDict.SkipBytes ( iInfixLen );
32677 		iWordsEnd = rdDict.UnzipInt() - strlen ( g_sTagInfixEntries );
32678 		rdDict.SeekTo ( 0, READ_NO_SIZE_HINT );
32679 	}
32680 
32681 	CSphDictReader * pReader = new CSphDictReader();
32682 	pReader->Setup ( &rdDict, iWordsEnd, tIndexSettings.m_eHitless, bWordDict, &g_tThrottle, uVersion>=31 );
32683 
32684 	DWORD uEntry = 0;
32685 	while ( pReader->Read() )
32686 	{
32687 		if ( pReader->m_iDocs > SPH_SKIPLIST_BLOCK )
32688 		{
32689 			EntrySkips_t & t = dSkips.Add();
32690 			t.m_uEntry = uEntry;
32691 			t.m_iDoclist = pReader->m_iDoclistOffset;
32692 			t.m_iSkiplist = -1;
32693 		}
32694 		if ( ++uEntry==0 )
32695 			sphDie ( "skiplists upgrade: dictionaries over 4B entries are not supported yet!" );
32696 	}
32697 
32698 	// sort by doclist offset
32699 	dSkips.Sort ( sphMemberLess ( &EntrySkips_t::m_iDoclist ) );
32700 
32701 	// walk doclists, create skiplists
32702 	CSphAutoreader rdDocs;
32703 	if ( !rdDocs.Open ( sFilename.SetSprintf ( "%s.spd", sPath ), sError ) )
32704 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
32705 
32706 	CSphWriter wrSkips;
32707 	if ( !wrSkips.OpenFile ( sFilename.SetSprintf ( "%s.spe.tmp", sPath ), sError ) )
32708 		sphDie ( "skiplists upgrade: failed to create %s", sFilename.cstr() );
32709 	wrSkips.PutByte ( 1 );
32710 
32711 	int iDone = -1;
32712 	CSphVector<SkiplistEntry_t> dSkiplist;
32713 	ARRAY_FOREACH ( i, dSkips )
32714 	{
32715 		// seek to that keyword
32716 		// OPTIMIZE? use length hint from dict too?
32717 		rdDocs.SeekTo ( dSkips[i].m_iDoclist, READ_NO_SIZE_HINT );
32718 
32719 		// decode interesting bits of doclist
32720 		SphDocID_t uDocid = SphDocID_t ( iMinDocid );
32721 		SphOffset_t uHitPosition = 0;
32722 		DWORD uDocs = 0;
32723 
32724 		for ( ;; )
32725 		{
32726 			// save current entry position
32727 			SphOffset_t uPos = rdDocs.GetPos();
32728 
32729 			// decode next entry
32730 			SphDocID_t uDelta = rdDocs.UnzipDocid();
32731 			if ( !uDelta )
32732 				break;
32733 
32734 			// build skiplist, aka save decoder state as needed
32735 			if ( ( uDocs & ( SPH_SKIPLIST_BLOCK-1 ) )==0 )
32736 			{
32737 				SkiplistEntry_t & t = dSkiplist.Add();
32738 				t.m_iBaseDocid = uDocid;
32739 				t.m_iOffset = uPos;
32740 				t.m_iBaseHitlistPos = uHitPosition;
32741 			}
32742 			uDocs++;
32743 
32744 			// do decode
32745 			uDocid += uDelta; // track delta-encoded docid
32746 			if ( tIndexSettings.m_eHitFormat==SPH_HIT_FORMAT_INLINE )
32747 			{
32748 				DWORD uHits = rdDocs.UnzipInt();
32749 				rdDocs.UnzipInt(); // skip hit field mask/data
32750 				if ( uHits==1 )
32751 				{
32752 					rdDocs.UnzipInt(); // skip inlined field id
32753 				} else
32754 				{
32755 					uHitPosition += rdDocs.UnzipOffset(); // track delta-encoded hitlist offset
32756 				}
32757 			} else
32758 			{
32759 				uHitPosition += rdDocs.UnzipOffset(); // track delta-encoded hitlist offset
32760 				rdDocs.UnzipInt(); // skip hit field mask/data
32761 				rdDocs.UnzipInt(); // skip hit count
32762 			}
32763 		}
32764 
32765 		// alright, we built it, so save it
32766 		assert ( uDocs>SPH_SKIPLIST_BLOCK );
32767 		assert ( dSkiplist.GetLength() );
32768 
32769 		dSkips[i].m_iSkiplist = (int)wrSkips.GetPos();
32770 		SkiplistEntry_t tLast = dSkiplist[0];
32771 		for ( int j=1; j<dSkiplist.GetLength(); j++ )
32772 		{
32773 			const SkiplistEntry_t & t = dSkiplist[j];
32774 			assert ( t.m_iBaseDocid - tLast.m_iBaseDocid>=SPH_SKIPLIST_BLOCK );
32775 			assert ( t.m_iOffset - tLast.m_iOffset>=4*SPH_SKIPLIST_BLOCK );
32776 			wrSkips.ZipOffset ( t.m_iBaseDocid - tLast.m_iBaseDocid - SPH_SKIPLIST_BLOCK );
32777 			wrSkips.ZipOffset ( t.m_iOffset - tLast.m_iOffset - 4*SPH_SKIPLIST_BLOCK );
32778 			wrSkips.ZipOffset ( t.m_iBaseHitlistPos - tLast.m_iBaseHitlistPos );
32779 			tLast = t;
32780 		}
32781 		dSkiplist.Resize ( 0 );
32782 
32783 		// progress bar
32784 		int iDone2 = (1+i)*100 / dSkips.GetLength();
32785 		if ( iDone2!=iDone )
32786 		{
32787 			iDone = iDone2;
32788 			fprintf ( stdout, "skiplists upgrade: building skiplists, %d%% done\r", iDone );
32789 		}
32790 	}
32791 	fprintf ( stdout, "skiplists upgrade: building skiplists, 100%% done\n" );
32792 
32793 	// finalize
32794 	wrSkips.CloseFile ();
32795 	if ( wrSkips.IsError() )
32796 		sphDie ( "skiplists upgrade: write error (out of space?)" );
32797 	if ( rdDocs.GetErrorFlag() )
32798 		sphDie ( "skiplists upgrade: doclist read error: %s", rdDocs.GetErrorMessage().cstr() );
32799 
32800 	// sort by entry id again
32801 	dSkips.Sort ( sphMemberLess ( &EntrySkips_t::m_uEntry ) );
32802 
32803 	/////////////////////////////
32804 	// write new dictionary file
32805 	/////////////////////////////
32806 
32807 	// converted dict writer
32808 	CSphWriter wrDict;
32809 	sFilename.SetSprintf ( "%s.spi.upgrade", sPath );
32810 	if ( !wrDict.OpenFile ( sFilename, sError ) )
32811 		sphDie ( "skiplists upgrade: failed to create %s", sFilename.cstr() );
32812 	wrDict.PutByte ( 1 );
32813 
32814 	// handy entry iterator
32815 	// we will use this one to decode entries, and rdDict for other raw access
32816 	pReader->Setup ( &rdDict, iWordsEnd, tIndexSettings.m_eHitless, bWordDict, &g_tThrottle, uVersion>=31 );
32817 
32818 	// we have to adjust some of the entries
32819 	// thus we also have to recompute the offset in the checkpoints too
32820 	//
32821 	// infix hashes (if any) in dict=keywords refer to checkpoints by numbers
32822 	// so infix data can simply be copied around
32823 
32824 	// new checkpoints
32825 	CSphVector<CSphWordlistCheckpoint> dNewCP;
32826 	int iLastCheckpoint = 0;
32827 
32828 	// skiplist lookup
32829 	EntrySkips_t * pSkips = dSkips.Begin();
32830 
32831 	// dict encoder state
32832 	SphWordID_t uLastWordid = 0; // crc case
32833 	SphOffset_t iLastDoclist = 0; // crc case
32834 	CSphKeywordDeltaWriter tLastKeyword; // keywords case
32835 	DWORD uWordCount = 0;
32836 
32837 	// read old entries, write new entries
32838 	while ( pReader->Read() )
32839 	{
32840 		// update or regenerate checkpoint
32841 		if ( ( !bConvertCheckpoints && iLastCheckpoint!=pReader->GetCheckpoint() )
32842 			|| ( bConvertCheckpoints && ( uWordCount % SPH_WORDLIST_CHECKPOINT )==0 ) )
32843 		{
32844 			// FIXME? GetCheckpoint() is for some reason 1-based
32845 			if ( uWordCount )
32846 			{
32847 				wrDict.ZipInt ( 0 );
32848 				if ( bWordDict )
32849 					wrDict.ZipInt ( 0 );
32850 				else
32851 					wrDict.ZipOffset ( pReader->m_iDoclistOffset - iLastDoclist );
32852 			}
32853 			uLastWordid = 0;
32854 			iLastDoclist = 0;
32855 
32856 			CSphWordlistCheckpoint & tCP = dNewCP.Add();
32857 			if ( bWordDict )
32858 			{
32859 				tCP.m_sWord = strdup ( (const char*)pReader->GetWord() );
32860 				tLastKeyword.Reset();
32861 			} else
32862 			{
32863 				tCP.m_uWordID = pReader->m_uWordID;
32864 			}
32865 			tCP.m_iWordlistOffset = wrDict.GetPos();
32866 			iLastCheckpoint = pReader->GetCheckpoint();
32867 		}
32868 
32869 		// resave entry
32870 		if ( bWordDict )
32871 		{
32872 			// keywords dict path
32873 			const int iLen = strlen ( (const char*)pReader->GetWord() );
32874 			tLastKeyword.PutDelta ( wrDict, pReader->GetWord(), iLen );
32875 			wrDict.ZipOffset ( pReader->m_iDoclistOffset );
32876 			wrDict.ZipInt ( pReader->m_iDocs );
32877 			wrDict.ZipInt ( pReader->m_iHits );
32878 			if ( pReader->m_iDocs>=DOCLIST_HINT_THRESH )
32879 				wrDict.PutByte ( pReader->m_iHint );
32880 		} else
32881 		{
32882 			// crc dict path
32883 			assert ( pReader->m_uWordID > uLastWordid );
32884 			assert ( pReader->m_iDoclistOffset > iLastDoclist );
32885 			wrDict.ZipOffset ( pReader->m_uWordID - uLastWordid );
32886 			wrDict.ZipOffset ( pReader->m_iDoclistOffset - iLastDoclist );
32887 			wrDict.ZipInt ( pReader->m_iDocs );
32888 			wrDict.ZipInt ( pReader->m_iHits );
32889 			uLastWordid = pReader->m_uWordID;
32890 			iLastDoclist = pReader->m_iDoclistOffset;
32891 		}
32892 
32893 		// emit skiplist pointer
32894 		if ( pReader->m_iDocs > SPH_SKIPLIST_BLOCK )
32895 		{
32896 			// lots of checks
32897 			if ( uWordCount!=pSkips->m_uEntry )
32898 				sphDie ( "skiplist upgrade: internal error, entry mismatch (expected %d, got %d)",
32899 					uWordCount, pSkips->m_uEntry );
32900 			if ( pReader->m_iDoclistOffset!=pSkips->m_iDoclist )
32901 				sphDie ( "skiplist upgrade: internal error, offset mismatch (expected %lld, got %lld)",
32902 					INT64 ( pReader->m_iDoclistOffset ), INT64 ( pSkips->m_iDoclist ) );
32903 			if ( pSkips->m_iSkiplist<0 )
32904 				sphDie ( "skiplist upgrade: internal error, bad skiplist offset %d",
32905 					pSkips->m_iSkiplist	);
32906 
32907 			// and a bit of work
32908 			wrDict.ZipInt ( pSkips->m_iSkiplist );
32909 			pSkips++;
32910 		}
32911 
32912 		// next entry
32913 		uWordCount++;
32914 	}
32915 
32916 	// finalize last keywords block
32917 	wrDict.ZipInt ( 0 );
32918 	if ( bWordDict )
32919 		wrDict.ZipInt ( 0 );
32920 	else
32921 		wrDict.ZipOffset ( rdDocs.GetFilesize() - iLastDoclist );
32922 
32923 	rdDocs.Close();
32924 	SafeDelete ( pReader );
32925 
32926 	// copy infix hash entries, if any
32927 	int iDeltaInfix = 0;
32928 	if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
32929 	{
32930 		if ( iWordsEnd!=rdDict.GetPos() )
32931 			sphDie ( "skiplist upgrade: internal error, infix hash position mismatch (expected=%lld, got=%lld)",
32932 				INT64 ( iWordsEnd ), INT64 ( rdDict.GetPos() ) );
32933 		iDeltaInfix = (int)( wrDict.GetPos() - rdDict.GetPos() );
32934 		CopyBytes ( wrDict, rdDict, (int)( tDictHeader.m_iDictCheckpointsOffset - iWordsEnd ) );
32935 	}
32936 
32937 	// write new checkpoints
32938 	if ( tDictHeader.m_iDictCheckpointsOffset!=rdDict.GetPos() )
32939 		sphDie ( "skiplist upgrade: internal error, checkpoints position mismatch (expected=%lld, got=%lld)",
32940 			INT64 ( tDictHeader.m_iDictCheckpointsOffset ), INT64 ( rdDict.GetPos() ) );
32941 	if ( !bConvertCheckpoints && tDictHeader.m_iDictCheckpoints!=dNewCP.GetLength() )
32942 		sphDie ( "skiplist upgrade: internal error, checkpoint count mismatch (old=%d, new=%d)",
32943 			tDictHeader.m_iDictCheckpoints, dNewCP.GetLength() );
32944 
32945 	tDictHeader.m_iDictCheckpoints = dNewCP.GetLength();
32946 	tDictHeader.m_iDictCheckpointsOffset = wrDict.GetPos();
32947 	ARRAY_FOREACH ( i, dNewCP )
32948 	{
32949 		if ( bWordDict )
32950 		{
32951 			wrDict.PutString ( dNewCP[i].m_sWord );
32952 			SafeDeleteArray ( dNewCP[i].m_sWord );
32953 		} else
32954 		{
32955 			wrDict.PutOffset ( dNewCP[i].m_uWordID );
32956 		}
32957 		wrDict.PutOffset ( dNewCP[i].m_iWordlistOffset );
32958 	}
32959 
32960 	// update infix hash blocks, if any
32961 	// (they store direct offsets to infix hash, which just got moved)
32962 	if ( bWordDict && tDictHeader.m_iInfixCodepointBytes )
32963 	{
32964 		rdDict.SeekTo ( tDictHeader.m_iInfixBlocksOffset, READ_NO_SIZE_HINT );
32965 		int iBlocks = rdDict.UnzipInt();
32966 
32967 		wrDict.PutBytes ( g_sTagInfixBlocks, strlen ( g_sTagInfixBlocks ) );
32968 		tDictHeader.m_iInfixBlocksOffset = wrDict.GetPos();
32969 		if ( tDictHeader.m_iInfixBlocksOffset>UINT_MAX ) // FIXME!!! change to int64
32970 			sphDie ( "INTERNAL ERROR: dictionary size " INT64_FMT " overflow at infix blocks save", wrDict.GetPos() );
32971 
32972 		wrDict.ZipInt ( iBlocks );
32973 		for ( int i=0; i<iBlocks; i++ )
32974 		{
32975 			char sInfix[256];
32976 			int iBytes = rdDict.GetByte();
32977 			rdDict.GetBytes ( sInfix, iBytes );
32978 			wrDict.PutByte ( iBytes );
32979 			wrDict.PutBytes ( sInfix, iBytes );
32980 			wrDict.ZipInt ( rdDict.UnzipInt() + iDeltaInfix );
32981 		}
32982 	}
32983 
32984 	// emit new aux tail header
32985 	if ( bWordDict )
32986 	{
32987 		wrDict.PutBytes ( "dict-header", 11 );
32988 		wrDict.ZipInt ( tDictHeader.m_iDictCheckpoints );
32989 		wrDict.ZipOffset ( tDictHeader.m_iDictCheckpointsOffset );
32990 		wrDict.ZipInt ( tDictHeader.m_iInfixCodepointBytes );
32991 		wrDict.ZipInt ( (DWORD)tDictHeader.m_iInfixBlocksOffset );
32992 	}
32993 
32994 	wrDict.CloseFile();
32995 	if ( wrDict.IsError() )
32996 		sphDie ( "skiplists upgrade: dict write error (out of space?)" );
32997 
32998 	rdDict.Close();
32999 
33000 	////////////////////
33001 	// build min-max attribute index
33002 	////////////////////
33003 
33004 	bool bShuffleAttributes = false;
33005 	if ( uVersion<20 )
33006 	{
33007 		int iStride = DOCINFO_IDSIZE + tSchema.GetRowSize();
33008 		int iEntrySize = sizeof(DWORD)*iStride;
33009 
33010 		sFilename.SetSprintf ( "%s.spa", sPath );
33011 		CSphAutofile rdDocinfo ( sFilename.cstr(), SPH_O_READ, sError );
33012 		if ( rdDocinfo.GetFD()<0 )
33013 			sphDie ( "skiplists upgrade: %s", sError.cstr() );
33014 
33015 		sFilename.SetSprintf ( "%s.spa.upgrade", sPath );
33016 		CSphWriter wrDocinfo;
33017 		if ( !wrDocinfo.OpenFile ( sFilename.cstr(), sError ) )
33018 			sphDie ( "skiplists upgrade: %s", sError.cstr() );
33019 
33020 		CSphFixedVector<DWORD> dMva ( 0 );
33021 		CSphAutofile tMvaFile ( sFilename.cstr(), SPH_O_READ, sError );
33022 		if ( tMvaFile.GetFD()>=0 && tMvaFile.GetSize()>0 )
33023 		{
33024 			uint64_t uMvaSize = tMvaFile.GetSize();
33025 			assert ( uMvaSize/sizeof(DWORD)<=UINT_MAX );
33026 			dMva.Reset ( (int)( uMvaSize/sizeof(DWORD) ) );
33027 			tMvaFile.Read ( dMva.Begin(), uMvaSize, sError );
33028 		}
33029 		tMvaFile.Close();
33030 
33031 		int64_t iDocinfoSize = rdDocinfo.GetSize ( iEntrySize, true, sError ) / sizeof(CSphRowitem);
33032 		assert ( iDocinfoSize / iStride < UINT_MAX );
33033 		int iRows = (int)(iDocinfoSize/iStride);
33034 
33035 		AttrIndexBuilder_c tBuilder ( tSchema );
33036 		int64_t iMinMaxSize = tBuilder.GetExpectedSize ( tStats.m_iTotalDocuments );
33037 		if ( iMinMaxSize>INT_MAX )
33038 			sphDie ( "attribute files (.spa) over 128 GB are not supported" );
33039 		CSphFixedVector<CSphRowitem> dMinMax ( (int)iMinMaxSize );
33040 		tBuilder.Prepare ( dMinMax.Begin(), dMinMax.Begin() + dMinMax.GetLength() ); // FIXME!!! for over INT_MAX blocks
33041 
33042 		CSphFixedVector<CSphRowitem> dRow ( iStride );
33043 
33044 		uMinMaxIndex = 0;
33045 		for ( int i=0; i<iRows; i++ )
33046 		{
33047 			rdDocinfo.Read ( dRow.Begin(), iStride*sizeof(CSphRowitem), sError );
33048 			wrDocinfo.PutBytes ( dRow.Begin(), iStride*sizeof(CSphRowitem) );
33049 
33050 			if ( !tBuilder.Collect ( dRow.Begin(), dMva.Begin(), dMva.GetLength(), sError, true ) )
33051 				sphDie ( "skiplists upgrade: %s", sError.cstr() );
33052 
33053 			uMinMaxIndex += iStride;
33054 
33055 			int iDone1 = ( 1+i ) * 100 / iRows;
33056 			int iDone2 = ( 2+i ) * 100 / iRows;
33057 			if ( iDone1!=iDone2 )
33058 				fprintf ( stdout, "skiplists upgrade: building attribute min-max, %d%% done\r", iDone1 );
33059 		}
33060 		fprintf ( stdout, "skiplists upgrade: building attribute min-max, 100%% done\n" );
33061 
33062 		tBuilder.FinishCollect();
33063 		rdDocinfo.Close();
33064 
33065 		wrDocinfo.PutBytes ( dMinMax.Begin(), dMinMax.GetLength()*sizeof(CSphRowitem) );
33066 		wrDocinfo.CloseFile();
33067 		if ( wrDocinfo.IsError() )
33068 			sphDie ( "skiplists upgrade: attribute write error (out of space?)" );
33069 
33070 		bShuffleAttributes = true;
33071 	}
33072 
33073 
33074 	////////////////////
33075 	// write new header
33076 	////////////////////
33077 
33078 	ISphTokenizer * pTokenizer = ISphTokenizer::Create ( tTokenizerSettings, &tEmbeddedFiles, sError );
33079 	if ( !pTokenizer )
33080 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
33081 
33082 	CSphDict * pDict = bWordDict
33083 		? sphCreateDictionaryKeywords ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError )
33084 		: sphCreateDictionaryCRC ( tDictSettings, &tEmbeddedFiles, pTokenizer, "$indexname", sError );
33085 	if ( !pDict )
33086 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
33087 
33088 	CSphWriter wrHeader;
33089 	sFilename.SetSprintf ( "%s.sph.upgrade", sPath );
33090 	if ( !wrHeader.OpenFile ( sFilename, sError ) )
33091 		sphDie ( "skiplists upgrade: %s", sError.cstr() );
33092 
33093 	wrHeader.PutDword ( INDEX_MAGIC_HEADER );
33094 	wrHeader.PutDword ( INDEX_FORMAT_VERSION );
33095 	wrHeader.PutDword ( bUse64 );
33096 	wrHeader.PutDword ( eDocinfo );
33097 	WriteSchema ( wrHeader, tSchema );
33098 	wrHeader.PutOffset ( iMinDocid );
33099 	wrHeader.PutOffset ( tDictHeader.m_iDictCheckpointsOffset );
33100 	wrHeader.PutDword ( tDictHeader.m_iDictCheckpoints );
33101 	wrHeader.PutByte ( tDictHeader.m_iInfixCodepointBytes );
33102 	wrHeader.PutDword ( (DWORD)tDictHeader.m_iInfixBlocksOffset );
33103 	wrHeader.PutDword ( tDictHeader.m_iInfixBlocksWordsSize );
33104 	wrHeader.PutDword ( (DWORD)tStats.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
33105 	wrHeader.PutOffset ( tStats.m_iTotalBytes );
33106 	SaveIndexSettings ( wrHeader, tIndexSettings );
33107 	SaveTokenizerSettings ( wrHeader, pTokenizer, tIndexSettings.m_iEmbeddedLimit );
33108 	SaveDictionarySettings ( wrHeader, pDict, false, tIndexSettings.m_iEmbeddedLimit );
33109 	wrHeader.PutDword ( iKillListSize );
33110 	wrHeader.PutOffset ( uMinMaxIndex );
33111 	SaveFieldFilterSettings ( wrHeader, pFieldFilter );
33112 
33113 	// average field lengths
33114 	if ( tIndexSettings.m_bIndexFieldLens )
33115 		ARRAY_FOREACH ( i, tSchema.m_dFields )
33116 			wrHeader.PutOffset ( dFieldLens[i] );
33117 
33118 	wrHeader.CloseFile ();
33119 	if ( wrHeader.IsError() )
33120 		sphDie ( "skiplists upgrade: header write error (out of space?)" );
33121 
33122 	sFilename.SetSprintf ( "%s.sps", sPath );
33123 	if ( !sphIsReadable ( sFilename.cstr(), NULL ) )
33124 	{
33125 		CSphWriter wrStrings;
33126 		if ( !wrStrings.OpenFile ( sFilename, sError ) )
33127 			sphDie ( "skiplists upgrade: %s", sError.cstr() );
33128 
33129 		wrStrings.PutByte ( 0 );
33130 		wrStrings.CloseFile();
33131 		if ( wrStrings.IsError() )
33132 			sphDie ( "skiplists upgrade: string write error (out of space?)" );
33133 	}
33134 
33135 	// all done!
33136 	const char * sRenames[] = {
33137 		".spe.tmp", ".spe",
33138 		".sph", ".sph.bak",
33139 		".spi", ".spi.bak",
33140 		".sph.upgrade", ".sph",
33141 		".spi.upgrade", ".spi",
33142 		bShuffleAttributes ? ".spa" : NULL, ".spa.bak",
33143 		".spa.upgrade", ".spa",
33144 		NULL };
33145 	FinalizeUpgrade ( sRenames, "skiplists upgrade", sPath, tmStart );
33146 }
33147 
33148 
Touch(const CSphString & sFilename)33149 bool CSphGlobalIDF::Touch ( const CSphString & sFilename )
33150 {
33151 	// update m_uMTime, return true if modified
33152 	struct_stat tStat;
33153 	memset ( &tStat, 0, sizeof ( tStat ) );
33154 	if ( stat ( sFilename.cstr(), &tStat ) < 0 )
33155 		memset ( &tStat, 0, sizeof ( tStat ) );
33156 	bool bModified = ( m_uMTime!=tStat.st_mtime );
33157 	m_uMTime = tStat.st_mtime;
33158 	return bModified;
33159 }
33160 
33161 
bool CSphGlobalIDF::Preread ( const CSphString & sFilename, CSphString & sError )
{
	// Load a global IDF file into memory and, for large files, build a
	// bucket-hash lookup table over it. Visible file layout: one SphOffset_t
	// total-documents counter, then an array of IDFWord_t entries (presumably
	// sorted by wordid, as GetDocs() binary-searches it — confirm with writer).
	// Returns false on open/alloc failure or when interrupted.

	// refresh m_uMTime so later Touch() calls can detect file changes
	Touch ( sFilename );

	CSphAutoreader tReader;
	if ( !tReader.Open ( sFilename, sError ) )
		return false;

	m_iTotalDocuments = tReader.GetOffset ();
	const SphOffset_t iSize = tReader.GetFilesize () - sizeof(SphOffset_t);
	m_iTotalWords = iSize/sizeof(IDFWord_t);

	// allocate words cache
	CSphString sWarning;
	if ( !m_pWords.Alloc ( m_iTotalWords, sError, sWarning ) )
		return false;

	// allocate lookup table if needed
	// NOTE(review): the "more than 8 words per bucket" threshold looks like a
	// memory/speed heuristic — confirm before tuning
	int iHashSize = (int)( U64C(1) << HASH_BITS );
	if ( m_iTotalWords > iHashSize*8 )
	{
		// +2 slots: pHash[0] holds the shift amount, pHash[1..iHashSize+1]
		// hold per-bucket start offsets into the word array (see GetDocs)
		if ( !m_pHash.Alloc ( iHashSize+2, sError, sWarning ) )
			return false;
	}

	// read file into memory (may exceed 2GB)
	const int iBlockSize = 10485760; // 10M block
	for ( SphOffset_t iRead=0; iRead<iSize && !sphInterrupted(); iRead+=iBlockSize )
		tReader.GetBytes ( (BYTE*)m_pWords.GetWritePtr()+iRead, iRead+iBlockSize>iSize ? (int)( iSize-iRead ) : iBlockSize );

	if ( sphInterrupted() )
		return false;

	// build lookup table
	if ( m_pHash.GetLengthBytes () )
	{
		int64_t * pHash = m_pHash.GetWritePtr();

		// a word's bucket is (wordid - first wordid) >> iShift; pick the
		// smallest shift that maps the whole id range into 2^HASH_BITS buckets
		uint64_t uFirst = m_pWords[0].m_uWordID;
		uint64_t uRange = m_pWords[m_iTotalWords-1].m_uWordID - uFirst;

		DWORD iShift = 0;
		while ( uRange>=( U64C(1) << HASH_BITS ) )
		{
			iShift++;
			uRange >>= 1;
		}

		pHash[0] = iShift;
		pHash[1] = 0; // bucket 0 always starts at word 0
		DWORD uLastHash = 0;

		for ( int64_t i=1; i<m_iTotalWords; i++ )
		{
			// check for interrupt (throttled for speed)
			if ( ( i & 0xffff )==0 && sphInterrupted() )
				return false;

			DWORD uHash = (DWORD)( ( m_pWords[i].m_uWordID-uFirst ) >> iShift );

			if ( uHash==uLastHash )
				continue;

			// bucket changed; record start index i for this bucket and for
			// every empty bucket we skipped over
			while ( uLastHash<uHash )
				pHash [ ++uLastHash+1 ] = i;

			uLastHash = uHash;
		}
		// terminate the final bucket so lookups can use pHash[h+2]-1 as end
		pHash [ ++uLastHash+1 ] = m_iTotalWords;
	}
	return true;
}
33234 
33235 
GetDocs(const CSphString & sWord) const33236 DWORD CSphGlobalIDF::GetDocs ( const CSphString & sWord ) const
33237 {
33238 	const char * s = sWord.cstr();
33239 
33240 	// replace = to MAGIC_WORD_HEAD_NONSTEMMED for exact terms
33241 	char sBuf [ 3*SPH_MAX_WORD_LEN+4 ];
33242 	if ( *s && *s=='=' )
33243 	{
33244 		strncpy ( sBuf, sWord.cstr(), sizeof(sBuf) );
33245 		sBuf[0] = MAGIC_WORD_HEAD_NONSTEMMED;
33246 		s = sBuf;
33247 	}
33248 
33249 	uint64_t uWordID = sphFNV64(s);
33250 
33251 	int64_t iStart = 0;
33252 	int64_t iEnd = m_iTotalWords-1;
33253 
33254 	const IDFWord_t * pWords = (IDFWord_t *)m_pWords.GetWritePtr ();
33255 
33256 	if ( m_pHash.GetLengthBytes () )
33257 	{
33258 		uint64_t uFirst = pWords[0].m_uWordID;
33259 		DWORD uHash = (DWORD)( ( uWordID-uFirst ) >> m_pHash[0] );
33260 		if ( uHash > ( U64C(1) << HASH_BITS ) )
33261 			return 0;
33262 
33263 		iStart = m_pHash [ uHash+1 ];
33264 		iEnd = m_pHash [ uHash+2 ] - 1;
33265 	}
33266 
33267 	const IDFWord_t * pWord = sphBinarySearch ( pWords+iStart, pWords+iEnd, bind ( &IDFWord_t::m_uWordID ), uWordID );
33268 	return pWord ? pWord->m_iDocs : 0;
33269 }
33270 
33271 
GetIDF(const CSphString & sWord,int64_t iDocsLocal,bool bPlainIDF)33272 float CSphGlobalIDF::GetIDF ( const CSphString & sWord, int64_t iDocsLocal, bool bPlainIDF )
33273 {
33274 	const int64_t iDocs = Max ( iDocsLocal, (int64_t)GetDocs ( sWord ) );
33275 	const int64_t iTotalClamped = Max ( m_iTotalDocuments, iDocs );
33276 
33277 	if ( !iDocs )
33278 		return 0.0f;
33279 
33280 	if ( bPlainIDF )
33281 	{
33282 		float fLogTotal = logf ( float ( 1+iTotalClamped ) );
33283 		return logf ( float ( iTotalClamped-iDocs+1 ) / float ( iDocs ) )
33284 			/ ( 2*fLogTotal );
33285 	} else
33286 	{
33287 		float fLogTotal = logf ( float ( 1+iTotalClamped ) );
33288 		return logf ( float ( iTotalClamped ) / float ( iDocs ) )
33289 			/ ( 2*fLogTotal );
33290 	}
33291 }
33292 
33293 
sphPrereadGlobalIDF(const CSphString & sPath,CSphString & sError)33294 bool sphPrereadGlobalIDF ( const CSphString & sPath, CSphString & sError )
33295 {
33296 	g_tGlobalIDFLock.Lock ();
33297 
33298 	CSphGlobalIDF ** ppGlobalIDF = g_hGlobalIDFs ( sPath );
33299 	bool bExpired = ( ppGlobalIDF && *ppGlobalIDF && (*ppGlobalIDF)->Touch ( sPath ) );
33300 
33301 	if ( !ppGlobalIDF || bExpired )
33302 	{
33303 		if ( bExpired )
33304 			sphLogDebug ( "Reloading global IDF (%s)", sPath.cstr() );
33305 		else
33306 			sphLogDebug ( "Loading global IDF (%s)", sPath.cstr() );
33307 
33308 		// unlock while prereading
33309 		g_tGlobalIDFLock.Unlock ();
33310 
33311 		CSphGlobalIDF * pGlobalIDF = new CSphGlobalIDF ();
33312 		if ( !pGlobalIDF->Preread ( sPath, sError ) )
33313 		{
33314 			SafeDelete ( pGlobalIDF );
33315 			return false;
33316 		}
33317 
33318 		// lock while updating
33319 		g_tGlobalIDFLock.Lock ();
33320 
33321 		if ( bExpired )
33322 		{
33323 			ppGlobalIDF = g_hGlobalIDFs ( sPath );
33324 			if ( ppGlobalIDF )
33325 			{
33326 				CSphGlobalIDF * pOld = *ppGlobalIDF;
33327 				*ppGlobalIDF = pGlobalIDF;
33328 				SafeDelete ( pOld );
33329 			}
33330 		} else
33331 		{
33332 			if ( !g_hGlobalIDFs.Add ( pGlobalIDF, sPath ) )
33333 				SafeDelete ( pGlobalIDF );
33334 		}
33335 	}
33336 
33337 	g_tGlobalIDFLock.Unlock ();
33338 
33339 	return true;
33340 }
33341 
33342 
sphUpdateGlobalIDFs(const CSphVector<CSphString> & dFiles)33343 void sphUpdateGlobalIDFs ( const CSphVector<CSphString> & dFiles )
33344 {
33345 	// delete unlisted entries
33346 	g_tGlobalIDFLock.Lock ();
33347 	g_hGlobalIDFs.IterateStart ();
33348 	while ( g_hGlobalIDFs.IterateNext () )
33349 	{
33350 		const CSphString & sKey = g_hGlobalIDFs.IterateGetKey ();
33351 		if ( !dFiles.Contains ( sKey ) )
33352 		{
33353 			sphLogDebug ( "Unloading global IDF (%s)", sKey.cstr() );
33354 			SafeDelete ( g_hGlobalIDFs.IterateGet () );
33355 			g_hGlobalIDFs.Delete ( sKey );
33356 		}
33357 	}
33358 	g_tGlobalIDFLock.Unlock ();
33359 
33360 	// load/rotate remaining entries
33361 	CSphString sError;
33362 	ARRAY_FOREACH ( i, dFiles )
33363 	{
33364 		CSphString sPath = dFiles[i];
33365 		if ( !sphPrereadGlobalIDF ( sPath, sError ) )
33366 			sphLogDebug ( "Could not load global IDF (%s): %s", sPath.cstr(), sError.cstr() );
33367 	}
33368 }
33369 
33370 
sphShutdownGlobalIDFs()33371 void sphShutdownGlobalIDFs ()
33372 {
33373 	CSphVector<CSphString> dEmptyFiles;
33374 	sphUpdateGlobalIDFs ( dEmptyFiles );
33375 }
33376 
33377 //////////////////////////////////////////////////////////////////////////
33378 
33379 //
33380 // $Id$
33381 //
33382