1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #ifndef _sphinx_
17 #define _sphinx_
18 
19 /////////////////////////////////////////////////////////////////////////////
20 
21 #ifdef _WIN32
22 	#define USE_MYSQL		1	/// whether to compile MySQL support
23 	#define USE_PGSQL		0	/// whether to compile PgSQL support
24 	#define USE_ODBC		1	/// whether to compile ODBC support
25 	#define USE_LIBEXPAT	1	/// whether to compile libexpat support
26 	#define USE_LIBICONV	1	/// whether to compile iconv support
	#define	USE_LIBSTEMMER	0	/// whether to compile libstemmer support
28 	#define	USE_RE2			0	/// whether to compile RE2 support
29 	#define USE_RLP			0	/// whether to compile RLP support
30 	#define USE_WINDOWS		1	/// whether to compile for Windows
31 	#define USE_SYSLOG		0	/// whether to use syslog for logging
32 	#define HAVE_STRNLEN	1
33 
34 	#define UNALIGNED_RAM_ACCESS	1
35 	#define USE_LITTLE_ENDIAN		1
36 #else
37 	#define USE_WINDOWS		0	/// whether to compile for Windows
38 #endif
39 
40 /////////////////////////////////////////////////////////////////////////////
41 
42 #include "sphinxstd.h"
43 #include "sphinxexpr.h" // to remove?
44 
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <limits.h>
49 
50 #ifdef HAVE_CONFIG_H
51 #include "config.h"
52 #endif
53 
54 #if USE_PGSQL
55 #include <libpq-fe.h>
56 #endif
57 
58 #if USE_WINDOWS
59 #include <winsock2.h>
60 #else
61 #include <sys/types.h>
62 #include <unistd.h>
63 #endif
64 
65 #if USE_MYSQL
66 #include <mysql.h>
67 #endif
68 
69 #if USE_WINDOWS
70 typedef __int64				SphOffset_t;
71 #define STDOUT_FILENO		fileno(stdout)
72 #define STDERR_FILENO		fileno(stderr)
73 #else
74 typedef off_t				SphOffset_t;
75 #endif
76 
77 #if USE_ODBC
78 #include <sqlext.h>
79 #endif
80 
81 /////////////////////////////////////////////////////////////////////////////
82 
83 #ifndef USE_64BIT
84 #define USE_64BIT 1
85 #endif
86 
87 #if USE_64BIT
88 
89 // use 64-bit unsigned integers to store document and word IDs
90 #define SPHINX_BITS_TAG	"-id64"
91 typedef uint64_t		SphWordID_t;
92 typedef uint64_t		SphDocID_t;
93 
94 #define DOCID_MAX		U64C(0xffffffffffffffff)
95 #define DOCID_FMT		UINT64_FMT
96 #define DOCINFO_IDSIZE	2
97 
98 STATIC_SIZE_ASSERT ( SphWordID_t, 8 );
99 STATIC_SIZE_ASSERT ( SphDocID_t, 8 );
100 
101 #else
102 
103 // use 32-bit unsigned integers to store document and word IDs
104 #define SPHINX_BITS_TAG	""
105 typedef DWORD			SphWordID_t;
106 typedef DWORD			SphDocID_t;
107 
108 #define DOCID_MAX		0xffffffffUL
109 #define DOCID_FMT		"%u"
110 #define DOCINFO_IDSIZE	1
111 
112 STATIC_SIZE_ASSERT ( SphWordID_t, 4 );
113 STATIC_SIZE_ASSERT ( SphDocID_t, 4 );
114 
115 #endif // USE_64BIT
116 
117 #define DWSIZEOF(a) ( sizeof(a) / sizeof(DWORD) )
118 
119 //////////////////////////////////////////////////////////////////////////
120 
121 /// row entry (storage only, does not necessarily map 1:1 to attributes)
122 typedef DWORD			CSphRowitem;
123 typedef const BYTE *	CSphRowitemPtr;
124 
/// widest integer type that can be stored as an attribute (ideally, fully decoupled from rowitem size!)
126 typedef int64_t			SphAttr_t;
127 
128 const CSphRowitem		ROWITEM_MAX		= UINT_MAX;
129 const int				ROWITEM_BITS	= 8*sizeof(CSphRowitem);
130 const int				ROWITEMPTR_BITS	= 8*sizeof(CSphRowitemPtr);
131 const int				ROWITEM_SHIFT	= 5;
132 
133 STATIC_ASSERT ( ( 1 << ROWITEM_SHIFT )==ROWITEM_BITS, INVALID_ROWITEM_SHIFT );
134 
135 #ifndef USE_LITTLE_ENDIAN
136 #error Please define endianness
137 #endif
138 
139 template < typename DOCID >
140 inline DOCID DOCINFO2ID_T ( const DWORD * pDocinfo );
141 
DOCINFO2ID_T(const DWORD * pDocinfo)142 template<> inline DWORD DOCINFO2ID_T ( const DWORD * pDocinfo )
143 {
144 	return pDocinfo[0];
145 }
146 
DOCINFO2ID_T(const DWORD * pDocinfo)147 template<> inline uint64_t DOCINFO2ID_T ( const DWORD * pDocinfo )
148 {
149 #if USE_LITTLE_ENDIAN
150 	return uint64_t(pDocinfo[0]) + (uint64_t(pDocinfo[1])<<32);
151 #else
152 	return uint64_t(pDocinfo[1]) + (uint64_t(pDocinfo[0])<<32);
153 #endif
154 }
155 
DOCINFOSETID(DWORD * pDocinfo,DWORD uValue)156 inline void DOCINFOSETID ( DWORD * pDocinfo, DWORD uValue )
157 {
158 	*pDocinfo = uValue;
159 }
160 
DOCINFOSETID(DWORD * pDocinfo,uint64_t uValue)161 inline void DOCINFOSETID ( DWORD * pDocinfo, uint64_t uValue )
162 {
163 #if USE_LITTLE_ENDIAN
164 	pDocinfo[0] = (DWORD)uValue;
165 	pDocinfo[1] = (DWORD)(uValue>>32);
166 #else
167 	pDocinfo[0] = (DWORD)(uValue>>32);
168 	pDocinfo[1] = (DWORD)uValue;
169 #endif
170 }
171 
DOCINFO2ID(const DWORD * pDocinfo)172 inline SphDocID_t DOCINFO2ID ( const DWORD * pDocinfo )
173 {
174 	return DOCINFO2ID_T<SphDocID_t> ( pDocinfo );
175 }
176 
/// step over the document ID (DWSIZEOF(DOCID) dwords) at the head of a docinfo row
/// the pointer is asserted to be non-NULL in PARANOID builds only
template < typename DOCID >
inline DWORD * DOCINFO2ATTRS_T ( DWORD * pDocinfo )
{
#if PARANOID
	assert ( pDocinfo );
#endif
	return pDocinfo + DWSIZEOF(DOCID);
}

/// const flavor of the above
template < typename DOCID >
inline const DWORD * DOCINFO2ATTRS_T ( const DWORD * pDocinfo )
{
#if PARANOID
	assert ( pDocinfo );
#endif
	return pDocinfo + DWSIZEOF(DOCID);
}

/// step back from the attributes pointer by DWSIZEOF(DOCID) dwords (inverse of DOCINFO2ATTRS_T)
/// the pointer is asserted to be non-NULL in PARANOID builds only
template < typename DOCID >
inline DWORD * STATIC2DOCINFO_T ( DWORD * pAttrs )
{
#if PARANOID
	assert ( pAttrs );
#endif
	return pAttrs - DWSIZEOF(DOCID);
}

/// const flavor of the above
template < typename DOCID >
inline const DWORD * STATIC2DOCINFO_T ( const DWORD * pAttrs )
{
#if PARANOID
	assert ( pAttrs );
#endif
	return pAttrs - DWSIZEOF(DOCID);
}
188 
/// docinfo row => pointer past the document ID, for the configured SphDocID_t width
inline DWORD * DOCINFO2ATTRS ( DWORD * pDocinfo )
{
	return DOCINFO2ATTRS_T<SphDocID_t> ( pDocinfo );
}

/// const flavor of the above
inline const DWORD * DOCINFO2ATTRS ( const DWORD * pDocinfo )
{
	return DOCINFO2ATTRS_T<SphDocID_t> ( pDocinfo );
}

/// attributes pointer => docinfo row start, for the configured SphDocID_t width
inline DWORD * STATIC2DOCINFO ( DWORD * pAttrs )
{
	return STATIC2DOCINFO_T<SphDocID_t> ( pAttrs );
}

/// const flavor of the above
inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs )
{
	return STATIC2DOCINFO_T<SphDocID_t> ( pAttrs );
}
193 
194 
195 /////////////////////////////////////////////////////////////////////////////
196 
197 #ifdef BUILD_WITH_CMAKE
198 	#include "gen_sphinxversion.h"
199 #else
200 	#include "sphinxversion.h"
201 #endif
202 
203 #ifndef SPHINX_TAG
204 #define SPHINX_TAG "-release"
205 #endif
206 
207 // below is for easier extraction of the ver. by any external scripts
208 #define SPHINX_VERSION_NUMBERS    "2.2.11"
209 
210 #define SPHINX_VERSION           SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
211 #define SPHINX_BANNER			"Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
212 #define SPHINX_SEARCHD_PROTO	1
213 #define SPHINX_CLIENT_VERSION	1
214 
215 #define SPH_MAX_WORD_LEN		42		// so that any UTF-8 word fits 127 bytes
216 #define SPH_MAX_FILENAME_LEN	512
217 #define SPH_MAX_FIELDS			256
218 
219 /////////////////////////////////////////////////////////////////////////////
220 
221 extern int64_t g_iIndexerCurrentDocID;
222 extern int64_t g_iIndexerCurrentHits;
223 extern int64_t g_iIndexerCurrentRangeMin;
224 extern int64_t g_iIndexerCurrentRangeMax;
225 extern int64_t g_iIndexerPoolStartDocID;
226 extern int64_t g_iIndexerPoolStartHit;
227 
228 /////////////////////////////////////////////////////////////////////////////
229 
230 /// Sphinx CRC32 implementation
231 extern DWORD	g_dSphinxCRC32 [ 256 ];
232 DWORD			sphCRC32 ( const void * pString );
233 DWORD			sphCRC32 ( const void * pString, int iLen );
234 DWORD			sphCRC32 ( const void * pString, int iLen, DWORD uPrevCRC );
235 
/// Fast check if our endianness is correct
237 const char*		sphCheckEndian();
238 
239 /// Sphinx FNV64 implementation
240 const uint64_t	SPH_FNV64_SEED = 0xcbf29ce484222325ULL;
241 uint64_t		sphFNV64 ( const void * pString );
242 uint64_t		sphFNV64 ( const void * s, int iLen, uint64_t uPrev = SPH_FNV64_SEED );
243 uint64_t		sphFNV64cont ( const void * pString, uint64_t uPrev );
244 
245 /// calculate file crc32
246 bool			sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 );
247 
248 /// try to obtain an exclusive lock on specified file
249 /// bWait specifies whether to wait
250 bool			sphLockEx ( int iFile, bool bWait );
251 
252 /// remove existing locks
253 void			sphLockUn ( int iFile );
254 
255 /// millisecond-precision sleep
256 void			sphSleepMsec ( int iMsec );
257 
258 /// check if file exists and is a readable file
259 bool			sphIsReadable ( const char * sFilename, CSphString * pError=NULL );
260 
261 /// set throttling options
262 void			sphSetThrottling ( int iMaxIOps, int iMaxIOSize );
263 
264 /// immediately interrupt current query
265 void			sphInterruptNow();
266 
267 /// check if we got interrupted
268 bool			sphInterrupted();
269 
270 #if !USE_WINDOWS
271 /// set process info
272 void			sphSetProcessInfo ( bool bHead );
273 #endif
274 
275 
276 /// initialize IO statistics collecting
277 bool			sphInitIOStats ();
278 
279 /// clean up IO statistics collector
280 void			sphDoneIOStats ();
281 
282 
283 class CSphIOStats
284 {
285 public:
286 	int64_t		m_iReadTime;
287 	DWORD		m_iReadOps;
288 	int64_t		m_iReadBytes;
289 	int64_t		m_iWriteTime;
290 	DWORD		m_iWriteOps;
291 	int64_t		m_iWriteBytes;
292 
293 	CSphIOStats ();
294 	~CSphIOStats ();
295 
296 	void		Start();
297 	void		Stop();
298 
299 	void		Add ( const CSphIOStats & b );
IsEnabled()300 	bool		IsEnabled() { return m_bEnabled; }
301 
302 private:
303 	bool		m_bEnabled;
304 	CSphIOStats * m_pPrev;
305 };
306 
307 
308 //////////////////////////////////////////////////////////////////////////
309 
310 #if UNALIGNED_RAM_ACCESS
311 
/// pass-through wrapper
/// used when UNALIGNED_RAM_ACCESS is set, so a plain copy of the referenced value is returned
template < typename T > inline T sphUnalignedRead ( const T & tRef )
{
	T tValue = tRef;
	return tValue;
}
317 
/// pass-through wrapper
/// used when UNALIGNED_RAM_ACCESS is set, so the value is stored with a plain assignment through a T pointer
template < typename T > void sphUnalignedWrite ( void * pPtr, const T & tVal )
{
	T * pTyped = (T*)pPtr;
	*pTyped = tVal;
}
323 
324 #else
325 
326 /// unaligned read wrapper for some architectures (eg. SPARC)
327 template < typename T >
sphUnalignedRead(const T & tRef)328 inline T sphUnalignedRead ( const T & tRef )
329 {
330 	T uTmp;
331 	BYTE * pSrc = (BYTE *) &tRef;
332 	BYTE * pDst = (BYTE *) &uTmp;
333 	for ( int i=0; i<(int)sizeof(T); i++ )
334 		*pDst++ = *pSrc++;
335 	return uTmp;
336 }
337 
338 /// unaligned write wrapper for some architectures (eg. SPARC)
339 template < typename T >
sphUnalignedWrite(void * pPtr,const T & tVal)340 void sphUnalignedWrite ( void * pPtr, const T & tVal )
341 {
342 	BYTE * pDst = (BYTE *) pPtr;
343 	BYTE * pSrc = (BYTE *) &tVal;
344 	for ( int i=0; i<(int)sizeof(T); i++ )
345 		*pDst++ = *pSrc++;
346 }
347 
#endif // unaligned
349 
350 
351 #if UNALIGNED_RAM_ACCESS && USE_LITTLE_ENDIAN
352 /// get a dword from memory, intel version
sphGetDword(const BYTE * p)353 inline DWORD sphGetDword ( const BYTE * p )
354 {
355 	return *(const DWORD*)p;
356 }
357 #else
358 /// get a dword from memory, non-intel version
sphGetDword(const BYTE * p)359 inline DWORD sphGetDword ( const BYTE * p )
360 {
361 	return p[0] + ( p[1]<<8 ) + ( p[2]<<16 ) + ( p[3]<<24 );
362 }
363 #endif
364 
365 
366 int sphUTF8Len ( const char * pStr );
367 
/// check for valid attribute name char
/// accepts ASCII digits, ASCII letters, and underscore; returns 1 if accepted, 0 otherwise
inline int sphIsAttr ( int c )
{
	// different from sphIsAlpha() in that we don't allow minus
	const bool bDigit = ( c>='0' && c<='9' );
	const bool bLower = ( c>='a' && c<='z' );
	const bool bUpper = ( c>='A' && c<='Z' );
	const bool bUnderscore = ( c=='_' );
	return bDigit || bLower || bUpper || bUnderscore;
}
374 
375 /////////////////////////////////////////////////////////////////////////////
376 // TOKENIZERS
377 /////////////////////////////////////////////////////////////////////////////
378 
379 extern const char *		SPHINX_DEFAULT_UTF8_TABLE;
380 
381 /////////////////////////////////////////////////////////////////////////////
382 
/// lowercaser remap range
/// three plain codes: source range start, source range end, and remap target start
struct CSphRemapRange
{
	int			m_iStart;
	int			m_iEnd;
	int			m_iRemapStart;

	/// default ctor, sets every field to -1
	CSphRemapRange ()
		: m_iStart ( -1 ), m_iEnd ( -1 ), m_iRemapStart ( -1 )
	{
	}

	/// explicit ctor, stores the given values as is
	CSphRemapRange ( int iStart, int iEnd, int iRemapStart )
		: m_iStart ( iStart ), m_iEnd ( iEnd ), m_iRemapStart ( iRemapStart )
	{
	}
};
402 
403 
404 inline bool operator < ( const CSphRemapRange & a, const CSphRemapRange & b )
405 {
406 	return a.m_iStart < b.m_iStart;
407 }
408 
409 
410 /// lowercaser
411 class CSphLowercaser
412 {
413 	friend class ISphTokenizer;
414 	friend class CSphTokenizerBase;
415 	friend class CSphTokenizer_UTF8_Base;
416 	friend class CSphTokenizerBase2;
417 
418 public:
419 				CSphLowercaser ();
420 				~CSphLowercaser ();
421 
422 	void		Reset ();
423 	void		SetRemap ( const CSphLowercaser * pLC );
424 	void		AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags );
425 	void		AddSpecials ( const char * sSpecials );
426 	uint64_t	GetFNV () const;
427 
428 public:
429 	const CSphLowercaser &		operator = ( const CSphLowercaser & rhs );
430 
431 public:
ToLower(int iCode)432 	inline int	ToLower ( int iCode ) const
433 	{
434 		if ( iCode<0 || iCode>=MAX_CODE )
435 			return iCode;
436 		int * pChunk = m_pChunk [ iCode >> CHUNK_BITS ];
437 		if ( pChunk )
438 			return pChunk [ iCode & CHUNK_MASK ];
439 		return 0;
440 	}
441 
442 	int GetMaxCodepointLength () const;
443 
444 protected:
445 	static const int	CHUNK_COUNT	= 0x300;
446 	static const int	CHUNK_BITS	= 8;
447 
448 	static const int	CHUNK_SIZE	= 1 << CHUNK_BITS;
449 	static const int	CHUNK_MASK	= CHUNK_SIZE - 1;
450 	static const int	MAX_CODE	= CHUNK_COUNT * CHUNK_SIZE;
451 
452 	int					m_iChunks;					///< how much chunks are actually allocated
453 	int *				m_pData;					///< chunks themselves
454 	int *				m_pChunk [ CHUNK_COUNT ];	///< pointers to non-empty chunks
455 };
456 
457 /////////////////////////////////////////////////////////////////////////////
458 
/// saved file description: a file name along with its size, timestamps, and CRC32
struct CSphSavedFile
{
	CSphString			m_sFilename;	///< file name
	SphOffset_t			m_uSize;		///< file size
	SphOffset_t			m_uCTime;		///< NOTE(review): presumably the ctime timestamp - confirm
	SphOffset_t			m_uMTime;		///< NOTE(review): presumably the mtime timestamp - confirm
	DWORD				m_uCRC32;		///< CRC32 value

						CSphSavedFile ();
};
469 
470 
/// embedded files container
/// for each of synonyms, stopwords, and wordforms, holds an "embedded" flag,
/// the saved file description(s), and the embedded contents themselves
struct CSphEmbeddedFiles
{
	bool						m_bEmbeddedSynonyms;	///< synonyms embedded flag
	bool						m_bEmbeddedStopwords;	///< stopwords embedded flag
	bool						m_bEmbeddedWordforms;	///< wordforms embedded flag
	CSphSavedFile				m_tSynonymFile;			///< synonyms file description
	CSphVector<CSphString>		m_dSynonyms;			///< synonyms, as strings
	CSphVector<CSphSavedFile>	m_dStopwordFiles;		///< stopwords files descriptions
	CSphVector<SphWordID_t>		m_dStopwords;			///< stopwords, as word ids
	CSphVector<CSphString>		m_dWordforms;			///< wordforms, as strings
	CSphVector<CSphSavedFile>	m_dWordformFiles;		///< wordforms files descriptions

								CSphEmbeddedFiles ();

	void						Reset();
};
487 
488 
/// tokenizer settings
/// accepted by ISphTokenizer::Setup() and ISphTokenizer::Create(), returned by ISphTokenizer::GetSettings()
struct CSphTokenizerSettings
{
	int					m_iType;			///< tokenizer type (the set of valid values is not visible in this header)
	CSphString			m_sCaseFolding;		///< case folding specification
	int					m_iMinWordLen;		///< min word length
	CSphString			m_sSynonymsFile;	///< synonyms file name
	CSphString			m_sBoundary;		///< boundary chars specification
	CSphString			m_sIgnoreChars;		///< ignored chars specification
	int					m_iNgramLen;		///< n-gram length
	CSphString			m_sNgramChars;		///< n-gram chars specification
	CSphString			m_sBlendChars;		///< blended chars specification
	CSphString			m_sBlendMode;		///< blended tokens processing mode
	CSphString			m_sIndexingPlugin;	///< this tokenizer wants an external plugin to process its raw output

						CSphTokenizerSettings ();
};
505 
506 
/// bigram indexing mode, as accepted by ISphTokenizer::CreateBigramFilter()
enum ESphBigram
{
	SPH_BIGRAM_NONE			= 0,	///< no bigrams
	SPH_BIGRAM_ALL			= 1,	///< index all word pairs
	SPH_BIGRAM_FIRSTFREQ	= 2,	///< only index pairs where one of the words is in a frequent words list
	SPH_BIGRAM_BOTHFREQ		= 3		///< only index pairs where both words are in a frequent words list
};
514 
515 
/// tokenizer clone mode, as accepted by ISphTokenizer::Clone()
enum ESphTokenizerClone
{
	SPH_CLONE_INDEX,				///< clone tokenizer and set indexing mode
	SPH_CLONE_QUERY,				///< clone tokenizer and set querying mode
	SPH_CLONE_QUERY_LIGHTWEIGHT		///< lightweight clone for querying (can parse, can NOT modify settings, shares pointers to the original lowercaser table)
};
522 
523 
/// token morphology state, as returned by ISphTokenizer::GetTokenMorph()
enum ESphTokenMorph
{
	SPH_TOKEN_MORPH_RAW,			///< no morphology applied, tokenizer does not handle morphology
	SPH_TOKEN_MORPH_ORIGINAL,		///< no morphology applied, but tokenizer handles morphology
	SPH_TOKEN_MORPH_GUESS			///< morphology applied
};
530 
531 
532 struct CSphMultiformContainer;
533 class CSphWriter;
534 
/// generic tokenizer
/// abstract interface; implementations must at least provide synonyms loading and writing,
/// buffer and token accessors, codepoint length queries, and Clone() (see the pure virtuals below)
class ISphTokenizer
{
public:
	/// trivial ctor
									ISphTokenizer();

	/// trivial dtor
	virtual							~ISphTokenizer () {}

public:
	/// set new translation table
	/// returns true on success, false on failure
	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError );

	/// add additional character as valid (with folding to itself)
	virtual void					AddPlainChar ( char c );

	/// add special chars to translation table
	/// updates lowercaser so that these remap to -1
	virtual void					AddSpecials ( const char * sSpecials );

	/// set ignored characters
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError );

	/// set n-gram characters (for CJK n-gram indexing)
	/// the default implementation ignores its arguments and reports success
	virtual bool					SetNgramChars ( const char *, CSphString & ) { return true; }

	/// set n-gram length (for CJK n-gram indexing)
	/// the default implementation does nothing
	virtual void					SetNgramLen ( int ) {}

	/// load synonyms list
	virtual bool					LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) = 0;

	/// write synonyms to file
	virtual void					WriteSynonyms ( CSphWriter & tWriter ) = 0;

	/// set phrase boundary chars
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError );

	/// set blended characters
	virtual bool					SetBlendChars ( const char * sConfig, CSphString & sError );

	/// set blended tokens processing mode
	virtual bool					SetBlendMode ( const char * sMode, CSphString & sError );

	/// setup tokenizer using given settings
	virtual void					Setup ( const CSphTokenizerSettings & tSettings );

	/// create a tokenizer using the given settings
	static ISphTokenizer *			Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError );

	/// create a multiform token filter
	static ISphTokenizer *			CreateMultiformFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );

	/// create a bigram token filter
	static ISphTokenizer *			CreateBigramFilter ( ISphTokenizer * pTokenizer, ESphBigram eBigramIndex, const CSphString & sBigramWords, CSphString & sError );

	/// create a plugin filter
	/// sSpec is a library, name, and options specification string, eg "myplugins.dll:myfilter1:arg1=123"
	static ISphTokenizer *			CreatePluginFilter ( ISphTokenizer * pTokenizer, const CSphString & sSpec, CSphString & sError );

#if USE_RLP
	/// create a RLP token filter
	static ISphTokenizer *			CreateRLPFilter ( ISphTokenizer * pTokenizer, bool bChineseRLP, const char * szRLPRoot,	const char * szRLPEnv, const char * szRLPCtx, bool bStandalone, CSphString & sError );

	/// create a filter to split an RLP-processed token stream into tokens
	static ISphTokenizer *			CreateRLPResultSplitter ( ISphTokenizer * pTokenizer, const char * szRLPCtx );

	/// split query string with an RLP token filter
	static bool						ProcessQueryRLP ( const char * sRLPContext, const char * sQuery, const char ** sProcessed, CSphTightVector<char> & dBuf, CSphString & sError );
#endif

	/// get tokenizer settings
	virtual const CSphTokenizerSettings &	GetSettings () const { return m_tSettings; }

	/// get synonym file info
	virtual const CSphSavedFile &	GetSynFileInfo () const { return m_tSynFileInfo; }

public:
	/// pass next buffer
	virtual void					SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;

	/// set current index schema (only intended for the token filter plugins)
	/// the default implementation ignores its arguments and reports success
	virtual bool					SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }

	/// set per-document options from INSERT
	/// the default implementation ignores its arguments and reports success
	virtual bool					SetFilterOptions ( const char *, CSphString & ) { return true; }

	/// notify tokenizer that we now begin indexing a field with a given number (only intended for the token filter plugins)
	/// the default implementation does nothing
	virtual void					BeginField ( int ) {}

	/// get next token
	virtual BYTE *					GetToken () = 0;

	/// calc codepoint length
	virtual int						GetCodepointLength ( int iCode ) const = 0;

	/// get max codepoint length
	virtual int						GetMaxCodepointLength () const = 0;

	/// enable indexing-time sentence boundary detection, and paragraph indexing
	virtual bool					EnableSentenceIndexing ( CSphString & sError );

	/// enable zone indexing
	virtual bool					EnableZoneIndexing ( CSphString & sError );

	/// shows whether morphology needs to be applied to this token or not
	/// the default implementation always returns true
	virtual bool					GetMorphFlag () const { return true; }

	/// enable tokenized multiform tracking
	/// the default implementation does nothing
	virtual void					EnableTokenizedMultiformTracking () {}

	/// get last token length, in codepoints
	virtual int						GetLastTokenLen () const { return m_iLastTokenLen; }

	/// get last token boundary flag (true if there was a boundary before the token)
	virtual bool					GetBoundary () { return m_bTokenBoundary; }

	/// get byte offset of the last boundary character
	virtual int						GetBoundaryOffset () { return m_iBoundaryOffset; }

	/// was last token a special one?
	virtual bool					WasTokenSpecial () { return m_bWasSpecial; }

	/// was last token a synonym one?
	virtual bool					WasTokenSynonym () const { return m_bWasSynonym; }

	/// get amount of overshort keywords skipped before this token
	/// reports 0 when the last token is a blended part (m_bBlendedPart) but not a blended token itself (m_bBlended)
	virtual int						GetOvershortCount () { return ( !m_bBlended && m_bBlendedPart ? 0 : m_iOvershortCount ); }

	/// get original tokenized multiform (if any); NULL means there was none
	virtual BYTE *					GetTokenizedMultiform () { return NULL; }

	/// was last token a part of multi-wordforms destination
	/// head parameter might be useful to distinguish between sequence of different multi-wordforms
	virtual bool					WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const = 0;

	/// get morphology state of the last token (see ESphTokenMorph)
	ESphTokenMorph					GetTokenMorph() const { return m_eTokenMorph; }

	/// was last token a blended one?
	virtual bool					TokenIsBlended () const { return m_bBlended; }

	/// was last token a normal subtoken of a blended token?
	virtual bool					TokenIsBlendedPart () const { return m_bBlendedPart; }

	/// skip blended parts; the default implementation does nothing and returns 0
	/// NOTE(review): the meaning of the returned count is not visible in this header - confirm against implementations
	virtual int						SkipBlended () { return 0; }

	/// get embedded tokenizer (if any); the default implementation has none and returns NULL
	virtual ISphTokenizer *			GetEmbeddedTokenizer () const { return NULL; }

public:
	/// spawn a clone of my own
	virtual ISphTokenizer *			Clone ( ESphTokenizerClone eMode ) const = 0;

	/// start buffer point of last token
	virtual const char *			GetTokenStart () const = 0;

	/// end buffer point of last token (exclusive, ie. *GetTokenEnd() is already NOT part of a token!)
	virtual const char *			GetTokenEnd () const = 0;

	/// current buffer ptr
	virtual const char *			GetBufferPtr () const = 0;

	/// buffer end
	virtual const char *			GetBufferEnd () const = 0;

	/// set new buffer ptr (must be within current bounds)
	virtual void					SetBufferPtr ( const char * sNewPtr ) = 0;

	/// get settings hash
	virtual uint64_t				GetSettingsFNV () const;

	/// get (readonly) lowercaser
	const CSphLowercaser &			GetLowercaser() const { return m_tLC; }

	/// get an RLP context path (if any)
	virtual const char * GetRLPContext () const { return NULL; }

protected:
	// internal helpers; implemented outside of this header
	virtual bool					RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError );
	virtual bool					AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError );

protected:
	static const int				MAX_SYNONYM_LEN		= 1024;	///< max synonyms map-from length, bytes

	// blended variants flags, combined into the m_uBlendVariants and m_uBlendVariantsPending masks
	static const BYTE				BLEND_TRIM_NONE		= 1;
	static const BYTE				BLEND_TRIM_HEAD		= 2;
	static const BYTE				BLEND_TRIM_TAIL		= 4;
	static const BYTE				BLEND_TRIM_BOTH		= 8;

	CSphLowercaser					m_tLC;						///< my lowercaser
	int								m_iLastTokenLen;			///< last token length, in codepoints
	bool							m_bTokenBoundary;			///< last token boundary flag (true after boundary codepoint followed by separator)
	bool							m_bBoundary;				///< boundary flag (true immediately after boundary codepoint)
	int								m_iBoundaryOffset;			///< boundary character offset (in bytes)
	bool							m_bWasSpecial;				///< special token flag
	bool							m_bWasSynonym;				///< last token is a synonym token
	bool							m_bEscaped;					///< backslash handling flag
	int								m_iOvershortCount;			///< skipped overshort tokens count
	ESphTokenMorph					m_eTokenMorph;				///< whether last token was a generated morphological guess

	bool							m_bBlended;					///< whether last token (as in just returned from GetToken()) was blended
	bool							m_bNonBlended;				///< internal, whether there were any normal chars in that blended token
	bool							m_bBlendedPart;				///< whether last token is a normal subtoken of a blended token
	bool							m_bBlendAdd;				///< whether we have more pending blended variants (of current accumulator) to return
	BYTE							m_uBlendVariants;			///< mask of blended variants as requested by blend_mode (see BLEND_TRIM_xxx flags)
	BYTE							m_uBlendVariantsPending;	///< mask of pending blended variants (we clear bits as we return variants)
	bool							m_bBlendSkipPure;			///< skip purely blended tokens

	bool							m_bShortTokenFilter;		///< short token filter flag
	bool							m_bDetectSentences;			///< should we detect sentence boundaries?

	CSphTokenizerSettings			m_tSettings;				///< tokenizer settings
	CSphSavedFile					m_tSynFileInfo;				///< synonyms file info

public:
	bool							m_bPhrase;					///< NOTE(review): presumably a "tokenizing inside a phrase" flag set by callers - confirm
};
749 
750 /// parse charset table
751 bool					sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps );
752 
753 /// create UTF-8 tokenizer
754 ISphTokenizer *			sphCreateUTF8Tokenizer ();
755 
756 /// create UTF-8 tokenizer with n-grams support (for CJK n-gram indexing)
757 ISphTokenizer *			sphCreateUTF8NgramTokenizer ();
758 
759 /////////////////////////////////////////////////////////////////////////////
760 // DICTIONARIES
761 /////////////////////////////////////////////////////////////////////////////
762 
/// dictionary settings
/// accepted by CSphDict::Setup(), returned by CSphDict::GetSettings()
struct CSphDictSettings
{
	CSphString		m_sMorphology;				///< morphology specification
	CSphString		m_sStopwords;				///< stopwords specification
	CSphVector<CSphString> m_dWordforms;		///< wordforms list
	int				m_iMinStemmingLen;			///< min stemming length
	bool			m_bWordDict;				///< NOTE(review): presumably selects dict=keywords over dict=crc - confirm
	bool			m_bCrc32;					///< NOTE(review): presumably selects 32-bit CRC word ids - confirm
	bool			m_bStopwordsUnstemmed;		///< stopwords unstemmed flag
	CSphString		m_sMorphFingerprint;		///< not used for creation; only for a check when loading

	/// defaults: min stemming length 1, word dict on, stopwords unstemmed off;
	/// CRC32 flag is on only when USE_64BIT is 0; string and vector members are default-constructed
	CSphDictSettings ()
		: m_iMinStemmingLen ( 1 )
		, m_bWordDict ( true )
		, m_bCrc32 ( !USE_64BIT )
		, m_bStopwordsUnstemmed ( false )
	{}
};
781 
782 
/// dictionary entry
/// some of the fields might be unused depending on specific dictionary type
/// (plain data; no constructor is declared, so fields are not initialized by the struct itself)
struct CSphDictEntry
{
	SphWordID_t		m_uWordID;			///< keyword id (for dict=crc)
	const BYTE *	m_sKeyword;			///< keyword text (for dict=keywords)
	int				m_iDocs;			///< number of matching documents
	int				m_iHits;			///< number of occurrences
	SphOffset_t		m_iDoclistOffset;	///< absolute document list offset (into .spd)
	SphOffset_t		m_iDoclistLength;	///< document list length in bytes
	SphOffset_t		m_iSkiplistOffset;	///< absolute skiplist offset (into .spe)
	int				m_iDoclistHint;		///< raw document list length hint value (0..255 range, 1 byte)
};
796 
797 
/// stored normal form
/// entries of this type are kept in CSphWordforms::m_dNormalForms
struct CSphStoredNF
{
	CSphString					m_sWord;				///< normal form text
	bool						m_bAfterMorphology;		///< NOTE(review): presumably marks forms to be applied after morphology - confirm
};
804 
805 
/// wordforms container
/// holds the normal forms, a string-keyed hash of ints, and (optionally) a multi-wordforms container
struct CSphWordforms
{
	int							m_iRefCount;			///< reference counter (maintained outside of this header)
	CSphVector<CSphSavedFile>	m_dFiles;				///< descriptions of the source files
	uint64_t					m_uTokenizerFNV;		///< tokenizer FNV hash value (see ISphTokenizer::GetSettingsFNV())
	CSphString					m_sIndexName;			///< index name
	bool						m_bHavePostMorphNF;		///< NOTE(review): presumably set when any normal form has m_bAfterMorphology - confirm
	CSphVector <CSphStoredNF>	m_dNormalForms;			///< stored normal forms
	CSphMultiformContainer *	m_pMultiWordforms;		///< multi-wordforms container
	CSphOrderedHash < int, CSphString, CSphStrHashFunc, 1048576 >	m_dHash;	///< string => int hash; NOTE(review): values are presumably indexes into m_dNormalForms - confirm

	CSphWordforms ();
	~CSphWordforms ();

	/// NOTE(review): presumably compares dFiles against m_dFiles - confirm against the implementation
	bool						IsEqual ( const CSphVector<CSphSavedFile> & dFiles );

	/// convert a word to its normal form, in place (pWord is not const)
	/// NOTE(review): bBefore presumably selects pre-morphology vs post-morphology forms - confirm
	bool						ToNormalForm ( BYTE * pWord, bool bBefore ) const;
};
824 
825 
826 /// abstract word dictionary interface
827 struct CSphWordHit;
828 class CSphAutofile;
829 struct DictHeader_t;
830 struct ThrottleState_t;
831 class CSphDict
832 {
833 public:
834 	static const int	ST_OK = 0;
835 	static const int	ST_ERROR = 1;
836 	static const int	ST_WARNING = 2;
837 
838 public:
839 	/// virtualizing dtor
~CSphDict()840 	virtual				~CSphDict () {}
841 
842 	/// Get word ID by word, "text" version
843 	/// may apply stemming and modify word inplace
	/// modified word may become bigger than the original one, so make sure you have enough space in the buffer pointed to by pWord
845 	/// a general practice is to use char[3*SPH_MAX_WORD_LEN+4] as a buffer
846 	/// returns 0 for stopwords
847 	virtual SphWordID_t	GetWordID ( BYTE * pWord ) = 0;
848 
849 	/// get word ID by word, "text" version
850 	/// may apply stemming and modify word inplace
851 	/// accepts words with already prepended MAGIC_WORD_HEAD
852 	/// appends MAGIC_WORD_TAIL
853 	/// returns 0 for stopwords
GetWordIDWithMarkers(BYTE * pWord)854 	virtual SphWordID_t	GetWordIDWithMarkers ( BYTE * pWord ) { return GetWordID ( pWord ); }
855 
856 	/// get word ID by word, "text" version
857 	/// does NOT apply stemming
858 	/// accepts words with already prepended MAGIC_WORD_HEAD_NONSTEMMED
859 	/// returns 0 for stopwords
GetWordIDNonStemmed(BYTE * pWord)860 	virtual SphWordID_t	GetWordIDNonStemmed ( BYTE * pWord ) { return GetWordID ( pWord ); }
861 
862 	/// get word ID by word, "binary" version
863 	/// only used with prefix/infix indexing
864 	/// must not apply stemming and modify anything
865 	/// filters stopwords on request
866 	virtual SphWordID_t	GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) = 0;
867 
868 	/// apply stemmers to the given word
ApplyStemmers(BYTE *)869 	virtual void		ApplyStemmers ( BYTE * ) const {}
870 
871 	/// load stopwords from given files
872 	virtual void		LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) = 0;
873 
874 	/// load stopwords from an array
875 	virtual void		LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) = 0;
876 
877 	/// write stopwords to a file
878 	virtual void		WriteStopwords ( CSphWriter & tWriter ) = 0;
879 
880 	/// load wordforms from a given list of files
881 	virtual bool		LoadWordforms ( const CSphVector<CSphString> &, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) = 0;
882 
883 	/// write wordforms to a file
884 	virtual void		WriteWordforms ( CSphWriter & tWriter ) = 0;
885 
886 	/// get wordforms
GetWordforms()887 	virtual const CSphWordforms *	GetWordforms() { return NULL; }
888 
889 	/// disable wordforms processing
DisableWordforms()890 	virtual void		DisableWordforms() {}
891 
892 	/// set morphology
893 	/// returns 0 on success, 1 on hard error, 2 on a warning (see ST_xxx constants)
894 	virtual int			SetMorphology ( const char * szMorph, CSphString & sMessage ) = 0;
895 
896 	/// are there any morphological processors?
HasMorphology()897 	virtual bool		HasMorphology () const { return false; }
898 
899 	/// morphological data fingerprint (lemmatizer filenames and crc32s)
GetMorphDataFingerprint()900 	virtual const CSphString &	GetMorphDataFingerprint () const { return m_sMorphFingerprint; }
901 
902 	/// setup dictionary using settings
903 	virtual void		Setup ( const CSphDictSettings & tSettings ) = 0;
904 
905 	/// get dictionary settings
906 	virtual const CSphDictSettings & GetSettings () const = 0;
907 
908 	/// stopwords file infos
909 	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () = 0;
910 
911 	/// wordforms file infos
912 	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () = 0;
913 
914 	/// get multiwordforms
915 	virtual const CSphMultiformContainer * GetMultiWordforms () const = 0;
916 
917 	/// check what given word is stopword
918 	virtual bool IsStopWord ( const BYTE * pWord ) const = 0;
919 
920 public:
921 	/// enable actually collecting keywords (needed for stopwords/wordforms loading)
HitblockBegin()922 	virtual void			HitblockBegin () {}
923 
924 	/// callback to let dictionary do hit block post-processing
HitblockPatch(CSphWordHit *,int)925 	virtual void			HitblockPatch ( CSphWordHit *, int ) const {}
926 
927 	/// resolve temporary hit block wide wordid (!) back to keyword
HitblockGetKeyword(SphWordID_t)928 	virtual const char *	HitblockGetKeyword ( SphWordID_t ) { return NULL; }
929 
930 	/// check current memory usage
HitblockGetMemUse()931 	virtual int				HitblockGetMemUse () { return 0; }
932 
933 	/// hit block dismissed
HitblockReset()934 	virtual void			HitblockReset () {}
935 
936 public:
937 	/// begin creating dictionary file, setup any needed internal structures
938 	virtual void			DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
939 
940 	/// add next keyword entry to final dict
941 	virtual void			DictEntry ( const CSphDictEntry & tEntry );
942 
943 	/// flush last entry
944 	virtual void			DictEndEntries ( SphOffset_t iDoclistOffset );
945 
946 	/// end indexing, store dictionary and checkpoints
947 	virtual bool			DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle );
948 
949 	/// check whether there were any errors during indexing
950 	virtual bool			DictIsError () const;
951 
952 public:
953 	/// check whether this dict is stateful (when it comes to lookups)
HasState()954 	virtual bool			HasState () const { return false; }
955 
956 	/// make a clone
Clone()957 	virtual CSphDict *		Clone () const { return NULL; }
958 
959 	/// get settings hash
960 	virtual uint64_t		GetSettingsFNV () const = 0;
961 
962 	/// apply morphology or not
963 	virtual void			SetApplyMorph ( bool bApply ) = 0;
964 
965 protected:
966 	CSphString				m_sMorphFingerprint;
967 };
968 
969 
970 /// traits dictionary factory (no storage, only tokenizing, lemmatizing, etc.)
971 CSphDict * sphCreateDictionaryTemplate ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
972 
973 /// CRC32/FNV64 dictionary factory
974 CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
975 
976 /// keyword-storing dictionary factory
977 CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
978 
979 /// clear wordform cache
980 void sphShutdownWordforms ();
981 
982 /// update/clear global IDF cache
983 bool sphPrereadGlobalIDF ( const CSphString & sPath, CSphString & sError );
984 void sphUpdateGlobalIDFs ( const CSphVector<CSphString> & dFiles );
985 void sphInitGlobalIDFs ();
986 void sphShutdownGlobalIDFs ();
987 
988 /////////////////////////////////////////////////////////////////////////////
989 // DATASOURCES
990 /////////////////////////////////////////////////////////////////////////////
991 
992 /// hit position storage type
993 typedef DWORD Hitpos_t;
994 
995 /// empty hit value
996 #define EMPTY_HIT 0
997 
998 /// hit processing tools
999 /// Hitpos_t consists of three things:
1000 /// 1) high bits store field number
1001 /// 2) middle bit - field end marker
1002 /// 3) lower bits store hit position in field
1003 template < int FIELD_BITS >
1004 class Hitman_c
1005 {
1006 protected:
1007 	enum
1008 	{
1009 		POS_BITS		= 31 - FIELD_BITS,
1010 		FIELD_OFF		= 32 - FIELD_BITS,
1011 		FIELDEND_OFF	= 31 - FIELD_BITS,
1012 		FIELDEND_MASK	= (1UL << POS_BITS),
1013 		POS_MASK		= (1UL << POS_BITS) - 1
1014 	};
1015 
1016 public:
Create(int iField,int iPos)1017 	static Hitpos_t Create ( int iField, int iPos )
1018 	{
1019 		return ( iField << FIELD_OFF ) + ( iPos & POS_MASK );
1020 	}
1021 
Create(int iField,int iPos,bool bEnd)1022 	static Hitpos_t Create ( int iField, int iPos, bool bEnd )
1023 	{
1024 		return ( iField << FIELD_OFF ) + ( ((int)bEnd) << FIELDEND_OFF ) + ( iPos & POS_MASK );
1025 	}
1026 
GetField(Hitpos_t uHitpos)1027 	static inline int GetField ( Hitpos_t uHitpos )
1028 	{
1029 		return uHitpos >> FIELD_OFF;
1030 	}
1031 
GetPos(Hitpos_t uHitpos)1032 	static inline int GetPos ( Hitpos_t uHitpos )
1033 	{
1034 		return uHitpos & POS_MASK;
1035 	}
1036 
IsEnd(Hitpos_t uHitpos)1037 	static inline bool IsEnd ( Hitpos_t uHitpos )
1038 	{
1039 		return ( uHitpos & FIELDEND_MASK )!=0;
1040 	}
1041 
GetPosWithField(Hitpos_t uHitpos)1042 	static inline DWORD GetPosWithField ( Hitpos_t uHitpos )
1043 	{
1044 		return uHitpos & ~FIELDEND_MASK;
1045 	}
1046 
AddPos(Hitpos_t * pHitpos,int iAdd)1047 	static void AddPos ( Hitpos_t * pHitpos, int iAdd )
1048 	{
1049 		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
1050 		*pHitpos += iAdd;
1051 	}
1052 
CreateSum(Hitpos_t uHitpos,int iAdd)1053 	static Hitpos_t CreateSum ( Hitpos_t uHitpos, int iAdd )
1054 	{
1055 		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
1056 		return ( uHitpos+iAdd ) & ~FIELDEND_MASK;
1057 	}
1058 
SetEndMarker(Hitpos_t * pHitpos)1059 	static void SetEndMarker ( Hitpos_t * pHitpos )
1060 	{
1061 		*pHitpos |= FIELDEND_MASK;
1062 	}
1063 };
1064 
1065 // this could be just DWORD[] but it's methods are very handy
1066 // used to store field information e.g. which fields do we need to search in
1067 struct FieldMask_t
1068 {
1069 	static const int SIZE = SPH_MAX_FIELDS/32;
1070 	STATIC_ASSERT ( ( SPH_MAX_FIELDS%32 )==0, ASSUME_MAX_FIELDS_ARE_REPRESENTABLE_BY_DWORD );
1071 	DWORD m_dMask [ SIZE ];
1072 
1073 	// no custom cstr and d-tor - to be usable from inside unions
1074 	// deep copy for it is ok - so, no explicit copying constructor and operator=
1075 
1076 	// old-fashion layer to work with DWORD (32-bit) mask.
1077 	// all bits above 32 assumed to be unset.
Assign32FieldMask_t1078 	void Assign32 ( DWORD uMask )
1079 	{
1080 		UnsetAll();
1081 		m_dMask[0] = uMask;
1082 	}
1083 
GetMask32FieldMask_t1084 	DWORD GetMask32 () const
1085 	{
1086 		return m_dMask[0];
1087 	}
1088 
1089 	DWORD operator[] ( int iIdx ) const
1090 	{
1091 		assert ( 0<=iIdx && iIdx<SIZE );
1092 		return m_dMask [ iIdx ];
1093 	}
1094 
1095 	DWORD & operator[] ( int iIdx )
1096 	{
1097 		assert ( 0<=iIdx && iIdx<SIZE );
1098 		return m_dMask [ iIdx ];
1099 	}
1100 
1101 	// set n-th bit
SetFieldMask_t1102 	void Set ( int iIdx )
1103 	{
1104 		assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
1105 		m_dMask [ iIdx/32 ] |= 1 << ( iIdx%32 );
1106 	}
1107 
1108 	// set all bits
SetAllFieldMask_t1109 	void SetAll()
1110 	{
1111 		memset ( m_dMask, 0xff, sizeof(m_dMask) );
1112 	}
1113 
1114 	// unset n-th bit, or all
UnsetFieldMask_t1115 	void Unset ( int iIdx )
1116 	{
1117 		assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
1118 		m_dMask [ iIdx/32 ] &= ~(1 << ( iIdx%32 ));
1119 	}
1120 
UnsetAllFieldMask_t1121 	void UnsetAll()
1122 	{
1123 		memset ( m_dMask, 0, sizeof(m_dMask) );
1124 	}
1125 
1126 	// test if n-th bit is set
TestFieldMask_t1127 	bool Test ( int iIdx ) const
1128 	{
1129 		assert ( iIdx>=0 && iIdx<(int)sizeof(m_dMask)*8 );
1130 		return ( m_dMask [ iIdx/32 ] & ( 1 << ( iIdx%32 ) ) )!=0;
1131 	}
1132 
1133 	// test if all bits are set or unset
TestAllFieldMask_t1134 	bool TestAll ( bool bSet ) const
1135 	{
1136 		DWORD uTest = bSet ? 0xffffffff : 0;
1137 		for ( int i=0; i<SIZE; i++ )
1138 			if ( m_dMask[i]!=uTest )
1139 				return false;
1140 		return true;
1141 	}
1142 
NegateFieldMask_t1143 	void Negate()
1144 	{
1145 		for ( int i=0; i<SIZE; i++ )
1146 			m_dMask[i] = ~m_dMask[i];
1147 	}
1148 };
1149 
1150 /// hit info
1151 struct CSphWordHit
1152 {
1153 	SphDocID_t		m_uDocID;		///< document ID
1154 	SphWordID_t		m_uWordID;		///< word ID in current dictionary
1155 	Hitpos_t		m_uWordPos;		///< word position in current document
1156 };
1157 
1158 
1159 /// attribute locator within the row
1160 struct CSphAttrLocator
1161 {
1162 	// OPTIMIZE? try packing these
1163 	int				m_iBitOffset;
1164 	int				m_iBitCount;
1165 	bool			m_bDynamic;
1166 
CSphAttrLocatorCSphAttrLocator1167 	CSphAttrLocator ()
1168 		: m_iBitOffset ( -1 )
1169 		, m_iBitCount ( -1 )
1170 		, m_bDynamic ( false )
1171 	{}
1172 
1173 	explicit CSphAttrLocator ( int iBitOffset, int iBitCount=ROWITEM_BITS )
m_iBitOffsetCSphAttrLocator1174 		: m_iBitOffset ( iBitOffset )
1175 		, m_iBitCount ( iBitCount )
1176 		, m_bDynamic ( true )
1177 	{}
1178 
IsBitfieldCSphAttrLocator1179 	inline bool IsBitfield () const
1180 	{
1181 		return ( m_iBitCount<ROWITEM_BITS || ( m_iBitOffset%ROWITEM_BITS )!=0 );
1182 	}
1183 
CalcRowitemCSphAttrLocator1184 	int CalcRowitem () const
1185 	{
1186 		return IsBitfield() ? -1 : ( m_iBitOffset / ROWITEM_BITS );
1187 	}
1188 
IsIDCSphAttrLocator1189 	bool IsID () const
1190 	{
1191 		return m_iBitOffset==-8*(int)sizeof(SphDocID_t) && m_iBitCount==8*sizeof(SphDocID_t);
1192 	}
1193 
1194 #ifndef NDEBUG
1195 	/// get last item touched by this attr (for debugging checks only)
GetMaxRowitemCSphAttrLocator1196 	int GetMaxRowitem () const
1197 	{
1198 		return ( m_iBitOffset + m_iBitCount - 1 ) / ROWITEM_BITS;
1199 	}
1200 #endif
1201 
1202 	bool operator == ( const CSphAttrLocator & rhs ) const
1203 	{
1204 		return m_iBitOffset==rhs.m_iBitOffset && m_iBitCount==rhs.m_iBitCount && m_bDynamic==rhs.m_bDynamic;
1205 	}
1206 };
1207 
1208 
1209 /// getter
sphGetRowAttr(const CSphRowitem * pRow,const CSphAttrLocator & tLoc)1210 inline SphAttr_t sphGetRowAttr ( const CSphRowitem * pRow, const CSphAttrLocator & tLoc )
1211 {
1212 	assert ( pRow );
1213 	int iItem = tLoc.m_iBitOffset >> ROWITEM_SHIFT;
1214 
1215 	if ( tLoc.m_iBitCount==ROWITEM_BITS )
1216 		return pRow[iItem];
1217 
1218 	if ( tLoc.m_iBitCount==2*ROWITEM_BITS ) // FIXME? write a generalized version, perhaps
1219 		return SphAttr_t ( pRow[iItem] ) + ( SphAttr_t ( pRow[iItem+1] ) << ROWITEM_BITS );
1220 
1221 	int iShift = tLoc.m_iBitOffset & ( ( 1 << ROWITEM_SHIFT )-1 );
1222 	return ( pRow[iItem] >> iShift ) & ( ( 1UL << tLoc.m_iBitCount )-1 );
1223 }
1224 
1225 
1226 /// setter
sphSetRowAttr(CSphRowitem * pRow,const CSphAttrLocator & tLoc,SphAttr_t uValue)1227 inline void sphSetRowAttr ( CSphRowitem * pRow, const CSphAttrLocator & tLoc, SphAttr_t uValue )
1228 {
1229 	assert(pRow);
1230 	int iItem = tLoc.m_iBitOffset >> ROWITEM_SHIFT;
1231 	if ( tLoc.m_iBitCount==2*ROWITEM_BITS )
1232 	{
1233 		// FIXME? write a generalized version, perhaps
1234 		pRow[iItem] = CSphRowitem ( uValue & ( ( SphAttr_t(1) << ROWITEM_BITS )-1 ) );
1235 		pRow[iItem+1] = CSphRowitem ( uValue >> ROWITEM_BITS );
1236 
1237 	} else if ( tLoc.m_iBitCount==ROWITEM_BITS )
1238 	{
1239 		pRow[iItem] = CSphRowitem ( uValue );
1240 
1241 	} else
1242 	{
1243 		int iShift = tLoc.m_iBitOffset & ( ( 1 << ROWITEM_SHIFT )-1);
1244 		CSphRowitem uMask = ( ( 1UL << tLoc.m_iBitCount )-1 ) << iShift;
1245 		pRow[iItem] &= ~uMask;
1246 		pRow[iItem] |= ( uMask & ( uValue << iShift ) );
1247 	}
1248 }
1249 
1250 
1251 /// pack length into row storage (22 bits max)
1252 /// returns number of bytes used
sphPackStrlen(BYTE * pRow,int iLen)1253 inline int sphPackStrlen ( BYTE * pRow, int iLen )
1254 {
1255 	assert ( iLen>=0 && iLen<0x400000 );
1256 	if ( iLen<0x80 )
1257 	{
1258 		pRow[0] = BYTE(iLen);
1259 		return 1;
1260 	} else if ( iLen<0x4000 )
1261 	{
1262 		pRow[0] = BYTE ( ( iLen>>8 ) | 0x80 );
1263 		pRow[1] = BYTE ( iLen );
1264 		return 2;
1265 	} else
1266 	{
1267 		pRow[0] = BYTE ( ( iLen>>16 ) | 0xc0 );
1268 		pRow[1] = BYTE ( iLen>>8 );
1269 		pRow[2] = BYTE ( iLen );
1270 		return 3;
1271 	}
1272 }
1273 
1274 
1275 /// unpack string attr from row storage (22 bits length max)
1276 /// returns unpacked length; stores pointer to string data if required
sphUnpackStr(const BYTE * pRow,const BYTE ** ppStr)1277 inline int sphUnpackStr ( const BYTE * pRow, const BYTE ** ppStr )
1278 {
1279 	int v = *pRow++;
1280 	if ( v & 0x80 )
1281 	{
1282 		if ( v & 0x40 )
1283 		{
1284 			v = ( int ( v & 0x3f )<<16 ) + ( int ( *pRow++ )<<8 );
1285 			v += ( *pRow++ ); // MUST be separate statement; cf. sequence point
1286 		} else
1287 		{
1288 			v = ( int ( v & 0x3f )<<8 ) + ( *pRow++ );
1289 		}
1290 	}
1291 	if ( ppStr )
1292 		*ppStr = pRow;
1293 	return v;
1294 }
1295 
1296 
1297 /// search query match (document info plus weight/tag)
1298 class CSphMatch
1299 {
1300 	friend class ISphSchema;
1301 	friend class CSphRsetSchema;
1302 
1303 public:
1304 	SphDocID_t				m_uDocID;		///< document ID
1305 	const CSphRowitem *		m_pStatic;		///< static part (stored in and owned by the index)
1306 	CSphRowitem *			m_pDynamic;		///< dynamic part (computed per query; owned by the match)
1307 	int						m_iWeight;		///< my computed weight
1308 	int						m_iTag;			///< my index tag
1309 
1310 public:
1311 	/// ctor. clears everything
CSphMatch()1312 	CSphMatch ()
1313 		: m_uDocID ( 0 )
1314 		, m_pStatic ( NULL )
1315 		, m_pDynamic ( NULL )
1316 		, m_iWeight ( 0 )
1317 		, m_iTag ( 0 )
1318 	{
1319 	}
1320 
1321 private:
1322 	/// copy ctor. just in case
CSphMatch(const CSphMatch & rhs)1323 	CSphMatch ( const CSphMatch & rhs )
1324 		: m_pStatic ( 0 )
1325 		, m_pDynamic ( NULL )
1326 	{
1327 		*this = rhs;
1328 	}
1329 
1330 public:
1331 	/// dtor. frees everything
~CSphMatch()1332 	~CSphMatch ()
1333 	{
1334 #ifndef NDEBUG
1335 		if ( m_pDynamic )
1336 			m_pDynamic--;
1337 #endif
1338 		SafeDeleteArray ( m_pDynamic );
1339 	}
1340 
1341 	/// reset
Reset(int iDynamic)1342 	void Reset ( int iDynamic )
1343 	{
1344 		// check that we're either initializing a new one, or NOT changing the current size
1345 		assert ( iDynamic>=0 );
1346 		assert ( !m_pDynamic || iDynamic==(int)m_pDynamic[-1] );
1347 
1348 		m_uDocID = 0;
1349 		if ( !m_pDynamic && iDynamic )
1350 		{
1351 #ifndef NDEBUG
1352 			m_pDynamic = new CSphRowitem [ iDynamic+1 ];
1353 			*m_pDynamic++ = iDynamic;
1354 #else
1355 			m_pDynamic = new CSphRowitem [ iDynamic ];
1356 #endif
1357 			// dynamic stuff might contain pointers now (STRINGPTR type)
1358 			// so we gotta cleanup
1359 			memset ( m_pDynamic, 0, iDynamic*sizeof(CSphRowitem) );
1360 		}
1361 	}
1362 
1363 private:
1364 	/// assignment
Combine(const CSphMatch & rhs,int iDynamic)1365 	void Combine ( const CSphMatch & rhs, int iDynamic )
1366 	{
1367 		// check that we're either initializing a new one, or NOT changing the current size
1368 		assert ( iDynamic>=0 );
1369 		assert ( !m_pDynamic || iDynamic==(int)m_pDynamic[-1] );
1370 
1371 		if ( this!=&rhs )
1372 		{
1373 			m_uDocID = rhs.m_uDocID;
1374 			m_iWeight = rhs.m_iWeight;
1375 			m_pStatic = rhs.m_pStatic;
1376 			m_iTag = rhs.m_iTag;
1377 		}
1378 
1379 		if ( iDynamic )
1380 		{
1381 			if ( !m_pDynamic )
1382 			{
1383 #ifndef NDEBUG
1384 				m_pDynamic = new CSphRowitem [ iDynamic+1 ];
1385 				*m_pDynamic++ = iDynamic;
1386 #else
1387 				m_pDynamic = new CSphRowitem [ iDynamic ];
1388 #endif
1389 			}
1390 
1391 			if ( this!=&rhs )
1392 			{
1393 				assert ( rhs.m_pDynamic );
1394 				assert ( m_pDynamic[-1]==rhs.m_pDynamic[-1] ); // ensure we're not changing X to Y
1395 				memcpy ( m_pDynamic, rhs.m_pDynamic, iDynamic*sizeof(CSphRowitem) );
1396 			}
1397 		}
1398 	}
1399 
1400 public:
1401 	/// integer getter
GetAttr(const CSphAttrLocator & tLoc)1402 	SphAttr_t GetAttr ( const CSphAttrLocator & tLoc ) const
1403 	{
1404 		// m_pRowpart[tLoc.m_bDynamic] is 30% faster on MSVC 2005
1405 		// same time on gcc 4.x though, ~1 msec per 1M calls, so lets avoid the hassle for now
1406 		if ( tLoc.m_iBitOffset>=0 )
1407 			return sphGetRowAttr ( tLoc.m_bDynamic ? m_pDynamic : m_pStatic, tLoc );
1408 		if ( tLoc.IsID() )
1409 			return m_uDocID;
1410 		assert ( false && "Unknown negative-bitoffset locator" );
1411 		return 0;
1412 	}
1413 
1414 	/// float getter
GetAttrFloat(const CSphAttrLocator & tLoc)1415 	float GetAttrFloat ( const CSphAttrLocator & tLoc ) const
1416 	{
1417 		return sphDW2F ( (DWORD)sphGetRowAttr ( tLoc.m_bDynamic ? m_pDynamic : m_pStatic, tLoc ) );
1418 	}
1419 
1420 	/// integer setter
SetAttr(const CSphAttrLocator & tLoc,SphAttr_t uValue)1421 	void SetAttr ( const CSphAttrLocator & tLoc, SphAttr_t uValue )
1422 	{
1423 		if ( tLoc.IsID() )
1424 			return;
1425 		assert ( tLoc.m_bDynamic );
1426 		assert ( tLoc.GetMaxRowitem() < (int)m_pDynamic[-1] );
1427 		sphSetRowAttr ( m_pDynamic, tLoc, uValue );
1428 	}
1429 
1430 	/// float setter
SetAttrFloat(const CSphAttrLocator & tLoc,float fValue)1431 	void SetAttrFloat ( const CSphAttrLocator & tLoc, float fValue )
1432 	{
1433 		assert ( tLoc.m_bDynamic );
1434 		assert ( tLoc.GetMaxRowitem() < (int)m_pDynamic[-1] );
1435 		sphSetRowAttr ( m_pDynamic, tLoc, sphF2DW ( fValue ) );
1436 	}
1437 
1438 	/// MVA getter
1439 	const DWORD * GetAttrMVA ( const CSphAttrLocator & tLoc, const DWORD * pPool, bool bArenaProhibit ) const;
1440 
1441 private:
1442 	/// "manually" prevent copying
1443 	const CSphMatch & operator = ( const CSphMatch & )
1444 	{
1445 		assert ( 0 && "internal error (CSphMatch::operator= called)" );
1446 		return *this;
1447 	}
1448 };
1449 
1450 /// specialized swapper
Swap(CSphMatch & a,CSphMatch & b)1451 inline void Swap ( CSphMatch & a, CSphMatch & b )
1452 {
1453 	Swap ( a.m_uDocID, b.m_uDocID );
1454 	Swap ( a.m_pStatic, b.m_pStatic );
1455 	Swap ( a.m_pDynamic, b.m_pDynamic );
1456 	Swap ( a.m_iWeight, b.m_iWeight );
1457 	Swap ( a.m_iTag, b.m_iTag );
1458 }
1459 
1460 
/// source statistics
struct CSphSourceStats
{
	int64_t			m_iTotalDocuments;	///< how many documents
	int64_t			m_iTotalBytes;		///< how many bytes

	/// ctor, starts with both counters zeroed
	CSphSourceStats ()
		: m_iTotalDocuments ( 0 )
		, m_iTotalBytes ( 0 )
	{
	}

	/// reset, zeroes out both counters
	void Reset ()
	{
		m_iTotalBytes = 0;
		m_iTotalDocuments = 0;
	}
};
1480 
1481 //////////////////////////////////////////////////////////////////////////
1482 
/// known multi-valued attr sources
enum ESphAttrSrc
{
	SPH_ATTRSRC_NONE		= 0,	///< attr is not multi-valued
	SPH_ATTRSRC_FIELD		= 1,	///< attr values come from a text field
	SPH_ATTRSRC_QUERY		= 2,	///< attr values come from a SQL query
	SPH_ATTRSRC_RANGEDQUERY	= 3		///< attr values come from a ranged SQL query
};
1491 
1492 
/// wordpart processing type
enum ESphWordpart
{
	SPH_WORDPART_WHOLE		= 0,	///< whole words
	SPH_WORDPART_PREFIX		= 1,	///< word prefixes
	SPH_WORDPART_INFIX		= 2		///< word infixes
};
1500 
1501 
/// column unpack format
enum ESphUnpackFormat
{
	SPH_UNPACK_NONE				= 0,	///< no unpacking
	SPH_UNPACK_ZLIB				= 1,	///< zlib format
	SPH_UNPACK_MYSQL_COMPRESS	= 2		///< MySQL compress format
};
1509 
1510 
/// aggregate function to apply
/// (values are spelled out explicitly, and match the implicit numbering)
enum ESphAggrFunc
{
	SPH_AGGR_NONE	= 0,
	SPH_AGGR_AVG	= 1,
	SPH_AGGR_MIN	= 2,
	SPH_AGGR_MAX	= 3,
	SPH_AGGR_SUM	= 4,
	SPH_AGGR_CAT	= 5
};
1521 
1522 
1523 /// source column info
1524 struct CSphColumnInfo
1525 {
1526 	CSphString		m_sName;		///< column name
1527 	ESphAttr		m_eAttrType;	///< attribute type
1528 	ESphWordpart	m_eWordpart;	///< wordpart processing type
1529 	bool			m_bIndexed;		///< whether to index this column as fulltext field too
1530 
1531 	int				m_iIndex;		///< index into source result set (-1 for joined fields)
1532 	CSphAttrLocator	m_tLocator;		///< attribute locator in the row
1533 
1534 	ESphAttrSrc		m_eSrc;			///< attr source (for multi-valued attrs only)
1535 	CSphString		m_sQuery;		///< query to retrieve values (for multi-valued attrs only)
1536 	CSphString		m_sQueryRange;	///< query to retrieve range (for multi-valued attrs only)
1537 
1538 	CSphRefcountedPtr<ISphExpr>		m_pExpr;		///< evaluator for expression items
1539 	ESphAggrFunc					m_eAggrFunc;	///< aggregate function on top of expression (for GROUP BY)
1540 	ESphEvalStage					m_eStage;		///< column evaluation stage (who and how computes this column)
1541 	bool							m_bPayload;
1542 	bool							m_bFilename;	///< column is a file name
1543 	bool							m_bWeight;		///< is a weight column
1544 
1545 	WORD							m_uNext;		///< next in linked list for hash in CSphSchema
1546 
1547 	/// handy ctor
1548 	CSphColumnInfo ( const char * sName=NULL, ESphAttr eType=SPH_ATTR_NONE );
1549 
1550 	/// equality comparison checks name, type, and locator
1551 	bool operator == ( const CSphColumnInfo & rhs ) const
1552 	{
1553 		return m_sName==rhs.m_sName
1554 			&& m_eAttrType==rhs.m_eAttrType
1555 			&& m_tLocator.m_iBitCount==rhs.m_tLocator.m_iBitCount
1556 			&& m_tLocator.m_iBitOffset==rhs.m_tLocator.m_iBitOffset
1557 			&& m_tLocator.m_bDynamic==rhs.m_tLocator.m_bDynamic;
1558 	}
1559 };
1560 
1561 
1562 /// barebones schema interface
1563 /// everything that is needed from every implementation of a schema
1564 class ISphSchema
1565 {
1566 protected:
1567 	CSphVector<CSphNamedInt>		m_dPtrAttrs;		///< names and rowitems of STRINGPTR and other ptrs to copy and delete
1568 	CSphVector<CSphNamedInt>		m_dFactorAttrs;		///< names and rowitems of SPH_ATTR_FACTORS attributes
1569 
1570 public:
1571 	/// get row size (static+dynamic combined)
1572 	virtual int						GetRowSize() const = 0;
1573 
1574 	/// get static row part size
1575 	virtual int						GetStaticSize() const = 0;
1576 
1577 	/// get dynamic row part size
1578 	virtual int						GetDynamicSize() const = 0;
1579 
1580 	/// get attrs count
1581 	virtual int						GetAttrsCount() const = 0;
1582 
1583 	/// get attribute index by name, returns -1 if not found
1584 	virtual int						GetAttrIndex ( const char * sName ) const = 0;
1585 
1586 	/// get attr by index
1587 	virtual const CSphColumnInfo &	GetAttr ( int iIndex ) const = 0;
1588 
1589 	/// get attr by name
1590 	virtual const CSphColumnInfo *	GetAttr ( const char * sName ) const = 0;
1591 
1592 	/// assign current schema to rset schema (kind of a visitor operator)
1593 	virtual void					AssignTo ( class CSphRsetSchema & lhs ) const = 0;
1594 
1595 public:
1596 	/// full copy, for purely dynamic matches
1597 	void							CloneWholeMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1598 
1599 	/// free the linked strings and/or just initialize the pointers with NULL
1600 	void							FreeStringPtrs ( CSphMatch * pMatch ) const;
1601 
1602 	/// ???
1603 	void							CopyPtrs ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1604 
1605 protected:
1606 	/// generic InsertAttr() implementation that tracks STRINGPTR, FACTORS attributes
1607 	void							InsertAttr ( CSphVector<CSphColumnInfo> & dAttrs, CSphVector<int> & dUsed, int iPos, const CSphColumnInfo & tCol, bool dDynamic );
1608 
1609 	/// reset my trackers
1610 	void							Reset();
1611 
1612 	/// dtor
~ISphSchema()1613 	virtual ~ISphSchema () {}
1614 };
1615 
1616 
1617 /// plain good old schema
1618 /// container that actually holds and owns all the fields, columns, etc
1619 ///
1620 /// NOTE that while this one can be used everywhere where we need a schema
1621 /// it might be huge (say 1000+ attributes) and expensive to copy, modify, etc!
1622 /// so for most of the online query work, consider CSphRsetSchema
1623 class CSphSchema : public ISphSchema
1624 {
1625 	friend class CSphRsetSchema;
1626 
1627 protected:
1628 	static const int			HASH_THRESH		= 32;
1629 	static const int			BUCKET_COUNT	= 256;
1630 
1631 public:
1632 	CSphString					m_sName;		///< my human-readable name
1633 	CSphVector<CSphColumnInfo>	m_dFields;		///< my fulltext-searchable fields
1634 
1635 
1636 	CSphVector<CSphColumnInfo>	m_dAttrs;		///< all my attributes
1637 	CSphVector<int>				m_dStaticUsed;	///< static row part map (amount of used bits in each rowitem)
1638 	CSphVector<int>				m_dDynamicUsed;	///< dynamic row part map
1639 	int							m_iStaticSize;	///< static row size (can be different from m_dStaticUsed.GetLength() because of gaps)
1640 
1641 protected:
1642 	WORD						m_dBuckets [ BUCKET_COUNT ];	///< uses indexes in m_dAttrs as ptrs; 0xffff is like NULL in this hash
1643 
1644 public:
1645 
1646 	/// ctor
1647 	explicit				CSphSchema ( const char * sName="(nameless)" );
1648 
1649 	/// get field index by name
1650 	/// returns -1 if not found
1651 	int						GetFieldIndex ( const char * sName ) const;
1652 
1653 	/// get attribute index by name
1654 	/// returns -1 if not found
1655 	int						GetAttrIndex ( const char * sName ) const;
1656 
1657 	/// checks if two schemas fully match (ie. fields names, attr names, types and locators are the same)
1658 	/// describe mismatch (if any) to sError
1659 	bool					CompareTo ( const CSphSchema & rhs, CSphString & sError, bool bFullComparison = true ) const;
1660 
1661 	/// reset fields and attrs
1662 	void					Reset ();
1663 
1664 	/// get row size (static+dynamic combined)
GetRowSize()1665 	int						GetRowSize () const				{ return m_iStaticSize + m_dDynamicUsed.GetLength(); }
1666 
1667 	/// get static row part size
GetStaticSize()1668 	int						GetStaticSize () const			{ return m_iStaticSize; }
1669 
1670 	/// get dynamic row part size
GetDynamicSize()1671 	int						GetDynamicSize () const			{ return m_dDynamicUsed.GetLength(); }
1672 
1673 	/// get attrs count
GetAttrsCount()1674 	int						GetAttrsCount () const			{ return m_dAttrs.GetLength(); }
1675 
1676 	/// get attr by index
GetAttr(int iIndex)1677 	const CSphColumnInfo &	GetAttr ( int iIndex ) const	{ return m_dAttrs[iIndex]; }
1678 
1679 	/// get attr by name
1680 	const CSphColumnInfo *	GetAttr ( const char * sName ) const;
1681 
1682 	/// insert attr
1683 	void					InsertAttr ( int iPos, const CSphColumnInfo & tAggr, bool bDynamic );
1684 
1685 	/// add attr
1686 	void					AddAttr ( const CSphColumnInfo & tAttr, bool bDynamic );
1687 
1688 	/// remove attr
1689 	void					RemoveAttr ( const char * szAttr, bool bDynamic );
1690 
1691 	static bool				IsReserved ( const char * szToken );
1692 
1693 protected:
1694 	/// returns 0xffff if bucket list is empty and position otherwise
1695 	WORD &					GetBucketPos ( const char * sName );
1696 
1697 	/// reset hash and re-add all attributes
1698 	void					RebuildHash ();
1699 
1700 	/// add iAddVal to all indexes strictly greater than iStartIdx in hash structures
1701 	void					UpdateHash ( int iStartIdx, int iAddVal );
1702 
1703 	/// visitor-style uber-virtual assignment implementation
1704 	void					AssignTo ( CSphRsetSchema & lhs ) const;
1705 };
1706 
1707 
1708 /// lightweight schema to be used in sorters, result sets, etc
1709 /// avoids copying of static attributes part by keeping a pointer
1710 /// manages the additional dynamic attributes on its own
1711 ///
1712 /// NOTE that for that reason CSphRsetSchema needs the originating index to exist
1713 /// (in case it keeps and uses a pointer to original schema in that index)
1714 class CSphRsetSchema : public ISphSchema
1715 {
1716 protected:
1717 	const CSphSchema *			m_pIndexSchema;		///< original index schema, for the static part
1718 	CSphVector<CSphColumnInfo>	m_dExtraAttrs;		///< additional dynamic attributes, for the dynamic one
1719 	CSphVector<int>				m_dDynamicUsed;		///< dynamic row part map
1720 	CSphVector<int>				m_dRemoved;			///< original indexes that are suppressed from the index schema by RemoveStaticAttr()
1721 
1722 public:
1723 	CSphVector<CSphColumnInfo>	m_dFields;			///< standalone case (agent result set), fields container
1724 
1725 public:
1726 								CSphRsetSchema();
1727 	CSphRsetSchema &			operator = ( const ISphSchema & rhs );
1728 	CSphRsetSchema &			operator = ( const CSphSchema & rhs );
AssignTo(CSphRsetSchema & lhs)1729 	virtual void				AssignTo ( CSphRsetSchema & lhs ) const		{ lhs = *this; }
1730 
1731 public:
1732 	int							GetRowSize() const;
1733 	int							GetStaticSize() const;
1734 	int							GetDynamicSize() const;
1735 	int							GetAttrsCount() const;
1736 	int							GetAttrIndex ( const char * sName ) const;
1737 	const CSphColumnInfo &		GetAttr ( int iIndex ) const;
1738 	const CSphColumnInfo *		GetAttr ( const char * sName ) const;
1739 
1740 public:
1741 	void						AddDynamicAttr ( const CSphColumnInfo & tCol );
1742 	void						RemoveStaticAttr ( int iAttr );
1743 	void						Reset();
1744 
1745 public:
1746 	/// simple copy; clones either the entire dynamic part, or a part thereof
1747 	void CloneMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1748 
1749 	/// swap in a subset of current attributes, with not necessarily (!) unique names
1750 	/// used to create a network result set (ie. rset to be sent and then discarded)
1751 	/// WARNING, DO NOT USE THIS UNLESS ABSOLUTELY SURE!
1752 	void SwapAttrs ( CSphVector<CSphColumnInfo> & dAttrs );
1753 };
1754 
1755 //////////////////////////////////////////////////////////////////////////
1756 
/// HTML stripper
/// keeps a table of known tags (m_dTags) plus per-first-character offset ranges into it (m_dStart, m_dEnd)
class CSphHTMLStripper
{
public:
	/// ctor; the effect of bDefaultTags is defined in the implementation (presumably preloads a default tag set; confirm)
	explicit					CSphHTMLStripper ( bool bDefaultTags );
	/// configure which tag attributes to index from sConfig; sError receives the message (presumably returns false on failure; confirm)
	bool						SetIndexedAttrs ( const char * sConfig, CSphString & sError );
	/// configure which elements get their contents removed from sConfig; sError receives the message (presumably returns false on failure; confirm)
	bool						SetRemovedElements ( const char * sConfig, CSphString & sError );
	/// configure zone tags from sZones; sError receives the message (presumably returns false on failure; confirm)
	bool						SetZones ( const char * sZones, CSphString & sError );
	/// enable paragraph handling (see StripperTag_t::m_bPara)
	void						EnableParagraphs ();
	/// strip the buffer at sData; the buffer is non-const, so presumably it is rewritten in place (confirm)
	void						Strip ( BYTE * sData ) const;

public:

	/// settings of a single known tag
	struct StripperTag_t
	{
		CSphString				m_sTag;			///< tag name
		int						m_iTagLen;		///< tag name length
		bool					m_bInline;		///< whether this tag is inline
		bool					m_bIndexAttrs;	///< whether to index attrs
		bool					m_bRemove;		///< whether to remove contents
		bool					m_bPara;		///< whether to mark a paragraph boundary
		bool					m_bZone;		///< whether to mark a zone boundary
		bool					m_bZonePrefix;	///< whether the zone name is a full name or a prefix
		CSphVector<CSphString>	m_dAttrs;		///< attr names to index

		/// ctor; zero length, all flags off (name and attr list are left default-constructed)
		StripperTag_t ()
			: m_iTagLen ( 0 )
			, m_bInline ( false )
			, m_bIndexAttrs ( false )
			, m_bRemove ( false )
			, m_bPara ( false )
			, m_bZone ( false )
			, m_bZonePrefix ( false )
		{}

		/// order tags by name, using a bytewise strcmp() comparison
		inline bool operator < ( const StripperTag_t & rhs ) const
		{
			return strcmp ( m_sTag.cstr(), rhs.m_sTag.cstr() )<0;
		}
	};

	/// finds appropriate tag and zone name ( tags zone name could be prefix only )
	/// advances source to the end of the tag
	const BYTE * 				FindTag ( const BYTE * sSrc, const StripperTag_t ** ppTag, const BYTE ** ppZoneName, int * pZoneNameLen ) const;
	/// check whether iCh can start a tag (the exact accepted character set is defined in the implementation)
	bool						IsValidTagStart ( int iCh ) const;

protected:
	static const int			MAX_CHAR_INDEX = 28;		///< max valid char index (a-z, underscore, colon)

	CSphVector<StripperTag_t>	m_dTags;					///< known tags to index attrs and/or to remove contents
	int							m_dStart[MAX_CHAR_INDEX];	///< maps index of the first tag name char to start offset in m_dTags
	int							m_dEnd[MAX_CHAR_INDEX];		///< maps index of the first tag name char to end offset in m_dTags

protected:
	int							GetCharIndex ( int iCh ) const;	///< calcs index by raw char
	void						UpdateTags ();				///< sorts tags, updates internal helpers
};
1814 
1815 
/// indexing-related source settings
/// NOTE, newly added fields should be synced with CSphSource::Setup()
struct CSphSourceSettings
{
	int		m_iMinPrefixLen;	///< min indexable prefix (0 means don't index prefixes)
	int		m_iMinInfixLen;		///< min indexable infix length (0 means don't index infixes)
	int		m_iMaxSubstringLen;	///< max indexable infix and prefix (0 means don't limit infixes and prefixes)
	int		m_iBoundaryStep;	///< additional boundary word position increment
	bool	m_bIndexExactWords;	///< exact (non-stemmed) word indexing flag
	int		m_iOvershortStep;	///< position step on overshort token (default is 1)
	int		m_iStopwordStep;	///< position step on stopword token (default is 1)
	bool	m_bIndexSP;			///< whether to index sentence and paragraph delimiters
	bool	m_bIndexFieldLens;	///< whether to index field lengths

	CSphVector<CSphString>	m_dPrefixFields;	///< list of prefix fields
	CSphVector<CSphString>	m_dInfixFields;		///< list of infix fields

	/// ctor; defined elsewhere (presumably sets the defaults mentioned in the member comments; confirm)
	explicit				CSphSourceSettings ();

	/// get the word part mode (ESphWordpart) for the field named sField
	/// NOTE(review): presumably consults m_dPrefixFields/m_dInfixFields and the min lengths; bWordDict semantics are defined in the implementation
	ESphWordpart			GetWordpart ( const char * sField, bool bWordDict );
};
1836 
1837 
1838 /// hit vector interface
1839 /// because specific position type might vary (dword, qword, etc)
1840 /// but we don't want to template and instantiate everything because of that
1841 class ISphHits
1842 {
1843 public:
Length()1844 	int Length () const
1845 	{
1846 		return m_dData.GetLength();
1847 	}
1848 
First()1849 	const CSphWordHit * First () const
1850 	{
1851 		return m_dData.Begin();
1852 	}
1853 
Last()1854 	const CSphWordHit * Last () const
1855 	{
1856 		return &m_dData.Last();
1857 	}
1858 
AddHit(SphDocID_t uDocid,SphWordID_t uWordid,Hitpos_t uPos)1859 	void AddHit ( SphDocID_t uDocid, SphWordID_t uWordid, Hitpos_t uPos )
1860 	{
1861 		if ( uWordid )
1862 		{
1863 			CSphWordHit & tHit = m_dData.Add();
1864 			tHit.m_uDocID = uDocid;
1865 			tHit.m_uWordID = uWordid;
1866 			tHit.m_uWordPos = uPos;
1867 		}
1868 	}
1869 
1870 public:
1871 	CSphVector<CSphWordHit> m_dData;
1872 };
1873 
1874 
/// a (start, length) pair; used in this file as the result of IterateFieldMVAStart()
/// NOTE(review): what the range indexes into is not visible here (presumably an MVA storage vector; confirm)
struct SphRange_t
{
	int m_iStart;	///< range start
	int m_iLength;	///< range length
};
1880 
/// field filter settings; consumed by sphCreateFieldFilter(), reported back by ISphFieldFilter::GetSettings()
struct CSphFieldFilterSettings
{
	CSphVector<CSphString>	m_dRegexps;	///< regexp definitions (exact string format is defined by the filter implementation)
};
1885 
/// field filter
/// abstract interface; instances are created with sphCreateFieldFilter()
class ISphFieldFilter
{
public:
	/// virtual dtor, so that filters can be destroyed via the interface pointer
	virtual					~ISphFieldFilter () {}

	/// apply the filter to iLength bytes at sField, using dStorage as the output buffer
	/// NOTE(review): the meaning of the returned int is not visible here (presumably the filtered length; confirm)
	virtual	int				Apply ( const BYTE * sField, int iLength, CSphVector<BYTE> & dStorage ) = 0;

	/// store the settings of this filter into tSettings
	virtual	void			GetSettings ( CSphFieldFilterSettings & tSettings ) const = 0;
};
1895 
/// create a field filter from the given settings
/// sError is the out-string for the error message (presumably NULL is returned on failure; confirm against the implementation)
ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings & tFilterSettings, CSphString & sError );
1898 
1899 
/// generic data source
/// abstract base; inherits the indexing settings from CSphSourceSettings,
/// and declares the iteration interface (documents, hits, MVAs, kill-list) for descendants to implement
class CSphSource : public CSphSourceSettings
{
public:
	CSphMatch							m_tDocInfo;		///< current document info
	CSphVector<CSphString>				m_dStrAttrs;	///< current document string attrs
	CSphVector<DWORD>					m_dMva;			///< MVA storage for mva64

public:
	/// ctor
	explicit							CSphSource ( const char * sName );

	/// dtor
	virtual								~CSphSource ();

	/// set dictionary
	void								SetDict ( CSphDict * dict );

	/// set HTML stripping mode
	///
	/// sExtractAttrs defines what attributes to store. format is "img=alt; a=alt,title".
	/// empty string means to strip all tags; NULL means to disable stripping.
	///
	/// sRemoveElements defines what elements to cleanup. format is "style, script"
	///
	/// on failure, returns false and fills sError
	bool								SetStripHTML ( const char * sExtractAttrs, const char * sRemoveElements, bool bDetectParagraphs, const char * sZones, CSphString & sError );

	/// set field filter
	void								SetFieldFilter ( ISphFieldFilter * pFilter );

	/// set tokenizer
	void								SetTokenizer ( ISphTokenizer * pTokenizer );

	/// set rows dump file
	/// the default implementation ignores the handle; descendants may override
	virtual void						SetDumpRows ( FILE * ) {}

	/// get stats
	virtual const CSphSourceStats &		GetStats ();

	/// updates schema fields and attributes
	/// updates pInfo if it's empty; checks for match if it's not
	/// must be called after IterateStart(); will always fail otherwise
	virtual bool						UpdateSchema ( CSphSchema * pInfo, CSphString & sError );

	/// setup misc indexing settings (prefix/infix/exact-word indexing, position steps)
	void								Setup ( const CSphSourceSettings & tSettings );

public:
	/// connect to the source (eg. to the database)
	/// connection settings are specific for each source type and as such
	/// are implemented in specific descendants
	virtual bool						Connect ( CSphString & sError ) = 0;

	/// disconnect from the source
	virtual void						Disconnect () = 0;

	/// check if there are any attributes configured
	/// note that there might be NO actual attributes in the case if configured
	/// ones do not match those actually returned by the source
	virtual bool						HasAttrsConfigured () = 0;

	/// check if there are any joined fields
	/// the default implementation reports none
	virtual bool						HasJoinedFields () { return false; }

	/// begin indexing this source
	/// to be implemented by descendants
	virtual bool						IterateStart ( CSphString & sError ) = 0;

	/// get next document
	/// to be implemented by descendants
	/// returns false on error
	/// returns true and fills m_tDocInfo on success
	/// returns true and sets m_tDocInfo.m_uDocID to 0 on eof
	virtual bool						IterateDocument ( CSphString & sError ) = 0;

	/// get next hits chunk for current document
	/// to be implemented by descendants
	/// returns NULL when there are no more hits
	/// returns pointer to hit vector (with at most MAX_SOURCE_HITS) on success
	/// fills out-string with error message on failure
	virtual ISphHits *					IterateHits ( CSphString & sError ) = 0;

	/// get joined hits from joined fields (w/o attached docinfos)
	/// returns false and fills out-string with error message on failure
	/// returns true and sets m_tDocInfo.m_uDocID to 0 on eof
	/// returns true and sets m_tDocInfo.m_uDocID to non-0 on success
	/// NOTE(review): the return type is a pointer, so "false"/"true" above presumably
	/// mean NULL/non-NULL hit vector; confirm against the implementation
	virtual ISphHits *					IterateJoinedHits ( CSphString & sError );

	/// begin iterating values of out-of-document multi-valued attribute iAttr
	/// will fail if iAttr is out of range, or is not multi-valued
	/// can also fail if configured settings are invalid (eg. SQL query can not be executed)
	virtual bool						IterateMultivaluedStart ( int iAttr, CSphString & sError ) = 0;

	/// get next multi-valued (id,attr-value) or (id, offset) for mva64 tuple to m_tDocInfo
	virtual bool						IterateMultivaluedNext () = 0;

	/// begin iterating values of multi-valued attribute iAttr stored in a field
	/// will fail if iAttr is out of range, or is not multi-valued
	virtual SphRange_t					IterateFieldMVAStart ( int iAttr ) = 0;

	/// begin iterating kill list
	virtual bool						IterateKillListStart ( CSphString & sError ) = 0;

	/// get next kill list doc id
	virtual bool						IterateKillListNext ( SphDocID_t & uDocId ) = 0;

	/// post-index callback
	/// gets called when the indexing is successfully (!) over
	/// the default implementation does nothing
	virtual void						PostIndex () {}

protected:
	ISphTokenizer *						m_pTokenizer;	///< my tokenizer
	CSphDict *							m_pDict;		///< my dict
	ISphFieldFilter	*					m_pFieldFilter;	///< my field filter

	CSphSourceStats						m_tStats;		///< my stats
	CSphSchema							m_tSchema;		///< my schema

	CSphHTMLStripper *					m_pStripper;	///< my HTML stripper

	// NOTE(review): presumably counters of documents with zero and max-valued ids, maintained by VerifyID(); confirm
	int			m_iNullIds;
	int			m_iMaxIds;

	/// check a document id; defined elsewhere (presumably returns the id to use, or 0 to skip the document; confirm)
	SphDocID_t	VerifyID ( SphDocID_t uID );
};
2026 
2027 
/// how to handle IO errors in file fields
/// (per-value descriptions below follow from the enumerator names; the actual handling is implemented elsewhere)
enum ESphOnFileFieldError
{
	FFE_IGNORE_FIELD,		///< ignore the failed field
	FFE_SKIP_DOCUMENT,		///< skip the whole document
	FFE_FAIL_INDEX			///< fail the indexing
};
2035 
2036 
/// generic document source
/// provides multi-field support and generic tokenizer
/// descendants supply the raw field data through NextDocument()
class CSphSource_Document : public CSphSource
{
public:
	/// ctor
	explicit				CSphSource_Document ( const char * sName );

	/// dtor
	/// releases the file field read buffer
	virtual					~CSphSource_Document () { SafeDeleteArray ( m_pReadFileBuffer ); }

	/// my generic tokenizer
	virtual bool			IterateDocument ( CSphString & sError );
	virtual ISphHits *		IterateHits ( CSphString & sError );
	void					BuildHits ( CSphString & sError, bool bSkipEndMarker );

	/// field data getter
	/// to be implemented by descendants
	virtual BYTE **			NextDocument ( CSphString & sError ) = 0;

	/// set rows dump file; the handle is only stored here
	virtual void			SetDumpRows ( FILE * fpDumpRows ) { m_fpDumpRows = fpDumpRows; }

	/// begin iterating values of multi-valued attribute iAttr stored in a field
	virtual SphRange_t		IterateFieldMVAStart ( int iAttr );
	/// two-argument overload; not implemented (asserts in debug builds, returns false)
	virtual bool			IterateFieldMVAStart ( int, CSphString & ) { assert ( 0 && "not implemented" ); return false; }
	/// joined fields are present when the schema has more fields than the plain fields count
	virtual bool			HasJoinedFields () { return m_iPlainFieldsLength!=m_tSchema.m_dFields.GetLength(); }

protected:
	/// parse MVA values from szValue into dMva; bMva64 selects the 64-bit flavor
	/// NOTE(review): the meaning of the returned int is not visible here (presumably the values count; confirm)
	int						ParseFieldMVA ( CSphVector < DWORD > & dMva, const char * szValue, bool bMva64 ) const;
	/// file field helpers (see m_pReadFileBuffer and m_eOnFileFieldError); semantics are defined in the implementation
	bool					CheckFileField ( const BYTE * sField );
	int						LoadFileField ( BYTE ** ppField, CSphString & sError );

	/// hit building helpers, used by the generic tokenizer
	bool					BuildZoneHits ( SphDocID_t uDocid, BYTE * sWord );
	void					BuildSubstringHits ( SphDocID_t uDocid, bool bPayload, ESphWordpart eWordpart, bool bSkipEndMarker );
	void					BuildRegularHits ( SphDocID_t uDocid, bool bPayload, bool bSkipEndMarker );

	/// register autocomputed attributes such as field lengths (see index_field_lengths)
	bool					AddAutoAttrs ( CSphString & sError );

	/// allocate m_tDocInfo storage, do post-alloc magic (compute pointer to field lengths, etc)
	void					AllocDocinfo ();

protected:
	ISphHits				m_tHits;				///< my hitvector

protected:
	char *					m_pReadFileBuffer;		///< read buffer for file fields; freed in dtor
	int						m_iReadFileBufferSize;	///< size of read buffer for the 'sql_file_field' fields
	int						m_iMaxFileBufferSize;	///< max size of read buffer for the 'sql_file_field' fields
	ESphOnFileFieldError	m_eOnFileFieldError;	///< how to handle IO errors in file fields
	FILE *					m_fpDumpRows;			///< rows dump file (see SetDumpRows)
	int						m_iPlainFieldsLength;	///< plain fields count; compared against the schema fields count in HasJoinedFields()
	DWORD *					m_pFieldLengthAttrs;	///< pointer into the part of m_tDocInfo where field lengths are stored

	CSphVector<SphDocID_t>	m_dAllIds;				///< used for joined fields FIXME! unlimited RAM use
	bool					m_bIdsSorted;			///< we sort array to use binary search

protected:
	/// state of the hit building process
	/// NOTE(review): presumably kept so that hit building can resume across IterateHits() calls; confirm
	struct CSphBuildHitsState_t
	{
		bool m_bProcessingHits;
		bool m_bDocumentDone;

		BYTE ** m_dFields;

		CSphVector<BYTE*> m_dTmpFieldStorage;
		CSphVector<BYTE*> m_dTmpFieldPtrs;
		CSphVector<BYTE> m_dFiltered;

		int m_iStartPos;
		Hitpos_t m_iHitPos;
		int m_iField;
		int m_iStartField;
		int m_iEndField;

		int m_iBuildLastStep;

		CSphBuildHitsState_t ();
		~CSphBuildHitsState_t ();

		void Reset ();
	};

	CSphBuildHitsState_t	m_tState;				///< current hit building state
	int						m_iMaxHits;				///< hits limit (presumably per IterateHits() chunk; confirm)
};
2122 
/// unpack settings entry; pairs an unpack format with a name
/// (used in CSphSourceParams_SQL::m_dUnpack; the name is presumably a column name, confirm)
struct CSphUnpackInfo
{
	ESphUnpackFormat	m_eFormat;	///< unpack format
	CSphString			m_sName;	///< name this entry applies to
};
2128 
/// joined field settings (used in CSphSourceParams_SQL::m_dJoinedFields)
struct CSphJoinedField
{
	CSphString			m_sName;	///< field name
	CSphString			m_sQuery;	///< query to fetch the field contents
	CSphString			m_sRanged;	///< presumably the range query for ranged fetching; confirm
	bool				m_bPayload;	///< payload flag (compare bPayload in CSphSource_Document hit builders)
};
2136 
2137 
/// generic SQL source params
/// plain settings holder; driver-specific params structs derive from it
struct CSphSourceParams_SQL
{
	// query params
	CSphString						m_sQuery;			///< main document fetch query
	CSphString						m_sQueryRange;		///< range query
	CSphString						m_sQueryKilllist;	///< kill-list query
	int64_t							m_iRangeStep;		///< range step
	int64_t							m_iRefRangeStep;	///< NOTE(review): purpose not visible here (presumably the originally configured range step); confirm
	bool							m_bPrintQueries;	///< whether to print queries

	CSphVector<CSphString>			m_dQueryPre;		///< pre-queries
	CSphVector<CSphString>			m_dQueryPost;		///< post-queries
	CSphVector<CSphString>			m_dQueryPostIndex;	///< post-index queries
	CSphVector<CSphColumnInfo>		m_dAttrs;			///< configured attributes
	CSphVector<CSphString>			m_dFileFields;		///< file fields

	int								m_iRangedThrottle;		///< ranged queries throttle (units are defined by the implementation)
	int								m_iMaxFileBufferSize;	///< max read buffer size for file fields
	ESphOnFileFieldError			m_eOnFileFieldError;	///< how to handle IO errors in file fields

	CSphVector<CSphUnpackInfo>		m_dUnpack;				///< unpack settings
	DWORD							m_uUnpackMemoryLimit;	///< unpack memory limit

	CSphVector<CSphJoinedField>		m_dJoinedFields;		///< joined fields

	// connection params
	CSphString						m_sHost;
	CSphString						m_sUser;
	CSphString						m_sPass;
	CSphString						m_sDB;
	int								m_iPort;

	// hooks
	CSphString						m_sHookConnect;
	CSphString						m_sHookQueryRange;
	CSphString						m_sHookPostIndex;

	/// ctor; defined elsewhere (presumably sets defaults, confirm)
	CSphSourceParams_SQL ();
};
2178 
2179 
/// generic SQL source
/// multi-field plain-text documents fetched from given query
/// driver specifics are isolated in the pure virtual Sql*() methods below
struct CSphSource_SQL : CSphSource_Document
{
	/// ctor
	explicit			CSphSource_SQL ( const char * sName );
	/// dtor
	virtual				~CSphSource_SQL () {}

	/// store the params; NOTE: overloads (does not override) the non-virtual CSphSource::Setup()
	bool				Setup ( const CSphSourceParams_SQL & pParams );
	virtual bool		Connect ( CSphString & sError );
	virtual void		Disconnect ();

	virtual bool		IterateStart ( CSphString & sError );
	virtual BYTE **		NextDocument ( CSphString & sError );
	virtual void		PostIndex ();

	/// attributes are configured when the params list at least one attribute
	virtual bool		HasAttrsConfigured () { return m_tParams.m_dAttrs.GetLength()!=0; }

	virtual ISphHits *	IterateJoinedHits ( CSphString & sError );

	virtual bool		IterateMultivaluedStart ( int iAttr, CSphString & sError );
	virtual bool		IterateMultivaluedNext ();

	virtual bool		IterateKillListStart ( CSphString & sError );
	virtual bool		IterateKillListNext ( SphDocID_t & tDocId );

private:
	bool				m_bSqlConnected;	///< am i connected?

protected:
	CSphString			m_sSqlDSN;			///< DSN string (presumably for messages; confirm)

	BYTE *				m_dFields [ SPH_MAX_FIELDS ];	///< per-field data pointers
	ESphUnpackFormat	m_dUnpack [ SPH_MAX_FIELDS ];	///< per-field unpack formats

	SphDocID_t			m_uMinID;			///< grand min ID
	SphDocID_t			m_uMaxID;			///< grand max ID
	SphDocID_t			m_uCurrentID;		///< current min ID
	SphDocID_t			m_uMaxFetchedID;	///< max actually fetched ID
	int					m_iMultiAttr;		///< multi-valued attr being currently fetched
	int					m_iSqlFields;		///< field count (for row dumper)

	CSphSourceParams_SQL		m_tParams;	///< my params

	bool				m_bCanUnpack;		///< unpack capability flag (set by the implementation)
	bool				m_bUnpackFailed;	///< unpack failure flag
	bool				m_bUnpackOverflow;	///< unpack overflow flag
	CSphVector<char>	m_dUnpackBuffers [ SPH_MAX_FIELDS ];	///< per-field unpack buffers

	int					m_iJoinedHitField;	///< currently pulling joined hits from this field (index into schema; -1 if not pulling)
	SphDocID_t			m_iJoinedHitID;		///< last document id
	int					m_iJoinedHitPos;	///< last hit position

	static const int			MACRO_COUNT = 2;				///< number of entries in MACRO_VALUES
	static const char * const	MACRO_VALUES [ MACRO_COUNT ];	///< macro strings; defined elsewhere

protected:
	/// by what reason the internal SetupRanges called
	enum ERangesReason
	{
		SRE_DOCS,
		SRE_MVA,
		SRE_JOINEDHITS
	};

protected:
	/// ranged queries helpers; semantics are defined in the implementation
	bool					SetupRanges ( const char * sRangeQuery, const char * sQuery, const char * sPrefix, CSphString & sError, ERangesReason iReason );
	bool					RunQueryStep ( const char * sQuery, CSphString & sError );

protected:
	// driver interface, to be implemented by specific SQL sources
	virtual void			SqlDismissResult () = 0;
	virtual bool			SqlQuery ( const char * sQuery ) = 0;
	virtual bool			SqlIsError () = 0;
	virtual const char *	SqlError () = 0;
	virtual bool			SqlConnect () = 0;
	virtual void			SqlDisconnect () = 0;
	virtual int				SqlNumFields() = 0;
	virtual bool			SqlFetchRow() = 0;
	virtual DWORD			SqlColumnLength ( int iIndex ) = 0;
	virtual const char *	SqlColumn ( int iIndex ) = 0;
	virtual const char *	SqlFieldName ( int iIndex ) = 0;

	/// unpack helpers (see m_dUnpack, m_dUnpackBuffers)
	const char *	SqlUnpackColumn ( int iIndex, ESphUnpackFormat eFormat );
	void			ReportUnpackError ( int iIndex, int iError );
};
2264 
2265 
2266 #if USE_MYSQL
/// MySQL source params
/// extends the generic SQL params with MySQL connection specifics
struct CSphSourceParams_MySQL : CSphSourceParams_SQL
{
	CSphString	m_sUsock;					///< UNIX socket
	int			m_iFlags;					///< connection flags
	CSphString	m_sSslKey;					///< SSL key (presumably a file path; confirm)
	CSphString	m_sSslCert;					///< SSL certificate (presumably a file path; confirm)
	CSphString	m_sSslCA;					///< SSL CA (presumably a file path; confirm)

				CSphSourceParams_MySQL ();	///< ctor. sets defaults
};
2278 
2279 
/// MySQL source implementation
/// multi-field plain-text documents fetched from given query
struct CSphSource_MySQL : CSphSource_SQL
{
	/// ctor
	explicit				CSphSource_MySQL ( const char * sName );
	/// setup from MySQL specific params
	bool					Setup ( const CSphSourceParams_MySQL & tParams );

protected:
	MYSQL_RES *				m_pMysqlResult;		///< current result set
	MYSQL_FIELD *			m_pMysqlFields;		///< current result set fields
	MYSQL_ROW				m_tMysqlRow;		///< current row
	MYSQL					m_tMysqlDriver;		///< connection handle
	unsigned long *			m_pMysqlLengths;	///< current row column lengths

	CSphString				m_sMysqlUsock;			///< UNIX socket
	int						m_iMysqlConnectFlags;	///< connection flags
	CSphString				m_sSslKey;				///< SSL key
	CSphString				m_sSslCert;				///< SSL certificate
	CSphString				m_sSslCA;				///< SSL CA

protected:
	// CSphSource_SQL driver interface implementation
	virtual void			SqlDismissResult ();
	virtual bool			SqlQuery ( const char * sQuery );
	virtual bool			SqlIsError ();
	virtual const char *	SqlError ();
	virtual bool			SqlConnect ();
	virtual void			SqlDisconnect ();
	virtual int				SqlNumFields();
	virtual bool			SqlFetchRow();
	virtual DWORD			SqlColumnLength ( int iIndex );
	virtual const char *	SqlColumn ( int iIndex );
	virtual const char *	SqlFieldName ( int iIndex );
};
2313 #endif // USE_MYSQL
2314 
2315 
2316 #if USE_PGSQL
/// PgSQL specific source params
struct CSphSourceParams_PgSQL : CSphSourceParams_SQL
{
	CSphString		m_sClientEncoding;			///< client encoding
					CSphSourceParams_PgSQL ();	///< ctor; defined elsewhere (presumably sets defaults, confirm)
};
2323 
2324 
/// PgSQL source implementation
/// multi-field plain-text documents fetched from given query
struct CSphSource_PgSQL : CSphSource_SQL
{
	/// ctor
	explicit				CSphSource_PgSQL ( const char * sName );
	/// setup from PgSQL specific params
	bool					Setup ( const CSphSourceParams_PgSQL & pParams );
	/// begin indexing; overrides the generic SQL version
	virtual bool			IterateStart ( CSphString & sError );

protected:
	PGresult * 				m_pPgResult;	///< postgresql execution result context
	PGconn *				m_tPgDriver;	///< postgresql connection context

	int						m_iPgRows;		///< how much rows last step returned
	int						m_iPgRow;		///< current row (0 based, as in PQgetvalue)

	CSphString				m_sPgClientEncoding;	///< client encoding
	CSphVector<bool>		m_dIsColumnBool;		///< per-column flags, whether the column is boolean

protected:
	// CSphSource_SQL driver interface implementation
	virtual void			SqlDismissResult ();
	virtual bool			SqlQuery ( const char * sQuery );
	virtual bool			SqlIsError ();
	virtual const char *	SqlError ();
	virtual bool			SqlConnect ();
	virtual void			SqlDisconnect ();
	virtual int				SqlNumFields();
	virtual bool			SqlFetchRow();
	virtual DWORD	SqlColumnLength ( int iIndex );
	virtual const char *	SqlColumn ( int iIndex );
	virtual const char *	SqlFieldName ( int iIndex );
};
2356 #endif // USE_PGSQL
2357 
2358 #if USE_ODBC
/// ODBC specific source params
struct CSphSourceParams_ODBC: CSphSourceParams_SQL
{
	CSphString	m_sOdbcDSN;			///< ODBC DSN
	CSphString	m_sColBuffers;		///< column buffer sizes (eg "col1=2M, col2=4M")
	bool		m_bWinAuth;			///< auth type (MS SQL only)

				CSphSourceParams_ODBC ();	///< ctor; defined elsewhere (presumably sets defaults, confirm)
};
2367 
/// ODBC source implementation
/// fetched column data is kept in per-column buffers (see QueryColumn_t)
struct CSphSource_ODBC : CSphSource_SQL
{
	/// ctor
	explicit				CSphSource_ODBC ( const char * sName );
	/// setup from ODBC specific params
	bool					Setup ( const CSphSourceParams_ODBC & tParams );

protected:
	// CSphSource_SQL driver interface implementation
	virtual void			SqlDismissResult ();
	virtual bool			SqlQuery ( const char * sQuery );
	virtual bool			SqlIsError ();
	virtual const char *	SqlError ();
	virtual bool			SqlConnect ();
	virtual void			SqlDisconnect ();
	virtual int				SqlNumFields();
	virtual bool			SqlFetchRow();
	virtual const char *	SqlColumn ( int iIndex );
	virtual const char *	SqlFieldName ( int iIndex );
	virtual DWORD			SqlColumnLength ( int iIndex );

	/// post-connect hook for descendants; does nothing by default
	/// NOTE(review): presumably invoked from SqlConnect(); confirm
	virtual void			OdbcPostConnect () {}

protected:
	CSphString				m_sOdbcDSN;		///< ODBC DSN
	bool					m_bWinAuth;		///< auth type (MS SQL only)
	bool					m_bUnicode;		///< unicode flag (CSphSource_MSSQL sets it to true)

	SQLHENV					m_hEnv;			///< ODBC environment handle
	SQLHDBC					m_hDBC;			///< ODBC connection handle
	SQLHANDLE				m_hStmt;		///< ODBC statement handle
	int						m_nResultCols;	///< result set columns count
	CSphString				m_sError;		///< last error message

	/// per-column fetch state
	struct QueryColumn_t
	{
		CSphVector<char>	m_dContents;
		CSphVector<char>	m_dRaw;
		CSphString			m_sName;
		SQLLEN				m_iInd;
		int					m_iBufferSize;	///< size of m_dContents and m_dRaw buffers, in bytes
		bool				m_bUCS2;		///< whether this column needs UCS-2 to UTF-8 translation
		bool				m_bTruncated;	///< whether data was truncated when fetching rows
	};

	static const int		DEFAULT_COL_SIZE	= 1024;			///< default column buffer size
	static const int		VARCHAR_COL_SIZE	= 1048576;		///< default column buffer size for VARCHAR columns
	static const int		MAX_COL_SIZE		= 8*1048576;	///< hard limit on column buffer size
	static const int		WARN_ROW_SIZE		= 32*1048576;	///< warning thresh (NOT a hard limit) on row buffer size

	CSphVector<QueryColumn_t>	m_dColumns;		///< per-column fetch state
	SmallStringHash_T<int>		m_hColBuffers;	///< string-keyed int values (presumably column name to buffer size, see m_sColBuffers; confirm)

	/// fetch the error info for the given handle (presumably into m_sError; confirm)
	void					GetSqlError ( SQLSMALLINT iHandleType, SQLHANDLE hHandle );
};
2421 
2422 
2423 /// MS SQL source implementation
2424 struct CSphSource_MSSQL : public CSphSource_ODBC
2425 {
CSphSource_MSSQLCSphSource_MSSQL2426 	explicit				CSphSource_MSSQL ( const char * sName ) : CSphSource_ODBC ( sName ) { m_bUnicode=true; }
2427 	virtual void			OdbcPostConnect ();
2428 };
2429 #endif // USE_ODBC
2430 
2431 
2432 #if USE_LIBEXPAT
class CSphConfigSection;
/// create an xmlpipe2 source reading from pPipe, configured from the pSource config section
/// sError is the out-string for the error message (presumably NULL is returned on failure; confirm against the implementation)
CSphSource * sphCreateSourceXmlpipe2 ( const CSphConfigSection * pSource, FILE * pPipe, const char * szSourceName, int iMaxFieldLen, bool bProxy, CSphString & sError );
2435 #endif
2436 
2437 
2438 /////////////////////////////////////////////////////////////////////////////
2439 // SEARCH QUERIES
2440 /////////////////////////////////////////////////////////////////////////////
2441 
/// search query sorting orders
/// NOTE(review): the ATTR_DESC/ATTR_ASC comments mention "document date", while the enumerator
/// names say ATTR; presumably any sort-by attribute is meant (see CSphQuery::m_sSortBy), confirm
enum ESphSortOrder
{
	SPH_SORT_RELEVANCE		= 0,	///< sort by document relevance desc, then by date
	SPH_SORT_ATTR_DESC		= 1,	///< sort by document date desc, then by relevance desc
	SPH_SORT_ATTR_ASC		= 2,	///< sort by document date asc, then by relevance desc
	SPH_SORT_TIME_SEGMENTS	= 3,	///< sort by time segments (hour/day/week/etc) desc, then by relevance desc
	SPH_SORT_EXTENDED		= 4,	///< sort by SQL-like expression (eg. "@relevance DESC, price ASC, @id DESC")
	SPH_SORT_EXPR			= 5,	///< sort by arithmetic expression in descending order (eg. "@id + max(@weight,1000)*boost + log(price)")

	SPH_SORT_TOTAL					///< number of sorting orders; not a valid order itself
};
2454 
2455 
/// search query matching mode
enum ESphMatchMode
{
	SPH_MATCH_ALL = 0,			///< match all query words
	SPH_MATCH_ANY,				///< match any query word
	SPH_MATCH_PHRASE,			///< match this exact phrase
	SPH_MATCH_BOOLEAN,			///< match this boolean query
	SPH_MATCH_EXTENDED,			///< match this extended query
	SPH_MATCH_FULLSCAN,			///< match all document IDs w/o fulltext query, apply filters
	SPH_MATCH_EXTENDED2,		///< extended engine V2 (TEMPORARY, WILL BE REMOVED IN 0.9.8-RELEASE)
								// NOTE(review): the removal note above looks stale (the file header is dated 2016); confirm and update

	SPH_MATCH_TOTAL				///< number of matching modes; not a valid mode itself
};
2469 
2470 
/// search query relevance ranking mode
enum ESphRankMode
{
	SPH_RANK_PROXIMITY_BM25		= 0,	///< default mode, phrase proximity major factor and BM25 minor one (aka SPH03)
	SPH_RANK_BM25				= 1,	///< statistical mode, BM25 ranking only (faster but worse quality)
	SPH_RANK_NONE				= 2,	///< no ranking, all matches get a weight of 1
	SPH_RANK_WORDCOUNT			= 3,	///< simple word-count weighting, rank is a weighted sum of per-field keyword occurrence counts
	SPH_RANK_PROXIMITY			= 4,	///< phrase proximity (aka SPH01)
	SPH_RANK_MATCHANY			= 5,	///< emulate old match-any weighting (aka SPH02)
	SPH_RANK_FIELDMASK			= 6,	///< sets bits where there were matches
	SPH_RANK_SPH04				= 7,	///< codename SPH04, phrase proximity + bm25 + head/exact boost
	SPH_RANK_EXPR				= 8,	///< rank by user expression (eg. "sum(lcs*user_weight)*1000+bm25")
	SPH_RANK_EXPORT				= 9,	///< rank by BM25, but compute and export all user expression factors
	SPH_RANK_PLUGIN				= 10,	///< user-defined ranker

	SPH_RANK_TOTAL,						///< number of ranking modes (follows the last explicit value)
	SPH_RANK_DEFAULT			= SPH_RANK_PROXIMITY_BM25	///< alias for the default ranking mode
};
2489 
2490 
/// search query grouping mode
/// selects the function used to pre-process the group-by attribute value (see CSphQuery::m_eGroupFunc)
enum ESphGroupBy
{
	SPH_GROUPBY_DAY		= 0,	///< group by day
	SPH_GROUPBY_WEEK	= 1,	///< group by week
	SPH_GROUPBY_MONTH	= 2,	///< group by month
	SPH_GROUPBY_YEAR	= 3,	///< group by year
	SPH_GROUPBY_ATTR	= 4,	///< group by attribute value
	SPH_GROUPBY_ATTRPAIR= 5,	///< group by sequential attrs pair (rendered redundant by 64bit attrs support; removed)
	SPH_GROUPBY_MULTIPLE= 6		///< group by on multiple attribute values
};
2502 
2503 
/// search query filter types
/// stored in CSphFilterSettings::m_eType
enum ESphFilter
{
	SPH_FILTER_VALUES		= 0,	///< filter by integer values set
	SPH_FILTER_RANGE		= 1,	///< filter by integer range
	SPH_FILTER_FLOATRANGE	= 2,	///< filter by float range
	SPH_FILTER_STRING		= 3,	///< filter by string value
	SPH_FILTER_NULL			= 4,	///< filter by NULL
	SPH_FILTER_USERVAR		= 5,	///< filter by @uservar
	SPH_FILTER_STRING_LIST	= 6		///< filter by string list
};
2515 
2516 
2517 /// search query filter
2518 class CSphFilterSettings
2519 {
2520 public:
2521 	CSphString			m_sAttrName;	///< filtered attribute name
2522 	bool				m_bExclude;		///< whether this is "include" or "exclude" filter (default is "include")
2523 	bool				m_bHasEqual;	///< has filter "equal" component (gte\lte) or pure greater\less
2524 
2525 	ESphFilter			m_eType;		///< filter type
2526 	union
2527 	{
2528 		SphAttr_t		m_iMinValue;	///< range min
2529 		float			m_fMinValue;	///< range min
2530 	};
2531 	union
2532 	{
2533 		SphAttr_t		m_iMaxValue;	///< range max
2534 		float			m_fMaxValue;	///< range max
2535 	};
2536 	CSphVector<SphAttr_t>	m_dValues;	///< integer values set
2537 	CSphVector<CSphString>	m_dStrings;	///< string values
2538 
2539 public:
2540 						CSphFilterSettings ();
2541 
2542 	void				SetExternalValues ( const SphAttr_t * pValues, int nValues );
2543 
GetValue(int iIdx)2544 	SphAttr_t			GetValue ( int iIdx ) const	{ assert ( iIdx<GetNumValues() ); return m_pValues ? m_pValues[iIdx] : m_dValues[iIdx]; }
GetValueArray()2545 	const SphAttr_t *	GetValueArray () const		{ return m_pValues ? m_pValues : &(m_dValues[0]); }
GetNumValues()2546 	int					GetNumValues () const		{ return m_pValues ? m_nValues : m_dValues.GetLength (); }
2547 
2548 	bool				operator == ( const CSphFilterSettings & rhs ) const;
2549 	bool				operator != ( const CSphFilterSettings & rhs ) const { return !( (*this)==rhs ); }
2550 
2551 
2552 
2553 protected:
2554 	const SphAttr_t *	m_pValues;		///< external value array
2555 	int					m_nValues;		///< external array size
2556 };
2557 
2558 
// keyword info
// (field descriptions below follow from the member names; the producer of this struct is not visible here)
struct CSphKeywordInfo
{
	CSphString		m_sTokenized;	// tokenized keyword form
	CSphString		m_sNormalized;	// normalized keyword form
	int				m_iDocs;		// documents count
	int				m_iHits;		// hits count
	int				m_iQpos;		// presumably the keyword position in the query; confirm
};
2568 
Swap(CSphKeywordInfo & v1,CSphKeywordInfo & v2)2569 inline void Swap ( CSphKeywordInfo & v1, CSphKeywordInfo & v2 )
2570 {
2571 	v1.m_sTokenized.Swap ( v2.m_sTokenized );
2572 	v1.m_sNormalized.Swap ( v2.m_sNormalized );
2573 	::Swap ( v1.m_iDocs, v2.m_iDocs );
2574 	::Swap ( v1.m_iHits, v2.m_iHits );
2575 	::Swap ( v1.m_iQpos, v2.m_iQpos );
2576 }
2577 
2578 
/// per-attribute value overrides
/// names one attribute, and lists the (document id, value) pairs to override it with
class CSphAttrOverride
{
public:
	/// docid+attrvalue pair
	struct IdValuePair_t
	{
		SphDocID_t				m_uDocID;		///< document ID
		union
		{
			SphAttr_t			m_uValue;		///< attribute value
			float				m_fValue;		///< attribute value
		};

		/// order pairs by document ID only; values do not take part in the comparison
		inline bool operator < ( const IdValuePair_t & rhs ) const
		{
			return m_uDocID<rhs.m_uDocID;
		}
	};

public:
	CSphString					m_sAttr;		///< attribute name
	ESphAttr					m_eAttrType;	///< attribute type
	CSphVector<IdValuePair_t>	m_dValues;		///< id-value overrides
};
2604 
2605 
/// query selection item
struct CSphQueryItem
{
	CSphString		m_sExpr;		///< expression to compute
	CSphString		m_sAlias;		///< alias to return
	ESphAggrFunc	m_eAggrFunc;	///< aggregate function; SPH_AGGR_NONE by default

	/// ctor; no aggregate function, both strings are left default-constructed
	CSphQueryItem() : m_eAggrFunc ( SPH_AGGR_NONE ) {}
};
2615 
2616 
/// table function interface
class CSphQuery;
struct AggrResult_t;
class ISphTableFunc
{
public:
	/// virtual dtor, so that implementations can be destroyed via the interface pointer
	virtual			~ISphTableFunc() {}
	/// validate the function arguments against the query; sError receives the message (presumably returns false on failure; confirm)
	virtual bool	ValidateArgs ( const CSphVector<CSphString> & dArgs, const CSphQuery & tQuery, CSphString & sError ) = 0;
	/// process the result set in pResult; sError receives the message (presumably returns false on failure; confirm)
	virtual bool	Process ( AggrResult_t * pResult, CSphString & sError ) = 0;
	/// limit pushdown hook; the default ignores both arguments and returns false
	virtual bool	LimitPushdown ( int, int ) { return false; } // FIXME! implement this
};
2628 
2629 
2630 /// search query
2631 class CSphQuery
2632 {
2633 public:
2634 	CSphString		m_sIndexes;		///< indexes to search
2635 	CSphString		m_sQuery;		///< cooked query string for the engine (possibly transformed during legacy matching modes fixup)
2636 	CSphString		m_sRawQuery;	///< raw query string from the client for searchd log, agents, etc
2637 
2638 	int				m_iOffset;		///< offset into result set (as X in MySQL LIMIT X,Y clause)
2639 	int				m_iLimit;		///< limit into result set (as Y in MySQL LIMIT X,Y clause)
2640 	DWORD *			m_pWeights;		///< user-supplied per-field weights. may be NULL. default is NULL. NOT OWNED, WILL NOT BE FREED in dtor.
2641 	int				m_iWeights;		///< number of user-supplied weights. missing fields will be assigned weight 1. default is 0
2642 	ESphMatchMode	m_eMode;		///< match mode. default is "match all"
2643 	ESphRankMode	m_eRanker;		///< ranking mode, default is proximity+BM25
2644 	CSphString		m_sRankerExpr;	///< ranking expression for SPH_RANK_EXPR
2645 	CSphString		m_sUDRanker;	///< user-defined ranker name
2646 	CSphString		m_sUDRankerOpts;	///< user-defined ranker options
2647 	ESphSortOrder	m_eSort;		///< sort mode
2648 	CSphString		m_sSortBy;		///< attribute to sort by
2649 	int64_t			m_iRandSeed;	///< random seed for ORDER BY RAND(), -1 means do not set
2650 	int				m_iMaxMatches;	///< max matches to retrieve, default is 1000. more matches use more memory and CPU time to hold and sort them
2651 
2652 	bool			m_bSortKbuffer;	///< whether to use PQ or K-buffer sorting algorithm
2653 	bool			m_bZSlist;		///< whether the ranker has to fetch the zonespanlist with this query
2654 	bool			m_bSimplify;	///< whether to apply boolean simplification
2655 	bool			m_bPlainIDF;		///< whether to use PlainIDF=log(N/n) or NormalizedIDF=log((N-n+1)/n)
2656 	bool			m_bGlobalIDF;		///< whether to use local indexes or a global idf file
2657 	bool			m_bNormalizedTFIDF;	///< whether to scale IDFs by query word count, so that TF*IDF is normalized
2658 	bool			m_bLocalDF;			///< whether to use calculate DF among local indexes
2659 
2660 	CSphVector<CSphFilterSettings>	m_dFilters;	///< filters
2661 
2662 	CSphString		m_sGroupBy;			///< group-by attribute name(s)
2663 	CSphString		m_sFacetBy;			///< facet-by attribute name(s)
2664 	ESphGroupBy		m_eGroupFunc;		///< function to pre-process group-by attribute value with
2665 	CSphString		m_sGroupSortBy;		///< sorting clause for groups in group-by mode
2666 	CSphString		m_sGroupDistinct;	///< count distinct values for this attribute
2667 
2668 	int				m_iCutoff;			///< matches count threshold to stop searching at (default is 0; means to search until all matches are found)
2669 
2670 	int				m_iRetryCount;		///< retry count, for distributed queries
2671 	int				m_iRetryDelay;		///< retry delay, for distributed queries
2672 	int				m_iAgentQueryTimeout;	///< agent query timeout override, for distributed queries
2673 
2674 	bool			m_bGeoAnchor;		///< do we have an anchor
2675 	CSphString		m_sGeoLatAttr;		///< latitude attr name
2676 	CSphString		m_sGeoLongAttr;		///< longitude attr name
2677 	float			m_fGeoLatitude;		///< anchor latitude
2678 	float			m_fGeoLongitude;	///< anchor longitude
2679 
2680 	CSphVector<CSphNamedInt>	m_dIndexWeights;	///< per-index weights
2681 	CSphVector<CSphNamedInt>	m_dFieldWeights;	///< per-field weights
2682 
2683 	DWORD			m_uMaxQueryMsec;	///< max local index search time, in milliseconds (default is 0; means no limit)
2684 	int				m_iMaxPredictedMsec; ///< max predicted (!) search time limit, in milliseconds (0 means no limit)
2685 	CSphString		m_sComment;			///< comment to pass verbatim in the log file
2686 
2687 	CSphVector<CSphAttrOverride>	m_dOverrides;	///< per-query attribute value overrides
2688 
2689 	CSphString		m_sSelect;			///< select-list (attributes and/or expressions)
2690 	CSphString		m_sOrderBy;			///< order-by clause
2691 
2692 	CSphString		m_sOuterOrderBy;	///< temporary (?) subselect hack
2693 	int				m_iOuterOffset;		///< keep and apply outer offset at master
2694 	int				m_iOuterLimit;
2695 	bool			m_bHasOuter;
2696 
2697 	bool			m_bReverseScan;		///< perform scan in reverse order
2698 	bool			m_bIgnoreNonexistent; ///< whether to warning or not about non-existent columns in select list
2699 	bool			m_bIgnoreNonexistentIndexes; ///< whether to error or not about non-existent indexes in index list
2700 	bool			m_bStrict;			///< whether to warning or not about incompatible types
2701 
2702 	ISphTableFunc *	m_pTableFunc;		///< post-query NOT OWNED, WILL NOT BE FREED in dtor.
2703 	CSphFilterSettings	m_tHaving;		///< post aggregate filtering (got applied only on master)
2704 
2705 public:
2706 	int				m_iSQLSelectStart;	///< SQL parser helper
2707 	int				m_iSQLSelectEnd;	///< SQL parser helper
2708 
2709 	int				m_iGroupbyLimit;	///< number of elems within group
2710 
2711 public:
2712 	CSphVector<CSphQueryItem>	m_dItems;		///< parsed select-list
2713 	ESphCollation				m_eCollation;	///< ORDER BY collation
2714 	bool						m_bAgent;		///< agent mode (may need extra cols on output)
2715 
2716 	CSphString		m_sQueryTokenFilterLib;		///< token filter library name
2717 	CSphString		m_sQueryTokenFilterName;	///< token filter name
2718 	CSphString		m_sQueryTokenFilterOpts;	///< token filter options
2719 
2720 public:
2721 					CSphQuery ();		///< ctor, fills defaults
2722 					~CSphQuery ();		///< dtor, frees owned stuff
2723 
2724 	/// parse select list string into items
2725 	bool			ParseSelectList ( CSphString & sError );
2726 	bool			m_bFacet;			///< whether this a facet query
2727 };
2728 
2729 
2730 /// some low-level query stats
2731 struct CSphQueryStats
2732 {
2733 	int64_t *	m_pNanoBudget;		///< pointer to max_predicted_time budget (counted in nanosec)
2734 	DWORD		m_iFetchedDocs;		///< processed documents
2735 	DWORD		m_iFetchedHits;		///< processed hits (aka positions)
2736 	DWORD		m_iSkips;			///< number of Skip() calls
2737 
2738 				CSphQueryStats();
2739 
2740 	void		Add ( const CSphQueryStats & tStats );
2741 };
2742 
2743 
2744 /// search query meta-info
2745 class CSphQueryResultMeta
2746 {
2747 public:
2748 	int						m_iQueryTime;		///< query time, milliseconds
2749 	int						m_iRealQueryTime;	///< query time, measured just from start to finish of the query. In milliseconds
2750 	int64_t					m_iCpuTime;			///< user time, microseconds
2751 	int						m_iMultiplier;		///< multi-query multiplier, -1 to indicate error
2752 
2753 	struct WordStat_t
2754 	{
2755 		int64_t					m_iDocs;			///< document count for this term
2756 		int64_t					m_iHits;			///< hit count for this term
2757 
WordStat_tWordStat_t2758 		WordStat_t()
2759 			: m_iDocs ( 0 )
2760 			, m_iHits ( 0 )
2761 		{}
2762 	};
2763 	SmallStringHash_T<WordStat_t>	m_hWordStats; ///< hash of i-th search term (normalized word form)
2764 
2765 	int						m_iMatches;			///< total matches returned (upto MAX_MATCHES)
2766 	int64_t					m_iTotalMatches;	///< total matches found (unlimited)
2767 
2768 	CSphIOStats				m_tIOStats;			///< i/o stats for the query
2769 	int64_t					m_iAgentCpuTime;	///< agent cpu time (for distributed searches)
2770 	CSphIOStats				m_tAgentIOStats;	///< agent IO stats (for distributed searches)
2771 
2772 	int64_t					m_iPredictedTime;		///< local predicted time
2773 	int64_t					m_iAgentPredictedTime;	///< distributed predicted time
2774 	DWORD					m_iAgentFetchedDocs;	///< distributed fetched docs
2775 	DWORD					m_iAgentFetchedHits;	///< distributed fetched hits
2776 	DWORD					m_iAgentFetchedSkips;	///< distributed fetched skips
2777 
2778 	CSphQueryStats 			m_tStats;			///< query prediction counters
2779 	bool					m_bHasPrediction;	///< is prediction counters set?
2780 
2781 	CSphString				m_sError;			///< error message
2782 	CSphString				m_sWarning;			///< warning message
2783 	int64_t					m_iBadRows;
2784 
2785 	CSphQueryResultMeta ();													///< ctor
~CSphQueryResultMeta()2786 	virtual					~CSphQueryResultMeta () {}						///< dtor
2787 	void					AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits );
2788 };
2789 
2790 
2791 /// search query result (meta-info plus actual matches)
2792 class CSphQueryProfile;
2793 class CSphQueryResult : public CSphQueryResultMeta
2794 {
2795 public:
2796 	CSphSwapVector<CSphMatch>	m_dMatches;			///< top matching documents, no more than MAX_MATCHES
2797 
2798 	CSphRsetSchema			m_tSchema;			///< result schema
2799 	const DWORD *			m_pMva;				///< pointer to MVA storage
2800 	const BYTE *			m_pStrings;			///< pointer to strings storage
2801 	bool					m_bArenaProhibit;
2802 
2803 	CSphVector<BYTE *>		m_dStorage2Free;	/// < aggregated external storage from rt indexes
2804 
2805 	int						m_iOffset;			///< requested offset into matches array
2806 	int						m_iCount;			///< count which will be actually served (computed from total, offset and limit)
2807 
2808 	int						m_iSuccesses;
2809 
2810 	CSphQueryProfile *		m_pProfile;			///< filled when query profiling is enabled; NULL otherwise
2811 
2812 public:
2813 							CSphQueryResult ();		///< ctor
2814 	virtual					~CSphQueryResult ();	///< dtor, which releases all owned stuff
2815 
2816 	void					LeakStorages ( CSphQueryResult & tDst );
2817 };
2818 
/////////////////////////////////////////////////////////////////////////////
// ATTRIBUTE UPDATE QUERY
/////////////////////////////////////////////////////////////////////////////
2822 
2823 struct CSphAttrUpdate
2824 {
2825 	CSphVector<char*>				m_dAttrs;		///< update schema, attr names to update
2826 	CSphVector<ESphAttr>			m_dTypes;		///< update schema, attr types to update
2827 	CSphVector<DWORD>				m_dPool;		///< update values pool
2828 	CSphVector<SphDocID_t>			m_dDocids;		///< document IDs vector
2829 	CSphVector<const CSphRowitem*>	m_dRows;		///< document attribute's vector, used instead of m_dDocids.
2830 	CSphVector<int>					m_dRowOffset;	///< document row offsets in the pool (1 per doc, i.e. the length is the same as of m_dDocids)
2831 	bool							m_bIgnoreNonexistent;	///< whether to warn about non-existen attrs, or just silently ignore them
2832 	bool							m_bStrict;		///< whether to check for incompatible types first, or just ignore them
2833 
CSphAttrUpdateCSphAttrUpdate2834 	CSphAttrUpdate()
2835 		: m_bIgnoreNonexistent ( false )
2836 		, m_bStrict ( false )
2837 	{}
2838 
~CSphAttrUpdateCSphAttrUpdate2839 	~CSphAttrUpdate()
2840 	{
2841 		ARRAY_FOREACH ( i, m_dAttrs )
2842 			SafeDeleteArray ( m_dAttrs[i] );
2843 	}
2844 };
2845 
/////////////////////////////////////////////////////////////////////////////
// FULLTEXT INDICES
/////////////////////////////////////////////////////////////////////////////
2849 
2850 /// progress info
2851 struct CSphIndexProgress
2852 {
2853 	enum Phase_e
2854 	{
2855 		PHASE_COLLECT,				///< document collection phase
2856 		PHASE_SORT,					///< final sorting phase
2857 		PHASE_COLLECT_MVA,			///< multi-valued attributes collection phase
2858 		PHASE_SORT_MVA,				///< multi-valued attributes collection phase
2859 		PHASE_MERGE,				///< index merging
2860 
2861 		PHASE_PREREAD,				///< searchd startup, prereading data
2862 		PHASE_PRECOMPUTE			///< searchd startup, indexing attributes
2863 	};
2864 
2865 	Phase_e			m_ePhase;		///< current indexing phase
2866 
2867 	int64_t			m_iDocuments;	///< PHASE_COLLECT: documents collected so far
2868 	int64_t			m_iBytes;		///< PHASE_COLLECT: bytes collected so far;
2869 									///< PHASE_PREREAD: bytes read so far;
2870 	int64_t			m_iBytesTotal;	///< PHASE_PREREAD: total bytes to read;
2871 
2872 	int64_t			m_iAttrs;		///< PHASE_COLLECT_MVA, PHASE_SORT_MVA: attrs processed so far
2873 	int64_t			m_iAttrsTotal;	///< PHASE_SORT_MVA: attrs total
2874 
2875 	SphOffset_t		m_iHits;		///< PHASE_SORT: hits sorted so far
2876 	SphOffset_t		m_iHitsTotal;	///< PHASE_SORT: hits total
2877 
2878 	int				m_iWords;		///< PHASE_MERGE: words merged so far
2879 
2880 	int				m_iDone;		///< generic percent, 0..1000 range
2881 
2882 	typedef void ( *IndexingProgress_fn ) ( const CSphIndexProgress * pStat, bool bPhaseEnd );
2883 	IndexingProgress_fn m_fnProgress;
2884 
CSphIndexProgressCSphIndexProgress2885 	CSphIndexProgress ()
2886 		: m_ePhase ( PHASE_COLLECT )
2887 		, m_iDocuments ( 0 )
2888 		, m_iBytes ( 0 )
2889 		, m_iBytesTotal ( 0 )
2890 		, m_iAttrs ( 0 )
2891 		, m_iAttrsTotal ( 0 )
2892 		, m_iHits ( 0 )
2893 		, m_iHitsTotal ( 0 )
2894 		, m_iWords ( 0 )
2895 		, m_fnProgress ( NULL )
2896 	{}
2897 
2898 	/// builds a message to print
2899 	/// WARNING, STATIC BUFFER, NON-REENTRANT
2900 	const char * BuildMessage() const;
2901 
2902 	void Show ( bool bPhaseEnd ) const;
2903 };
2904 
2905 
/// match sorting functions
/// (values are implicit and sequential, starting from 0)
enum ESphSortFunc
{
	FUNC_REL_DESC,
	FUNC_ATTR_DESC,
	FUNC_ATTR_ASC,
	FUNC_TIMESEGS,
	FUNC_GENERIC2,
	FUNC_GENERIC3,
	FUNC_GENERIC4,
	FUNC_GENERIC5,
	FUNC_EXPR
};
2919 
2920 
/// match sorting clause parsing outcomes
/// (values are implicit and sequential, starting from 0)
enum ESortClauseParseResult
{
	SORT_CLAUSE_OK,
	SORT_CLAUSE_ERROR,
	SORT_CLAUSE_RANDOM
};
2928 
2929 
/// sorting key part types
/// (values are implicit and sequential, starting from 0)
enum ESphSortKeyPart
{
	SPH_KEYPART_ID,
	SPH_KEYPART_WEIGHT,
	SPH_KEYPART_INT,
	SPH_KEYPART_FLOAT,
	SPH_KEYPART_STRING,
	SPH_KEYPART_STRINGPTR
};
2940 
2941 
2942 /// JSON key lookup stuff
2943 struct JsonKey_t
2944 {
2945 	CSphString		m_sKey;		///< name string
2946 	DWORD			m_uMask;	///< Bloom mask for this key
2947 	int				m_iLen;		///< name length, in bytes
2948 
2949 	JsonKey_t ();
2950 	explicit JsonKey_t ( const char * sKey, int iLen );
2951 };
2952 
2953 typedef int ( *SphStringCmp_fn )( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
2954 
2955 /// match comparator state
2956 struct CSphMatchComparatorState
2957 {
2958 	static const int	MAX_ATTRS = 5;
2959 
2960 	ESphSortKeyPart		m_eKeypart[MAX_ATTRS];		///< sort-by key part type
2961 	CSphAttrLocator		m_tLocator[MAX_ATTRS];		///< sort-by attr locator
2962 	JsonKey_t			m_tSubKeys[MAX_ATTRS];		///< sort-by attr sub-locator
2963 	ISphExpr *			m_tSubExpr[MAX_ATTRS];		///< sort-by attr expression
2964 	ESphAttr			m_tSubType[MAX_ATTRS];		///< sort-by expression type
2965 	int					m_dAttrs[MAX_ATTRS];		///< sort-by attr index
2966 
2967 	DWORD				m_uAttrDesc;				///< sort order mask (if i-th bit is set, i-th attr order is DESC)
2968 	DWORD				m_iNow;						///< timestamp (for timesegments sorting mode)
2969 	SphStringCmp_fn		m_fnStrCmp;					///< string comparator
2970 
2971 
2972 	/// create default empty state
CSphMatchComparatorStateCSphMatchComparatorState2973 	CSphMatchComparatorState ()
2974 		: m_uAttrDesc ( 0 )
2975 		, m_iNow ( 0 )
2976 		, m_fnStrCmp ( NULL )
2977 	{
2978 		for ( int i=0; i<MAX_ATTRS; i++ )
2979 		{
2980 			m_eKeypart[i] = SPH_KEYPART_ID;
2981 			m_dAttrs[i] = -1;
2982 		}
2983 	}
2984 
2985 	/// check if any of my attrs are bitfields
UsesBitfieldsCSphMatchComparatorState2986 	bool UsesBitfields ()
2987 	{
2988 		for ( int i=0; i<MAX_ATTRS; i++ )
2989 			if ( m_eKeypart[i]==SPH_KEYPART_INT && m_tLocator[i].IsBitfield() )
2990 				return true;
2991 		return false;
2992 	}
2993 
CmpStringsCSphMatchComparatorState2994 	inline int CmpStrings ( const CSphMatch & a, const CSphMatch & b, int iAttr ) const
2995 	{
2996 		assert ( iAttr>=0 && iAttr<MAX_ATTRS );
2997 		assert ( m_eKeypart[iAttr]==SPH_KEYPART_STRING || m_eKeypart[iAttr]==SPH_KEYPART_STRINGPTR );
2998 		assert ( m_fnStrCmp );
2999 
3000 		const BYTE * aa = (const BYTE*) a.GetAttr ( m_tLocator[iAttr] );
3001 		const BYTE * bb = (const BYTE*) b.GetAttr ( m_tLocator[iAttr] );
3002 		if ( aa==NULL || bb==NULL )
3003 		{
3004 			if ( aa==bb )
3005 				return 0;
3006 			if ( aa==NULL )
3007 				return -1;
3008 			return 1;
3009 		}
3010 		return m_fnStrCmp ( aa, bb, ( m_eKeypart[iAttr]==SPH_KEYPART_STRING ) );
3011 	}
3012 };
3013 
3014 
3015 /// match processor interface
3016 struct ISphMatchProcessor
3017 {
~ISphMatchProcessorISphMatchProcessor3018 	virtual ~ISphMatchProcessor () {}
3019 	virtual void Process ( CSphMatch * pMatch ) = 0;
3020 };
3021 
3022 
3023 /// generic match sorter interface
3024 class ISphMatchSorter
3025 {
3026 public:
3027 	bool				m_bRandomize;
3028 	int64_t				m_iTotal;
3029 
3030 	SphDocID_t			m_iJustPushed;
3031 	int					m_iMatchCapacity;
3032 	CSphTightVector<SphDocID_t> m_dJustPopped;
3033 
3034 protected:
3035 	CSphRsetSchema				m_tSchema;		///< sorter schema (adds dynamic attributes on top of index schema)
3036 	CSphMatchComparatorState	m_tState;		///< protected to set m_iNow automatically on SetState() calls
3037 
3038 public:
3039 	/// ctor
ISphMatchSorter()3040 						ISphMatchSorter () : m_bRandomize ( false ), m_iTotal ( 0 ), m_iJustPushed ( 0 ), m_iMatchCapacity ( 0 ) {}
3041 
3042 	/// virtualizing dtor
~ISphMatchSorter()3043 	virtual				~ISphMatchSorter () {}
3044 
3045 	/// check if this sorter needs attr values
3046 	virtual bool		UsesAttrs () const = 0;
3047 
3048 	// check if sorter might be used in multi-queue
3049 	virtual bool		CanMulti () const = 0;
3050 
3051 	/// check if this sorter does groupby
3052 	virtual bool		IsGroupby () const = 0;
3053 
3054 	/// set match comparator state
3055 	virtual void		SetState ( const CSphMatchComparatorState & tState );
3056 
3057 	/// get match comparator stat
GetState()3058 	virtual CSphMatchComparatorState &	GetState() { return m_tState; }
3059 
3060 	/// set group comparator state
SetGroupState(const CSphMatchComparatorState &)3061 	virtual void		SetGroupState ( const CSphMatchComparatorState & ) {}
3062 
3063 	/// set MVA pool pointer (for MVA+groupby sorters)
SetMVAPool(const DWORD *,bool)3064 	virtual void SetMVAPool ( const DWORD *, bool ) {}
3065 
3066 	/// set string pool pointer (for string+groupby sorters)
SetStringPool(const BYTE *)3067 	virtual void		SetStringPool ( const BYTE * ) {}
3068 
3069 	/// set sorter schema by swapping in and (optionally) adjusting the argument
SetSchema(CSphRsetSchema & tSchema)3070 	virtual void		SetSchema ( CSphRsetSchema & tSchema ) { m_tSchema = tSchema; }
3071 
3072 	/// get incoming schema
GetSchema()3073 	virtual const CSphRsetSchema &	GetSchema () const { return m_tSchema; }
3074 
3075 	/// base push
3076 	/// returns false if the entry was rejected as duplicate
3077 	/// returns true otherwise (even if it was not actually inserted)
3078 	virtual bool		Push ( const CSphMatch & tEntry ) = 0;
3079 
3080 	/// submit pre-grouped match. bNewSet indicates that the match begins the bunch of matches got from one source
3081 	virtual bool		PushGrouped ( const CSphMatch & tEntry, bool bNewSet ) = 0;
3082 
3083 	/// get	rough entries count, due of aggregate filtering phase
3084 	virtual int			GetLength () const = 0;
3085 
3086 	/// get internal buffer length
3087 	virtual int			GetDataLength () const = 0;
3088 
3089 	/// get total count of non-duplicates Push()ed through this queue
GetTotalCount()3090 	virtual int64_t		GetTotalCount () const { return m_iTotal; }
3091 
3092 	/// process collected entries up to length count
3093 	virtual void		Finalize ( ISphMatchProcessor & tProcessor, bool bCallProcessInResultSetOrder ) = 0;
3094 
3095 	/// store all entries into specified location and remove them from the queue
3096 	/// entries are stored in properly sorted order,
3097 	/// if iTag is non-negative, entries are also tagged; otherwise, their tag's unchanged
3098 	/// return sored entries count, might be less than length due of aggregate filtering phase
3099 	virtual int			Flatten ( CSphMatch * pTo, int iTag ) = 0;
3100 
3101 	/// get a pointer to the worst element, NULL if there is no fixed location
GetWorst()3102 	virtual const CSphMatch *	GetWorst() const { return NULL; }
3103 };
3104 
3105 
/// available docinfo storage strategies
enum ESphDocinfo
{
	SPH_DOCINFO_NONE		= 0,	///< no docinfo available
	SPH_DOCINFO_INLINE		= 1,	///< inline docinfo into index (specifically, into doclists)
	SPH_DOCINFO_EXTERN		= 2		///< store docinfo separately
};
3113 
3114 
/// hitless modes (how many of the hits are present in the index)
enum ESphHitless
{
	SPH_HITLESS_NONE		= 0,	///< all hits are present
	SPH_HITLESS_SOME		= 1,	///< some of the hits might be omitted (check the flag bit)
	SPH_HITLESS_ALL			= 2		///< no hits in this index
};
3121 
3122 
/// hit storage formats
enum ESphHitFormat
{
	SPH_HIT_FORMAT_PLAIN	= 0,	///< all hits are stored in hitlist
	SPH_HIT_FORMAT_INLINE	= 1		///< hits can be split and inlined into doclist (aka 9-23)
};
3128 
3129 
/// RLP filter modes
enum ESphRLPFilter
{
	SPH_RLP_NONE			= 0,	///< rlp not used
	SPH_RLP_PLAIN			= 1,	///< rlp used to tokenize every document
	SPH_RLP_BATCHED			= 2		///< rlp used to batch documents and tokenize several documents at once
};
3136 
3137 
3138 struct CSphIndexSettings : public CSphSourceSettings
3139 {
3140 	ESphDocinfo		m_eDocinfo;
3141 	ESphHitFormat	m_eHitFormat;
3142 	bool			m_bHtmlStrip;
3143 	CSphString		m_sHtmlIndexAttrs;
3144 	CSphString		m_sHtmlRemoveElements;
3145 	CSphString		m_sZones;
3146 	ESphHitless		m_eHitless;
3147 	CSphString		m_sHitlessFiles;
3148 	bool			m_bVerbose;
3149 	int				m_iEmbeddedLimit;
3150 
3151 	ESphBigram				m_eBigramIndex;
3152 	CSphString				m_sBigramWords;
3153 	CSphVector<CSphString>	m_dBigramWords;
3154 
3155 	DWORD			m_uAotFilterMask;		///< lemmatize_XX_all forces us to transform queries on the index level too
3156 	ESphRLPFilter	m_eChineseRLP;			///< chinese RLP filter
3157 	CSphString		m_sRLPContext;			///< path to RLP context file
3158 
3159 	CSphString		m_sIndexTokenFilter;	///< indexing time token filter spec string (pretty useless for disk, vital for RT)
3160 
3161 					CSphIndexSettings ();
3162 };
3163 
3164 
/// forward refs to internal searcher classes
class ISphQword;
class ISphQwordSetup;
class CSphQueryContext;
struct ISphFilter;
3170 
3171 
3172 struct ISphKeywordsStat
3173 {
~ISphKeywordsStatISphKeywordsStat3174 	virtual			~ISphKeywordsStat() {}
3175 	virtual bool	FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const = 0;
3176 };
3177 
3178 
/// additional index info, as reported by CSphIndex::GetStatus()
struct CSphIndexStatus
{
	int64_t			m_iRamUse;
	int64_t			m_iDiskUse;
	int64_t			m_iRamChunkSize; // not used for plain
	int				m_iNumChunks; // not used for plain
	int64_t			m_iMemLimit; // not used for plain

	/// ctor, zeroes every counter
	CSphIndexStatus()
		: m_iRamUse ( 0 )
		, m_iDiskUse ( 0 )
		, m_iRamChunkSize ( 0 )
		, m_iNumChunks ( 0 )
		, m_iMemLimit ( 0 )
	{}
};
3195 
3196 struct KillListTrait_t
3197 {
3198 	const SphDocID_t *	m_pBegin;
3199 	int					m_iLen;
3200 };
3201 
3202 typedef CSphVector<KillListTrait_t> KillListVector;
3203 
3204 struct CSphMultiQueryArgs : public ISphNoncopyable
3205 {
3206 	const KillListVector &					m_dKillList;
3207 	const int								m_iIndexWeight;
3208 	int										m_iTag;
3209 	DWORD									m_uPackedFactorFlags;
3210 	bool									m_bLocalDF;
3211 	const SmallStringHash_T<int64_t> *		m_pLocalDocs;
3212 	int64_t									m_iTotalDocs;
3213 
3214 	CSphMultiQueryArgs ( const KillListVector & dKillList, int iIndexWeight );
3215 };
3216 
3217 
3218 /// generic fulltext index interface
3219 class CSphIndex : public ISphKeywordsStat
3220 {
3221 public:
3222 
3223 	enum
3224 	{
3225 		ATTRS_UPDATED			= ( 1UL<<0 ),
3226 		ATTRS_MVA_UPDATED		= ( 1UL<<1 ),
3227 		ATTRS_STRINGS_UPDATED	= ( 1UL<<2 )
3228 	};
3229 
3230 public:
3231 	explicit					CSphIndex ( const char * sIndexName, const char * sFilename );
3232 	virtual						~CSphIndex ();
3233 
GetLastError()3234 	virtual const CSphString &	GetLastError () const { return m_sLastError; }
GetLastWarning()3235 	virtual const CSphString &	GetLastWarning () const { return m_sLastWarning; }
GetMatchSchema()3236 	virtual const CSphSchema &	GetMatchSchema () const { return m_tSchema; }			///< match schema as returned in result set (possibly different from internal storage schema!)
3237 
3238 	virtual	void				SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn pfnProgress ) = 0;
3239 	virtual void				SetInplaceSettings ( int iHitGap, int iDocinfoGap, float fRelocFactor, float fWriteFactor );
SetPreopen(bool bValue)3240 	virtual void				SetPreopen ( bool bValue ) { m_bKeepFilesOpen = bValue; }
3241 	void						SetFieldFilter ( ISphFieldFilter * pFilter );
3242 	void						SetTokenizer ( ISphTokenizer * pTokenizer );
3243 	void						SetupQueryTokenizer();
GetTokenizer()3244 	const ISphTokenizer *		GetTokenizer () const { return m_pTokenizer; }
GetQueryTokenizer()3245 	const ISphTokenizer *		GetQueryTokenizer () const { return m_pQueryTokenizer; }
3246 	ISphTokenizer *				LeakTokenizer ();
3247 	void						SetDictionary ( CSphDict * pDict );
GetDictionary()3248 	CSphDict *					GetDictionary () const { return m_pDict; }
3249 	CSphDict *					LeakDictionary ();
SetKeepAttrs(const CSphString &)3250 	virtual void				SetKeepAttrs ( const CSphString & ) {}
3251 	void						Setup ( const CSphIndexSettings & tSettings );
GetSettings()3252 	const CSphIndexSettings &	GetSettings () const { return m_tSettings; }
IsStripperInited()3253 	bool						IsStripperInited () const { return m_bStripperInited; }
3254 	virtual SphDocID_t *		GetKillList () const = 0;
3255 	virtual int					GetKillListSize () const = 0;
3256 	virtual bool				HasDocid ( SphDocID_t uDocid ) const = 0;
IsRT()3257 	virtual bool				IsRT() const { return false; }
SetBinlog(bool bBinlog)3258 	void						SetBinlog ( bool bBinlog ) { m_bBinlog = bBinlog; }
GetFieldLens()3259 	virtual int64_t *			GetFieldLens() const { return NULL; }
3260 
IsStarDict()3261 	virtual bool				IsStarDict() const { return true; }
3262 
3263 public:
3264 	/// build index by indexing given sources
3265 	virtual int					Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer ) = 0;
3266 
3267 	/// build index by mering current index with given index
3268 	virtual bool				Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists ) = 0;
3269 
3270 public:
3271 	/// check all data files, preload schema, and preallocate enough shared RAM to load memory-cached data
3272 	virtual bool				Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning ) = 0;
3273 
3274 	/// deallocate all previously preallocated shared data
3275 	virtual void				Dealloc () = 0;
3276 
3277 	/// precache everything which needs to be precached
3278 	// WARNING, WILL BE CALLED FROM DIFFERENT PROCESS, MUST ONLY MODIFY SHARED MEMORY
3279 	virtual bool				Preread () = 0;
3280 
3281 	/// set new index base path
3282 	virtual void				SetBase ( const char * sNewBase ) = 0;
3283 
3284 	/// set new index base path, and physically rename index files too
3285 	virtual bool				Rename ( const char * sNewBase ) = 0;
3286 
3287 	/// obtain exclusive lock on this index
3288 	virtual bool				Lock () = 0;
3289 
3290 	/// dismiss exclusive lock and unlink lock file
3291 	virtual void				Unlock () = 0;
3292 
3293 	/// relock shared RAM (only on daemonization)
3294 	virtual bool				Mlock () = 0;
3295 
3296 	/// keep attributes on disk and map them via file memory mapping
SetEnableOndiskAttributes(bool)3297 	virtual void				SetEnableOndiskAttributes ( bool ) {}
3298 
3299 	/// called when index is loaded and prepared to work
3300 	virtual void				PostSetup() = 0;
3301 
3302 public:
3303 	/// return index document, bytes totals (FIXME? remove this in favor of GetStatus() maybe?)
3304 	virtual const CSphSourceStats &		GetStats () const = 0;
3305 
3306 	/// return additional index info
3307 	virtual void				GetStatus ( CSphIndexStatus* ) const = 0;
3308 
3309 public:
3310 	virtual bool				EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const = 0;
3311 	void						SetCacheSize ( int iMaxCachedDocs, int iMaxCachedHits );
3312 	virtual bool				MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const = 0;
3313 	virtual bool				MultiQueryEx ( int iQueries, const CSphQuery * ppQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const = 0;
3314 	virtual bool				GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString * pError ) const = 0;
3315 	virtual bool				FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const = 0;
3316 
3317 public:
3318 	/// updates memory-cached attributes in real time
3319 	/// returns non-negative amount of actually found and updated records on success
3320 	/// on failure, -1 is returned and GetLastError() contains error message
3321 	virtual int					UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError, CSphString & sWarning ) = 0;
3322 
3323 	/// saves memory-cached attributes, if there were any updates to them
3324 	/// on failure, false is returned and GetLastError() contains error message
3325 	virtual bool				SaveAttributes ( CSphString & sError ) const = 0;
3326 
3327 	virtual DWORD				GetAttributeStatus () const = 0;
3328 
3329 	virtual bool				CreateModifiedFiles ( bool bAddAttr, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError ) = 0;
3330 
3331 	virtual bool				AddRemoveAttribute ( bool bAdd, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError ) = 0;
3332 
3333 public:
3334 	/// internal debugging hook, DO NOT USE
3335 	virtual void				DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig ) = 0;
3336 
3337 	/// internal debugging hook, DO NOT USE
3338 	virtual void				DebugDumpDocids ( FILE * fp ) = 0;
3339 
3340 	/// internal debugging hook, DO NOT USE
3341 	virtual void				DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID ) = 0;
3342 
3343 	/// internal debugging hook, DO NOT USE
3344 	virtual void				DebugDumpDict ( FILE * fp ) = 0;
3345 
3346 	/// internal debugging hook, DO NOT USE
3347 	virtual int					DebugCheck ( FILE * fp ) = 0;
SetDebugCheck()3348 	virtual void				SetDebugCheck () {}
3349 
3350 	/// getter for name
GetName()3351 	const char *				GetName () { return m_sIndexName.cstr(); }
3352 
SetName(const char * sName)3353 	void						SetName ( const char * sName ) { m_sIndexName = sName; }
3354 
3355 	/// get for the base file name
GetFilename()3356 	const char *				GetFilename () const { return m_sFilename.cstr(); }
3357 
3358 	/// internal make document id list from external docinfo, DO NOT USE
3359 	virtual bool BuildDocList ( SphAttr_t ** ppDocList, int64_t * pCount, CSphString * pError ) const;
3360 
3361 	/// internal replace kill-list and rewrite spk file, DO NOT USE
ReplaceKillList(const SphDocID_t *,int)3362 	virtual bool				ReplaceKillList ( const SphDocID_t *, int ) { return true; }
3363 
3364 public:
3365 	int64_t						m_iTID;
3366 
3367 	bool						m_bExpandKeywords;		///< enable automatic query-time keyword expansion (to "( word | =word | *word* )")
3368 	int							m_iExpansionLimit;
3369 
3370 protected:
3371 
3372 	CSphSchema					m_tSchema;
3373 	CSphString					m_sLastError;
3374 	CSphString					m_sLastWarning;
3375 
3376 	bool						m_bInplaceSettings;
3377 	int							m_iHitGap;
3378 	int							m_iDocinfoGap;
3379 	float						m_fRelocFactor;
3380 	float						m_fWriteFactor;
3381 
3382 	bool						m_bKeepFilesOpen;		///< keep files open to avoid race on seamless rotation
3383 	bool						m_bBinlog;
3384 
3385 	bool						m_bStripperInited;		///< was stripper initialized (old index version (<9) handling)
3386 
3387 public:
3388 	bool						m_bId32to64;			///< did we convert id32 to id64 on startup
3389 
3390 protected:
3391 	CSphIndexSettings			m_tSettings;
3392 
3393 	ISphFieldFilter *			m_pFieldFilter;
3394 	ISphTokenizer *				m_pTokenizer;
3395 	ISphTokenizer *				m_pQueryTokenizer;
3396 	CSphDict *					m_pDict;
3397 
3398 	int							m_iMaxCachedDocs;
3399 	int							m_iMaxCachedHits;
3400 	CSphString					m_sIndexName;
3401 	CSphString					m_sFilename;
3402 
3403 public:
SetGlobalIDFPath(const CSphString & sPath)3404 	void						SetGlobalIDFPath ( const CSphString & sPath ) { m_sGlobalIDFPath = sPath; }
3405 	float						GetGlobalIDF ( const CSphString & sWord, int64_t iDocsLocal, bool bPlainIDF ) const;
3406 
3407 protected:
3408 	CSphString					m_sGlobalIDFPath;
3409 };
3410 
3411 // update attributes with index pointer attached
3412 struct CSphAttrUpdateEx
3413 {
3414 	const CSphAttrUpdate *	m_pUpdate;		///< the unchangeable update pool
3415 	CSphIndex *				m_pIndex;		///< the index on which the update should happen
3416 	CSphString *			m_pError;		///< the error, if any
3417 	CSphString *			m_pWarning;		///< the warning, if any
3418 	int						m_iAffected;	///< num of updated rows.
CSphAttrUpdateExCSphAttrUpdateEx3419 	CSphAttrUpdateEx()
3420 		: m_pUpdate ( NULL )
3421 		, m_pIndex ( NULL )
3422 		, m_pError ( NULL )
3423 		, m_pWarning ( NULL )
3424 		, m_iAffected ( 0 )
3425 	{}
3426 };
3427 
3428 struct SphQueueSettings_t : public ISphNoncopyable
3429 {
3430 	const CSphQuery &			m_tQuery;
3431 	const ISphSchema &			m_tSchema;
3432 	CSphString &				m_sError;
3433 	CSphQueryProfile *			m_pProfiler;
3434 	bool						m_bComputeItems;
3435 	CSphSchema *				m_pExtra;
3436 	CSphAttrUpdateEx *			m_pUpdate;
3437 	CSphVector<SphDocID_t> *	m_pDeletes;
3438 	bool						m_bZonespanlist;
3439 	DWORD						m_uPackedFactorFlags;
3440 	ISphExprHook *				m_pHook;
3441 	const CSphFilterSettings *	m_pAggrFilter;
3442 
SphQueueSettings_tSphQueueSettings_t3443 	SphQueueSettings_t ( const CSphQuery & tQuery, const ISphSchema & tSchema, CSphString & sError, CSphQueryProfile * pProfiler )
3444 		: m_tQuery ( tQuery )
3445 		, m_tSchema ( tSchema )
3446 		, m_sError ( sError )
3447 		, m_pProfiler ( pProfiler )
3448 		, m_bComputeItems ( true )
3449 		, m_pExtra ( NULL )
3450 		, m_pUpdate ( NULL )
3451 		, m_pDeletes ( NULL )
3452 		, m_bZonespanlist ( false )
3453 		, m_uPackedFactorFlags ( SPH_FACTOR_DISABLE )
3454 		, m_pHook ( NULL )
3455 		, m_pAggrFilter ( NULL )
3456 	{ }
3457 };
3458 
3459 /////////////////////////////////////////////////////////////////////////////
3460 
3461 /// create phrase fulltext index implementation
3462 CSphIndex *			sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename );
3463 
3464 /// create template (tokenizer) index implementation
3465 CSphIndex *			sphCreateIndexTemplate ( );
3466 
/// set JSON attribute indexing options
///   bStrict              - whether to stop indexing on error, or just ignore the attribute value
///   bAutoconvNumbers     - whether to auto-convert eligible (!) strings to integers and floats,
///                          or keep them as strings
///   bKeynamesToLowercase - whether to convert all key names to lowercase
void sphSetJsonOptions ( bool bStrict, bool bAutoconvNumbers, bool bKeynamesToLowercase );
3472 
3473 /// parses sort clause, using a given schema
3474 /// fills eFunc and tState and optionally sError, returns result code
3475 ESortClauseParseResult	sphParseSortClause ( const CSphQuery * pQuery, const char * sClause, const ISphSchema & tSchema,
3476 	ESphSortFunc & eFunc, CSphMatchComparatorState & tState, CSphString & sError );
3477 
3478 /// creates proper queue for given query
3479 /// may return NULL on error; in this case, error message is placed in sError
3480 /// if the pUpdate is given, creates the updater's queue and perform the index update
3481 /// instead of searching
3482 ISphMatchSorter *	sphCreateQueue ( SphQueueSettings_t & tQueue );
3483 
3484 /// convert queue to sorted array, and add its entries to result's matches array
3485 int					sphFlattenQueue ( ISphMatchSorter * pQueue, CSphQueryResult * pResult, int iTag );
3486 
/// set up the per-keyword read buffer sizes
/// NOTE(review): units and the exact role of iReadUnhinted are not visible from this declaration
void sphSetReadBuffers ( int iReadBuffer, int iReadUnhinted );
3489 
3490 /// check query for expressions
3491 bool				sphHasExpressions ( const CSphQuery & tQuery, const CSphSchema & tSchema );
3492 
/// initialize the collation tables
void sphCollationInit ();
3495 
3496 //////////////////////////////////////////////////////////////////////////
3497 
3498 extern CSphString g_sLemmatizerBase;
3499 
3500 #if USE_RLP
3501 extern CSphString g_sRLPRoot;
3502 extern CSphString g_sRLPEnv;
3503 extern int g_iRLPMaxBatchSize;
3504 extern int g_iRLPMaxBatchDocs;
3505 #endif
3506 
3507 /////////////////////////////////////////////////////////////////////////////
3508 
// workaround to suppress C4511/C4512 warnings (copy ctor and assignment operator) in VS 2003
// (_MSC_VER 13xx); defined(_MSC_VER) is tested first so that other compilers never evaluate
// an undefined macro in #if (which is what -Wundef complains about)
#if defined(_MSC_VER) && _MSC_VER>=1300 && _MSC_VER<1400
#pragma warning(disable:4511)
#pragma warning(disable:4512)
#endif

// suppress C4201 (nameless struct/union is a nonstandard extension) because even min-spec gcc 3.4.6 works ok
#if defined(_MSC_VER)
#pragma warning(disable:4201)
#endif
3519 
3520 #endif // _sphinx_
3521 
3522 //
3523 // $Id$
3524 //
3525