1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #ifndef _sphinxint_
17 #define _sphinxint_
18 
19 #include "sphinx.h"
20 #include "sphinxfilter.h"
21 #include "sphinxrt.h"
22 #include "sphinxquery.h"
23 #include "sphinxexcerpt.h"
24 #include "sphinxudf.h"
25 
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <float.h>
29 
30 //////////////////////////////////////////////////////////////////////////
31 // INTERNAL CONSTANTS
32 //////////////////////////////////////////////////////////////////////////
33 
// O_BINARY is not available everywhere; map it to a no-op flag when missing
#ifndef O_BINARY
#define SPH_O_BINARY 0
#else
#define SPH_O_BINARY O_BINARY
#endif

// open() flag presets: read an existing file, or create/truncate a new one
#define SPH_O_READ	( O_RDONLY | SPH_O_BINARY )
#define SPH_O_NEW	( O_CREAT | O_RDWR | O_TRUNC | SPH_O_BINARY )

// MVA offset layout
#define MVA_DOWNSIZE		DWORD			// type used for MVA32 offsets
#define MVA_OFFSET_MASK		0x7fffffffUL	// bits holding the MVA offset
#define MVA_ARENA_FLAG		0x80000000UL	// bit flagging the MVA global arena

#define DEFAULT_MAX_MATCHES 1000
48 
49 //////////////////////////////////////////////////////////////////////////
50 
51 const DWORD		INDEX_MAGIC_HEADER			= 0x58485053;		///< my magic 'SPHX' header
52 const DWORD		INDEX_FORMAT_VERSION		= 42;				///< my format version
53 
54 const char		MAGIC_SYNONYM_WHITESPACE	= 1;				// used internally in tokenizer only
55 const char		MAGIC_CODE_SENTENCE			= 2;				// emitted from tokenizer on sentence boundary
56 const char		MAGIC_CODE_PARAGRAPH		= 3;				// emitted from stripper (and passed via tokenizer) on paragraph boundary
57 const char		MAGIC_CODE_ZONE				= 4;				// emitted from stripper (and passed via tokenizer) on zone boundary; followed by zero-terminated zone name
58 
59 const char		MAGIC_WORD_HEAD				= 1;				// prepended to keyword by source, stored in (crc) dictionary
60 const char		MAGIC_WORD_TAIL				= 1;				// appended to keyword by source, stored in (crc) dictionary
61 const char		MAGIC_WORD_HEAD_NONSTEMMED	= 2;				// prepended to keyword by source, stored in dictionary
62 const char		MAGIC_WORD_BIGRAM			= 3;				// used as a bigram (keyword pair) separator, stored in dictionary
63 
64 extern const char *		MAGIC_WORD_SENTENCE;	///< value is "\3sentence"
65 extern const char *		MAGIC_WORD_PARAGRAPH;	///< value is "\3paragraph"
66 
67 //////////////////////////////////////////////////////////////////////////
68 // INTERNAL GLOBALS
69 //////////////////////////////////////////////////////////////////////////
70 
/// the binlog instance; lives in sphinxrt.cpp
extern class ISphBinlog *		g_pBinlog;

/// max_predicted_time cost coefficients; live in sphinxsearch.cpp
/// units are nanoseconds (that is, 1e-9)
extern int g_iPredictorCostSkip;
extern int g_iPredictorCostDoc;
extern int g_iPredictorCostHit;
extern int g_iPredictorCostMatch;

/// JSON handling switches
extern bool g_bJsonStrict;
extern bool g_bJsonAutoconvNumbers;
extern bool g_bJsonKeynamesToLowercase;
84 
85 //////////////////////////////////////////////////////////////////////////
86 // INTERNAL HELPER FUNCTIONS, CLASSES, ETC
87 //////////////////////////////////////////////////////////////////////////
88 
// master list of query states, as ( enum suffix, printable name ) pairs
// users must define SPH_QUERY_STATE(_name,_desc) before expanding this list
#define SPH_QUERY_STATES \
	SPH_QUERY_STATE ( UNKNOWN, "unknown" ) \
	SPH_QUERY_STATE ( NET_READ, "net_read" ) \
	SPH_QUERY_STATE ( IO, "io" ) \
	SPH_QUERY_STATE ( DIST_CONNECT, "dist_connect" ) \
	SPH_QUERY_STATE ( LOCAL_DF, "local_df" ) \
	SPH_QUERY_STATE ( LOCAL_SEARCH, "local_search" ) \
	SPH_QUERY_STATE ( SQL_PARSE, "sql_parse" ) \
	SPH_QUERY_STATE ( FULLSCAN, "fullscan" ) \
	SPH_QUERY_STATE ( DICT_SETUP, "dict_setup" ) \
	SPH_QUERY_STATE ( PARSE, "parse" ) \
	SPH_QUERY_STATE ( TRANSFORMS, "transforms" ) \
	SPH_QUERY_STATE ( INIT, "init" ) \
	SPH_QUERY_STATE ( INIT_SEGMENT, "init_segment" ) \
	SPH_QUERY_STATE ( OPEN, "open" ) \
	SPH_QUERY_STATE ( READ_DOCS, "read_docs" ) \
	SPH_QUERY_STATE ( READ_HITS, "read_hits" ) \
	SPH_QUERY_STATE ( GET_DOCS, "get_docs" ) \
	SPH_QUERY_STATE ( GET_HITS, "get_hits" ) \
	SPH_QUERY_STATE ( FILTER, "filter" ) \
	SPH_QUERY_STATE ( RANK, "rank" ) \
	SPH_QUERY_STATE ( SORT, "sort" ) \
	SPH_QUERY_STATE ( FINALIZE, "finalize" ) \
	SPH_QUERY_STATE ( DIST_WAIT, "dist_wait" ) \
	SPH_QUERY_STATE ( AGGREGATE, "aggregate" ) \
	SPH_QUERY_STATE ( NET_WRITE, "net_write" ) \
	SPH_QUERY_STATE ( EVAL_POST, "eval_post" ) \
	SPH_QUERY_STATE ( SNIPPET, "eval_snippet" ) \
	SPH_QUERY_STATE ( EVAL_UDF, "eval_udf" ) \
	SPH_QUERY_STATE ( TABLE_FUNC, "table_func" )
119 
120 
121 /// possible query states, used for profiling
122 enum ESphQueryState
123 {
124 	SPH_QSTATE_INFINUM = -1,
125 
126 #define SPH_QUERY_STATE(_name,_desc) SPH_QSTATE_##_name,
127 	SPH_QUERY_STATES
128 #undef SPH_QUERY_STATE
129 
130 	SPH_QSTATE_TOTAL
131 };
132 STATIC_ASSERT ( SPH_QSTATE_UNKNOWN==0, BAD_QUERY_STATE_ENUM_BASE );
133 
134 
135 /// search query profile
136 class CSphQueryProfile
137 {
138 public:
139 	ESphQueryState	m_eState;							///< current state
140 	int64_t			m_tmStamp;							///< timestamp when we entered the current state
141 
142 	int				m_dSwitches [ SPH_QSTATE_TOTAL+1 ];	///< number of switches to given state
143 	int64_t			m_tmTotal [ SPH_QSTATE_TOTAL+1 ];	///< total time spent per state
144 
145 	CSphStringBuilder	m_sTransformedTree;					///< transformed query tree
146 
147 public:
148 	/// create empty and stopped profile
CSphQueryProfile()149 	CSphQueryProfile()
150 	{
151 		Start ( SPH_QSTATE_TOTAL );
152 	}
153 
154 	/// switch to a new query state, and record a timestamp
155 	/// returns previous state, to simplify Push/Pop like scenarios
Switch(ESphQueryState eNew)156 	ESphQueryState Switch ( ESphQueryState eNew )
157 	{
158 		int64_t tmNow = sphMicroTimer();
159 		ESphQueryState eOld = m_eState;
160 		m_dSwitches [ eOld ]++;
161 		m_tmTotal [ eOld ] += tmNow - m_tmStamp;
162 		m_eState = eNew;
163 		m_tmStamp = tmNow;
164 		return eOld;
165 	}
166 
167 	/// reset everything and start profiling from a given state
Start(ESphQueryState eNew)168 	void Start ( ESphQueryState eNew )
169 	{
170 		memset ( m_dSwitches, 0, sizeof(m_dSwitches) );
171 		memset ( m_tmTotal, 0, sizeof(m_tmTotal) );
172 		m_eState = eNew;
173 		m_tmStamp = sphMicroTimer();
174 	}
175 
176 	/// stop profiling
Stop()177 	void Stop()
178 	{
179 		Switch ( SPH_QSTATE_TOTAL );
180 	}
181 };
182 
183 
184 /// file writer with write buffering and int encoder
185 class CSphWriter : ISphNoncopyable
186 {
187 public:
188 					CSphWriter ();
189 	virtual			~CSphWriter ();
190 
191 	void			SetBufferSize ( int iBufferSize );	///< tune write cache size; must be called before OpenFile() or SetFile()
192 
193 	bool			OpenFile ( const CSphString & sName, CSphString & sError );
194 	void			SetFile ( CSphAutofile & tAuto, SphOffset_t * pSharedOffset, CSphString & sError );
195 	void			CloseFile ( bool bTruncate = false );	///< note: calls Flush(), ie. IsError() might get true after this call
196 	void			UnlinkFile (); /// some shit happened (outside) and the file is no more actual.
197 
198 	void			PutByte ( int uValue );
199 	void			PutBytes ( const void * pData, int64_t iSize );
PutDword(DWORD uValue)200 	void			PutDword ( DWORD uValue ) { PutBytes ( &uValue, sizeof(DWORD) ); }
PutOffset(SphOffset_t uValue)201 	void			PutOffset ( SphOffset_t uValue ) { PutBytes ( &uValue, sizeof(SphOffset_t) ); }
202 	void			PutString ( const char * szString );
203 	void			PutString ( const CSphString & sString );
204 	void			Tag ( const char * sTag );
205 
206 	void			SeekTo ( SphOffset_t pos ); ///< seeking inside the buffer will truncate it
207 
208 #if USE_64BIT
PutDocid(SphDocID_t uValue)209 	void			PutDocid ( SphDocID_t uValue ) { PutOffset ( uValue ); }
210 #else
PutDocid(SphDocID_t uValue)211 	void			PutDocid ( SphDocID_t uValue ) { PutDword ( uValue ); }
212 #endif
213 
214 	void			ZipInt ( DWORD uValue );
215 	void			ZipOffset ( uint64_t uValue );
216 	void			ZipOffsets ( CSphVector<SphOffset_t> * pData );
217 
IsError()218 	bool			IsError () const	{ return m_bError; }
GetPos()219 	SphOffset_t		GetPos () const		{ return m_iPos; }
SetThrottle(ThrottleState_t * pState)220 	void			SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }
221 
222 protected:
223 	CSphString		m_sName;
224 	SphOffset_t		m_iPos;
225 	SphOffset_t		m_iWritten;
226 
227 	int				m_iFD;
228 	int				m_iPoolUsed;
229 	BYTE *			m_pBuffer;
230 	BYTE *			m_pPool;
231 	bool			m_bOwnFile;
232 	SphOffset_t	*	m_pSharedOffset;
233 	int				m_iBufferSize;
234 
235 	bool			m_bError;
236 	CSphString *	m_pError;
237 	ThrottleState_t * m_pThrottle;
238 
239 	virtual void	Flush ();
240 };
241 
242 
243 /// file which closes automatically when going out of scope
244 class CSphAutofile : ISphNoncopyable
245 {
246 protected:
247 	int			m_iFD;			///< my file descriptor
248 	CSphString	m_sFilename;	///< my file name
249 	bool		m_bTemporary;	///< whether to unlink this file on Close()
250 	bool		m_bWouldTemporary; ///< backup of the m_bTemporary
251 
252 	CSphIndexProgress *					m_pStat;
253 
254 public:
255 					CSphAutofile ();
256 					CSphAutofile ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp=false );
257 					~CSphAutofile ();
258 
259 	int				Open ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp=false );
260 	void			Close ();
261 	void			SetTemporary(); ///< would be set if a shit happened and the file is not actual.
262 
263 public:
GetFD()264 	int				GetFD () const { return m_iFD; }
265 	const char *	GetFilename () const;
266 	SphOffset_t		GetSize ( SphOffset_t iMinSize, bool bCheckSizeT, CSphString & sError );
267 	SphOffset_t		GetSize ();
268 
269 	bool			Read ( void * pBuf, int64_t iCount, CSphString & sError );
270 	void			SetProgressCallback ( CSphIndexProgress * pStat );
271 };
272 
273 
274 /// file reader with read buffering and int decoder
275 class CSphReader
276 {
277 public:
278 	CSphQueryProfile *	m_pProfile;
279 	ESphQueryState		m_eProfileState;
280 
281 public:
282 	CSphReader ( BYTE * pBuf=NULL, int iSize=0 );
283 	virtual		~CSphReader ();
284 
285 	void		SetBuffers ( int iReadBuffer, int iReadUnhinted );
286 	void		SetFile ( int iFD, const char * sFilename );
287 	void		SetFile ( const CSphAutofile & tFile );
288 	void		Reset ();
289 	void		SeekTo ( SphOffset_t iPos, int iSizeHint );
290 
291 	void		SkipBytes ( int iCount );
GetPos()292 	SphOffset_t	GetPos () const { return m_iPos+m_iBuffPos; }
293 
294 	void		GetBytes ( void * pData, int iSize );
295 	int			GetBytesZerocopy ( const BYTE ** ppData, int iMax ); ///< zerocopy method; returns actual length present in buffer (upto iMax)
296 
297 	int			GetByte ();
298 	DWORD		GetDword ();
299 	SphOffset_t	GetOffset ();
300 	CSphString	GetString ();
301 	int			GetLine ( char * sBuffer, int iMaxLen );
302 	bool		Tag ( const char * sTag );
303 
304 	DWORD		UnzipInt ();
305 	uint64_t	UnzipOffset ();
306 
GetErrorFlag()307 	bool					GetErrorFlag () const		{ return m_bError; }
GetErrorMessage()308 	const CSphString &		GetErrorMessage () const	{ return m_sError; }
GetFilename()309 	const CSphString &		GetFilename() const			{ return m_sFilename; }
310 	void					ResetError();
311 
312 #if USE_64BIT
GetDocid()313 	SphDocID_t	GetDocid ()		{ return GetOffset(); }
UnzipDocid()314 	SphDocID_t	UnzipDocid ()	{ return UnzipOffset(); }
UnzipWordid()315 	SphWordID_t	UnzipWordid ()	{ return UnzipOffset(); }
316 #else
GetDocid()317 	SphDocID_t	GetDocid ()		{ return GetDword(); }
UnzipDocid()318 	SphDocID_t	UnzipDocid ()	{ return UnzipInt(); }
UnzipWordid()319 	SphWordID_t	UnzipWordid ()	{ return UnzipInt(); }
320 #endif
321 
322 	const CSphReader &	operator = ( const CSphReader & rhs );
SetThrottle(ThrottleState_t * pState)323 	void		SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }
324 
325 protected:
326 
327 	int			m_iFD;
328 	SphOffset_t	m_iPos;
329 
330 	int			m_iBuffPos;
331 	int			m_iBuffUsed;
332 	BYTE *		m_pBuff;
333 	int			m_iSizeHint;	///< how much do we expect to read
334 
335 	int			m_iBufSize;
336 	bool		m_bBufOwned;
337 	int			m_iReadUnhinted;
338 
339 	bool		m_bError;
340 	CSphString	m_sError;
341 	CSphString	m_sFilename;
342 	ThrottleState_t * m_pThrottle;
343 
344 protected:
345 	virtual void		UpdateCache ();
346 };
347 
348 
349 /// scoped reader
350 class CSphAutoreader : public CSphReader
351 {
352 public:
CSphReader(pBuf,iSize)353 				CSphAutoreader ( BYTE * pBuf=NULL, int iSize=0 ) : CSphReader ( pBuf, iSize ) {}
354 				~CSphAutoreader ();
355 
356 	bool		Open ( const CSphString & sFilename, CSphString & sError );
357 	void		Close ();
358 	SphOffset_t	GetFilesize ();
359 
360 public:
361 	// added for DebugCheck()
GetFD()362 	int			GetFD () { return m_iFD; }
363 };
364 
365 
366 //////////////////////////////////////////////////////////////////////////
367 
/// request ids for the generic COM-like ISphExtra interface
enum ExtraData_e
{
	// data getters
	EXTRA_GET_DATA_ZONESPANS,
	EXTRA_GET_DATA_ZONESPANLIST,
	EXTRA_GET_DATA_RANKFACTORS,
	EXTRA_GET_DATA_PACKEDFACTORS,
	EXTRA_GET_DATA_RANKER_STATE,

	// queue getters
	EXTRA_GET_QUEUE_WORST,
	EXTRA_GET_QUEUE_SORTVAL,

	// setters
	EXTRA_SET_MVAPOOL,
	EXTRA_SET_STRINGPOOL,
	EXTRA_SET_POOL_CAPACITY,
	EXTRA_SET_MATCHPUSHED,
	EXTRA_SET_MATCHPOPPED,

	// ranker plugin setters
	EXTRA_SET_RANKER_PLUGIN,
	EXTRA_SET_RANKER_PLUGIN_OPTS,

	EXTRA_GET_POOL_SIZE
};
391 
392 /// generic COM-like interface
393 class ISphExtra
394 {
395 public:
~ISphExtra()396 	virtual						~ISphExtra () {}
ExtraData(ExtraData_e eType,void ** ppData)397 	inline bool					ExtraData	( ExtraData_e eType, void** ppData )
398 	{
399 		return ExtraDataImpl ( eType, ppData );
400 	}
401 private:
ExtraDataImpl(ExtraData_e,void **)402 	virtual bool ExtraDataImpl ( ExtraData_e, void** )
403 	{
404 		return false;
405 	}
406 };
407 
408 
// forward declarations; only pointers to these types are needed below
class UservarIntSet_c;
class ISphMatchSorter;
class ISphRanker;
412 
413 
414 /// per-query search context
415 /// everything that index needs to compute/create to process the query
416 class CSphQueryContext
417 {
418 public:
419 	// searching-only, per-query
420 	int							m_iWeights;						///< search query field weights count
421 	int							m_dWeights [ SPH_MAX_FIELDS ];	///< search query field weights
422 
423 	bool						m_bLookupFilter;				///< row data lookup required at filtering stage
424 	bool						m_bLookupSort;					///< row data lookup required at sorting stage
425 
426 	DWORD						m_uPackedFactorFlags;			///< whether we need to calculate packed factors (and some extra options)
427 
428 	ISphFilter *				m_pFilter;
429 	ISphFilter *				m_pWeightFilter;
430 
431 	struct CalcItem_t
432 	{
433 		CSphAttrLocator			m_tLoc;					///< result locator
434 		ESphAttr				m_eType;				///< result type
435 		ISphExpr *				m_pExpr;				///< evaluator (non-owned)
436 	};
437 	CSphVector<CalcItem_t>		m_dCalcFilter;			///< items to compute for filtering
438 	CSphVector<CalcItem_t>		m_dCalcSort;			///< items to compute for sorting/grouping
439 	CSphVector<CalcItem_t>		m_dCalcFinal;			///< items to compute when finalizing result set
440 	CSphVector<CalcItem_t>		m_dCalcPostAggregate;	///< items to compute aggregate depended with finalized result set
441 
442 	const CSphVector<CSphAttrOverride> *	m_pOverrides;		///< overridden attribute values
443 	CSphVector<CSphAttrLocator>				m_dOverrideIn;
444 	CSphVector<CSphAttrLocator>				m_dOverrideOut;
445 
446 	const void *							m_pIndexData;			///< backend specific data
447 	CSphQueryProfile *						m_pProfile;
448 	const SmallStringHash_T<int64_t> *		m_pLocalDocs;
449 	int64_t									m_iTotalDocs;
450 	int64_t									m_iBadRows;
451 
452 public:
453 	CSphQueryContext ();
454 	~CSphQueryContext ();
455 
456 	void						BindWeights ( const CSphQuery * pQuery, const CSphSchema & tSchema, CSphString & sWarning );
457 	bool						SetupCalc ( CSphQueryResult * pResult, const ISphSchema & tInSchema, const CSphSchema & tSchema, const DWORD * pMvaPool, bool bArenaProhibit, bool bExtractPostAggr );
458 	bool						CreateFilters ( bool bFullscan, const CSphVector<CSphFilterSettings> * pdFilters, const ISphSchema & tSchema, const DWORD * pMvaPool, const BYTE * pStrings, CSphString & sError, ESphCollation eCollation, bool bArenaProhibit, const KillListVector & dKillList );
459 	bool						SetupOverrides ( const CSphQuery * pQuery, CSphQueryResult * pResult, const CSphSchema & tIndexSchema, const ISphSchema & tOutgoingSchema );
460 
461 	void						CalcFilter ( CSphMatch & tMatch ) const;
462 	void						CalcSort ( CSphMatch & tMatch ) const;
463 	void						CalcFinal ( CSphMatch & tMatch ) const;
464 	void						CalcPostAggregate ( CSphMatch & tMatch ) const;
465 
466 	void						FreeStrFilter ( CSphMatch & tMatch ) const;
467 	void						FreeStrSort ( CSphMatch & tMatch ) const;
468 	void						FreeStrFinal ( CSphMatch & tMatch ) const;
469 
470 	// note that RT index bind pools at segment searching, not at time it setups context
471 	void						ExprCommand ( ESphExprCommand eCmd, void * pArg );
472 	void						SetStringPool ( const BYTE * pStrings );
473 	void						SetMVAPool ( const DWORD * pMva, bool bArenaProhibit );
474 	void						SetupExtraData ( ISphRanker * pRanker, ISphMatchSorter * pSorter );
475 
476 private:
477 	CSphVector<const UservarIntSet_c*>		m_dUserVals;
478 };
479 
480 //////////////////////////////////////////////////////////////////////////
481 // MEMORY TRACKER
482 //////////////////////////////////////////////////////////////////////////
483 
// master list of memory categories
// users must define MEM_CATEGORY(_arg) before expanding this list
#define MEM_CATEGORIES \
	MEM_CATEGORY ( MEM_CORE ), \
	MEM_CATEGORY ( MEM_INDEX_DISK ), \
	MEM_CATEGORY ( MEM_INDEX_RT ), \
	MEM_CATEGORY ( MEM_API_HANDLE ), \
	MEM_CATEGORY ( MEM_API_SEARCH ), \
	MEM_CATEGORY ( MEM_API_QUERY ), \
	MEM_CATEGORY ( MEM_RT_ACCUM ), \
	MEM_CATEGORY ( MEM_MMAPED ), \
	MEM_CATEGORY ( MEM_BINLOG ), \
	MEM_CATEGORY ( MEM_SQL_HANDLE ), \
	MEM_CATEGORY ( MEM_SQL_INSERT ), \
	MEM_CATEGORY ( MEM_SQL_SELECT ), \
	MEM_CATEGORY ( MEM_SQL_DELETE ), \
	MEM_CATEGORY ( MEM_SQL_SET ), \
	MEM_CATEGORY ( MEM_SQL_BEGIN ), \
	MEM_CATEGORY ( MEM_SQL_COMMIT ), \
	MEM_CATEGORY ( MEM_SQL_ALTER ), \
	MEM_CATEGORY ( MEM_DISK_QUERY ), \
	MEM_CATEGORY ( MEM_DISK_QUERYEX ), \
	MEM_CATEGORY ( MEM_RT_QUERY ), \
	MEM_CATEGORY ( MEM_RT_RES_MATCHES ), \
	MEM_CATEGORY ( MEM_RT_RES_STRINGS )

/// memory categories enum, generated from the list above
/// MEM_TOTAL comes last, and thus equals the number of categories
#define MEM_CATEGORY(_id) _id
enum MemCategory_e
{
	MEM_CATEGORIES,
	MEM_TOTAL
};
#undef MEM_CATEGORY
515 
#if SPH_ALLOCS_PROFILER

void sphMemStatPush ( MemCategory_e eCategory );
void sphMemStatPop ( MemCategory_e eCategory );

/// scoped memory tracker
/// calls sphMemStatPush() for its category on construction, and sphMemStatPop() on destruction
struct MemTracker_c : ISphNoncopyable
{
	const MemCategory_e m_eCategory; ///< category

	/// ctor, pushes the category
	explicit MemTracker_c ( MemCategory_e eCategory )
		: m_eCategory ( eCategory )
	{
		sphMemStatPush ( m_eCategory );
	}

	/// dtor, pops the category
	~MemTracker_c ()
	{
		sphMemStatPop ( m_eCategory );
	}
};

// two-step paste helper
// operands of ## are not macro-expanded, so pasting __LINE__ directly would yield
// the literal "__LINE__" token (and the same variable name on every line);
// the extra indirection lets __LINE__ expand into the actual line number first
#define MEMORY_PASTE3_(_a,_b,_c) _a##_b##_c
#define MEMORY_PASTE3(_a,_b,_c) MEMORY_PASTE3_ ( _a, _b, _c )

/// declare a scoped tracker for a given category; the variable name is unique per line and category
/// (the trailing semicolon is kept, so existing "MEMORY ( X );" call sites work as before)
#define MEMORY(name) MemTracker_c MEMORY_PASTE3 ( tracker_, __LINE__, name ) ( name );

#else // SPH_ALLOCS_PROFILER 0

/// profiler is disabled, trackers compile to nothing
#define MEMORY(name)

#endif // if SPH_ALLOCS_PROFILER
547 
548 //////////////////////////////////////////////////////////////////////////
549 // BLOCK-LEVEL ATTRIBUTE INDEX BUILDER
550 //////////////////////////////////////////////////////////////////////////
551 
#define DOCINFO_INDEX_FREQ 128 // rows per min/max block; FIXME? make this configurable
#define SPH_SKIPLIST_BLOCK 128 ///< skiplist block size; must be a power of two
554 
/// combine two consecutive DWORDs into a 64-bit MVA value
/// pMva[0] supplies the low 32 bits, pMva[1] supplies the high 32 bits
inline int64_t MVA_UPSIZE ( const DWORD * pMva )
{
	const uint64_t uLow = (uint64_t)pMva[0];
	const uint64_t uHigh = (uint64_t)pMva[1];
	return (int64_t)( uLow | ( uHigh<<32 ) );
}
560 
561 
562 // FIXME!!! for over INT_MAX attributes
563 /// attr min-max builder
564 template < typename DOCID = SphDocID_t >
565 class AttrIndexBuilder_t : ISphNoncopyable
566 {
567 private:
568 	CSphVector<CSphAttrLocator>	m_dIntAttrs;
569 	CSphVector<CSphAttrLocator>	m_dFloatAttrs;
570 	CSphVector<CSphAttrLocator>	m_dMvaAttrs;
571 	CSphVector<SphAttr_t>		m_dIntMin;
572 	CSphVector<SphAttr_t>		m_dIntMax;
573 	CSphVector<SphAttr_t>		m_dIntIndexMin;
574 	CSphVector<SphAttr_t>		m_dIntIndexMax;
575 	CSphVector<float>			m_dFloatMin;
576 	CSphVector<float>			m_dFloatMax;
577 	CSphVector<float>			m_dFloatIndexMin;
578 	CSphVector<float>			m_dFloatIndexMax;
579 	CSphVector<int64_t>			m_dMvaMin;
580 	CSphVector<int64_t>			m_dMvaMax;
581 	CSphVector<int64_t>			m_dMvaIndexMin;
582 	CSphVector<int64_t>			m_dMvaIndexMax;
583 	DWORD						m_uStride;		// size of attribute's chunk (in DWORDs)
584 	DWORD						m_uElements;	// counts total number of collected min/max pairs
585 	int							m_iLoop;		// loop inside one set
586 	DWORD *						m_pOutBuffer;	// storage for collected min/max
587 	DWORD *						m_pOutMax;		// storage max for bound checking
588 	DOCID						m_uStart;		// first and last docids of current chunk
589 	DOCID						m_uLast;
590 	DOCID						m_uIndexStart;	// first and last docids of whole index
591 	DOCID						m_uIndexLast;
592 	int							m_iMva64;
593 
594 private:
595 	void ResetLocal();
596 	void FlushComputed();
597 	void UpdateMinMaxDocids ( DOCID uDocID );
598 	void CollectRowMVA ( int iAttr, DWORD uCount, const DWORD * pMva );
599 	void CollectWithoutMvas ( const DWORD * pCur );
600 
601 public:
602 	explicit AttrIndexBuilder_t ( const CSphSchema & tSchema );
603 
604 	void Prepare ( DWORD * pOutBuffer, DWORD * pOutMax );
605 
606 	bool Collect ( const DWORD * pCur, const DWORD * pMvas, int64_t iMvasCount, CSphString & sError, bool bHasMvaID );
607 
608 	void FinishCollect ();
609 
610 	/// actually used part of output buffer, only used with index merge
611 	/// (we reserve space for rows from both indexes, but might kill some rows)
GetActualSize()612 	inline int64_t GetActualSize() const
613 	{
614 		return int64_t ( m_uElements ) * m_uStride * 2;
615 	}
616 
617 	/// how many DWORDs will we need for block index
GetExpectedSize(int64_t iMaxDocs)618 	inline int64_t GetExpectedSize ( int64_t iMaxDocs ) const
619 	{
620 		assert ( iMaxDocs>=0 );
621 		int64_t iDocinfoIndex = ( iMaxDocs + DOCINFO_INDEX_FREQ - 1 ) / DOCINFO_INDEX_FREQ;
622 		return ( iDocinfoIndex + 1 ) * m_uStride * 2;
623 	}
624 };
625 
626 typedef AttrIndexBuilder_t<> AttrIndexBuilder_c;
627 
// fallbacks for build systems that do not provide the long long limits
// LLONG_MAX must be a signed value: if it were left unsigned, then LLONG_MIN
// (which is derived from it) would evaluate in unsigned arithmetic,
// and would never compare as less than zero
#ifndef LLONG_MAX
#define LLONG_MAX ((long long)(((unsigned long long)(-1))>>1))
#endif

#ifndef LLONG_MIN
#define LLONG_MIN (-LLONG_MAX-1)
#endif

#ifndef ULLONG_MAX
#define ULLONG_MAX	(LLONG_MAX * 2ULL + 1)
#endif
640 
641 
642 template < typename DOCID >
ResetLocal()643 void AttrIndexBuilder_t<DOCID>::ResetLocal()
644 {
645 	ARRAY_FOREACH ( i, m_dIntMin )
646 	{
647 		m_dIntMin[i] = LLONG_MAX;
648 		m_dIntMax[i] = 0;
649 	}
650 	ARRAY_FOREACH ( i, m_dFloatMin )
651 	{
652 		m_dFloatMin[i] = FLT_MAX;
653 		m_dFloatMax[i] = -FLT_MAX;
654 	}
655 	ARRAY_FOREACH ( i, m_dMvaMin )
656 	{
657 		m_dMvaMin[i] = LLONG_MAX;
658 		m_dMvaMax[i] = ( i>=m_iMva64 ? LLONG_MIN : 0 );
659 	}
660 	m_uStart = m_uLast = 0;
661 	m_iLoop = 0;
662 }
663 
664 template < typename DOCID >
FlushComputed()665 void AttrIndexBuilder_t<DOCID>::FlushComputed ()
666 {
667 	assert ( m_pOutBuffer );
668 	DWORD * pMinEntry = m_pOutBuffer + 2 * m_uElements * m_uStride;
669 	DWORD * pMaxEntry = pMinEntry + m_uStride;
670 	CSphRowitem * pMinAttrs = DOCINFO2ATTRS_T<DOCID> ( pMinEntry );
671 	CSphRowitem * pMaxAttrs = pMinAttrs + m_uStride;
672 
673 	assert ( pMaxEntry+m_uStride<=m_pOutMax );
674 	assert ( pMaxAttrs+m_uStride-DOCINFO_IDSIZE<=m_pOutMax );
675 
676 	m_uIndexLast = m_uLast;
677 
678 	DOCINFOSETID ( pMinEntry, m_uStart );
679 	DOCINFOSETID ( pMaxEntry, m_uLast );
680 
681 	ARRAY_FOREACH ( i, m_dIntAttrs )
682 	{
683 		m_dIntIndexMin[i] = Min ( m_dIntIndexMin[i], m_dIntMin[i] );
684 		m_dIntIndexMax[i] = Max ( m_dIntIndexMax[i], m_dIntMax[i] );
685 		sphSetRowAttr ( pMinAttrs, m_dIntAttrs[i], m_dIntMin[i] );
686 		sphSetRowAttr ( pMaxAttrs, m_dIntAttrs[i], m_dIntMax[i] );
687 	}
688 	ARRAY_FOREACH ( i, m_dFloatAttrs )
689 	{
690 		m_dFloatIndexMin[i] = Min ( m_dFloatIndexMin[i], m_dFloatMin[i] );
691 		m_dFloatIndexMax[i] = Max ( m_dFloatIndexMax[i], m_dFloatMax[i] );
692 		sphSetRowAttr ( pMinAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatMin[i] ) );
693 		sphSetRowAttr ( pMaxAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatMax[i] ) );
694 	}
695 
696 	ARRAY_FOREACH ( i, m_dMvaAttrs )
697 	{
698 		m_dMvaIndexMin[i] = Min ( m_dMvaIndexMin[i], m_dMvaMin[i] );
699 		m_dMvaIndexMax[i] = Max ( m_dMvaIndexMax[i], m_dMvaMax[i] );
700 		sphSetRowAttr ( pMinAttrs, m_dMvaAttrs[i], m_dMvaMin[i] );
701 		sphSetRowAttr ( pMaxAttrs, m_dMvaAttrs[i], m_dMvaMax[i] );
702 	}
703 
704 	m_uElements++;
705 	ResetLocal();
706 }
707 
708 template < typename DOCID >
UpdateMinMaxDocids(DOCID uDocID)709 void AttrIndexBuilder_t<DOCID>::UpdateMinMaxDocids ( DOCID uDocID )
710 {
711 	if ( !m_uStart )
712 		m_uStart = uDocID;
713 	if ( !m_uIndexStart )
714 		m_uIndexStart = uDocID;
715 	m_uLast = uDocID;
716 }
717 
718 template < typename DOCID >
AttrIndexBuilder_t(const CSphSchema & tSchema)719 AttrIndexBuilder_t<DOCID>::AttrIndexBuilder_t ( const CSphSchema & tSchema )
720 	: m_uStride ( DWSIZEOF(DOCID) + tSchema.GetRowSize() )
721 	, m_uElements ( 0 )
722 	, m_iLoop ( 0 )
723 	, m_pOutBuffer ( NULL )
724 	, m_pOutMax ( NULL )
725 	, m_uStart ( 0 )
726 	, m_uLast ( 0 )
727 	, m_uIndexStart ( 0 )
728 	, m_uIndexLast ( 0 )
729 {
730 	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
731 	{
732 		const CSphColumnInfo & tCol = tSchema.GetAttr(i);
733 		switch ( tCol.m_eAttrType )
734 		{
735 		case SPH_ATTR_INTEGER:
736 		case SPH_ATTR_TIMESTAMP:
737 		case SPH_ATTR_BOOL:
738 		case SPH_ATTR_BIGINT:
739 		case SPH_ATTR_TOKENCOUNT:
740 			m_dIntAttrs.Add ( tCol.m_tLocator );
741 			break;
742 
743 		case SPH_ATTR_FLOAT:
744 			m_dFloatAttrs.Add ( tCol.m_tLocator );
745 			break;
746 
747 		case SPH_ATTR_UINT32SET:
748 			m_dMvaAttrs.Add ( tCol.m_tLocator );
749 			break;
750 
751 		default:
752 			break;
753 		}
754 	}
755 
756 	m_iMva64 = m_dMvaAttrs.GetLength();
757 	for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
758 	{
759 		const CSphColumnInfo & tCol = tSchema.GetAttr(i);
760 		if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
761 			m_dMvaAttrs.Add ( tCol.m_tLocator );
762 	}
763 
764 
765 	m_dIntMin.Resize ( m_dIntAttrs.GetLength() );
766 	m_dIntMax.Resize ( m_dIntAttrs.GetLength() );
767 	m_dIntIndexMin.Resize ( m_dIntAttrs.GetLength() );
768 	m_dIntIndexMax.Resize ( m_dIntAttrs.GetLength() );
769 	m_dFloatMin.Resize ( m_dFloatAttrs.GetLength() );
770 	m_dFloatMax.Resize ( m_dFloatAttrs.GetLength() );
771 	m_dFloatIndexMin.Resize ( m_dFloatAttrs.GetLength() );
772 	m_dFloatIndexMax.Resize ( m_dFloatAttrs.GetLength() );
773 	m_dMvaMin.Resize ( m_dMvaAttrs.GetLength() );
774 	m_dMvaMax.Resize ( m_dMvaAttrs.GetLength() );
775 	m_dMvaIndexMin.Resize ( m_dMvaAttrs.GetLength() );
776 	m_dMvaIndexMax.Resize ( m_dMvaAttrs.GetLength() );
777 }
778 
779 template < typename DOCID >
Prepare(DWORD * pOutBuffer,DWORD * pOutMax)780 void AttrIndexBuilder_t<DOCID>::Prepare ( DWORD * pOutBuffer, DWORD * pOutMax )
781 {
782 	m_pOutBuffer = pOutBuffer;
783 	m_pOutMax = pOutMax;
784 	memset ( pOutBuffer, 0, ( pOutMax-pOutBuffer )*sizeof(DWORD) );
785 
786 	m_uElements = 0;
787 	m_uIndexStart = m_uIndexLast = 0;
788 	ARRAY_FOREACH ( i, m_dIntIndexMin )
789 	{
790 		m_dIntIndexMin[i] = LLONG_MAX;
791 		m_dIntIndexMax[i] = 0;
792 	}
793 	ARRAY_FOREACH ( i, m_dFloatIndexMin )
794 	{
795 		m_dFloatIndexMin[i] = FLT_MAX;
796 		m_dFloatIndexMax[i] = -FLT_MAX;
797 	}
798 	ARRAY_FOREACH ( i, m_dMvaIndexMin )
799 	{
800 		m_dMvaIndexMin[i] = LLONG_MAX;
801 		m_dMvaIndexMax[i] = ( i>=m_iMva64 ? LLONG_MIN : 0 );
802 	}
803 	ResetLocal();
804 }
805 
806 template < typename DOCID >
CollectWithoutMvas(const DWORD * pCur)807 void AttrIndexBuilder_t<DOCID>::CollectWithoutMvas ( const DWORD * pCur )
808 {
809 	// check if it is time to flush already collected values
810 	if ( m_iLoop>=DOCINFO_INDEX_FREQ )
811 		FlushComputed ();
812 
813 	const DWORD * pRow = DOCINFO2ATTRS_T<DOCID>(pCur);
814 	UpdateMinMaxDocids ( DOCINFO2ID_T<DOCID>(pCur) );
815 	m_iLoop++;
816 
817 	// ints
818 	ARRAY_FOREACH ( i, m_dIntAttrs )
819 	{
820 		SphAttr_t uVal = sphGetRowAttr ( pRow, m_dIntAttrs[i] );
821 		m_dIntMin[i] = Min ( m_dIntMin[i], uVal );
822 		m_dIntMax[i] = Max ( m_dIntMax[i], uVal );
823 	}
824 
825 	// floats
826 	ARRAY_FOREACH ( i, m_dFloatAttrs )
827 	{
828 		float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pRow, m_dFloatAttrs[i] ) );
829 		m_dFloatMin[i] = Min ( m_dFloatMin[i], fVal );
830 		m_dFloatMax[i] = Max ( m_dFloatMax[i], fVal );
831 	}
832 }
833 
834 template < typename DOCID >
CollectRowMVA(int iAttr,DWORD uCount,const DWORD * pMva)835 void AttrIndexBuilder_t<DOCID>::CollectRowMVA ( int iAttr, DWORD uCount, const DWORD * pMva )
836 {
837 	if ( iAttr>=m_iMva64 )
838 	{
839 		assert ( ( uCount%2 )==0 );
840 		for ( ; uCount>0; uCount-=2, pMva+=2 )
841 		{
842 			int64_t iVal = MVA_UPSIZE ( pMva );
843 			m_dMvaMin[iAttr] = Min ( m_dMvaMin[iAttr], iVal );
844 			m_dMvaMax[iAttr] = Max ( m_dMvaMax[iAttr], iVal );
845 		}
846 	} else
847 	{
848 		for ( ; uCount>0; uCount--, pMva++ )
849 		{
850 			DWORD uVal = *pMva;
851 			m_dMvaMin[iAttr] = Min ( m_dMvaMin[iAttr], uVal );
852 			m_dMvaMax[iAttr] = Max ( m_dMvaMax[iAttr], uVal );
853 		}
854 	}
855 }
856 
857 template < typename DOCID >
Collect(const DWORD * pCur,const DWORD * pMvas,int64_t iMvasCount,CSphString & sError,bool bHasMvaID)858 bool AttrIndexBuilder_t<DOCID>::Collect ( const DWORD * pCur, const DWORD * pMvas, int64_t iMvasCount, CSphString & sError, bool bHasMvaID )
859 {
860 	CollectWithoutMvas ( pCur );
861 
862 	const DWORD * pRow = DOCINFO2ATTRS_T<DOCID>(pCur);
863 	SphDocID_t uDocID = DOCINFO2ID_T<DOCID>(pCur);
864 
865 	// MVAs
866 	ARRAY_FOREACH ( i, m_dMvaAttrs )
867 	{
868 		SphAttr_t uOff = sphGetRowAttr ( pRow, m_dMvaAttrs[i] );
869 		if ( !uOff )
870 			continue;
871 
872 		// sanity checks
873 		if ( uOff>=iMvasCount )
874 		{
875 			sError.SetSprintf ( "broken index: mva offset out of bounds, id=" DOCID_FMT, (SphDocID_t)uDocID );
876 			return false;
877 		}
878 
879 		const DWORD * pMva = pMvas + uOff; // don't care about updates at this point
880 
881 		if ( bHasMvaID && i==0 && DOCINFO2ID_T<DOCID> ( pMva-DWSIZEOF(DOCID) )!=uDocID )
882 		{
883 			sError.SetSprintf ( "broken index: mva docid verification failed, id=" DOCID_FMT, (SphDocID_t)uDocID );
884 			return false;
885 		}
886 
887 		DWORD uCount = *pMva++;
888 		if ( ( uOff+uCount>=iMvasCount ) || ( i>=m_iMva64 && ( uCount%2 )!=0 ) )
889 		{
890 			sError.SetSprintf ( "broken index: mva list out of bounds, id=" DOCID_FMT, (SphDocID_t)uDocID );
891 			return false;
892 		}
893 
894 		// walk and calc
895 		CollectRowMVA ( i, uCount, pMva );
896 	}
897 	return true;
898 }
899 
900 template < typename DOCID >
FinishCollect()901 void AttrIndexBuilder_t<DOCID>::FinishCollect ()
902 {
903 	assert ( m_pOutBuffer );
904 	if ( m_iLoop )
905 		FlushComputed ();
906 
907 	DWORD * pMinEntry = m_pOutBuffer + 2 * m_uElements * m_uStride;
908 	DWORD * pMaxEntry = pMinEntry + m_uStride;
909 	CSphRowitem * pMinAttrs = DOCINFO2ATTRS_T<DOCID> ( pMinEntry );
910 	CSphRowitem * pMaxAttrs = pMinAttrs + m_uStride;
911 
912 	assert ( pMaxEntry+m_uStride<=m_pOutMax );
913 	assert ( pMaxAttrs+m_uStride-DWSIZEOF(DOCID)<=m_pOutMax );
914 
915 	DOCINFOSETID ( pMinEntry, m_uIndexStart );
916 	DOCINFOSETID ( pMaxEntry, m_uIndexLast );
917 
918 	ARRAY_FOREACH ( i, m_dMvaAttrs )
919 	{
920 		sphSetRowAttr ( pMinAttrs, m_dMvaAttrs[i], m_dMvaIndexMin[i] );
921 		sphSetRowAttr ( pMaxAttrs, m_dMvaAttrs[i], m_dMvaIndexMax[i] );
922 	}
923 
924 	ARRAY_FOREACH ( i, m_dIntAttrs )
925 	{
926 		sphSetRowAttr ( pMinAttrs, m_dIntAttrs[i], m_dIntIndexMin[i] );
927 		sphSetRowAttr ( pMaxAttrs, m_dIntAttrs[i], m_dIntIndexMax[i] );
928 	}
929 	ARRAY_FOREACH ( i, m_dFloatAttrs )
930 	{
931 		sphSetRowAttr ( pMinAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatIndexMin[i] ) );
932 		sphSetRowAttr ( pMaxAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatIndexMax[i] ) );
933 	}
934 	m_uElements++;
935 }
936 
937 struct PoolPtrs_t
938 {
939 	const DWORD *	m_pMva;
940 	const BYTE *	m_pStrings;
941 	bool			m_bArenaProhibit;
942 
PoolPtrs_tPoolPtrs_t943 	PoolPtrs_t ()
944 		: m_pMva ( NULL )
945 		, m_pStrings ( NULL )
946 		, m_bArenaProhibit ( false )
947 	{}
948 };
949 
950 class CSphFreeList
951 {
952 private:
953 	CSphTightVector<int>	m_dFree;
954 	int						m_iNextFree;
955 #ifndef NDEBUG
956 	int						m_iSize;
957 #endif
958 
959 public:
CSphFreeList()960 	CSphFreeList ()
961 		: m_iNextFree ( 0 )
962 #ifndef NDEBUG
963 		, m_iSize ( 0 )
964 #endif
965 	{}
966 
Reset(int iSize)967 	void Reset ( int iSize )
968 	{
969 #ifndef NDEBUG
970 		m_iSize = iSize;
971 #endif
972 		m_iNextFree = 0;
973 		m_dFree.Reserve ( iSize );
974 	}
975 
Get()976 	int Get ()
977 	{
978 		int iRes = -1;
979 		if ( m_dFree.GetLength () )
980 			iRes = m_dFree.Pop ();
981 		else
982 			iRes = m_iNextFree++;
983 		assert ( iRes>=0 && iRes<m_iSize );
984 		return iRes;
985 	}
986 
Free(int iIndex)987 	void Free ( int iIndex )
988 	{
989 		assert ( iIndex>=0 && iIndex<m_iSize );
990 		m_dFree.Add ( iIndex );
991 	}
992 };
993 
994 //////////////////////////////////////////////////////////////////////////
995 // INLINES, FIND_XXX() GENERIC FUNCTIONS
996 //////////////////////////////////////////////////////////////////////////
997 
998 /// find a value-enclosing span in a sorted vector (aka an index at which vec[i] <= val < vec[i+1])
999 template < typename T, typename U >
1000 static int FindSpan ( const CSphVector<T> & dVec, U tRef, int iSmallTreshold=8 )
1001 {
1002 	// empty vector
1003 	if ( !dVec.GetLength() )
1004 		return -1;
1005 
1006 	// check last semi-span
1007 	if ( dVec.Last()<tRef || dVec.Last()==tRef )
1008 		return dVec.GetLength()-1;
1009 
1010 	// linear search for small vectors
1011 	if ( dVec.GetLength()<=iSmallTreshold )
1012 	{
1013 		for ( int i=0; i<dVec.GetLength()-1; i++ )
1014 			if ( ( dVec[i]<tRef || dVec[i]==tRef ) && tRef<dVec[i+1] )
1015 				return i;
1016 		return -1;
1017 	}
1018 
1019 	// binary search for longer vectors
1020 	const T * pStart = dVec.Begin();
1021 	const T * pEnd = &dVec.Last();
1022 
1023 	if ( ( pStart[0]<tRef || pStart[0]==tRef ) && tRef<pStart[1] )
1024 		return 0;
1025 
1026 	if ( ( pEnd[-1]<tRef || pEnd[-1]==tRef ) && tRef<pEnd[0] )
1027 		return pEnd-dVec.Begin()-1;
1028 
1029 	while ( pEnd-pStart>1 )
1030 	{
1031 		if ( tRef<*pStart || *pEnd<tRef )
1032 			break;
1033 		assert ( *pStart<tRef );
1034 		assert ( tRef<*pEnd );
1035 
1036 		const T * pMid = pStart + (pEnd-pStart)/2;
1037 		assert ( pMid+1 < &dVec.Last() );
1038 
1039 		if ( ( pMid[0]<tRef || pMid[0]==tRef ) && tRef<pMid[1] )
1040 			return pMid - dVec.Begin();
1041 
1042 		if ( tRef<pMid[0] )
1043 			pEnd = pMid;
1044 		else
1045 			pStart = pMid;
1046 	}
1047 
1048 	return -1;
1049 }
1050 
1051 
FindBit(DWORD uValue)1052 inline int FindBit ( DWORD uValue )
1053 {
1054 	DWORD uMask = 0xffff;
1055 	int iIdx = 0;
1056 	int iBits = 16;
1057 
1058 	// we negate bits to compare with 0
1059 	// this makes MSVC emit 'test' instead of 'cmp'
1060 	uValue ^= 0xffffffff;
1061 	for ( int t=0; t<5; t++ )
1062 	{
1063 		if ( ( uValue & uMask )==0 )
1064 		{
1065 			iIdx += iBits;
1066 			uValue >>= iBits;
1067 		}
1068 		iBits >>= 1;
1069 		uMask >>= iBits;
1070 	}
1071 	return iIdx;
1072 }
1073 
1074 
sphEncodeVLB8(BYTE * buf,uint64_t v)1075 inline int sphEncodeVLB8 ( BYTE * buf, uint64_t v )
1076 {
1077 	BYTE b;
1078 	int n = 0;
1079 
1080 	do
1081 	{
1082 		b = (BYTE)(v & 0x7f);
1083 		v >>= 7;
1084 		if ( v )
1085 			b |= 0x80;
1086 		*buf++ = b;
1087 		n++;
1088 	} while ( v );
1089 	return n;
1090 }
1091 
1092 
/// decode a variable-length byte sequence (7 payload bits per byte, lowest bits first,
/// bit 0x80 flags that more bytes follow)
/// the decoded value is ADDED to uValue, which is not reset here
/// returns the pointer past the last byte consumed
inline const BYTE * spnDecodeVLB8 ( const BYTE * pIn, uint64_t & uValue )
{
	int iShift = 0;
	for ( ;; )
	{
		const BYTE uByte = *pIn++;
		uValue += ( (uint64_t)( uByte & 0x7f ) ) << iShift;

		// no continuation flag means that was the last byte
		if ( !( uByte & 0x80 ) )
			return pIn;

		iShift += 7;
	}
}
1107 
1108 //////////////////////////////////////////////////////////////////////////
1109 // INLINES, UTF-8 TOOLS
1110 //////////////////////////////////////////////////////////////////////////
1111 
/// max length of a single UTF-8 sequence, in bytes
#define SPH_MAX_UTF8_BYTES 4
1113 
1114 /// decode UTF-8 codepoint
1115 /// advances buffer ptr in all cases, including the end of buffer (ie. zero byte)!
1116 /// so eof MUST be handled, otherwise, you get OOB
1117 ///
1118 /// returns -1 on failure
1119 /// returns 0 on end of buffer
1120 /// returns codepoint on success
sphUTF8Decode(const BYTE * & pBuf)1121 inline int sphUTF8Decode ( const BYTE * & pBuf )
1122 {
1123 	BYTE v = *pBuf++;
1124 	if ( !v )
1125 		return 0;
1126 
1127 	// check for 7-bit case
1128 	if ( v<128 )
1129 		return v;
1130 
1131 	// get number of bytes
1132 	int iBytes = 0;
1133 	while ( v & 0x80 )
1134 	{
1135 		iBytes++;
1136 		v <<= 1;
1137 	}
1138 
1139 	// check for valid number of bytes
1140 	if ( iBytes<2 || iBytes>SPH_MAX_UTF8_BYTES )
1141 		return -1;
1142 
1143 	int iCode = ( v >> iBytes );
1144 	iBytes--;
1145 	do
1146 	{
1147 		if ( !(*pBuf) )
1148 			return 0; // unexpected eof
1149 
1150 		if ( ((*pBuf) & 0xC0)!=0x80 )
1151 			return -1; // invalid code
1152 
1153 		iCode = ( iCode<<6 ) + ( (*pBuf) & 0x3F );
1154 		iBytes--;
1155 		pBuf++;
1156 	} while ( iBytes );
1157 
1158 	// all good
1159 	return iCode;
1160 }
1161 
1162 
/// encode UTF-8 codepoint to buffer, macro version for the Really Critical places
/// writes 1 to 4 bytes at _ptr and advances _ptr past them
///
/// caveats:
/// - _code is evaluated several times, so it must not have side effects
/// - _ptr must be a modifiable pointer lvalue
/// - expands to a bare if/else chain (no enclosing block), so take care
///   when using it as the sole statement of an outer if/else
#define SPH_UTF8_ENCODE(_ptr,_code) \
	if ( (_code)<0x80 ) \
	{ \
		*_ptr++ = (BYTE)( (_code) & 0x7F ); \
	} else if ( (_code)<0x800 ) \
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>6) & 0x1F ) | 0xC0 ); \
		_ptr[1] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 2; \
	} else if ( (_code)<0x10000 )\
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>12) & 0x0F ) | 0xE0 ); \
		_ptr[1] = (BYTE)( ( ((_code)>>6) & 0x3F ) | 0x80 ); \
		_ptr[2] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 3; \
	} else \
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>18) & 0x0F ) | 0xF0 ); \
		_ptr[1] = (BYTE)( ( ((_code)>>12) & 0x3F ) | 0x80 ); \
		_ptr[2] = (BYTE)( ( ((_code)>>6) & 0x3F ) | 0x80 ); \
		_ptr[3] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 4; \
	}
1187 
1188 
1189 /// encode UTF-8 codepoint to buffer
1190 /// returns number of bytes used
sphUTF8Encode(BYTE * pBuf,int iCode)1191 inline int sphUTF8Encode ( BYTE * pBuf, int iCode )
1192 {
1193 	if ( iCode<0x80 )
1194 	{
1195 		pBuf[0] = (BYTE)( iCode & 0x7F );
1196 		return 1;
1197 	}
1198 
1199 	if ( iCode<0x800 )
1200 	{
1201 		pBuf[0] = (BYTE)( ( (iCode>>6) & 0x1F ) | 0xC0 );
1202 		pBuf[1] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1203 		return 2;
1204 	}
1205 
1206 	if ( iCode<0x10000 )
1207 	{
1208 		pBuf[0] = (BYTE)( ( (iCode>>12) & 0x0F ) | 0xE0 );
1209 		pBuf[1] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 );
1210 		pBuf[2] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1211 		return 3;
1212 	}
1213 
1214 	pBuf[0] = (BYTE)( ( (iCode>>18) & 0x0F ) | 0xF0 );
1215 	pBuf[1] = (BYTE)( ( (iCode>>12) & 0x3F ) | 0x80 );
1216 	pBuf[2] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 );
1217 	pBuf[3] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1218 	return 4;
1219 }
1220 
1221 
1222 /// compute UTF-8 string length in codepoints
sphUTF8Len(const char * pStr)1223 inline int sphUTF8Len ( const char * pStr )
1224 {
1225 	if ( !pStr || *pStr=='\0' )
1226 		return 0;
1227 
1228 	const BYTE * pBuf = (const BYTE*) pStr;
1229 	int iRes = 0, iCode;
1230 
1231 	while ( ( iCode = sphUTF8Decode(pBuf) )!=0 )
1232 		if ( iCode>0 )
1233 			iRes++;
1234 
1235 	return iRes;
1236 }
1237 
1238 
1239 /// compute UTF-8 string length in codepoints
sphUTF8Len(const char * pStr,int iMax)1240 inline int sphUTF8Len ( const char * pStr, int iMax )
1241 {
1242 	if ( !pStr || *pStr=='\0' )
1243 		return 0;
1244 
1245 	const BYTE * pBuf = (const BYTE*) pStr;
1246 	const BYTE * pMax = pBuf + iMax;
1247 	int iRes = 0, iCode;
1248 
1249 	while ( pBuf<pMax && iRes<iMax && ( iCode = sphUTF8Decode ( pBuf ) )!=0 )
1250 		if ( iCode>0 )
1251 			iRes++;
1252 
1253 	return iRes;
1254 }
1255 
/// quick check for UTF-8
/// returns true if the zero-terminated string has at least one byte with the high
/// bit set (ie. is not pure 7-bit ASCII); does NOT validate the sequences
/// NULL strings yield false
inline bool sphIsUTF8 ( const char * pStr )
{
	if ( !pStr )
		return false;

	// test the high bit explicitly rather than checking for a negative char;
	// plain char signedness is implementation-defined, so the sign check
	// never fires on platforms where char is unsigned
	for ( ; *pStr; pStr++ )
		if ( ( (unsigned char)*pStr ) & 0x80 )
			return true;

	return false;
}
1267 
1268 /// convert UTF-8 to codepoints, return string length
sphUTF8ToWideChar(const char * pSrc,int * pDst,int iMaxLen)1269 inline int sphUTF8ToWideChar ( const char * pSrc, int * pDst, int iMaxLen )
1270 {
1271 	const BYTE * p = (const BYTE*) pSrc;
1272 	int iLen = 0, iCode;
1273 	while ( ( iCode = sphUTF8Decode(p) )!=0 && iLen<iMaxLen )
1274 	{
1275 		*pDst++ = iCode;
1276 		iLen++;
1277 	}
1278 	*pDst = 0;
1279 	return iLen;
1280 }
1281 
1282 //////////////////////////////////////////////////////////////////////////
1283 // MATCHING ENGINE INTERNALS
1284 //////////////////////////////////////////////////////////////////////////
1285 
/// template argument for the Hitman_c instantiation below
/// NOTE(review): presumably the number of hit position bits reserved for the field id; confirm against Hitman_c
static const int FIELD_BITS = 8;

/// the Hitman_c instantiation used in this header
typedef Hitman_c<FIELD_BITS> HITMAN;
1288 
/// hit in the stream
/// combines posting info (docid and hitpos) with a few more matching/ranking bits
///
/// note that while in simple cases every hit would just represent a single keyword,
/// this is NOT always the case; phrase, proximity, and NEAR operators (that already
/// analyze keywords positions while matching the document) can emit a single folded
/// hit representing the entire multi-keyword match, so that the ranker could avoid
/// double work processing individual hits again. in such cases, m_uWeight, m_uSpanlen,
/// and m_uMatchlen will differ from the "usual" value of 1.
///
/// thus, in folded hits:
/// - m_uWeight is the match LCS value in all cases (phrase, proximity, near).
/// - m_uSpanlen is the match span length, ie. a distance from the first to the last
/// matching keyword. for phrase operators it naturally equals m_uWeight, for other
/// operators it might be very different.
/// - m_uMatchlen is a piece of voodoo magic that only the near operator seems to use.
struct ExtHit_t
{
	SphDocID_t	m_uDocid;		///< posting info, document id
	Hitpos_t	m_uHitpos;		///< posting info, hit position
	WORD		m_uQuerypos;	// NOTE(review): presumably the keyword position in the query; confirm
	WORD		m_uNodepos;
	WORD		m_uSpanlen;		///< 1 for individual keywords, match span length for folded hits
	WORD		m_uMatchlen;	///< 1 for individual keywords, see above for folded hits
	DWORD		m_uWeight;		///< 1 for individual keywords, LCS value for folded phrase/proximity/near hits
	DWORD		m_uQposMask;
};
1316 
/// result codes of ISphZoneCheck::IsInZone()
/// NOTE(review): per-value meanings below are read off the names; confirm against the implementations
enum SphZoneHit_e
{
	SPH_ZONE_FOUND,			// presumably: the hit is within the zone
	SPH_ZONE_NO_SPAN,		// presumably: the document has the zone, but no span of it covers the hit
	SPH_ZONE_NO_DOCUMENT	// presumably: the document has no such zone at all
};
1323 
/// zone check interface
/// implementations tell whether a given hit belongs to a given zone
class ISphZoneCheck
{
public:
	virtual ~ISphZoneCheck () {}

	/// check the hit against zone number iZone; pLastSpan is an in/out span hint
	/// NOTE(review): exact pLastSpan semantics are up to the implementation; confirm
	virtual SphZoneHit_e IsInZone ( int iZone, const ExtHit_t * pHit, int * pLastSpan ) = 0;
};
1330 
1331 
/// factor hash entry; carries prev/next links to sibling entries
/// NOTE(review): presumably caches packed ranking factors per document; confirm against users
struct SphFactorHashEntry_t
{
	SphDocID_t				m_iId;			///< entry id (a document id, by type)
	int						m_iRefCount;	///< reference counter
	BYTE *					m_pData;		///< entry payload
	SphFactorHashEntry_t *	m_pPrev;		///< previous entry link
	SphFactorHashEntry_t *	m_pNext;		///< next entry link
};

/// factor hash, a fixed-size vector of entry pointers
typedef CSphFixedVector<SphFactorHashEntry_t *> SphFactorHash_t;
1342 
1343 
1344 struct SphExtraDataRankerState_t
1345 {
1346 	const CSphSchema *	m_pSchema;
1347 	const int64_t *		m_pFieldLens;
1348 	CSphAttrLocator		m_tFieldLensLoc;
1349 	int64_t				m_iTotalDocuments;
1350 	int					m_iFields;
1351 	int					m_iMaxQpos;
SphExtraDataRankerState_tSphExtraDataRankerState_t1352 	SphExtraDataRankerState_t ()
1353 		: m_pSchema ( NULL )
1354 		, m_pFieldLens ( NULL )
1355 		, m_iTotalDocuments ( 0 )
1356 		, m_iFields ( 0 )
1357 		, m_iMaxQpos ( 0 )
1358 	{ }
1359 };
1360 
1361 
1362 struct MatchSortAccessor_t
1363 {
1364 	typedef CSphMatch T;
1365 	typedef CSphMatch * MEDIAN_TYPE;
1366 
1367 	CSphMatch m_tMedian;
1368 
MatchSortAccessor_tMatchSortAccessor_t1369 	MatchSortAccessor_t () {}
MatchSortAccessor_tMatchSortAccessor_t1370 	MatchSortAccessor_t ( const MatchSortAccessor_t & ) {}
1371 
~MatchSortAccessor_tMatchSortAccessor_t1372 	virtual ~MatchSortAccessor_t()
1373 	{
1374 		m_tMedian.m_pDynamic = NULL; // not yours
1375 	}
1376 
KeyMatchSortAccessor_t1377 	MEDIAN_TYPE Key ( CSphMatch * a ) const
1378 	{
1379 		return a;
1380 	}
1381 
CopyKeyMatchSortAccessor_t1382 	void CopyKey ( MEDIAN_TYPE * pMed, CSphMatch * pVal )
1383 	{
1384 		*pMed = &m_tMedian;
1385 		m_tMedian.m_uDocID = pVal->m_uDocID;
1386 		m_tMedian.m_iWeight = pVal->m_iWeight;
1387 		m_tMedian.m_pStatic = pVal->m_pStatic;
1388 		m_tMedian.m_pDynamic = pVal->m_pDynamic;
1389 		m_tMedian.m_iTag = pVal->m_iTag;
1390 	}
1391 
SwapMatchSortAccessor_t1392 	void Swap ( T * a, T * b ) const
1393 	{
1394 		::Swap ( *a, *b );
1395 	}
1396 
AddMatchSortAccessor_t1397 	T * Add ( T * p, int i ) const
1398 	{
1399 		return p+i;
1400 	}
1401 
SubMatchSortAccessor_t1402 	int Sub ( T * b, T * a ) const
1403 	{
1404 		return (int)(b-a);
1405 	}
1406 };
1407 
1408 
1409 //////////////////////////////////////////////////////////////////////////
1410 // INLINES, MISC
1411 //////////////////////////////////////////////////////////////////////////
1412 
sphTypeName(ESphAttr eType)1413 inline const char * sphTypeName ( ESphAttr eType )
1414 {
1415 	switch ( eType )
1416 	{
1417 		case SPH_ATTR_NONE:			return "none";
1418 		case SPH_ATTR_INTEGER:		return "uint";
1419 		case SPH_ATTR_TIMESTAMP:	return "timestamp";
1420 		case SPH_ATTR_BOOL:			return "bool";
1421 		case SPH_ATTR_FLOAT:		return "float";
1422 		case SPH_ATTR_BIGINT:		return "bigint";
1423 		case SPH_ATTR_STRING:		return "string";
1424 		case SPH_ATTR_STRINGPTR:	return "stringptr";
1425 		case SPH_ATTR_TOKENCOUNT:	return "tokencount";
1426 		case SPH_ATTR_JSON:			return "json";
1427 
1428 		case SPH_ATTR_UINT32SET:	return "mva";
1429 		case SPH_ATTR_INT64SET:		return "mva64";
1430 		default:					return "unknown";
1431 	}
1432 }
1433 
sphTypeDirective(ESphAttr eType)1434 inline const char * sphTypeDirective ( ESphAttr eType )
1435 {
1436 	switch ( eType )
1437 	{
1438 		case SPH_ATTR_NONE:			return "???";
1439 		case SPH_ATTR_INTEGER:		return "sql_attr_uint";
1440 		case SPH_ATTR_TIMESTAMP:	return "sql_attr_timestamp";
1441 		case SPH_ATTR_BOOL:			return "sql_attr_bool";
1442 		case SPH_ATTR_FLOAT:		return "sql_attr_float";
1443 		case SPH_ATTR_BIGINT:		return "sql_attr_bigint";
1444 		case SPH_ATTR_STRING:		return "sql_attr_string";
1445 		case SPH_ATTR_STRINGPTR:	return "sql_attr_string";
1446 		case SPH_ATTR_TOKENCOUNT:	return "_autogenerated_tokencount";
1447 		case SPH_ATTR_JSON:			return "sql_attr_json";
1448 
1449 		case SPH_ATTR_UINT32SET:	return "sql_attr_multi";
1450 		case SPH_ATTR_INT64SET:		return "sql_attr_multi bigint";
1451 		default:					return "???";
1452 	}
1453 }
1454 
SqlUnescape(CSphString & sRes,const char * sEscaped,int iLen)1455 inline void SqlUnescape ( CSphString & sRes, const char * sEscaped, int iLen )
1456 {
1457 	assert ( iLen>=2 );
1458 	assert (
1459 		( sEscaped[0]=='\'' && sEscaped[iLen-1]=='\'' ) ||
1460 		( sEscaped[0]=='"' && sEscaped[iLen-1]=='"' ) );
1461 
1462 	// skip heading and trailing quotes
1463 	const char * s = sEscaped+1;
1464 	const char * sMax = s+iLen-2;
1465 
1466 	sRes.Reserve ( iLen );
1467 	char * d = (char*) sRes.cstr();
1468 
1469 	while ( s<sMax )
1470 	{
1471 		if ( s[0]=='\\' )
1472 		{
1473 			switch ( s[1] )
1474 			{
1475 			case 'b': *d++ = '\b'; break;
1476 			case 'n': *d++ = '\n'; break;
1477 			case 'r': *d++ = '\r'; break;
1478 			case 't': *d++ = '\t'; break;
1479 			default:
1480 				*d++ = s[1];
1481 			}
1482 			s += 2;
1483 		} else
1484 			*d++ = *s++;
1485 	}
1486 
1487 	*d++ = '\0';
1488 }
1489 
1490 
StripPath(CSphString & sPath)1491 inline void StripPath ( CSphString & sPath )
1492 {
1493 	if ( sPath.IsEmpty() )
1494 		return;
1495 
1496 	const char * s = sPath.cstr();
1497 	if ( *s!='/' )
1498 		return;
1499 
1500 	const char * sLastSlash = s;
1501 	for ( ; *s; s++ )
1502 		if ( *s=='/' )
1503 			sLastSlash = s;
1504 
1505 	int iPos = (int)( sLastSlash - sPath.cstr() + 1 );
1506 	int iLen = (int)( s - sPath.cstr() );
1507 	sPath = sPath.SubString ( iPos, iLen - iPos );
1508 }
1509 
1510 //////////////////////////////////////////////////////////////////////////
1511 // DISK INDEX INTERNALS
1512 //////////////////////////////////////////////////////////////////////////
1513 
/// locator pair, for RT string dynamization
/// maps an attribute from its source (static) location to its destination (dynamized) one
struct LocatorPair_t
{
	CSphAttrLocator m_tFrom;	///< source (static) locator
	CSphAttrLocator m_tTo;		///< destination (dynamized) locator
};
1520 
1521 //////////////////////////////////////////////////////////////////////////
1522 // DICTIONARY INTERNALS
1523 //////////////////////////////////////////////////////////////////////////
1524 
/// dict traits
/// a proxy that wraps another dictionary and forwards (almost) every call to it,
/// so that derived wrappers only need to override what they change
/// the wrapped dictionary must not be NULL; it is not deleted by this class
class CSphDictTraits : public CSphDict
{
public:
	explicit			CSphDictTraits ( CSphDict * pDict ) : m_pDict ( pDict ) { assert ( m_pDict ); }

	virtual void		LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) { m_pDict->LoadStopwords ( sFiles, pTokenizer ); }
	virtual void		LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) { m_pDict->LoadStopwords ( dStopwords ); }
	virtual void		WriteStopwords ( CSphWriter & tWriter ) { m_pDict->WriteStopwords ( tWriter ); }
	virtual bool		LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pDict->LoadWordforms ( dFiles, pEmbedded, pTokenizer, sIndex ); }
	virtual void		WriteWordforms ( CSphWriter & tWriter ) { m_pDict->WriteWordforms ( tWriter ); }
	virtual int			SetMorphology ( const char * szMorph, CSphString & sMessage ) { return m_pDict->SetMorphology ( szMorph, sMessage ); }

	virtual SphWordID_t	GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) { return m_pDict->GetWordID ( pWord, iLen, bFilterStops ); }
	// defined out of line
	virtual SphWordID_t GetWordID ( BYTE * pWord );
	virtual SphWordID_t	GetWordIDNonStemmed ( BYTE * pWord ) { return m_pDict->GetWordIDNonStemmed ( pWord ); }

	// intentionally a no-op; the settings are NOT forwarded to the wrapped dictionary
	virtual void		Setup ( const CSphDictSettings & ) {}
	virtual const CSphDictSettings & GetSettings () const { return m_pDict->GetSettings (); }
	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pDict->GetStopwordsFileInfos (); }
	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pDict->GetWordformsFileInfos (); }
	virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pDict->GetMultiWordforms (); }
	virtual const CSphWordforms * GetWordforms () { return m_pDict->GetWordforms(); }

	virtual bool		IsStopWord ( const BYTE * pWord ) const { return m_pDict->IsStopWord ( pWord ); }
	virtual uint64_t	GetSettingsFNV () const { return m_pDict->GetSettingsFNV(); }
	virtual void		SetApplyMorph ( bool bApply ) { m_pDict->SetApplyMorph ( bApply ); }

protected:
	CSphDict *			m_pDict;	///< wrapped dictionary, never NULL
};
1556 
1557 
/// dict wrapper for star-syntax support in prefix-indexes
/// overrides word ID computation (both methods are defined out of line);
/// everything else is forwarded to the wrapped dictionary by CSphDictTraits
class CSphDictStar : public CSphDictTraits
{
public:
	explicit			CSphDictStar ( CSphDict * pDict ) : CSphDictTraits ( pDict ) {}

	virtual SphWordID_t	GetWordID ( BYTE * pWord );
	virtual SphWordID_t	GetWordIDNonStemmed ( BYTE * pWord );
};
1567 
1568 
/// star dict for index v.8+
/// same as CSphDictStar, but additionally configured with prefixes/infixes flags
class CSphDictStarV8 : public CSphDictStar
{
public:
	/// constructor and GetWordID() are defined out of line
	CSphDictStarV8 ( CSphDict * pDict, bool bPrefixes, bool bInfixes );

	virtual SphWordID_t	GetWordID ( BYTE * pWord );

private:
	bool				m_bPrefixes;	// NOTE(review): presumably whether the index has prefixes enabled; confirm
	bool				m_bInfixes;		// NOTE(review): presumably whether the index has infixes enabled; confirm
};
1581 
1582 
/// dict wrapper for exact-word syntax
/// overrides GetWordID() (defined out of line);
/// everything else is forwarded to the wrapped dictionary by CSphDictTraits
class CSphDictExact : public CSphDictTraits
{
public:
	explicit CSphDictExact ( CSphDict * pDict ) : CSphDictTraits ( pDict ) {}
	virtual SphWordID_t	GetWordID ( BYTE * pWord );
};
1590 
1591 //////////////////////////////////////////////////////////////////////////
1592 // TOKEN FILTER
1593 //////////////////////////////////////////////////////////////////////////
1594 
/// token filter base (boring proxy stuff)
/// wraps another tokenizer and forwards every call to it,
/// so that actual filters only need to override what they change
///
/// OWNERSHIP: the filter deletes the wrapped tokenizer in its destructor
class CSphTokenFilter : public ISphTokenizer
{
protected:
	ISphTokenizer *		m_pTokenizer;	///< wrapped tokenizer, owned

public:
	explicit						CSphTokenFilter ( ISphTokenizer * pTokenizer )					: m_pTokenizer ( pTokenizer ) {}
									~CSphTokenFilter()												{ SafeDelete ( m_pTokenizer ); }

	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError )	{ return m_pTokenizer->SetCaseFolding ( sConfig, sError ); }
	virtual void					AddPlainChar ( char c )											{ m_pTokenizer->AddPlainChar ( c ); }
	virtual void					AddSpecials ( const char * sSpecials )							{ m_pTokenizer->AddSpecials ( sSpecials ); }
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError )	{ return m_pTokenizer->SetIgnoreChars ( sIgnored, sError ); }
	virtual bool					SetNgramChars ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetNgramChars ( sConfig, sError ); }
	virtual void					SetNgramLen ( int iLen )										{ m_pTokenizer->SetNgramLen ( iLen ); }
	virtual bool					LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) { return m_pTokenizer->LoadSynonyms ( sFilename, pFiles, sError ); }
	virtual void					WriteSynonyms ( CSphWriter & tWriter )							{ return m_pTokenizer->WriteSynonyms ( tWriter ); }
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetBoundary ( sConfig, sError ); }
	virtual void					Setup ( const CSphTokenizerSettings & tSettings )				{ m_pTokenizer->Setup ( tSettings ); }
	virtual const CSphTokenizerSettings &	GetSettings () const									{ return m_pTokenizer->GetSettings (); }
	virtual const CSphSavedFile &	GetSynFileInfo () const											{ return m_pTokenizer->GetSynFileInfo (); }
	virtual bool					EnableSentenceIndexing ( CSphString & sError )					{ return m_pTokenizer->EnableSentenceIndexing ( sError ); }
	virtual bool					EnableZoneIndexing ( CSphString & sError )						{ return m_pTokenizer->EnableZoneIndexing ( sError ); }
	virtual int						SkipBlended ()													{ return m_pTokenizer->SkipBlended(); }

	virtual int						GetCodepointLength ( int iCode ) const		{ return m_pTokenizer->GetCodepointLength ( iCode ); }
	virtual int						GetMaxCodepointLength () const				{ return m_pTokenizer->GetMaxCodepointLength(); }

	virtual const char *			GetTokenStart () const						{ return m_pTokenizer->GetTokenStart(); }
	virtual const char *			GetTokenEnd () const						{ return m_pTokenizer->GetTokenEnd(); }
	virtual const char *			GetBufferPtr () const						{ return m_pTokenizer->GetBufferPtr(); }
	virtual const char *			GetBufferEnd () const						{ return m_pTokenizer->GetBufferEnd (); }
	virtual void					SetBufferPtr ( const char * sNewPtr )		{ m_pTokenizer->SetBufferPtr ( sNewPtr ); }
	virtual uint64_t				GetSettingsFNV () const						{ return m_pTokenizer->GetSettingsFNV(); }

	virtual void					SetBuffer ( const BYTE * sBuffer, int iLength )	{ m_pTokenizer->SetBuffer ( sBuffer, iLength ); }
	virtual BYTE *					GetToken ()										{ return m_pTokenizer->GetToken(); }

	/// the wrapped tokenizer itself (still owned by the filter)
	virtual ISphTokenizer *			GetEmbeddedTokenizer () const					{ return m_pTokenizer; }
	virtual bool					WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const { return m_pTokenizer->WasTokenMultiformDestination ( bHead, iDestCount ); }
};
1637 
1638 
/// query filter interface
/// bundles a tokenizer, a dictionary, and index settings; GetKeywords() collects
/// keywords into dKeywords, and implementations supply the per-keyword stats
/// via AddKeywordStats()
/// NOTE(review): constructor, destructor, and GetKeywords() are defined out of line;
/// pointers ownership is not visible here, confirm before deleting anything
struct ISphQueryFilter
{
	ISphTokenizer *		m_pTokenizer;
	CSphDict *					m_pDict;
	const CSphIndexSettings *	m_pSettings;

	ISphQueryFilter ();
	virtual ~ISphQueryFilter ();

	void GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords );
	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords ) = 0;
};
1651 
1652 
// NOTE(review): defined elsewhere; presumably parses a morphology option string
// and returns a mask of enabled AOT lemmatizers - confirm against the definition
DWORD sphParseMorphAot ( const char * );
1654 
/// settings bundle (tokenizer, dictionary, index) for index reconfiguration
struct CSphReconfigureSettings
{
	CSphTokenizerSettings	m_tTokenizer;	///< tokenizer settings
	CSphDictSettings		m_tDict;		///< dictionary settings
	CSphIndexSettings		m_tIndex;		///< index settings
};
1661 
/// tokenizer and dictionary objects plus index settings for index reconfiguration
/// NOTE(review): constructor and destructor are defined out of line; presumably
/// the destructor releases the tokenizer and the dictionary - confirm
struct CSphReconfigureSetup
{
	ISphTokenizer *		m_pTokenizer;
	CSphDict *			m_pDict;
	CSphIndexSettings	m_tIndex;

	CSphReconfigureSetup ();
	~CSphReconfigureSetup ();
};
1671 
/// compute a 64-bit hash of index settings (FNV, going by the name; defined elsewhere)
uint64_t sphGetSettingsFNV ( const CSphIndexSettings & tSettings );
1673 
1674 //////////////////////////////////////////////////////////////////////////
1675 // USER VARIABLES
1676 //////////////////////////////////////////////////////////////////////////
1677 
/// value container for the intset uservar type
/// a refcounted vector of attribute values
class UservarIntSet_c : public CSphVector<SphAttr_t>, public ISphRefcountedMT
{
};

/// hook that looks up an intset user variable by name (defined elsewhere)
/// NOTE(review): whether the returned set is already addref'ed is not visible here; confirm
extern UservarIntSet_c * ( *g_pUservarsHook )( const CSphString & sUservar );
1684 
1685 //////////////////////////////////////////////////////////////////////////
1686 // BINLOG INTERNALS
1687 //////////////////////////////////////////////////////////////////////////
1688 
/// global binlog interface
/// noncopyable; implementations log attribute updates and get notified on index flushes
class ISphBinlog : ISphNoncopyable
{
public:
	virtual				~ISphBinlog () {}

	/// log an attribute update against the named index
	/// NOTE(review): pTID is a pointer, presumably an in/out transaction ID; confirm
	virtual void		BinlogUpdateAttributes ( int64_t * pTID, const char * sIndexName, const CSphAttrUpdate & tUpd ) = 0;

	/// notify that the named index was flushed at transaction iTID
	virtual void		NotifyIndexFlush ( const char * sIndexName, int64_t iTID, bool bShutdown ) = 0;
};
1698 
1699 //////////////////////////////////////////////////////////////////////////
1700 // MISC FUNCTION PROTOTYPES
1701 //////////////////////////////////////////////////////////////////////////
1702 
/// source to destination locator pair; filled by sphSortGetStringRemap()
struct SphStringSorterRemap_t
{
	CSphAttrLocator m_tSrc;		///< source locator
	CSphAttrLocator m_tDst;		///< destination locator
};
1708 
/// IO throttling state; everything is zeroed on construction
/// NOTE(review): presumably zero limits mean "no throttling"; confirm against the users
struct ThrottleState_t
{
	int64_t	m_tmLastIOTime;	///< last IO timestamp
	int		m_iMaxIOps;		///< max IO operations limit
	int		m_iMaxIOSize;	///< max IO size limit

	ThrottleState_t () : m_tmLastIOTime ( 0 ), m_iMaxIOps ( 0 ), m_iMaxIOSize ( 0 ) {}
};
1721 
// all the functions below are defined elsewhere

const BYTE *	SkipQuoted ( const BYTE * p );

// sorting by string attributes
bool			sphSortGetStringRemap ( const ISphSchema & tSorterSchema, const ISphSchema & tIndexSchema, CSphVector<SphStringSorterRemap_t> & dAttrs );
bool			sphIsSortStringInternal ( const char * sColumnName );
/// make string lowercase but keep case of JSON.field
void			sphColumnToLowercase ( char * sVal );

// query tree checks and transformations, index merging
bool			sphCheckQueryHeight ( const struct XQNode_t * pRoot, CSphString & sError );
void			sphTransformExtendedQuery ( XQNode_t ** ppNode, const CSphIndexSettings & tSettings, bool bHasBooleanOptimization, const ISphKeywordsStat * pKeywords );
void			TransformAotFilter ( XQNode_t * pNode, const CSphWordforms * pWordforms, const CSphIndexSettings& tSettings );
bool			sphMerge ( const CSphIndex * pDst, const CSphIndex * pSrc, const CSphVector<SphDocID_t> & dKillList, CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop );
CSphString		sphReconstructNode ( const XQNode_t * pNode, const CSphSchema * pSchema );

// index files unlinking
void			sphSetUnlinkOld ( bool bUnlink );
void			sphUnlinkIndex ( const char * sName, bool bForce );

// schema and settings (de)serialization
// writers take a CSphWriter; readers take a CSphReader and the index format version
void			WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema );
void			ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion, bool bDynamic );
void			SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings );
void			LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion );
void			SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit );
bool			LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings, CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning );
void			SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit );
void			LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings, CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning );
void			SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter );

DWORD			ReadVersion ( const char * sPath, CSphString & sError );
bool			AddFieldLens ( CSphSchema & tSchema, bool bDynamic, CSphString & sError );

void			RebalanceWeights ( const CSphFixedVector<int64_t> & dTimers, WORD * pWeights );
1752 
// all indexes should produce same terms for same query
// this helper remembers the terms from one result set via Set(),
// and reports the differences against other result sets via DumpDiffer()
// (both methods are defined elsewhere)
struct SphWordStatChecker_t
{
	SphWordStatChecker_t () {}

	/// remember the reference words
	void Set ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat );

	/// compare hStat against the reference words; report via sWarning
	void DumpDiffer ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat, const char * sIndex, CSphString & sWarning ) const;

	CSphVector<uint64_t> m_dSrcWords;	// NOTE(review): presumably hashes of the reference words; confirm
};
1762 
1763 
/// index file extension kind, an argument to sphGetExts() and sphGetExt()
/// NOTE(review): per-value meanings are read off the names; confirm against sphGetExts()
enum ESphExtType
{
	SPH_EXT_TYPE_CUR = 0,	// presumably: current files
	SPH_EXT_TYPE_NEW,		// presumably: new files (eg. pending rotation)
	SPH_EXT_TYPE_OLD,		// presumably: old files (eg. after rotation)
	SPH_EXT_TYPE_LOC		// presumably: lock files
};
1771 
/// index file extension selector, an argument to sphGetExt()
/// NOTE(review): values are presumably indexes into the list returned by sphGetExts(); confirm
enum ESphExt
{
	SPH_EXT_SPH = 0,
	SPH_EXT_SPA = 1,
	SPH_EXT_MVP = 9
};
1778 
// index file extensions helpers (defined elsewhere)
// the format version defaults to the current one
const char ** sphGetExts ( ESphExtType eType, DWORD uVersion=INDEX_FORMAT_VERSION );
int sphGetExtCount ( DWORD uVersion=INDEX_FORMAT_VERSION );
const char * sphGetExt ( ESphExtType eType, ESphExt eExt );

// dictionary word comparators over (pointer, length) pairs (defined elsewhere)
// used by sphCheckpointCmp() and sphCheckpointCmpStrictly() respectively
int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 );
int sphDictCmpStrictly ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 );
1785 
1786 template <typename CP>
sphCheckpointCmp(const char * sWord,int iLen,SphWordID_t iWordID,bool bWordDict,const CP & tCP)1787 int sphCheckpointCmp ( const char * sWord, int iLen, SphWordID_t iWordID, bool bWordDict, const CP & tCP )
1788 {
1789 	if ( bWordDict )
1790 		return sphDictCmp ( sWord, iLen, tCP.m_sWord, strlen ( tCP.m_sWord ) );
1791 
1792 	int iRes = 0;
1793 	iRes = iWordID<tCP.m_uWordID ? -1 : iRes;
1794 	iRes = iWordID>tCP.m_uWordID ? 1 : iRes;
1795 	return iRes;
1796 }
1797 
1798 template <typename CP>
sphCheckpointCmpStrictly(const char * sWord,int iLen,SphWordID_t iWordID,bool bWordDict,const CP & tCP)1799 int sphCheckpointCmpStrictly ( const char * sWord, int iLen, SphWordID_t iWordID, bool bWordDict, const CP & tCP )
1800 {
1801 	if ( bWordDict )
1802 		return sphDictCmpStrictly ( sWord, iLen, tCP.m_sWord, strlen ( tCP.m_sWord ) );
1803 
1804 	int iRes = 0;
1805 	iRes = iWordID<tCP.m_uWordID ? -1 : iRes;
1806 	iRes = iWordID>tCP.m_uWordID ? 1 : iRes;
1807 	return iRes;
1808 }
1809 
1810 
1811 template < typename CP >
sphSearchCheckpoint(const char * sWord,int iWordLen,SphWordID_t iWordID,bool bStarMode,bool bWordDict,const CP * pFirstCP,const CP * pLastCP)1812 const CP * sphSearchCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID
1813 							, bool bStarMode, bool bWordDict
1814 							, const CP * pFirstCP, const CP * pLastCP )
1815 {
1816 	assert ( !bWordDict || iWordLen>0 );
1817 
1818 	const CP * pStart = pFirstCP;
1819 	const CP * pEnd = pLastCP;
1820 
1821 	if ( bStarMode && sphCheckpointCmp ( sWord, iWordLen, iWordID, bWordDict, *pStart )<0 )
1822 		return NULL;
1823 	if ( !bStarMode && sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pStart )<0 )
1824 		return NULL;
1825 
1826 	if ( sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pEnd )>=0 )
1827 		pStart = pEnd;
1828 	else
1829 	{
1830 		while ( pEnd-pStart>1 )
1831 		{
1832 			const CP * pMid = pStart + (pEnd-pStart)/2;
1833 			const int iCmpRes = sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pMid );
1834 
1835 			if ( iCmpRes==0 )
1836 			{
1837 				pStart = pMid;
1838 				break;
1839 			} else if ( iCmpRes<0 )
1840 				pEnd = pMid;
1841 			else
1842 				pStart = pMid;
1843 		}
1844 
1845 		assert ( pStart>=pFirstCP );
1846 		assert ( pStart<=pLastCP );
1847 		assert ( sphCheckpointCmp ( sWord, iWordLen, iWordID, bWordDict, *pStart )>=0
1848 			&& sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pEnd )<0 );
1849 	}
1850 
1851 	return pStart;
1852 }
1853 
// string collation functions; all share one signature (two byte strings plus a bPacked flag)
// and return an int
// NOTE(review): presumably strcmp-style results, with bPacked selecting length-prefixed
// string attributes - confirm against the definitions
int sphCollateLibcCI ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
int sphCollateLibcCS ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
int sphCollateUtf8GeneralCI ( const BYTE * pArg1, const BYTE * pArg2, bool bPacked );
int sphCollateBinary ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
1858 
/// dictionary interface extension that additionally exposes a packed keywords
/// buffer and a last-warning string; all methods are pure virtual
class ISphRtDictWraper : public CSphDict
{
public:
	/// packed keywords buffer and its length
	/// NOTE(review): buffer layout and units of the length are defined by the implementation - confirm there
	virtual const BYTE *	GetPackedKeywords () = 0;
	virtual int				GetPackedLen () = 0;

	/// discard the keywords accumulated so far
	virtual void			ResetKeywords() = 0;

	/// last warning text, and a way to clear it
	virtual const char *	GetLastWarning() const = 0;
	virtual void			ResetWarning() = 0;
};

/// create a wrapper implementation on top of pBase
/// NOTE(review): ownership of pBase and of the returned object is not visible here - confirm
ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase );
1872 
/// one expanded keyword, as collected in ISphWordlist::Args_t::m_dExpanded
struct SphExpanded_t
{
	int m_iNameOff;	///< NOTE(review): presumably the keyword's offset into Args_t's private name buffer - confirm in AddExpanded()
	int m_iDocs;	///< documents count for the keyword
	int m_iHits;	///< hits count for the keyword
};
1879 
/// empty polymorphic base for substring expansion payloads
/// the virtual destructor lets holders such as CSphScopedPayload delete
/// concrete payloads through a base pointer
struct ISphSubstringPayload
{
	ISphSubstringPayload () {}
	virtual ~ISphSubstringPayload() {}
};
1885 
1886 
/// wordlist interface used for wildcard keyword expansion
/// implementations push matching keywords into the supplied Args_t
class ISphWordlist
{
public:
	/// in/out arguments for an expansion request (noncopyable)
	struct Args_t : public ISphNoncopyable
	{
		CSphVector<SphExpanded_t>	m_dExpanded;		///< collected expansions
		const bool					m_bPayload;			///< fixed at construction
		int							m_iExpansionLimit;	///< expansion limit requested by the caller
		const bool					m_bHasMorphology;	///< fixed at construction
		const ESphHitless			m_eHitless;			///< fixed at construction

		ISphSubstringPayload *		m_pPayload;			///< NOTE(review): ownership is decided by the out-of-line ctor/dtor - confirm
		int							m_iTotalDocs;
		int							m_iTotalHits;
		const void *				m_pIndexData;		///< opaque pointer passed through from the caller

		Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology, ESphHitless eHitless, const void * pIndexData );
		~Args_t ();

		/// record one expanded keyword (sWord is iLen bytes long) with its docs/hits counts
		void AddExpanded ( const BYTE * sWord, int iLen, int iDocs, int iHits );

		/// keyword text for the iIndex-th entry
		/// NOTE(review): presumably indexes m_dExpanded - confirm in the definition
		const char * GetWordExpanded ( int iIndex ) const;

	private:
		CSphVector<char> m_sBuf;	///< NOTE(review): presumably backing storage for the keyword texts - confirm
	};

	virtual ~ISphWordlist () {}

	/// collect keywords for a prefix / infix substring; sWildcard carries the wildcard pattern
	/// NOTE(review): exact matching rules are implementation-defined - confirm in the implementations
	virtual void GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const = 0;
	virtual void GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const = 0;
};
1916 
1917 
1918 class CSphScopedPayload
1919 {
1920 public:
CSphScopedPayload()1921 	CSphScopedPayload () {}
~CSphScopedPayload()1922 	~CSphScopedPayload ()
1923 	{
1924 		ARRAY_FOREACH ( i, m_dPayloads )
1925 			SafeDelete ( m_dPayloads[i] );
1926 	}
Add(ISphSubstringPayload * pPayload)1927 	void Add ( ISphSubstringPayload * pPayload ) { m_dPayloads.Add ( pPayload ); }
1928 
1929 private:
1930 	CSphVector<ISphSubstringPayload *> m_dPayloads;
1931 };
1932 
1933 
/// settings and collaborators passed to sphExpandXQNode()
/// the constructor is defined out of line, so the defaults are not visible here
struct ExpansionContext_t
{
	const ISphWordlist * m_pWordlist;	///< wordlist to expand against
	BYTE * m_pBuf;						///< NOTE(review): scratch buffer, size requirements not visible here - confirm
	CSphQueryResultMeta * m_pResult;
	int m_iMinPrefixLen;
	int m_iMinInfixLen;
	int m_iExpansionLimit;
	bool m_bHasMorphology;
	bool m_bMergeSingles;
	CSphScopedPayload * m_pPayloads;	///< payload collection; held by pointer, so not owned by this struct's members
	ESphHitless m_eHitless;
	const void * m_pIndexData;			///< opaque pointer, same type as ISphWordlist::Args_t::m_pIndexData

	ExpansionContext_t ();
};
1950 
1951 
// query tree transforms; both take a node and return a node
// NOTE(review): whether the returned node replaces (and frees) pNode is not visible here - confirm
XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx );
XQNode_t * sphQueryExpandKeywords ( XQNode_t * pNode, const CSphIndexSettings & tSettings );
/// ranking value for an expanded keyword
/// keywords with at most 256 hits all rank as 1, the rest rank as docs count plus one
inline int sphGetExpansionMagic ( int iDocs, int iHits )
{
	// magic threshold; mb make this configurable?
	if ( iHits>256 )
		return iDocs + 1;

	return 1;
}
/// true when a keyword has at most 256 hits or fewer than 32 documents
inline bool sphIsExpandedPayload ( int iDocs, int iHits )
{
	// magic threshold; mb make this configurable?
	const bool bManyHits = ( iHits>256 );
	const bool bManyDocs = ( iDocs>=32 );
	return !( bManyHits && bManyDocs );
}
1962 
1963 
/// comparator that orders expansions by sphGetExpansionMagic(), biggest first
/// T must expose m_iDocs and m_iHits
template<typename T>
struct ExpandedOrderDesc_T
{
	bool IsLess ( const T & a, const T & b )
	{
		const int iRankA = sphGetExpansionMagic ( a.m_iDocs, a.m_iHits );
		const int iRankB = sphGetExpansionMagic ( b.m_iDocs, b.m_iHits );
		return iRankB<iRankA;
	}
};
1972 
1973 
1974 class CSphKeywordDeltaWriter
1975 {
1976 private:
1977 	BYTE m_sLastKeyword [SPH_MAX_WORD_LEN*3+4];
1978 	int m_iLastLen;
1979 
1980 public:
CSphKeywordDeltaWriter()1981 	CSphKeywordDeltaWriter ()
1982 	{
1983 		Reset();
1984 	}
1985 
Reset()1986 	void Reset ()
1987 	{
1988 		m_iLastLen = 0;
1989 	}
1990 
1991 	template <typename F>
PutDelta(F & WRITER,const BYTE * pWord,int iLen)1992 	void PutDelta ( F & WRITER, const BYTE * pWord, int iLen )
1993 	{
1994 		assert ( pWord && iLen );
1995 
1996 		// how many bytes of a previous keyword can we reuse?
1997 		BYTE iMatch = 0;
1998 		int iMinLen = Min ( m_iLastLen, iLen );
1999 		assert ( iMinLen<(int)sizeof(m_sLastKeyword) );
2000 		while ( iMatch<iMinLen && m_sLastKeyword[iMatch]==pWord[iMatch] )
2001 		{
2002 			iMatch++;
2003 		}
2004 
2005 		BYTE iDelta = (BYTE)( iLen - iMatch );
2006 		assert ( iDelta>0 );
2007 
2008 		assert ( iLen < (int)sizeof(m_sLastKeyword) );
2009 		memcpy ( m_sLastKeyword, pWord, iLen );
2010 		m_iLastLen = iLen;
2011 
2012 		// match and delta are usually tiny, pack them together in 1 byte
2013 		// tricky bit, this byte leads the entry so it must never be 0 (aka eof mark)!
2014 		if ( iDelta<=8 && iMatch<=15 )
2015 		{
2016 			BYTE uPacked = ( 0x80 + ( (iDelta-1)<<4 ) + iMatch );
2017 			WRITER.PutBytes ( &uPacked, 1 );
2018 		} else
2019 		{
2020 			WRITER.PutBytes ( &iDelta, 1 ); // always greater than 0
2021 			WRITER.PutBytes ( &iMatch, 1 );
2022 		}
2023 
2024 		WRITER.PutBytes ( pWord + iMatch, iDelta );
2025 	}
2026 };
2027 
// packs a (docs count, doclist length) pair into a single byte
// NOTE(review): the encoding itself is defined out of line - confirm there
BYTE sphDoclistHintPack ( SphOffset_t iDocs, SphOffset_t iLen );

// wordlist checkpoints frequency
// NOTE(review): presumably one checkpoint per this many wordlist entries - confirm at the use sites
#define SPH_WORDLIST_CHECKPOINT 64

/// startup mva updates arena
/// NOTE(review): the returned string is presumably an error message (NULL on success) - confirm
const char *		sphArenaInit ( int iMaxBytes );

// localtime_r / gmtime_r are declared here only for Windows builds
#if USE_WINDOWS
void localtime_r ( const time_t * clock, struct tm * res );
void gmtime_r ( const time_t * clock, struct tm * res );
#endif
2040 
/// one infix block descriptor, as consumed by sphLookupInfixCheckpoints()
struct InfixBlock_t
{
	// the infix is referenced either by pointer or by a 32-bit offset; both share storage
	// NOTE(review): presumably the offset is the stored form and the pointer the loaded form - confirm
	union
	{
		const char *	m_sInfix;
		DWORD			m_iInfixOffset;
	};
	DWORD				m_iOffset;	///< NOTE(review): presumably the block's offset in the infix data - confirm
};
2050 
2051 
/// infix hash builder
/// interface only; instances are created by sphCreateInfixBuilder()
class ISphInfixBuilder
{
public:
	explicit		ISphInfixBuilder() {}
	virtual			~ISphInfixBuilder() {}

	/// register a word (iWordLength bytes) together with the checkpoint it belongs to
	virtual void	AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint, bool bHasMorphology ) = 0;

	/// write the collected entries to the dictionary writer
	virtual void	SaveEntries ( CSphWriter & wrDict ) = 0;

	/// write the entry blocks to the dictionary writer
	/// NOTE(review): the int64_t result is presumably the blocks' file offset - confirm in the implementation
	virtual int64_t	SaveEntryBlocks ( CSphWriter & wrDict ) = 0;

	virtual int		GetBlocksWordsSize () const = 0;
};
2063 
2064 
// create an infix builder for the given codepoint width
// NOTE(review): presumably returns NULL and fills pError on unsupported widths - confirm
ISphInfixBuilder * sphCreateInfixBuilder ( int iCodepointBytes, CSphString * pError );
// look up an infix (iBytes long) in the infix blocks, collecting checkpoint numbers into dCheckpoints
// NOTE(review): meaning of the bool result is defined out of line - confirm
bool sphLookupInfixCheckpoints ( const char * sInfix, int iBytes, const BYTE * pInfixes, const CSphVector<InfixBlock_t> & dInfixBlocks, int iInfixCodepointBytes, CSphVector<int> & dCheckpoints );
// calculate length, up to iInfixCodepointBytes chars from infix start
int sphGetInfixLength ( const char * sInfix, int iBytes, int iInfixCodepointBytes );
2069 
2070 
2071 /// compute utf-8 character length in bytes from its first byte
sphUtf8CharBytes(BYTE uFirst)2072 inline int sphUtf8CharBytes ( BYTE uFirst )
2073 {
2074 	switch ( uFirst>>4 )
2075 	{
2076 		case 12: return 2; // 110x xxxx, 2 bytes
2077 		case 13: return 2; // 110x xxxx, 2 bytes
2078 		case 14: return 3; // 1110 xxxx, 3 bytes
2079 		case 15: return 4; // 1111 0xxx, 4 bytes
2080 		default: return 1; // either 1 byte, or invalid/unsupported code
2081 	}
2082 }
2083 
2084 //////////////////////////////////////////////////////////////////////////
2085 
2086 /// snippet setupper
2087 /// used by searchd and SNIPPET() function in exprs
2088 /// should probably be refactored as a single function
2089 /// a precursor to sphBuildExcerpts() call
2090 class SnippetContext_t : ISphNoncopyable
2091 {
2092 private:
2093 	CSphScopedPtr<CSphDict> m_tDictCloned;
2094 	CSphScopedPtr<CSphDict> m_tExactDict;
2095 
2096 public:
2097 	CSphDict * m_pDict;
2098 	CSphScopedPtr<ISphTokenizer> m_tTokenizer;
2099 	CSphScopedPtr<CSphHTMLStripper> m_tStripper;
2100 	ISphTokenizer * m_pQueryTokenizer;
2101 	XQQuery_t m_tExtQuery;
2102 	DWORD m_eExtQuerySPZ;
2103 
SnippetContext_t()2104 	SnippetContext_t()
2105 		: m_tDictCloned ( NULL )
2106 		, m_tExactDict ( NULL )
2107 		, m_pDict ( NULL )
2108 		, m_tTokenizer ( NULL )
2109 		, m_tStripper ( NULL )
2110 		, m_pQueryTokenizer ( NULL )
2111 		, m_eExtQuerySPZ ( SPH_SPZ_NONE )
2112 	{
2113 	}
2114 
~SnippetContext_t()2115 	~SnippetContext_t()
2116 	{
2117 		SafeDelete ( m_pQueryTokenizer );
2118 	}
2119 
SetupExactDict(const CSphIndexSettings & tSettings,CSphScopedPtr<CSphDict> & tExact,CSphDict * pDict)2120 	static CSphDict * SetupExactDict ( const CSphIndexSettings & tSettings, CSphScopedPtr<CSphDict> & tExact, CSphDict * pDict )
2121 	{
2122 		// handle index_exact_words
2123 		if ( !tSettings.m_bIndexExactWords )
2124 			return pDict;
2125 
2126 		tExact = new CSphDictExact ( pDict );
2127 		return tExact.Ptr();
2128 	}
2129 
CollectQuerySPZ(const XQNode_t * pNode)2130 	static DWORD CollectQuerySPZ ( const XQNode_t * pNode )
2131 	{
2132 		if ( !pNode )
2133 			return SPH_SPZ_NONE;
2134 
2135 		DWORD eSPZ = SPH_SPZ_NONE;
2136 		if ( pNode->GetOp()==SPH_QUERY_SENTENCE )
2137 			eSPZ |= SPH_SPZ_SENTENCE;
2138 		else if ( pNode->GetOp()==SPH_QUERY_PARAGRAPH )
2139 			eSPZ |= SPH_SPZ_PARAGRAPH;
2140 
2141 		ARRAY_FOREACH ( i, pNode->m_dChildren )
2142 			eSPZ |= CollectQuerySPZ ( pNode->m_dChildren[i] );
2143 
2144 		return eSPZ;
2145 	}
2146 
SetupStripperSPZ(const CSphIndexSettings & tSettings,const ExcerptQuery_t & q,bool bSetupSPZ,CSphScopedPtr<CSphHTMLStripper> & tStripper,ISphTokenizer * pTokenizer,CSphString & sError)2147 	static bool SetupStripperSPZ ( const CSphIndexSettings & tSettings, const ExcerptQuery_t & q,
2148 		bool bSetupSPZ, CSphScopedPtr<CSphHTMLStripper> & tStripper, ISphTokenizer * pTokenizer,
2149 		CSphString & sError )
2150 	{
2151 		if ( bSetupSPZ &&
2152 			( !pTokenizer->EnableSentenceIndexing ( sError ) || !pTokenizer->EnableZoneIndexing ( sError ) ) )
2153 		{
2154 			return false;
2155 		}
2156 
2157 
2158 		if ( q.m_sStripMode=="strip" || q.m_sStripMode=="retain"
2159 			|| ( q.m_sStripMode=="index" && tSettings.m_bHtmlStrip ) )
2160 		{
2161 			// don't strip HTML markup in 'retain' mode - proceed zones only
2162 			tStripper = new CSphHTMLStripper ( q.m_sStripMode!="retain" );
2163 
2164 			if ( q.m_sStripMode=="index" )
2165 			{
2166 				if (
2167 					!tStripper->SetIndexedAttrs ( tSettings.m_sHtmlIndexAttrs.cstr (), sError ) ||
2168 					!tStripper->SetRemovedElements ( tSettings.m_sHtmlRemoveElements.cstr (), sError ) )
2169 				{
2170 					sError.SetSprintf ( "HTML stripper config error: %s", sError.cstr() );
2171 					return false;
2172 				}
2173 			}
2174 
2175 			if ( bSetupSPZ )
2176 			{
2177 				tStripper->EnableParagraphs();
2178 			}
2179 
2180 			// handle zone(s) in special mode only when passage_boundary enabled
2181 			if ( bSetupSPZ && !tStripper->SetZones ( tSettings.m_sZones.cstr (), sError ) )
2182 			{
2183 				sError.SetSprintf ( "HTML stripper config error: %s", sError.cstr() );
2184 				return false;
2185 			}
2186 		}
2187 
2188 		return true;
2189 	}
2190 
Setup(const CSphIndex * pIndex,const ExcerptQuery_t & tSettings,CSphString & sError)2191 	bool Setup ( const CSphIndex * pIndex, const ExcerptQuery_t & tSettings, CSphString & sError )
2192 	{
2193 		assert ( pIndex );
2194 		CSphScopedPtr<CSphDict> tDictCloned ( NULL );
2195 		m_pDict = pIndex->GetDictionary();
2196 		if ( m_pDict->HasState() )
2197 			m_tDictCloned = m_pDict = m_pDict->Clone();
2198 
2199 		// AOT tokenizer works only with query mode
2200 		if ( pIndex->GetSettings().m_uAotFilterMask &&
2201 			( !tSettings.m_bHighlightQuery || tSettings.m_bExactPhrase ) )
2202 		{
2203 			if ( !tSettings.m_bHighlightQuery )
2204 				sError.SetSprintf ( "failed to setup AOT with query_mode=0, use query_mode=1" );
2205 			else
2206 				sError.SetSprintf ( "failed to setup AOT with exact_phrase, use phrase search operator with query_mode=1" );
2207 			return false;
2208 		}
2209 
2210 		// OPTIMIZE! do a lightweight indexing clone here
2211 		if ( tSettings.m_bHighlightQuery && pIndex->GetSettings().m_uAotFilterMask )
2212 			m_tTokenizer = sphAotCreateFilter ( pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX ), m_pDict, pIndex->GetSettings().m_bIndexExactWords, pIndex->GetSettings().m_uAotFilterMask );
2213 		else
2214 			m_tTokenizer = pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX );
2215 
2216 		m_pQueryTokenizer = NULL;
2217 		if ( tSettings.m_bHighlightQuery || tSettings.m_bExactPhrase )
2218 		{
2219 			m_pQueryTokenizer =	pIndex->GetQueryTokenizer()->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
2220 		} else
2221 		{
2222 			// legacy query mode should handle exact form modifier and star wildcard
2223 			m_pQueryTokenizer = pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX );
2224 			if ( pIndex->IsStarDict() )
2225 			{
2226 				m_pQueryTokenizer->AddPlainChar ( '*' );
2227 				m_pQueryTokenizer->AddPlainChar ( '?' );
2228 				m_pQueryTokenizer->AddPlainChar ( '%' );
2229 			}
2230 			if ( pIndex->GetSettings().m_bIndexExactWords )
2231 				m_pQueryTokenizer->AddPlainChar ( '=' );
2232 		}
2233 
2234 		// setup exact dictionary if needed
2235 		m_pDict = SetupExactDict ( pIndex->GetSettings(), m_tExactDict, m_pDict );
2236 
2237 		if ( tSettings.m_bHighlightQuery )
2238 		{
2239 			// OPTIMIZE? double lightweight clone here? but then again it's lightweight
2240 			if ( !sphParseExtendedQuery ( m_tExtQuery, tSettings.m_sWords.cstr(), NULL, m_pQueryTokenizer,
2241 				&pIndex->GetMatchSchema(), m_pDict, pIndex->GetSettings() ) )
2242 			{
2243 				sError = m_tExtQuery.m_sParseError;
2244 				return false;
2245 			}
2246 			if ( m_tExtQuery.m_pRoot )
2247 				m_tExtQuery.m_pRoot->ClearFieldMask();
2248 
2249 			m_eExtQuerySPZ = SPH_SPZ_NONE;
2250 			m_eExtQuerySPZ |= CollectQuerySPZ ( m_tExtQuery.m_pRoot );
2251 			if ( m_tExtQuery.m_dZones.GetLength() )
2252 				m_eExtQuerySPZ |= SPH_SPZ_ZONE;
2253 
2254 			if ( pIndex->GetSettings().m_uAotFilterMask )
2255 				TransformAotFilter ( m_tExtQuery.m_pRoot, m_pDict->GetWordforms(), pIndex->GetSettings() );
2256 		}
2257 
2258 		bool bSetupSPZ = ( tSettings.m_ePassageSPZ!=SPH_SPZ_NONE || m_eExtQuerySPZ!=SPH_SPZ_NONE ||
2259 			( tSettings.m_sStripMode=="retain" && tSettings.m_bHighlightQuery ) );
2260 
2261 		if ( !SetupStripperSPZ ( pIndex->GetSettings(), tSettings, bSetupSPZ, m_tStripper, m_tTokenizer.Ptr(), sError ) )
2262 			return false;
2263 
2264 		return true;
2265 	}
2266 };
2267 
/// a token text plus a snapshot of tokenizer state, filled by FillStoredTokenInfo()
struct StoredToken_t
{
	BYTE			m_sToken [3*SPH_MAX_WORD_LEN+4];	///< token text; treated as zero-terminated where it is read back with strlen()
	// tokenized state
	// NOTE(review): the fields below presumably mirror the tokenizer accessors of the same names - confirm in FillStoredTokenInfo()
	const char *	m_szTokenStart;
	const char *	m_szTokenEnd;
	const char *	m_pBufferPtr;
	const char *	m_pBufferEnd;
	int				m_iTokenLen;
	int				m_iOvershortCount;
	bool			m_bBoundary;
	bool			m_bSpecial;
	bool			m_bBlended;
	bool			m_bBlendedPart;
};
2283 
// fill tToken from the token text and the tokenizer that produced it
void FillStoredTokenInfo ( StoredToken_t & tToken, const BYTE * sToken, ISphTokenizer * pTokenizer );
// TSV / CSV pipe source factories; both take the source config section, an opened pipe and the source name
// NOTE(review): bProxy presumably selects the RLP proxy flavor of the source - confirm against the definitions
CSphSource * sphCreateSourceTSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool bProxy );
CSphSource * sphCreateSourceCSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool bProxy );
2287 
FlipEndianess(DWORD * pData)2288 inline void FlipEndianess ( DWORD* pData )
2289 {
2290 	BYTE* pB = (BYTE*)pData;
2291 	BYTE a = pB[0];
2292 	pB[0] = pB[3];
2293 	pB[3] = a;
2294 	a = pB[1];
2295 	pB[1] = pB[2];
2296 	pB[2] = a;
2297 };
2298 
2299 
// RLPARG(x) expands to x when USE_RLP is enabled and to nothing otherwise,
// so RLP-only arguments can be spelled inline without wrapping every use site in #if
#if USE_RLP
#define RLPARG(_arg) _arg
#else
#define RLPARG(_arg)
#endif
2305 
2306 
2307 #if USE_RLP
2308 
/// one document cached by CSphSource_Proxy while a batch is being processed
struct StoredDoc_t
{
	CSphMatch							m_tDocInfo;				///< copy of the source's docinfo
	CSphVector<CSphString>				m_dStrAttrs;			///< copy of the source's string attributes
	CSphVector<DWORD>					m_dMva;					///< copy of the source's MVA values
	CSphTightVector<BYTE*>				m_dFields;				///< per-field pointers into m_dFieldStorage
	CSphTightVector<bool>				m_dChinese;				///< per-field result of sphDetectChinese()
	CSphTightVector< CSphVector<BYTE> >	m_dFieldStorage;		///< per-field text buffers
	CSphTightVector< StoredToken_t >	m_dNonChineseTokens;	///< non-chinese tokens met in chinese fields, in stream order
};
2319 
// marker codepoints; CSphSource_Proxy encodes each of them to UTF-8 with sphUTF8Encode()

// these are used to separate text before passing it to RLP
const int PROXY_DOCUMENT_START = 0xFFFA;
const int PROXY_FIELD_START_CHINESE = 0xFFFB;
const int PROXY_FIELD_START_NONCHINESE = 0xFFFC;
const int PROXY_TOKEN_SEPARATOR = 0xFFFD;

// these are used on text that is already tokenized
// (the values intentionally or not coincide with the first two markers above;
// the two groups are written into different buffers)
const int PROXY_TOKENIZED = 0xFFFA;
const int PROXY_MORPH = 0xFFFB;

// length in bytes of an encoded marker; the CSphSource_Proxy constructor asserts
// this for PROXY_DOCUMENT_START
const int PROXY_MARKER_LEN = 3;
2331 
// flag bits OR-ed into the packed per-token value that CSphSource_Proxy prints (in hex)
// after every stored non-chinese token; bits below 1<<7 are left for the length part
enum
{
	PROXY_BOUNDARY_FLAG		= 1<<7, //NOLINT
	PROXY_SPECIAL_FLAG		= 1<<8, //NOLINT
	PROXY_BLENDED_FLAG		= 1<<9, //NOLINT
	PROXY_BLENDED_PART_FLAG	= 1<<10, //NOLINT
	PROXY_HAVE_OVERSHORT	= 1<<11 //NOLINT
};
2340 
2341 
// copy a 3-byte marker to _ptr and advance _ptr past it
// expands to a brace block, and the arguments are not parenthesized,
// so pass plain identifiers only
#define COPY_MARKER(_ptr,_marker) \
{\
	*_ptr++ = _marker[0]; \
	*_ptr++ = _marker[1]; \
	*_ptr++ = _marker[2]; \
}

// true when the 3 bytes at _ptr equal the 3-byte marker
#define CMP_MARKER(_ptr, _marker) \
	( _ptr[0]==_marker[0] && _ptr[1]==_marker[1] && _ptr[2]==_marker[2] )
2351 
// proxy source
// wraps a document source T: pulls documents from T in batches, glues their chinese
// text into one marker-delimited buffer, runs that buffer through an RLP filter
// tokenizer, and then hands the documents out one by one with their fields rebuilt
// from the resulting tokens
template <class T>
class CSphSource_Proxy : public T
{
public:
	explicit CSphSource_Proxy ( const char * sSourceName )
		: T ( sSourceName )
		, m_dBatchedDocs ( g_iRLPMaxBatchDocs )
		, m_iDocStart ( 0 )
		, m_iDocCount ( 0 )
		, m_pExtraTokenizer ( NULL )
		, m_pProxyStripper ( NULL )
	{
		// all markers are expected to encode into exactly PROXY_MARKER_LEN bytes
		assert ( sphUTF8Encode ( m_pMarkerDocStart, PROXY_DOCUMENT_START )==PROXY_MARKER_LEN );

		// pre-encode the markers used in the text passed to RLP
		sphUTF8Encode ( m_pMarkerDocStart, PROXY_DOCUMENT_START );
		sphUTF8Encode ( m_pMarkerChineseField, PROXY_FIELD_START_CHINESE );
		sphUTF8Encode ( m_pMarkerNonChineseField, PROXY_FIELD_START_NONCHINESE );
		sphUTF8Encode ( m_pMarkerTokenSeparator, PROXY_TOKEN_SEPARATOR );

		// pre-encode the markers used in the already tokenized fields
		sphUTF8Encode ( m_pMarkerTokenized, PROXY_TOKENIZED );
		sphUTF8Encode ( m_pMarkerMorph, PROXY_MORPH );

		const int INITIAL_BUFFER_SIZE = 1048576;
		m_dDocBuffer.Reserve ( INITIAL_BUFFER_SIZE );
	}

	virtual ~CSphSource_Proxy()
	{
		SafeDelete ( m_pExtraTokenizer );
		SafeDelete ( m_pProxyStripper );
	}

	/// append one token to the zero-terminated storage of field iField in pCurDoc
	/// an empty storage first gets the "tokenized" marker; every token is preceded by
	/// a space and, when pMarker is given, by that 3-byte marker
	/// pToken must be zero-terminated (iTokenLen+1 bytes get copied)
	void AppendToField ( StoredDoc_t * pCurDoc, int iField, BYTE * pToken, int iTokenLen, BYTE * pMarker )
	{
		assert ( pCurDoc && iField>=0 );
		CSphVector<BYTE> & tStorage = pCurDoc->m_dFieldStorage[iField];

		int iNewSize, iOldSize;
		iNewSize = iOldSize = tStorage.GetLength();
		if ( !iNewSize )
			iNewSize += PROXY_MARKER_LEN + 1;	// tokenized field marker + trailing zero

		iNewSize += iTokenLen+1;				// space before each token + token

		if ( pMarker )
			iNewSize += PROXY_MARKER_LEN;		// non-chinese token marker

		tStorage.Resize ( iNewSize );
		// non-empty storage: start at the old trailing zero, so that it gets overwritten
		BYTE * pPtr = tStorage.Begin() + ( iOldSize ? iOldSize-1 : 0 );

		if ( !iOldSize )
		{
			memcpy ( pPtr, m_pMarkerTokenized, PROXY_MARKER_LEN );
			pPtr += PROXY_MARKER_LEN;
		}

		*pPtr++ = ' ';

		if ( pMarker )
			COPY_MARKER ( pPtr, pMarker );

		// the token is copied along with its trailing zero
		memcpy ( pPtr, pToken, iTokenLen+1 );
		// the storage may have been reallocated by Resize(), refresh the field pointer
		pCurDoc->m_dFields[iField] = tStorage.Begin();
	}

	/// return the fields of the next document
	/// documents still sitting in the cache are served first; otherwise a new batch is
	/// read from T, tokenized, and its first document is returned
	/// returns NULL when the RLP filter could not be created, or when T yielded no documents
	virtual BYTE ** NextDocument ( CSphString & sError )
	{
		ISphTokenizer * pEmbeddedTokenizer = T::m_pTokenizer->GetEmbeddedTokenizer();
		assert ( pEmbeddedTokenizer );

		// do not run the stripper twice
		// (the base class stripper is taken over here and applied manually below)
		if ( CSphSource_Proxy<T>::m_pStripper )
		{
			m_pProxyStripper = CSphSource_Proxy<T>::m_pStripper;
			CSphSource_Proxy<T>::m_pStripper = NULL;
		}

		// lazily create the RLP filter on top of a clone of the embedded tokenizer
		if ( !m_pExtraTokenizer )
		{
			m_pExtraTokenizer = ISphTokenizer::CreateRLPFilter ( pEmbeddedTokenizer->Clone ( SPH_CLONE_INDEX ), true, g_sRLPRoot.cstr(),
				g_sRLPEnv.cstr(), T::m_pTokenizer->GetRLPContext(), false, sError );
			if ( !m_pExtraTokenizer )
				return NULL;
		}

		// serve previously batched documents first
		if ( !IsDocCacheEmpty() )
			return CopyDoc();

		if ( m_dFieldLengths.GetLength()!=T::m_tSchema.m_dFields.GetLength() )
			m_dFieldLengths.Resize ( T::m_tSchema.m_dFields.GetLength() );

		char szTmp [256];

		m_iDocStart = 0;
		int iCurDoc = 0;

		m_dDocBuffer.Resize(0);

		// phase 1: read documents from T until either the cache or the batch buffer is full
		while ( !IsDocCacheFull() && m_dDocBuffer.GetLength() < g_iRLPMaxBatchSize )
		{
			BYTE ** pFields = T::NextDocument ( sError );
			if ( !pFields )
				break;

			// estimate the buffer space needed by this document's fields
			// NOTE(review): lengths are measured before stripping, and in chinese fields every
			// non-chinese token emits 5 bytes (space + marker + space) regardless of its own
			// length - confirm that this estimate can never be exceeded by the writes below
			int iTotalFieldLen = 0;
			for ( int i = 0; i < T::m_tSchema.m_dFields.GetLength(); i++ )
			{
				m_dFieldLengths[i] = pFields[i] ? strlen ( (const char*)pFields[i] ) : 0;
				iTotalFieldLen += PROXY_MARKER_LEN+m_dFieldLengths[i]+2;
			}

			const int MAX_INDEX_LEN = 8;
			int iOldBufferLen = m_dDocBuffer.GetLength();
			m_dDocBuffer.Resize ( iOldBufferLen+PROXY_MARKER_LEN+MAX_INDEX_LEN+2+iTotalFieldLen );
			BYTE * pCurDocPtr = &(m_dDocBuffer[iOldBufferLen]);

			// snapshot the document's attributes into the next cache slot
			StoredDoc_t * pDoc = PushDoc();
			int nFields = T::m_tSchema.m_dFields.GetLength();
			CopyDocInfo ( pDoc->m_tDocInfo, T::m_tDocInfo );
			pDoc->m_dMva = T::m_dMva;
			pDoc->m_dStrAttrs = T::m_dStrAttrs;
			pDoc->m_dFields.Resize ( nFields );
			pDoc->m_dFieldStorage.Resize ( nFields );
			pDoc->m_dChinese.Resize ( nFields );
			pDoc->m_dNonChineseTokens.Resize ( 0 );

			// document start tag
			COPY_MARKER ( pCurDocPtr, m_pMarkerDocStart );

			// space
			*pCurDocPtr++ = ' ';

			// index in plain text
			// (parsed back with atoi() in phase 2 to locate the cache slot)
			int iLen = snprintf ( szTmp, sizeof(szTmp), "%d", iCurDoc );
			iLen = iLen>=0 ? iLen : sizeof(szTmp);
			memcpy ( pCurDocPtr, szTmp, iLen );
			pCurDocPtr += iLen;

			// space
			*pCurDocPtr++ = ' ';

			for ( int i = 0; i < T::m_tSchema.m_dFields.GetLength(); i++ )
			{
				// NOTE(review): pFields[i] may be NULL (see the length loop above), while
				// the detection and stripping calls below receive it unchecked - confirm they cope
				pDoc->m_dChinese[i] = sphDetectChinese ( pFields[i], m_dFieldLengths[i] );

				if ( m_pProxyStripper )
				{
					m_pProxyStripper->Strip ( pFields[i] );
					m_dFieldLengths[i] = strlen ( (const char *)pFields[i] );
				}

				int iFieldLen = m_dFieldLengths[i];

				if ( !pDoc->m_dChinese[i] )
				{
					// no chinese? just save the field storage without tokenizing it
					// it will be tokenized later in the splitter
					pDoc->m_dFieldStorage[i].Resize ( iFieldLen+1 );
					if ( pFields[i] )
						memcpy ( pDoc->m_dFieldStorage[i].Begin(), pFields[i], iFieldLen+1 );
					else
						pDoc->m_dFieldStorage[i][0] = 0;

					pDoc->m_dFields[i] = pDoc->m_dFieldStorage[i].Begin();

					// only the field marker goes into the batch buffer
					COPY_MARKER ( pCurDocPtr, m_pMarkerNonChineseField );
					*pCurDocPtr++ = ' ';
				} else
				{
					COPY_MARKER ( pCurDocPtr, m_pMarkerChineseField );
					*pCurDocPtr++ = ' ';

					// pre-tokenize the field and sort the tokens into chinese and non-chinese ones
					pEmbeddedTokenizer->SetBuffer ( pFields[i], iFieldLen );
					BYTE * pToken;
					while ( ( pToken = pEmbeddedTokenizer->GetToken() )!=NULL )
					{
						int iTokenLen = strlen ( (const char*)pToken );
						if ( sphDetectChinese ( pToken, iTokenLen ) )
						{
							// collect it in one big chinese token buffer that will be processed by RLP
							memcpy ( pCurDocPtr, pToken, iTokenLen );
							pCurDocPtr += iTokenLen;
						} else
						{
							// drop it into "non-chinese" token vector
							StoredToken_t & tStored = pDoc->m_dNonChineseTokens.Add();
							FillStoredTokenInfo ( tStored, pToken, pEmbeddedTokenizer );

							// add a 'non-chinese token' marker to the chinese token stream
							*pCurDocPtr++ = ' ';
							COPY_MARKER ( pCurDocPtr, m_pMarkerTokenSeparator );
						}

						*pCurDocPtr++ = ' ';
					}
				}
			}

			// trim the buffer down to what was actually written
			m_dDocBuffer.Resize ( pCurDocPtr-m_dDocBuffer.Begin() );
			iCurDoc++;
		}

		if ( IsDocCacheEmpty() )
			return NULL;

		// phase 2: run the whole batch through the RLP filter and rebuild the fields
		m_pExtraTokenizer->SetBuffer ( m_dDocBuffer.Begin(), m_dDocBuffer.GetLength() );
		BYTE * pToken;

		StoredDoc_t * pCurDoc = NULL;
		bool bIndexNext = false;	// set right after a document start marker; the next token is the document index
		int iField = -1;			// current field within pCurDoc
		int iStoredToken = 0;		// next entry of pCurDoc->m_dNonChineseTokens to substitute
		while ( ( pToken = m_pExtraTokenizer->GetToken() )!=NULL )
		{
			bool bSpecial = false;
			int iTokenLen = strlen ( (const char *)pToken );
			if ( bIndexNext )
			{
				// document index; switch to that cache slot and restart the per-document counters
				int iDoc = atoi ( (const char*)pToken );
				pCurDoc = &(m_dBatchedDocs[iDoc]);
				bIndexNext = false;
				iField = -1;
				iStoredToken = 0;
			} else
			{
				// markers are recognized by their length first, then by their bytes
				if ( iTokenLen==PROXY_MARKER_LEN )
				{
					if ( CMP_MARKER ( pToken, m_pMarkerDocStart ) )
					{
						bIndexNext = true;
						bSpecial = true;
					} else if ( CMP_MARKER ( pToken, m_pMarkerChineseField ) )
					{
						// next chinese field; its storage gets rebuilt from tokens
						assert ( pCurDoc );
						iField++;
						pCurDoc->m_dFieldStorage[iField].Resize(0);
						pCurDoc->m_dFields[iField] = pCurDoc->m_dFieldStorage[iField].Begin();
						bSpecial = true;
					} else if ( CMP_MARKER ( pToken, m_pMarkerNonChineseField ) )
					{
						// next non-chinese field; its storage was already filled in phase 1
						iField++;
						bSpecial = true;
					} else if ( CMP_MARKER ( pToken, m_pMarkerTokenSeparator ) )
					{
						StoredToken_t & tStored = pCurDoc->m_dNonChineseTokens[iStoredToken];

						// copy stored non-chinese token && pack token data
						AppendToField ( pCurDoc, iField, tStored.m_sToken, strlen ( (const char*)tStored.m_sToken ), m_pMarkerMorph );

						// this depends on SPH_MAX_WORD_LEN being 6 bits max
						// NOTE(review): iTokenLen here is the length of the separator marker itself
						// (always PROXY_MARKER_LEN inside this branch), not tStored.m_iTokenLen -
						// confirm which length the consumer of the packed value expects
						DWORD uPacked = iTokenLen;

						if ( tStored.m_bBoundary )
							uPacked |= PROXY_BOUNDARY_FLAG;

						if ( tStored.m_bSpecial )
							uPacked |= PROXY_SPECIAL_FLAG;

						if ( tStored.m_bBlended )
							uPacked |= PROXY_BLENDED_FLAG;

						if ( tStored.m_bBlendedPart )
							uPacked |= PROXY_BLENDED_PART_FLAG;

						// print the packed value (and the overshort count, if any) in hex
						int iTmpLen;
						if ( tStored.m_iOvershortCount )
						{
							uPacked |= PROXY_HAVE_OVERSHORT;
							iTmpLen = snprintf ( szTmp, sizeof(szTmp), " %x %x", uPacked, tStored.m_iOvershortCount );
						} else
							iTmpLen = snprintf ( szTmp, sizeof(szTmp), " %x", uPacked );

						if ( iTmpLen < 0 )
							iTmpLen = sizeof(szTmp);

						// append the printed data over the field's trailing zero, terminator included
						int iStoredLen = pCurDoc->m_dFieldStorage[iField].GetLength();
						pCurDoc->m_dFieldStorage[iField].Resize ( iStoredLen + iTmpLen );
						memcpy ( pCurDoc->m_dFieldStorage[iField].Begin()+iStoredLen-1, szTmp, iTmpLen+1 );

						pCurDoc->m_dFields[iField] = pCurDoc->m_dFieldStorage[iField].Begin();

						iStoredToken++;
						bSpecial = true;
					}
				}

				// simple token; append to current field
				if ( !bSpecial )
					AppendToField ( pCurDoc, iField, pToken, iTokenLen, NULL );
			}
		}

		return CopyDoc ();
	}

private:
	CSphSource_Document *	m_pSource;			///< not referenced anywhere in this class (and not initialized by the constructor)
	CSphFixedVector<StoredDoc_t> m_dBatchedDocs;	///< document cache, used as a ring of g_iRLPMaxBatchDocs slots
	CSphVector<BYTE>		m_dDocBuffer;		///< marker-delimited text of the current batch, fed to the RLP filter
	CSphVector<int>			m_dFieldLengths;	///< per-field byte lengths of the document being batched
	int						m_iDocStart;		///< ring index of the oldest cached document
	int						m_iDocCount;		///< number of cached documents
	ISphTokenizer *			m_pExtraTokenizer;	///< RLP filter tokenizer; owned
	CSphHTMLStripper *		m_pProxyStripper;	///< stripper taken over from the base class; owned

	// UTF-8 encoded markers for the text passed to RLP
	BYTE					m_pMarkerDocStart[PROXY_MARKER_LEN];
	BYTE					m_pMarkerChineseField[PROXY_MARKER_LEN];
	BYTE					m_pMarkerNonChineseField[PROXY_MARKER_LEN];
	BYTE					m_pMarkerTokenSeparator[PROXY_MARKER_LEN];

	// UTF-8 encoded markers for the already tokenized fields
	BYTE					m_pMarkerTokenized[PROXY_MARKER_LEN];
	BYTE					m_pMarkerMorph[PROXY_MARKER_LEN];

	bool					IsDocCacheEmpty() const	{ return !m_iDocCount; }
	bool					IsDocCacheFull() const { return m_iDocCount==m_dBatchedDocs.GetLength(); }

	/// claim the next free ring slot; the cache must not be full
	StoredDoc_t * PushDoc()
	{
		assert ( !IsDocCacheFull() );

		int iEnd = (m_iDocStart+m_iDocCount) % m_dBatchedDocs.GetLength();
		m_iDocCount++;

		return &(m_dBatchedDocs[iEnd]);
	}

	/// release the oldest ring slot and return it; the cache must not be empty
	/// the slot contents stay in place until the slot gets reused
	StoredDoc_t * PopDoc()
	{
		assert ( !IsDocCacheEmpty() );

		StoredDoc_t * pDoc = &(m_dBatchedDocs[m_iDocStart]);
		m_iDocStart = (m_iDocStart+1) % m_dBatchedDocs.GetLength();
		m_iDocCount--;
		return pDoc;
	}

	/// pop the oldest cached document and install it as the base source's current document
	/// docinfo is copied, MVA and string attributes are swapped in, and the returned
	/// field pointers refer to the cached document's storage
	BYTE ** CopyDoc ()
	{
		StoredDoc_t * pDoc = PopDoc();
		CopyDocInfo ( T::m_tDocInfo, pDoc->m_tDocInfo );
		T::m_tState.m_dFields = pDoc->m_dFields.Begin();
		T::m_dMva.SwapData ( pDoc->m_dMva );
		T::m_dStrAttrs.SwapData ( pDoc->m_dStrAttrs );

		return T::m_tState.m_dFields;
	}

	/// copy docid, weight, tag and the dynamic row from tFrom to tTo
	/// the static row pointer of tTo is always reset to NULL
	void CopyDocInfo ( CSphMatch & tTo, const CSphMatch & tFrom )
	{
		if ( tFrom.m_pDynamic )
		{
			int iDynamic = T::m_tSchema.GetRowSize();

			// allocate the destination row on first use only
			if ( !tTo.m_pDynamic )
				tTo.Reset ( iDynamic );

			memcpy ( tTo.m_pDynamic, tFrom.m_pDynamic, iDynamic*sizeof(CSphRowitem) );
		}

		tTo.m_pStatic = NULL;
		tTo.m_uDocID = tFrom.m_uDocID;
		tTo.m_iWeight = tFrom.m_iWeight;
		tTo.m_iTag = tFrom.m_iTag;
	}
};
2718 
2719 #endif // USE_RLP
2720 
2721 #endif // _sphinxint_
2722 
2723 //
2724 // $Id$
2725 //
2726