1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15
16 #ifndef _sphinxint_
17 #define _sphinxint_
18
19 #include "sphinx.h"
20 #include "sphinxfilter.h"
21 #include "sphinxrt.h"
22 #include "sphinxquery.h"
23 #include "sphinxexcerpt.h"
24 #include "sphinxudf.h"
25
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <float.h>
29
30 //////////////////////////////////////////////////////////////////////////
31 // INTERNAL CONSTANTS
32 //////////////////////////////////////////////////////////////////////////
33
// SPH_O_BINARY maps to O_BINARY where that flag is defined, and to 0 elsewhere,
// so that it can be OR-ed into open flags unconditionally
#ifdef O_BINARY
#define SPH_O_BINARY O_BINARY
#else
#define SPH_O_BINARY 0
#endif

// open flags: read-only
#define SPH_O_READ	( O_RDONLY | SPH_O_BINARY )
// open flags: create if missing, read-write, truncate existing contents
#define SPH_O_NEW	( O_CREAT | O_RDWR | O_TRUNC | SPH_O_BINARY )

#define MVA_DOWNSIZE		DWORD			// MVA32 offset type
#define MVA_OFFSET_MASK		0x7fffffffUL	// MVA offset mask
#define MVA_ARENA_FLAG		0x80000000UL	// MVA global-arena flag

// NOTE(review): presumably the default limit on matches kept per query; confirm against the users of this macro
#define DEFAULT_MAX_MATCHES 1000
48
49 //////////////////////////////////////////////////////////////////////////
50
// index file identification
const DWORD		INDEX_MAGIC_HEADER			= 0x58485053;		///< my magic 'SPHX' header
const DWORD		INDEX_FORMAT_VERSION		= 42;				///< my format version

// single-char control codes that travel through the stripper/tokenizer
const char		MAGIC_SYNONYM_WHITESPACE	= 1;				// used internally in tokenizer only
const char		MAGIC_CODE_SENTENCE			= 2;				// emitted from tokenizer on sentence boundary
const char		MAGIC_CODE_PARAGRAPH		= 3;				// emitted from stripper (and passed via tokenizer) on paragraph boundary
const char		MAGIC_CODE_ZONE				= 4;				// emitted from stripper (and passed via tokenizer) on zone boundary; followed by zero-terminated zone name

// single-char keyword markers that end up in the dictionary
// note that MAGIC_WORD_HEAD and MAGIC_WORD_TAIL share the same value
const char		MAGIC_WORD_HEAD				= 1;				// prepended to keyword by source, stored in (crc) dictionary
const char		MAGIC_WORD_TAIL				= 1;				// appended to keyword by source, stored in (crc) dictionary
const char		MAGIC_WORD_HEAD_NONSTEMMED	= 2;				// prepended to keyword by source, stored in dictionary
const char		MAGIC_WORD_BIGRAM			= 3;				// used as a bigram (keyword pair) separator, stored in dictionary

// special keywords; only declared here, the values live in another translation unit
extern const char *		MAGIC_WORD_SENTENCE;	///< value is "\3sentence"
extern const char *		MAGIC_WORD_PARAGRAPH;	///< value is "\3paragraph"
66
67 //////////////////////////////////////////////////////////////////////////
68 // INTERNAL GLOBALS
69 //////////////////////////////////////////////////////////////////////////
70
/// binlog, defined in sphinxrt.cpp
/// (the elaborated type specifier also forward-declares ISphBinlog)
extern class ISphBinlog *		g_pBinlog;

/// costs for max_predicted_time limits, defined in sphinxsearch.cpp
/// measured in nanoseconds (that is, 1e-9)
extern int g_iPredictorCostSkip;
extern int g_iPredictorCostDoc;
extern int g_iPredictorCostHit;
extern int g_iPredictorCostMatch;

/// JSON handling switches; only declared here
/// NOTE(review): judging by the names these control strict parsing, automatic
/// number conversion, and key name lowercasing; confirm at the definition site
extern bool g_bJsonStrict;
extern bool g_bJsonAutoconvNumbers;
extern bool g_bJsonKeynamesToLowercase;
84
85 //////////////////////////////////////////////////////////////////////////
86 // INTERNAL HELPER FUNCTIONS, CLASSES, ETC
87 //////////////////////////////////////////////////////////////////////////
88
// master list of query states, as ( enum suffix, description string ) pairs
// users define SPH_QUERY_STATE(_name,_desc) to whatever they need, expand
// SPH_QUERY_STATES, and undefine it again; the order of entries fixes the enum values
#define SPH_QUERY_STATES \
	SPH_QUERY_STATE ( UNKNOWN,		"unknown" ) \
	SPH_QUERY_STATE ( NET_READ,		"net_read" ) \
	SPH_QUERY_STATE ( IO,			"io" ) \
	SPH_QUERY_STATE ( DIST_CONNECT,	"dist_connect" ) \
	SPH_QUERY_STATE ( LOCAL_DF,		"local_df" ) \
	SPH_QUERY_STATE ( LOCAL_SEARCH,	"local_search" ) \
	SPH_QUERY_STATE ( SQL_PARSE,	"sql_parse" ) \
	SPH_QUERY_STATE ( FULLSCAN,		"fullscan" ) \
	SPH_QUERY_STATE ( DICT_SETUP,	"dict_setup" ) \
	SPH_QUERY_STATE ( PARSE,		"parse" ) \
	SPH_QUERY_STATE ( TRANSFORMS,	"transforms" ) \
	SPH_QUERY_STATE ( INIT,			"init" ) \
	SPH_QUERY_STATE ( INIT_SEGMENT,	"init_segment" ) \
	SPH_QUERY_STATE ( OPEN,			"open" ) \
	SPH_QUERY_STATE ( READ_DOCS,	"read_docs" ) \
	SPH_QUERY_STATE ( READ_HITS,	"read_hits" ) \
	SPH_QUERY_STATE ( GET_DOCS,		"get_docs" ) \
	SPH_QUERY_STATE ( GET_HITS,		"get_hits" ) \
	SPH_QUERY_STATE ( FILTER,		"filter" ) \
	SPH_QUERY_STATE ( RANK,			"rank" ) \
	SPH_QUERY_STATE ( SORT,			"sort" ) \
	SPH_QUERY_STATE ( FINALIZE,		"finalize" ) \
	SPH_QUERY_STATE ( DIST_WAIT,	"dist_wait" ) \
	SPH_QUERY_STATE ( AGGREGATE,	"aggregate" ) \
	SPH_QUERY_STATE ( NET_WRITE,	"net_write" ) \
	SPH_QUERY_STATE ( EVAL_POST,	"eval_post" ) \
	SPH_QUERY_STATE ( SNIPPET,		"eval_snippet" ) \
	SPH_QUERY_STATE ( EVAL_UDF,		"eval_udf" ) \
	SPH_QUERY_STATE ( TABLE_FUNC,	"table_func" )


/// possible query states, used for profiling
enum ESphQueryState
{
	// placed before the generated entries so that the first generated one gets value 0
	SPH_QSTATE_INFINUM = -1,

	// generate one SPH_QSTATE_xxx enumerator per list entry (descriptions are dropped here)
#define SPH_QUERY_STATE(_name,_desc) SPH_QSTATE_##_name,
	SPH_QUERY_STATES
#undef SPH_QUERY_STATE

	// number of generated states; also used as the "stopped" state by CSphQueryProfile
	SPH_QSTATE_TOTAL
};
// per-state arrays are indexed directly with the enum, so it must start from 0
STATIC_ASSERT ( SPH_QSTATE_UNKNOWN==0, BAD_QUERY_STATE_ENUM_BASE );
133
134
135 /// search query profile
136 class CSphQueryProfile
137 {
138 public:
139 ESphQueryState m_eState; ///< current state
140 int64_t m_tmStamp; ///< timestamp when we entered the current state
141
142 int m_dSwitches [ SPH_QSTATE_TOTAL+1 ]; ///< number of switches to given state
143 int64_t m_tmTotal [ SPH_QSTATE_TOTAL+1 ]; ///< total time spent per state
144
145 CSphStringBuilder m_sTransformedTree; ///< transformed query tree
146
147 public:
148 /// create empty and stopped profile
CSphQueryProfile()149 CSphQueryProfile()
150 {
151 Start ( SPH_QSTATE_TOTAL );
152 }
153
154 /// switch to a new query state, and record a timestamp
155 /// returns previous state, to simplify Push/Pop like scenarios
Switch(ESphQueryState eNew)156 ESphQueryState Switch ( ESphQueryState eNew )
157 {
158 int64_t tmNow = sphMicroTimer();
159 ESphQueryState eOld = m_eState;
160 m_dSwitches [ eOld ]++;
161 m_tmTotal [ eOld ] += tmNow - m_tmStamp;
162 m_eState = eNew;
163 m_tmStamp = tmNow;
164 return eOld;
165 }
166
167 /// reset everything and start profiling from a given state
Start(ESphQueryState eNew)168 void Start ( ESphQueryState eNew )
169 {
170 memset ( m_dSwitches, 0, sizeof(m_dSwitches) );
171 memset ( m_tmTotal, 0, sizeof(m_tmTotal) );
172 m_eState = eNew;
173 m_tmStamp = sphMicroTimer();
174 }
175
176 /// stop profiling
Stop()177 void Stop()
178 {
179 Switch ( SPH_QSTATE_TOTAL );
180 }
181 };
182
183
/// file writer with write buffering and int encoder
/// only the interface is declared here; out-of-line members are implemented elsewhere
class CSphWriter : ISphNoncopyable
{
public:
					CSphWriter ();
	virtual			~CSphWriter ();

	void			SetBufferSize ( int iBufferSize );	///< tune write cache size; must be called before OpenFile() or SetFile()

	bool			OpenFile ( const CSphString & sName, CSphString & sError );
	void			SetFile ( CSphAutofile & tAuto, SphOffset_t * pSharedOffset, CSphString & sError );
	void			CloseFile ( bool bTruncate = false );	///< note: calls Flush(), ie. IsError() might get true after this call
	void			UnlinkFile (); ///< for when something went wrong outside of the writer, and the file is not valid anymore

	// raw output; PutDword() and PutOffset() dump the in-memory bytes of the value as is
	void			PutByte ( int uValue );
	void			PutBytes ( const void * pData, int64_t iSize );
	void			PutDword ( DWORD uValue ) { PutBytes ( &uValue, sizeof(DWORD) ); }
	void			PutOffset ( SphOffset_t uValue ) { PutBytes ( &uValue, sizeof(SphOffset_t) ); }
	void			PutString ( const char * szString );
	void			PutString ( const CSphString & sString );
	void			Tag ( const char * sTag );

	void			SeekTo ( SphOffset_t pos ); ///< seeking inside the buffer will truncate it

	// docid width depends on the build; pick the matching raw writer
#if USE_64BIT
	void			PutDocid ( SphDocID_t uValue ) { PutOffset ( uValue ); }
#else
	void			PutDocid ( SphDocID_t uValue ) { PutDword ( uValue ); }
#endif

	// compressed integer output (the counterpart of the CSphReader::UnzipXXX() family, judging by the names)
	void			ZipInt ( DWORD uValue );
	void			ZipOffset ( uint64_t uValue );
	void			ZipOffsets ( CSphVector<SphOffset_t> * pData );

	bool			IsError () const { return m_bError; }
	SphOffset_t		GetPos () const { return m_iPos; }
	void			SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }

protected:
	CSphString		m_sName;
	SphOffset_t		m_iPos;				///< value reported by GetPos()
	SphOffset_t		m_iWritten;

	int				m_iFD;
	int				m_iPoolUsed;
	BYTE *			m_pBuffer;
	BYTE *			m_pPool;
	bool			m_bOwnFile;
	SphOffset_t *	m_pSharedOffset;	///< as passed to SetFile()
	int				m_iBufferSize;

	bool			m_bError;			///< value reported by IsError()
	CSphString *	m_pError;
	ThrottleState_t *	m_pThrottle;	///< as passed to SetThrottle(); not owned as far as this header shows

	virtual void	Flush ();
};
241
242
/// file which closes automatically when going out of scope
/// only the interface is declared here; out-of-line members are implemented elsewhere
class CSphAutofile : ISphNoncopyable
{
protected:
	int			m_iFD;			///< my file descriptor
	CSphString	m_sFilename;	///< my file name
	bool		m_bTemporary;	///< whether to unlink this file on Close()
	bool		m_bWouldTemporary; ///< backup of the m_bTemporary

	CSphIndexProgress *	m_pStat;	///< as passed to SetProgressCallback(), judging by the signature

public:
					CSphAutofile ();
					CSphAutofile ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp=false );
					~CSphAutofile ();

	int				Open ( const CSphString & sName, int iMode, CSphString & sError, bool bTemp=false );
	void			Close ();
	void			SetTemporary(); ///< to be used when something went wrong, and the file is not valid anymore

public:
	int				GetFD () const { return m_iFD; }
	const char *	GetFilename () const;
	SphOffset_t		GetSize ( SphOffset_t iMinSize, bool bCheckSizeT, CSphString & sError );
	SphOffset_t		GetSize ();

	bool			Read ( void * pBuf, int64_t iCount, CSphString & sError );
	void			SetProgressCallback ( CSphIndexProgress * pStat );
};
272
273
/// file reader with read buffering and int decoder
/// only the interface is declared here; out-of-line members are implemented elsewhere
class CSphReader
{
public:
	CSphQueryProfile *	m_pProfile;			///< optional profile to report to (NOTE(review): presumably switched to m_eProfileState around reads; confirm in implementation)
	ESphQueryState		m_eProfileState;

public:
	CSphReader ( BYTE * pBuf=NULL, int iSize=0 );
	virtual		~CSphReader ();

	void		SetBuffers ( int iReadBuffer, int iReadUnhinted );
	void		SetFile ( int iFD, const char * sFilename );
	void		SetFile ( const CSphAutofile & tFile );
	void		Reset ();
	void		SeekTo ( SphOffset_t iPos, int iSizeHint );

	void		SkipBytes ( int iCount );
	/// current position, computed as m_iPos plus the offset within the buffer
	SphOffset_t	GetPos () const { return m_iPos+m_iBuffPos; }

	void		GetBytes ( void * pData, int iSize );
	int			GetBytesZerocopy ( const BYTE ** ppData, int iMax ); ///< zerocopy method; returns actual length present in buffer (upto iMax)

	int			GetByte ();
	DWORD		GetDword ();
	SphOffset_t	GetOffset ();
	CSphString	GetString ();
	int			GetLine ( char * sBuffer, int iMaxLen );
	bool		Tag ( const char * sTag );

	// compressed integer input
	DWORD		UnzipInt ();
	uint64_t	UnzipOffset ();

	bool					GetErrorFlag () const		{ return m_bError; }
	const CSphString &		GetErrorMessage () const	{ return m_sError; }
	const CSphString &		GetFilename() const			{ return m_sFilename; }
	void					ResetError();

	// docid and wordid width depends on the build; pick the matching readers
#if USE_64BIT
	SphDocID_t	GetDocid ()		{ return GetOffset(); }
	SphDocID_t	UnzipDocid ()	{ return UnzipOffset(); }
	SphWordID_t	UnzipWordid ()	{ return UnzipOffset(); }
#else
	SphDocID_t	GetDocid ()		{ return GetDword(); }
	SphDocID_t	UnzipDocid ()	{ return UnzipInt(); }
	SphWordID_t	UnzipWordid ()	{ return UnzipInt(); }
#endif

	const CSphReader &	operator = ( const CSphReader & rhs );
	void		SetThrottle ( ThrottleState_t * pState ) { m_pThrottle = pState; }

protected:

	int			m_iFD;
	SphOffset_t	m_iPos;			///< added to m_iBuffPos by GetPos()

	int			m_iBuffPos;		///< added to m_iPos by GetPos()
	int			m_iBuffUsed;
	BYTE *		m_pBuff;
	int			m_iSizeHint;	///< how much do we expect to read

	int			m_iBufSize;
	bool		m_bBufOwned;
	int			m_iReadUnhinted;

	bool		m_bError;		///< value reported by GetErrorFlag()
	CSphString	m_sError;		///< value reported by GetErrorMessage()
	CSphString	m_sFilename;	///< value reported by GetFilename()
	ThrottleState_t *	m_pThrottle;	///< as passed to SetThrottle()

protected:
	virtual void		UpdateCache ();
};
347
348
/// scoped reader
/// adds Open()/Close() and a destructor on top of CSphReader
/// (NOTE(review): presumably the destructor closes the file; confirm in implementation)
class CSphAutoreader : public CSphReader
{
public:
				CSphAutoreader ( BYTE * pBuf=NULL, int iSize=0 ) : CSphReader ( pBuf, iSize ) {}
				~CSphAutoreader ();

	bool		Open ( const CSphString & sFilename, CSphString & sError );
	void		Close ();
	SphOffset_t	GetFilesize ();

public:
	// added for DebugCheck()
	// exposes the (protected) file descriptor of the base reader
	int			GetFD () { return m_iFD; }
};
364
365
366 //////////////////////////////////////////////////////////////////////////
367
/// generic COM-like uids
/// request codes for ISphExtra::ExtraData(); the meaning of the accompanying
/// data pointer depends on the code and is defined by the implementations
/// the numeric values follow from the declaration order
enum ExtraData_e
{
	// EXTRA_GET_DATA_xxx requests
	EXTRA_GET_DATA_ZONESPANS,
	EXTRA_GET_DATA_ZONESPANLIST,
	EXTRA_GET_DATA_RANKFACTORS,
	EXTRA_GET_DATA_PACKEDFACTORS,
	EXTRA_GET_DATA_RANKER_STATE,

	// EXTRA_GET_QUEUE_xxx requests
	EXTRA_GET_QUEUE_WORST,
	EXTRA_GET_QUEUE_SORTVAL,

	// EXTRA_SET_xxx requests
	EXTRA_SET_MVAPOOL,
	EXTRA_SET_STRINGPOOL,
	EXTRA_SET_POOL_CAPACITY,
	EXTRA_SET_MATCHPUSHED,
	EXTRA_SET_MATCHPOPPED,

	EXTRA_SET_RANKER_PLUGIN,
	EXTRA_SET_RANKER_PLUGIN_OPTS,

	EXTRA_GET_POOL_SIZE
};
391
392 /// generic COM-like interface
393 class ISphExtra
394 {
395 public:
~ISphExtra()396 virtual ~ISphExtra () {}
ExtraData(ExtraData_e eType,void ** ppData)397 inline bool ExtraData ( ExtraData_e eType, void** ppData )
398 {
399 return ExtraDataImpl ( eType, ppData );
400 }
401 private:
ExtraDataImpl(ExtraData_e,void **)402 virtual bool ExtraDataImpl ( ExtraData_e, void** )
403 {
404 return false;
405 }
406 };
407
408
409 class ISphRanker;
410 class ISphMatchSorter;
411 class UservarIntSet_c;
412
413
/// per-query search context
/// everything that index needs to compute/create to process the query
/// only the interface is declared here; all the methods are implemented elsewhere
class CSphQueryContext
{
public:
	// searching-only, per-query
	int							m_iWeights;						///< search query field weights count
	int							m_dWeights [ SPH_MAX_FIELDS ];	///< search query field weights

	bool						m_bLookupFilter;				///< row data lookup required at filtering stage
	bool						m_bLookupSort;					///< row data lookup required at sorting stage

	DWORD						m_uPackedFactorFlags;			///< whether we need to calculate packed factors (and some extra options)

	ISphFilter *				m_pFilter;
	ISphFilter *				m_pWeightFilter;

	/// one expression to evaluate, and where to store its result in a match
	struct CalcItem_t
	{
		CSphAttrLocator			m_tLoc;					///< result locator
		ESphAttr				m_eType;				///< result type
		ISphExpr *				m_pExpr;				///< evaluator (non-owned)
	};
	CSphVector<CalcItem_t>		m_dCalcFilter;			///< items to compute for filtering
	CSphVector<CalcItem_t>		m_dCalcSort;			///< items to compute for sorting/grouping
	CSphVector<CalcItem_t>		m_dCalcFinal;			///< items to compute when finalizing result set
	CSphVector<CalcItem_t>		m_dCalcPostAggregate;	///< items to compute that depend on aggregates over the finalized result set

	const CSphVector<CSphAttrOverride> *	m_pOverrides;		///< overridden attribute values
	CSphVector<CSphAttrLocator>				m_dOverrideIn;
	CSphVector<CSphAttrLocator>				m_dOverrideOut;

	const void *							m_pIndexData;			///< backend specific data
	CSphQueryProfile *						m_pProfile;
	const SmallStringHash_T<int64_t> *		m_pLocalDocs;
	int64_t									m_iTotalDocs;
	int64_t									m_iBadRows;

public:
	CSphQueryContext ();
	~CSphQueryContext ();

	// setup
	void	BindWeights ( const CSphQuery * pQuery, const CSphSchema & tSchema, CSphString & sWarning );
	bool	SetupCalc ( CSphQueryResult * pResult, const ISphSchema & tInSchema, const CSphSchema & tSchema, const DWORD * pMvaPool, bool bArenaProhibit, bool bExtractPostAggr );
	bool	CreateFilters ( bool bFullscan, const CSphVector<CSphFilterSettings> * pdFilters, const ISphSchema & tSchema, const DWORD * pMvaPool, const BYTE * pStrings, CSphString & sError, ESphCollation eCollation, bool bArenaProhibit, const KillListVector & dKillList );
	bool	SetupOverrides ( const CSphQuery * pQuery, CSphQueryResult * pResult, const CSphSchema & tIndexSchema, const ISphSchema & tOutgoingSchema );

	// per-match evaluation, one method per m_dCalcXXX list (judging by the names)
	void	CalcFilter ( CSphMatch & tMatch ) const;
	void	CalcSort ( CSphMatch & tMatch ) const;
	void	CalcFinal ( CSphMatch & tMatch ) const;
	void	CalcPostAggregate ( CSphMatch & tMatch ) const;

	void	FreeStrFilter ( CSphMatch & tMatch ) const;
	void	FreeStrSort ( CSphMatch & tMatch ) const;
	void	FreeStrFinal ( CSphMatch & tMatch ) const;

	// note that RT index bind pools at segment searching, not at time it setups context
	void	ExprCommand ( ESphExprCommand eCmd, void * pArg );
	void	SetStringPool ( const BYTE * pStrings );
	void	SetMVAPool ( const DWORD * pMva, bool bArenaProhibit );
	void	SetupExtraData ( ISphRanker * pRanker, ISphMatchSorter * pSorter );

private:
	CSphVector<const UservarIntSet_c*>		m_dUserVals;
};
479
480 //////////////////////////////////////////////////////////////////////////
481 // MEMORY TRACKER
482 //////////////////////////////////////////////////////////////////////////
483
// master list of memory categories
// users define MEM_CATEGORY(_arg) to whatever they need, expand MEM_CATEGORIES
// (a comma-separated list), and undefine MEM_CATEGORY again
#define MEM_CATEGORIES \
	MEM_CATEGORY(MEM_CORE), \
	MEM_CATEGORY(MEM_INDEX_DISK), \
	MEM_CATEGORY(MEM_INDEX_RT), \
	MEM_CATEGORY(MEM_API_HANDLE ), \
	MEM_CATEGORY(MEM_API_SEARCH ), \
	MEM_CATEGORY(MEM_API_QUERY ), \
	MEM_CATEGORY(MEM_RT_ACCUM), \
	MEM_CATEGORY(MEM_MMAPED), \
	MEM_CATEGORY(MEM_BINLOG), \
	MEM_CATEGORY(MEM_SQL_HANDLE), \
	MEM_CATEGORY(MEM_SQL_INSERT), \
	MEM_CATEGORY(MEM_SQL_SELECT), \
	MEM_CATEGORY(MEM_SQL_DELETE), \
	MEM_CATEGORY(MEM_SQL_SET), \
	MEM_CATEGORY(MEM_SQL_BEGIN), \
	MEM_CATEGORY(MEM_SQL_COMMIT), \
	MEM_CATEGORY(MEM_SQL_ALTER), \
	MEM_CATEGORY(MEM_DISK_QUERY), \
	MEM_CATEGORY(MEM_DISK_QUERYEX), \
	MEM_CATEGORY(MEM_RT_QUERY), \
	MEM_CATEGORY(MEM_RT_RES_MATCHES), \
	MEM_CATEGORY(MEM_RT_RES_STRINGS)

// here every list entry expands to itself, which produces the enumerators
#define MEM_CATEGORY(_arg) _arg
enum MemCategory_e
{
	MEM_CATEGORIES,
	MEM_TOTAL	// number of categories
};
#undef MEM_CATEGORY
515
#if SPH_ALLOCS_PROFILER

void sphMemStatPush ( MemCategory_e eCategory );
void sphMemStatPop ( MemCategory_e eCategory );

/// scoped memory tracker
/// pushes its category on construction, and pops the very same category on destruction
struct MemTracker_c : ISphNoncopyable
{
	const MemCategory_e m_eCategory; ///< category

	/// ctor
	explicit MemTracker_c ( MemCategory_e eCategory )
		: m_eCategory ( eCategory )
	{
		sphMemStatPush ( m_eCategory );
	}

	/// dtor
	~MemTracker_c ()
	{
		sphMemStatPop ( m_eCategory );
	}
};

// operands of ## are not macro-expanded, so pasting __LINE__ directly would produce
// the literal token "__LINE__" instead of the line number; the extra level of
// indirection lets __LINE__ expand to the number before the pasting happens
#define MEMORY_TRACKER_PASTE_(_prefix,_line,_name) _prefix##_line##_name
#define MEMORY_TRACKER_NAME_(_prefix,_line,_name) MEMORY_TRACKER_PASTE_(_prefix,_line,_name)

/// declare a tracker for a given category that lives until the end of the enclosing scope
/// variable name embeds both the line number and the category name
#define MEMORY(name) MemTracker_c MEMORY_TRACKER_NAME_(tracker_,__LINE__,name)(name);

#else // SPH_ALLOCS_PROFILER 0

/// profiler is disabled, trackers compile to nothing
#define MEMORY(name)

#endif // if SPH_ALLOCS_PROFILER
547
548 //////////////////////////////////////////////////////////////////////////
549 // BLOCK-LEVEL ATTRIBUTE INDEX BUILDER
550 //////////////////////////////////////////////////////////////////////////
551
552 #define DOCINFO_INDEX_FREQ 128 // FIXME? make this configurable
553 #define SPH_SKIPLIST_BLOCK 128 ///< must be a power of two
554
MVA_UPSIZE(const DWORD * pMva)555 inline int64_t MVA_UPSIZE ( const DWORD * pMva )
556 {
557 int64_t iMva = (int64_t)( (uint64_t)pMva[0] | ( ( (uint64_t)pMva[1] )<<32 ) );
558 return iMva;
559 }
560
561
// FIXME!!! for over INT_MAX attributes
/// attr min-max builder
/// rows are fed in one by one; every DOCINFO_INDEX_FREQ rows a pair of (min,max)
/// entries gets written to the output buffer, and FinishCollect() appends one
/// more pair that covers the entire index
template < typename DOCID = SphDocID_t >
class AttrIndexBuilder_t : ISphNoncopyable
{
private:
	// attribute locators by kind; m_dMvaAttrs holds 32-bit sets first, then 64-bit ones
	CSphVector<CSphAttrLocator>	m_dIntAttrs;
	CSphVector<CSphAttrLocator>	m_dFloatAttrs;
	CSphVector<CSphAttrLocator>	m_dMvaAttrs;
	// accumulators; plain ones cover the current block, IndexXXX ones cover the entire index
	CSphVector<SphAttr_t>		m_dIntMin;
	CSphVector<SphAttr_t>		m_dIntMax;
	CSphVector<SphAttr_t>		m_dIntIndexMin;
	CSphVector<SphAttr_t>		m_dIntIndexMax;
	CSphVector<float>			m_dFloatMin;
	CSphVector<float>			m_dFloatMax;
	CSphVector<float>			m_dFloatIndexMin;
	CSphVector<float>			m_dFloatIndexMax;
	CSphVector<int64_t>			m_dMvaMin;
	CSphVector<int64_t>			m_dMvaMax;
	CSphVector<int64_t>			m_dMvaIndexMin;
	CSphVector<int64_t>			m_dMvaIndexMax;
	DWORD						m_uStride;		// size of attribute's chunk (in DWORDs)
	DWORD						m_uElements;	// counts total number of collected min/max pairs
	int							m_iLoop;		// loop inside one set
	DWORD *						m_pOutBuffer;	// storage for collected min/max
	DWORD *						m_pOutMax;		// storage max for bound checking
	DOCID						m_uStart;		// first and last docids of current chunk
	DOCID						m_uLast;
	DOCID						m_uIndexStart;	// first and last docids of whole index
	DOCID						m_uIndexLast;
	int							m_iMva64;		// index of the first 64-bit set in m_dMvaAttrs

private:
	void ResetLocal();
	void FlushComputed();
	void UpdateMinMaxDocids ( DOCID uDocID );
	void CollectRowMVA ( int iAttr, DWORD uCount, const DWORD * pMva );
	void CollectWithoutMvas ( const DWORD * pCur );

public:
	explicit AttrIndexBuilder_t ( const CSphSchema & tSchema );

	/// bind (and zero out) the output buffer, and reset all the accumulators
	void Prepare ( DWORD * pOutBuffer, DWORD * pOutMax );

	/// account for one row, MVAs included; returns false and sets sError on a broken MVA reference
	bool Collect ( const DWORD * pCur, const DWORD * pMvas, int64_t iMvasCount, CSphString & sError, bool bHasMvaID );

	/// flush the last (incomplete) block if any, and write the index-wide min/max pair
	void FinishCollect ();

	/// actually used part of output buffer, only used with index merge
	/// (we reserve space for rows from both indexes, but might kill some rows)
	inline int64_t GetActualSize() const
	{
		// two entries (min, max) of m_uStride DWORDs each per collected pair
		return int64_t ( m_uElements ) * m_uStride * 2;
	}

	/// how many DWORDs will we need for block index
	inline int64_t GetExpectedSize ( int64_t iMaxDocs ) const
	{
		assert ( iMaxDocs>=0 );
		// blocks needed, rounding up; plus one extra pair for the index-wide entry
		int64_t iDocinfoIndex = ( iMaxDocs + DOCINFO_INDEX_FREQ - 1 ) / DOCINFO_INDEX_FREQ;
		return ( iDocinfoIndex + 1 ) * m_uStride * 2;
	}
};

typedef AttrIndexBuilder_t<> AttrIndexBuilder_c;
627
// fallback for build systems that do not provide the long long limits
// LLONG_MAX must be cast back to (signed) long long; without the cast it has
// unsigned long long type, and LLONG_MIN derived from it wraps around to a huge
// positive unsigned value instead of being negative
#ifndef LLONG_MAX
#define LLONG_MAX ((long long)(((unsigned long long)(-1))>>1))
#endif

#ifndef LLONG_MIN
#define LLONG_MIN (-LLONG_MAX-1)
#endif

// signed times unsigned gets computed as unsigned, so this does not overflow
#ifndef ULLONG_MAX
#define ULLONG_MAX (LLONG_MAX * 2ULL + 1)
#endif
640
641
642 template < typename DOCID >
ResetLocal()643 void AttrIndexBuilder_t<DOCID>::ResetLocal()
644 {
645 ARRAY_FOREACH ( i, m_dIntMin )
646 {
647 m_dIntMin[i] = LLONG_MAX;
648 m_dIntMax[i] = 0;
649 }
650 ARRAY_FOREACH ( i, m_dFloatMin )
651 {
652 m_dFloatMin[i] = FLT_MAX;
653 m_dFloatMax[i] = -FLT_MAX;
654 }
655 ARRAY_FOREACH ( i, m_dMvaMin )
656 {
657 m_dMvaMin[i] = LLONG_MAX;
658 m_dMvaMax[i] = ( i>=m_iMva64 ? LLONG_MIN : 0 );
659 }
660 m_uStart = m_uLast = 0;
661 m_iLoop = 0;
662 }
663
664 template < typename DOCID >
FlushComputed()665 void AttrIndexBuilder_t<DOCID>::FlushComputed ()
666 {
667 assert ( m_pOutBuffer );
668 DWORD * pMinEntry = m_pOutBuffer + 2 * m_uElements * m_uStride;
669 DWORD * pMaxEntry = pMinEntry + m_uStride;
670 CSphRowitem * pMinAttrs = DOCINFO2ATTRS_T<DOCID> ( pMinEntry );
671 CSphRowitem * pMaxAttrs = pMinAttrs + m_uStride;
672
673 assert ( pMaxEntry+m_uStride<=m_pOutMax );
674 assert ( pMaxAttrs+m_uStride-DOCINFO_IDSIZE<=m_pOutMax );
675
676 m_uIndexLast = m_uLast;
677
678 DOCINFOSETID ( pMinEntry, m_uStart );
679 DOCINFOSETID ( pMaxEntry, m_uLast );
680
681 ARRAY_FOREACH ( i, m_dIntAttrs )
682 {
683 m_dIntIndexMin[i] = Min ( m_dIntIndexMin[i], m_dIntMin[i] );
684 m_dIntIndexMax[i] = Max ( m_dIntIndexMax[i], m_dIntMax[i] );
685 sphSetRowAttr ( pMinAttrs, m_dIntAttrs[i], m_dIntMin[i] );
686 sphSetRowAttr ( pMaxAttrs, m_dIntAttrs[i], m_dIntMax[i] );
687 }
688 ARRAY_FOREACH ( i, m_dFloatAttrs )
689 {
690 m_dFloatIndexMin[i] = Min ( m_dFloatIndexMin[i], m_dFloatMin[i] );
691 m_dFloatIndexMax[i] = Max ( m_dFloatIndexMax[i], m_dFloatMax[i] );
692 sphSetRowAttr ( pMinAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatMin[i] ) );
693 sphSetRowAttr ( pMaxAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatMax[i] ) );
694 }
695
696 ARRAY_FOREACH ( i, m_dMvaAttrs )
697 {
698 m_dMvaIndexMin[i] = Min ( m_dMvaIndexMin[i], m_dMvaMin[i] );
699 m_dMvaIndexMax[i] = Max ( m_dMvaIndexMax[i], m_dMvaMax[i] );
700 sphSetRowAttr ( pMinAttrs, m_dMvaAttrs[i], m_dMvaMin[i] );
701 sphSetRowAttr ( pMaxAttrs, m_dMvaAttrs[i], m_dMvaMax[i] );
702 }
703
704 m_uElements++;
705 ResetLocal();
706 }
707
708 template < typename DOCID >
UpdateMinMaxDocids(DOCID uDocID)709 void AttrIndexBuilder_t<DOCID>::UpdateMinMaxDocids ( DOCID uDocID )
710 {
711 if ( !m_uStart )
712 m_uStart = uDocID;
713 if ( !m_uIndexStart )
714 m_uIndexStart = uDocID;
715 m_uLast = uDocID;
716 }
717
718 template < typename DOCID >
AttrIndexBuilder_t(const CSphSchema & tSchema)719 AttrIndexBuilder_t<DOCID>::AttrIndexBuilder_t ( const CSphSchema & tSchema )
720 : m_uStride ( DWSIZEOF(DOCID) + tSchema.GetRowSize() )
721 , m_uElements ( 0 )
722 , m_iLoop ( 0 )
723 , m_pOutBuffer ( NULL )
724 , m_pOutMax ( NULL )
725 , m_uStart ( 0 )
726 , m_uLast ( 0 )
727 , m_uIndexStart ( 0 )
728 , m_uIndexLast ( 0 )
729 {
730 for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
731 {
732 const CSphColumnInfo & tCol = tSchema.GetAttr(i);
733 switch ( tCol.m_eAttrType )
734 {
735 case SPH_ATTR_INTEGER:
736 case SPH_ATTR_TIMESTAMP:
737 case SPH_ATTR_BOOL:
738 case SPH_ATTR_BIGINT:
739 case SPH_ATTR_TOKENCOUNT:
740 m_dIntAttrs.Add ( tCol.m_tLocator );
741 break;
742
743 case SPH_ATTR_FLOAT:
744 m_dFloatAttrs.Add ( tCol.m_tLocator );
745 break;
746
747 case SPH_ATTR_UINT32SET:
748 m_dMvaAttrs.Add ( tCol.m_tLocator );
749 break;
750
751 default:
752 break;
753 }
754 }
755
756 m_iMva64 = m_dMvaAttrs.GetLength();
757 for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
758 {
759 const CSphColumnInfo & tCol = tSchema.GetAttr(i);
760 if ( tCol.m_eAttrType==SPH_ATTR_INT64SET )
761 m_dMvaAttrs.Add ( tCol.m_tLocator );
762 }
763
764
765 m_dIntMin.Resize ( m_dIntAttrs.GetLength() );
766 m_dIntMax.Resize ( m_dIntAttrs.GetLength() );
767 m_dIntIndexMin.Resize ( m_dIntAttrs.GetLength() );
768 m_dIntIndexMax.Resize ( m_dIntAttrs.GetLength() );
769 m_dFloatMin.Resize ( m_dFloatAttrs.GetLength() );
770 m_dFloatMax.Resize ( m_dFloatAttrs.GetLength() );
771 m_dFloatIndexMin.Resize ( m_dFloatAttrs.GetLength() );
772 m_dFloatIndexMax.Resize ( m_dFloatAttrs.GetLength() );
773 m_dMvaMin.Resize ( m_dMvaAttrs.GetLength() );
774 m_dMvaMax.Resize ( m_dMvaAttrs.GetLength() );
775 m_dMvaIndexMin.Resize ( m_dMvaAttrs.GetLength() );
776 m_dMvaIndexMax.Resize ( m_dMvaAttrs.GetLength() );
777 }
778
779 template < typename DOCID >
Prepare(DWORD * pOutBuffer,DWORD * pOutMax)780 void AttrIndexBuilder_t<DOCID>::Prepare ( DWORD * pOutBuffer, DWORD * pOutMax )
781 {
782 m_pOutBuffer = pOutBuffer;
783 m_pOutMax = pOutMax;
784 memset ( pOutBuffer, 0, ( pOutMax-pOutBuffer )*sizeof(DWORD) );
785
786 m_uElements = 0;
787 m_uIndexStart = m_uIndexLast = 0;
788 ARRAY_FOREACH ( i, m_dIntIndexMin )
789 {
790 m_dIntIndexMin[i] = LLONG_MAX;
791 m_dIntIndexMax[i] = 0;
792 }
793 ARRAY_FOREACH ( i, m_dFloatIndexMin )
794 {
795 m_dFloatIndexMin[i] = FLT_MAX;
796 m_dFloatIndexMax[i] = -FLT_MAX;
797 }
798 ARRAY_FOREACH ( i, m_dMvaIndexMin )
799 {
800 m_dMvaIndexMin[i] = LLONG_MAX;
801 m_dMvaIndexMax[i] = ( i>=m_iMva64 ? LLONG_MIN : 0 );
802 }
803 ResetLocal();
804 }
805
806 template < typename DOCID >
CollectWithoutMvas(const DWORD * pCur)807 void AttrIndexBuilder_t<DOCID>::CollectWithoutMvas ( const DWORD * pCur )
808 {
809 // check if it is time to flush already collected values
810 if ( m_iLoop>=DOCINFO_INDEX_FREQ )
811 FlushComputed ();
812
813 const DWORD * pRow = DOCINFO2ATTRS_T<DOCID>(pCur);
814 UpdateMinMaxDocids ( DOCINFO2ID_T<DOCID>(pCur) );
815 m_iLoop++;
816
817 // ints
818 ARRAY_FOREACH ( i, m_dIntAttrs )
819 {
820 SphAttr_t uVal = sphGetRowAttr ( pRow, m_dIntAttrs[i] );
821 m_dIntMin[i] = Min ( m_dIntMin[i], uVal );
822 m_dIntMax[i] = Max ( m_dIntMax[i], uVal );
823 }
824
825 // floats
826 ARRAY_FOREACH ( i, m_dFloatAttrs )
827 {
828 float fVal = sphDW2F ( (DWORD)sphGetRowAttr ( pRow, m_dFloatAttrs[i] ) );
829 m_dFloatMin[i] = Min ( m_dFloatMin[i], fVal );
830 m_dFloatMax[i] = Max ( m_dFloatMax[i], fVal );
831 }
832 }
833
834 template < typename DOCID >
CollectRowMVA(int iAttr,DWORD uCount,const DWORD * pMva)835 void AttrIndexBuilder_t<DOCID>::CollectRowMVA ( int iAttr, DWORD uCount, const DWORD * pMva )
836 {
837 if ( iAttr>=m_iMva64 )
838 {
839 assert ( ( uCount%2 )==0 );
840 for ( ; uCount>0; uCount-=2, pMva+=2 )
841 {
842 int64_t iVal = MVA_UPSIZE ( pMva );
843 m_dMvaMin[iAttr] = Min ( m_dMvaMin[iAttr], iVal );
844 m_dMvaMax[iAttr] = Max ( m_dMvaMax[iAttr], iVal );
845 }
846 } else
847 {
848 for ( ; uCount>0; uCount--, pMva++ )
849 {
850 DWORD uVal = *pMva;
851 m_dMvaMin[iAttr] = Min ( m_dMvaMin[iAttr], uVal );
852 m_dMvaMax[iAttr] = Max ( m_dMvaMax[iAttr], uVal );
853 }
854 }
855 }
856
857 template < typename DOCID >
Collect(const DWORD * pCur,const DWORD * pMvas,int64_t iMvasCount,CSphString & sError,bool bHasMvaID)858 bool AttrIndexBuilder_t<DOCID>::Collect ( const DWORD * pCur, const DWORD * pMvas, int64_t iMvasCount, CSphString & sError, bool bHasMvaID )
859 {
860 CollectWithoutMvas ( pCur );
861
862 const DWORD * pRow = DOCINFO2ATTRS_T<DOCID>(pCur);
863 SphDocID_t uDocID = DOCINFO2ID_T<DOCID>(pCur);
864
865 // MVAs
866 ARRAY_FOREACH ( i, m_dMvaAttrs )
867 {
868 SphAttr_t uOff = sphGetRowAttr ( pRow, m_dMvaAttrs[i] );
869 if ( !uOff )
870 continue;
871
872 // sanity checks
873 if ( uOff>=iMvasCount )
874 {
875 sError.SetSprintf ( "broken index: mva offset out of bounds, id=" DOCID_FMT, (SphDocID_t)uDocID );
876 return false;
877 }
878
879 const DWORD * pMva = pMvas + uOff; // don't care about updates at this point
880
881 if ( bHasMvaID && i==0 && DOCINFO2ID_T<DOCID> ( pMva-DWSIZEOF(DOCID) )!=uDocID )
882 {
883 sError.SetSprintf ( "broken index: mva docid verification failed, id=" DOCID_FMT, (SphDocID_t)uDocID );
884 return false;
885 }
886
887 DWORD uCount = *pMva++;
888 if ( ( uOff+uCount>=iMvasCount ) || ( i>=m_iMva64 && ( uCount%2 )!=0 ) )
889 {
890 sError.SetSprintf ( "broken index: mva list out of bounds, id=" DOCID_FMT, (SphDocID_t)uDocID );
891 return false;
892 }
893
894 // walk and calc
895 CollectRowMVA ( i, uCount, pMva );
896 }
897 return true;
898 }
899
900 template < typename DOCID >
FinishCollect()901 void AttrIndexBuilder_t<DOCID>::FinishCollect ()
902 {
903 assert ( m_pOutBuffer );
904 if ( m_iLoop )
905 FlushComputed ();
906
907 DWORD * pMinEntry = m_pOutBuffer + 2 * m_uElements * m_uStride;
908 DWORD * pMaxEntry = pMinEntry + m_uStride;
909 CSphRowitem * pMinAttrs = DOCINFO2ATTRS_T<DOCID> ( pMinEntry );
910 CSphRowitem * pMaxAttrs = pMinAttrs + m_uStride;
911
912 assert ( pMaxEntry+m_uStride<=m_pOutMax );
913 assert ( pMaxAttrs+m_uStride-DWSIZEOF(DOCID)<=m_pOutMax );
914
915 DOCINFOSETID ( pMinEntry, m_uIndexStart );
916 DOCINFOSETID ( pMaxEntry, m_uIndexLast );
917
918 ARRAY_FOREACH ( i, m_dMvaAttrs )
919 {
920 sphSetRowAttr ( pMinAttrs, m_dMvaAttrs[i], m_dMvaIndexMin[i] );
921 sphSetRowAttr ( pMaxAttrs, m_dMvaAttrs[i], m_dMvaIndexMax[i] );
922 }
923
924 ARRAY_FOREACH ( i, m_dIntAttrs )
925 {
926 sphSetRowAttr ( pMinAttrs, m_dIntAttrs[i], m_dIntIndexMin[i] );
927 sphSetRowAttr ( pMaxAttrs, m_dIntAttrs[i], m_dIntIndexMax[i] );
928 }
929 ARRAY_FOREACH ( i, m_dFloatAttrs )
930 {
931 sphSetRowAttr ( pMinAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatIndexMin[i] ) );
932 sphSetRowAttr ( pMaxAttrs, m_dFloatAttrs[i], sphF2DW ( m_dFloatIndexMax[i] ) );
933 }
934 m_uElements++;
935 }
936
937 struct PoolPtrs_t
938 {
939 const DWORD * m_pMva;
940 const BYTE * m_pStrings;
941 bool m_bArenaProhibit;
942
PoolPtrs_tPoolPtrs_t943 PoolPtrs_t ()
944 : m_pMva ( NULL )
945 , m_pStrings ( NULL )
946 , m_bArenaProhibit ( false )
947 {}
948 };
949
950 class CSphFreeList
951 {
952 private:
953 CSphTightVector<int> m_dFree;
954 int m_iNextFree;
955 #ifndef NDEBUG
956 int m_iSize;
957 #endif
958
959 public:
CSphFreeList()960 CSphFreeList ()
961 : m_iNextFree ( 0 )
962 #ifndef NDEBUG
963 , m_iSize ( 0 )
964 #endif
965 {}
966
Reset(int iSize)967 void Reset ( int iSize )
968 {
969 #ifndef NDEBUG
970 m_iSize = iSize;
971 #endif
972 m_iNextFree = 0;
973 m_dFree.Reserve ( iSize );
974 }
975
Get()976 int Get ()
977 {
978 int iRes = -1;
979 if ( m_dFree.GetLength () )
980 iRes = m_dFree.Pop ();
981 else
982 iRes = m_iNextFree++;
983 assert ( iRes>=0 && iRes<m_iSize );
984 return iRes;
985 }
986
Free(int iIndex)987 void Free ( int iIndex )
988 {
989 assert ( iIndex>=0 && iIndex<m_iSize );
990 m_dFree.Add ( iIndex );
991 }
992 };
993
994 //////////////////////////////////////////////////////////////////////////
995 // INLINES, FIND_XXX() GENERIC FUNCTIONS
996 //////////////////////////////////////////////////////////////////////////
997
/// find a value-enclosing span in a sorted vector (aka an index at which vec[i] <= val < vec[i+1])
///
/// only operator< and operator== between T and U are used
/// vectors of up to iSmallTreshold elements are scanned linearly, longer ones are bisected
///
/// returns -1 on an empty vector, or when no enclosing span is found (eg. tRef is below the first element)
/// returns the last index when tRef is greater than or equal to the last element
/// returns the span start index otherwise
template < typename T, typename U >
static int FindSpan ( const CSphVector<T> & dVec, U tRef, int iSmallTreshold=8 )
{
	// empty vector
	if ( !dVec.GetLength() )
		return -1;

	// check last semi-span
	if ( dVec.Last()<tRef || dVec.Last()==tRef )
		return dVec.GetLength()-1;

	// linear search for small vectors
	if ( dVec.GetLength()<=iSmallTreshold )
	{
		for ( int i=0; i<dVec.GetLength()-1; i++ )
			if ( ( dVec[i]<tRef || dVec[i]==tRef ) && tRef<dVec[i+1] )
				return i;
		return -1;
	}

	// binary search for longer vectors
	const T * pStart = dVec.Begin();
	const T * pEnd = &dVec.Last();

	// the very first and the very last spans get checked upfront,
	// so that the loop below only ever deals with inner spans
	if ( ( pStart[0]<tRef || pStart[0]==tRef ) && tRef<pStart[1] )
		return 0;

	if ( ( pEnd[-1]<tRef || pEnd[-1]==tRef ) && tRef<pEnd[0] )
		return pEnd-dVec.Begin()-1;

	while ( pEnd-pStart>1 )
	{
		// tRef is out of the [pStart,pEnd] range, nothing to find
		if ( tRef<*pStart || *pEnd<tRef )
			break;
		assert ( *pStart<tRef );
		assert ( tRef<*pEnd );

		const T * pMid = pStart + (pEnd-pStart)/2;
		assert ( pMid+1 < &dVec.Last() );

		if ( ( pMid[0]<tRef || pMid[0]==tRef ) && tRef<pMid[1] )
			return pMid - dVec.Begin();

		// narrow down to the half that can still enclose tRef
		if ( tRef<pMid[0] )
			pEnd = pMid;
		else
			pStart = pMid;
	}

	return -1;
}
1050
1051
FindBit(DWORD uValue)1052 inline int FindBit ( DWORD uValue )
1053 {
1054 DWORD uMask = 0xffff;
1055 int iIdx = 0;
1056 int iBits = 16;
1057
1058 // we negate bits to compare with 0
1059 // this makes MSVC emit 'test' instead of 'cmp'
1060 uValue ^= 0xffffffff;
1061 for ( int t=0; t<5; t++ )
1062 {
1063 if ( ( uValue & uMask )==0 )
1064 {
1065 iIdx += iBits;
1066 uValue >>= iBits;
1067 }
1068 iBits >>= 1;
1069 uMask >>= iBits;
1070 }
1071 return iIdx;
1072 }
1073
1074
sphEncodeVLB8(BYTE * buf,uint64_t v)1075 inline int sphEncodeVLB8 ( BYTE * buf, uint64_t v )
1076 {
1077 BYTE b;
1078 int n = 0;
1079
1080 do
1081 {
1082 b = (BYTE)(v & 0x7f);
1083 v >>= 7;
1084 if ( v )
1085 b |= 0x80;
1086 *buf++ = b;
1087 n++;
1088 } while ( v );
1089 return n;
1090 }
1091
1092
/// decode a variable-length byte sequence (7 payload bits per byte, least significant
/// group first, high bit means that more bytes follow)
/// NOTE: the decoded value is ADDED to uValue, not assigned; zero it out beforehand to get a plain decode
/// returns the pointer past the last byte consumed
inline const BYTE * spnDecodeVLB8 ( const BYTE * pIn, uint64_t & uValue )
{
	for ( int iShift=0; ; iShift+=7 )
	{
		const BYTE uByte = *pIn++;
		uValue += ( uint64_t ( uByte & 0x7f ) ) << iShift;

		// no continuation flag, so that was the last byte
		if ( !( uByte & 0x80 ) )
			return pIn;
	}
}
1107
1108 //////////////////////////////////////////////////////////////////////////
1109 // INLINES, UTF-8 TOOLS
1110 //////////////////////////////////////////////////////////////////////////
1111
1112 #define SPH_MAX_UTF8_BYTES 4
1113
1114 /// decode UTF-8 codepoint
1115 /// advances buffer ptr in all cases, including the end of buffer (ie. zero byte)!
1116 /// so eof MUST be handled, otherwise, you get OOB
1117 ///
1118 /// returns -1 on failure
1119 /// returns 0 on end of buffer
1120 /// returns codepoint on success
sphUTF8Decode(const BYTE * & pBuf)1121 inline int sphUTF8Decode ( const BYTE * & pBuf )
1122 {
1123 BYTE v = *pBuf++;
1124 if ( !v )
1125 return 0;
1126
1127 // check for 7-bit case
1128 if ( v<128 )
1129 return v;
1130
1131 // get number of bytes
1132 int iBytes = 0;
1133 while ( v & 0x80 )
1134 {
1135 iBytes++;
1136 v <<= 1;
1137 }
1138
1139 // check for valid number of bytes
1140 if ( iBytes<2 || iBytes>SPH_MAX_UTF8_BYTES )
1141 return -1;
1142
1143 int iCode = ( v >> iBytes );
1144 iBytes--;
1145 do
1146 {
1147 if ( !(*pBuf) )
1148 return 0; // unexpected eof
1149
1150 if ( ((*pBuf) & 0xC0)!=0x80 )
1151 return -1; // invalid code
1152
1153 iCode = ( iCode<<6 ) + ( (*pBuf) & 0x3F );
1154 iBytes--;
1155 pBuf++;
1156 } while ( iBytes );
1157
1158 // all good
1159 return iCode;
1160 }
1161
1162
/// encode UTF-8 codepoint to buffer, macro version for the Really Critical places
/// writes 1 to 4 bytes at _ptr (the length is picked by comparing _code against 0x80, 0x800, 0x10000)
/// and advances _ptr past the bytes written
/// NOTE: expands to a bare if/else chain and mentions _code several times; so put braces around it
/// when using it under an unbraced if/else, and pass a side-effect-free _code
#define SPH_UTF8_ENCODE(_ptr,_code) \
	if ( (_code)<0x80 ) \
	{ \
		*_ptr++ = (BYTE)( (_code) & 0x7F ); \
	} else if ( (_code)<0x800 ) \
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>6) & 0x1F ) | 0xC0 ); \
		_ptr[1] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 2; \
	} else if ( (_code)<0x10000 )\
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>12) & 0x0F ) | 0xE0 ); \
		_ptr[1] = (BYTE)( ( ((_code)>>6) & 0x3F ) | 0x80 ); \
		_ptr[2] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 3; \
	} else \
	{ \
		_ptr[0] = (BYTE)( ( ((_code)>>18) & 0x0F ) | 0xF0 ); \
		_ptr[1] = (BYTE)( ( ((_code)>>12) & 0x3F ) | 0x80 ); \
		_ptr[2] = (BYTE)( ( ((_code)>>6) & 0x3F ) | 0x80 ); \
		_ptr[3] = (BYTE)( ( (_code) & 0x3F ) | 0x80 ); \
		_ptr += 4; \
	}
1187
1188
1189 /// encode UTF-8 codepoint to buffer
1190 /// returns number of bytes used
sphUTF8Encode(BYTE * pBuf,int iCode)1191 inline int sphUTF8Encode ( BYTE * pBuf, int iCode )
1192 {
1193 if ( iCode<0x80 )
1194 {
1195 pBuf[0] = (BYTE)( iCode & 0x7F );
1196 return 1;
1197 }
1198
1199 if ( iCode<0x800 )
1200 {
1201 pBuf[0] = (BYTE)( ( (iCode>>6) & 0x1F ) | 0xC0 );
1202 pBuf[1] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1203 return 2;
1204 }
1205
1206 if ( iCode<0x10000 )
1207 {
1208 pBuf[0] = (BYTE)( ( (iCode>>12) & 0x0F ) | 0xE0 );
1209 pBuf[1] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 );
1210 pBuf[2] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1211 return 3;
1212 }
1213
1214 pBuf[0] = (BYTE)( ( (iCode>>18) & 0x0F ) | 0xF0 );
1215 pBuf[1] = (BYTE)( ( (iCode>>12) & 0x3F ) | 0x80 );
1216 pBuf[2] = (BYTE)( ( (iCode>>6) & 0x3F ) | 0x80 );
1217 pBuf[3] = (BYTE)( ( iCode & 0x3F ) | 0x80 );
1218 return 4;
1219 }
1220
1221
1222 /// compute UTF-8 string length in codepoints
sphUTF8Len(const char * pStr)1223 inline int sphUTF8Len ( const char * pStr )
1224 {
1225 if ( !pStr || *pStr=='\0' )
1226 return 0;
1227
1228 const BYTE * pBuf = (const BYTE*) pStr;
1229 int iRes = 0, iCode;
1230
1231 while ( ( iCode = sphUTF8Decode(pBuf) )!=0 )
1232 if ( iCode>0 )
1233 iRes++;
1234
1235 return iRes;
1236 }
1237
1238
1239 /// compute UTF-8 string length in codepoints
sphUTF8Len(const char * pStr,int iMax)1240 inline int sphUTF8Len ( const char * pStr, int iMax )
1241 {
1242 if ( !pStr || *pStr=='\0' )
1243 return 0;
1244
1245 const BYTE * pBuf = (const BYTE*) pStr;
1246 const BYTE * pMax = pBuf + iMax;
1247 int iRes = 0, iCode;
1248
1249 while ( pBuf<pMax && iRes<iMax && ( iCode = sphUTF8Decode ( pBuf ) )!=0 )
1250 if ( iCode>0 )
1251 iRes++;
1252
1253 return iRes;
1254 }
1255
/// quick check for UTF-8
/// this does NOT validate the encoding; it only reports whether the string has
/// at least one byte with the high bit set (ie. whether it is not pure 7-bit ASCII)
/// returns false for NULL and empty strings
inline bool sphIsUTF8 ( const char * pStr )
{
	if ( !pStr )
		return false;

	// look at the bytes as unsigned and test the high bit explicitly
	// signedness of plain char is implementation-defined, so a "*pStr<0" check
	// never fires on platforms where char is unsigned
	for ( const unsigned char * p = (const unsigned char *) pStr; *p; p++ )
		if ( *p & 0x80 )
			return true;

	return false;
}
1267
1268 /// convert UTF-8 to codepoints, return string length
sphUTF8ToWideChar(const char * pSrc,int * pDst,int iMaxLen)1269 inline int sphUTF8ToWideChar ( const char * pSrc, int * pDst, int iMaxLen )
1270 {
1271 const BYTE * p = (const BYTE*) pSrc;
1272 int iLen = 0, iCode;
1273 while ( ( iCode = sphUTF8Decode(p) )!=0 && iLen<iMaxLen )
1274 {
1275 *pDst++ = iCode;
1276 iLen++;
1277 }
1278 *pDst = 0;
1279 return iLen;
1280 }
1281
1282 //////////////////////////////////////////////////////////////////////////
1283 // MATCHING ENGINE INTERNALS
1284 //////////////////////////////////////////////////////////////////////////
1285
/// template argument for the Hitman_c instantiation used below
/// NOTE(review): presumably the number of bits that hold the field id in a packed hit position; confirm against Hitman_c
static const int FIELD_BITS = 8;

/// the Hitman_c instantiation with FIELD_BITS
typedef Hitman_c<FIELD_BITS> HITMAN;
1288
/// hit in the stream
/// combines posting info (docid and hitpos) with a few more matching/ranking bits
///
/// note that while in simple cases every hit would just represent a single keyword,
/// this is NOT always the case; phrase, proximity, and NEAR operators (that already
/// analyze keywords positions while matching the document) can emit a single folded
/// hit representing the entire multi-keyword match, so that the ranker could avoid
/// double work processing individual hits again. in such cases, m_uWeight, m_uSpanlen,
/// and m_uMatchlen will differ from the "usual" value of 1.
///
/// thus, in folded hits:
/// - m_uWeight is the match LCS value in all cases (phrase, proximity, near).
/// - m_uSpanlen is the match span length, ie. a distance from the first to the last
/// matching keyword. for phrase operators it naturally equals m_uWeight, for other
/// operators it might be very different.
/// - m_uMatchlen is a piece of voodoo magic that only the near operator seems to use.
struct ExtHit_t
{
	SphDocID_t	m_uDocid;		///< document id (posting info)
	Hitpos_t	m_uHitpos;		///< hit position (posting info)
	WORD		m_uQuerypos;
	WORD		m_uNodepos;
	WORD		m_uSpanlen;		///< 1 for individual keywords, match span length for folded hits
	WORD		m_uMatchlen;	///< 1 for individual keywords, see above for folded hits
	DWORD		m_uWeight;		///< 1 for individual keywords, LCS value for folded phrase/proximity/near hits
	DWORD		m_uQposMask;
};
1316
/// zone check verdict, as returned by ISphZoneCheck::IsInZone()
/// NOTE(review): per-value meanings below are read off the names; confirm against the implementations
enum SphZoneHit_e
{
	SPH_ZONE_FOUND,			// presumably, the hit is within a span of the requested zone
	SPH_ZONE_NO_SPAN,		// presumably, no span of the requested zone covers the hit
	SPH_ZONE_NO_DOCUMENT	// presumably, the document has no such zone at all
};
1323
/// zone checker interface
class ISphZoneCheck
{
public:
	virtual ~ISphZoneCheck () {}

	/// check the given hit against the zone identified by iZone
	/// pLastSpan is an in/out or out argument that only the implementation defines
	virtual SphZoneHit_e IsInZone ( int iZone, const ExtHit_t * pHit, int * pLastSpan ) = 0;
};
1330
1331
/// factors hash entry
/// plain data, carries links to both the previous and the next entry
struct SphFactorHashEntry_t
{
	SphDocID_t				m_iId;			///< entry id (a document id, going by the type)
	int						m_iRefCount;	///< reference counter
	BYTE *					m_pData;		///< raw data pointer
	SphFactorHashEntry_t *	m_pPrev;		///< previous entry link
	SphFactorHashEntry_t *	m_pNext;		///< next entry link
};

/// factors hash, a fixed-size vector of entry pointers
typedef CSphFixedVector<SphFactorHashEntry_t *> SphFactorHash_t;
1342
1343
1344 struct SphExtraDataRankerState_t
1345 {
1346 const CSphSchema * m_pSchema;
1347 const int64_t * m_pFieldLens;
1348 CSphAttrLocator m_tFieldLensLoc;
1349 int64_t m_iTotalDocuments;
1350 int m_iFields;
1351 int m_iMaxQpos;
SphExtraDataRankerState_tSphExtraDataRankerState_t1352 SphExtraDataRankerState_t ()
1353 : m_pSchema ( NULL )
1354 , m_pFieldLens ( NULL )
1355 , m_iTotalDocuments ( 0 )
1356 , m_iFields ( 0 )
1357 , m_iMaxQpos ( 0 )
1358 { }
1359 };
1360
1361
1362 struct MatchSortAccessor_t
1363 {
1364 typedef CSphMatch T;
1365 typedef CSphMatch * MEDIAN_TYPE;
1366
1367 CSphMatch m_tMedian;
1368
MatchSortAccessor_tMatchSortAccessor_t1369 MatchSortAccessor_t () {}
MatchSortAccessor_tMatchSortAccessor_t1370 MatchSortAccessor_t ( const MatchSortAccessor_t & ) {}
1371
~MatchSortAccessor_tMatchSortAccessor_t1372 virtual ~MatchSortAccessor_t()
1373 {
1374 m_tMedian.m_pDynamic = NULL; // not yours
1375 }
1376
KeyMatchSortAccessor_t1377 MEDIAN_TYPE Key ( CSphMatch * a ) const
1378 {
1379 return a;
1380 }
1381
CopyKeyMatchSortAccessor_t1382 void CopyKey ( MEDIAN_TYPE * pMed, CSphMatch * pVal )
1383 {
1384 *pMed = &m_tMedian;
1385 m_tMedian.m_uDocID = pVal->m_uDocID;
1386 m_tMedian.m_iWeight = pVal->m_iWeight;
1387 m_tMedian.m_pStatic = pVal->m_pStatic;
1388 m_tMedian.m_pDynamic = pVal->m_pDynamic;
1389 m_tMedian.m_iTag = pVal->m_iTag;
1390 }
1391
SwapMatchSortAccessor_t1392 void Swap ( T * a, T * b ) const
1393 {
1394 ::Swap ( *a, *b );
1395 }
1396
AddMatchSortAccessor_t1397 T * Add ( T * p, int i ) const
1398 {
1399 return p+i;
1400 }
1401
SubMatchSortAccessor_t1402 int Sub ( T * b, T * a ) const
1403 {
1404 return (int)(b-a);
1405 }
1406 };
1407
1408
1409 //////////////////////////////////////////////////////////////////////////
1410 // INLINES, MISC
1411 //////////////////////////////////////////////////////////////////////////
1412
sphTypeName(ESphAttr eType)1413 inline const char * sphTypeName ( ESphAttr eType )
1414 {
1415 switch ( eType )
1416 {
1417 case SPH_ATTR_NONE: return "none";
1418 case SPH_ATTR_INTEGER: return "uint";
1419 case SPH_ATTR_TIMESTAMP: return "timestamp";
1420 case SPH_ATTR_BOOL: return "bool";
1421 case SPH_ATTR_FLOAT: return "float";
1422 case SPH_ATTR_BIGINT: return "bigint";
1423 case SPH_ATTR_STRING: return "string";
1424 case SPH_ATTR_STRINGPTR: return "stringptr";
1425 case SPH_ATTR_TOKENCOUNT: return "tokencount";
1426 case SPH_ATTR_JSON: return "json";
1427
1428 case SPH_ATTR_UINT32SET: return "mva";
1429 case SPH_ATTR_INT64SET: return "mva64";
1430 default: return "unknown";
1431 }
1432 }
1433
sphTypeDirective(ESphAttr eType)1434 inline const char * sphTypeDirective ( ESphAttr eType )
1435 {
1436 switch ( eType )
1437 {
1438 case SPH_ATTR_NONE: return "???";
1439 case SPH_ATTR_INTEGER: return "sql_attr_uint";
1440 case SPH_ATTR_TIMESTAMP: return "sql_attr_timestamp";
1441 case SPH_ATTR_BOOL: return "sql_attr_bool";
1442 case SPH_ATTR_FLOAT: return "sql_attr_float";
1443 case SPH_ATTR_BIGINT: return "sql_attr_bigint";
1444 case SPH_ATTR_STRING: return "sql_attr_string";
1445 case SPH_ATTR_STRINGPTR: return "sql_attr_string";
1446 case SPH_ATTR_TOKENCOUNT: return "_autogenerated_tokencount";
1447 case SPH_ATTR_JSON: return "sql_attr_json";
1448
1449 case SPH_ATTR_UINT32SET: return "sql_attr_multi";
1450 case SPH_ATTR_INT64SET: return "sql_attr_multi bigint";
1451 default: return "???";
1452 }
1453 }
1454
SqlUnescape(CSphString & sRes,const char * sEscaped,int iLen)1455 inline void SqlUnescape ( CSphString & sRes, const char * sEscaped, int iLen )
1456 {
1457 assert ( iLen>=2 );
1458 assert (
1459 ( sEscaped[0]=='\'' && sEscaped[iLen-1]=='\'' ) ||
1460 ( sEscaped[0]=='"' && sEscaped[iLen-1]=='"' ) );
1461
1462 // skip heading and trailing quotes
1463 const char * s = sEscaped+1;
1464 const char * sMax = s+iLen-2;
1465
1466 sRes.Reserve ( iLen );
1467 char * d = (char*) sRes.cstr();
1468
1469 while ( s<sMax )
1470 {
1471 if ( s[0]=='\\' )
1472 {
1473 switch ( s[1] )
1474 {
1475 case 'b': *d++ = '\b'; break;
1476 case 'n': *d++ = '\n'; break;
1477 case 'r': *d++ = '\r'; break;
1478 case 't': *d++ = '\t'; break;
1479 default:
1480 *d++ = s[1];
1481 }
1482 s += 2;
1483 } else
1484 *d++ = *s++;
1485 }
1486
1487 *d++ = '\0';
1488 }
1489
1490
StripPath(CSphString & sPath)1491 inline void StripPath ( CSphString & sPath )
1492 {
1493 if ( sPath.IsEmpty() )
1494 return;
1495
1496 const char * s = sPath.cstr();
1497 if ( *s!='/' )
1498 return;
1499
1500 const char * sLastSlash = s;
1501 for ( ; *s; s++ )
1502 if ( *s=='/' )
1503 sLastSlash = s;
1504
1505 int iPos = (int)( sLastSlash - sPath.cstr() + 1 );
1506 int iLen = (int)( s - sPath.cstr() );
1507 sPath = sPath.SubString ( iPos, iLen - iPos );
1508 }
1509
1510 //////////////////////////////////////////////////////////////////////////
1511 // DISK INDEX INTERNALS
1512 //////////////////////////////////////////////////////////////////////////
1513
/// locator pair, for RT string dynamization
/// plain aggregate with no constructor of its own; both locators are default-constructed
struct LocatorPair_t
{
	CSphAttrLocator m_tFrom;	///< source (static) locator
	CSphAttrLocator m_tTo;		///< destination (dynamized) locator
};
1520
1521 //////////////////////////////////////////////////////////////////////////
1522 // DICTIONARY INTERNALS
1523 //////////////////////////////////////////////////////////////////////////
1524
/// dict traits
/// a proxy that wraps another dictionary, and forwards almost every call to it as is
/// the exceptions are GetWordID ( BYTE * ), which is only declared here (defined elsewhere),
/// and Setup(), which does nothing
/// the wrapped dictionary is not deleted by anything in this class declaration
class CSphDictTraits : public CSphDict
{
public:
	/// pDict is the dictionary to wrap, and must not be NULL
	explicit			CSphDictTraits ( CSphDict * pDict ) : m_pDict ( pDict ) { assert ( m_pDict ); }

	virtual void		LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) { m_pDict->LoadStopwords ( sFiles, pTokenizer ); }
	virtual void		LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) { m_pDict->LoadStopwords ( dStopwords ); }
	virtual void		WriteStopwords ( CSphWriter & tWriter ) { m_pDict->WriteStopwords ( tWriter ); }
	virtual bool		LoadWordforms ( const CSphVector<CSphString> & dFiles, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) { return m_pDict->LoadWordforms ( dFiles, pEmbedded, pTokenizer, sIndex ); }
	virtual void		WriteWordforms ( CSphWriter & tWriter ) { m_pDict->WriteWordforms ( tWriter ); }
	virtual int			SetMorphology ( const char * szMorph, CSphString & sMessage ) { return m_pDict->SetMorphology ( szMorph, sMessage ); }

	virtual SphWordID_t	GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) { return m_pDict->GetWordID ( pWord, iLen, bFilterStops ); }
	virtual SphWordID_t	GetWordID ( BYTE * pWord ); // not inline, defined elsewhere
	virtual SphWordID_t	GetWordIDNonStemmed ( BYTE * pWord ) { return m_pDict->GetWordIDNonStemmed ( pWord ); }

	virtual void		Setup ( const CSphDictSettings & ) {} // no-op, settings are NOT forwarded to the wrapped dictionary
	virtual const CSphDictSettings & GetSettings () const { return m_pDict->GetSettings (); }
	virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () { return m_pDict->GetStopwordsFileInfos (); }
	virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () { return m_pDict->GetWordformsFileInfos (); }
	virtual const CSphMultiformContainer * GetMultiWordforms () const { return m_pDict->GetMultiWordforms (); }
	virtual const CSphWordforms * GetWordforms () { return m_pDict->GetWordforms(); }

	virtual bool		IsStopWord ( const BYTE * pWord ) const { return m_pDict->IsStopWord ( pWord ); }
	virtual uint64_t	GetSettingsFNV () const { return m_pDict->GetSettingsFNV(); }
	virtual void		SetApplyMorph ( bool bApply ) { m_pDict->SetApplyMorph ( bApply ); }

protected:
	CSphDict *			m_pDict;	///< the wrapped dictionary, all the proxied calls go here
};
1556
1557
/// dict wrapper for star-syntax support in prefix-indexes
/// overrides both GetWordID ( BYTE * ) and GetWordIDNonStemmed() (defined elsewhere);
/// everything else is inherited from the CSphDictTraits proxy
class CSphDictStar : public CSphDictTraits
{
public:
	explicit			CSphDictStar ( CSphDict * pDict ) : CSphDictTraits ( pDict ) {}

	virtual SphWordID_t	GetWordID ( BYTE * pWord );
	virtual SphWordID_t	GetWordIDNonStemmed ( BYTE * pWord );
};
1567
1568
/// star dict for index v.8+
/// overrides GetWordID ( BYTE * ) once more (defined elsewhere), and additionally keeps
/// a pair of prefixes/infixes flags that get passed to the constructor
class CSphDictStarV8 : public CSphDictStar
{
public:
						CSphDictStarV8 ( CSphDict * pDict, bool bPrefixes, bool bInfixes );

	virtual SphWordID_t	GetWordID ( BYTE * pWord );

private:
	bool				m_bPrefixes;	///< prefixes flag
	bool				m_bInfixes;		///< infixes flag
};
1581
1582
/// dict wrapper for exact-word syntax
/// overrides GetWordID ( BYTE * ) only (defined elsewhere);
/// everything else is inherited from the CSphDictTraits proxy
class CSphDictExact : public CSphDictTraits
{
public:
	explicit CSphDictExact ( CSphDict * pDict ) : CSphDictTraits ( pDict ) {}
	virtual SphWordID_t GetWordID ( BYTE * pWord );
};
1590
1591 //////////////////////////////////////////////////////////////////////////
1592 // TOKEN FILTER
1593 //////////////////////////////////////////////////////////////////////////
1594
/// token filter base (boring proxy stuff)
/// wraps another tokenizer, and forwards every call shown here to it as is
/// the wrapped tokenizer is OWNED by the filter, and gets deleted in the filter destructor
/// NOTE: the constructor does not check the pointer, but every proxied call dereferences it
class CSphTokenFilter : public ISphTokenizer
{
protected:
	ISphTokenizer *		m_pTokenizer;	///< the wrapped tokenizer (owned)

public:
	/// takes over the ownership of pTokenizer
	explicit						CSphTokenFilter ( ISphTokenizer * pTokenizer )					: m_pTokenizer ( pTokenizer ) {}
									~CSphTokenFilter()												{ SafeDelete ( m_pTokenizer ); }

	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError )	{ return m_pTokenizer->SetCaseFolding ( sConfig, sError ); }
	virtual void					AddPlainChar ( char c )											{ m_pTokenizer->AddPlainChar ( c ); }
	virtual void					AddSpecials ( const char * sSpecials )							{ m_pTokenizer->AddSpecials ( sSpecials ); }
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError )	{ return m_pTokenizer->SetIgnoreChars ( sIgnored, sError ); }
	virtual bool					SetNgramChars ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetNgramChars ( sConfig, sError ); }
	virtual void					SetNgramLen ( int iLen )										{ m_pTokenizer->SetNgramLen ( iLen ); }
	virtual bool					LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) { return m_pTokenizer->LoadSynonyms ( sFilename, pFiles, sError ); }
	virtual void					WriteSynonyms ( CSphWriter & tWriter )							{ return m_pTokenizer->WriteSynonyms ( tWriter ); }
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError )		{ return m_pTokenizer->SetBoundary ( sConfig, sError ); }
	virtual void					Setup ( const CSphTokenizerSettings & tSettings )				{ m_pTokenizer->Setup ( tSettings ); }
	virtual const CSphTokenizerSettings &	GetSettings () const									{ return m_pTokenizer->GetSettings (); }
	virtual const CSphSavedFile &	GetSynFileInfo () const											{ return m_pTokenizer->GetSynFileInfo (); }
	virtual bool					EnableSentenceIndexing ( CSphString & sError )					{ return m_pTokenizer->EnableSentenceIndexing ( sError ); }
	virtual bool					EnableZoneIndexing ( CSphString & sError )						{ return m_pTokenizer->EnableZoneIndexing ( sError ); }
	virtual int						SkipBlended ()													{ return m_pTokenizer->SkipBlended(); }

	virtual int						GetCodepointLength ( int iCode ) const		{ return m_pTokenizer->GetCodepointLength ( iCode ); }
	virtual int						GetMaxCodepointLength () const				{ return m_pTokenizer->GetMaxCodepointLength(); }

	virtual const char *			GetTokenStart () const						{ return m_pTokenizer->GetTokenStart(); }
	virtual const char *			GetTokenEnd () const						{ return m_pTokenizer->GetTokenEnd(); }
	virtual const char *			GetBufferPtr () const						{ return m_pTokenizer->GetBufferPtr(); }
	virtual const char *			GetBufferEnd () const						{ return m_pTokenizer->GetBufferEnd (); }
	virtual void					SetBufferPtr ( const char * sNewPtr )		{ m_pTokenizer->SetBufferPtr ( sNewPtr ); }
	virtual uint64_t				GetSettingsFNV () const						{ return m_pTokenizer->GetSettingsFNV(); }

	virtual void					SetBuffer ( const BYTE * sBuffer, int iLength )	{ m_pTokenizer->SetBuffer ( sBuffer, iLength ); }
	virtual BYTE *					GetToken ()										{ return m_pTokenizer->GetToken(); }

	/// direct access to the wrapped tokenizer; the ownership stays with the filter
	virtual ISphTokenizer *			GetEmbeddedTokenizer () const					{ return m_pTokenizer; }
	virtual bool					WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const { return m_pTokenizer->WasTokenMultiformDestination ( bHead, iDestCount ); }
};
1637
1638
/// query keywords filter base
/// carries a tokenizer, a dictionary, and the index settings; GetKeywords() is implemented
/// elsewhere, and AddKeywordStats() is what the descendants have to implement
/// NOTE(review): presumably GetKeywords() runs the tokenizer and calls AddKeywordStats() per token; confirm in the definition
struct ISphQueryFilter
{
	ISphTokenizer *				m_pTokenizer;
	CSphDict *					m_pDict;
	const CSphIndexSettings *	m_pSettings;

	ISphQueryFilter ();
	virtual ~ISphQueryFilter ();

	/// fill dKeywords (defined elsewhere)
	void GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords );

	/// per-keyword callback, to be implemented by the descendants
	virtual void AddKeywordStats ( BYTE * sWord, const BYTE * sTokenized, int iQpos, CSphVector <CSphKeywordInfo> & dKeywords ) = 0;
};


/// defined elsewhere
/// NOTE(review): going by the name, parses a morphology setting string into an AOT flags mask; confirm in the definition
DWORD sphParseMorphAot ( const char * );
1654
/// a bundle of tokenizer, dictionary, and index settings, all kept by value
struct CSphReconfigureSettings
{
	CSphTokenizerSettings	m_tTokenizer;
	CSphDictSettings		m_tDict;
	CSphIndexSettings		m_tIndex;
};
1661
/// a bundle of tokenizer and dictionary objects (by pointer) and index settings (by value)
/// NOTE(review): the out-of-line destructor suggests that the pointers are owned and freed there; confirm in the definition
struct CSphReconfigureSetup
{
	ISphTokenizer *		m_pTokenizer;
	CSphDict *			m_pDict;
	CSphIndexSettings	m_tIndex;

	CSphReconfigureSetup ();
	~CSphReconfigureSetup ();
};

/// compute a 64-bit hash value over the index settings (defined elsewhere)
uint64_t sphGetSettingsFNV ( const CSphIndexSettings & tSettings );
1673
1674 //////////////////////////////////////////////////////////////////////////
1675 // USER VARIABLES
1676 //////////////////////////////////////////////////////////////////////////
1677
/// value container for the intset uservar type
/// a vector of attribute values that is also refcounted (via ISphRefcountedMT); adds no members of its own
class UservarIntSet_c : public CSphVector<SphAttr_t>, public ISphRefcountedMT
{
};

/// uservars lookup hook, a global function pointer defined elsewhere
/// takes a uservar name, returns a pointer to its intset value
/// NOTE(review): NULL-on-miss and refcount transfer rules are not visible here; confirm with the hook implementation
extern UservarIntSet_c * ( *g_pUservarsHook )( const CSphString & sUservar );
1684
1685 //////////////////////////////////////////////////////////////////////////
1686 // BINLOG INTERNALS
1687 //////////////////////////////////////////////////////////////////////////
1688
/// global binlog interface
/// (privately) inherits from ISphNoncopyable
class ISphBinlog : ISphNoncopyable
{
public:
	virtual				~ISphBinlog () {}

	/// log an attribute update against the given index
	/// NOTE(review): pTID looks like an in/out transaction id; confirm with the implementation
	virtual void		BinlogUpdateAttributes ( int64_t * pTID, const char * sIndexName, const CSphAttrUpdate & tUpd ) = 0;

	/// notify the binlog that the given index was flushed, with its transaction id and the shutdown flag
	virtual void		NotifyIndexFlush ( const char * sIndexName, int64_t iTID, bool bShutdown ) = 0;
};
1698
1699 //////////////////////////////////////////////////////////////////////////
1700 // MISC FUNCTION PROTOTYPES
1701 //////////////////////////////////////////////////////////////////////////
1702
/// source-to-destination attribute locator pair
/// vectors of these get filled by sphSortGetStringRemap()
struct SphStringSorterRemap_t
{
	CSphAttrLocator m_tSrc;	///< source locator
	CSphAttrLocator m_tDst;	///< destination locator
};
1708
/// IO throttling state: last IO timestamp plus the two limits
/// everything starts out as zero
struct ThrottleState_t
{
	int64_t	m_tmLastIOTime;	///< last IO timestamp
	int		m_iMaxIOps;		///< max IO operations limit
	int		m_iMaxIOSize;	///< max IO size limit

	ThrottleState_t ()
	{
		m_tmLastIOTime = 0;
		m_iMaxIOps = 0;
		m_iMaxIOSize = 0;
	}
};
1721
// NOTE: everything in this run is declared only, the definitions live elsewhere

const BYTE * SkipQuoted ( const BYTE * p );

// string sorting helpers
bool			sphSortGetStringRemap ( const ISphSchema & tSorterSchema, const ISphSchema & tIndexSchema, CSphVector<SphStringSorterRemap_t> & dAttrs );
bool			sphIsSortStringInternal ( const char * sColumnName );
/// make string lowercase but keep case of JSON.field
void			sphColumnToLowercase ( char * sVal );

// query tree checks and transformations, index merging
bool			sphCheckQueryHeight ( const struct XQNode_t * pRoot, CSphString & sError );
void			sphTransformExtendedQuery ( XQNode_t ** ppNode, const CSphIndexSettings & tSettings, bool bHasBooleanOptimization, const ISphKeywordsStat * pKeywords );
void			TransformAotFilter ( XQNode_t * pNode, const CSphWordforms * pWordforms, const CSphIndexSettings& tSettings );
bool			sphMerge ( const CSphIndex * pDst, const CSphIndex * pSrc, const CSphVector<SphDocID_t> & dKillList, CSphString & sError, CSphIndexProgress & tProgress, ThrottleState_t * pThrottle, volatile bool * pGlobalStop, volatile bool * pLocalStop );
CSphString		sphReconstructNode ( const XQNode_t * pNode, const CSphSchema * pSchema );

// index files unlinking
void			sphSetUnlinkOld ( bool bUnlink );
void			sphUnlinkIndex ( const char * sName, bool bForce );

// schema and settings (de)serialization
// writers take a CSphWriter; readers take a CSphReader along with the format version being read
void			WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema );
void			ReadSchema ( CSphReader & rdInfo, CSphSchema & m_tSchema, DWORD uVersion, bool bDynamic );
void			SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings );
void			LoadIndexSettings ( CSphIndexSettings & tSettings, CSphReader & tReader, DWORD uVersion );
void			SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit );
bool			LoadTokenizerSettings ( CSphReader & tReader, CSphTokenizerSettings & tSettings, CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning );
void			SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit );
void			LoadDictionarySettings ( CSphReader & tReader, CSphDictSettings & tSettings, CSphEmbeddedFiles & tEmbeddedFiles, DWORD uVersion, CSphString & sWarning );
void			SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter );

// NOTE(review): ReadVersion() presumably returns the format version stored in the file at sPath, and reports failures via sError; confirm in the definition
DWORD			ReadVersion ( const char * sPath, CSphString & sError );
bool			AddFieldLens ( CSphSchema & tSchema, bool bDynamic, CSphString & sError );

void			RebalanceWeights ( const CSphFixedVector<int64_t> & dTimers, WORD * pWeights );
1752
// all indexes should produce same terms for same query
// this helper keeps a reference set of 64-bit values built by Set(), and lets
// DumpDiffer() report on another word stats hash against it (both defined elsewhere)
// NOTE(review): m_dSrcWords presumably stores keyword hashes; confirm in Set()
struct SphWordStatChecker_t
{
	SphWordStatChecker_t () {}

	/// remember the reference word stats
	void Set ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat );

	/// check the given word stats, reporting into sWarning on behalf of sIndex
	void DumpDiffer ( const SmallStringHash_T<CSphQueryResultMeta::WordStat_t> & hStat, const char * sIndex, CSphString & sWarning ) const;

	CSphVector<uint64_t> m_dSrcWords;	///< reference data, filled by Set()
};
1762
1763
/// extension set selector, as taken by sphGetExts() and sphGetExt()
/// NOTE(review): per-value meanings below are read off the names; confirm against sphGetExts()
enum ESphExtType
{
	SPH_EXT_TYPE_CUR = 0,	// presumably, current index files
	SPH_EXT_TYPE_NEW,		// presumably, newly built files that await rotation
	SPH_EXT_TYPE_OLD,		// presumably, previous files kept aside
	SPH_EXT_TYPE_LOC		// presumably, lock files
};
1771
/// individual extension selector, as taken by sphGetExt()
/// the values are explicit and not contiguous
/// NOTE(review): presumably these are indexes into the extensions list returned by sphGetExts(); confirm there
enum ESphExt
{
	SPH_EXT_SPH = 0,
	SPH_EXT_SPA = 1,
	SPH_EXT_MVP = 9
};
1778
// extensions lookup (defined elsewhere)
// the format version argument defaults to the current INDEX_FORMAT_VERSION
const char ** sphGetExts ( ESphExtType eType, DWORD uVersion=INDEX_FORMAT_VERSION );
int sphGetExtCount ( DWORD uVersion=INDEX_FORMAT_VERSION );
const char * sphGetExt ( ESphExtType eType, ESphExt eExt );

// dictionary keyword comparators, over (pointer, length) pairs (defined elsewhere)
// used by sphCheckpointCmp() and sphCheckpointCmpStrictly() respectively
int sphDictCmp ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 );
int sphDictCmpStrictly ( const char * pStr1, int iLen1, const char * pStr2, int iLen2 );
1785
1786 template <typename CP>
sphCheckpointCmp(const char * sWord,int iLen,SphWordID_t iWordID,bool bWordDict,const CP & tCP)1787 int sphCheckpointCmp ( const char * sWord, int iLen, SphWordID_t iWordID, bool bWordDict, const CP & tCP )
1788 {
1789 if ( bWordDict )
1790 return sphDictCmp ( sWord, iLen, tCP.m_sWord, strlen ( tCP.m_sWord ) );
1791
1792 int iRes = 0;
1793 iRes = iWordID<tCP.m_uWordID ? -1 : iRes;
1794 iRes = iWordID>tCP.m_uWordID ? 1 : iRes;
1795 return iRes;
1796 }
1797
1798 template <typename CP>
sphCheckpointCmpStrictly(const char * sWord,int iLen,SphWordID_t iWordID,bool bWordDict,const CP & tCP)1799 int sphCheckpointCmpStrictly ( const char * sWord, int iLen, SphWordID_t iWordID, bool bWordDict, const CP & tCP )
1800 {
1801 if ( bWordDict )
1802 return sphDictCmpStrictly ( sWord, iLen, tCP.m_sWord, strlen ( tCP.m_sWord ) );
1803
1804 int iRes = 0;
1805 iRes = iWordID<tCP.m_uWordID ? -1 : iRes;
1806 iRes = iWordID>tCP.m_uWordID ? 1 : iRes;
1807 return iRes;
1808 }
1809
1810
/// find the checkpoint to start looking for a keyword from
/// checkpoints in [pFirstCP, pLastCP] (both ends inclusive) must be sorted
///
/// the keyword is given as text (sWord, iWordLen) for bWordDict, or as iWordID otherwise
/// bStarMode only affects the very first check, which then uses the non-strict comparator
///
/// returns NULL when the keyword compares less than the first checkpoint
/// returns pLastCP when the keyword does not compare less than the last checkpoint
/// otherwise bisects, and returns either an exactly matching checkpoint,
/// or the last checkpoint that still compares less than the keyword
template < typename CP >
const CP * sphSearchCheckpoint ( const char * sWord, int iWordLen, SphWordID_t iWordID
							, bool bStarMode, bool bWordDict
							, const CP * pFirstCP, const CP * pLastCP )
{
	assert ( !bWordDict || iWordLen>0 );

	const CP * pStart = pFirstCP;
	const CP * pEnd = pLastCP;

	// keyword is before the very first checkpoint, nothing to find
	if ( bStarMode && sphCheckpointCmp ( sWord, iWordLen, iWordID, bWordDict, *pStart )<0 )
		return NULL;
	if ( !bStarMode && sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pStart )<0 )
		return NULL;

	// keyword is at or after the very last checkpoint, no need to bisect
	if ( sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pEnd )>=0 )
		pStart = pEnd;
	else
	{
		// bisect, keeping pStart at or before the keyword, and pEnd strictly after it
		while ( pEnd-pStart>1 )
		{
			const CP * pMid = pStart + (pEnd-pStart)/2;
			const int iCmpRes = sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pMid );

			if ( iCmpRes==0 )
			{
				pStart = pMid;
				break;
			} else if ( iCmpRes<0 )
				pEnd = pMid;
			else
				pStart = pMid;
		}

		assert ( pStart>=pFirstCP );
		assert ( pStart<=pLastCP );
		assert ( sphCheckpointCmp ( sWord, iWordLen, iWordID, bWordDict, *pStart )>=0
			&& sphCheckpointCmpStrictly ( sWord, iWordLen, iWordID, bWordDict, *pEnd )<0 );
	}

	return pStart;
}
1853
1854 int sphCollateLibcCI ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
1855 int sphCollateLibcCS ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
1856 int sphCollateUtf8GeneralCI ( const BYTE * pArg1, const BYTE * pArg2, bool bPacked );
1857 int sphCollateBinary ( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
1858
/// dictionary interface that extends CSphDict with a packed keywords buffer and a warning slot
/// all members are pure virtual; instances come from sphCreateRtKeywordsDictionaryWrapper()
/// NOTE(review): exact buffer format and ownership are defined by the implementation, not visible here
1859 class ISphRtDictWraper : public CSphDict
1860 {
1861 public:
/// packed keywords buffer and its length (presumably in bytes - confirm against the implementation)
1862 virtual const BYTE * GetPackedKeywords () = 0;
1863 virtual int GetPackedLen () = 0;
1864
/// reset the keywords state (presumably drops everything collected so far - confirm)
1865 virtual void ResetKeywords() = 0;
1866
/// last warning message accessor, and its reset
1867 virtual const char * GetLastWarning() const = 0;
1868 virtual void ResetWarning() = 0;
1869 };
1870
1871 ISphRtDictWraper * sphCreateRtKeywordsDictionaryWrapper ( CSphDict * pBase );
1872
/// one expanded keyword entry, stored in ISphWordlist::Args_t::m_dExpanded
/// the docs/hits pair is what sphGetExpansionMagic() and ExpandedOrderDesc_T rank by
1873 struct SphExpanded_t
1874 {
// presumably an offset of the keyword text within Args_t storage; verify against AddExpanded()
1875 int m_iNameOff;
// documents count for this keyword
1876 int m_iDocs;
// hits count for this keyword
1877 int m_iHits;
1878 };
1879
/// empty polymorphic base for substring payloads
/// the virtual destructor lets owners (CSphScopedPayload) delete payloads through a base pointer
1880 struct ISphSubstringPayload
1881 {
ISphSubstringPayloadISphSubstringPayload1882 ISphSubstringPayload () {}
~ISphSubstringPayloadISphSubstringPayload1883 virtual ~ISphSubstringPayload() {}
1884 };
1885
1886
/// wordlist interface used for keyword expansion
/// implementations enumerate dictionary words by prefix or by infix and report them via Args_t
1887 class ISphWordlist
1888 {
1889 public:
/// in/out arguments of a single expansion request; noncopyable
1890 struct Args_t : public ISphNoncopyable
1891 {
// collected expansions, one entry per keyword
1892 CSphVector<SphExpanded_t> m_dExpanded;
// request settings; const members are fixed by the constructor
1893 const bool m_bPayload;
1894 int m_iExpansionLimit;
1895 const bool m_bHasMorphology;
1896 const ESphHitless m_eHitless;
1897
// results
// NOTE(review): ownership of m_pPayload is decided by ~Args_t(), which is defined elsewhere
1898 ISphSubstringPayload * m_pPayload;
1899 int m_iTotalDocs;
1900 int m_iTotalHits;
// opaque index data pointer, passed through as is
1901 const void * m_pIndexData;
1902
1903 Args_t ( bool bPayload, int iExpansionLimit, bool bHasMorphology, ESphHitless eHitless, const void * pIndexData );
1904 ~Args_t ();
// register one expanded keyword along with its docs and hits counters
1905 void AddExpanded ( const BYTE * sWord, int iLen, int iDocs, int iHits );
// text of a previously added keyword, by its index
1906 const char * GetWordExpanded ( int iIndex ) const;
1907
1908 private:
// character storage; presumably holds the keyword texts that m_iNameOff points into (confirm)
1909 CSphVector<char> m_sBuf;
1910 };
1911
~ISphWordlist()1912 virtual ~ISphWordlist () {}
// enumerate words for a given substring; sWildcard is an additional pattern argument
// NOTE(review): matching rules are implementation defined, not visible in this header
1913 virtual void GetPrefixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const = 0;
1914 virtual void GetInfixedWords ( const char * sSubstring, int iSubLen, const char * sWildcard, Args_t & tArgs ) const = 0;
1915 };
1916
1917
1918 class CSphScopedPayload
1919 {
1920 public:
CSphScopedPayload()1921 CSphScopedPayload () {}
~CSphScopedPayload()1922 ~CSphScopedPayload ()
1923 {
1924 ARRAY_FOREACH ( i, m_dPayloads )
1925 SafeDelete ( m_dPayloads[i] );
1926 }
Add(ISphSubstringPayload * pPayload)1927 void Add ( ISphSubstringPayload * pPayload ) { m_dPayloads.Add ( pPayload ); }
1928
1929 private:
1930 CSphVector<ISphSubstringPayload *> m_dPayloads;
1931 };
1932
1933
/// settings and working state for keyword expansion, passed to sphExpandXQNode()
/// all pointers are plain (non-owning as far as this struct goes; it has no destructor)
1934 struct ExpansionContext_t
1935 {
// wordlist to enumerate expansions from
1936 const ISphWordlist * m_pWordlist;
// caller-provided byte buffer (NOTE(review): required size is not visible here)
1937 BYTE * m_pBuf;
// query result meta
1938 CSphQueryResultMeta * m_pResult;
// expansion settings
1939 int m_iMinPrefixLen;
1940 int m_iMinInfixLen;
1941 int m_iExpansionLimit;
1942 bool m_bHasMorphology;
1943 bool m_bMergeSingles;
// payloads collector
1944 CSphScopedPayload * m_pPayloads;
1945 ESphHitless m_eHitless;
// opaque index data pointer
1946 const void * m_pIndexData;
1947
1948 ExpansionContext_t ();
1949 };
1950
1951
1952 XQNode_t * sphExpandXQNode ( XQNode_t * pNode, ExpansionContext_t & tCtx );
1953 XQNode_t * sphQueryExpandKeywords ( XQNode_t * pNode, const CSphIndexSettings & tSettings );
/// ranking value of an expanded keyword
/// keywords with at most 256 hits all rank as 1; the rest rank as docs count plus 1
inline int sphGetExpansionMagic ( int iDocs, int iHits )
{
	// magic threshold; mb make this configurable?
	if ( iHits>256 )
		return iDocs + 1;
	return 1;
}
/// true when a keyword has at most 256 hits or fewer than 32 documents
inline bool sphIsExpandedPayload ( int iDocs, int iHits )
{
	// magic threshold; mb make this configurable?
	const bool bTooBig = ( iHits>256 && iDocs>=32 );
	return !bTooBig;
}
1962
1963
/// comparator that orders entries by descending sphGetExpansionMagic() value
/// T must expose m_iDocs and m_iHits
template<typename T>
struct ExpandedOrderDesc_T
{
	bool IsLess ( const T & a, const T & b )
	{
		const int iMagicA = sphGetExpansionMagic ( a.m_iDocs, a.m_iHits );
		const int iMagicB = sphGetExpansionMagic ( b.m_iDocs, b.m_iHits );
		return iMagicB<iMagicA; // bigger magic goes first
	}
};
1972
1973
1974 class CSphKeywordDeltaWriter
1975 {
1976 private:
1977 BYTE m_sLastKeyword [SPH_MAX_WORD_LEN*3+4];
1978 int m_iLastLen;
1979
1980 public:
CSphKeywordDeltaWriter()1981 CSphKeywordDeltaWriter ()
1982 {
1983 Reset();
1984 }
1985
Reset()1986 void Reset ()
1987 {
1988 m_iLastLen = 0;
1989 }
1990
1991 template <typename F>
PutDelta(F & WRITER,const BYTE * pWord,int iLen)1992 void PutDelta ( F & WRITER, const BYTE * pWord, int iLen )
1993 {
1994 assert ( pWord && iLen );
1995
1996 // how many bytes of a previous keyword can we reuse?
1997 BYTE iMatch = 0;
1998 int iMinLen = Min ( m_iLastLen, iLen );
1999 assert ( iMinLen<(int)sizeof(m_sLastKeyword) );
2000 while ( iMatch<iMinLen && m_sLastKeyword[iMatch]==pWord[iMatch] )
2001 {
2002 iMatch++;
2003 }
2004
2005 BYTE iDelta = (BYTE)( iLen - iMatch );
2006 assert ( iDelta>0 );
2007
2008 assert ( iLen < (int)sizeof(m_sLastKeyword) );
2009 memcpy ( m_sLastKeyword, pWord, iLen );
2010 m_iLastLen = iLen;
2011
2012 // match and delta are usually tiny, pack them together in 1 byte
2013 // tricky bit, this byte leads the entry so it must never be 0 (aka eof mark)!
2014 if ( iDelta<=8 && iMatch<=15 )
2015 {
2016 BYTE uPacked = ( 0x80 + ( (iDelta-1)<<4 ) + iMatch );
2017 WRITER.PutBytes ( &uPacked, 1 );
2018 } else
2019 {
2020 WRITER.PutBytes ( &iDelta, 1 ); // always greater than 0
2021 WRITER.PutBytes ( &iMatch, 1 );
2022 }
2023
2024 WRITER.PutBytes ( pWord + iMatch, iDelta );
2025 }
2026 };
2027
2028 BYTE sphDoclistHintPack ( SphOffset_t iDocs, SphOffset_t iLen );
2029
2030 // wordlist checkpoints frequency
2031 #define SPH_WORDLIST_CHECKPOINT 64
2032
2033 /// startup mva updates arena
2034 const char * sphArenaInit ( int iMaxBytes );
2035
2036 #if USE_WINDOWS
2037 void localtime_r ( const time_t * clock, struct tm * res );
2038 void gmtime_r ( const time_t * clock, struct tm * res );
2039 #endif
2040
/// infix block descriptor, as passed to sphLookupInfixCheckpoints()
2041 struct InfixBlock_t
2042 {
// the infix is referenced either by a pointer or by a DWORD offset, sharing the same storage
// NOTE(review): which form is active at which stage is decided by the loader code - confirm there
2043 union
2044 {
2045 const char * m_sInfix;
2046 DWORD m_iInfixOffset;
2047 };
// block offset (NOTE(review): the base it is relative to is not visible in this header)
2048 DWORD m_iOffset;
2049 };
2050
2051
2052 /// infix hash builder
/// abstract interface; instances are created by sphCreateInfixBuilder()
2053 class ISphInfixBuilder
2054 {
2055 public:
ISphInfixBuilder()2056 explicit ISphInfixBuilder() {}
~ISphInfixBuilder()2057 virtual ~ISphInfixBuilder() {}
// feed one dictionary word along with the checkpoint index it belongs to
2058 virtual void AddWord ( const BYTE * pWord, int iWordLength, int iCheckpoint, bool bHasMorphology ) = 0;
// write the collected entries into the dictionary writer
2059 virtual void SaveEntries ( CSphWriter & wrDict ) = 0;
// write the entry blocks into the dictionary writer
// NOTE(review): the returned 64-bit value is presumably a file offset - confirm against implementations
2060 virtual int64_t SaveEntryBlocks ( CSphWriter & wrDict ) = 0;
// size of the blocks words data (units are implementation defined)
2061 virtual int GetBlocksWordsSize () const = 0;
2062 };
2063
2064
2065 ISphInfixBuilder * sphCreateInfixBuilder ( int iCodepointBytes, CSphString * pError );
2066 bool sphLookupInfixCheckpoints ( const char * sInfix, int iBytes, const BYTE * pInfixes, const CSphVector<InfixBlock_t> & dInfixBlocks, int iInfixCodepointBytes, CSphVector<int> & dCheckpoints );
2067 // calculate length, upto iInfixCodepointBytes chars from infix start
2068 int sphGetInfixLength ( const char * sInfix, int iBytes, int iInfixCodepointBytes );
2069
2070
2071 /// compute utf-8 character length in bytes from its first byte
sphUtf8CharBytes(BYTE uFirst)2072 inline int sphUtf8CharBytes ( BYTE uFirst )
2073 {
2074 switch ( uFirst>>4 )
2075 {
2076 case 12: return 2; // 110x xxxx, 2 bytes
2077 case 13: return 2; // 110x xxxx, 2 bytes
2078 case 14: return 3; // 1110 xxxx, 3 bytes
2079 case 15: return 4; // 1111 0xxx, 4 bytes
2080 default: return 1; // either 1 byte, or invalid/unsupported code
2081 }
2082 }
2083
2084 //////////////////////////////////////////////////////////////////////////
2085
2086 /// snippet setupper
2087 /// used by searchd and SNIPPET() function in exprs
2088 /// should probably be refactored as a single function
2089 /// a precursor to sphBuildExcerpts() call
2090 class SnippetContext_t : ISphNoncopyable
2091 {
2092 private:
2093 CSphScopedPtr<CSphDict> m_tDictCloned;
2094 CSphScopedPtr<CSphDict> m_tExactDict;
2095
2096 public:
2097 CSphDict * m_pDict;
2098 CSphScopedPtr<ISphTokenizer> m_tTokenizer;
2099 CSphScopedPtr<CSphHTMLStripper> m_tStripper;
2100 ISphTokenizer * m_pQueryTokenizer;
2101 XQQuery_t m_tExtQuery;
2102 DWORD m_eExtQuerySPZ;
2103
SnippetContext_t()2104 SnippetContext_t()
2105 : m_tDictCloned ( NULL )
2106 , m_tExactDict ( NULL )
2107 , m_pDict ( NULL )
2108 , m_tTokenizer ( NULL )
2109 , m_tStripper ( NULL )
2110 , m_pQueryTokenizer ( NULL )
2111 , m_eExtQuerySPZ ( SPH_SPZ_NONE )
2112 {
2113 }
2114
~SnippetContext_t()2115 ~SnippetContext_t()
2116 {
2117 SafeDelete ( m_pQueryTokenizer );
2118 }
2119
SetupExactDict(const CSphIndexSettings & tSettings,CSphScopedPtr<CSphDict> & tExact,CSphDict * pDict)2120 static CSphDict * SetupExactDict ( const CSphIndexSettings & tSettings, CSphScopedPtr<CSphDict> & tExact, CSphDict * pDict )
2121 {
2122 // handle index_exact_words
2123 if ( !tSettings.m_bIndexExactWords )
2124 return pDict;
2125
2126 tExact = new CSphDictExact ( pDict );
2127 return tExact.Ptr();
2128 }
2129
CollectQuerySPZ(const XQNode_t * pNode)2130 static DWORD CollectQuerySPZ ( const XQNode_t * pNode )
2131 {
2132 if ( !pNode )
2133 return SPH_SPZ_NONE;
2134
2135 DWORD eSPZ = SPH_SPZ_NONE;
2136 if ( pNode->GetOp()==SPH_QUERY_SENTENCE )
2137 eSPZ |= SPH_SPZ_SENTENCE;
2138 else if ( pNode->GetOp()==SPH_QUERY_PARAGRAPH )
2139 eSPZ |= SPH_SPZ_PARAGRAPH;
2140
2141 ARRAY_FOREACH ( i, pNode->m_dChildren )
2142 eSPZ |= CollectQuerySPZ ( pNode->m_dChildren[i] );
2143
2144 return eSPZ;
2145 }
2146
SetupStripperSPZ(const CSphIndexSettings & tSettings,const ExcerptQuery_t & q,bool bSetupSPZ,CSphScopedPtr<CSphHTMLStripper> & tStripper,ISphTokenizer * pTokenizer,CSphString & sError)2147 static bool SetupStripperSPZ ( const CSphIndexSettings & tSettings, const ExcerptQuery_t & q,
2148 bool bSetupSPZ, CSphScopedPtr<CSphHTMLStripper> & tStripper, ISphTokenizer * pTokenizer,
2149 CSphString & sError )
2150 {
2151 if ( bSetupSPZ &&
2152 ( !pTokenizer->EnableSentenceIndexing ( sError ) || !pTokenizer->EnableZoneIndexing ( sError ) ) )
2153 {
2154 return false;
2155 }
2156
2157
2158 if ( q.m_sStripMode=="strip" || q.m_sStripMode=="retain"
2159 || ( q.m_sStripMode=="index" && tSettings.m_bHtmlStrip ) )
2160 {
2161 // don't strip HTML markup in 'retain' mode - proceed zones only
2162 tStripper = new CSphHTMLStripper ( q.m_sStripMode!="retain" );
2163
2164 if ( q.m_sStripMode=="index" )
2165 {
2166 if (
2167 !tStripper->SetIndexedAttrs ( tSettings.m_sHtmlIndexAttrs.cstr (), sError ) ||
2168 !tStripper->SetRemovedElements ( tSettings.m_sHtmlRemoveElements.cstr (), sError ) )
2169 {
2170 sError.SetSprintf ( "HTML stripper config error: %s", sError.cstr() );
2171 return false;
2172 }
2173 }
2174
2175 if ( bSetupSPZ )
2176 {
2177 tStripper->EnableParagraphs();
2178 }
2179
2180 // handle zone(s) in special mode only when passage_boundary enabled
2181 if ( bSetupSPZ && !tStripper->SetZones ( tSettings.m_sZones.cstr (), sError ) )
2182 {
2183 sError.SetSprintf ( "HTML stripper config error: %s", sError.cstr() );
2184 return false;
2185 }
2186 }
2187
2188 return true;
2189 }
2190
Setup(const CSphIndex * pIndex,const ExcerptQuery_t & tSettings,CSphString & sError)2191 bool Setup ( const CSphIndex * pIndex, const ExcerptQuery_t & tSettings, CSphString & sError )
2192 {
2193 assert ( pIndex );
2194 CSphScopedPtr<CSphDict> tDictCloned ( NULL );
2195 m_pDict = pIndex->GetDictionary();
2196 if ( m_pDict->HasState() )
2197 m_tDictCloned = m_pDict = m_pDict->Clone();
2198
2199 // AOT tokenizer works only with query mode
2200 if ( pIndex->GetSettings().m_uAotFilterMask &&
2201 ( !tSettings.m_bHighlightQuery || tSettings.m_bExactPhrase ) )
2202 {
2203 if ( !tSettings.m_bHighlightQuery )
2204 sError.SetSprintf ( "failed to setup AOT with query_mode=0, use query_mode=1" );
2205 else
2206 sError.SetSprintf ( "failed to setup AOT with exact_phrase, use phrase search operator with query_mode=1" );
2207 return false;
2208 }
2209
2210 // OPTIMIZE! do a lightweight indexing clone here
2211 if ( tSettings.m_bHighlightQuery && pIndex->GetSettings().m_uAotFilterMask )
2212 m_tTokenizer = sphAotCreateFilter ( pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX ), m_pDict, pIndex->GetSettings().m_bIndexExactWords, pIndex->GetSettings().m_uAotFilterMask );
2213 else
2214 m_tTokenizer = pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX );
2215
2216 m_pQueryTokenizer = NULL;
2217 if ( tSettings.m_bHighlightQuery || tSettings.m_bExactPhrase )
2218 {
2219 m_pQueryTokenizer = pIndex->GetQueryTokenizer()->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
2220 } else
2221 {
2222 // legacy query mode should handle exact form modifier and star wildcard
2223 m_pQueryTokenizer = pIndex->GetTokenizer()->Clone ( SPH_CLONE_INDEX );
2224 if ( pIndex->IsStarDict() )
2225 {
2226 m_pQueryTokenizer->AddPlainChar ( '*' );
2227 m_pQueryTokenizer->AddPlainChar ( '?' );
2228 m_pQueryTokenizer->AddPlainChar ( '%' );
2229 }
2230 if ( pIndex->GetSettings().m_bIndexExactWords )
2231 m_pQueryTokenizer->AddPlainChar ( '=' );
2232 }
2233
2234 // setup exact dictionary if needed
2235 m_pDict = SetupExactDict ( pIndex->GetSettings(), m_tExactDict, m_pDict );
2236
2237 if ( tSettings.m_bHighlightQuery )
2238 {
2239 // OPTIMIZE? double lightweight clone here? but then again it's lightweight
2240 if ( !sphParseExtendedQuery ( m_tExtQuery, tSettings.m_sWords.cstr(), NULL, m_pQueryTokenizer,
2241 &pIndex->GetMatchSchema(), m_pDict, pIndex->GetSettings() ) )
2242 {
2243 sError = m_tExtQuery.m_sParseError;
2244 return false;
2245 }
2246 if ( m_tExtQuery.m_pRoot )
2247 m_tExtQuery.m_pRoot->ClearFieldMask();
2248
2249 m_eExtQuerySPZ = SPH_SPZ_NONE;
2250 m_eExtQuerySPZ |= CollectQuerySPZ ( m_tExtQuery.m_pRoot );
2251 if ( m_tExtQuery.m_dZones.GetLength() )
2252 m_eExtQuerySPZ |= SPH_SPZ_ZONE;
2253
2254 if ( pIndex->GetSettings().m_uAotFilterMask )
2255 TransformAotFilter ( m_tExtQuery.m_pRoot, m_pDict->GetWordforms(), pIndex->GetSettings() );
2256 }
2257
2258 bool bSetupSPZ = ( tSettings.m_ePassageSPZ!=SPH_SPZ_NONE || m_eExtQuerySPZ!=SPH_SPZ_NONE ||
2259 ( tSettings.m_sStripMode=="retain" && tSettings.m_bHighlightQuery ) );
2260
2261 if ( !SetupStripperSPZ ( pIndex->GetSettings(), tSettings, bSetupSPZ, m_tStripper, m_tTokenizer.Ptr(), sError ) )
2262 return false;
2263
2264 return true;
2265 }
2266 };
2267
/// a token along with the tokenizer state captured for it, filled by FillStoredTokenInfo()
/// CSphSource_Proxy keeps these for non-chinese tokens and later packs the flags
/// and the overshort count into the tokenized field text
2268 struct StoredToken_t
2269 {
// zero-terminated token text (consumers take its length via strlen)
2270 BYTE m_sToken [3*SPH_MAX_WORD_LEN+4];
2271 // tokenized state
// NOTE(review): the pointers below presumably refer into the tokenizer input buffer,
// so they are only valid while that buffer is alive - confirm against FillStoredTokenInfo()
2272 const char * m_szTokenStart;
2273 const char * m_szTokenEnd;
2274 const char * m_pBufferPtr;
2275 const char * m_pBufferEnd;
2276 int m_iTokenLen;
2277 int m_iOvershortCount;
2278 bool m_bBoundary;
2279 bool m_bSpecial;
2280 bool m_bBlended;
2281 bool m_bBlendedPart;
2282 };
2283
2284 void FillStoredTokenInfo ( StoredToken_t & tToken, const BYTE * sToken, ISphTokenizer * pTokenizer );
2285 CSphSource * sphCreateSourceTSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool bProxy );
2286 CSphSource * sphCreateSourceCSVpipe ( const CSphConfigSection * pSource, FILE * pPipe, const char * sSourceName, bool bProxy );
2287
FlipEndianess(DWORD * pData)2288 inline void FlipEndianess ( DWORD* pData )
2289 {
2290 BYTE* pB = (BYTE*)pData;
2291 BYTE a = pB[0];
2292 pB[0] = pB[3];
2293 pB[3] = a;
2294 a = pB[1];
2295 pB[1] = pB[2];
2296 pB[2] = a;
2297 };
2298
2299
2300 #if USE_RLP
2301 #define RLPARG(_arg) _arg
2302 #else
2303 #define RLPARG(_arg)
2304 #endif
2305
2306
2307 #if USE_RLP
2308
/// one batched document, as kept by CSphSource_Proxy between fetching and returning it
2309 struct StoredDoc_t
2310 {
// docinfo, MVA values, and string attributes copied from the underlying source
2311 CSphMatch m_tDocInfo;
2312 CSphVector<CSphString> m_dStrAttrs;
2313 CSphVector<DWORD> m_dMva;
// per-field pointers, set to point into m_dFieldStorage
2314 CSphTightVector<BYTE*> m_dFields;
// per-field sphDetectChinese() result
2315 CSphTightVector<bool> m_dChinese;
// per-field text storage
2316 CSphTightVector< CSphVector<BYTE> > m_dFieldStorage;
// non-chinese tokens met within chinese fields, in order of appearance
2317 CSphTightVector< StoredToken_t > m_dNonChineseTokens;
2318 };
2319
2320 // these are used to separate text before passing it to RLP
2321 const int PROXY_DOCUMENT_START = 0xFFFA;
2322 const int PROXY_FIELD_START_CHINESE = 0xFFFB;
2323 const int PROXY_FIELD_START_NONCHINESE = 0xFFFC;
2324 const int PROXY_TOKEN_SEPARATOR = 0xFFFD;
2325
2326 // these are used on text that is already tokenized
2327 const int PROXY_TOKENIZED = 0xFFFA;
2328 const int PROXY_MORPH = 0xFFFB;
2329
2330 const int PROXY_MARKER_LEN = 3;
2331
// token flags that CSphSource_Proxy ORs into a packed DWORD together with a length value;
// the lowest flag is bit 7, so the length has to fit into the bits below it
2332 enum
2333 {
2334 PROXY_BOUNDARY_FLAG = 1<<7, //NOLINT
2335 PROXY_SPECIAL_FLAG = 1<<8, //NOLINT
2336 PROXY_BLENDED_FLAG = 1<<9, //NOLINT
2337 PROXY_BLENDED_PART_FLAG = 1<<10, //NOLINT
// set when an overshort count follows the packed value
2338 PROXY_HAVE_OVERSHORT = 1<<11 //NOLINT
2339 };
2340
2341
// copy the 3 marker bytes to _ptr, advancing _ptr past them
// _ptr must be a modifiable pointer lvalue, and gets evaluated 3 times
// wrapped into do/while(0) so that the macro is a single statement,
// and stays safe in an unbraced if/else
#define COPY_MARKER(_ptr,_marker) \
	do \
	{ \
		*(_ptr)++ = (_marker)[0]; \
		*(_ptr)++ = (_marker)[1]; \
		*(_ptr)++ = (_marker)[2]; \
	} while (0)

// check whether the first 3 bytes at _ptr are equal to the 3 marker bytes
// arguments are parenthesized so that expressions like (p+1) work as expected
#define CMP_MARKER(_ptr, _marker) \
	( (_ptr)[0]==(_marker)[0] && (_ptr)[1]==(_marker)[1] && (_ptr)[2]==(_marker)[2] )
2351
2352 // proxy source
2353 template <class T>
2354 class CSphSource_Proxy : public T
2355 {
2356 public:
CSphSource_Proxy(const char * sSourceName)2357 explicit CSphSource_Proxy ( const char * sSourceName )
2358 : T ( sSourceName )
2359 , m_dBatchedDocs ( g_iRLPMaxBatchDocs )
2360 , m_iDocStart ( 0 )
2361 , m_iDocCount ( 0 )
2362 , m_pExtraTokenizer ( NULL )
2363 , m_pProxyStripper ( NULL )
2364 {
2365 assert ( sphUTF8Encode ( m_pMarkerDocStart, PROXY_DOCUMENT_START )==PROXY_MARKER_LEN );
2366
2367 sphUTF8Encode ( m_pMarkerDocStart, PROXY_DOCUMENT_START );
2368 sphUTF8Encode ( m_pMarkerChineseField, PROXY_FIELD_START_CHINESE );
2369 sphUTF8Encode ( m_pMarkerNonChineseField, PROXY_FIELD_START_NONCHINESE );
2370 sphUTF8Encode ( m_pMarkerTokenSeparator, PROXY_TOKEN_SEPARATOR );
2371
2372 sphUTF8Encode ( m_pMarkerTokenized, PROXY_TOKENIZED );
2373 sphUTF8Encode ( m_pMarkerMorph, PROXY_MORPH );
2374
2375 const int INITIAL_BUFFER_SIZE = 1048576;
2376 m_dDocBuffer.Reserve ( INITIAL_BUFFER_SIZE );
2377 }
2378
~CSphSource_Proxy()2379 virtual ~CSphSource_Proxy()
2380 {
2381 SafeDelete ( m_pExtraTokenizer );
2382 SafeDelete ( m_pProxyStripper );
2383 }
2384
AppendToField(StoredDoc_t * pCurDoc,int iField,BYTE * pToken,int iTokenLen,BYTE * pMarker)2385 void AppendToField ( StoredDoc_t * pCurDoc, int iField, BYTE * pToken, int iTokenLen, BYTE * pMarker )
2386 {
2387 assert ( pCurDoc && iField>=0 );
2388 CSphVector<BYTE> & tStorage = pCurDoc->m_dFieldStorage[iField];
2389
2390 int iNewSize, iOldSize;
2391 iNewSize = iOldSize = tStorage.GetLength();
2392 if ( !iNewSize )
2393 iNewSize += PROXY_MARKER_LEN + 1; // tokenized field marker + trailing zero
2394
2395 iNewSize += iTokenLen+1; // space before each token + token
2396
2397 if ( pMarker )
2398 iNewSize += PROXY_MARKER_LEN; // non-chinese token marker
2399
2400 tStorage.Resize ( iNewSize );
2401 BYTE * pPtr = tStorage.Begin() + ( iOldSize ? iOldSize-1 : 0 );
2402
2403 if ( !iOldSize )
2404 {
2405 memcpy ( pPtr, m_pMarkerTokenized, PROXY_MARKER_LEN );
2406 pPtr += PROXY_MARKER_LEN;
2407 }
2408
2409 *pPtr++ = ' ';
2410
2411 if ( pMarker )
2412 COPY_MARKER ( pPtr, pMarker );
2413
2414 memcpy ( pPtr, pToken, iTokenLen+1 );
2415 pCurDoc->m_dFields[iField] = tStorage.Begin();
2416 }
2417
NextDocument(CSphString & sError)2418 virtual BYTE ** NextDocument ( CSphString & sError )
2419 {
2420 ISphTokenizer * pEmbeddedTokenizer = T::m_pTokenizer->GetEmbeddedTokenizer();
2421 assert ( pEmbeddedTokenizer );
2422
2423 // do not run the stripper twice
2424 if ( CSphSource_Proxy<T>::m_pStripper )
2425 {
2426 m_pProxyStripper = CSphSource_Proxy<T>::m_pStripper;
2427 CSphSource_Proxy<T>::m_pStripper = NULL;
2428 }
2429
2430 if ( !m_pExtraTokenizer )
2431 {
2432 m_pExtraTokenizer = ISphTokenizer::CreateRLPFilter ( pEmbeddedTokenizer->Clone ( SPH_CLONE_INDEX ), true, g_sRLPRoot.cstr(),
2433 g_sRLPEnv.cstr(), T::m_pTokenizer->GetRLPContext(), false, sError );
2434 if ( !m_pExtraTokenizer )
2435 return NULL;
2436 }
2437
2438 if ( !IsDocCacheEmpty() )
2439 return CopyDoc();
2440
2441 if ( m_dFieldLengths.GetLength()!=T::m_tSchema.m_dFields.GetLength() )
2442 m_dFieldLengths.Resize ( T::m_tSchema.m_dFields.GetLength() );
2443
2444 char szTmp [256];
2445
2446 m_iDocStart = 0;
2447 int iCurDoc = 0;
2448
2449 m_dDocBuffer.Resize(0);
2450
2451 while ( !IsDocCacheFull() && m_dDocBuffer.GetLength() < g_iRLPMaxBatchSize )
2452 {
2453 BYTE ** pFields = T::NextDocument ( sError );
2454 if ( !pFields )
2455 break;
2456
2457 int iTotalFieldLen = 0;
2458 for ( int i = 0; i < T::m_tSchema.m_dFields.GetLength(); i++ )
2459 {
2460 m_dFieldLengths[i] = pFields[i] ? strlen ( (const char*)pFields[i] ) : 0;
2461 iTotalFieldLen += PROXY_MARKER_LEN+m_dFieldLengths[i]+2;
2462 }
2463
2464 const int MAX_INDEX_LEN = 8;
2465 int iOldBufferLen = m_dDocBuffer.GetLength();
2466 m_dDocBuffer.Resize ( iOldBufferLen+PROXY_MARKER_LEN+MAX_INDEX_LEN+2+iTotalFieldLen );
2467 BYTE * pCurDocPtr = &(m_dDocBuffer[iOldBufferLen]);
2468
2469 StoredDoc_t * pDoc = PushDoc();
2470 int nFields = T::m_tSchema.m_dFields.GetLength();
2471 CopyDocInfo ( pDoc->m_tDocInfo, T::m_tDocInfo );
2472 pDoc->m_dMva = T::m_dMva;
2473 pDoc->m_dStrAttrs = T::m_dStrAttrs;
2474 pDoc->m_dFields.Resize ( nFields );
2475 pDoc->m_dFieldStorage.Resize ( nFields );
2476 pDoc->m_dChinese.Resize ( nFields );
2477 pDoc->m_dNonChineseTokens.Resize ( 0 );
2478
2479 // document start tag
2480 COPY_MARKER ( pCurDocPtr, m_pMarkerDocStart );
2481
2482 // space
2483 *pCurDocPtr++ = ' ';
2484
2485 // index in plain text
2486 int iLen = snprintf ( szTmp, sizeof(szTmp), "%d", iCurDoc );
2487 iLen = iLen>=0 ? iLen : sizeof(szTmp);
2488 memcpy ( pCurDocPtr, szTmp, iLen );
2489 pCurDocPtr += iLen;
2490
2491 // space
2492 *pCurDocPtr++ = ' ';
2493
2494 for ( int i = 0; i < T::m_tSchema.m_dFields.GetLength(); i++ )
2495 {
2496 pDoc->m_dChinese[i] = sphDetectChinese ( pFields[i], m_dFieldLengths[i] );
2497
2498 if ( m_pProxyStripper )
2499 {
2500 m_pProxyStripper->Strip ( pFields[i] );
2501 m_dFieldLengths[i] = strlen ( (const char *)pFields[i] );
2502 }
2503
2504 int iFieldLen = m_dFieldLengths[i];
2505
2506 if ( !pDoc->m_dChinese[i] )
2507 {
2508 // no chinese? just save the field storage without tokenizing it
2509 // it will be tokenized later in the splitter
2510 pDoc->m_dFieldStorage[i].Resize ( iFieldLen+1 );
2511 if ( pFields[i] )
2512 memcpy ( pDoc->m_dFieldStorage[i].Begin(), pFields[i], iFieldLen+1 );
2513 else
2514 pDoc->m_dFieldStorage[i][0] = 0;
2515
2516 pDoc->m_dFields[i] = pDoc->m_dFieldStorage[i].Begin();
2517
2518 COPY_MARKER ( pCurDocPtr, m_pMarkerNonChineseField );
2519 *pCurDocPtr++ = ' ';
2520 } else
2521 {
2522 COPY_MARKER ( pCurDocPtr, m_pMarkerChineseField );
2523 *pCurDocPtr++ = ' ';
2524
2525 pEmbeddedTokenizer->SetBuffer ( pFields[i], iFieldLen );
2526 BYTE * pToken;
2527 while ( ( pToken = pEmbeddedTokenizer->GetToken() )!=NULL )
2528 {
2529 int iTokenLen = strlen ( (const char*)pToken );
2530 if ( sphDetectChinese ( pToken, iTokenLen ) )
2531 {
2532 // collect it in one big chinese token buffer that will be processed by RLP
2533 memcpy ( pCurDocPtr, pToken, iTokenLen );
2534 pCurDocPtr += iTokenLen;
2535 } else
2536 {
2537 // drop it into "non-chinese" token vector
2538 StoredToken_t & tStored = pDoc->m_dNonChineseTokens.Add();
2539 FillStoredTokenInfo ( tStored, pToken, pEmbeddedTokenizer );
2540
2541 // add a 'non-chinese token' marker to the chinese token stream
2542 *pCurDocPtr++ = ' ';
2543 COPY_MARKER ( pCurDocPtr, m_pMarkerTokenSeparator );
2544 }
2545
2546 *pCurDocPtr++ = ' ';
2547 }
2548 }
2549 }
2550
2551 m_dDocBuffer.Resize ( pCurDocPtr-m_dDocBuffer.Begin() );
2552 iCurDoc++;
2553 }
2554
2555 if ( IsDocCacheEmpty() )
2556 return NULL;
2557
2558 m_pExtraTokenizer->SetBuffer ( m_dDocBuffer.Begin(), m_dDocBuffer.GetLength() );
2559 BYTE * pToken;
2560
2561 StoredDoc_t * pCurDoc = NULL;
2562 bool bIndexNext = false;
2563 int iField = -1;
2564 int iStoredToken = 0;
2565 while ( ( pToken = m_pExtraTokenizer->GetToken() )!=NULL )
2566 {
2567 bool bSpecial = false;
2568 int iTokenLen = strlen ( (const char *)pToken );
2569 if ( bIndexNext )
2570 {
2571 int iDoc = atoi ( (const char*)pToken );
2572 pCurDoc = &(m_dBatchedDocs[iDoc]);
2573 bIndexNext = false;
2574 iField = -1;
2575 iStoredToken = 0;
2576 } else
2577 {
2578 if ( iTokenLen==PROXY_MARKER_LEN )
2579 {
2580 if ( CMP_MARKER ( pToken, m_pMarkerDocStart ) )
2581 {
2582 bIndexNext = true;
2583 bSpecial = true;
2584 } else if ( CMP_MARKER ( pToken, m_pMarkerChineseField ) )
2585 {
2586 assert ( pCurDoc );
2587 iField++;
2588 pCurDoc->m_dFieldStorage[iField].Resize(0);
2589 pCurDoc->m_dFields[iField] = pCurDoc->m_dFieldStorage[iField].Begin();
2590 bSpecial = true;
2591 } else if ( CMP_MARKER ( pToken, m_pMarkerNonChineseField ) )
2592 {
2593 iField++;
2594 bSpecial = true;
2595 } else if ( CMP_MARKER ( pToken, m_pMarkerTokenSeparator ) )
2596 {
2597 StoredToken_t & tStored = pCurDoc->m_dNonChineseTokens[iStoredToken];
2598
2599 // copy stored non-chinese token && pack token data
2600 AppendToField ( pCurDoc, iField, tStored.m_sToken, strlen ( (const char*)tStored.m_sToken ), m_pMarkerMorph );
2601
2602 // this depends on SPH_MAX_WORD_LEN being 6 bits max
2603 DWORD uPacked = iTokenLen;
2604
2605 if ( tStored.m_bBoundary )
2606 uPacked |= PROXY_BOUNDARY_FLAG;
2607
2608 if ( tStored.m_bSpecial )
2609 uPacked |= PROXY_SPECIAL_FLAG;
2610
2611 if ( tStored.m_bBlended )
2612 uPacked |= PROXY_BLENDED_FLAG;
2613
2614 if ( tStored.m_bBlendedPart )
2615 uPacked |= PROXY_BLENDED_PART_FLAG;
2616
2617 int iTmpLen;
2618 if ( tStored.m_iOvershortCount )
2619 {
2620 uPacked |= PROXY_HAVE_OVERSHORT;
2621 iTmpLen = snprintf ( szTmp, sizeof(szTmp), " %x %x", uPacked, tStored.m_iOvershortCount );
2622 } else
2623 iTmpLen = snprintf ( szTmp, sizeof(szTmp), " %x", uPacked );
2624
2625 if ( iTmpLen < 0 )
2626 iTmpLen = sizeof(szTmp);
2627
2628 int iStoredLen = pCurDoc->m_dFieldStorage[iField].GetLength();
2629 pCurDoc->m_dFieldStorage[iField].Resize ( iStoredLen + iTmpLen );
2630 memcpy ( pCurDoc->m_dFieldStorage[iField].Begin()+iStoredLen-1, szTmp, iTmpLen+1 );
2631
2632 pCurDoc->m_dFields[iField] = pCurDoc->m_dFieldStorage[iField].Begin();
2633
2634 iStoredToken++;
2635 bSpecial = true;
2636 }
2637 }
2638
2639 // simple token; append to current field
2640 if ( !bSpecial )
2641 AppendToField ( pCurDoc, iField, pToken, iTokenLen, NULL );
2642 }
2643 }
2644
2645 return CopyDoc ();
2646 }
2647
2648 private:
2649 CSphSource_Document * m_pSource;
2650 CSphFixedVector<StoredDoc_t> m_dBatchedDocs;
2651 CSphVector<BYTE> m_dDocBuffer;
2652 CSphVector<int> m_dFieldLengths;
2653 int m_iDocStart;
2654 int m_iDocCount;
2655 ISphTokenizer * m_pExtraTokenizer;
2656 CSphHTMLStripper * m_pProxyStripper;
2657
2658 BYTE m_pMarkerDocStart[PROXY_MARKER_LEN];
2659 BYTE m_pMarkerChineseField[PROXY_MARKER_LEN];
2660 BYTE m_pMarkerNonChineseField[PROXY_MARKER_LEN];
2661 BYTE m_pMarkerTokenSeparator[PROXY_MARKER_LEN];
2662
2663 BYTE m_pMarkerTokenized[PROXY_MARKER_LEN];
2664 BYTE m_pMarkerMorph[PROXY_MARKER_LEN];
2665
IsDocCacheEmpty()2666 bool IsDocCacheEmpty() const { return !m_iDocCount; }
IsDocCacheFull()2667 bool IsDocCacheFull() const { return m_iDocCount==m_dBatchedDocs.GetLength(); }
2668
PushDoc()2669 StoredDoc_t * PushDoc()
2670 {
2671 assert ( !IsDocCacheFull() );
2672
2673 int iEnd = (m_iDocStart+m_iDocCount) % m_dBatchedDocs.GetLength();
2674 m_iDocCount++;
2675
2676 return &(m_dBatchedDocs[iEnd]);
2677 }
2678
PopDoc()2679 StoredDoc_t * PopDoc()
2680 {
2681 assert ( !IsDocCacheEmpty() );
2682
2683 StoredDoc_t * pDoc = &(m_dBatchedDocs[m_iDocStart]);
2684 m_iDocStart = (m_iDocStart+1) % m_dBatchedDocs.GetLength();
2685 m_iDocCount--;
2686 return pDoc;
2687 }
2688
CopyDoc()2689 BYTE ** CopyDoc ()
2690 {
2691 StoredDoc_t * pDoc = PopDoc();
2692 CopyDocInfo ( T::m_tDocInfo, pDoc->m_tDocInfo );
2693 T::m_tState.m_dFields = pDoc->m_dFields.Begin();
2694 T::m_dMva.SwapData ( pDoc->m_dMva );
2695 T::m_dStrAttrs.SwapData ( pDoc->m_dStrAttrs );
2696
2697 return T::m_tState.m_dFields;
2698 }
2699
CopyDocInfo(CSphMatch & tTo,const CSphMatch & tFrom)2700 void CopyDocInfo ( CSphMatch & tTo, const CSphMatch & tFrom )
2701 {
2702 if ( tFrom.m_pDynamic )
2703 {
2704 int iDynamic = T::m_tSchema.GetRowSize();
2705
2706 if ( !tTo.m_pDynamic )
2707 tTo.Reset ( iDynamic );
2708
2709 memcpy ( tTo.m_pDynamic, tFrom.m_pDynamic, iDynamic*sizeof(CSphRowitem) );
2710 }
2711
2712 tTo.m_pStatic = NULL;
2713 tTo.m_uDocID = tFrom.m_uDocID;
2714 tTo.m_iWeight = tFrom.m_iWeight;
2715 tTo.m_iTag = tFrom.m_iTag;
2716 }
2717 };
2718
2719 #endif // USE_RLP
2720
2721 #endif // _sphinxint_
2722
2723 //
2724 // $Id$
2725 //
2726