1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15
16 #ifndef _sphinx_
17 #define _sphinx_
18
19 /////////////////////////////////////////////////////////////////////////////
20
21 #ifdef _WIN32
22 #define USE_MYSQL 1 /// whether to compile MySQL support
23 #define USE_PGSQL 0 /// whether to compile PgSQL support
24 #define USE_ODBC 1 /// whether to compile ODBC support
25 #define USE_LIBEXPAT 1 /// whether to compile libexpat support
26 #define USE_LIBICONV 1 /// whether to compile iconv support
27 #define USE_LIBSTEMMER 0 /// whether to compile libstemmber support
28 #define USE_RE2 0 /// whether to compile RE2 support
29 #define USE_RLP 0 /// whether to compile RLP support
30 #define USE_WINDOWS 1 /// whether to compile for Windows
31 #define USE_SYSLOG 0 /// whether to use syslog for logging
32 #define HAVE_STRNLEN 1
33
34 #define UNALIGNED_RAM_ACCESS 1
35 #define USE_LITTLE_ENDIAN 1
36 #else
37 #define USE_WINDOWS 0 /// whether to compile for Windows
38 #endif
39
40 /////////////////////////////////////////////////////////////////////////////
41
42 #include "sphinxstd.h"
43 #include "sphinxexpr.h" // to remove?
44
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <limits.h>
49
50 #ifdef HAVE_CONFIG_H
51 #include "config.h"
52 #endif
53
54 #if USE_PGSQL
55 #include <libpq-fe.h>
56 #endif
57
58 #if USE_WINDOWS
59 #include <winsock2.h>
60 #else
61 #include <sys/types.h>
62 #include <unistd.h>
63 #endif
64
65 #if USE_MYSQL
66 #include <mysql.h>
67 #endif
68
69 #if USE_WINDOWS
70 typedef __int64 SphOffset_t;
71 #define STDOUT_FILENO fileno(stdout)
72 #define STDERR_FILENO fileno(stderr)
73 #else
74 typedef off_t SphOffset_t;
75 #endif
76
77 #if USE_ODBC
78 #include <sqlext.h>
79 #endif
80
81 /////////////////////////////////////////////////////////////////////////////
82
83 #ifndef USE_64BIT
84 #define USE_64BIT 1
85 #endif
86
87 #if USE_64BIT
88
89 // use 64-bit unsigned integers to store document and word IDs
90 #define SPHINX_BITS_TAG "-id64"
91 typedef uint64_t SphWordID_t;
92 typedef uint64_t SphDocID_t;
93
94 #define DOCID_MAX U64C(0xffffffffffffffff)
95 #define DOCID_FMT UINT64_FMT
96 #define DOCINFO_IDSIZE 2
97
98 STATIC_SIZE_ASSERT ( SphWordID_t, 8 );
99 STATIC_SIZE_ASSERT ( SphDocID_t, 8 );
100
101 #else
102
103 // use 32-bit unsigned integers to store document and word IDs
104 #define SPHINX_BITS_TAG ""
105 typedef DWORD SphWordID_t;
106 typedef DWORD SphDocID_t;
107
108 #define DOCID_MAX 0xffffffffUL
109 #define DOCID_FMT "%u"
110 #define DOCINFO_IDSIZE 1
111
112 STATIC_SIZE_ASSERT ( SphWordID_t, 4 );
113 STATIC_SIZE_ASSERT ( SphDocID_t, 4 );
114
115 #endif // USE_64BIT
116
117 #define DWSIZEOF(a) ( sizeof(a) / sizeof(DWORD) )
118
119 //////////////////////////////////////////////////////////////////////////
120
121 /// row entry (storage only, does not necessarily map 1:1 to attributes)
122 typedef DWORD CSphRowitem;
123 typedef const BYTE * CSphRowitemPtr;
124
/// widest integer type that can be stored as an attribute (ideally, fully decoupled from rowitem size!)
126 typedef int64_t SphAttr_t;
127
128 const CSphRowitem ROWITEM_MAX = UINT_MAX;
129 const int ROWITEM_BITS = 8*sizeof(CSphRowitem);
130 const int ROWITEMPTR_BITS = 8*sizeof(CSphRowitemPtr);
131 const int ROWITEM_SHIFT = 5;
132
133 STATIC_ASSERT ( ( 1 << ROWITEM_SHIFT )==ROWITEM_BITS, INVALID_ROWITEM_SHIFT );
134
135 #ifndef USE_LITTLE_ENDIAN
136 #error Please define endianness
137 #endif
138
139 template < typename DOCID >
140 inline DOCID DOCINFO2ID_T ( const DWORD * pDocinfo );
141
DOCINFO2ID_T(const DWORD * pDocinfo)142 template<> inline DWORD DOCINFO2ID_T ( const DWORD * pDocinfo )
143 {
144 return pDocinfo[0];
145 }
146
DOCINFO2ID_T(const DWORD * pDocinfo)147 template<> inline uint64_t DOCINFO2ID_T ( const DWORD * pDocinfo )
148 {
149 #if USE_LITTLE_ENDIAN
150 return uint64_t(pDocinfo[0]) + (uint64_t(pDocinfo[1])<<32);
151 #else
152 return uint64_t(pDocinfo[1]) + (uint64_t(pDocinfo[0])<<32);
153 #endif
154 }
155
DOCINFOSETID(DWORD * pDocinfo,DWORD uValue)156 inline void DOCINFOSETID ( DWORD * pDocinfo, DWORD uValue )
157 {
158 *pDocinfo = uValue;
159 }
160
DOCINFOSETID(DWORD * pDocinfo,uint64_t uValue)161 inline void DOCINFOSETID ( DWORD * pDocinfo, uint64_t uValue )
162 {
163 #if USE_LITTLE_ENDIAN
164 pDocinfo[0] = (DWORD)uValue;
165 pDocinfo[1] = (DWORD)(uValue>>32);
166 #else
167 pDocinfo[0] = (DWORD)(uValue>>32);
168 pDocinfo[1] = (DWORD)uValue;
169 #endif
170 }
171
DOCINFO2ID(const DWORD * pDocinfo)172 inline SphDocID_t DOCINFO2ID ( const DWORD * pDocinfo )
173 {
174 return DOCINFO2ID_T<SphDocID_t> ( pDocinfo );
175 }
176
/// docinfo row pointer to attributes pointer: skips the DWSIZEOF(DOCID) leading dwords
/// (PARANOID builds additionally assert that the pointer is not NULL)
template < typename DOCID > inline DWORD * DOCINFO2ATTRS_T ( DWORD * pDocinfo )
{
#if PARANOID
	assert ( pDocinfo );
#endif
	return pDocinfo + DWSIZEOF(DOCID);
}

/// const flavor of the docinfo-to-attributes conversion
template < typename DOCID > inline const DWORD * DOCINFO2ATTRS_T ( const DWORD * pDocinfo )
{
#if PARANOID
	assert ( pDocinfo );
#endif
	return pDocinfo + DWSIZEOF(DOCID);
}

/// attributes pointer back to docinfo row pointer: steps back DWSIZEOF(DOCID) dwords
/// (PARANOID builds additionally assert that the pointer is not NULL)
template < typename DOCID > inline DWORD * STATIC2DOCINFO_T ( DWORD * pAttrs )
{
#if PARANOID
	assert ( pAttrs );
#endif
	return pAttrs - DWSIZEOF(DOCID);
}

/// const flavor of the attributes-to-docinfo conversion
template < typename DOCID > inline const DWORD * STATIC2DOCINFO_T ( const DWORD * pAttrs )
{
#if PARANOID
	assert ( pAttrs );
#endif
	return pAttrs - DWSIZEOF(DOCID);
}
188
/// docinfo row pointer to attributes pointer, for the build-wide SphDocID_t width
inline DWORD * DOCINFO2ATTRS ( DWORD * pDocinfo )
{
	return DOCINFO2ATTRS_T<SphDocID_t> ( pDocinfo );
}

/// const flavor
inline const DWORD * DOCINFO2ATTRS ( const DWORD * pDocinfo )
{
	return DOCINFO2ATTRS_T<SphDocID_t> ( pDocinfo );
}

/// attributes pointer back to docinfo row pointer, for the build-wide SphDocID_t width
inline DWORD * STATIC2DOCINFO ( DWORD * pAttrs )
{
	return STATIC2DOCINFO_T<SphDocID_t> ( pAttrs );
}

/// const flavor
inline const DWORD * STATIC2DOCINFO ( const DWORD * pAttrs )
{
	return STATIC2DOCINFO_T<SphDocID_t> ( pAttrs );
}
193
194
195 /////////////////////////////////////////////////////////////////////////////
196
197 #ifdef BUILD_WITH_CMAKE
198 #include "gen_sphinxversion.h"
199 #else
200 #include "sphinxversion.h"
201 #endif
202
203 #ifndef SPHINX_TAG
204 #define SPHINX_TAG "-release"
205 #endif
206
207 // below is for easier extraction of the ver. by any external scripts
208 #define SPHINX_VERSION_NUMBERS "2.2.11"
209
210 #define SPHINX_VERSION SPHINX_VERSION_NUMBERS SPHINX_BITS_TAG SPHINX_TAG " (" SPH_GIT_COMMIT_ID ")"
211 #define SPHINX_BANNER "Sphinx " SPHINX_VERSION "\nCopyright (c) 2001-2016, Andrew Aksyonoff\nCopyright (c) 2008-2016, Sphinx Technologies Inc (http://sphinxsearch.com)\n\n"
212 #define SPHINX_SEARCHD_PROTO 1
213 #define SPHINX_CLIENT_VERSION 1
214
215 #define SPH_MAX_WORD_LEN 42 // so that any UTF-8 word fits 127 bytes
216 #define SPH_MAX_FILENAME_LEN 512
217 #define SPH_MAX_FIELDS 256
218
219 /////////////////////////////////////////////////////////////////////////////
220
221 extern int64_t g_iIndexerCurrentDocID;
222 extern int64_t g_iIndexerCurrentHits;
223 extern int64_t g_iIndexerCurrentRangeMin;
224 extern int64_t g_iIndexerCurrentRangeMax;
225 extern int64_t g_iIndexerPoolStartDocID;
226 extern int64_t g_iIndexerPoolStartHit;
227
228 /////////////////////////////////////////////////////////////////////////////
229
230 /// Sphinx CRC32 implementation
231 extern DWORD g_dSphinxCRC32 [ 256 ];
232 DWORD sphCRC32 ( const void * pString );
233 DWORD sphCRC32 ( const void * pString, int iLen );
234 DWORD sphCRC32 ( const void * pString, int iLen, DWORD uPrevCRC );
235
/// Fast check if our endianness is correct
237 const char* sphCheckEndian();
238
239 /// Sphinx FNV64 implementation
240 const uint64_t SPH_FNV64_SEED = 0xcbf29ce484222325ULL;
241 uint64_t sphFNV64 ( const void * pString );
242 uint64_t sphFNV64 ( const void * s, int iLen, uint64_t uPrev = SPH_FNV64_SEED );
243 uint64_t sphFNV64cont ( const void * pString, uint64_t uPrev );
244
245 /// calculate file crc32
246 bool sphCalcFileCRC32 ( const char * szFilename, DWORD & uCRC32 );
247
248 /// try to obtain an exclusive lock on specified file
249 /// bWait specifies whether to wait
250 bool sphLockEx ( int iFile, bool bWait );
251
252 /// remove existing locks
253 void sphLockUn ( int iFile );
254
255 /// millisecond-precision sleep
256 void sphSleepMsec ( int iMsec );
257
258 /// check if file exists and is a readable file
259 bool sphIsReadable ( const char * sFilename, CSphString * pError=NULL );
260
261 /// set throttling options
262 void sphSetThrottling ( int iMaxIOps, int iMaxIOSize );
263
264 /// immediately interrupt current query
265 void sphInterruptNow();
266
267 /// check if we got interrupted
268 bool sphInterrupted();
269
270 #if !USE_WINDOWS
271 /// set process info
272 void sphSetProcessInfo ( bool bHead );
273 #endif
274
275
276 /// initialize IO statistics collecting
277 bool sphInitIOStats ();
278
279 /// clean up IO statistics collector
280 void sphDoneIOStats ();
281
282
283 class CSphIOStats
284 {
285 public:
286 int64_t m_iReadTime;
287 DWORD m_iReadOps;
288 int64_t m_iReadBytes;
289 int64_t m_iWriteTime;
290 DWORD m_iWriteOps;
291 int64_t m_iWriteBytes;
292
293 CSphIOStats ();
294 ~CSphIOStats ();
295
296 void Start();
297 void Stop();
298
299 void Add ( const CSphIOStats & b );
IsEnabled()300 bool IsEnabled() { return m_bEnabled; }
301
302 private:
303 bool m_bEnabled;
304 CSphIOStats * m_pPrev;
305 };
306
307
308 //////////////////////////////////////////////////////////////////////////
309
310 #if UNALIGNED_RAM_ACCESS
311
312 /// pass-through wrapper
sphUnalignedRead(const T & tRef)313 template < typename T > inline T sphUnalignedRead ( const T & tRef )
314 {
315 return tRef;
316 }
317
318 /// pass-through wrapper
sphUnalignedWrite(void * pPtr,const T & tVal)319 template < typename T > void sphUnalignedWrite ( void * pPtr, const T & tVal )
320 {
321 *(T*)pPtr = tVal;
322 }
323
324 #else
325
326 /// unaligned read wrapper for some architectures (eg. SPARC)
327 template < typename T >
sphUnalignedRead(const T & tRef)328 inline T sphUnalignedRead ( const T & tRef )
329 {
330 T uTmp;
331 BYTE * pSrc = (BYTE *) &tRef;
332 BYTE * pDst = (BYTE *) &uTmp;
333 for ( int i=0; i<(int)sizeof(T); i++ )
334 *pDst++ = *pSrc++;
335 return uTmp;
336 }
337
338 /// unaligned write wrapper for some architectures (eg. SPARC)
339 template < typename T >
sphUnalignedWrite(void * pPtr,const T & tVal)340 void sphUnalignedWrite ( void * pPtr, const T & tVal )
341 {
342 BYTE * pDst = (BYTE *) pPtr;
343 BYTE * pSrc = (BYTE *) &tVal;
344 for ( int i=0; i<(int)sizeof(T); i++ )
345 *pDst++ = *pSrc++;
346 }
347
348 #endif // unalgined
349
350
351 #if UNALIGNED_RAM_ACCESS && USE_LITTLE_ENDIAN
352 /// get a dword from memory, intel version
sphGetDword(const BYTE * p)353 inline DWORD sphGetDword ( const BYTE * p )
354 {
355 return *(const DWORD*)p;
356 }
357 #else
358 /// get a dword from memory, non-intel version
sphGetDword(const BYTE * p)359 inline DWORD sphGetDword ( const BYTE * p )
360 {
361 return p[0] + ( p[1]<<8 ) + ( p[2]<<16 ) + ( p[3]<<24 );
362 }
363 #endif
364
365
366 int sphUTF8Len ( const char * pStr );
367
/// check whether a character is valid within an attribute name
/// accepts ASCII digits, ASCII letters, and underscore; returns 1 if valid, 0 otherwise
inline int sphIsAttr ( int c )
{
	// different from sphIsAlpha() in that we don't allow minus
	if ( c=='_' )
		return 1;
	if ( c>='0' && c<='9' )
		return 1;
	if ( c>='a' && c<='z' )
		return 1;
	return ( c>='A' && c<='Z' );
}
374
375 /////////////////////////////////////////////////////////////////////////////
376 // TOKENIZERS
377 /////////////////////////////////////////////////////////////////////////////
378
379 extern const char * SPHINX_DEFAULT_UTF8_TABLE;
380
381 /////////////////////////////////////////////////////////////////////////////
382
/// lowercaser remap range
/// a default-constructed range has all three fields set to -1
struct CSphRemapRange
{
	int			m_iStart;
	int			m_iEnd;
	int			m_iRemapStart;

	/// make an unset range (everything is -1)
	CSphRemapRange ()
	{
		m_iStart = -1;
		m_iEnd = -1;
		m_iRemapStart = -1;
	}

	/// make a range from explicit values
	CSphRemapRange ( int iStart, int iEnd, int iRemapStart )
	{
		m_iStart = iStart;
		m_iEnd = iEnd;
		m_iRemapStart = iRemapStart;
	}
};
402
403
404 inline bool operator < ( const CSphRemapRange & a, const CSphRemapRange & b )
405 {
406 return a.m_iStart < b.m_iStart;
407 }
408
409
410 /// lowercaser
411 class CSphLowercaser
412 {
413 friend class ISphTokenizer;
414 friend class CSphTokenizerBase;
415 friend class CSphTokenizer_UTF8_Base;
416 friend class CSphTokenizerBase2;
417
418 public:
419 CSphLowercaser ();
420 ~CSphLowercaser ();
421
422 void Reset ();
423 void SetRemap ( const CSphLowercaser * pLC );
424 void AddRemaps ( const CSphVector<CSphRemapRange> & dRemaps, DWORD uFlags );
425 void AddSpecials ( const char * sSpecials );
426 uint64_t GetFNV () const;
427
428 public:
429 const CSphLowercaser & operator = ( const CSphLowercaser & rhs );
430
431 public:
ToLower(int iCode)432 inline int ToLower ( int iCode ) const
433 {
434 if ( iCode<0 || iCode>=MAX_CODE )
435 return iCode;
436 int * pChunk = m_pChunk [ iCode >> CHUNK_BITS ];
437 if ( pChunk )
438 return pChunk [ iCode & CHUNK_MASK ];
439 return 0;
440 }
441
442 int GetMaxCodepointLength () const;
443
444 protected:
445 static const int CHUNK_COUNT = 0x300;
446 static const int CHUNK_BITS = 8;
447
448 static const int CHUNK_SIZE = 1 << CHUNK_BITS;
449 static const int CHUNK_MASK = CHUNK_SIZE - 1;
450 static const int MAX_CODE = CHUNK_COUNT * CHUNK_SIZE;
451
452 int m_iChunks; ///< how much chunks are actually allocated
453 int * m_pData; ///< chunks themselves
454 int * m_pChunk [ CHUNK_COUNT ]; ///< pointers to non-empty chunks
455 };
456
457 /////////////////////////////////////////////////////////////////////////////
458
/// saved file info record
/// NOTE(review): judging by the member names this holds a file name along with its
/// size, timestamps, and CRC32; how the fields get filled is not visible in this header
struct CSphSavedFile
{
	CSphString			m_sFilename;	///< file name
	SphOffset_t			m_uSize;		///< file size
	SphOffset_t			m_uCTime;		///< presumably creation (or status change) time; verify against the implementation
	SphOffset_t			m_uMTime;		///< presumably modification time; verify against the implementation
	DWORD				m_uCRC32;		///< CRC32 value

	CSphSavedFile ();
};
469
470
/// embedded synonyms, stopwords, and wordforms
/// for each of the three kinds there is a flag, the file info, and the data itself
/// pointers to this struct are accepted by ISphTokenizer::Create() and ISphTokenizer::LoadSynonyms()
struct CSphEmbeddedFiles
{
	bool						m_bEmbeddedSynonyms;	///< synonyms flag
	bool						m_bEmbeddedStopwords;	///< stopwords flag
	bool						m_bEmbeddedWordforms;	///< wordforms flag
	CSphSavedFile				m_tSynonymFile;			///< synonyms file info
	CSphVector<CSphString>		m_dSynonyms;			///< synonyms data, as strings
	CSphVector<CSphSavedFile>	m_dStopwordFiles;		///< stopwords file infos
	CSphVector<SphWordID_t>		m_dStopwords;			///< stopwords data, as word IDs
	CSphVector<CSphString>		m_dWordforms;			///< wordforms data, as strings
	CSphVector<CSphSavedFile>	m_dWordformFiles;		///< wordforms file infos

	CSphEmbeddedFiles ();

	void						Reset();
};
487
488
/// tokenizer settings
/// accepted by ISphTokenizer::Create() and ISphTokenizer::Setup(), returned by ISphTokenizer::GetSettings()
struct CSphTokenizerSettings
{
	int					m_iType;			///< tokenizer type; NOTE(review): the set of valid values is not visible in this header
	CSphString			m_sCaseFolding;		///< case folding setting
	int					m_iMinWordLen;		///< minimum word length
	CSphString			m_sSynonymsFile;	///< synonyms file name
	CSphString			m_sBoundary;		///< boundary setting
	CSphString			m_sIgnoreChars;		///< ignored characters setting
	int					m_iNgramLen;		///< n-gram length
	CSphString			m_sNgramChars;		///< n-gram characters setting
	CSphString			m_sBlendChars;		///< blended characters setting
	CSphString			m_sBlendMode;		///< blended tokens processing mode setting
	CSphString			m_sIndexingPlugin;	///< this tokenizer wants an external plugin to process its raw output

	CSphTokenizerSettings ();
};
505
506
/// bigram indexing mode (accepted by ISphTokenizer::CreateBigramFilter())
enum ESphBigram
{
	SPH_BIGRAM_NONE			= 0,	///< no bigrams
	SPH_BIGRAM_ALL			= 1,	///< index all word pairs
	SPH_BIGRAM_FIRSTFREQ	= 2,	///< only index pairs where one of the words is in a frequent words list
	SPH_BIGRAM_BOTHFREQ		= 3		///< only index pairs where both words are in a frequent words list
};
514
515
/// tokenizer cloning mode (accepted by ISphTokenizer::Clone())
enum ESphTokenizerClone
{
	SPH_CLONE_INDEX,				///< clone tokenizer and set indexing mode
	SPH_CLONE_QUERY,				///< clone tokenizer and set querying mode
	SPH_CLONE_QUERY_LIGHTWEIGHT		///< lightweight clone for querying (can parse, can NOT modify settings, shares pointers to the original lowercaser table)
};
522
523
/// token morphology state (returned by ISphTokenizer::GetTokenMorph())
enum ESphTokenMorph
{
	SPH_TOKEN_MORPH_RAW,			///< no morphology applied, tokenizer does not handle morphology
	SPH_TOKEN_MORPH_ORIGINAL,		///< no morphology applied, but tokenizer handles morphology
	SPH_TOKEN_MORPH_GUESS			///< morphology applied
};
530
531
532 struct CSphMultiformContainer;
533 class CSphWriter;
534
/// generic tokenizer
/// abstract interface: buffer handling, token retrieval, synonyms, and cloning are pure virtual
/// concrete tokenizers and token filters are obtained through the static Create*() methods
class ISphTokenizer
{
public:
	/// trivial ctor
									ISphTokenizer();

	/// trivial dtor
	virtual							~ISphTokenizer () {}

public:
	/// set new translation table
	/// returns true on success, false on failure
	virtual bool					SetCaseFolding ( const char * sConfig, CSphString & sError );

	/// add additional character as valid (with folding to itself)
	virtual void					AddPlainChar ( char c );

	/// add special chars to translation table
	/// updates lowercaser so that these remap to -1
	virtual void					AddSpecials ( const char * sSpecials );

	/// set ignored characters
	virtual bool					SetIgnoreChars ( const char * sIgnored, CSphString & sError );

	/// set n-gram characters (for CJK n-gram indexing)
	/// base implementation ignores the arguments and returns true
	virtual bool					SetNgramChars ( const char *, CSphString & ) { return true; }

	/// set n-gram length (for CJK n-gram indexing)
	/// base implementation does nothing
	virtual void					SetNgramLen ( int ) {}

	/// load synonyms list
	virtual bool					LoadSynonyms ( const char * sFilename, const CSphEmbeddedFiles * pFiles, CSphString & sError ) = 0;

	/// write synonyms to file
	virtual void					WriteSynonyms ( CSphWriter & tWriter ) = 0;

	/// set phrase boundary chars
	virtual bool					SetBoundary ( const char * sConfig, CSphString & sError );

	/// set blended characters
	virtual bool					SetBlendChars ( const char * sConfig, CSphString & sError );

	/// set blended tokens processing mode
	virtual bool					SetBlendMode ( const char * sMode, CSphString & sError );

	/// setup tokenizer using given settings
	virtual void					Setup ( const CSphTokenizerSettings & tSettings );

	/// create a tokenizer using the given settings
	static ISphTokenizer *			Create ( const CSphTokenizerSettings & tSettings, const CSphEmbeddedFiles * pFiles, CSphString & sError );

	/// create a multiform token filter
	static ISphTokenizer *			CreateMultiformFilter ( ISphTokenizer * pTokenizer, const CSphMultiformContainer * pContainer );

	/// create a bigram token filter
	static ISphTokenizer *			CreateBigramFilter ( ISphTokenizer * pTokenizer, ESphBigram eBigramIndex, const CSphString & sBigramWords, CSphString & sError );

	/// create a plugin filter
	/// sSpec is a library, name, and options specification string, eg "myplugins.dll:myfilter1:arg1=123"
	static ISphTokenizer *			CreatePluginFilter ( ISphTokenizer * pTokenizer, const CSphString & sSpec, CSphString & sError );

#if USE_RLP
	/// create a RLP token filter
	static ISphTokenizer *			CreateRLPFilter ( ISphTokenizer * pTokenizer, bool bChineseRLP, const char * szRLPRoot, const char * szRLPEnv, const char * szRLPCtx, bool bStandalone, CSphString & sError );

	/// create a filter to split an RLP-processed token stream into tokens
	static ISphTokenizer *			CreateRLPResultSplitter ( ISphTokenizer * pTokenizer, const char * szRLPCtx );

	/// split query string with an RLP token filter
	static bool						ProcessQueryRLP ( const char * sRLPContext, const char * sQuery, const char ** sProcessed, CSphTightVector<char> & dBuf, CSphString & sError );
#endif

	/// get tokenizer settings (base implementation returns a reference to the stored m_tSettings)
	virtual const CSphTokenizerSettings & GetSettings () const { return m_tSettings; }

	/// get synonym file info
	virtual const CSphSavedFile &	GetSynFileInfo () const { return m_tSynFileInfo; }

public:
	/// pass next buffer
	virtual void					SetBuffer ( const BYTE * sBuffer, int iLength ) = 0;

	/// set current index schema (only intended for the token filter plugins)
	/// base implementation ignores the arguments and returns true
	virtual bool					SetFilterSchema ( const CSphSchema &, CSphString & ) { return true; }

	/// set per-document options from INSERT
	/// base implementation ignores the arguments and returns true
	virtual bool					SetFilterOptions ( const char *, CSphString & ) { return true; }

	/// notify tokenizer that we now begin indexing a field with a given number (only intended for the token filter plugins)
	/// base implementation does nothing
	virtual void					BeginField ( int ) {}

	/// get next token
	virtual BYTE *					GetToken () = 0;

	/// calc codepoint length
	virtual int						GetCodepointLength ( int iCode ) const = 0;

	/// get max codepoint length
	virtual int						GetMaxCodepointLength () const = 0;

	/// enable indexing-time sentence boundary detection, and paragraph indexing
	virtual bool					EnableSentenceIndexing ( CSphString & sError );

	/// enable zone indexing
	virtual bool					EnableZoneIndexing ( CSphString & sError );

	/// shows whether morphology needs to be applied to this token or not
	/// base implementation always returns true
	virtual bool					GetMorphFlag () const { return true; }

	/// enable tokenized multiform tracking
	/// base implementation does nothing
	virtual void					EnableTokenizedMultiformTracking () {}

	/// get last token length, in codepoints
	virtual int						GetLastTokenLen () const { return m_iLastTokenLen; }

	/// get last token boundary flag (true if there was a boundary before the token)
	virtual bool					GetBoundary () { return m_bTokenBoundary; }

	/// get byte offset of the last boundary character
	virtual int						GetBoundaryOffset () { return m_iBoundaryOffset; }

	/// was last token a special one?
	virtual bool					WasTokenSpecial () { return m_bWasSpecial; }

	/// was last token a synonym one?
	virtual bool					WasTokenSynonym () const { return m_bWasSynonym; }

	/// get amount of overshort keywords skipped before this token
	/// base implementation reports 0 when the blended part flag is set but the blended flag is not
	virtual int						GetOvershortCount () { return ( !m_bBlended && m_bBlendedPart ? 0 : m_iOvershortCount ); }

	/// get original tokenized multiform (if any); NULL means there was none
	virtual BYTE *					GetTokenizedMultiform () { return NULL; }

	/// was last token a part of multi-wordforms destination
	/// head parameter might be useful to distinguish between sequence of different multi-wordforms
	virtual bool					WasTokenMultiformDestination ( bool & bHead, int & iDestCount ) const = 0;

	/// get the morphology state of the last token (see ESphTokenMorph)
	ESphTokenMorph					GetTokenMorph() const { return m_eTokenMorph; }

	/// was last token blended?
	virtual bool					TokenIsBlended () const { return m_bBlended; }

	/// was last token a normal subtoken of a blended token?
	virtual bool					TokenIsBlendedPart () const { return m_bBlendedPart; }

	/// NOTE(review): intent is not documented here; base implementation does nothing and returns 0
	virtual int						SkipBlended () { return 0; }

	/// get embedded tokenizer (if any); base implementation returns NULL
	virtual ISphTokenizer *			GetEmbeddedTokenizer () const { return NULL; }

public:
	/// spawn a clone of my own
	virtual ISphTokenizer *			Clone ( ESphTokenizerClone eMode ) const = 0;

	/// start buffer point of last token
	virtual const char *			GetTokenStart () const = 0;

	/// end buffer point of last token (exclusive, ie. *GetTokenEnd() is already NOT part of a token!)
	virtual const char *			GetTokenEnd () const = 0;

	/// current buffer ptr
	virtual const char *			GetBufferPtr () const = 0;

	/// buffer end
	virtual const char *			GetBufferEnd () const = 0;

	/// set new buffer ptr (must be within current bounds)
	virtual void					SetBufferPtr ( const char * sNewPtr ) = 0;

	/// get settings hash
	virtual uint64_t				GetSettingsFNV () const;

	/// get (readonly) lowercaser
	const CSphLowercaser &			GetLowercaser() const { return m_tLC; }

	/// get an RLP context path (if any); base implementation returns NULL
	virtual const char *			GetRLPContext () const { return NULL; }

protected:
	// internal helpers, implemented outside of this header
	virtual bool					RemapCharacters ( const char * sConfig, DWORD uFlags, const char * sSource, bool bCanRemap, CSphString & sError );
	virtual bool					AddSpecialsSPZ ( const char * sSpecials, const char * sDirective, CSphString & sError );

protected:
	static const int				MAX_SYNONYM_LEN		= 1024;	///< max synonyms map-from length, bytes

	// blended variant bit flags (see m_uBlendVariants and m_uBlendVariantsPending)
	static const BYTE				BLEND_TRIM_NONE		= 1;
	static const BYTE				BLEND_TRIM_HEAD		= 2;
	static const BYTE				BLEND_TRIM_TAIL		= 4;
	static const BYTE				BLEND_TRIM_BOTH		= 8;

	CSphLowercaser					m_tLC;						///< my lowercaser
	int								m_iLastTokenLen;			///< last token length, in codepoints
	bool							m_bTokenBoundary;			///< last token boundary flag (true after boundary codepoint followed by separator)
	bool							m_bBoundary;				///< boundary flag (true immediately after boundary codepoint)
	int								m_iBoundaryOffset;			///< boundary character offset (in bytes)
	bool							m_bWasSpecial;				///< special token flag
	bool							m_bWasSynonym;				///< last token is a synonym token
	bool							m_bEscaped;					///< backslash handling flag
	int								m_iOvershortCount;			///< skipped overshort tokens count
	ESphTokenMorph					m_eTokenMorph;				///< whether last token was a generated morphological guess

	bool							m_bBlended;					///< whether last token (as in just returned from GetToken()) was blended
	bool							m_bNonBlended;				///< internal, whether there were any normal chars in that blended token
	bool							m_bBlendedPart;				///< whether last token is a normal subtoken of a blended token
	bool							m_bBlendAdd;				///< whether we have more pending blended variants (of current accumulator) to return
	BYTE							m_uBlendVariants;			///< mask of blended variants as requested by blend_mode (see BLEND_TRIM_xxx flags)
	BYTE							m_uBlendVariantsPending;	///< mask of pending blended variants (we clear bits as we return variants)
	bool							m_bBlendSkipPure;			///< skip purely blended tokens

	bool							m_bShortTokenFilter;		///< short token filter flag
	bool							m_bDetectSentences;			///< should we detect sentence boundaries?

	CSphTokenizerSettings			m_tSettings;				///< tokenizer settings
	CSphSavedFile					m_tSynFileInfo;				///< synonyms file info

public:
	bool							m_bPhrase;					///< NOTE(review): public phrase flag; who sets and reads it is not visible in this header
};
749
750 /// parse charset table
751 bool sphParseCharset ( const char * sCharset, CSphVector<CSphRemapRange> & dRemaps );
752
753 /// create UTF-8 tokenizer
754 ISphTokenizer * sphCreateUTF8Tokenizer ();
755
756 /// create UTF-8 tokenizer with n-grams support (for CJK n-gram indexing)
757 ISphTokenizer * sphCreateUTF8NgramTokenizer ();
758
759 /////////////////////////////////////////////////////////////////////////////
760 // DICTIONARIES
761 /////////////////////////////////////////////////////////////////////////////
762
/// dictionary settings
/// accepted by CSphDict::Setup(), returned by CSphDict::GetSettings()
struct CSphDictSettings
{
	CSphString		m_sMorphology;			///< morphology setting
	CSphString		m_sStopwords;			///< stopwords setting
	CSphVector<CSphString> m_dWordforms;	///< wordforms list, as strings
	int				m_iMinStemmingLen;		///< minimum stemming length (1 by default)
	bool			m_bWordDict;			///< word dictionary flag (true by default)
	bool			m_bCrc32;				///< CRC32 flag (by default, set only when USE_64BIT is 0)
	bool			m_bStopwordsUnstemmed;	///< unstemmed stopwords flag (false by default)
	CSphString		m_sMorphFingerprint;	///< not used for creation; only for a check when loading

	/// set the defaults; the string and vector members are left default-constructed
	CSphDictSettings ()
		: m_iMinStemmingLen ( 1 )
		, m_bWordDict ( true )
		, m_bCrc32 ( !USE_64BIT )
		, m_bStopwordsUnstemmed ( false )
	{}
};
781
782
/// dictionary entry
/// some of the fields might be unused depending on specific dictionary type
/// there is no constructor, so the members are not initialized by the struct itself
struct CSphDictEntry
{
	SphWordID_t		m_uWordID;			///< keyword id (for dict=crc)
	const BYTE *	m_sKeyword;			///< keyword text (for dict=keywords)
	int				m_iDocs;			///< number of matching documents
	int				m_iHits;			///< number of occurrences
	SphOffset_t		m_iDoclistOffset;	///< absolute document list offset (into .spd)
	SphOffset_t		m_iDoclistLength;	///< document list length in bytes
	SphOffset_t		m_iSkiplistOffset;	///< absolute skiplist offset (into .spe)
	int				m_iDoclistHint;		///< raw document list length hint value (0..255 range, 1 byte)
};
796
797
/// stored normal form
/// element type of CSphWordforms::m_dNormalForms
struct CSphStoredNF
{
	CSphString					m_sWord;			///< the normal form itself
	bool						m_bAfterMorphology;	///< presumably marks forms that apply after morphology; verify against the implementation
};
804
805
/// wordforms container
/// returned (as a const pointer) by CSphDict::GetWordforms()
struct CSphWordforms
{
	int							m_iRefCount;		///< reference counter; NOTE(review): managed by code outside of this header
	CSphVector<CSphSavedFile>	m_dFiles;			///< wordforms file infos
	uint64_t					m_uTokenizerFNV;	///< tokenizer FNV hash (compare ISphTokenizer::GetSettingsFNV())
	CSphString					m_sIndexName;		///< index name
	bool						m_bHavePostMorphNF;	///< presumably set when there are normal forms that apply after morphology; verify against the implementation
	CSphVector <CSphStoredNF>	m_dNormalForms;		///< stored normal forms
	CSphMultiformContainer *	m_pMultiWordforms;	///< multi-wordforms container
	CSphOrderedHash < int, CSphString, CSphStrHashFunc, 1048576 >	m_dHash;	///< string-keyed hash of ints; presumably indexes into m_dNormalForms, verify against the implementation

	CSphWordforms ();
	~CSphWordforms ();

	/// compare against a given list of file infos
	bool						IsEqual ( const CSphVector<CSphSavedFile> & dFiles );

	/// convert a word to its normal form, in place
	/// NOTE(review): exact bBefore semantics and the return value are defined by the implementation
	bool						ToNormalForm ( BYTE * pWord, bool bBefore ) const;
};
824
825
826 /// abstract word dictionary interface
827 struct CSphWordHit;
828 class CSphAutofile;
829 struct DictHeader_t;
830 struct ThrottleState_t;
831 class CSphDict
832 {
833 public:
834 static const int ST_OK = 0;
835 static const int ST_ERROR = 1;
836 static const int ST_WARNING = 2;
837
838 public:
839 /// virtualizing dtor
~CSphDict()840 virtual ~CSphDict () {}
841
842 /// Get word ID by word, "text" version
843 /// may apply stemming and modify word inplace
844 /// modified word may become bigger than the original one, so make sure you have enough space in buffer which is pointer by pWord
845 /// a general practice is to use char[3*SPH_MAX_WORD_LEN+4] as a buffer
846 /// returns 0 for stopwords
847 virtual SphWordID_t GetWordID ( BYTE * pWord ) = 0;
848
849 /// get word ID by word, "text" version
850 /// may apply stemming and modify word inplace
851 /// accepts words with already prepended MAGIC_WORD_HEAD
852 /// appends MAGIC_WORD_TAIL
853 /// returns 0 for stopwords
GetWordIDWithMarkers(BYTE * pWord)854 virtual SphWordID_t GetWordIDWithMarkers ( BYTE * pWord ) { return GetWordID ( pWord ); }
855
856 /// get word ID by word, "text" version
857 /// does NOT apply stemming
858 /// accepts words with already prepended MAGIC_WORD_HEAD_NONSTEMMED
859 /// returns 0 for stopwords
GetWordIDNonStemmed(BYTE * pWord)860 virtual SphWordID_t GetWordIDNonStemmed ( BYTE * pWord ) { return GetWordID ( pWord ); }
861
862 /// get word ID by word, "binary" version
863 /// only used with prefix/infix indexing
864 /// must not apply stemming and modify anything
865 /// filters stopwords on request
866 virtual SphWordID_t GetWordID ( const BYTE * pWord, int iLen, bool bFilterStops ) = 0;
867
868 /// apply stemmers to the given word
ApplyStemmers(BYTE *)869 virtual void ApplyStemmers ( BYTE * ) const {}
870
871 /// load stopwords from given files
872 virtual void LoadStopwords ( const char * sFiles, const ISphTokenizer * pTokenizer ) = 0;
873
874 /// load stopwords from an array
875 virtual void LoadStopwords ( const CSphVector<SphWordID_t> & dStopwords ) = 0;
876
877 /// write stopwords to a file
878 virtual void WriteStopwords ( CSphWriter & tWriter ) = 0;
879
880 /// load wordforms from a given list of files
881 virtual bool LoadWordforms ( const CSphVector<CSphString> &, const CSphEmbeddedFiles * pEmbedded, const ISphTokenizer * pTokenizer, const char * sIndex ) = 0;
882
883 /// write wordforms to a file
884 virtual void WriteWordforms ( CSphWriter & tWriter ) = 0;
885
886 /// get wordforms
GetWordforms()887 virtual const CSphWordforms * GetWordforms() { return NULL; }
888
889 /// disable wordforms processing
DisableWordforms()890 virtual void DisableWordforms() {}
891
892 /// set morphology
893 /// returns 0 on success, 1 on hard error, 2 on a warning (see ST_xxx constants)
894 virtual int SetMorphology ( const char * szMorph, CSphString & sMessage ) = 0;
895
896 /// are there any morphological processors?
HasMorphology()897 virtual bool HasMorphology () const { return false; }
898
899 /// morphological data fingerprint (lemmatizer filenames and crc32s)
GetMorphDataFingerprint()900 virtual const CSphString & GetMorphDataFingerprint () const { return m_sMorphFingerprint; }
901
902 /// setup dictionary using settings
903 virtual void Setup ( const CSphDictSettings & tSettings ) = 0;
904
905 /// get dictionary settings
906 virtual const CSphDictSettings & GetSettings () const = 0;
907
908 /// stopwords file infos
909 virtual const CSphVector <CSphSavedFile> & GetStopwordsFileInfos () = 0;
910
911 /// wordforms file infos
912 virtual const CSphVector <CSphSavedFile> & GetWordformsFileInfos () = 0;
913
914 /// get multiwordforms
915 virtual const CSphMultiformContainer * GetMultiWordforms () const = 0;
916
917 /// check whether the given word is a stopword
918 virtual bool IsStopWord ( const BYTE * pWord ) const = 0;
919
920 public:
921 /// enable actually collecting keywords (needed for stopwords/wordforms loading)
HitblockBegin()922 virtual void HitblockBegin () {}
923
924 /// callback to let dictionary do hit block post-processing
HitblockPatch(CSphWordHit *,int)925 virtual void HitblockPatch ( CSphWordHit *, int ) const {}
926
927 /// resolve temporary hit block wide wordid (!) back to keyword
HitblockGetKeyword(SphWordID_t)928 virtual const char * HitblockGetKeyword ( SphWordID_t ) { return NULL; }
929
930 /// check current memory usage
HitblockGetMemUse()931 virtual int HitblockGetMemUse () { return 0; }
932
933 /// hit block dismissed
HitblockReset()934 virtual void HitblockReset () {}
935
936 public:
937 /// begin creating dictionary file, setup any needed internal structures
938 virtual void DictBegin ( CSphAutofile & tTempDict, CSphAutofile & tDict, int iDictLimit, ThrottleState_t * pThrottle );
939
940 /// add next keyword entry to final dict
941 virtual void DictEntry ( const CSphDictEntry & tEntry );
942
943 /// flush last entry
944 virtual void DictEndEntries ( SphOffset_t iDoclistOffset );
945
946 /// end indexing, store dictionary and checkpoints
947 virtual bool DictEnd ( DictHeader_t * pHeader, int iMemLimit, CSphString & sError, ThrottleState_t * pThrottle );
948
949 /// check whether there were any errors during indexing
950 virtual bool DictIsError () const;
951
952 public:
953 /// check whether this dict is stateful (when it comes to lookups)
HasState()954 virtual bool HasState () const { return false; }
955
956 /// make a clone
Clone()957 virtual CSphDict * Clone () const { return NULL; }
958
959 /// get settings hash
960 virtual uint64_t GetSettingsFNV () const = 0;
961
962 /// apply morphology or not
963 virtual void SetApplyMorph ( bool bApply ) = 0;
964
965 protected:
966 CSphString m_sMorphFingerprint;
967 };
968
969
970 /// traits dictionary factory (no storage, only tokenizing, lemmatizing, etc.)
971 CSphDict * sphCreateDictionaryTemplate ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
972
973 /// CRC32/FNV64 dictionary factory
974 CSphDict * sphCreateDictionaryCRC ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
975
976 /// keyword-storing dictionary factory
977 CSphDict * sphCreateDictionaryKeywords ( const CSphDictSettings & tSettings, const CSphEmbeddedFiles * pFiles, const ISphTokenizer * pTokenizer, const char * sIndex, CSphString & sError );
978
979 /// clear wordform cache
980 void sphShutdownWordforms ();
981
982 /// update/clear global IDF cache
983 bool sphPrereadGlobalIDF ( const CSphString & sPath, CSphString & sError );
984 void sphUpdateGlobalIDFs ( const CSphVector<CSphString> & dFiles );
985 void sphInitGlobalIDFs ();
986 void sphShutdownGlobalIDFs ();
987
988 /////////////////////////////////////////////////////////////////////////////
989 // DATASOURCES
990 /////////////////////////////////////////////////////////////////////////////
991
992 /// hit position storage type
993 typedef DWORD Hitpos_t;
994
995 /// empty hit value
996 #define EMPTY_HIT 0
997
998 /// hit processing tools
999 /// Hitpos_t consists of three things:
1000 /// 1) high bits store field number
1001 /// 2) middle bit - field end marker
1002 /// 3) lower bits store hit position in field
1003 template < int FIELD_BITS >
1004 class Hitman_c
1005 {
1006 protected:
1007 enum
1008 {
1009 POS_BITS = 31 - FIELD_BITS,
1010 FIELD_OFF = 32 - FIELD_BITS,
1011 FIELDEND_OFF = 31 - FIELD_BITS,
1012 FIELDEND_MASK = (1UL << POS_BITS),
1013 POS_MASK = (1UL << POS_BITS) - 1
1014 };
1015
1016 public:
Create(int iField,int iPos)1017 static Hitpos_t Create ( int iField, int iPos )
1018 {
1019 return ( iField << FIELD_OFF ) + ( iPos & POS_MASK );
1020 }
1021
Create(int iField,int iPos,bool bEnd)1022 static Hitpos_t Create ( int iField, int iPos, bool bEnd )
1023 {
1024 return ( iField << FIELD_OFF ) + ( ((int)bEnd) << FIELDEND_OFF ) + ( iPos & POS_MASK );
1025 }
1026
GetField(Hitpos_t uHitpos)1027 static inline int GetField ( Hitpos_t uHitpos )
1028 {
1029 return uHitpos >> FIELD_OFF;
1030 }
1031
GetPos(Hitpos_t uHitpos)1032 static inline int GetPos ( Hitpos_t uHitpos )
1033 {
1034 return uHitpos & POS_MASK;
1035 }
1036
IsEnd(Hitpos_t uHitpos)1037 static inline bool IsEnd ( Hitpos_t uHitpos )
1038 {
1039 return ( uHitpos & FIELDEND_MASK )!=0;
1040 }
1041
GetPosWithField(Hitpos_t uHitpos)1042 static inline DWORD GetPosWithField ( Hitpos_t uHitpos )
1043 {
1044 return uHitpos & ~FIELDEND_MASK;
1045 }
1046
AddPos(Hitpos_t * pHitpos,int iAdd)1047 static void AddPos ( Hitpos_t * pHitpos, int iAdd )
1048 {
1049 // FIXME! add range checks (eg. so that 0:0-1 does not overflow)
1050 *pHitpos += iAdd;
1051 }
1052
CreateSum(Hitpos_t uHitpos,int iAdd)1053 static Hitpos_t CreateSum ( Hitpos_t uHitpos, int iAdd )
1054 {
1055 // FIXME! add range checks (eg. so that 0:0-1 does not overflow)
1056 return ( uHitpos+iAdd ) & ~FIELDEND_MASK;
1057 }
1058
SetEndMarker(Hitpos_t * pHitpos)1059 static void SetEndMarker ( Hitpos_t * pHitpos )
1060 {
1061 *pHitpos |= FIELDEND_MASK;
1062 }
1063 };
1064
1065 // this could be just DWORD[] but it's methods are very handy
1066 // used to store field information e.g. which fields do we need to search in
1067 struct FieldMask_t
1068 {
1069 static const int SIZE = SPH_MAX_FIELDS/32;
1070 STATIC_ASSERT ( ( SPH_MAX_FIELDS%32 )==0, ASSUME_MAX_FIELDS_ARE_REPRESENTABLE_BY_DWORD );
1071 DWORD m_dMask [ SIZE ];
1072
1073 // no custom cstr and d-tor - to be usable from inside unions
1074 // deep copy for it is ok - so, no explicit copying constructor and operator=
1075
1076 // old-fashion layer to work with DWORD (32-bit) mask.
1077 // all bits above 32 assumed to be unset.
Assign32FieldMask_t1078 void Assign32 ( DWORD uMask )
1079 {
1080 UnsetAll();
1081 m_dMask[0] = uMask;
1082 }
1083
GetMask32FieldMask_t1084 DWORD GetMask32 () const
1085 {
1086 return m_dMask[0];
1087 }
1088
1089 DWORD operator[] ( int iIdx ) const
1090 {
1091 assert ( 0<=iIdx && iIdx<SIZE );
1092 return m_dMask [ iIdx ];
1093 }
1094
1095 DWORD & operator[] ( int iIdx )
1096 {
1097 assert ( 0<=iIdx && iIdx<SIZE );
1098 return m_dMask [ iIdx ];
1099 }
1100
1101 // set n-th bit
SetFieldMask_t1102 void Set ( int iIdx )
1103 {
1104 assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
1105 m_dMask [ iIdx/32 ] |= 1 << ( iIdx%32 );
1106 }
1107
1108 // set all bits
SetAllFieldMask_t1109 void SetAll()
1110 {
1111 memset ( m_dMask, 0xff, sizeof(m_dMask) );
1112 }
1113
1114 // unset n-th bit, or all
UnsetFieldMask_t1115 void Unset ( int iIdx )
1116 {
1117 assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
1118 m_dMask [ iIdx/32 ] &= ~(1 << ( iIdx%32 ));
1119 }
1120
UnsetAllFieldMask_t1121 void UnsetAll()
1122 {
1123 memset ( m_dMask, 0, sizeof(m_dMask) );
1124 }
1125
1126 // test if n-th bit is set
TestFieldMask_t1127 bool Test ( int iIdx ) const
1128 {
1129 assert ( iIdx>=0 && iIdx<(int)sizeof(m_dMask)*8 );
1130 return ( m_dMask [ iIdx/32 ] & ( 1 << ( iIdx%32 ) ) )!=0;
1131 }
1132
1133 // test if all bits are set or unset
TestAllFieldMask_t1134 bool TestAll ( bool bSet ) const
1135 {
1136 DWORD uTest = bSet ? 0xffffffff : 0;
1137 for ( int i=0; i<SIZE; i++ )
1138 if ( m_dMask[i]!=uTest )
1139 return false;
1140 return true;
1141 }
1142
NegateFieldMask_t1143 void Negate()
1144 {
1145 for ( int i=0; i<SIZE; i++ )
1146 m_dMask[i] = ~m_dMask[i];
1147 }
1148 };
1149
1150 /// hit info: one (document, keyword, position) record
/// plain data with no ctor, so the fields hold nothing meaningful until assigned
1151 struct CSphWordHit
1152 {
1153 SphDocID_t m_uDocID; ///< document ID
1154 SphWordID_t m_uWordID; ///< word ID in current dictionary
1155 Hitpos_t m_uWordPos; ///< word position in current document (a Hitpos_t value)
1156 };
1157
1158
1159 /// attribute locator within the row
1160 struct CSphAttrLocator
1161 {
1162 // OPTIMIZE? try packing these
1163 int m_iBitOffset;
1164 int m_iBitCount;
1165 bool m_bDynamic;
1166
CSphAttrLocatorCSphAttrLocator1167 CSphAttrLocator ()
1168 : m_iBitOffset ( -1 )
1169 , m_iBitCount ( -1 )
1170 , m_bDynamic ( false )
1171 {}
1172
1173 explicit CSphAttrLocator ( int iBitOffset, int iBitCount=ROWITEM_BITS )
m_iBitOffsetCSphAttrLocator1174 : m_iBitOffset ( iBitOffset )
1175 , m_iBitCount ( iBitCount )
1176 , m_bDynamic ( true )
1177 {}
1178
IsBitfieldCSphAttrLocator1179 inline bool IsBitfield () const
1180 {
1181 return ( m_iBitCount<ROWITEM_BITS || ( m_iBitOffset%ROWITEM_BITS )!=0 );
1182 }
1183
CalcRowitemCSphAttrLocator1184 int CalcRowitem () const
1185 {
1186 return IsBitfield() ? -1 : ( m_iBitOffset / ROWITEM_BITS );
1187 }
1188
IsIDCSphAttrLocator1189 bool IsID () const
1190 {
1191 return m_iBitOffset==-8*(int)sizeof(SphDocID_t) && m_iBitCount==8*sizeof(SphDocID_t);
1192 }
1193
1194 #ifndef NDEBUG
1195 /// get last item touched by this attr (for debugging checks only)
GetMaxRowitemCSphAttrLocator1196 int GetMaxRowitem () const
1197 {
1198 return ( m_iBitOffset + m_iBitCount - 1 ) / ROWITEM_BITS;
1199 }
1200 #endif
1201
1202 bool operator == ( const CSphAttrLocator & rhs ) const
1203 {
1204 return m_iBitOffset==rhs.m_iBitOffset && m_iBitCount==rhs.m_iBitCount && m_bDynamic==rhs.m_bDynamic;
1205 }
1206 };
1207
1208
1209 /// getter
sphGetRowAttr(const CSphRowitem * pRow,const CSphAttrLocator & tLoc)1210 inline SphAttr_t sphGetRowAttr ( const CSphRowitem * pRow, const CSphAttrLocator & tLoc )
1211 {
1212 assert ( pRow );
1213 int iItem = tLoc.m_iBitOffset >> ROWITEM_SHIFT;
1214
1215 if ( tLoc.m_iBitCount==ROWITEM_BITS )
1216 return pRow[iItem];
1217
1218 if ( tLoc.m_iBitCount==2*ROWITEM_BITS ) // FIXME? write a generalized version, perhaps
1219 return SphAttr_t ( pRow[iItem] ) + ( SphAttr_t ( pRow[iItem+1] ) << ROWITEM_BITS );
1220
1221 int iShift = tLoc.m_iBitOffset & ( ( 1 << ROWITEM_SHIFT )-1 );
1222 return ( pRow[iItem] >> iShift ) & ( ( 1UL << tLoc.m_iBitCount )-1 );
1223 }
1224
1225
1226 /// setter
sphSetRowAttr(CSphRowitem * pRow,const CSphAttrLocator & tLoc,SphAttr_t uValue)1227 inline void sphSetRowAttr ( CSphRowitem * pRow, const CSphAttrLocator & tLoc, SphAttr_t uValue )
1228 {
1229 assert(pRow);
1230 int iItem = tLoc.m_iBitOffset >> ROWITEM_SHIFT;
1231 if ( tLoc.m_iBitCount==2*ROWITEM_BITS )
1232 {
1233 // FIXME? write a generalized version, perhaps
1234 pRow[iItem] = CSphRowitem ( uValue & ( ( SphAttr_t(1) << ROWITEM_BITS )-1 ) );
1235 pRow[iItem+1] = CSphRowitem ( uValue >> ROWITEM_BITS );
1236
1237 } else if ( tLoc.m_iBitCount==ROWITEM_BITS )
1238 {
1239 pRow[iItem] = CSphRowitem ( uValue );
1240
1241 } else
1242 {
1243 int iShift = tLoc.m_iBitOffset & ( ( 1 << ROWITEM_SHIFT )-1);
1244 CSphRowitem uMask = ( ( 1UL << tLoc.m_iBitCount )-1 ) << iShift;
1245 pRow[iItem] &= ~uMask;
1246 pRow[iItem] |= ( uMask & ( uValue << iShift ) );
1247 }
1248 }
1249
1250
1251 /// pack length into row storage (22 bits max)
1252 /// returns number of bytes used
sphPackStrlen(BYTE * pRow,int iLen)1253 inline int sphPackStrlen ( BYTE * pRow, int iLen )
1254 {
1255 assert ( iLen>=0 && iLen<0x400000 );
1256 if ( iLen<0x80 )
1257 {
1258 pRow[0] = BYTE(iLen);
1259 return 1;
1260 } else if ( iLen<0x4000 )
1261 {
1262 pRow[0] = BYTE ( ( iLen>>8 ) | 0x80 );
1263 pRow[1] = BYTE ( iLen );
1264 return 2;
1265 } else
1266 {
1267 pRow[0] = BYTE ( ( iLen>>16 ) | 0xc0 );
1268 pRow[1] = BYTE ( iLen>>8 );
1269 pRow[2] = BYTE ( iLen );
1270 return 3;
1271 }
1272 }
1273
1274
1275 /// unpack string attr from row storage (22 bits length max)
1276 /// returns unpacked length; stores pointer to string data if required
sphUnpackStr(const BYTE * pRow,const BYTE ** ppStr)1277 inline int sphUnpackStr ( const BYTE * pRow, const BYTE ** ppStr )
1278 {
1279 int v = *pRow++;
1280 if ( v & 0x80 )
1281 {
1282 if ( v & 0x40 )
1283 {
1284 v = ( int ( v & 0x3f )<<16 ) + ( int ( *pRow++ )<<8 );
1285 v += ( *pRow++ ); // MUST be separate statement; cf. sequence point
1286 } else
1287 {
1288 v = ( int ( v & 0x3f )<<8 ) + ( *pRow++ );
1289 }
1290 }
1291 if ( ppStr )
1292 *ppStr = pRow;
1293 return v;
1294 }
1295
1296
1297 /// search query match (document info plus weight/tag)
1298 class CSphMatch
1299 {
1300 friend class ISphSchema;
1301 friend class CSphRsetSchema;
1302
1303 public:
1304 SphDocID_t m_uDocID; ///< document ID
1305 const CSphRowitem * m_pStatic; ///< static part (stored in and owned by the index)
1306 CSphRowitem * m_pDynamic; ///< dynamic part (computed per query; owned by the match)
1307 int m_iWeight; ///< my computed weight
1308 int m_iTag; ///< my index tag
1309
1310 public:
1311 /// ctor. clears everything
CSphMatch()1312 CSphMatch ()
1313 : m_uDocID ( 0 )
1314 , m_pStatic ( NULL )
1315 , m_pDynamic ( NULL )
1316 , m_iWeight ( 0 )
1317 , m_iTag ( 0 )
1318 {
1319 }
1320
1321 private:
1322 /// copy ctor. just in case
CSphMatch(const CSphMatch & rhs)1323 CSphMatch ( const CSphMatch & rhs )
1324 : m_pStatic ( 0 )
1325 , m_pDynamic ( NULL )
1326 {
1327 *this = rhs;
1328 }
1329
1330 public:
1331 /// dtor. frees everything
~CSphMatch()1332 ~CSphMatch ()
1333 {
1334 #ifndef NDEBUG
1335 if ( m_pDynamic )
1336 m_pDynamic--;
1337 #endif
1338 SafeDeleteArray ( m_pDynamic );
1339 }
1340
1341 /// reset
Reset(int iDynamic)1342 void Reset ( int iDynamic )
1343 {
1344 // check that we're either initializing a new one, or NOT changing the current size
1345 assert ( iDynamic>=0 );
1346 assert ( !m_pDynamic || iDynamic==(int)m_pDynamic[-1] );
1347
1348 m_uDocID = 0;
1349 if ( !m_pDynamic && iDynamic )
1350 {
1351 #ifndef NDEBUG
1352 m_pDynamic = new CSphRowitem [ iDynamic+1 ];
1353 *m_pDynamic++ = iDynamic;
1354 #else
1355 m_pDynamic = new CSphRowitem [ iDynamic ];
1356 #endif
1357 // dynamic stuff might contain pointers now (STRINGPTR type)
1358 // so we gotta cleanup
1359 memset ( m_pDynamic, 0, iDynamic*sizeof(CSphRowitem) );
1360 }
1361 }
1362
1363 private:
1364 /// assignment
Combine(const CSphMatch & rhs,int iDynamic)1365 void Combine ( const CSphMatch & rhs, int iDynamic )
1366 {
1367 // check that we're either initializing a new one, or NOT changing the current size
1368 assert ( iDynamic>=0 );
1369 assert ( !m_pDynamic || iDynamic==(int)m_pDynamic[-1] );
1370
1371 if ( this!=&rhs )
1372 {
1373 m_uDocID = rhs.m_uDocID;
1374 m_iWeight = rhs.m_iWeight;
1375 m_pStatic = rhs.m_pStatic;
1376 m_iTag = rhs.m_iTag;
1377 }
1378
1379 if ( iDynamic )
1380 {
1381 if ( !m_pDynamic )
1382 {
1383 #ifndef NDEBUG
1384 m_pDynamic = new CSphRowitem [ iDynamic+1 ];
1385 *m_pDynamic++ = iDynamic;
1386 #else
1387 m_pDynamic = new CSphRowitem [ iDynamic ];
1388 #endif
1389 }
1390
1391 if ( this!=&rhs )
1392 {
1393 assert ( rhs.m_pDynamic );
1394 assert ( m_pDynamic[-1]==rhs.m_pDynamic[-1] ); // ensure we're not changing X to Y
1395 memcpy ( m_pDynamic, rhs.m_pDynamic, iDynamic*sizeof(CSphRowitem) );
1396 }
1397 }
1398 }
1399
1400 public:
1401 /// integer getter
GetAttr(const CSphAttrLocator & tLoc)1402 SphAttr_t GetAttr ( const CSphAttrLocator & tLoc ) const
1403 {
1404 // m_pRowpart[tLoc.m_bDynamic] is 30% faster on MSVC 2005
1405 // same time on gcc 4.x though, ~1 msec per 1M calls, so lets avoid the hassle for now
1406 if ( tLoc.m_iBitOffset>=0 )
1407 return sphGetRowAttr ( tLoc.m_bDynamic ? m_pDynamic : m_pStatic, tLoc );
1408 if ( tLoc.IsID() )
1409 return m_uDocID;
1410 assert ( false && "Unknown negative-bitoffset locator" );
1411 return 0;
1412 }
1413
1414 /// float getter
GetAttrFloat(const CSphAttrLocator & tLoc)1415 float GetAttrFloat ( const CSphAttrLocator & tLoc ) const
1416 {
1417 return sphDW2F ( (DWORD)sphGetRowAttr ( tLoc.m_bDynamic ? m_pDynamic : m_pStatic, tLoc ) );
1418 }
1419
1420 /// integer setter
SetAttr(const CSphAttrLocator & tLoc,SphAttr_t uValue)1421 void SetAttr ( const CSphAttrLocator & tLoc, SphAttr_t uValue )
1422 {
1423 if ( tLoc.IsID() )
1424 return;
1425 assert ( tLoc.m_bDynamic );
1426 assert ( tLoc.GetMaxRowitem() < (int)m_pDynamic[-1] );
1427 sphSetRowAttr ( m_pDynamic, tLoc, uValue );
1428 }
1429
1430 /// float setter
SetAttrFloat(const CSphAttrLocator & tLoc,float fValue)1431 void SetAttrFloat ( const CSphAttrLocator & tLoc, float fValue )
1432 {
1433 assert ( tLoc.m_bDynamic );
1434 assert ( tLoc.GetMaxRowitem() < (int)m_pDynamic[-1] );
1435 sphSetRowAttr ( m_pDynamic, tLoc, sphF2DW ( fValue ) );
1436 }
1437
1438 /// MVA getter
1439 const DWORD * GetAttrMVA ( const CSphAttrLocator & tLoc, const DWORD * pPool, bool bArenaProhibit ) const;
1440
1441 private:
1442 /// "manually" prevent copying
1443 const CSphMatch & operator = ( const CSphMatch & )
1444 {
1445 assert ( 0 && "internal error (CSphMatch::operator= called)" );
1446 return *this;
1447 }
1448 };
1449
1450 /// specialized swapper
Swap(CSphMatch & a,CSphMatch & b)1451 inline void Swap ( CSphMatch & a, CSphMatch & b )
1452 {
1453 Swap ( a.m_uDocID, b.m_uDocID );
1454 Swap ( a.m_pStatic, b.m_pStatic );
1455 Swap ( a.m_pDynamic, b.m_pDynamic );
1456 Swap ( a.m_iWeight, b.m_iWeight );
1457 Swap ( a.m_iTag, b.m_iTag );
1458 }
1459
1460
/// source statistics
/// a pair of counters, both zero after construction and after Reset()
struct CSphSourceStats
{
	int64_t			m_iTotalDocuments;	///< how many documents
	int64_t			m_iTotalBytes;		///< how many bytes

	/// ctor, starts with both counters at zero
	CSphSourceStats ()
		: m_iTotalDocuments ( 0 )
		, m_iTotalBytes ( 0 )
	{}

	/// drop both counters back to zero
	void Reset ()
	{
		m_iTotalBytes = 0;
		m_iTotalDocuments = 0;
	}
};
1480
1481 //////////////////////////////////////////////////////////////////////////
1482
/// known multi-valued attr sources
/// the numeric values are spelled out explicitly and must stay as they are
enum ESphAttrSrc
{
	SPH_ATTRSRC_NONE			= 0,	///< not multi-valued
	SPH_ATTRSRC_FIELD			= 1,	///< get attr values from text field
	SPH_ATTRSRC_QUERY			= 2,	///< get attr values from SQL query
	SPH_ATTRSRC_RANGEDQUERY		= 3		///< get attr values from ranged SQL query
};
1491
1492
/// wordpart processing type
/// tells which parts of a keyword get processed: the whole word only, its prefixes, or its infixes
enum ESphWordpart
{
	SPH_WORDPART_WHOLE		= 0,	///< whole-word
	SPH_WORDPART_PREFIX		= 1,	///< prefix
	SPH_WORDPART_INFIX		= 2		///< infix
};
1500
1501
/// column unpack format
enum ESphUnpackFormat
{
	SPH_UNPACK_NONE				= 0,	///< no unpacking
	SPH_UNPACK_ZLIB				= 1,	///< the ZLIB format
	SPH_UNPACK_MYSQL_COMPRESS	= 2		///< the MYSQL_COMPRESS format
};
1509
1510
/// aggregate function to apply
/// values are consecutive, starting from zero
enum ESphAggrFunc
{
	SPH_AGGR_NONE	= 0,
	SPH_AGGR_AVG	= 1,
	SPH_AGGR_MIN	= 2,
	SPH_AGGR_MAX	= 3,
	SPH_AGGR_SUM	= 4,
	SPH_AGGR_CAT	= 5
};
1521
1522
1523 /// source column info
1524 struct CSphColumnInfo
1525 {
1526 CSphString m_sName; ///< column name
1527 ESphAttr m_eAttrType; ///< attribute type
1528 ESphWordpart m_eWordpart; ///< wordpart processing type
1529 bool m_bIndexed; ///< whether to index this column as fulltext field too
1530
1531 int m_iIndex; ///< index into source result set (-1 for joined fields)
1532 CSphAttrLocator m_tLocator; ///< attribute locator in the row
1533
1534 ESphAttrSrc m_eSrc; ///< attr source (for multi-valued attrs only)
1535 CSphString m_sQuery; ///< query to retrieve values (for multi-valued attrs only)
1536 CSphString m_sQueryRange; ///< query to retrieve range (for multi-valued attrs only)
1537
1538 CSphRefcountedPtr<ISphExpr> m_pExpr; ///< evaluator for expression items
1539 ESphAggrFunc m_eAggrFunc; ///< aggregate function on top of expression (for GROUP BY)
1540 ESphEvalStage m_eStage; ///< column evaluation stage (who and how computes this column)
1541 bool m_bPayload;
1542 bool m_bFilename; ///< column is a file name
1543 bool m_bWeight; ///< is a weight column
1544
1545 WORD m_uNext; ///< next in linked list for hash in CSphSchema
1546
1547 /// handy ctor
1548 CSphColumnInfo ( const char * sName=NULL, ESphAttr eType=SPH_ATTR_NONE );
1549
1550 /// equality comparison checks name, type, and locator
1551 bool operator == ( const CSphColumnInfo & rhs ) const
1552 {
1553 return m_sName==rhs.m_sName
1554 && m_eAttrType==rhs.m_eAttrType
1555 && m_tLocator.m_iBitCount==rhs.m_tLocator.m_iBitCount
1556 && m_tLocator.m_iBitOffset==rhs.m_tLocator.m_iBitOffset
1557 && m_tLocator.m_bDynamic==rhs.m_tLocator.m_bDynamic;
1558 }
1559 };
1560
1561
1562 /// barebones schema interface
1563 /// everything that is needed from every implementation of a schema
/// besides the pure virtual accessors, it keeps two tracker lists (pointer-type and
/// FACTORS attributes) and declares non-virtual match helpers, defined out of line
1564 class ISphSchema
1565 {
1566 protected:
1567 CSphVector<CSphNamedInt> m_dPtrAttrs; ///< names and rowitems of STRINGPTR and other ptrs to copy and delete
1568 CSphVector<CSphNamedInt> m_dFactorAttrs; ///< names and rowitems of SPH_ATTR_FACTORS attributes
1569
1570 public:
1571 /// get row size (static+dynamic combined)
1572 virtual int GetRowSize() const = 0;
1573
1574 /// get static row part size
1575 virtual int GetStaticSize() const = 0;
1576
1577 /// get dynamic row part size
1578 virtual int GetDynamicSize() const = 0;
1579
1580 /// get attrs count
1581 virtual int GetAttrsCount() const = 0;
1582
1583 /// get attribute index by name, returns -1 if not found
1584 virtual int GetAttrIndex ( const char * sName ) const = 0;
1585
1586 /// get attr by index
1587 virtual const CSphColumnInfo & GetAttr ( int iIndex ) const = 0;
1588
1589 /// get attr by name
1590 virtual const CSphColumnInfo * GetAttr ( const char * sName ) const = 0;
1591
1592 /// assign current schema to rset schema (kind of a visitor operator)
1593 virtual void AssignTo ( class CSphRsetSchema & lhs ) const = 0;
1594
1595 public:
1596 /// full copy, for purely dynamic matches
1597 void CloneWholeMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1598
1599 /// free the linked strings and/or just initialize the pointers with NULL
1600 void FreeStringPtrs ( CSphMatch * pMatch ) const;
1601
1602 /// copy pointer-type attribute data from rhs to pDst (NOTE(review): presumably the attributes tracked in m_dPtrAttrs; confirm against the implementation)
1603 void CopyPtrs ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1604
1605 protected:
1606 /// generic InsertAttr() implementation that tracks STRINGPTR, FACTORS attributes
1607 void InsertAttr ( CSphVector<CSphColumnInfo> & dAttrs, CSphVector<int> & dUsed, int iPos, const CSphColumnInfo & tCol, bool dDynamic );
1608
1609 /// reset my trackers
1610 void Reset();
1611
1612 /// dtor; virtual, and declared in the protected section
1613 ~ISphSchema()1613 virtual ~ISphSchema () {}
1614 };
1615
1616
1617 /// plain good old schema
1618 /// container that actually holds and owns all the fields, columns, etc
1619 ///
1620 /// NOTE that while this one can be used everywhere where we need a schema
1621 /// it might be huge (say 1000+ attributes) and expensive to copy, modify, etc!
1622 /// so for most of the online query work, consider CSphRsetSchema
1623 class CSphSchema : public ISphSchema
1624 {
1625 friend class CSphRsetSchema;
1626
1627 protected:
1628 static const int HASH_THRESH = 32;
1629 static const int BUCKET_COUNT = 256;
1630
1631 public:
1632 CSphString m_sName; ///< my human-readable name
1633 CSphVector<CSphColumnInfo> m_dFields; ///< my fulltext-searchable fields
1634
1635
1636 CSphVector<CSphColumnInfo> m_dAttrs; ///< all my attributes
1637 CSphVector<int> m_dStaticUsed; ///< static row part map (amount of used bits in each rowitem)
1638 CSphVector<int> m_dDynamicUsed; ///< dynamic row part map
1639 int m_iStaticSize; ///< static row size (can be different from m_dStaticUsed.GetLength() because of gaps)
1640
1641 protected:
1642 WORD m_dBuckets [ BUCKET_COUNT ]; ///< uses indexes in m_dAttrs as ptrs; 0xffff is like NULL in this hash
1643
1644 public:
1645
1646 /// ctor
1647 explicit CSphSchema ( const char * sName="(nameless)" );
1648
1649 /// get field index by name
1650 /// returns -1 if not found
1651 int GetFieldIndex ( const char * sName ) const;
1652
1653 /// get attribute index by name
1654 /// returns -1 if not found
1655 int GetAttrIndex ( const char * sName ) const;
1656
1657 /// checks if two schemas fully match (ie. fields names, attr names, types and locators are the same)
1658 /// describe mismatch (if any) to sError
1659 bool CompareTo ( const CSphSchema & rhs, CSphString & sError, bool bFullComparison = true ) const;
1660
1661 /// reset fields and attrs
1662 void Reset ();
1663
1664 /// get row size (static+dynamic combined)
GetRowSize()1665 int GetRowSize () const { return m_iStaticSize + m_dDynamicUsed.GetLength(); }
1666
1667 /// get static row part size
GetStaticSize()1668 int GetStaticSize () const { return m_iStaticSize; }
1669
1670 /// get dynamic row part size
GetDynamicSize()1671 int GetDynamicSize () const { return m_dDynamicUsed.GetLength(); }
1672
1673 /// get attrs count
GetAttrsCount()1674 int GetAttrsCount () const { return m_dAttrs.GetLength(); }
1675
1676 /// get attr by index
GetAttr(int iIndex)1677 const CSphColumnInfo & GetAttr ( int iIndex ) const { return m_dAttrs[iIndex]; }
1678
1679 /// get attr by name
1680 const CSphColumnInfo * GetAttr ( const char * sName ) const;
1681
1682 /// insert attr
1683 void InsertAttr ( int iPos, const CSphColumnInfo & tAggr, bool bDynamic );
1684
1685 /// add attr
1686 void AddAttr ( const CSphColumnInfo & tAttr, bool bDynamic );
1687
1688 /// remove attr
1689 void RemoveAttr ( const char * szAttr, bool bDynamic );
1690
1691 static bool IsReserved ( const char * szToken );
1692
1693 protected:
1694 /// returns 0xffff if bucket list is empty and position otherwise
1695 WORD & GetBucketPos ( const char * sName );
1696
1697 /// reset hash and re-add all attributes
1698 void RebuildHash ();
1699
1700 /// add iAddVal to all indexes strictly greater than iStartIdx in hash structures
1701 void UpdateHash ( int iStartIdx, int iAddVal );
1702
1703 /// visitor-style uber-virtual assignment implementation
1704 void AssignTo ( CSphRsetSchema & lhs ) const;
1705 };
1706
1707
1708 /// lightweight schema to be used in sorters, result sets, etc
1709 /// avoids copying of static attributes part by keeping a pointer
1710 /// manages the additional dynamic attributes on its own
1711 ///
1712 /// NOTE that for that reason CSphRsetSchema needs the originating index to exist
1713 /// (in case it keeps and uses a pointer to original schema in that index)
1714 class CSphRsetSchema : public ISphSchema
1715 {
1716 protected:
1717 const CSphSchema * m_pIndexSchema; ///< original index schema, for the static part
1718 CSphVector<CSphColumnInfo> m_dExtraAttrs; ///< additional dynamic attributes, for the dynamic one
1719 CSphVector<int> m_dDynamicUsed; ///< dynamic row part map
1720 CSphVector<int> m_dRemoved; ///< original indexes that are suppressed from the index schema by RemoveStaticAttr()
1721
1722 public:
1723 CSphVector<CSphColumnInfo> m_dFields; ///< standalone case (agent result set), fields container
1724
1725 public:
1726 CSphRsetSchema();
1727 CSphRsetSchema & operator = ( const ISphSchema & rhs );
1728 CSphRsetSchema & operator = ( const CSphSchema & rhs );
AssignTo(CSphRsetSchema & lhs)1729 virtual void AssignTo ( CSphRsetSchema & lhs ) const { lhs = *this; }
1730
1731 public:
1732 int GetRowSize() const;
1733 int GetStaticSize() const;
1734 int GetDynamicSize() const;
1735 int GetAttrsCount() const;
1736 int GetAttrIndex ( const char * sName ) const;
1737 const CSphColumnInfo & GetAttr ( int iIndex ) const;
1738 const CSphColumnInfo * GetAttr ( const char * sName ) const;
1739
1740 public:
1741 void AddDynamicAttr ( const CSphColumnInfo & tCol );
1742 void RemoveStaticAttr ( int iAttr );
1743 void Reset();
1744
1745 public:
1746 /// simple copy; clones either the entire dynamic part, or a part thereof
1747 void CloneMatch ( CSphMatch * pDst, const CSphMatch & rhs ) const;
1748
1749 /// swap in a subset of current attributes, with not necessarily (!) unique names
1750 /// used to create a network result set (ie. rset to be sent and then discarded)
1751 /// WARNING, DO NOT USE THIS UNLESS ABSOLUTELY SURE!
1752 void SwapAttrs ( CSphVector<CSphColumnInfo> & dAttrs );
1753 };
1754
1755 //////////////////////////////////////////////////////////////////////////
1756
1757 /// HTML stripper
1758 class CSphHTMLStripper
1759 {
1760 public:
1761 explicit CSphHTMLStripper ( bool bDefaultTags );
1762 bool SetIndexedAttrs ( const char * sConfig, CSphString & sError );
1763 bool SetRemovedElements ( const char * sConfig, CSphString & sError );
1764 bool SetZones ( const char * sZones, CSphString & sError );
1765 void EnableParagraphs ();
1766 void Strip ( BYTE * sData ) const;
1767
1768 public:
1769
1770 struct StripperTag_t
1771 {
1772 CSphString m_sTag; ///< tag name
1773 int m_iTagLen; ///< tag name length
1774 bool m_bInline; ///< whether this tag is inline
1775 bool m_bIndexAttrs; ///< whether to index attrs
1776 bool m_bRemove; ///< whether to remove contents
1777 bool m_bPara; ///< whether to mark a paragraph boundary
1778 bool m_bZone; ///< whether to mark a zone boundary
1779 bool m_bZonePrefix; ///< whether the zone name is a full name or a prefix
1780 CSphVector<CSphString> m_dAttrs; ///< attr names to index
1781
StripperTag_tStripperTag_t1782 StripperTag_t ()
1783 : m_iTagLen ( 0 )
1784 , m_bInline ( false )
1785 , m_bIndexAttrs ( false )
1786 , m_bRemove ( false )
1787 , m_bPara ( false )
1788 , m_bZone ( false )
1789 , m_bZonePrefix ( false )
1790 {}
1791
1792 inline bool operator < ( const StripperTag_t & rhs ) const
1793 {
1794 return strcmp ( m_sTag.cstr(), rhs.m_sTag.cstr() )<0;
1795 }
1796 };
1797
1798 /// finds appropriate tag and zone name ( tags zone name could be prefix only )
1799 /// advances source to the end of the tag
1800 const BYTE * FindTag ( const BYTE * sSrc, const StripperTag_t ** ppTag, const BYTE ** ppZoneName, int * pZoneNameLen ) const;
1801 bool IsValidTagStart ( int iCh ) const;
1802
1803 protected:
1804 static const int MAX_CHAR_INDEX = 28; ///< max valid char index (a-z, underscore, colon)
1805
1806 CSphVector<StripperTag_t> m_dTags; ///< known tags to index attrs and/or to remove contents
1807 int m_dStart[MAX_CHAR_INDEX]; ///< maps index of the first tag name char to start offset in m_dTags
1808 int m_dEnd[MAX_CHAR_INDEX]; ///< maps index of the first tag name char to end offset in m_dTags
1809
1810 protected:
1811 int GetCharIndex ( int iCh ) const; ///< calcs index by raw char
1812 void UpdateTags (); ///< sorts tags, updates internal helpers
1813 };
1814
1815
1816 /// indexing-related source settings
1817 /// NOTE, newly added fields should be synced with CSphSource::Setup()
/// CSphSource derives from this struct, so every setting is also a public member of a source
1818 struct CSphSourceSettings
1819 {
1820 int m_iMinPrefixLen; ///< min indexable prefix (0 means don't index prefixes)
1821 int m_iMinInfixLen; ///< min indexable infix length (0 means don't index infixes)
1822 int m_iMaxSubstringLen; ///< max indexable infix and prefix (0 means don't limit infixes and prefixes)
1823 int m_iBoundaryStep; ///< additional boundary word position increment
1824 bool m_bIndexExactWords; ///< exact (non-stemmed) word indexing flag
1825 int m_iOvershortStep; ///< position step on overshort token (default is 1)
1826 int m_iStopwordStep; ///< position step on stopword token (default is 1)
1827 bool m_bIndexSP; ///< whether to index sentence and paragraph delimiters
1828 bool m_bIndexFieldLens; ///< whether to index field lengths
1829
1830 CSphVector<CSphString> m_dPrefixFields; ///< list of prefix fields
1831 CSphVector<CSphString> m_dInfixFields; ///< list of infix fields
1832
/// ctor; defined out of line (NOTE(review): the initial values it assigns are not visible in this header)
1833 explicit CSphSourceSettings ();
/// get the wordpart processing type for the given field name; defined out of line
/// (NOTE(review): presumably consults m_dPrefixFields and m_dInfixFields; confirm against the implementation)
1834 ESphWordpart GetWordpart ( const char * sField, bool bWordDict );
1835 };
1836
1837
1838 /// hit vector interface
1839 /// because specific position type might vary (dword, qword, etc)
1840 /// but we don't want to template and instantiate everything because of that
1841 class ISphHits
1842 {
1843 public:
Length()1844 int Length () const
1845 {
1846 return m_dData.GetLength();
1847 }
1848
First()1849 const CSphWordHit * First () const
1850 {
1851 return m_dData.Begin();
1852 }
1853
Last()1854 const CSphWordHit * Last () const
1855 {
1856 return &m_dData.Last();
1857 }
1858
AddHit(SphDocID_t uDocid,SphWordID_t uWordid,Hitpos_t uPos)1859 void AddHit ( SphDocID_t uDocid, SphWordID_t uWordid, Hitpos_t uPos )
1860 {
1861 if ( uWordid )
1862 {
1863 CSphWordHit & tHit = m_dData.Add();
1864 tHit.m_uDocID = uDocid;
1865 tHit.m_uWordID = uWordid;
1866 tHit.m_uWordPos = uPos;
1867 }
1868 }
1869
1870 public:
1871 CSphVector<CSphWordHit> m_dData;
1872 };
1873
1874
/// A (start, length) pair.
/// NOTE(review): returned by CSphSource::IterateFieldMVAStart(); presumably a range
/// inside the MVA storage, confirm against the implementation.
struct SphRange_t
{
	int m_iStart;	///< range start
	int m_iLength;	///< range length
};
1880
1881 struct CSphFieldFilterSettings
1882 {
1883 CSphVector<CSphString> m_dRegexps;
1884 };
1885
1886 /// field filter
1887 class ISphFieldFilter
1888 {
1889 public:
~ISphFieldFilter()1890 virtual ~ISphFieldFilter () {}
1891
1892 virtual int Apply ( const BYTE * sField, int iLength, CSphVector<BYTE> & dStorage ) = 0;
1893 virtual void GetSettings ( CSphFieldFilterSettings & tSettings ) const = 0;
1894 };
1895
1896 /// create a field filter
1897 ISphFieldFilter * sphCreateFieldFilter ( const CSphFieldFilterSettings & tFilterSettings, CSphString & sError );
1898
1899
1900 /// generic data source
1901 class CSphSource : public CSphSourceSettings
1902 {
1903 public:
1904 CSphMatch m_tDocInfo; ///< current document info
1905 CSphVector<CSphString> m_dStrAttrs; ///< current document string attrs
1906 CSphVector<DWORD> m_dMva; ///< MVA storage for mva64
1907
1908 public:
1909 /// ctor
1910 explicit CSphSource ( const char * sName );
1911
1912 /// dtor
1913 virtual ~CSphSource ();
1914
1915 /// set dictionary
1916 void SetDict ( CSphDict * dict );
1917
1918 /// set HTML stripping mode
1919 ///
1920 /// sExtractAttrs defines what attributes to store. format is "img=alt; a=alt,title".
1921 /// empty string means to strip all tags; NULL means to disable stripping.
1922 ///
1923 /// sRemoveElements defines what elements to cleanup. format is "style, script"
1924 ///
1925 /// on failure, returns false and fills sError
1926 bool SetStripHTML ( const char * sExtractAttrs, const char * sRemoveElements, bool bDetectParagraphs, const char * sZones, CSphString & sError );
1927
1928 /// set field filter
1929 void SetFieldFilter ( ISphFieldFilter * pFilter );
1930
1931 /// set tokenizer
1932 void SetTokenizer ( ISphTokenizer * pTokenizer );
1933
1934 /// set rows dump file
SetDumpRows(FILE *)1935 virtual void SetDumpRows ( FILE * ) {}
1936
1937 /// get stats
1938 virtual const CSphSourceStats & GetStats ();
1939
1940 /// updates schema fields and attributes
1941 /// updates pInfo if it's empty; checks for match if it's not
1942 /// must be called after IterateStart(); will always fail otherwise
1943 virtual bool UpdateSchema ( CSphSchema * pInfo, CSphString & sError );
1944
1945 /// setup misc indexing settings (prefix/infix/exact-word indexing, position steps)
1946 void Setup ( const CSphSourceSettings & tSettings );
1947
1948 public:
1949 /// connect to the source (eg. to the database)
1950 /// connection settings are specific for each source type and as such
1951 /// are implemented in specific descendants
1952 virtual bool Connect ( CSphString & sError ) = 0;
1953
1954 /// disconnect from the source
1955 virtual void Disconnect () = 0;
1956
1957 /// check if there are any attributes configured
1958 /// note that there might be NO actual attributes in the case if configured
1959 /// ones do not match those actually returned by the source
1960 virtual bool HasAttrsConfigured () = 0;
1961
1962 /// check if there are any joined fields
HasJoinedFields()1963 virtual bool HasJoinedFields () { return false; }
1964
1965 /// begin indexing this source
1966 /// to be implemented by descendants
1967 virtual bool IterateStart ( CSphString & sError ) = 0;
1968
1969 /// get next document
1970 /// to be implemented by descendants
1971 /// returns false on error
1972 /// returns true and fills m_tDocInfo on success
1973 /// returns true and sets m_tDocInfo.m_uDocID to 0 on eof
1974 virtual bool IterateDocument ( CSphString & sError ) = 0;
1975
1976 /// get next hits chunk for current document
1977 /// to be implemented by descendants
1978 /// returns NULL when there are no more hits
1979 /// returns pointer to hit vector (with at most MAX_SOURCE_HITS) on success
1980 /// fills out-string with error message on failure
1981 virtual ISphHits * IterateHits ( CSphString & sError ) = 0;
1982
1983 /// get joined hits from joined fields (w/o attached docinfos)
1984 /// returns false and fills out-string with error message on failure
1985 /// returns true and sets m_tDocInfo.m_uDocID to 0 on eof
1986 /// returns true and sets m_tDocInfo.m_uDocID to non-0 on success
1987 virtual ISphHits * IterateJoinedHits ( CSphString & sError );
1988
1989 /// begin iterating values of out-of-document multi-valued attribute iAttr
1990 /// will fail if iAttr is out of range, or is not multi-valued
1991 /// can also fail if configured settings are invalid (eg. SQL query can not be executed)
1992 virtual bool IterateMultivaluedStart ( int iAttr, CSphString & sError ) = 0;
1993
1994 /// get next multi-valued (id,attr-value) or (id, offset) for mva64 tuple to m_tDocInfo
1995 virtual bool IterateMultivaluedNext () = 0;
1996
1997 /// begin iterating values of multi-valued attribute iAttr stored in a field
1998 /// will fail if iAttr is out of range, or is not multi-valued
1999 virtual SphRange_t IterateFieldMVAStart ( int iAttr ) = 0;
2000
2001 /// begin iterating kill list
2002 virtual bool IterateKillListStart ( CSphString & sError ) = 0;
2003
2004 /// get next kill list doc id
2005 virtual bool IterateKillListNext ( SphDocID_t & uDocId ) = 0;
2006
2007 /// post-index callback
2008 /// gets called when the indexing is succesfully (!) over
PostIndex()2009 virtual void PostIndex () {}
2010
2011 protected:
2012 ISphTokenizer * m_pTokenizer; ///< my tokenizer
2013 CSphDict * m_pDict; ///< my dict
2014 ISphFieldFilter * m_pFieldFilter; ///< my field filter
2015
2016 CSphSourceStats m_tStats; ///< my stats
2017 CSphSchema m_tSchema; ///< my schema
2018
2019 CSphHTMLStripper * m_pStripper; ///< my HTML stripper
2020
2021 int m_iNullIds;
2022 int m_iMaxIds;
2023
2024 SphDocID_t VerifyID ( SphDocID_t uID );
2025 };
2026
2027
/// Policy for handling IO errors in file fields.
/// NOTE(review): per-value notes below are read off the names; confirm against the file field loader.
enum ESphOnFileFieldError
{
	FFE_IGNORE_FIELD	= 0,	///< presumably ignores just the failed field
	FFE_SKIP_DOCUMENT	= 1,	///< presumably skips the whole document
	FFE_FAIL_INDEX		= 2		///< presumably fails the indexing
};
2035
2036
2037 /// generic document source
2038 /// provides multi-field support and generic tokenizer
2039 class CSphSource_Document : public CSphSource
2040 {
2041 public:
2042 /// ctor
2043 explicit CSphSource_Document ( const char * sName );
2044
2045 /// dtor
~CSphSource_Document()2046 virtual ~CSphSource_Document () { SafeDeleteArray ( m_pReadFileBuffer ); }
2047
2048 /// my generic tokenizer
2049 virtual bool IterateDocument ( CSphString & sError );
2050 virtual ISphHits * IterateHits ( CSphString & sError );
2051 void BuildHits ( CSphString & sError, bool bSkipEndMarker );
2052
2053 /// field data getter
2054 /// to be implemented by descendants
2055 virtual BYTE ** NextDocument ( CSphString & sError ) = 0;
2056
SetDumpRows(FILE * fpDumpRows)2057 virtual void SetDumpRows ( FILE * fpDumpRows ) { m_fpDumpRows = fpDumpRows; }
2058
2059 virtual SphRange_t IterateFieldMVAStart ( int iAttr );
IterateFieldMVAStart(int,CSphString &)2060 virtual bool IterateFieldMVAStart ( int, CSphString & ) { assert ( 0 && "not implemented" ); return false; }
HasJoinedFields()2061 virtual bool HasJoinedFields () { return m_iPlainFieldsLength!=m_tSchema.m_dFields.GetLength(); }
2062
2063 protected:
2064 int ParseFieldMVA ( CSphVector < DWORD > & dMva, const char * szValue, bool bMva64 ) const;
2065 bool CheckFileField ( const BYTE * sField );
2066 int LoadFileField ( BYTE ** ppField, CSphString & sError );
2067
2068 bool BuildZoneHits ( SphDocID_t uDocid, BYTE * sWord );
2069 void BuildSubstringHits ( SphDocID_t uDocid, bool bPayload, ESphWordpart eWordpart, bool bSkipEndMarker );
2070 void BuildRegularHits ( SphDocID_t uDocid, bool bPayload, bool bSkipEndMarker );
2071
2072 /// register autocomputed attributes such as field lengths (see index_field_lengths)
2073 bool AddAutoAttrs ( CSphString & sError );
2074
2075 /// allocate m_tDocInfo storage, do post-alloc magic (compute pointer to field lengths, etc)
2076 void AllocDocinfo ();
2077
2078 protected:
2079 ISphHits m_tHits; ///< my hitvector
2080
2081 protected:
2082 char * m_pReadFileBuffer;
2083 int m_iReadFileBufferSize; ///< size of read buffer for the 'sql_file_field' fields
2084 int m_iMaxFileBufferSize; ///< max size of read buffer for the 'sql_file_field' fields
2085 ESphOnFileFieldError m_eOnFileFieldError;
2086 FILE * m_fpDumpRows;
2087 int m_iPlainFieldsLength;
2088 DWORD * m_pFieldLengthAttrs; ///< pointer into the part of m_tDocInfo where field lengths are stored
2089
2090 CSphVector<SphDocID_t> m_dAllIds; ///< used for joined fields FIXME! unlimited RAM use
2091 bool m_bIdsSorted; ///< we sort array to use binary search
2092
2093 protected:
2094 struct CSphBuildHitsState_t
2095 {
2096 bool m_bProcessingHits;
2097 bool m_bDocumentDone;
2098
2099 BYTE ** m_dFields;
2100
2101 CSphVector<BYTE*> m_dTmpFieldStorage;
2102 CSphVector<BYTE*> m_dTmpFieldPtrs;
2103 CSphVector<BYTE> m_dFiltered;
2104
2105 int m_iStartPos;
2106 Hitpos_t m_iHitPos;
2107 int m_iField;
2108 int m_iStartField;
2109 int m_iEndField;
2110
2111 int m_iBuildLastStep;
2112
2113 CSphBuildHitsState_t ();
2114 ~CSphBuildHitsState_t ();
2115
2116 void Reset ();
2117 };
2118
2119 CSphBuildHitsState_t m_tState;
2120 int m_iMaxHits;
2121 };
2122
2123 struct CSphUnpackInfo
2124 {
2125 ESphUnpackFormat m_eFormat;
2126 CSphString m_sName;
2127 };
2128
2129 struct CSphJoinedField
2130 {
2131 CSphString m_sName;
2132 CSphString m_sQuery;
2133 CSphString m_sRanged;
2134 bool m_bPayload;
2135 };
2136
2137
2138 /// generic SQL source params
2139 struct CSphSourceParams_SQL
2140 {
2141 // query params
2142 CSphString m_sQuery;
2143 CSphString m_sQueryRange;
2144 CSphString m_sQueryKilllist;
2145 int64_t m_iRangeStep;
2146 int64_t m_iRefRangeStep;
2147 bool m_bPrintQueries;
2148
2149 CSphVector<CSphString> m_dQueryPre;
2150 CSphVector<CSphString> m_dQueryPost;
2151 CSphVector<CSphString> m_dQueryPostIndex;
2152 CSphVector<CSphColumnInfo> m_dAttrs;
2153 CSphVector<CSphString> m_dFileFields;
2154
2155 int m_iRangedThrottle;
2156 int m_iMaxFileBufferSize;
2157 ESphOnFileFieldError m_eOnFileFieldError;
2158
2159 CSphVector<CSphUnpackInfo> m_dUnpack;
2160 DWORD m_uUnpackMemoryLimit;
2161
2162 CSphVector<CSphJoinedField> m_dJoinedFields;
2163
2164 // connection params
2165 CSphString m_sHost;
2166 CSphString m_sUser;
2167 CSphString m_sPass;
2168 CSphString m_sDB;
2169 int m_iPort;
2170
2171 // hooks
2172 CSphString m_sHookConnect;
2173 CSphString m_sHookQueryRange;
2174 CSphString m_sHookPostIndex;
2175
2176 CSphSourceParams_SQL ();
2177 };
2178
2179
2180 /// generic SQL source
2181 /// multi-field plain-text documents fetched from given query
2182 struct CSphSource_SQL : CSphSource_Document
2183 {
2184 explicit CSphSource_SQL ( const char * sName );
~CSphSource_SQLCSphSource_SQL2185 virtual ~CSphSource_SQL () {}
2186
2187 bool Setup ( const CSphSourceParams_SQL & pParams );
2188 virtual bool Connect ( CSphString & sError );
2189 virtual void Disconnect ();
2190
2191 virtual bool IterateStart ( CSphString & sError );
2192 virtual BYTE ** NextDocument ( CSphString & sError );
2193 virtual void PostIndex ();
2194
HasAttrsConfiguredCSphSource_SQL2195 virtual bool HasAttrsConfigured () { return m_tParams.m_dAttrs.GetLength()!=0; }
2196
2197 virtual ISphHits * IterateJoinedHits ( CSphString & sError );
2198
2199 virtual bool IterateMultivaluedStart ( int iAttr, CSphString & sError );
2200 virtual bool IterateMultivaluedNext ();
2201
2202 virtual bool IterateKillListStart ( CSphString & sError );
2203 virtual bool IterateKillListNext ( SphDocID_t & tDocId );
2204
2205 private:
2206 bool m_bSqlConnected; ///< am i connected?
2207
2208 protected:
2209 CSphString m_sSqlDSN;
2210
2211 BYTE * m_dFields [ SPH_MAX_FIELDS ];
2212 ESphUnpackFormat m_dUnpack [ SPH_MAX_FIELDS ];
2213
2214 SphDocID_t m_uMinID; ///< grand min ID
2215 SphDocID_t m_uMaxID; ///< grand max ID
2216 SphDocID_t m_uCurrentID; ///< current min ID
2217 SphDocID_t m_uMaxFetchedID; ///< max actually fetched ID
2218 int m_iMultiAttr; ///< multi-valued attr being currently fetched
2219 int m_iSqlFields; ///< field count (for row dumper)
2220
2221 CSphSourceParams_SQL m_tParams;
2222
2223 bool m_bCanUnpack;
2224 bool m_bUnpackFailed;
2225 bool m_bUnpackOverflow;
2226 CSphVector<char> m_dUnpackBuffers [ SPH_MAX_FIELDS ];
2227
2228 int m_iJoinedHitField; ///< currently pulling joined hits from this field (index into schema; -1 if not pulling)
2229 SphDocID_t m_iJoinedHitID; ///< last document id
2230 int m_iJoinedHitPos; ///< last hit position
2231
2232 static const int MACRO_COUNT = 2;
2233 static const char * const MACRO_VALUES [ MACRO_COUNT ];
2234
2235 protected:
2236 /// by what reason the internal SetupRanges called
2237 enum ERangesReason
2238 {
2239 SRE_DOCS,
2240 SRE_MVA,
2241 SRE_JOINEDHITS
2242 };
2243
2244 protected:
2245 bool SetupRanges ( const char * sRangeQuery, const char * sQuery, const char * sPrefix, CSphString & sError, ERangesReason iReason );
2246 bool RunQueryStep ( const char * sQuery, CSphString & sError );
2247
2248 protected:
2249 virtual void SqlDismissResult () = 0;
2250 virtual bool SqlQuery ( const char * sQuery ) = 0;
2251 virtual bool SqlIsError () = 0;
2252 virtual const char * SqlError () = 0;
2253 virtual bool SqlConnect () = 0;
2254 virtual void SqlDisconnect () = 0;
2255 virtual int SqlNumFields() = 0;
2256 virtual bool SqlFetchRow() = 0;
2257 virtual DWORD SqlColumnLength ( int iIndex ) = 0;
2258 virtual const char * SqlColumn ( int iIndex ) = 0;
2259 virtual const char * SqlFieldName ( int iIndex ) = 0;
2260
2261 const char * SqlUnpackColumn ( int iIndex, ESphUnpackFormat eFormat );
2262 void ReportUnpackError ( int iIndex, int iError );
2263 };
2264
2265
2266 #if USE_MYSQL
2267 /// MySQL source params
2268 struct CSphSourceParams_MySQL : CSphSourceParams_SQL
2269 {
2270 CSphString m_sUsock; ///< UNIX socket
2271 int m_iFlags; ///< connection flags
2272 CSphString m_sSslKey;
2273 CSphString m_sSslCert;
2274 CSphString m_sSslCA;
2275
2276 CSphSourceParams_MySQL (); ///< ctor. sets defaults
2277 };
2278
2279
2280 /// MySQL source implementation
2281 /// multi-field plain-text documents fetched from given query
2282 struct CSphSource_MySQL : CSphSource_SQL
2283 {
2284 explicit CSphSource_MySQL ( const char * sName );
2285 bool Setup ( const CSphSourceParams_MySQL & tParams );
2286
2287 protected:
2288 MYSQL_RES * m_pMysqlResult;
2289 MYSQL_FIELD * m_pMysqlFields;
2290 MYSQL_ROW m_tMysqlRow;
2291 MYSQL m_tMysqlDriver;
2292 unsigned long * m_pMysqlLengths;
2293
2294 CSphString m_sMysqlUsock;
2295 int m_iMysqlConnectFlags;
2296 CSphString m_sSslKey;
2297 CSphString m_sSslCert;
2298 CSphString m_sSslCA;
2299
2300 protected:
2301 virtual void SqlDismissResult ();
2302 virtual bool SqlQuery ( const char * sQuery );
2303 virtual bool SqlIsError ();
2304 virtual const char * SqlError ();
2305 virtual bool SqlConnect ();
2306 virtual void SqlDisconnect ();
2307 virtual int SqlNumFields();
2308 virtual bool SqlFetchRow();
2309 virtual DWORD SqlColumnLength ( int iIndex );
2310 virtual const char * SqlColumn ( int iIndex );
2311 virtual const char * SqlFieldName ( int iIndex );
2312 };
2313 #endif // USE_MYSQL
2314
2315
2316 #if USE_PGSQL
2317 /// PgSQL specific source params
2318 struct CSphSourceParams_PgSQL : CSphSourceParams_SQL
2319 {
2320 CSphString m_sClientEncoding;
2321 CSphSourceParams_PgSQL ();
2322 };
2323
2324
2325 /// PgSQL source implementation
2326 /// multi-field plain-text documents fetched from given query
2327 struct CSphSource_PgSQL : CSphSource_SQL
2328 {
2329 explicit CSphSource_PgSQL ( const char * sName );
2330 bool Setup ( const CSphSourceParams_PgSQL & pParams );
2331 virtual bool IterateStart ( CSphString & sError );
2332
2333 protected:
2334 PGresult * m_pPgResult; ///< postgresql execution restult context
2335 PGconn * m_tPgDriver; ///< postgresql connection context
2336
2337 int m_iPgRows; ///< how much rows last step returned
2338 int m_iPgRow; ///< current row (0 based, as in PQgetvalue)
2339
2340 CSphString m_sPgClientEncoding;
2341 CSphVector<bool> m_dIsColumnBool;
2342
2343 protected:
2344 virtual void SqlDismissResult ();
2345 virtual bool SqlQuery ( const char * sQuery );
2346 virtual bool SqlIsError ();
2347 virtual const char * SqlError ();
2348 virtual bool SqlConnect ();
2349 virtual void SqlDisconnect ();
2350 virtual int SqlNumFields();
2351 virtual bool SqlFetchRow();
2352 virtual DWORD SqlColumnLength ( int iIndex );
2353 virtual const char * SqlColumn ( int iIndex );
2354 virtual const char * SqlFieldName ( int iIndex );
2355 };
2356 #endif // USE_PGSQL
2357
2358 #if USE_ODBC
2359 struct CSphSourceParams_ODBC: CSphSourceParams_SQL
2360 {
2361 CSphString m_sOdbcDSN; ///< ODBC DSN
2362 CSphString m_sColBuffers; ///< column buffer sizes (eg "col1=2M, col2=4M")
2363 bool m_bWinAuth; ///< auth type (MS SQL only)
2364
2365 CSphSourceParams_ODBC ();
2366 };
2367
2368 /// ODBC source implementation
2369 struct CSphSource_ODBC : CSphSource_SQL
2370 {
2371 explicit CSphSource_ODBC ( const char * sName );
2372 bool Setup ( const CSphSourceParams_ODBC & tParams );
2373
2374 protected:
2375 virtual void SqlDismissResult ();
2376 virtual bool SqlQuery ( const char * sQuery );
2377 virtual bool SqlIsError ();
2378 virtual const char * SqlError ();
2379 virtual bool SqlConnect ();
2380 virtual void SqlDisconnect ();
2381 virtual int SqlNumFields();
2382 virtual bool SqlFetchRow();
2383 virtual const char * SqlColumn ( int iIndex );
2384 virtual const char * SqlFieldName ( int iIndex );
2385 virtual DWORD SqlColumnLength ( int iIndex );
2386
OdbcPostConnectCSphSource_ODBC2387 virtual void OdbcPostConnect () {}
2388
2389 protected:
2390 CSphString m_sOdbcDSN;
2391 bool m_bWinAuth;
2392 bool m_bUnicode;
2393
2394 SQLHENV m_hEnv;
2395 SQLHDBC m_hDBC;
2396 SQLHANDLE m_hStmt;
2397 int m_nResultCols;
2398 CSphString m_sError;
2399
2400 struct QueryColumn_t
2401 {
2402 CSphVector<char> m_dContents;
2403 CSphVector<char> m_dRaw;
2404 CSphString m_sName;
2405 SQLLEN m_iInd;
2406 int m_iBufferSize; ///< size of m_dContents and m_dRaw buffers, in bytes
2407 bool m_bUCS2; ///< whether this column needs UCS-2 to UTF-8 translation
2408 bool m_bTruncated; ///< whether data was truncated when fetching rows
2409 };
2410
2411 static const int DEFAULT_COL_SIZE = 1024; ///< default column buffer size
2412 static const int VARCHAR_COL_SIZE = 1048576; ///< default column buffer size for VARCHAR columns
2413 static const int MAX_COL_SIZE = 8*1048576; ///< hard limit on column buffer size
2414 static const int WARN_ROW_SIZE = 32*1048576; ///< warning thresh (NOT a hard limit) on row buffer size
2415
2416 CSphVector<QueryColumn_t> m_dColumns;
2417 SmallStringHash_T<int> m_hColBuffers;
2418
2419 void GetSqlError ( SQLSMALLINT iHandleType, SQLHANDLE hHandle );
2420 };
2421
2422
2423 /// MS SQL source implementation
2424 struct CSphSource_MSSQL : public CSphSource_ODBC
2425 {
CSphSource_MSSQLCSphSource_MSSQL2426 explicit CSphSource_MSSQL ( const char * sName ) : CSphSource_ODBC ( sName ) { m_bUnicode=true; }
2427 virtual void OdbcPostConnect ();
2428 };
2429 #endif // USE_ODBC
2430
2431
2432 #if USE_LIBEXPAT
2433 class CSphConfigSection;
2434 CSphSource * sphCreateSourceXmlpipe2 ( const CSphConfigSection * pSource, FILE * pPipe, const char * szSourceName, int iMaxFieldLen, bool bProxy, CSphString & sError );
2435 #endif
2436
2437
2438 /////////////////////////////////////////////////////////////////////////////
2439 // SEARCH QUERIES
2440 /////////////////////////////////////////////////////////////////////////////
2441
/// Search query sorting orders.
enum ESphSortOrder
{
	/// sort by document relevance desc, then by date
	SPH_SORT_RELEVANCE		= 0,

	/// sort by document date desc, then by relevance desc
	SPH_SORT_ATTR_DESC		= 1,

	/// sort by document date asc, then by relevance desc
	SPH_SORT_ATTR_ASC		= 2,

	/// sort by time segments (hour/day/week/etc) desc, then by relevance desc
	SPH_SORT_TIME_SEGMENTS	= 3,

	/// sort by SQL-like expression (eg. "@relevance DESC, price ASC, @id DESC")
	SPH_SORT_EXTENDED		= 4,

	/// sort by arithmetic expression in descending order (eg. "@id + max(@weight,1000)*boost + log(price)")
	SPH_SORT_EXPR			= 5,

	/// number of sorting orders; keep last
	SPH_SORT_TOTAL
};
2454
2455
/// Search query matching mode.
enum ESphMatchMode
{
	/// match all query words
	SPH_MATCH_ALL		= 0,

	/// match any query word
	SPH_MATCH_ANY		= 1,

	/// match this exact phrase
	SPH_MATCH_PHRASE	= 2,

	/// match this boolean query
	SPH_MATCH_BOOLEAN	= 3,

	/// match this extended query
	SPH_MATCH_EXTENDED	= 4,

	/// match all document IDs w/o fulltext query, apply filters
	SPH_MATCH_FULLSCAN	= 5,

	/// extended engine V2 (TEMPORARY, WILL BE REMOVED IN 0.9.8-RELEASE)
	SPH_MATCH_EXTENDED2	= 6,

	/// number of matching modes; keep last
	SPH_MATCH_TOTAL
};
2469
2470
/// Search query relevance ranking mode.
enum ESphRankMode
{
	/// default mode, phrase proximity major factor and BM25 minor one (aka SPH03)
	SPH_RANK_PROXIMITY_BM25	= 0,

	/// statistical mode, BM25 ranking only (faster but worse quality)
	SPH_RANK_BM25			= 1,

	/// no ranking, all matches get a weight of 1
	SPH_RANK_NONE			= 2,

	/// simple word-count weighting, rank is a weighted sum of per-field keyword occurrence counts
	SPH_RANK_WORDCOUNT		= 3,

	/// phrase proximity (aka SPH01)
	SPH_RANK_PROXIMITY		= 4,

	/// emulate old match-any weighting (aka SPH02)
	SPH_RANK_MATCHANY		= 5,

	/// sets bits where there were matches
	SPH_RANK_FIELDMASK		= 6,

	/// codename SPH04, phrase proximity + bm25 + head/exact boost
	SPH_RANK_SPH04			= 7,

	/// rank by user expression (eg. "sum(lcs*user_weight)*1000+bm25")
	SPH_RANK_EXPR			= 8,

	/// rank by BM25, but compute and export all user expression factors
	SPH_RANK_EXPORT			= 9,

	/// user-defined ranker
	SPH_RANK_PLUGIN			= 10,

	/// number of ranking modes
	SPH_RANK_TOTAL,

	/// the ranker used by default
	SPH_RANK_DEFAULT		= SPH_RANK_PROXIMITY_BM25
};
2489
2490
/// Search query grouping mode.
enum ESphGroupBy
{
	/// group by day
	SPH_GROUPBY_DAY			= 0,

	/// group by week
	SPH_GROUPBY_WEEK		= 1,

	/// group by month
	SPH_GROUPBY_MONTH		= 2,

	/// group by year
	SPH_GROUPBY_YEAR		= 3,

	/// group by attribute value
	SPH_GROUPBY_ATTR		= 4,

	/// group by sequential attrs pair (rendered redundant by 64bit attrs support; removed)
	SPH_GROUPBY_ATTRPAIR	= 5,

	/// group by on multiple attribute values
	SPH_GROUPBY_MULTIPLE	= 6
};
2502
2503
/// Search query filter types.
enum ESphFilter
{
	/// filter by integer values set
	SPH_FILTER_VALUES		= 0,

	/// filter by integer range
	SPH_FILTER_RANGE		= 1,

	/// filter by float range
	SPH_FILTER_FLOATRANGE	= 2,

	/// filter by string value
	SPH_FILTER_STRING		= 3,

	/// filter by NULL
	SPH_FILTER_NULL			= 4,

	/// filter by @uservar
	SPH_FILTER_USERVAR		= 5,

	/// filter by string list
	SPH_FILTER_STRING_LIST	= 6
};
2515
2516
2517 /// search query filter
2518 class CSphFilterSettings
2519 {
2520 public:
2521 CSphString m_sAttrName; ///< filtered attribute name
2522 bool m_bExclude; ///< whether this is "include" or "exclude" filter (default is "include")
2523 bool m_bHasEqual; ///< has filter "equal" component (gte\lte) or pure greater\less
2524
2525 ESphFilter m_eType; ///< filter type
2526 union
2527 {
2528 SphAttr_t m_iMinValue; ///< range min
2529 float m_fMinValue; ///< range min
2530 };
2531 union
2532 {
2533 SphAttr_t m_iMaxValue; ///< range max
2534 float m_fMaxValue; ///< range max
2535 };
2536 CSphVector<SphAttr_t> m_dValues; ///< integer values set
2537 CSphVector<CSphString> m_dStrings; ///< string values
2538
2539 public:
2540 CSphFilterSettings ();
2541
2542 void SetExternalValues ( const SphAttr_t * pValues, int nValues );
2543
GetValue(int iIdx)2544 SphAttr_t GetValue ( int iIdx ) const { assert ( iIdx<GetNumValues() ); return m_pValues ? m_pValues[iIdx] : m_dValues[iIdx]; }
GetValueArray()2545 const SphAttr_t * GetValueArray () const { return m_pValues ? m_pValues : &(m_dValues[0]); }
GetNumValues()2546 int GetNumValues () const { return m_pValues ? m_nValues : m_dValues.GetLength (); }
2547
2548 bool operator == ( const CSphFilterSettings & rhs ) const;
2549 bool operator != ( const CSphFilterSettings & rhs ) const { return !( (*this)==rhs ); }
2550
2551
2552
2553 protected:
2554 const SphAttr_t * m_pValues; ///< external value array
2555 int m_nValues; ///< external array size
2556 };
2557
2558
2559 // keyword info
2560 struct CSphKeywordInfo
2561 {
2562 CSphString m_sTokenized;
2563 CSphString m_sNormalized;
2564 int m_iDocs;
2565 int m_iHits;
2566 int m_iQpos;
2567 };
2568
Swap(CSphKeywordInfo & v1,CSphKeywordInfo & v2)2569 inline void Swap ( CSphKeywordInfo & v1, CSphKeywordInfo & v2 )
2570 {
2571 v1.m_sTokenized.Swap ( v2.m_sTokenized );
2572 v1.m_sNormalized.Swap ( v2.m_sNormalized );
2573 ::Swap ( v1.m_iDocs, v2.m_iDocs );
2574 ::Swap ( v1.m_iHits, v2.m_iHits );
2575 ::Swap ( v1.m_iQpos, v2.m_iQpos );
2576 }
2577
2578
2579 /// per-attribute value overrides
2580 class CSphAttrOverride
2581 {
2582 public:
2583 /// docid+attrvalue pair
2584 struct IdValuePair_t
2585 {
2586 SphDocID_t m_uDocID; ///< document ID
2587 union
2588 {
2589 SphAttr_t m_uValue; ///< attribute value
2590 float m_fValue; ///< attribute value
2591 };
2592
2593 inline bool operator < ( const IdValuePair_t & rhs ) const
2594 {
2595 return m_uDocID<rhs.m_uDocID;
2596 }
2597 };
2598
2599 public:
2600 CSphString m_sAttr; ///< attribute name
2601 ESphAttr m_eAttrType; ///< attribute type
2602 CSphVector<IdValuePair_t> m_dValues; ///< id-value overrides
2603 };
2604
2605
2606 /// query selection item
2607 struct CSphQueryItem
2608 {
2609 CSphString m_sExpr; ///< expression to compute
2610 CSphString m_sAlias; ///< alias to return
2611 ESphAggrFunc m_eAggrFunc;
2612
CSphQueryItemCSphQueryItem2613 CSphQueryItem() : m_eAggrFunc ( SPH_AGGR_NONE ) {}
2614 };
2615
2616
2617 /// table function interface
2618 class CSphQuery;
2619 struct AggrResult_t;
2620 class ISphTableFunc
2621 {
2622 public:
~ISphTableFunc()2623 virtual ~ISphTableFunc() {}
2624 virtual bool ValidateArgs ( const CSphVector<CSphString> & dArgs, const CSphQuery & tQuery, CSphString & sError ) = 0;
2625 virtual bool Process ( AggrResult_t * pResult, CSphString & sError ) = 0;
LimitPushdown(int,int)2626 virtual bool LimitPushdown ( int, int ) { return false; } // FIXME! implement this
2627 };
2628
2629
2630 /// search query
2631 class CSphQuery
2632 {
2633 public:
2634 CSphString m_sIndexes; ///< indexes to search
2635 CSphString m_sQuery; ///< cooked query string for the engine (possibly transformed during legacy matching modes fixup)
2636 CSphString m_sRawQuery; ///< raw query string from the client for searchd log, agents, etc
2637
2638 int m_iOffset; ///< offset into result set (as X in MySQL LIMIT X,Y clause)
2639 int m_iLimit; ///< limit into result set (as Y in MySQL LIMIT X,Y clause)
2640 DWORD * m_pWeights; ///< user-supplied per-field weights. may be NULL. default is NULL. NOT OWNED, WILL NOT BE FREED in dtor.
2641 int m_iWeights; ///< number of user-supplied weights. missing fields will be assigned weight 1. default is 0
2642 ESphMatchMode m_eMode; ///< match mode. default is "match all"
2643 ESphRankMode m_eRanker; ///< ranking mode, default is proximity+BM25
2644 CSphString m_sRankerExpr; ///< ranking expression for SPH_RANK_EXPR
2645 CSphString m_sUDRanker; ///< user-defined ranker name
2646 CSphString m_sUDRankerOpts; ///< user-defined ranker options
2647 ESphSortOrder m_eSort; ///< sort mode
2648 CSphString m_sSortBy; ///< attribute to sort by
2649 int64_t m_iRandSeed; ///< random seed for ORDER BY RAND(), -1 means do not set
2650 int m_iMaxMatches; ///< max matches to retrieve, default is 1000. more matches use more memory and CPU time to hold and sort them
2651
2652 bool m_bSortKbuffer; ///< whether to use PQ or K-buffer sorting algorithm
2653 bool m_bZSlist; ///< whether the ranker has to fetch the zonespanlist with this query
2654 bool m_bSimplify; ///< whether to apply boolean simplification
2655 bool m_bPlainIDF; ///< whether to use PlainIDF=log(N/n) or NormalizedIDF=log((N-n+1)/n)
2656 bool m_bGlobalIDF; ///< whether to use local indexes or a global idf file
2657 bool m_bNormalizedTFIDF; ///< whether to scale IDFs by query word count, so that TF*IDF is normalized
2658 bool m_bLocalDF; ///< whether to use calculate DF among local indexes
2659
2660 CSphVector<CSphFilterSettings> m_dFilters; ///< filters
2661
2662 CSphString m_sGroupBy; ///< group-by attribute name(s)
2663 CSphString m_sFacetBy; ///< facet-by attribute name(s)
2664 ESphGroupBy m_eGroupFunc; ///< function to pre-process group-by attribute value with
2665 CSphString m_sGroupSortBy; ///< sorting clause for groups in group-by mode
2666 CSphString m_sGroupDistinct; ///< count distinct values for this attribute
2667
2668 int m_iCutoff; ///< matches count threshold to stop searching at (default is 0; means to search until all matches are found)
2669
2670 int m_iRetryCount; ///< retry count, for distributed queries
2671 int m_iRetryDelay; ///< retry delay, for distributed queries
2672 int m_iAgentQueryTimeout; ///< agent query timeout override, for distributed queries
2673
2674 bool m_bGeoAnchor; ///< do we have an anchor
2675 CSphString m_sGeoLatAttr; ///< latitude attr name
2676 CSphString m_sGeoLongAttr; ///< longitude attr name
2677 float m_fGeoLatitude; ///< anchor latitude
2678 float m_fGeoLongitude; ///< anchor longitude
2679
2680 CSphVector<CSphNamedInt> m_dIndexWeights; ///< per-index weights
2681 CSphVector<CSphNamedInt> m_dFieldWeights; ///< per-field weights
2682
2683 DWORD m_uMaxQueryMsec; ///< max local index search time, in milliseconds (default is 0; means no limit)
2684 int m_iMaxPredictedMsec; ///< max predicted (!) search time limit, in milliseconds (0 means no limit)
2685 CSphString m_sComment; ///< comment to pass verbatim in the log file
2686
2687 CSphVector<CSphAttrOverride> m_dOverrides; ///< per-query attribute value overrides
2688
2689 CSphString m_sSelect; ///< select-list (attributes and/or expressions)
2690 CSphString m_sOrderBy; ///< order-by clause
2691
2692 CSphString m_sOuterOrderBy; ///< temporary (?) subselect hack
2693 int m_iOuterOffset; ///< keep and apply outer offset at master
2694 int m_iOuterLimit;
2695 bool m_bHasOuter;
2696
2697 bool m_bReverseScan; ///< perform scan in reverse order
2698 bool m_bIgnoreNonexistent; ///< whether to warning or not about non-existent columns in select list
2699 bool m_bIgnoreNonexistentIndexes; ///< whether to error or not about non-existent indexes in index list
2700 bool m_bStrict; ///< whether to warning or not about incompatible types
2701
2702 ISphTableFunc * m_pTableFunc; ///< post-query NOT OWNED, WILL NOT BE FREED in dtor.
2703 CSphFilterSettings m_tHaving; ///< post aggregate filtering (got applied only on master)
2704
2705 public:
2706 int m_iSQLSelectStart; ///< SQL parser helper
2707 int m_iSQLSelectEnd; ///< SQL parser helper
2708
2709 int m_iGroupbyLimit; ///< number of elems within group
2710
2711 public:
2712 CSphVector<CSphQueryItem> m_dItems; ///< parsed select-list
2713 ESphCollation m_eCollation; ///< ORDER BY collation
2714 bool m_bAgent; ///< agent mode (may need extra cols on output)
2715
2716 CSphString m_sQueryTokenFilterLib; ///< token filter library name
2717 CSphString m_sQueryTokenFilterName; ///< token filter name
2718 CSphString m_sQueryTokenFilterOpts; ///< token filter options
2719
2720 public:
2721 CSphQuery (); ///< ctor, fills defaults
2722 ~CSphQuery (); ///< dtor, frees owned stuff
2723
2724 /// parse select list string into items
2725 bool ParseSelectList ( CSphString & sError );
2726 bool m_bFacet; ///< whether this a facet query
2727 };
2728
2729
2730 /// some low-level query stats
2731 struct CSphQueryStats
2732 {
2733 int64_t * m_pNanoBudget; ///< pointer to max_predicted_time budget (counted in nanosec)
2734 DWORD m_iFetchedDocs; ///< processed documents
2735 DWORD m_iFetchedHits; ///< processed hits (aka positions)
2736 DWORD m_iSkips; ///< number of Skip() calls
2737
2738 CSphQueryStats();
2739
2740 void Add ( const CSphQueryStats & tStats );
2741 };
2742
2743
2744 /// search query meta-info
2745 class CSphQueryResultMeta
2746 {
2747 public:
2748 int m_iQueryTime; ///< query time, milliseconds
2749 int m_iRealQueryTime; ///< query time, measured just from start to finish of the query. In milliseconds
2750 int64_t m_iCpuTime; ///< user time, microseconds
2751 int m_iMultiplier; ///< multi-query multiplier, -1 to indicate error
2752
2753 struct WordStat_t
2754 {
2755 int64_t m_iDocs; ///< document count for this term
2756 int64_t m_iHits; ///< hit count for this term
2757
WordStat_tWordStat_t2758 WordStat_t()
2759 : m_iDocs ( 0 )
2760 , m_iHits ( 0 )
2761 {}
2762 };
2763 SmallStringHash_T<WordStat_t> m_hWordStats; ///< hash of i-th search term (normalized word form)
2764
2765 int m_iMatches; ///< total matches returned (upto MAX_MATCHES)
2766 int64_t m_iTotalMatches; ///< total matches found (unlimited)
2767
2768 CSphIOStats m_tIOStats; ///< i/o stats for the query
2769 int64_t m_iAgentCpuTime; ///< agent cpu time (for distributed searches)
2770 CSphIOStats m_tAgentIOStats; ///< agent IO stats (for distributed searches)
2771
2772 int64_t m_iPredictedTime; ///< local predicted time
2773 int64_t m_iAgentPredictedTime; ///< distributed predicted time
2774 DWORD m_iAgentFetchedDocs; ///< distributed fetched docs
2775 DWORD m_iAgentFetchedHits; ///< distributed fetched hits
2776 DWORD m_iAgentFetchedSkips; ///< distributed fetched skips
2777
2778 CSphQueryStats m_tStats; ///< query prediction counters
2779 bool m_bHasPrediction; ///< is prediction counters set?
2780
2781 CSphString m_sError; ///< error message
2782 CSphString m_sWarning; ///< warning message
2783 int64_t m_iBadRows;
2784
2785 CSphQueryResultMeta (); ///< ctor
~CSphQueryResultMeta()2786 virtual ~CSphQueryResultMeta () {} ///< dtor
2787 void AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits );
2788 };
2789
2790
2791 /// search query result (meta-info plus actual matches)
2792 class CSphQueryProfile;
2793 class CSphQueryResult : public CSphQueryResultMeta
2794 {
2795 public:
2796 CSphSwapVector<CSphMatch> m_dMatches; ///< top matching documents, no more than MAX_MATCHES
2797
2798 CSphRsetSchema m_tSchema; ///< result schema
2799 const DWORD * m_pMva; ///< pointer to MVA storage
2800 const BYTE * m_pStrings; ///< pointer to strings storage
2801 bool m_bArenaProhibit;
2802
2803 CSphVector<BYTE *> m_dStorage2Free; /// < aggregated external storage from rt indexes
2804
2805 int m_iOffset; ///< requested offset into matches array
2806 int m_iCount; ///< count which will be actually served (computed from total, offset and limit)
2807
2808 int m_iSuccesses;
2809
2810 CSphQueryProfile * m_pProfile; ///< filled when query profiling is enabled; NULL otherwise
2811
2812 public:
2813 CSphQueryResult (); ///< ctor
2814 virtual ~CSphQueryResult (); ///< dtor, which releases all owned stuff
2815
2816 void LeakStorages ( CSphQueryResult & tDst );
2817 };
2818
2819 /////////////////////////////////////////////////////////////////////////////
2820 // ATTRIBUTE UPDATE QUERY
2821 /////////////////////////////////////////////////////////////////////////////
2822
2823 struct CSphAttrUpdate
2824 {
2825 CSphVector<char*> m_dAttrs; ///< update schema, attr names to update
2826 CSphVector<ESphAttr> m_dTypes; ///< update schema, attr types to update
2827 CSphVector<DWORD> m_dPool; ///< update values pool
2828 CSphVector<SphDocID_t> m_dDocids; ///< document IDs vector
2829 CSphVector<const CSphRowitem*> m_dRows; ///< document attribute's vector, used instead of m_dDocids.
2830 CSphVector<int> m_dRowOffset; ///< document row offsets in the pool (1 per doc, i.e. the length is the same as of m_dDocids)
2831 bool m_bIgnoreNonexistent; ///< whether to warn about non-existen attrs, or just silently ignore them
2832 bool m_bStrict; ///< whether to check for incompatible types first, or just ignore them
2833
CSphAttrUpdateCSphAttrUpdate2834 CSphAttrUpdate()
2835 : m_bIgnoreNonexistent ( false )
2836 , m_bStrict ( false )
2837 {}
2838
~CSphAttrUpdateCSphAttrUpdate2839 ~CSphAttrUpdate()
2840 {
2841 ARRAY_FOREACH ( i, m_dAttrs )
2842 SafeDeleteArray ( m_dAttrs[i] );
2843 }
2844 };
2845
2846 /////////////////////////////////////////////////////////////////////////////
2847 // FULLTEXT INDICES
2848 /////////////////////////////////////////////////////////////////////////////
2849
2850 /// progress info
2851 struct CSphIndexProgress
2852 {
2853 enum Phase_e
2854 {
2855 PHASE_COLLECT, ///< document collection phase
2856 PHASE_SORT, ///< final sorting phase
2857 PHASE_COLLECT_MVA, ///< multi-valued attributes collection phase
2858 PHASE_SORT_MVA, ///< multi-valued attributes collection phase
2859 PHASE_MERGE, ///< index merging
2860
2861 PHASE_PREREAD, ///< searchd startup, prereading data
2862 PHASE_PRECOMPUTE ///< searchd startup, indexing attributes
2863 };
2864
2865 Phase_e m_ePhase; ///< current indexing phase
2866
2867 int64_t m_iDocuments; ///< PHASE_COLLECT: documents collected so far
2868 int64_t m_iBytes; ///< PHASE_COLLECT: bytes collected so far;
2869 ///< PHASE_PREREAD: bytes read so far;
2870 int64_t m_iBytesTotal; ///< PHASE_PREREAD: total bytes to read;
2871
2872 int64_t m_iAttrs; ///< PHASE_COLLECT_MVA, PHASE_SORT_MVA: attrs processed so far
2873 int64_t m_iAttrsTotal; ///< PHASE_SORT_MVA: attrs total
2874
2875 SphOffset_t m_iHits; ///< PHASE_SORT: hits sorted so far
2876 SphOffset_t m_iHitsTotal; ///< PHASE_SORT: hits total
2877
2878 int m_iWords; ///< PHASE_MERGE: words merged so far
2879
2880 int m_iDone; ///< generic percent, 0..1000 range
2881
2882 typedef void ( *IndexingProgress_fn ) ( const CSphIndexProgress * pStat, bool bPhaseEnd );
2883 IndexingProgress_fn m_fnProgress;
2884
CSphIndexProgressCSphIndexProgress2885 CSphIndexProgress ()
2886 : m_ePhase ( PHASE_COLLECT )
2887 , m_iDocuments ( 0 )
2888 , m_iBytes ( 0 )
2889 , m_iBytesTotal ( 0 )
2890 , m_iAttrs ( 0 )
2891 , m_iAttrsTotal ( 0 )
2892 , m_iHits ( 0 )
2893 , m_iHitsTotal ( 0 )
2894 , m_iWords ( 0 )
2895 , m_fnProgress ( NULL )
2896 {}
2897
2898 /// builds a message to print
2899 /// WARNING, STATIC BUFFER, NON-REENTRANT
2900 const char * BuildMessage() const;
2901
2902 void Show ( bool bPhaseEnd ) const;
2903 };
2904
2905
/// match sorting functions
/// (values are spelled out; they are the same ones implicit numbering produced)
enum ESphSortFunc
{
	FUNC_REL_DESC	= 0,
	FUNC_ATTR_DESC	= 1,
	FUNC_ATTR_ASC	= 2,
	FUNC_TIMESEGS	= 3,
	FUNC_GENERIC2	= 4,
	FUNC_GENERIC3	= 5,
	FUNC_GENERIC4	= 6,
	FUNC_GENERIC5	= 7,
	FUNC_EXPR		= 8
};
2919
2920
/// outcomes of parsing a match sorting clause
/// (values are spelled out; they are the same ones implicit numbering produced)
enum ESortClauseParseResult
{
	SORT_CLAUSE_OK		= 0,
	SORT_CLAUSE_ERROR	= 1,
	SORT_CLAUSE_RANDOM	= 2
};
2928
2929
/// sorting key part types
/// (values are spelled out; they are the same ones implicit numbering produced)
enum ESphSortKeyPart
{
	SPH_KEYPART_ID			= 0,
	SPH_KEYPART_WEIGHT		= 1,
	SPH_KEYPART_INT			= 2,
	SPH_KEYPART_FLOAT		= 3,
	SPH_KEYPART_STRING		= 4,
	SPH_KEYPART_STRINGPTR	= 5
};
2940
2941
2942 /// JSON key lookup stuff
2943 struct JsonKey_t
2944 {
2945 CSphString m_sKey; ///< name string
2946 DWORD m_uMask; ///< Bloom mask for this key
2947 int m_iLen; ///< name length, in bytes
2948
2949 JsonKey_t ();
2950 explicit JsonKey_t ( const char * sKey, int iLen );
2951 };
2952
2953 typedef int ( *SphStringCmp_fn )( const BYTE * pStr1, const BYTE * pStr2, bool bPacked );
2954
2955 /// match comparator state
2956 struct CSphMatchComparatorState
2957 {
2958 static const int MAX_ATTRS = 5;
2959
2960 ESphSortKeyPart m_eKeypart[MAX_ATTRS]; ///< sort-by key part type
2961 CSphAttrLocator m_tLocator[MAX_ATTRS]; ///< sort-by attr locator
2962 JsonKey_t m_tSubKeys[MAX_ATTRS]; ///< sort-by attr sub-locator
2963 ISphExpr * m_tSubExpr[MAX_ATTRS]; ///< sort-by attr expression
2964 ESphAttr m_tSubType[MAX_ATTRS]; ///< sort-by expression type
2965 int m_dAttrs[MAX_ATTRS]; ///< sort-by attr index
2966
2967 DWORD m_uAttrDesc; ///< sort order mask (if i-th bit is set, i-th attr order is DESC)
2968 DWORD m_iNow; ///< timestamp (for timesegments sorting mode)
2969 SphStringCmp_fn m_fnStrCmp; ///< string comparator
2970
2971
2972 /// create default empty state
CSphMatchComparatorStateCSphMatchComparatorState2973 CSphMatchComparatorState ()
2974 : m_uAttrDesc ( 0 )
2975 , m_iNow ( 0 )
2976 , m_fnStrCmp ( NULL )
2977 {
2978 for ( int i=0; i<MAX_ATTRS; i++ )
2979 {
2980 m_eKeypart[i] = SPH_KEYPART_ID;
2981 m_dAttrs[i] = -1;
2982 }
2983 }
2984
2985 /// check if any of my attrs are bitfields
UsesBitfieldsCSphMatchComparatorState2986 bool UsesBitfields ()
2987 {
2988 for ( int i=0; i<MAX_ATTRS; i++ )
2989 if ( m_eKeypart[i]==SPH_KEYPART_INT && m_tLocator[i].IsBitfield() )
2990 return true;
2991 return false;
2992 }
2993
CmpStringsCSphMatchComparatorState2994 inline int CmpStrings ( const CSphMatch & a, const CSphMatch & b, int iAttr ) const
2995 {
2996 assert ( iAttr>=0 && iAttr<MAX_ATTRS );
2997 assert ( m_eKeypart[iAttr]==SPH_KEYPART_STRING || m_eKeypart[iAttr]==SPH_KEYPART_STRINGPTR );
2998 assert ( m_fnStrCmp );
2999
3000 const BYTE * aa = (const BYTE*) a.GetAttr ( m_tLocator[iAttr] );
3001 const BYTE * bb = (const BYTE*) b.GetAttr ( m_tLocator[iAttr] );
3002 if ( aa==NULL || bb==NULL )
3003 {
3004 if ( aa==bb )
3005 return 0;
3006 if ( aa==NULL )
3007 return -1;
3008 return 1;
3009 }
3010 return m_fnStrCmp ( aa, bb, ( m_eKeypart[iAttr]==SPH_KEYPART_STRING ) );
3011 }
3012 };
3013
3014
3015 /// match processor interface
3016 struct ISphMatchProcessor
3017 {
~ISphMatchProcessorISphMatchProcessor3018 virtual ~ISphMatchProcessor () {}
3019 virtual void Process ( CSphMatch * pMatch ) = 0;
3020 };
3021
3022
3023 /// generic match sorter interface
3024 class ISphMatchSorter
3025 {
3026 public:
3027 bool m_bRandomize;
3028 int64_t m_iTotal;
3029
3030 SphDocID_t m_iJustPushed;
3031 int m_iMatchCapacity;
3032 CSphTightVector<SphDocID_t> m_dJustPopped;
3033
3034 protected:
3035 CSphRsetSchema m_tSchema; ///< sorter schema (adds dynamic attributes on top of index schema)
3036 CSphMatchComparatorState m_tState; ///< protected to set m_iNow automatically on SetState() calls
3037
3038 public:
3039 /// ctor
ISphMatchSorter()3040 ISphMatchSorter () : m_bRandomize ( false ), m_iTotal ( 0 ), m_iJustPushed ( 0 ), m_iMatchCapacity ( 0 ) {}
3041
3042 /// virtualizing dtor
~ISphMatchSorter()3043 virtual ~ISphMatchSorter () {}
3044
3045 /// check if this sorter needs attr values
3046 virtual bool UsesAttrs () const = 0;
3047
3048 // check if sorter might be used in multi-queue
3049 virtual bool CanMulti () const = 0;
3050
3051 /// check if this sorter does groupby
3052 virtual bool IsGroupby () const = 0;
3053
3054 /// set match comparator state
3055 virtual void SetState ( const CSphMatchComparatorState & tState );
3056
3057 /// get match comparator stat
GetState()3058 virtual CSphMatchComparatorState & GetState() { return m_tState; }
3059
3060 /// set group comparator state
SetGroupState(const CSphMatchComparatorState &)3061 virtual void SetGroupState ( const CSphMatchComparatorState & ) {}
3062
3063 /// set MVA pool pointer (for MVA+groupby sorters)
SetMVAPool(const DWORD *,bool)3064 virtual void SetMVAPool ( const DWORD *, bool ) {}
3065
3066 /// set string pool pointer (for string+groupby sorters)
SetStringPool(const BYTE *)3067 virtual void SetStringPool ( const BYTE * ) {}
3068
3069 /// set sorter schema by swapping in and (optionally) adjusting the argument
SetSchema(CSphRsetSchema & tSchema)3070 virtual void SetSchema ( CSphRsetSchema & tSchema ) { m_tSchema = tSchema; }
3071
3072 /// get incoming schema
GetSchema()3073 virtual const CSphRsetSchema & GetSchema () const { return m_tSchema; }
3074
3075 /// base push
3076 /// returns false if the entry was rejected as duplicate
3077 /// returns true otherwise (even if it was not actually inserted)
3078 virtual bool Push ( const CSphMatch & tEntry ) = 0;
3079
3080 /// submit pre-grouped match. bNewSet indicates that the match begins the bunch of matches got from one source
3081 virtual bool PushGrouped ( const CSphMatch & tEntry, bool bNewSet ) = 0;
3082
3083 /// get rough entries count, due of aggregate filtering phase
3084 virtual int GetLength () const = 0;
3085
3086 /// get internal buffer length
3087 virtual int GetDataLength () const = 0;
3088
3089 /// get total count of non-duplicates Push()ed through this queue
GetTotalCount()3090 virtual int64_t GetTotalCount () const { return m_iTotal; }
3091
3092 /// process collected entries up to length count
3093 virtual void Finalize ( ISphMatchProcessor & tProcessor, bool bCallProcessInResultSetOrder ) = 0;
3094
3095 /// store all entries into specified location and remove them from the queue
3096 /// entries are stored in properly sorted order,
3097 /// if iTag is non-negative, entries are also tagged; otherwise, their tag's unchanged
3098 /// return sored entries count, might be less than length due of aggregate filtering phase
3099 virtual int Flatten ( CSphMatch * pTo, int iTag ) = 0;
3100
3101 /// get a pointer to the worst element, NULL if there is no fixed location
GetWorst()3102 virtual const CSphMatch * GetWorst() const { return NULL; }
3103 };
3104
3105
/// available docinfo storage strategies
enum ESphDocinfo
{
	/// no docinfo available
	SPH_DOCINFO_NONE = 0,

	/// inline docinfo into index (specifically, into doclists)
	SPH_DOCINFO_INLINE = 1,

	/// store docinfo separately
	SPH_DOCINFO_EXTERN = 2
};
3113
3114
/// how many hits are present in the index
enum ESphHitless
{
	/// all hits are present
	SPH_HITLESS_NONE = 0,

	/// some of the hits might be omitted (check the flag bit)
	SPH_HITLESS_SOME = 1,

	/// no hits in this index
	SPH_HITLESS_ALL = 2
};
3121
3122
/// hit storage formats
enum ESphHitFormat
{
	/// all hits are stored in hitlist
	SPH_HIT_FORMAT_PLAIN = 0,

	/// hits can be split and inlined into doclist (aka 9-23)
	SPH_HIT_FORMAT_INLINE = 1
};
3128
3129
/// RLP tokenization modes
enum ESphRLPFilter
{
	/// rlp not used
	SPH_RLP_NONE = 0,

	/// rlp used to tokenize every document
	SPH_RLP_PLAIN = 1,

	/// rlp used to batch documents and tokenize several documents at once
	SPH_RLP_BATCHED = 2
};
3136
3137
3138 struct CSphIndexSettings : public CSphSourceSettings
3139 {
3140 ESphDocinfo m_eDocinfo;
3141 ESphHitFormat m_eHitFormat;
3142 bool m_bHtmlStrip;
3143 CSphString m_sHtmlIndexAttrs;
3144 CSphString m_sHtmlRemoveElements;
3145 CSphString m_sZones;
3146 ESphHitless m_eHitless;
3147 CSphString m_sHitlessFiles;
3148 bool m_bVerbose;
3149 int m_iEmbeddedLimit;
3150
3151 ESphBigram m_eBigramIndex;
3152 CSphString m_sBigramWords;
3153 CSphVector<CSphString> m_dBigramWords;
3154
3155 DWORD m_uAotFilterMask; ///< lemmatize_XX_all forces us to transform queries on the index level too
3156 ESphRLPFilter m_eChineseRLP; ///< chinese RLP filter
3157 CSphString m_sRLPContext; ///< path to RLP context file
3158
3159 CSphString m_sIndexTokenFilter; ///< indexing time token filter spec string (pretty useless for disk, vital for RT)
3160
3161 CSphIndexSettings ();
3162 };
3163
3164
/// forward declarations of internal searcher types;
/// this header only refers to them by name, so their definitions are not needed here
class	ISphQword;
class	ISphQwordSetup;
class	CSphQueryContext;
struct	ISphFilter;
3170
3171
3172 struct ISphKeywordsStat
3173 {
~ISphKeywordsStatISphKeywordsStat3174 virtual ~ISphKeywordsStat() {}
3175 virtual bool FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const = 0;
3176 };
3177
3178
/// Additional index info, as filled by CSphIndex::GetStatus().
/// Every field starts out as zero.
struct CSphIndexStatus
{
	int64_t			m_iRamUse;
	int64_t			m_iDiskUse;

	// the three fields below are not used for plain indexes
	int64_t			m_iRamChunkSize;
	int				m_iNumChunks;
	int64_t			m_iMemLimit;

	/// zero everything
	CSphIndexStatus()
		: m_iRamUse ( 0 ), m_iDiskUse ( 0 )
		, m_iRamChunkSize ( 0 ), m_iNumChunks ( 0 ), m_iMemLimit ( 0 )
	{
	}
};
3195
3196 struct KillListTrait_t
3197 {
3198 const SphDocID_t * m_pBegin;
3199 int m_iLen;
3200 };
3201
3202 typedef CSphVector<KillListTrait_t> KillListVector;
3203
3204 struct CSphMultiQueryArgs : public ISphNoncopyable
3205 {
3206 const KillListVector & m_dKillList;
3207 const int m_iIndexWeight;
3208 int m_iTag;
3209 DWORD m_uPackedFactorFlags;
3210 bool m_bLocalDF;
3211 const SmallStringHash_T<int64_t> * m_pLocalDocs;
3212 int64_t m_iTotalDocs;
3213
3214 CSphMultiQueryArgs ( const KillListVector & dKillList, int iIndexWeight );
3215 };
3216
3217
3218 /// generic fulltext index interface
3219 class CSphIndex : public ISphKeywordsStat
3220 {
3221 public:
3222
3223 enum
3224 {
3225 ATTRS_UPDATED = ( 1UL<<0 ),
3226 ATTRS_MVA_UPDATED = ( 1UL<<1 ),
3227 ATTRS_STRINGS_UPDATED = ( 1UL<<2 )
3228 };
3229
3230 public:
3231 explicit CSphIndex ( const char * sIndexName, const char * sFilename );
3232 virtual ~CSphIndex ();
3233
GetLastError()3234 virtual const CSphString & GetLastError () const { return m_sLastError; }
GetLastWarning()3235 virtual const CSphString & GetLastWarning () const { return m_sLastWarning; }
GetMatchSchema()3236 virtual const CSphSchema & GetMatchSchema () const { return m_tSchema; } ///< match schema as returned in result set (possibly different from internal storage schema!)
3237
3238 virtual void SetProgressCallback ( CSphIndexProgress::IndexingProgress_fn pfnProgress ) = 0;
3239 virtual void SetInplaceSettings ( int iHitGap, int iDocinfoGap, float fRelocFactor, float fWriteFactor );
SetPreopen(bool bValue)3240 virtual void SetPreopen ( bool bValue ) { m_bKeepFilesOpen = bValue; }
3241 void SetFieldFilter ( ISphFieldFilter * pFilter );
3242 void SetTokenizer ( ISphTokenizer * pTokenizer );
3243 void SetupQueryTokenizer();
GetTokenizer()3244 const ISphTokenizer * GetTokenizer () const { return m_pTokenizer; }
GetQueryTokenizer()3245 const ISphTokenizer * GetQueryTokenizer () const { return m_pQueryTokenizer; }
3246 ISphTokenizer * LeakTokenizer ();
3247 void SetDictionary ( CSphDict * pDict );
GetDictionary()3248 CSphDict * GetDictionary () const { return m_pDict; }
3249 CSphDict * LeakDictionary ();
SetKeepAttrs(const CSphString &)3250 virtual void SetKeepAttrs ( const CSphString & ) {}
3251 void Setup ( const CSphIndexSettings & tSettings );
GetSettings()3252 const CSphIndexSettings & GetSettings () const { return m_tSettings; }
IsStripperInited()3253 bool IsStripperInited () const { return m_bStripperInited; }
3254 virtual SphDocID_t * GetKillList () const = 0;
3255 virtual int GetKillListSize () const = 0;
3256 virtual bool HasDocid ( SphDocID_t uDocid ) const = 0;
IsRT()3257 virtual bool IsRT() const { return false; }
SetBinlog(bool bBinlog)3258 void SetBinlog ( bool bBinlog ) { m_bBinlog = bBinlog; }
GetFieldLens()3259 virtual int64_t * GetFieldLens() const { return NULL; }
3260
IsStarDict()3261 virtual bool IsStarDict() const { return true; }
3262
3263 public:
3264 /// build index by indexing given sources
3265 virtual int Build ( const CSphVector<CSphSource*> & dSources, int iMemoryLimit, int iWriteBuffer ) = 0;
3266
3267 /// build index by mering current index with given index
3268 virtual bool Merge ( CSphIndex * pSource, const CSphVector<CSphFilterSettings> & dFilters, bool bMergeKillLists ) = 0;
3269
3270 public:
3271 /// check all data files, preload schema, and preallocate enough shared RAM to load memory-cached data
3272 virtual bool Prealloc ( bool bMlock, bool bStripPath, CSphString & sWarning ) = 0;
3273
3274 /// deallocate all previously preallocated shared data
3275 virtual void Dealloc () = 0;
3276
3277 /// precache everything which needs to be precached
3278 // WARNING, WILL BE CALLED FROM DIFFERENT PROCESS, MUST ONLY MODIFY SHARED MEMORY
3279 virtual bool Preread () = 0;
3280
3281 /// set new index base path
3282 virtual void SetBase ( const char * sNewBase ) = 0;
3283
3284 /// set new index base path, and physically rename index files too
3285 virtual bool Rename ( const char * sNewBase ) = 0;
3286
3287 /// obtain exclusive lock on this index
3288 virtual bool Lock () = 0;
3289
3290 /// dismiss exclusive lock and unlink lock file
3291 virtual void Unlock () = 0;
3292
3293 /// relock shared RAM (only on daemonization)
3294 virtual bool Mlock () = 0;
3295
3296 /// keep attributes on disk and map them via file memory mapping
SetEnableOndiskAttributes(bool)3297 virtual void SetEnableOndiskAttributes ( bool ) {}
3298
3299 /// called when index is loaded and prepared to work
3300 virtual void PostSetup() = 0;
3301
3302 public:
3303 /// return index document, bytes totals (FIXME? remove this in favor of GetStatus() maybe?)
3304 virtual const CSphSourceStats & GetStats () const = 0;
3305
3306 /// return additional index info
3307 virtual void GetStatus ( CSphIndexStatus* ) const = 0;
3308
3309 public:
3310 virtual bool EarlyReject ( CSphQueryContext * pCtx, CSphMatch & tMatch ) const = 0;
3311 void SetCacheSize ( int iMaxCachedDocs, int iMaxCachedHits );
3312 virtual bool MultiQuery ( const CSphQuery * pQuery, CSphQueryResult * pResult, int iSorters, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const = 0;
3313 virtual bool MultiQueryEx ( int iQueries, const CSphQuery * ppQueries, CSphQueryResult ** ppResults, ISphMatchSorter ** ppSorters, const CSphMultiQueryArgs & tArgs ) const = 0;
3314 virtual bool GetKeywords ( CSphVector <CSphKeywordInfo> & dKeywords, const char * szQuery, bool bGetStats, CSphString * pError ) const = 0;
3315 virtual bool FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const = 0;
3316
3317 public:
3318 /// updates memory-cached attributes in real time
3319 /// returns non-negative amount of actually found and updated records on success
3320 /// on failure, -1 is returned and GetLastError() contains error message
3321 virtual int UpdateAttributes ( const CSphAttrUpdate & tUpd, int iIndex, CSphString & sError, CSphString & sWarning ) = 0;
3322
3323 /// saves memory-cached attributes, if there were any updates to them
3324 /// on failure, false is returned and GetLastError() contains error message
3325 virtual bool SaveAttributes ( CSphString & sError ) const = 0;
3326
3327 virtual DWORD GetAttributeStatus () const = 0;
3328
3329 virtual bool CreateModifiedFiles ( bool bAddAttr, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError ) = 0;
3330
3331 virtual bool AddRemoveAttribute ( bool bAdd, const CSphString & sAttrName, ESphAttr eAttrType, int iPos, CSphString & sError ) = 0;
3332
3333 public:
3334 /// internal debugging hook, DO NOT USE
3335 virtual void DebugDumpHeader ( FILE * fp, const char * sHeaderName, bool bConfig ) = 0;
3336
3337 /// internal debugging hook, DO NOT USE
3338 virtual void DebugDumpDocids ( FILE * fp ) = 0;
3339
3340 /// internal debugging hook, DO NOT USE
3341 virtual void DebugDumpHitlist ( FILE * fp, const char * sKeyword, bool bID ) = 0;
3342
3343 /// internal debugging hook, DO NOT USE
3344 virtual void DebugDumpDict ( FILE * fp ) = 0;
3345
3346 /// internal debugging hook, DO NOT USE
3347 virtual int DebugCheck ( FILE * fp ) = 0;
SetDebugCheck()3348 virtual void SetDebugCheck () {}
3349
3350 /// getter for name
GetName()3351 const char * GetName () { return m_sIndexName.cstr(); }
3352
SetName(const char * sName)3353 void SetName ( const char * sName ) { m_sIndexName = sName; }
3354
3355 /// get for the base file name
GetFilename()3356 const char * GetFilename () const { return m_sFilename.cstr(); }
3357
3358 /// internal make document id list from external docinfo, DO NOT USE
3359 virtual bool BuildDocList ( SphAttr_t ** ppDocList, int64_t * pCount, CSphString * pError ) const;
3360
3361 /// internal replace kill-list and rewrite spk file, DO NOT USE
ReplaceKillList(const SphDocID_t *,int)3362 virtual bool ReplaceKillList ( const SphDocID_t *, int ) { return true; }
3363
3364 public:
3365 int64_t m_iTID;
3366
3367 bool m_bExpandKeywords; ///< enable automatic query-time keyword expansion (to "( word | =word | *word* )")
3368 int m_iExpansionLimit;
3369
3370 protected:
3371
3372 CSphSchema m_tSchema;
3373 CSphString m_sLastError;
3374 CSphString m_sLastWarning;
3375
3376 bool m_bInplaceSettings;
3377 int m_iHitGap;
3378 int m_iDocinfoGap;
3379 float m_fRelocFactor;
3380 float m_fWriteFactor;
3381
3382 bool m_bKeepFilesOpen; ///< keep files open to avoid race on seamless rotation
3383 bool m_bBinlog;
3384
3385 bool m_bStripperInited; ///< was stripper initialized (old index version (<9) handling)
3386
3387 public:
3388 bool m_bId32to64; ///< did we convert id32 to id64 on startup
3389
3390 protected:
3391 CSphIndexSettings m_tSettings;
3392
3393 ISphFieldFilter * m_pFieldFilter;
3394 ISphTokenizer * m_pTokenizer;
3395 ISphTokenizer * m_pQueryTokenizer;
3396 CSphDict * m_pDict;
3397
3398 int m_iMaxCachedDocs;
3399 int m_iMaxCachedHits;
3400 CSphString m_sIndexName;
3401 CSphString m_sFilename;
3402
3403 public:
SetGlobalIDFPath(const CSphString & sPath)3404 void SetGlobalIDFPath ( const CSphString & sPath ) { m_sGlobalIDFPath = sPath; }
3405 float GetGlobalIDF ( const CSphString & sWord, int64_t iDocsLocal, bool bPlainIDF ) const;
3406
3407 protected:
3408 CSphString m_sGlobalIDFPath;
3409 };
3410
3411 // update attributes with index pointer attached
3412 struct CSphAttrUpdateEx
3413 {
3414 const CSphAttrUpdate * m_pUpdate; ///< the unchangeable update pool
3415 CSphIndex * m_pIndex; ///< the index on which the update should happen
3416 CSphString * m_pError; ///< the error, if any
3417 CSphString * m_pWarning; ///< the warning, if any
3418 int m_iAffected; ///< num of updated rows.
CSphAttrUpdateExCSphAttrUpdateEx3419 CSphAttrUpdateEx()
3420 : m_pUpdate ( NULL )
3421 , m_pIndex ( NULL )
3422 , m_pError ( NULL )
3423 , m_pWarning ( NULL )
3424 , m_iAffected ( 0 )
3425 {}
3426 };
3427
3428 struct SphQueueSettings_t : public ISphNoncopyable
3429 {
3430 const CSphQuery & m_tQuery;
3431 const ISphSchema & m_tSchema;
3432 CSphString & m_sError;
3433 CSphQueryProfile * m_pProfiler;
3434 bool m_bComputeItems;
3435 CSphSchema * m_pExtra;
3436 CSphAttrUpdateEx * m_pUpdate;
3437 CSphVector<SphDocID_t> * m_pDeletes;
3438 bool m_bZonespanlist;
3439 DWORD m_uPackedFactorFlags;
3440 ISphExprHook * m_pHook;
3441 const CSphFilterSettings * m_pAggrFilter;
3442
SphQueueSettings_tSphQueueSettings_t3443 SphQueueSettings_t ( const CSphQuery & tQuery, const ISphSchema & tSchema, CSphString & sError, CSphQueryProfile * pProfiler )
3444 : m_tQuery ( tQuery )
3445 , m_tSchema ( tSchema )
3446 , m_sError ( sError )
3447 , m_pProfiler ( pProfiler )
3448 , m_bComputeItems ( true )
3449 , m_pExtra ( NULL )
3450 , m_pUpdate ( NULL )
3451 , m_pDeletes ( NULL )
3452 , m_bZonespanlist ( false )
3453 , m_uPackedFactorFlags ( SPH_FACTOR_DISABLE )
3454 , m_pHook ( NULL )
3455 , m_pAggrFilter ( NULL )
3456 { }
3457 };
3458
3459 /////////////////////////////////////////////////////////////////////////////
3460
3461 /// create phrase fulltext index implementation
3462 CSphIndex * sphCreateIndexPhrase ( const char* szIndexName, const char * sFilename );
3463
3464 /// create template (tokenizer) index implementation
3465 CSphIndex * sphCreateIndexTemplate ( );
3466
3467 /// set JSON attribute indexing options
3468 /// bStrict is whether to stop indexing on error, or just ignore the attribute value
3469 /// bAutoconvNumbers is whether to auto-convert eligible (!) strings to integers and floats, or keep them as strings
3470 /// bKeynamesToLowercase is whether to convert all key names to lowercase
3471 void sphSetJsonOptions ( bool bStrict, bool bAutoconvNumbers, bool bKeynamesToLowercase );
3472
3473 /// parses sort clause, using a given schema
3474 /// fills eFunc and tState and optionally sError, returns result code
3475 ESortClauseParseResult sphParseSortClause ( const CSphQuery * pQuery, const char * sClause, const ISphSchema & tSchema,
3476 ESphSortFunc & eFunc, CSphMatchComparatorState & tState, CSphString & sError );
3477
3478 /// creates proper queue for given query
3479 /// may return NULL on error; in this case, error message is placed in sError
3480 /// if the pUpdate is given, creates the updater's queue and perform the index update
3481 /// instead of searching
3482 ISphMatchSorter * sphCreateQueue ( SphQueueSettings_t & tQueue );
3483
3484 /// convert queue to sorted array, and add its entries to result's matches array
3485 int sphFlattenQueue ( ISphMatchSorter * pQueue, CSphQueryResult * pResult, int iTag );
3486
3487 /// setup per-keyword read buffer sizes
3488 void sphSetReadBuffers ( int iReadBuffer, int iReadUnhinted );
3489
3490 /// check query for expressions
3491 bool sphHasExpressions ( const CSphQuery & tQuery, const CSphSchema & tSchema );
3492
3493 /// initialize collation tables
3494 void sphCollationInit ();
3495
3496 //////////////////////////////////////////////////////////////////////////
3497
3498 extern CSphString g_sLemmatizerBase;
3499
3500 #if USE_RLP
3501 extern CSphString g_sRLPRoot;
3502 extern CSphString g_sRLPEnv;
3503 extern int g_iRLPMaxBatchSize;
3504 extern int g_iRLPMaxBatchDocs;
3505 #endif
3506
3507 /////////////////////////////////////////////////////////////////////////////
3508
3509 // workaround to suppress C4511/C4512 warnings (copy ctor and assignment operator) in VS 2003
3510 #if _MSC_VER>=1300 && _MSC_VER<1400
3511 #pragma warning(disable:4511)
3512 #pragma warning(disable:4512)
3513 #endif
3514
3515 // suppress C4201 (nameless struct/union is a nonstandard extension) because even min-spec gcc 3.4.6 works ok
3516 #if defined(_MSC_VER)
3517 #pragma warning(disable:4201)
3518 #endif
3519
3520 #endif // _sphinx_
3521
3522 //
3523 // $Id$
3524 //
3525