1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15
16 #include "sphinx.h"
17 #include "sphinxquery.h"
18 #include "sphinxutils.h"
19 #include "sphinxplugin.h"
20 #include <stdarg.h>
21
22 //////////////////////////////////////////////////////////////////////////
23 // EXTENDED PARSER RELOADED
24 //////////////////////////////////////////////////////////////////////////
25 class XQParser_t;
26 #ifdef CMAKE_GENERATED_GRAMMAR
27 #include "bissphinxquery.h"
28 #else
29 #include "yysphinxquery.h"
30 #endif
31
32 // #define XQDEBUG 1
33 // #define XQ_DUMP_TRANSFORMED_TREE 1
34 // #define XQ_DUMP_NODE_ADDR 1
35
36 //////////////////////////////////////////////////////////////////////////
37
38 struct MultiformNode_t
39 {
40 XQNode_t * m_pNode;
41 int m_iDestStart;
42 int m_iDestCount;
43 };
44
/// extended query parser state
/// the bison-generated grammar (yyparse) pulls tokens via GetToken() and uses
/// the Add*/Set* callbacks to build an XQNode_t tree; the Fixup*/Sweep* helpers
/// post-process that tree before it is handed back to the caller
class XQParser_t
{
public:
	XQParser_t ();
	~XQParser_t ();

public:
	/// parser entry point; tokenizes sQuery and fills tQuery with the resulting tree (or error)
	bool	Parse ( XQQuery_t & tQuery, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer, const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings );

	/// set parse error (printf-style); always returns false, for 'return Error(...)' convenience
	bool	Error ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
	/// set parse warning (printf-style); only the last warning is kept
	void	Warning ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );

	/// lookup szField in the schema and set its bit in dFields
	bool	AddField ( FieldMask_t & dFields, const char * szField, int iLen );
	/// parse '@field', '@!field', '@(a,b)' etc right after the '@' special
	bool	ParseFields ( FieldMask_t & uFields, int & iMaxFieldPos, bool & bIgnore );
	/// parse zone name (or zone name list) right after ZONE:/ZONESPAN:; returns index into m_dZoneVecs
	int		ParseZone ( const char * pZone );

	bool	IsSpecial ( char c );
	/// manual number lexing (needed when digits are not in charset_table); see GetToken()
	bool	GetNumber ( const char * p );
	/// the lexer; returns token type and fills lvalp for yyparse()
	int		GetToken ( YYSTYPE * lvalp );

	/// handle field-start (^), field-end ($) and boost (^1.23) modifiers around a keyword
	void	HandleModifiers ( XQKeyword_t & tKeyword );

	void	AddQuery ( XQNode_t * pNode );
	XQNode_t *	AddKeyword ( const char * sKeyword, int iSkippedPosBeforeToken=0 );
	XQNode_t *	AddKeyword ( XQNode_t * pLeft, XQNode_t * pRight );
	XQNode_t *	AddOp ( XQOperator_e eOp, XQNode_t * pLeft, XQNode_t * pRight, int iOpArg=0 );
	void	SetPhrase ( XQNode_t * pNode, bool bSetExact );

	/// destroy all spawned nodes (for bailing out on errors)
	void	Cleanup ();
	XQNode_t *	SweepNulls ( XQNode_t * pNode );
	bool	FixupNots ( XQNode_t * pNode );
	void	FixupNulls ( XQNode_t * pNode );
	void	DeleteNodesWOFields ( XQNode_t * pNode );
	void	PhraseShiftQpos ( XQNode_t * pNode );
	void	FixupDestForms ();

	/// apply a field limit to the current spec state (copy-on-write via FixRefSpec)
	inline void SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
	{
		FixRefSpec();
		m_dStateSpec.Last()->SetFieldSpec ( uMask, iMaxPos );
	}
	/// apply a zone limit to the current spec state (copy-on-write via FixRefSpec)
	inline void SetZoneVec ( int iZoneVec, bool bZoneSpan = false )
	{
		FixRefSpec();
		m_dStateSpec.Last()->SetZoneSpec ( m_dZoneVecs[iZoneVec], bZoneSpan );
	}

	/// if the spec state top is shared with the previous stack entry (pushed on '('),
	/// clone it into the pool first, so modifying it does not leak into the outer scope
	inline void FixRefSpec ()
	{
		bool bRef = ( m_dStateSpec.GetLength()>1 && ( m_dStateSpec[m_dStateSpec.GetLength()-1]==m_dStateSpec[m_dStateSpec.GetLength()-2] ) );
		if ( !bRef )
			return;

		XQLimitSpec_t * pSpec = m_dStateSpec.Pop();
		m_dSpecPool.Add ( new XQLimitSpec_t ( *pSpec ) );
		m_dStateSpec.Add ( m_dSpecPool.Last() );
	}

public:
	const CSphVector<int> & GetZoneVec ( int iZoneVec ) const
	{
		return m_dZoneVecs[iZoneVec];
	}

public:
	static const int MAX_TOKEN_BYTES = 3*SPH_MAX_WORD_LEN + 16;

	XQQuery_t *				m_pParsed;			///< parse target; receives the tree, error, and warning

	BYTE *					m_sQuery;			///< (a copy of) the raw query string being parsed
	int						m_iQueryLen;
	const char *			m_pLastTokenStart;	///< raw buffer ptr of the last token (used in error reports)
	const char *			m_pLastTokenEnd;

	const CSphSchema *		m_pSchema;
	ISphTokenizer *			m_pTokenizer;
	CSphDict *				m_pDict;

	CSphVector<XQNode_t*>	m_dSpawned;			///< every node ever created, for cleanup on error
	XQNode_t *				m_pRoot;

	bool					m_bStopOnInvalid;	///< treat unknown field names as errors (vs warnings)
	int						m_iAtomPos;			///< current keyword position counter

	// single-token lookahead state
	int						m_iPendingNulls;	///< pending NULL keywords to emit (from overshort tokens)
	int						m_iPendingType;		///< pending token type (0 means none pending)
	YYSTYPE					m_tPendingToken;
	bool					m_bWasBlended;
	bool					m_bWasKeyword;

	bool					m_bEmpty;
	bool					m_bQuoted;			///< currently inside a phrase (between quotes)
	bool					m_bEmptyStopword;
	int						m_iOvershortStep;

	// positions of the closing quote and slash, to recognize the "..."/N quorum form
	int						m_iQuorumQuote;
	int						m_iQuorumFSlash;
	bool					m_bCheckNumber;		///< next token must be checked for being a number (after ~ or /)

	const PluginQueryTokenFilter_c * m_pPlugin;
	void *					m_pPluginData;

	CSphVector<CSphString>	m_dIntTokens;		///< string forms of numeric tokens that are also valid keywords

	CSphVector < CSphVector<int> >	m_dZoneVecs;
	CSphVector<XQLimitSpec_t *>		m_dStateSpec;	///< spec stack, pushed/popped on parens
	CSphVector<XQLimitSpec_t *>		m_dSpecPool;	///< owns all specs (freed in dtor)
	CSphVector<int>					m_dPhraseStar;	///< atom positions of standalone '*' inside a phrase

	// multiform (wordforms destination) bookkeeping
	CSphVector<CSphString>			m_dDestForms;
	CSphVector<MultiformNode_t>		m_dMultiforms;
};
157
158 //////////////////////////////////////////////////////////////////////////
159
yylex(YYSTYPE * lvalp,XQParser_t * pParser)160 int yylex ( YYSTYPE * lvalp, XQParser_t * pParser )
161 {
162 return pParser->GetToken ( lvalp );
163 }
164
yyerror(XQParser_t * pParser,const char * sMessage)165 void yyerror ( XQParser_t * pParser, const char * sMessage )
166 {
167 if ( pParser->m_pParsed->m_sParseError.IsEmpty() )
168 pParser->m_pParsed->m_sParseError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastTokenStart );
169 }
170
171 #ifdef CMAKE_GENERATED_GRAMMAR
172 #include "bissphinxquery.c"
173 #else
174 #include "yysphinxquery.c"
175 #endif
176
177 //////////////////////////////////////////////////////////////////////////
178
SetFieldSpec(const FieldMask_t & uMask,int iMaxPos)179 void XQLimitSpec_t::SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
180 {
181 m_bFieldSpec = true;
182 m_dFieldMask = uMask;
183 m_iFieldMaxPos = iMaxPos;
184 }
185
186 /// ctor
XQNode_t(const XQLimitSpec_t & dSpec)187 XQNode_t::XQNode_t ( const XQLimitSpec_t & dSpec )
188 : m_pParent ( NULL )
189 , m_eOp ( SPH_QUERY_AND )
190 , m_iOrder ( 0 )
191 , m_iCounter ( 0 )
192 , m_iMagicHash ( 0 )
193 , m_iFuzzyHash ( 0 )
194 , m_dSpec ( dSpec )
195 , m_iOpArg ( 0 )
196 , m_iAtomPos ( -1 )
197 , m_iUser ( 0 )
198 , m_bVirtuallyPlain ( false )
199 , m_bNotWeighted ( false )
200 , m_bPercentOp ( false )
201 {
202 #ifdef XQ_DUMP_NODE_ADDR
203 printf ( "node new 0x%08x\n", this );
204 #endif
205 }
206
207 /// dtor
~XQNode_t()208 XQNode_t::~XQNode_t ()
209 {
210 #ifdef XQ_DUMP_NODE_ADDR
211 printf ( "node deleted %d 0x%08x\n", m_eOp, this );
212 #endif
213 ARRAY_FOREACH ( i, m_dChildren )
214 SafeDelete ( m_dChildren[i] );
215 }
216
217
SetFieldSpec(const FieldMask_t & uMask,int iMaxPos)218 void XQNode_t::SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
219 {
220 // set it, if we do not yet have one
221 if ( !m_dSpec.m_bFieldSpec )
222 m_dSpec.SetFieldSpec ( uMask, iMaxPos );
223
224 // some of the children might not yet have a spec, even if the node itself has
225 // eg. 'hello @title world' (whole node has '@title' spec but 'hello' node does not have any!)
226 ARRAY_FOREACH ( i, m_dChildren )
227 m_dChildren[i]->SetFieldSpec ( uMask, iMaxPos );
228 }
229
SetZoneSpec(const CSphVector<int> & dZones,bool bZoneSpan)230 void XQLimitSpec_t::SetZoneSpec ( const CSphVector<int> & dZones, bool bZoneSpan )
231 {
232 m_dZones = dZones;
233 m_bZoneSpan = bZoneSpan;
234 }
235
236
SetZoneSpec(const CSphVector<int> & dZones,bool bZoneSpan)237 void XQNode_t::SetZoneSpec ( const CSphVector<int> & dZones, bool bZoneSpan )
238 {
239 // set it, if we do not yet have one
240 if ( !m_dSpec.m_dZones.GetLength() )
241 m_dSpec.SetZoneSpec ( dZones, bZoneSpan );
242
243 // some of the children might not yet have a spec, even if the node itself has
244 ARRAY_FOREACH ( i, m_dChildren )
245 m_dChildren[i]->SetZoneSpec ( dZones, bZoneSpan );
246 }
247
CopySpecs(const XQNode_t * pSpecs)248 void XQNode_t::CopySpecs ( const XQNode_t * pSpecs )
249 {
250 if ( !pSpecs )
251 return;
252
253 if ( !m_dSpec.m_bFieldSpec )
254 m_dSpec.SetFieldSpec ( pSpecs->m_dSpec.m_dFieldMask, pSpecs->m_dSpec.m_iFieldMaxPos );
255
256 if ( !m_dSpec.m_dZones.GetLength() )
257 m_dSpec.SetZoneSpec ( pSpecs->m_dSpec.m_dZones, pSpecs->m_dSpec.m_bZoneSpan );
258 }
259
260
ClearFieldMask()261 void XQNode_t::ClearFieldMask ()
262 {
263 m_dSpec.m_dFieldMask.SetAll();
264
265 ARRAY_FOREACH ( i, m_dChildren )
266 m_dChildren[i]->ClearFieldMask();
267 }
268
269
/// compute (and cache) a structural hash of this subtree
/// two trees with equal hashes are very likely identical (words, children, operators)
uint64_t XQNode_t::GetHash () const
{
	// cached from an earlier call? (0 doubles as the "not computed" marker)
	if ( m_iMagicHash )
		return m_iMagicHash;

	// hash the operator as a zero-terminated array, so sphFNV64 can consume it
	XQOperator_e dZeroOp[2];
	dZeroOp[0] = m_eOp;
	dZeroOp[1] = (XQOperator_e) 0;

	// mix in words and children; the mixing is deliberately order-sensitive
	ARRAY_FOREACH ( i, m_dWords )
		m_iMagicHash = 100 + ( m_iMagicHash ^ sphFNV64 ( m_dWords[i].m_sWord.cstr() ) ); // +100 to make it non-transitive
	ARRAY_FOREACH ( j, m_dChildren )
		m_iMagicHash = 100 + ( m_iMagicHash ^ m_dChildren[j]->GetHash() ); // +100 to make it non-transitive
	m_iMagicHash += 1000000; // to immerse difference between parents and children
	m_iMagicHash ^= sphFNV64 ( dZeroOp );

	return m_iMagicHash;
}
288
289
/// compute (and cache) a "fuzzy" subtree hash
/// identical to GetHash() except that PHRASE and PROXIMITY operators are
/// deliberately conflated, so a phrase and its proximity variant hash alike
uint64_t XQNode_t::GetFuzzyHash () const
{
	// cached from an earlier call? (0 doubles as the "not computed" marker)
	if ( m_iFuzzyHash )
		return m_iFuzzyHash;

	// hash the operator (phrase mapped onto proximity) as a zero-terminated array
	XQOperator_e dZeroOp[2];
	dZeroOp[0] = ( m_eOp==SPH_QUERY_PHRASE ? SPH_QUERY_PROXIMITY : m_eOp );
	dZeroOp[1] = (XQOperator_e) 0;

	// mix in words and children; the mixing is deliberately order-sensitive
	ARRAY_FOREACH ( i, m_dWords )
		m_iFuzzyHash = 100 + ( m_iFuzzyHash ^ sphFNV64 ( m_dWords[i].m_sWord.cstr() ) ); // +100 to make it non-transitive
	ARRAY_FOREACH ( j, m_dChildren )
		m_iFuzzyHash = 100 + ( m_iFuzzyHash ^ m_dChildren[j]->GetFuzzyHash () ); // +100 to make it non-transitive
	m_iFuzzyHash += 1000000; // to immerse difference between parents and children
	m_iFuzzyHash ^= sphFNV64 ( dZeroOp );

	return m_iFuzzyHash;
}
308
309
SetOp(XQOperator_e eOp,XQNode_t * pArg1,XQNode_t * pArg2)310 void XQNode_t::SetOp ( XQOperator_e eOp, XQNode_t * pArg1, XQNode_t * pArg2 )
311 {
312 m_eOp = eOp;
313 m_dChildren.Reset();
314 if ( pArg1 )
315 {
316 m_dChildren.Add ( pArg1 );
317 pArg1->m_pParent = this;
318 }
319 if ( pArg2 )
320 {
321 m_dChildren.Add ( pArg2 );
322 pArg2->m_pParent = this;
323 }
324 }
325
326
/// compact atom positions of a proximity node's keywords
/// each keyword that had N>1 tokens skipped before it (blended, overshort, etc.)
/// pulls itself and all following keywords back by N-1 positions, so the words
/// end up contiguous; returns the position right after the last keyword
int XQNode_t::FixupAtomPos()
{
	assert ( m_eOp==SPH_QUERY_PROXIMITY && m_dWords.GetLength()>0 );

	ARRAY_FOREACH ( i, m_dWords )
	{
		// m_iSkippedBefore==1 is the normal case (no gap); only shift for larger gaps
		int iSub = m_dWords[i].m_iSkippedBefore-1;
		if ( iSub>0 )
		{
			// shift this word and everything after it; shifts accumulate across i
			for ( int j = i; j < m_dWords.GetLength(); j++ )
				m_dWords[j].m_iAtomPos -= iSub;
		}
	}

	return m_dWords.Last().m_iAtomPos+1;
}
343
344
Clone()345 XQNode_t * XQNode_t::Clone ()
346 {
347 XQNode_t * pRet = new XQNode_t ( m_dSpec );
348 pRet->SetOp ( m_eOp );
349 pRet->m_dWords = m_dWords;
350 pRet->m_iOpArg = m_iOpArg;
351 pRet->m_iAtomPos = m_iAtomPos;
352 pRet->m_bVirtuallyPlain = m_bVirtuallyPlain;
353 pRet->m_bNotWeighted = m_bNotWeighted;
354 pRet->m_bPercentOp = m_bPercentOp;
355
356 if ( m_dChildren.GetLength()==0 )
357 return pRet;
358
359 pRet->m_dChildren.Reserve ( m_dChildren.GetLength() );
360 for ( int i = 0; i < m_dChildren.GetLength(); ++i )
361 {
362 pRet->m_dChildren.Add ( m_dChildren[i]->Clone() );
363 pRet->m_dChildren.Last()->m_pParent = pRet;
364 }
365
366 return pRet;
367 }
368
369
ResetHash()370 bool XQNode_t::ResetHash ()
371 {
372 bool bAlreadyReset = ( m_iMagicHash==0 && m_iFuzzyHash==0 );
373 m_iMagicHash = 0;
374 m_iFuzzyHash = 0;
375 return bAlreadyReset;
376 }
377
378
379 /// return either index of pNode among Parent.m_dChildren, or -1
GetNodeChildIndex(const XQNode_t * pParent,const XQNode_t * pNode)380 static int GetNodeChildIndex ( const XQNode_t * pParent, const XQNode_t * pNode )
381 {
382 assert ( pParent && pNode );
383 ARRAY_FOREACH ( i, pParent->m_dChildren )
384 if ( pParent->m_dChildren[i]==pNode )
385 return i;
386
387 return -1;
388 }
389
390 //////////////////////////////////////////////////////////////////////////
391
XQParser_t()392 XQParser_t::XQParser_t ()
393 : m_pParsed ( NULL )
394 , m_pLastTokenStart ( NULL )
395 , m_pLastTokenEnd ( NULL )
396 , m_pRoot ( NULL )
397 , m_bStopOnInvalid ( true )
398 , m_bWasBlended ( false )
399 , m_bWasKeyword ( false )
400 , m_bQuoted ( false )
401 , m_bEmptyStopword ( false )
402 , m_iQuorumQuote ( -1 )
403 , m_iQuorumFSlash ( -1 )
404 , m_bCheckNumber ( false )
405 , m_pPlugin ( NULL )
406 , m_pPluginData ( NULL )
407 {
408 m_dSpecPool.Add ( new XQLimitSpec_t() );
409 m_dStateSpec.Add ( m_dSpecPool.Last() );
410 }
411
~XQParser_t()412 XQParser_t::~XQParser_t ()
413 {
414 ARRAY_FOREACH ( i, m_dSpecPool )
415 SafeDelete ( m_dSpecPool[i] );
416 }
417
418
419 /// cleanup spawned nodes (for bailing out on errors)
Cleanup()420 void XQParser_t::Cleanup ()
421 {
422 m_dSpawned.Uniq(); // FIXME! should eliminate this by testing
423
424 ARRAY_FOREACH ( i, m_dSpawned )
425 {
426 m_dSpawned[i]->m_dChildren.Reset ();
427 SafeDelete ( m_dSpawned[i] );
428 }
429 m_dSpawned.Reset ();
430 m_dStateSpec.Reset();
431 }
432
433
434
Error(const char * sTemplate,...)435 bool XQParser_t::Error ( const char * sTemplate, ... )
436 {
437 assert ( m_pParsed );
438 char sBuf[256];
439
440 const char * sPrefix = "query error: ";
441 int iPrefix = strlen(sPrefix);
442 memcpy ( sBuf, sPrefix, iPrefix );
443
444 va_list ap;
445 va_start ( ap, sTemplate );
446 vsnprintf ( sBuf+iPrefix, sizeof(sBuf)-iPrefix, sTemplate, ap );
447 va_end ( ap );
448
449 m_pParsed->m_sParseError = sBuf;
450 return false;
451 }
452
453
Warning(const char * sTemplate,...)454 void XQParser_t::Warning ( const char * sTemplate, ... )
455 {
456 assert ( m_pParsed );
457 char sBuf[256];
458
459 const char * sPrefix = "query warning: ";
460 int iPrefix = strlen(sPrefix);
461 memcpy ( sBuf, sPrefix, iPrefix );
462
463 va_list ap;
464 va_start ( ap, sTemplate );
465 vsnprintf ( sBuf+iPrefix, sizeof(sBuf)-iPrefix, sTemplate, ap );
466 va_end ( ap );
467
468 m_pParsed->m_sParseWarning = sBuf;
469 }
470
471
472 /// my special chars
IsSpecial(char c)473 bool XQParser_t::IsSpecial ( char c )
474 {
475 return c=='(' || c==')' || c=='|' || c=='-' || c=='!' || c=='@' || c=='~' || c=='"' || c=='/';
476 }
477
478
479 /// lookup field and add it into mask
AddField(FieldMask_t & dFields,const char * szField,int iLen)480 bool XQParser_t::AddField ( FieldMask_t & dFields, const char * szField, int iLen )
481 {
482 CSphString sField;
483 sField.SetBinary ( szField, iLen );
484
485 int iField = m_pSchema->GetFieldIndex ( sField.cstr () );
486 if ( iField<0 )
487 {
488 if ( m_bStopOnInvalid )
489 return Error ( "no field '%s' found in schema", sField.cstr () );
490 else
491 Warning ( "no field '%s' found in schema", sField.cstr () );
492 } else
493 {
494 if ( iField>=SPH_MAX_FIELDS )
495 return Error ( " max %d fields allowed", SPH_MAX_FIELDS );
496
497 dFields.Set ( iField );
498 }
499
500 return true;
501 }
502
503
/// parse fields block
/// called right after the '@' special; recognizes @field, @!field, @*,
/// @(a,b,...), @!(a,b,...) and the optional trailing [N] position limit,
/// advancing the tokenizer's raw buffer pointer past whatever was consumed.
/// on exit: dFields holds the resulting mask, iMaxFieldPos the [N] limit (0 if
/// none), and bIgnore is set when the operator should be silently re-parsed
/// as plain text
bool XQParser_t::ParseFields ( FieldMask_t & dFields, int & iMaxFieldPos, bool & bIgnore )
{
	dFields.UnsetAll();
	iMaxFieldPos = 0;
	bIgnore = false;

	// we scan the raw query buffer directly, bypassing the tokenizer
	const char * pPtr = m_pTokenizer->GetBufferPtr ();
	const char * pLastPtr = m_pTokenizer->GetBufferEnd ();

	if ( pPtr==pLastPtr )
		return true; // silently ignore trailing field operator

	bool bNegate = false;
	bool bBlock = false;

	// handle special modifiers
	if ( *pPtr=='!' )
	{
		// handle @! and @!(
		bNegate = true; pPtr++;
		if ( *pPtr=='(' ) { bBlock = true; pPtr++; }

	} else if ( *pPtr=='*' )
	{
		// handle @*
		dFields.SetAll();
		m_pTokenizer->SetBufferPtr ( pPtr+1 );
		return true;

	} else if ( *pPtr=='(' )
	{
		// handle @(
		bBlock = true; pPtr++;
	}

	// handle invalid chars
	if ( !sphIsAlpha(*pPtr) )
	{
		bIgnore = true;
		m_pTokenizer->SetBufferPtr ( pPtr ); // ignore and re-parse (FIXME! maybe warn?)
		return true;
	}
	assert ( sphIsAlpha(*pPtr) ); // i think i'm paranoid

	// handle field specification
	if ( !bBlock )
	{
		// handle standalone field specification
		const char * pFieldStart = pPtr;
		while ( sphIsAlpha(*pPtr) && pPtr<pLastPtr )
			pPtr++;

		assert ( pPtr-pFieldStart>0 );
		if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
			return false;

		m_pTokenizer->SetBufferPtr ( pPtr );
		if ( bNegate )
			dFields.Negate();

	} else
	{
		// handle fields block specification
		assert ( sphIsAlpha(*pPtr) && bBlock ); // and complicated

		bool bOK = false;
		const char * pFieldStart = NULL;
		while ( pPtr<pLastPtr )
		{
			// accumulate field name, while we can
			if ( sphIsAlpha(*pPtr) )
			{
				if ( !pFieldStart )
					pFieldStart = pPtr;
				pPtr++;
				continue;
			}

			// separator found
			if ( pFieldStart==NULL )
			{
				// eg. '@(foo,,bar)' or '@(,foo)'; report with the remaining buffer as context
				CSphString sContext;
				sContext.SetBinary ( pPtr, (int)( pLastPtr-pPtr ) );
				return Error ( "invalid field block operator syntax near '%s'", sContext.cstr() ? sContext.cstr() : "" );

			} else if ( *pPtr==',' )
			{
				if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
					return false;

				pFieldStart = NULL;
				pPtr++;

			} else if ( *pPtr==')' )
			{
				if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
					return false;

				m_pTokenizer->SetBufferPtr ( ++pPtr );
				if ( bNegate )
					dFields.Negate();

				bOK = true;
				break;

			} else
			{
				return Error ( "invalid character '%c' in field block operator", *pPtr );
			}
		}
		if ( !bOK )
			return Error ( "missing closing ')' in field block operator" );
	}

	// handle optional position range modifier, eg. '@title[50]'
	if ( pPtr[0]=='[' && isdigit ( pPtr[1] ) )
	{
		// skip '[' and digits
		const char * p = pPtr+1;
		while ( *p && isdigit(*p) ) p++;

		// check that the range ends with ']' (FIXME! maybe report an error if it does not?)
		if ( *p!=']' )
			return true;

		// fetch my value
		iMaxFieldPos = strtoul ( pPtr+1, NULL, 10 );
		m_pTokenizer->SetBufferPtr ( p+1 );
	}

	// well done
	return true;
}
638
639
640 /// helper find-or-add (make it generic and move to sphinxstd?)
GetZoneIndex(XQQuery_t * pQuery,const CSphString & sZone)641 static int GetZoneIndex ( XQQuery_t * pQuery, const CSphString & sZone )
642 {
643 ARRAY_FOREACH ( i, pQuery->m_dZones )
644 if ( pQuery->m_dZones[i]==sZone )
645 return i;
646
647 pQuery->m_dZones.Add ( sZone );
648 return pQuery->m_dZones.GetLength()-1;
649 }
650
651
/// parse zone
/// called right after ZONE:/ZONESPAN:; accepts either a single zone name or a
/// strict (name1,name2,...) block, registers the zone names in the query, and
/// advances the tokenizer's raw buffer pointer past the consumed text.
/// returns an index into m_dZoneVecs, or -1 on error
int XQParser_t::ParseZone ( const char * pZone )
{
	const char * p = pZone;

	// case one, just a single zone name
	if ( sphIsAlpha ( *pZone ) )
	{
		// find zone name
		while ( sphIsAlpha(*p) )
			p++;
		m_pTokenizer->SetBufferPtr ( p );

		// extract and lowercase it
		CSphString sZone;
		sZone.SetBinary ( pZone, p-pZone );
		sZone.ToLower();

		// register it in zones list
		int iZone = GetZoneIndex ( m_pParsed, sZone );

		// create new 1-zone vector
		m_dZoneVecs.Add().Add ( iZone );
		return m_dZoneVecs.GetLength()-1;
	}

	// case two, zone block
	// it must follow strict (name1,name2,...) syntax
	if ( *pZone=='(' )
	{
		// create new zone vector
		CSphVector<int> & dZones = m_dZoneVecs.Add();
		p = ++pZone;

		// scan names
		for ( ;; )
		{
			// syntax error, name expected!
			if ( !sphIsAlpha(*p) )
			{
				Error ( "unexpected character '%c' in zone block operator", *p );
				return -1;
			}

			// scan next name
			while ( sphIsAlpha(*p) )
				p++;

			// extract and lowercase it
			CSphString sZone;
			sZone.SetBinary ( pZone, p-pZone );
			sZone.ToLower();

			// register it in zones list
			dZones.Add ( GetZoneIndex ( m_pParsed, sZone ) );

			// must be either followed by comma, or closing paren
			// everything else will cause syntax error
			if ( *p==')' )
			{
				m_pTokenizer->SetBufferPtr ( p+1 );
				break;
			}

			if ( *p==',' )
				pZone = ++p;
			// any other separator falls through to the !sphIsAlpha check above
		}

		return m_dZoneVecs.GetLength()-1;
	}

	// unhandled case
	Error ( "internal error, unhandled case in ParseZone()" );
	return -1;
}
727
728
/// try to manually lex an int (or, as a quorum argument, a float) at p
/// needed because digits may be absent from charset_table, in which case the
/// tokenizer would never return the number as a token; on success fills
/// m_tPendingToken/m_iPendingType (TOK_INT or TOK_FLOAT) and returns true,
/// otherwise leaves the tokenizer untouched and returns false
bool XQParser_t::GetNumber ( const char * p )
{
	int iDots = 0;
	const char * sToken = p;
	const char * sEnd = m_pTokenizer->GetBufferEnd ();
	while ( p<sEnd && ( isdigit ( *(BYTE*)p ) || *p=='.' ) )
	{
		iDots += ( *p=='.' );
		p++;
	}

	// must be float number but got many dots or only dot
	if ( iDots && ( iDots>1 || p-sToken==iDots ) )
		p = sToken;

	// float as number allowed only as quorum argument and regular keywords stream otherwise
	if ( iDots==1 && ( m_iQuorumQuote!=m_iQuorumFSlash || m_iQuorumQuote!=m_iAtomPos ) )
		p = sToken;

	static const int NUMBER_BUF_LEN = 10; // max strlen of int32
	if ( p>sToken && p-sToken<NUMBER_BUF_LEN
		&& !( *p=='-' && !( p-sToken==1 && sphIsModifier ( p[-1] ) ) ) // !bDashInside copied over from arbitration
		&& ( *p=='\0' || sphIsSpace(*p) || IsSpecial(*p) ) )
	{
		// float as quorum argument has higher precedence than blended
		bool bQuorum = ( m_iQuorumQuote==m_iQuorumFSlash && m_iQuorumFSlash==m_iAtomPos );
		bool bQuorumPercent = ( bQuorum && iDots==1 );

		bool bTok = ( m_pTokenizer->GetToken()!=NULL );
		if ( bTok && m_pTokenizer->TokenIsBlended() && !( bQuorum || bQuorumPercent ) ) // number with blended should be tokenized as usual
		{
			// rewind and let the regular tokenizer handle it
			m_pTokenizer->SkipBlended();
			m_pTokenizer->SetBufferPtr ( m_pLastTokenStart );
		} else if ( bTok && m_pTokenizer->WasTokenSynonym() && !( bQuorum || bQuorumPercent ) )
		{
			// same for synonyms: rewind and re-tokenize normally
			m_pTokenizer->SetBufferPtr ( m_pLastTokenStart );
		} else
		{
			// got not a very long number followed by a whitespace or special, handle it
			char sNumberBuf[NUMBER_BUF_LEN];

			int iNumberLen = Min ( (int)sizeof(sNumberBuf)-1, int(p-sToken) );
			memcpy ( sNumberBuf, sToken, iNumberLen );
			sNumberBuf[iNumberLen] = '\0';
			if ( iDots )
				m_tPendingToken.tInt.fValue = (float)strtod ( sNumberBuf, NULL );
			else
				m_tPendingToken.tInt.iValue = atoi ( sNumberBuf );

			// check if it can be used as a keyword too
			// note: we temporarily point the tokenizer at our local buffer, then restore it
			m_pTokenizer->SetBuffer ( (BYTE*)sNumberBuf, iNumberLen );
			sToken = (const char*) m_pTokenizer->GetToken();
			m_pTokenizer->SetBuffer ( m_sQuery, m_iQueryLen );
			m_pTokenizer->SetBufferPtr ( p );

			m_tPendingToken.tInt.iStrIndex = -1;
			if ( sToken )
			{
				m_dIntTokens.Add ( sToken );
				m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
				if ( m_pDict->GetWordID ( (BYTE*)sToken ) )
					m_tPendingToken.tInt.iStrIndex = m_dIntTokens.GetLength()-1;
				else
					m_dIntTokens.Pop();
				m_iAtomPos++;
			}

			m_iPendingNulls = 0;
			m_iPendingType = iDots ? TOK_FLOAT : TOK_INT;
			return true;
		}
	}
	return false;
}
803
804
/// a lexer of my own
/// pulls the next token for yyparse(): recognizes operator keywords
/// (NEAR/N, SENTENCE, PARAGRAPH, MAYBE, ZONE:, ZONESPAN:), specials
/// (@ < ^ $ quotes parens etc.), numbers, and plain keywords; maintains
/// one token of lookahead in m_iPendingType/m_tPendingToken, and emits
/// m_iPendingNulls NULL keywords first (placeholders for overshort tokens)
int XQParser_t::GetToken ( YYSTYPE * lvalp )
{
	bool bWasFrontModifier = false; // used to print warning

	// what, noone's pending for a bending?!
	if ( !m_iPendingType )
		for ( ;; )
	{
		assert ( m_iPendingNulls==0 );

		bool bWasKeyword = m_bWasKeyword;
		m_bWasKeyword = false;

		int iSkippedPosBeforeToken = 0;
		if ( m_bWasBlended )
		{
			iSkippedPosBeforeToken = m_pTokenizer->SkipBlended();
			m_iAtomPos += iSkippedPosBeforeToken;
		}

		// tricky stuff
		// we need to manually check for numbers in certain states (currently, just after proximity or quorum operator)
		// required because if 0-9 are not in charset_table, or min_word_len is too high,
		// the tokenizer will *not* return the number as a token!
		m_pLastTokenStart = m_pTokenizer->GetBufferPtr ();
		m_pLastTokenEnd = m_pTokenizer->GetTokenEnd();
		const char * sEnd = m_pTokenizer->GetBufferEnd ();

		const char * p = m_pLastTokenStart;
		while ( p<sEnd && isspace ( *(BYTE*)p ) ) p++; // to avoid CRT assertions on Windows

		if ( m_bCheckNumber )
		{
			m_bCheckNumber = false;
			if ( GetNumber(p) )
				break;
		}

		// not a number, long number, or number not followed by a whitespace, so fallback to regular tokenizing
		const char * sToken = (const char *) m_pTokenizer->GetToken ();
		if ( !sToken )
		{
			// end of buffer; but there might still be pending nulls to emit
			m_iPendingNulls = m_pTokenizer->GetOvershortCount() * m_iOvershortStep;
			if ( !( m_iPendingNulls || m_pTokenizer->GetBufferPtr()-p>0 ) )
				return 0;
			m_iPendingNulls = 0;
			lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
			m_bWasKeyword = true;
			return TOK_KEYWORD;
		}

		// now let's do some token post-processing
		m_bWasBlended = m_pTokenizer->TokenIsBlended();
		m_bEmpty = false;

		m_iPendingNulls = m_pTokenizer->GetOvershortCount() * m_iOvershortStep;
		m_iAtomPos += 1+m_iPendingNulls;

		// handle NEAR (must be case-sensitive, and immediately followed by slash and int)
		if ( sToken && p && !m_pTokenizer->m_bPhrase && strncmp ( p, "NEAR/", 5 )==0 && isdigit(p[5]) )
		{
			// extract that int
			int iVal = 0;
			for ( p=p+5; isdigit(*p); p++ )
				iVal = iVal*10 + (*p) - '0'; // FIXME! check for overflow?
			m_pTokenizer->SetBufferPtr ( p );

			// we just lexed our next token
			m_iPendingType = TOK_NEAR;
			m_tPendingToken.tInt.iValue = iVal;
			m_tPendingToken.tInt.iStrIndex = -1;
			m_iAtomPos -= 1; // skip NEAR
			break;
		}

		// handle SENTENCE
		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "sentence" ) && !strncmp ( p, "SENTENCE", 8 ) )
		{
			// we just lexed our next token
			m_iPendingType = TOK_SENTENCE;
			m_iAtomPos -= 1;
			break;
		}

		// handle PARAGRAPH
		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "paragraph" ) && !strncmp ( p, "PARAGRAPH", 9 ) )
		{
			// we just lexed our next token
			m_iPendingType = TOK_PARAGRAPH;
			m_iAtomPos -= 1;
			break;
		}

		// handle MAYBE
		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "maybe" ) && !strncmp ( p, "MAYBE", 5 ) )
		{
			// we just lexed our next token
			m_iPendingType = TOK_MAYBE;
			m_iAtomPos -= 1;
			break;
		}

		// handle ZONE
		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strncmp ( p, "ZONE:", 5 )
			&& ( sphIsAlpha(p[5]) || p[5]=='(' ) )
		{
			// ParseZone() will update tokenizer buffer ptr as needed
			int iVec = ParseZone ( p+5 );
			if ( iVec<0 )
				return -1;

			// we just lexed our next token
			m_iPendingType = TOK_ZONE;
			m_tPendingToken.iZoneVec = iVec;
			m_iAtomPos -= 1;
			break;
		}

		// handle ZONESPAN
		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strncmp ( p, "ZONESPAN:", 9 )
			&& ( sphIsAlpha(p[9]) || p[9]=='(' ) )
		{
			// ParseZone() will update tokenizer buffer ptr as needed
			int iVec = ParseZone ( p+9 );
			if ( iVec<0 )
				return -1;

			// we just lexed our next token
			m_iPendingType = TOK_ZONESPAN;
			m_tPendingToken.iZoneVec = iVec;
			m_iAtomPos -= 1;
			break;
		}

		// count [ * ] at phrase node for qpos shift
		// FIXME! RLP can return tokens from several buffers, all this pointer arithmetic will lead to crashes
		if ( m_pTokenizer->m_bPhrase && m_pLastTokenEnd )
		{
			if ( strncmp ( sToken, "*", 1 )==0 )
			{
				m_dPhraseStar.Add ( m_iAtomPos );
			} else
			{
				// scan the raw gap between the previous and the current token
				// for a separate single star surrounded by spaces
				int iSpace = 0;
				int iStar = 0;
				const char * sCur = m_pLastTokenEnd;
				const char * sEnd = m_pTokenizer->GetTokenStart();
				for ( ; sCur<sEnd; sCur++ )
				{
					int iCur = sCur - m_pLastTokenEnd;
					switch ( *sCur )
					{
					case '*':
						iStar = sCur - m_pLastTokenEnd;
						break;
					case ' ':
						if ( iSpace+2==iCur && iStar+1==iCur ) // match only [ * ] (separate single star) as valid shift operator
							m_dPhraseStar.Add ( m_iAtomPos );
						iSpace = iCur;
						break;
					}
				}
			}
		}

		// handle specials
		if ( m_pTokenizer->WasTokenSpecial() )
		{
			// specials must not affect pos
			m_iAtomPos--;

			// some specials are especially special
			if ( sToken[0]=='@' )
			{
				bool bIgnore;

				// parse fields operator
				if ( !ParseFields ( m_tPendingToken.tFieldLimit.dMask, m_tPendingToken.tFieldLimit.iMaxPos, bIgnore ) )
					return -1;

				if ( bIgnore )
					continue;

				m_iPendingType = TOK_FIELDLIMIT;
				break;

			} else if ( sToken[0]=='<' )
			{
				if ( *m_pTokenizer->GetBufferPtr()=='<' )
				{
					// got "<<", aka operator BEFORE
					m_iPendingType = TOK_BEFORE;
					break;
				} else
				{
					// got stray '<', ignore
					if ( m_iPendingNulls>0 )
					{
						m_iPendingNulls = 0;
						lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
						m_bWasKeyword = true;
						return TOK_KEYWORD;
					}
					continue;
				}
			} else if ( sToken[0]=='^' )
			{
				// front modifier must be immediately followed by a non-space char
				const char * pTokEnd = m_pTokenizer->GetTokenEnd();
				if ( pTokEnd<m_pTokenizer->GetBufferEnd() && !sphIsSpace ( pTokEnd[0] ) )
					bWasFrontModifier = true;

				// this special is handled in HandleModifiers()
				continue;
			} else if ( sToken[0]=='$' )
			{
				if ( bWasKeyword )
					continue;
				if ( sphIsSpace ( m_pTokenizer->GetTokenStart() [ -1 ] ) )
					continue;

				// right after overshort
				if ( m_pTokenizer->GetOvershortCount()==1 )
				{
					m_iPendingNulls = 0;
					lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
					return TOK_KEYWORD;
				}

				Warning ( "modifiers must be applied to keywords, not operators" );

				// this special is handled in HandleModifiers()
				continue;
			} else
			{
				bool bWasQuoted = m_bQuoted;
				// all the other specials are passed to parser verbatim
				if ( sToken[0]=='"' )
				{
					m_bQuoted = !m_bQuoted;
					if ( m_bQuoted )
						m_dPhraseStar.Resize ( 0 );
				}
				m_iPendingType = sToken[0];
				m_pTokenizer->m_bPhrase = m_bQuoted;

				// '(' pushes (a shared reference to) the current spec; ')' pops it
				if ( sToken[0]=='(' )
				{
					XQLimitSpec_t * pLastField = m_dStateSpec.Last();
					m_dStateSpec.Add ( pLastField );
				} else if ( sToken[0]==')' && m_dStateSpec.GetLength()>1 )
				{
					m_dStateSpec.Pop();
				}

				// track closing-quote and slash positions to recognize the "..."/N quorum form
				if ( bWasQuoted && !m_bQuoted )
					m_iQuorumQuote = m_iAtomPos;
				else if ( sToken[0]=='/' )
					m_iQuorumFSlash = m_iAtomPos;

				if ( sToken[0]=='~' ||sToken[0]=='/' )
					m_bCheckNumber = true;
				break;
			}
		}

		// check for stopword, and create that node
		// temp buffer is required, because GetWordID() might expand (!) the keyword in-place
		BYTE sTmp [ MAX_TOKEN_BYTES ];

		strncpy ( (char*)sTmp, sToken, MAX_TOKEN_BYTES );
		sTmp[MAX_TOKEN_BYTES-1] = '\0';

		int iStopWord = 0;
		if ( m_pPlugin && m_pPlugin->m_fnPreMorph )
			m_pPlugin->m_fnPreMorph ( m_pPluginData, (char*)sTmp, &iStopWord );

		m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
		SphWordID_t uWordId = iStopWord ? 0 : m_pDict->GetWordID ( sTmp );

		if ( uWordId && m_pPlugin && m_pPlugin->m_fnPostMorph )
		{
			int iRes = m_pPlugin->m_fnPostMorph ( m_pPluginData, (char*)sTmp, &iStopWord );
			if ( iStopWord )
				uWordId = 0;
			else if ( iRes )
				uWordId = m_pDict->GetWordIDNonStemmed ( sTmp );
		}

		if ( !uWordId )
		{
			// stopword; emit it as a NULL keyword
			sToken = NULL;
			// stopwords with step=0 must not affect pos
			if ( m_bEmptyStopword )
				m_iAtomPos--;
		}

		bool bMultiDestHead = false;
		bool bMultiDest = false;
		int iDestCount = 0;
		// do nothing inside phrase
		if ( !m_pTokenizer->m_bPhrase )
			bMultiDest = m_pTokenizer->WasTokenMultiformDestination ( bMultiDestHead, iDestCount );

		if ( bMultiDest && !bMultiDestHead )
		{
			// non-head destination form: accumulate it under the head node
			assert ( m_dMultiforms.GetLength() );
			m_dMultiforms.Last().m_iDestCount++;
			m_dDestForms.Add ( sToken );
			m_bWasKeyword = true;
		} else
		{
			m_tPendingToken.pNode = AddKeyword ( sToken, iSkippedPosBeforeToken );
			m_iPendingType = TOK_KEYWORD;
		}

		if ( bMultiDestHead )
		{
			// start a new multiform record; its dest forms follow on later iterations
			MultiformNode_t & tMulti = m_dMultiforms.Add();
			tMulti.m_pNode = m_tPendingToken.pNode;
			tMulti.m_iDestStart = m_dDestForms.GetLength();
			tMulti.m_iDestCount = 0;
		}

		if ( m_pTokenizer->TokenIsBlended() )
			m_iAtomPos--;

		if ( !bMultiDest || bMultiDestHead )
			break;
	}

	if ( bWasFrontModifier && m_iPendingType!=TOK_KEYWORD )
		Warning ( "modifiers must be applied to keywords, not operators" );

	// someone must be pending now!
	assert ( m_iPendingType );
	m_bEmpty = false;

	// ladies first, though
	if ( m_iPendingNulls>0 )
	{
		m_iPendingNulls--;
		lvalp->pNode = AddKeyword ( NULL );
		m_bWasKeyword = true;
		return TOK_KEYWORD;
	}

	// pending the offending
	int iRes = m_iPendingType;
	m_iPendingType = 0;

	if ( iRes==TOK_KEYWORD )
		m_bWasKeyword = true;
	*lvalp = m_tPendingToken;
	return iRes;
}
1161
1162
// Record the given node as the root of the parsed query tree.
// Invoked by the bison grammar when the top-level production reduces.
void XQParser_t::AddQuery ( XQNode_t * pNode )
{
	m_pRoot = pNode;
}
1167
1168 // Handle modifiers:
1169 // 1) ^ - field start
1170 // 2) $ - field end
1171 // 3) ^1.234 - keyword boost
// keyword$^1.234 - both the field-end modifier and the boost are applied
// keywords^1.234$ - only the boost applies here; the trailing '$' is NOT a modifier
HandleModifiers(XQKeyword_t & tKeyword)1174 void XQParser_t::HandleModifiers ( XQKeyword_t & tKeyword )
1175 {
1176 const char * sTokStart = m_pTokenizer->GetTokenStart();
1177 const char * sTokEnd = m_pTokenizer->GetTokenEnd();
1178 if ( !sTokStart || !sTokEnd )
1179 return;
1180
1181 const char * sQuery = reinterpret_cast<char *> ( m_sQuery );
1182 tKeyword.m_bFieldStart = ( sTokStart-sQuery )>0 && sTokStart [ -1 ]=='^' &&
1183 !( ( sTokStart-sQuery )>1 && sTokStart [ -2 ]=='\\' );
1184
1185 if ( sTokEnd[0]=='$' )
1186 {
1187 tKeyword.m_bFieldEnd = true;
1188 ++sTokEnd; // Skipping.
1189 }
1190
1191 if ( sTokEnd[0]=='^' && ( sTokEnd[1]=='.' || isdigit ( sTokEnd[1] ) ) )
1192 {
1193 // Probably we have a boost, lets check.
1194 char * pEnd;
1195 float fBoost = (float)strtod ( sTokEnd+1, &pEnd );
1196 if ( ( sTokEnd+1 )!=pEnd )
1197 {
1198 // We do have a boost.
1199 // FIXME Handle ERANGE errno here.
1200 tKeyword.m_fBoost = fBoost;
1201 m_pTokenizer->SetBufferPtr ( pEnd );
1202 }
1203 }
1204 }
1205
1206
AddKeyword(const char * sKeyword,int iSkippedPosBeforeToken)1207 XQNode_t * XQParser_t::AddKeyword ( const char * sKeyword, int iSkippedPosBeforeToken )
1208 {
1209 XQKeyword_t tAW ( sKeyword, m_iAtomPos );
1210 tAW.m_iSkippedBefore = iSkippedPosBeforeToken;
1211 HandleModifiers ( tAW );
1212 XQNode_t * pNode = new XQNode_t ( *m_dStateSpec.Last() );
1213 pNode->m_dWords.Add ( tAW );
1214 m_dSpawned.Add ( pNode );
1215 return pNode;
1216 }
1217
1218
AddKeyword(XQNode_t * pLeft,XQNode_t * pRight)1219 XQNode_t * XQParser_t::AddKeyword ( XQNode_t * pLeft, XQNode_t * pRight )
1220 {
1221 if ( !pLeft || !pRight )
1222 return pLeft ? pLeft : pRight;
1223
1224 assert ( pLeft->m_dWords.GetLength()>0 );
1225 assert ( pRight->m_dWords.GetLength()==1 );
1226
1227 pLeft->m_dWords.Add ( pRight->m_dWords[0] );
1228 m_dSpawned.RemoveValue ( pRight );
1229 SafeDelete ( pRight );
1230 return pLeft;
1231 }
1232
1233
HasMissedField(const XQLimitSpec_t & tSpec)1234 static bool HasMissedField ( const XQLimitSpec_t & tSpec )
1235 {
1236 return ( tSpec.m_dFieldMask.TestAll ( false ) && tSpec.m_iFieldMaxPos==0 && !tSpec.m_bZoneSpan && tSpec.m_dZones.GetLength()==0 );
1237 }
1238
1239
AddOp(XQOperator_e eOp,XQNode_t * pLeft,XQNode_t * pRight,int iOpArg)1240 XQNode_t * XQParser_t::AddOp ( XQOperator_e eOp, XQNode_t * pLeft, XQNode_t * pRight, int iOpArg )
1241 {
1242 /////////
1243 // unary
1244 /////////
1245
1246 if ( eOp==SPH_QUERY_NOT )
1247 {
1248 XQNode_t * pNode = new XQNode_t ( *m_dStateSpec.Last() );
1249 pNode->SetOp ( SPH_QUERY_NOT, pLeft );
1250 m_dSpawned.Add ( pNode );
1251 return pNode;
1252 }
1253
1254 //////////
1255 // binary
1256 //////////
1257
1258 if ( !pLeft || !pRight )
1259 return pLeft ? pLeft : pRight;
1260
1261 // build a new node
1262 XQNode_t * pResult = NULL;
1263 if ( pLeft->m_dChildren.GetLength() && pLeft->GetOp()==eOp && pLeft->m_iOpArg==iOpArg )
1264 {
1265 pLeft->m_dChildren.Add ( pRight );
1266 pRight->m_pParent = pLeft;
1267 pResult = pLeft;
1268 } else
1269 {
1270 // however-2, beside all however below, [@@relaxed ((@title hello) | (@missed world)) @body other terms]
1271 // we should use valid (left) field mask for complex (OR) node
1272 // as right node in this case has m_bFieldSpec==true but m_dFieldMask==0
1273 const XQLimitSpec_t & tSpec = HasMissedField ( pRight->m_dSpec ) ? pLeft->m_dSpec : pRight->m_dSpec;
1274
1275 // however, it's right (!) spec which is chosen for the resulting node,
1276 // eg. '@title hello' + 'world @body program'
1277 XQNode_t * pNode = new XQNode_t ( tSpec );
1278 pNode->SetOp ( eOp, pLeft, pRight );
1279 pNode->m_iOpArg = iOpArg;
1280 m_dSpawned.Add ( pNode );
1281 pResult = pNode;
1282 }
1283 return pResult;
1284 }
1285
1286
SetPhrase(XQNode_t * pNode,bool bSetExact)1287 void XQParser_t::SetPhrase ( XQNode_t * pNode, bool bSetExact )
1288 {
1289 if ( !pNode )
1290 return;
1291
1292 assert ( pNode->m_dWords.GetLength() );
1293 if ( bSetExact )
1294 {
1295 ARRAY_FOREACH ( iWord, pNode->m_dWords )
1296 {
1297 if ( !pNode->m_dWords[iWord].m_sWord.IsEmpty() )
1298 pNode->m_dWords[iWord].m_sWord.SetSprintf ( "=%s", pNode->m_dWords[iWord].m_sWord.cstr() );
1299 }
1300 }
1301 pNode->SetOp ( SPH_QUERY_PHRASE );
1302
1303 PhraseShiftQpos ( pNode );
1304 }
1305
1306
// Recursively remove NULL keywords (entries whose word was zeroed out, e.g.
// stopwords) and any nodes left empty as a result. Returns the surviving
// (possibly replaced) node, or NULL if the whole subtree collapsed.
// Deleted nodes are also untracked from m_dSpawned before being freed.
XQNode_t * XQParser_t::SweepNulls ( XQNode_t * pNode )
{
	if ( !pNode )
		return NULL;

	// sweep plain node
	if ( pNode->m_dWords.GetLength() )
	{
		// drop null keywords in place; i-- compensates for the removal shift
		ARRAY_FOREACH ( i, pNode->m_dWords )
			if ( pNode->m_dWords[i].m_sWord.cstr()==NULL )
				pNode->m_dWords.Remove ( i-- );

		// node lost all its keywords? free it and report the collapse
		if ( pNode->m_dWords.GetLength()==0 )
		{
			m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
			SafeDelete ( pNode );
			return NULL;
		}

		return pNode;
	}

	// sweep op node
	ARRAY_FOREACH ( i, pNode->m_dChildren )
	{
		pNode->m_dChildren[i] = SweepNulls ( pNode->m_dChildren[i] );
		if ( pNode->m_dChildren[i]==NULL )
		{
			pNode->m_dChildren.Remove ( i-- );
			// use non-null iOpArg as a flag indicating that the sweeping happened.
			++pNode->m_iOpArg;
		}
	}

	// all children swept away? free this node too
	if ( pNode->m_dChildren.GetLength()==0 )
	{
		m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
		SafeDelete ( pNode );
		return NULL;
	}

	// remove redundancies if needed
	if ( pNode->GetOp()!=SPH_QUERY_NOT && pNode->m_dChildren.GetLength()==1 )
	{
		// single-child non-NOT node: splice the child into our place
		XQNode_t * pRet = pNode->m_dChildren[0];
		pNode->m_dChildren.Reset ();
		pRet->m_pParent = pNode->m_pParent;
		// expressions like 'la !word' (having min_word_len>len(la)) became a 'null' node.
		if ( pNode->m_iOpArg && pRet->GetOp()==SPH_QUERY_NOT )
		{
			// the NOT lost its positive side during sweeping; mark the whole
			// thing NULL and free the NOT's children (FixupNulls handles NULLs)
			pRet->SetOp ( SPH_QUERY_NULL );
			ARRAY_FOREACH ( i, pRet->m_dChildren )
			{
				m_dSpawned.RemoveValue ( pRet->m_dChildren[i] );
				SafeDelete ( pRet->m_dChildren[i] )
			}
			pRet->m_dChildren.Reset();
		}
		pRet->m_iOpArg = pNode->m_iOpArg;

		m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
		SafeDelete ( pNode );
		// re-sweep the promoted child: it may collapse further
		return SweepNulls ( pRet );
	}

	// done
	return pNode;
}
1375
FixupNulls(XQNode_t * pNode)1376 void XQParser_t::FixupNulls ( XQNode_t * pNode )
1377 {
1378 if ( !pNode )
1379 return;
1380
1381 ARRAY_FOREACH ( i, pNode->m_dChildren )
1382 FixupNulls ( pNode->m_dChildren[i] );
1383
1384 // smth OR null == smth.
1385 if ( pNode->GetOp()==SPH_QUERY_OR )
1386 {
1387 CSphVector<XQNode_t*> dNotNulls;
1388 ARRAY_FOREACH ( i, pNode->m_dChildren )
1389 {
1390 XQNode_t* pChild = pNode->m_dChildren[i];
1391 if ( pChild->GetOp()!=SPH_QUERY_NULL )
1392 dNotNulls.Add ( pChild );
1393 else
1394 {
1395 m_dSpawned.RemoveValue ( pChild );
1396 SafeDelete ( pChild );
1397 }
1398 }
1399 pNode->m_dChildren.SwapData ( dNotNulls );
1400 dNotNulls.Reset();
1401 // smth AND null = null.
1402 } else if ( pNode->GetOp()==SPH_QUERY_AND )
1403 {
1404 bool bHasNull = ARRAY_ANY ( bHasNull, pNode->m_dChildren, pNode->m_dChildren[_any]->GetOp()==SPH_QUERY_NULL );
1405 if ( bHasNull )
1406 {
1407 pNode->SetOp ( SPH_QUERY_NULL );
1408 ARRAY_FOREACH ( i, pNode->m_dChildren )
1409 {
1410 m_dSpawned.RemoveValue ( pNode->m_dChildren[i] );
1411 SafeDelete ( pNode->m_dChildren[i] )
1412 }
1413 pNode->m_dChildren.Reset();
1414 }
1415 }
1416 }
1417
// Rewrite NOT operators into the computable ANDNOT form, bottom-up.
// Returns false (with m_pParsed->m_sParseError set) when the query cannot
// be computed: a node made of NOTs only, or a NOT under OR/MAYBE/NEAR/BEFORE.
bool XQParser_t::FixupNots ( XQNode_t * pNode )
{
	// no processing for plain nodes
	if ( !pNode || pNode->m_dWords.GetLength() )
		return true;

	// process 'em children
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		if ( !FixupNots ( pNode->m_dChildren[i] ) )
			return false;

	// extract NOT subnodes
	// (RemoveFast + i-- keeps the scan valid while pulling items out)
	CSphVector<XQNode_t*> dNots;
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		if ( pNode->m_dChildren[i]->GetOp()==SPH_QUERY_NOT )
		{
			dNots.Add ( pNode->m_dChildren[i] );
			pNode->m_dChildren.RemoveFast ( i-- );
		}

	// no NOTs? we're square
	if ( !dNots.GetLength() )
		return true;

	// nothing but NOTs? we can't compute that
	if ( !pNode->m_dChildren.GetLength() )
	{
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (node consists of NOT operators only)" );
		return false;
	}

	// NOT within OR or MAYBE? we can't compute that
	if ( pNode->GetOp()==SPH_QUERY_OR || pNode->GetOp()==SPH_QUERY_MAYBE || pNode->GetOp()==SPH_QUERY_NEAR )
	{
		XQOperator_e eOp = pNode->GetOp();
		const char * sOp = ( eOp==SPH_QUERY_OR ? "OR" : ( eOp==SPH_QUERY_MAYBE ? "MAYBE" : "NEAR" ) );
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (NOT is not allowed within %s)", sOp );
		return false;
	}

	// NOT used in before operator
	if ( pNode->GetOp()==SPH_QUERY_BEFORE )
	{
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (NOT cannot be used as before operand)" );
		return false;
	}

	// must be some NOTs within AND at this point, convert this node to ANDNOT
	assert ( pNode->GetOp()==SPH_QUERY_AND && pNode->m_dChildren.GetLength() && dNots.GetLength() );

	// positive side: AND over the remaining (non-NOT) children
	XQNode_t * pAnd = new XQNode_t ( pNode->m_dSpec );
	pAnd->SetOp ( SPH_QUERY_AND, pNode->m_dChildren );
	m_dSpawned.Add ( pAnd );

	// negative side: the single NOT, or an OR over several of them
	XQNode_t * pNot = NULL;
	if ( dNots.GetLength()==1 )
	{
		pNot = dNots[0];
	} else
	{
		pNot = new XQNode_t ( pNode->m_dSpec );
		pNot->SetOp ( SPH_QUERY_OR, dNots );
		m_dSpawned.Add ( pNot );
	}

	pNode->SetOp ( SPH_QUERY_ANDNOT, pAnd, pNot );
	return true;
}
1486
1487
CollectChildren(XQNode_t * pNode,CSphVector<XQNode_t * > & dChildren)1488 static void CollectChildren ( XQNode_t * pNode, CSphVector<XQNode_t *> & dChildren )
1489 {
1490 if ( !pNode->m_dChildren.GetLength() )
1491 return;
1492
1493 dChildren.Add ( pNode );
1494 ARRAY_FOREACH ( i, dChildren )
1495 {
1496 const XQNode_t * pNode = dChildren[i];
1497 ARRAY_FOREACH ( j, pNode->m_dChildren )
1498 dChildren.Add ( pNode->m_dChildren[j] );
1499 }
1500 }
1501
1502
DeleteNodesWOFields(XQNode_t * pNode)1503 void XQParser_t::DeleteNodesWOFields ( XQNode_t * pNode )
1504 {
1505 if ( !pNode )
1506 return;
1507
1508 for ( int i = 0; i < pNode->m_dChildren.GetLength (); )
1509 {
1510 if ( pNode->m_dChildren[i]->m_dSpec.m_dFieldMask.TestAll ( false ) )
1511 {
1512 XQNode_t * pChild = pNode->m_dChildren[i];
1513 CSphVector<XQNode_t *> dChildren;
1514 CollectChildren ( pChild, dChildren );
1515 #ifndef NDEBUG
1516 bool bAllEmpty = ARRAY_ALL ( bAllEmpty, dChildren, dChildren[_all]->m_dSpec.m_dFieldMask.TestAll ( false ) );
1517 assert ( pChild->m_dChildren.GetLength()==0 || ( dChildren.GetLength() && bAllEmpty ) );
1518 #endif
1519 if ( dChildren.GetLength() )
1520 {
1521 ARRAY_FOREACH ( i, dChildren )
1522 m_dSpawned.RemoveValue ( dChildren[i] );
1523 } else
1524 {
1525 m_dSpawned.RemoveValue ( pChild );
1526 }
1527
1528 // this should be a leaf node
1529 SafeDelete ( pNode->m_dChildren[i] );
1530 pNode->m_dChildren.RemoveFast ( i );
1531
1532 } else
1533 {
1534 DeleteNodesWOFields ( pNode->m_dChildren[i] );
1535 i++;
1536 }
1537 }
1538 }
1539
1540
// Fold '*' placeholders out of a phrase: shift keyword atom positions to
// account for stars recorded in m_dPhraseStar, and drop keyword entries
// that are leftover raw stars (or star positions suppressed by the dict).
void XQParser_t::PhraseShiftQpos ( XQNode_t * pNode )
{
	// nothing to do if the phrase contained no stars
	if ( !m_dPhraseStar.GetLength() )
		return;

	const int * pLast = m_dPhraseStar.Begin();
	const int * pEnd = m_dPhraseStar.Begin() + m_dPhraseStar.GetLength();
	int iQposShiftStart = *pLast;
	int iQposShift = 0;
	int iLastStarPos = *pLast;

	ARRAY_FOREACH ( iWord, pNode->m_dWords )
	{
		XQKeyword_t & tWord = pNode->m_dWords[iWord];

		// fold stars in phrase till current term position
		while ( pLast<pEnd && *(pLast)<=tWord.m_iAtomPos )
		{
			iLastStarPos = *pLast;
			pLast++;
			iQposShift++;
		}

		// star dictionary passes raw star however regular dictionary suppress it
		// raw star also might be suppressed by min_word_len option
		// so remove qpos shift from duplicated raw star term
		// however not stopwords that is also term with empty word
		if ( tWord.m_sWord=="*" || ( tWord.m_sWord.IsEmpty() && tWord.m_iAtomPos==iLastStarPos ) )
		{
			// remove the duplicate star entry and cancel its shift contribution
			pNode->m_dWords.Remove ( iWord-- );
			iQposShift--;
			continue;
		}

		// only terms at/after the first star position get shifted
		if ( iQposShiftStart<=tWord.m_iAtomPos )
			tWord.m_iAtomPos += iQposShift;
	}
}
1579
1580
CheckQuorumProximity(XQNode_t * pNode,CSphString * pError)1581 static bool CheckQuorumProximity ( XQNode_t * pNode, CSphString * pError )
1582 {
1583 assert ( pError );
1584 if ( !pNode )
1585 return true;
1586
1587 bool bQuorumPassed = ( pNode->GetOp()!=SPH_QUERY_QUORUM ||
1588 ( pNode->m_iOpArg>0 && ( !pNode->m_bPercentOp || pNode->m_iOpArg<=100 ) ) );
1589 if ( !bQuorumPassed )
1590 {
1591 if ( pNode->m_bPercentOp )
1592 pError->SetSprintf ( "quorum threshold out of bounds 0.0 and 1.0f (%f)", 1.0f / 100.0f * pNode->m_iOpArg );
1593 else
1594 pError->SetSprintf ( "quorum threshold too low (%d)", pNode->m_iOpArg );
1595 return false;
1596 }
1597
1598 if ( pNode->GetOp()==SPH_QUERY_PROXIMITY && pNode->m_iOpArg<1 )
1599 {
1600 pError->SetSprintf ( "proximity threshold too low (%d)", pNode->m_iOpArg );
1601 return false;
1602 }
1603
1604 bool bValid = ARRAY_ALL ( bValid, pNode->m_dChildren, CheckQuorumProximity ( pNode->m_dChildren[_all], pError ) );
1605 return bValid;
1606 }
1607
1608
FixupDegenerates(XQNode_t * pNode,CSphString & sWarning)1609 static void FixupDegenerates ( XQNode_t * pNode, CSphString & sWarning )
1610 {
1611 if ( !pNode )
1612 return;
1613
1614 if ( pNode->m_dWords.GetLength()==1 &&
1615 ( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM ) )
1616 {
1617 if ( pNode->GetOp()==SPH_QUERY_QUORUM && !pNode->m_bPercentOp && pNode->m_iOpArg>1 )
1618 sWarning.SetSprintf ( "quorum threshold too high (words=%d, thresh=%d); replacing quorum operator with AND operator", pNode->m_dWords.GetLength(), pNode->m_iOpArg );
1619
1620 pNode->SetOp ( SPH_QUERY_AND );
1621 return;
1622 }
1623
1624 ARRAY_FOREACH ( i, pNode->m_dChildren )
1625 FixupDegenerates ( pNode->m_dChildren[i], sWarning );
1626 }
1627
// Expand multiform tokens: every node recorded in m_dMultiforms holds the
// source keyword, and m_dDestForms holds its destination forms. Each such
// node is rewritten from a single keyword into an AND over the source form
// plus all destination forms, preserving exact-form ('=') and
// field-start/field-end modifiers.
void XQParser_t::FixupDestForms ()
{
	if ( !m_dMultiforms.GetLength() )
		return;

	CSphVector<XQNode_t *> dForms;

	ARRAY_FOREACH ( iNode, m_dMultiforms )
	{
		const MultiformNode_t & tDesc = m_dMultiforms[iNode];
		XQNode_t * pMultiParent = tDesc.m_pNode;
		assert ( pMultiParent->m_dWords.GetLength()==1 && pMultiParent->m_dChildren.GetLength()==0 );

		// pull the source keyword out of the parent; it becomes a child below
		XQKeyword_t tKeyword;
		Swap ( pMultiParent->m_dWords[0], tKeyword );
		pMultiParent->m_dWords.Reset();

		// FIXME: what about wildcard?
		bool bExact = ( tKeyword.m_sWord.Length()>1 && tKeyword.m_sWord.cstr()[0]=='=' );
		bool bFieldStart = tKeyword.m_bFieldStart;
		bool bFieldEnd = tKeyword.m_bFieldEnd;
		// modifiers are re-applied to the first/last form only, see below
		tKeyword.m_bFieldStart = false;
		tKeyword.m_bFieldEnd = false;

		// head node carries the original (source) form
		XQNode_t * pMultiHead = new XQNode_t ( pMultiParent->m_dSpec );
		pMultiHead->m_dWords.Add ( tKeyword );
		m_dSpawned.Add ( pMultiHead );
		dForms.Add ( pMultiHead );

		// one node per destination form, at consecutive atom positions
		for ( int iForm=0; iForm<tDesc.m_iDestCount; iForm++ )
		{
			tKeyword.m_iAtomPos++;
			tKeyword.m_sWord = m_dDestForms [ tDesc.m_iDestStart + iForm ];

			// propagate exact word flag to all destination forms
			if ( bExact )
				tKeyword.m_sWord.SetSprintf ( "=%s", tKeyword.m_sWord.cstr() );

			XQNode_t * pMulti = new XQNode_t ( pMultiParent->m_dSpec );
			pMulti->m_dWords.Add ( tKeyword );
			m_dSpawned.Add ( pMulti );
			dForms.Add ( pMulti );
		}

		// fix-up field start\end modifier
		dForms[0]->m_dWords[0].m_bFieldStart = bFieldStart;
		dForms.Last()->m_dWords[0].m_bFieldEnd = bFieldEnd;

		pMultiParent->SetOp ( SPH_QUERY_AND, dForms );
		dForms.Resize ( 0 );
	}
}
1680
1681
Parse(XQQuery_t & tParsed,const char * sQuery,const CSphQuery * pQuery,const ISphTokenizer * pTokenizer,const CSphSchema * pSchema,CSphDict * pDict,const CSphIndexSettings & tSettings)1682 bool XQParser_t::Parse ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer,
1683 const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings )
1684 {
1685 // FIXME? might wanna verify somehow that pTokenizer has all the specials etc from sphSetupQueryTokenizer
1686 CSphScopedPtr<ISphTokenizer> pMyTokenizer ( NULL );
1687 CSphTightVector<char> dRlpBuf;
1688
1689 #if USE_RLP
1690 if ( !pTokenizer->GetRLPContext() )
1691 {
1692 #endif
1693 pMyTokenizer = pTokenizer->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
1694 #if USE_RLP
1695 } else
1696 {
1697 const char * sProcessed = NULL;
1698 if ( !ISphTokenizer::ProcessQueryRLP ( tSettings.m_sRLPContext.cstr(), sQuery, &sProcessed, dRlpBuf, tParsed.m_sParseError ) )
1699 return false;
1700
1701 pMyTokenizer = pTokenizer->GetEmbeddedTokenizer()->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
1702 sQuery = sProcessed;
1703 }
1704 #endif
1705
1706 // most outcomes are errors
1707 SafeDelete ( tParsed.m_pRoot );
1708
1709 // check for relaxed syntax
1710 const char * OPTION_RELAXED = "@@relaxed";
1711 const int OPTION_RELAXED_LEN = strlen ( OPTION_RELAXED );
1712
1713 m_bStopOnInvalid = true;
1714 if ( sQuery && strncmp ( sQuery, OPTION_RELAXED, OPTION_RELAXED_LEN )==0 && !sphIsAlpha ( sQuery[OPTION_RELAXED_LEN] ) )
1715 {
1716 sQuery += OPTION_RELAXED_LEN;
1717 m_bStopOnInvalid = false;
1718 }
1719
1720 m_pPlugin = NULL;
1721 m_pPluginData = NULL;
1722
1723 if ( pQuery && !pQuery->m_sQueryTokenFilterName.IsEmpty() )
1724 {
1725 CSphString sError;
1726 m_pPlugin = static_cast < PluginQueryTokenFilter_c * > ( sphPluginAcquire ( pQuery->m_sQueryTokenFilterLib.cstr(),
1727 PLUGIN_QUERY_TOKEN_FILTER, pQuery->m_sQueryTokenFilterName.cstr(), tParsed.m_sParseError ) );
1728 if ( !m_pPlugin )
1729 return false;
1730
1731 char szError [ SPH_UDF_ERROR_LEN ];
1732 if ( m_pPlugin->m_fnInit && m_pPlugin->m_fnInit ( &m_pPluginData, MAX_TOKEN_BYTES, pQuery->m_sQueryTokenFilterOpts.cstr(), szError )!=0 )
1733 {
1734 tParsed.m_sParseError = sError;
1735 m_pPlugin->Release();
1736 m_pPlugin = NULL;
1737 m_pPluginData = NULL;
1738 return false;
1739 }
1740 }
1741
1742 // setup parser
1743 m_pParsed = &tParsed;
1744 m_sQuery = (BYTE*) sQuery;
1745 m_iQueryLen = sQuery ? strlen(sQuery) : 0;
1746 m_pTokenizer = pMyTokenizer.Ptr();
1747 m_pSchema = pSchema;
1748 m_iAtomPos = 0;
1749 m_iPendingNulls = 0;
1750 m_iPendingType = 0;
1751 m_pRoot = NULL;
1752 m_bEmpty = true;
1753 m_bEmptyStopword = ( tSettings.m_iStopwordStep==0 );
1754 m_iOvershortStep = tSettings.m_iOvershortStep;
1755
1756 CSphScopedPtr<CSphDict> tDictCloned ( NULL );
1757 m_pDict = pDict;
1758 if ( pDict->HasState() )
1759 tDictCloned = m_pDict = pDict->Clone();
1760
1761 m_pTokenizer->SetBuffer ( m_sQuery, m_iQueryLen );
1762 int iRes = yyparse ( this );
1763
1764 if ( m_pPlugin )
1765 {
1766 if ( m_pPlugin->m_fnDeinit )
1767 m_pPlugin->m_fnDeinit ( m_pPluginData );
1768
1769 m_pPlugin->Release();
1770 m_pPlugin = NULL;
1771 m_pPluginData = NULL;
1772 }
1773
1774 if ( ( iRes || !m_pParsed->m_sParseError.IsEmpty() ) && !m_bEmpty )
1775 {
1776 Cleanup ();
1777 return false;
1778 }
1779
1780 FixupDestForms ();
1781 DeleteNodesWOFields ( m_pRoot );
1782 m_pRoot = SweepNulls ( m_pRoot );
1783 FixupDegenerates ( m_pRoot, m_pParsed->m_sParseWarning );
1784 FixupNulls ( m_pRoot );
1785
1786 if ( !FixupNots ( m_pRoot ) )
1787 {
1788 Cleanup ();
1789 return false;
1790 }
1791
1792 if ( !CheckQuorumProximity ( m_pRoot, &m_pParsed->m_sParseError ) )
1793 {
1794 Cleanup();
1795 return false;
1796 }
1797
1798 if ( m_pRoot && m_pRoot->GetOp()==SPH_QUERY_NOT && !m_pRoot->m_iOpArg )
1799 {
1800 Cleanup ();
1801 m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (single NOT operator)" );
1802 return false;
1803 }
1804
1805 // all ok; might want to create a dummy node to indicate that
1806 m_dSpawned.Reset();
1807
1808 if ( m_pRoot && m_pRoot->GetOp()==SPH_QUERY_NULL )
1809 {
1810 delete m_pRoot;
1811 m_pRoot = NULL;
1812 }
1813
1814 tParsed.m_pRoot = m_pRoot ? m_pRoot : new XQNode_t ( *m_dStateSpec.Last() );
1815 return true;
1816 }
1817
1818 //////////////////////////////////////////////////////////////////////////
1819
1820
1821 #ifdef XQDEBUG
// Print two "|-" markers per nesting level (debug tree dump helper).
static void xqIndent ( int iIndent )
{
	for ( int i=0; i<iIndent*2; i++ )
		printf ( "|-" );
}
1828
xqDump(const XQNode_t * pNode,int iIndent)1829 static void xqDump ( const XQNode_t * pNode, int iIndent )
1830 {
1831 #ifdef XQ_DUMP_NODE_ADDR
1832 printf ( "0x%08x ", pNode );
1833 #endif
1834 if ( pNode->m_dChildren.GetLength() )
1835 {
1836 xqIndent ( iIndent );
1837 switch ( pNode->GetOp() )
1838 {
1839 case SPH_QUERY_AND: printf ( "AND:" ); break;
1840 case SPH_QUERY_OR: printf ( "OR:" ); break;
1841 case SPH_QUERY_MAYBE: printf ( "MAYBE:" ); break;
1842 case SPH_QUERY_NOT: printf ( "NOT:" ); break;
1843 case SPH_QUERY_ANDNOT: printf ( "ANDNOT:" ); break;
1844 case SPH_QUERY_BEFORE: printf ( "BEFORE:" ); break;
1845 case SPH_QUERY_PHRASE: printf ( "PHRASE:" ); break;
1846 case SPH_QUERY_PROXIMITY: printf ( "PROXIMITY:" ); break;
1847 case SPH_QUERY_QUORUM: printf ( "QUORUM:" ); break;
1848 case SPH_QUERY_NEAR: printf ( "NEAR:" ); break;
1849 case SPH_QUERY_SENTENCE: printf ( "SENTENCE:" ); break;
1850 case SPH_QUERY_PARAGRAPH: printf ( "PARAGRAPH:" ); break;
1851 default: printf ( "unknown-op-%d:", pNode->GetOp() ); break;
1852 }
1853 printf ( " (%d)\n", pNode->m_dChildren.GetLength() );
1854 ARRAY_FOREACH ( i, pNode->m_dChildren )
1855 {
1856 assert ( pNode->m_dChildren[i]->m_pParent==pNode );
1857 xqDump ( pNode->m_dChildren[i], iIndent+1 );
1858 }
1859 } else
1860 {
1861 xqIndent ( iIndent );
1862 printf ( "MATCH(%d,%d):", pNode->m_dSpec.m_dFieldMask.GetMask32(), pNode->m_iOpArg );
1863
1864 ARRAY_FOREACH ( i, pNode->m_dWords )
1865 {
1866 const XQKeyword_t & tWord = pNode->m_dWords[i];
1867
1868 const char * sLocTag = "";
1869 if ( tWord.m_bFieldStart ) sLocTag = ", start";
1870 if ( tWord.m_bFieldEnd ) sLocTag = ", end";
1871
1872 printf ( " %s (qpos %d%s)", tWord.m_sWord.cstr(), tWord.m_iAtomPos, sLocTag );
1873 }
1874 printf ( "\n" );
1875 }
1876 }
1877 #endif
1878
1879
// Rebuild an approximate query-string representation of the given parsed
// tree (used for logging/debugging; not guaranteed to re-parse to the same
// tree). When pSchema is provided, field limits print as field names;
// otherwise the raw field mask value is printed.
CSphString sphReconstructNode ( const XQNode_t * pNode, const CSphSchema * pSchema )
{
	CSphString sRes ( "" );

	if ( !pNode )
		return sRes;

	if ( pNode->m_dWords.GetLength() )
	{
		// say just words to me
		const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
		ARRAY_FOREACH ( i, dWords )
			sRes.SetSprintf ( "%s %s", sRes.cstr(), dWords[i].m_sWord.cstr() );
		sRes.Trim ();

		// wrap the word list with the operator's syntax
		switch ( pNode->GetOp() )
		{
		case SPH_QUERY_AND: break;
		case SPH_QUERY_PHRASE: sRes.SetSprintf ( "\"%s\"", sRes.cstr() ); break;
		case SPH_QUERY_PROXIMITY: sRes.SetSprintf ( "\"%s\"~%d", sRes.cstr(), pNode->m_iOpArg ); break;
		case SPH_QUERY_QUORUM: sRes.SetSprintf ( "\"%s\"/%d", sRes.cstr(), pNode->m_iOpArg ); break;
		case SPH_QUERY_NEAR: sRes.SetSprintf ( "\"%s\"NEAR/%d", sRes.cstr(), pNode->m_iOpArg ); break;
		default: assert ( 0 && "unexpected op in ReconstructNode()" ); break;
		}

		// field-limited node? prepend the '@field' spec
		if ( !pNode->m_dSpec.m_dFieldMask.TestAll(true) )
		{
			CSphString sFields ( "" );
			for ( int i=0; i<SPH_MAX_FIELDS; i++ )
			{
				if ( !pNode->m_dSpec.m_dFieldMask.Test(i) )
					continue;

				if ( pSchema )
					sFields.SetSprintf ( "%s,%s", sFields.cstr(), pSchema->m_dFields[i].m_sName.cstr() );
				else
					sFields.SetSprintf ( "%s,%d", sFields.cstr(), pNode->m_dSpec.m_dFieldMask.GetMask32() );
			}

			// +1 skips the leading comma accumulated above
			sRes.SetSprintf ( "( @%s: %s )", sFields.cstr()+1, sRes.cstr() );
		} else
		{
			if ( pNode->GetOp()==SPH_QUERY_AND && dWords.GetLength()>1 )
				sRes.SetSprintf ( "( %s )", sRes.cstr() ); // wrap bag of words
		}

	} else
	{
		// operator node: join the reconstructed children with the op's symbol
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			if ( !i )
			{
				sRes = sphReconstructNode ( pNode->m_dChildren[i], pSchema );
			} else
			{
				const char * sOp = "(unknown-op)";
				switch ( pNode->GetOp() )
				{
				case SPH_QUERY_AND: sOp = " "; break;
				case SPH_QUERY_OR: sOp = "|"; break;
				case SPH_QUERY_MAYBE: sOp = "MAYBE"; break;
				case SPH_QUERY_NOT: sOp = "NOT"; break;
				case SPH_QUERY_ANDNOT: sOp = "AND NOT"; break;
				case SPH_QUERY_BEFORE: sOp = "BEFORE"; break;
				case SPH_QUERY_NEAR: sOp = "NEAR"; break;
				case SPH_QUERY_PHRASE: sOp = ""; break;
				default: assert ( 0 && "unexpected op in ReconstructNode()" ); break;
				}
				if ( pNode->GetOp()==SPH_QUERY_PHRASE )
					sRes.SetSprintf ( "\"%s %s\"", sRes.cstr(), sphReconstructNode ( pNode->m_dChildren[i], pSchema ).cstr() );
				else
					sRes.SetSprintf ( "%s %s %s", sRes.cstr(), sOp, sphReconstructNode ( pNode->m_dChildren[i], pSchema ).cstr() );
			}
		}

		if ( pNode->m_dChildren.GetLength()>1 )
			sRes.SetSprintf ( "( %s )", sRes.cstr() );
	}

	return sRes;
}
1961
1962
// Public entry point: parse sQuery with a fresh XQParser_t instance.
// On success tParsed.m_pRoot holds the tree; on failure the parse error is
// in tParsed.m_sParseError. Also precomputes tParsed.m_bSingleWord.
bool sphParseExtendedQuery ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer,
	const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings )
{
	XQParser_t qp;
	bool bRes = qp.Parse ( tParsed, sQuery, pQuery, pTokenizer, pSchema, pDict, tSettings );

#ifndef NDEBUG
	// debug builds: verify tree invariants (parent links etc)
	if ( bRes && tParsed.m_pRoot )
		tParsed.m_pRoot->Check ( true );
#endif

#ifdef XQDEBUG
	if ( bRes )
	{
		printf ( "\n--- query ---\n" );
		printf ( "%s\n", sQuery );
		xqDump ( tParsed.m_pRoot, 0 );
		printf ( "---\n" );
	}
#endif

	// moved here from ranker creation
	// as at that point term expansion could produce many terms from expanded term and this condition got failed
	tParsed.m_bSingleWord = ( tParsed.m_pRoot && tParsed.m_pRoot->m_dChildren.GetLength()==0 && tParsed.m_pRoot->m_dWords.GetLength()==1 );

	return bRes;
}
1990
1991 //////////////////////////////////////////////////////////////////////////
1992 // COMMON SUBTREES DETECTION
1993 //////////////////////////////////////////////////////////////////////////
1994
1995 /// Decides if given pTree is appropriate for caching or not. Currently we don't cache
1996 /// the end values (leafs).
IsAppropriate(XQNode_t * pTree)1997 static bool IsAppropriate ( XQNode_t * pTree )
1998 {
1999 if ( !pTree ) return false;
2000
2001 // skip nodes that actually are leaves (eg. "AND smth" node instead of merely "smth")
2002 return !( pTree->m_dWords.GetLength()==1 && pTree->GetOp()!=SPH_QUERY_NOT );
2003 }
2004
2005 typedef CSphOrderedHash < DWORD, uint64_t, IdentityHash_fn, 128 > CDwordHash;
2006
2007 // stores the pair of a tree, and the bitmask of common nodes
2008 // which contains the tree.
// Pairs a tree node with the bitmask of common subtrees that contain it.
// Plain value holder: does not own m_pTree.
class BitMask_t
{
	XQNode_t * m_pTree;   // the (non-owned) tree this mask describes
	uint64_t m_uMask;     // bit i set => tree belongs to common subtree #i

public:
	BitMask_t ()
		: m_pTree ( NULL )
		, m_uMask ( 0ull )
	{}

	// (re)bind this entry to a tree and its membership mask
	void Init ( XQNode_t * pTree, uint64_t uMask )
	{
		m_pTree = pTree;
		m_uMask = uMask;
	}

	inline uint64_t GetMask() const { return m_uMask; }
	inline XQNode_t * GetTree() const { return m_pTree; }
};
2029
2030 // a list of unique values.
// A set of unique tree hashes (values are dummy DWORDs; only keys matter).
class Associations_t : public CDwordHash
{
public:

	// returns true when add the second member.
	// The reason is that only one is not interesting for us,
	// but more than two will flood the caller.
	bool Associate2nd ( uint64_t uTree )
	{
		if ( Exists ( uTree ) )
			return false;
		Add ( 0, uTree );
		return GetLength()==2;
	}

	// merge with another similar
	// (set union; duplicates are filtered by Associate2nd)
	void Merge ( const Associations_t& parents )
	{
		parents.IterateStart();
		while ( parents.IterateNext() )
			Associate2nd ( parents.IterateGetKey() );
	}
};
2054
2055 // associate set of nodes, common bitmask for these nodes,
2056 // and gives the < to compare different pairs
// Associates a set of nodes with a common bitmask, and provides operator<
// so candidate subtrees can be ordered by extraction priority.
class BitAssociation_t
{
private:
	const Associations_t * m_pAssociations; // non-owned node set
	mutable int m_iBits;                    // cached popcount of m_uMask (lazily computed)

	// The key method of subtree selection.
	// Most 'heavy' subtrees will be extracted first.
	inline int GetWeight() const
	{
		assert ( m_pAssociations );
		int iNodes = m_pAssociations->GetLength();
		// lazily popcount the mask on first use; cached in mutable m_iBits
		if ( m_iBits==0 && m_uMask!=0 )
		{
			for ( uint64_t dMask = m_uMask; dMask; dMask >>=1 )
				m_iBits += (int)( dMask & 1 );
		}

		// current working formula is num_nodes^2 * num_hits
		return iNodes * iNodes * m_iBits;
	}

public:
	uint64_t m_uMask;

	BitAssociation_t()
		: m_pAssociations ( NULL )
		, m_iBits ( 0 )
		, m_uMask ( 0 )
	{}

	// bind to a mask and node set; weight cache is reset and recomputed lazily
	void Init ( uint64_t uMask, const Associations_t* dNodes )
	{
		m_uMask = uMask;
		m_pAssociations = dNodes;
		m_iBits = 0;
	}

	bool operator< (const BitAssociation_t& second) const
	{
		return GetWeight() < second.GetWeight();
	}
};
2100
2101 // for pairs of values builds and stores the association "key -> list of values"
// For pairs of values, builds and stores the association "key -> set of
// tree hashes", counting how many keys became non-unique (m_iBits).
class CAssociations_t
	: public CSphOrderedHash < Associations_t, uint64_t, IdentityHash_fn, 128 >
{
	int m_iBits; // number of non-unique associations
public:

	CAssociations_t() : m_iBits ( 0 ) {}

	// Add the given pTree into the list of pTrees, associated with given uHash
	// Returns the updated non-unique count (used by callers as an overflow cap).
	int Associate ( XQNode_t * pTree, uint64_t uHash )
	{
		if ( !Exists ( uHash ) )
			Add ( Associations_t(), uHash );
		// bump the counter only when this key just gained its second member
		if ( operator[]( uHash ).Associate2nd ( pTree->GetHash() ) )
			m_iBits++;
		return m_iBits;
	}

	// merge the existing association of uHash with given chain
	void MergeAssociations ( const Associations_t & chain, uint64_t uHash )
	{
		if ( !Exists ( uHash ) )
			Add ( chain, uHash );
		else
			operator[]( uHash ).Merge ( chain );
	}

	inline int GetBits() const { return m_iBits; }
};
2131
2132 // The main class for working with common subtrees
2133 class RevealCommon_t : ISphNoncopyable
2134 {
2135 private:
2136 static const int MAX_MULTINODES = 64;
2137 CSphVector<BitMask_t> m_dBitmasks; // all bitmasks for all the nodes
2138 CSphVector<uint64_t> m_dSubQueries; // final vector with roadmap for tree division.
2139 CAssociations_t m_hNodes; // initial accumulator for nodes
2140 CAssociations_t m_hInterSections; // initial accumulator for nodes
2141 CDwordHash m_hBitOrders; // order numbers for found common subnodes
2142 XQOperator_e m_eOp; // my operator which I process
2143
2144 private:
2145
2146 // returns the order for given uHash (if any).
GetBitOrder(uint64_t uHash) const2147 inline int GetBitOrder ( uint64_t uHash ) const
2148 {
2149 if ( !m_hBitOrders.Exists ( uHash ) )
2150 return -1;
2151 return m_hBitOrders[uHash];
2152 }
2153
2154 // recursively scans the whole tree and builds the maps
2155 // where a list of parents associated with every "leaf" nodes (i.e. with children)
BuildAssociations(XQNode_t * pTree)2156 bool BuildAssociations ( XQNode_t * pTree )
2157 {
2158 if ( IsAppropriate ( pTree ) )
2159 {
2160 ARRAY_FOREACH ( i, pTree->m_dChildren )
2161 if ( ( !BuildAssociations ( pTree->m_dChildren[i] ) )
2162 || ( ( m_eOp==pTree->GetOp() )
2163 && ( m_hNodes.Associate ( pTree, pTree->m_dChildren[i]->GetHash() )>=MAX_MULTINODES ) ) )
2164 {
2165 return false;
2166 }
2167 }
2168 return true;
2169 }
2170
2171 // Find all leafs, non-unique across the tree,
2172 // and associate the order number with every of them
CalcCommonNodes()2173 bool CalcCommonNodes ()
2174 {
2175 if ( !m_hNodes.GetBits() )
2176 return false; // there is totally no non-unique leaves
2177 int iBit = 0;
2178 m_hNodes.IterateStart();
2179 while ( m_hNodes.IterateNext() )
2180 if ( m_hNodes.IterateGet().GetLength() > 1 )
2181 m_hBitOrders.Add ( iBit++, m_hNodes.IterateGetKey() );
2182 assert ( m_hNodes.GetBits()==m_hBitOrders.GetLength() );
2183 m_hNodes.Reset(); ///< since from now we don't need this data anymore
2184 return true;
2185 }
2186
2187 // recursively builds for every node the bitmaks
2188 // of common nodes it has as children
BuildBitmasks(XQNode_t * pTree)2189 void BuildBitmasks ( XQNode_t * pTree )
2190 {
2191 if ( !IsAppropriate ( pTree ) )
2192 return;
2193
2194 if ( m_eOp==pTree->GetOp() )
2195 {
2196 // calculate the bitmask
2197 int iOrder;
2198 uint64_t dMask = 0;
2199 ARRAY_FOREACH ( i, pTree->m_dChildren )
2200 {
2201 iOrder = GetBitOrder ( pTree->m_dChildren[i]->GetHash() );
2202 if ( iOrder>=0 )
2203 dMask |= 1ull << iOrder;
2204 }
2205
2206 // add the bitmask into the array
2207 if ( dMask )
2208 m_dBitmasks.Add().Init( pTree, dMask );
2209 }
2210
2211 // recursively process all the children
2212 ARRAY_FOREACH ( i, pTree->m_dChildren )
2213 BuildBitmasks ( pTree->m_dChildren[i] );
2214 }
2215
2216 // Collect all possible intersections of Bitmasks.
2217 // For every non-zero intersection we collect the list of trees which contain it.
CalcIntersections()2218 void CalcIntersections ()
2219 {
2220 // Round 1. Intersect all content of bitmasks one-by-one.
2221 ARRAY_FOREACH ( i, m_dBitmasks )
2222 for ( int j = i+1; j<m_dBitmasks.GetLength(); j++ )
2223 {
2224 // intersect one-by-one and group (grouping is done by nature of a hash)
2225 uint64_t uMask = m_dBitmasks[i].GetMask() & m_dBitmasks[j].GetMask();
2226 if ( uMask )
2227 {
2228 m_hInterSections.Associate ( m_dBitmasks[i].GetTree(), uMask );
2229 m_hInterSections.Associate ( m_dBitmasks[j].GetTree(), uMask );
2230 }
2231 }
2232
2233 // Round 2. Intersect again all collected intersection one-by-one - until zero.
2234 void *p1=NULL, *p2;
2235 uint64_t uMask1, uMask2;
2236 while ( m_hInterSections.IterateNext ( &p1 ) )
2237 {
2238 p2 = p1;
2239 while ( m_hInterSections.IterateNext ( &p2 ) )
2240 {
2241 uMask1 = CAssociations_t::IterateGetKey ( &p1 );
2242 uMask2 = CAssociations_t::IterateGetKey ( &p2 );
2243 assert ( uMask1!=uMask2 );
2244 uMask1 &= uMask2;
2245 if ( uMask1 )
2246 {
2247 m_hInterSections.MergeAssociations ( CAssociations_t::IterateGet ( &p1 ), uMask1 );
2248 m_hInterSections.MergeAssociations ( CAssociations_t::IterateGet ( &p2 ), uMask1 );
2249 }
2250 }
2251 }
2252 }
2253
2254 // create the final kit of common-subsets
2255 // which we will actually reveal (extract) from original trees
MakeQueries()2256 void MakeQueries()
2257 {
2258 CSphVector<BitAssociation_t> dSubnodes; // masks for our selected subnodes
2259 dSubnodes.Reserve ( m_hInterSections.GetLength() );
2260 m_hInterSections.IterateStart();
2261 while ( m_hInterSections.IterateNext() )
2262 dSubnodes.Add().Init( m_hInterSections.IterateGetKey(), &m_hInterSections.IterateGet() );
2263
2264 // sort by weight descending (weight sorting is hold by operator <)
2265 dSubnodes.RSort();
2266 m_dSubQueries.Reset();
2267
2268 // make the final subtrees vector: get one-by-one from the beginning,
2269 // intresect with all the next and throw out zeros.
2270 // The final subqueries will not be intersected between each other.
2271 int j;
2272 uint64_t uMask;
2273 ARRAY_FOREACH ( i, dSubnodes )
2274 {
2275 uMask = dSubnodes[i].m_uMask;
2276 m_dSubQueries.Add ( uMask );
2277 j = i+1;
2278 while ( j < dSubnodes.GetLength() )
2279 {
2280 if ( !( dSubnodes[j].m_uMask &= ~uMask ) )
2281 dSubnodes.Remove(j);
2282 else
2283 j++;
2284 }
2285 }
2286 }
2287
2288 // Now we finally extract the common subtrees from original tree
2289 // and (recursively) from it's children
Reorganize(XQNode_t * pTree)2290 void Reorganize ( XQNode_t * pTree )
2291 {
2292 if ( !IsAppropriate ( pTree ) )
2293 return;
2294
2295 if ( m_eOp==pTree->GetOp() )
2296 {
2297 // pBranch is for common subset of children, pOtherChildren is for the rest.
2298 CSphOrderedHash < XQNode_t*, int, IdentityHash_fn, 64 > hBranches;
2299 XQNode_t * pOtherChildren = NULL;
2300 int iBit;
2301 int iOptimizations = 0;
2302 ARRAY_FOREACH ( i, pTree->m_dChildren )
2303 {
2304 iBit = GetBitOrder ( pTree->m_dChildren[i]->GetHash() );
2305
2306 // works only with children which are actually common with somebody else
2307 if ( iBit>=0 )
2308 {
2309 // since subqueries doesn't intersected between each other,
2310 // the first hit we found in this loop is exactly what we searched.
2311 ARRAY_FOREACH ( j, m_dSubQueries )
2312 if ( ( 1ull << iBit ) & m_dSubQueries[j] )
2313 {
2314 XQNode_t * pNode;
2315 if ( !hBranches.Exists(j) )
2316 {
2317 pNode = new XQNode_t ( pTree->m_dSpec );
2318 pNode->SetOp ( m_eOp, pTree->m_dChildren[i] );
2319 hBranches.Add ( pNode, j );
2320 } else
2321 {
2322 pNode = hBranches[j];
2323 pNode->m_dChildren.Add ( pTree->m_dChildren[i] );
2324
2325 // Count essential subtrees (with at least 2 children)
2326 if ( pNode->m_dChildren.GetLength()==2 )
2327 iOptimizations++;
2328 }
2329 break;
2330 }
2331 // another nodes add to the set of "other" children
2332 } else
2333 {
2334 if ( !pOtherChildren )
2335 {
2336 pOtherChildren = new XQNode_t ( pTree->m_dSpec );
2337 pOtherChildren->SetOp ( m_eOp, pTree->m_dChildren[i] );
2338 } else
2339 pOtherChildren->m_dChildren.Add ( pTree->m_dChildren[i] );
2340 }
2341 }
2342
2343 // we don't reorganize explicit simple case - as no "others" and only one common.
2344 // Also reject optimization if there is nothing to optimize.
2345 if ( ( iOptimizations==0 )
2346 | ( !pOtherChildren && ( hBranches.GetLength()==1 ) ) )
2347 {
2348 if ( pOtherChildren )
2349 pOtherChildren->m_dChildren.Reset();
2350 hBranches.IterateStart();
2351 while ( hBranches.IterateNext() )
2352 {
2353 assert ( hBranches.IterateGet() );
2354 hBranches.IterateGet()->m_dChildren.Reset();
2355 SafeDelete ( hBranches.IterateGet() );
2356 }
2357 } else
2358 {
2359 // reorganize the tree: replace the common subset to explicit node with
2360 // only common members inside. This will give the the possibility
2361 // to cache the node.
2362 pTree->m_dChildren.Reset();
2363 if ( pOtherChildren )
2364 pTree->m_dChildren.SwapData ( pOtherChildren->m_dChildren );
2365
2366 hBranches.IterateStart();
2367 while ( hBranches.IterateNext() )
2368 {
2369 if ( hBranches.IterateGet()->m_dChildren.GetLength()==1 )
2370 {
2371 pTree->m_dChildren.Add ( hBranches.IterateGet()->m_dChildren[0] );
2372 hBranches.IterateGet()->m_dChildren.Reset();
2373 SafeDelete ( hBranches.IterateGet() );
2374 } else
2375 pTree->m_dChildren.Add ( hBranches.IterateGet() );
2376 }
2377 }
2378 SafeDelete ( pOtherChildren );
2379 }
2380
2381 // recursively process all the children
2382 ARRAY_FOREACH ( i, pTree->m_dChildren )
2383 Reorganize ( pTree->m_dChildren[i] );
2384 }
2385
2386 public:
RevealCommon_t(XQOperator_e eOp)2387 explicit RevealCommon_t ( XQOperator_e eOp )
2388 : m_eOp ( eOp )
2389 {}
2390
2391 // actual method for processing tree and reveal (extract) common subtrees
Transform(int iXQ,const XQQuery_t * pXQ)2392 void Transform ( int iXQ, const XQQuery_t * pXQ )
2393 {
2394 // collect all non-unique nodes
2395 for ( int i=0; i<iXQ; i++ )
2396 if ( !BuildAssociations ( pXQ[i].m_pRoot ) )
2397 return;
2398
2399 // count and order all non-unique nodes
2400 if ( !CalcCommonNodes() )
2401 return;
2402
2403 // create and collect bitmask for every node
2404 for ( int i=0; i<iXQ; i++ )
2405 BuildBitmasks ( pXQ[i].m_pRoot );
2406
2407 // intersect all bitmasks one-by-one, and also intersect all intersections
2408 CalcIntersections();
2409
2410 // the die-hard: actually select the set of subtrees which we'll process
2411 MakeQueries();
2412
2413 // ... and finally - process all our trees.
2414 for ( int i=0; i<iXQ; i++ )
2415 Reorganize ( pXQ[i].m_pRoot );
2416 }
2417 };
2418
2419
2420 struct MarkedNode_t
2421 {
2422 int m_iCounter;
2423 XQNode_t * m_pTree;
2424 bool m_bMarked;
2425 int m_iOrder;
2426
MarkedNode_tMarkedNode_t2427 explicit MarkedNode_t ( XQNode_t * pTree=NULL )
2428 : m_iCounter ( 1 )
2429 , m_pTree ( pTree )
2430 , m_bMarked ( false )
2431 , m_iOrder ( 0 )
2432 {}
2433
MarkItMarkedNode_t2434 void MarkIt ( bool bMark )
2435 {
2436 // mark
2437 if ( bMark )
2438 {
2439 m_iCounter++;
2440 m_bMarked = true;
2441 return;
2442 }
2443
2444 // unmark
2445 if ( m_bMarked && m_iCounter>1 )
2446 m_iCounter--;
2447 if ( m_iCounter<2 )
2448 m_bMarked = false;
2449 }
2450 };
2451
2452 typedef CSphOrderedHash < MarkedNode_t, uint64_t, IdentityHash_fn, 128 > CSubtreeHash;
2453
// Deep comparator for query subtrees whose hashes match.
// Used to rule out hash collisions before flagging a subtree as common.
struct XqTreeComparator_t
{
	// scratch keyword lists collected from both trees; reused across calls
	CSphVector<const XQKeyword_t *> m_dTerms1;
	CSphVector<const XQKeyword_t *> m_dTerms2;

	// full equality check: structure walk plus keyword/position comparison
	bool IsEqual ( const XQNode_t * pNode1, const XQNode_t * pNode2 );
	// recursive structural compare; collects keywords into the scratch lists
	bool CheckCollectTerms ( const XQNode_t * pNode1, const XQNode_t * pNode2 );
};
2462
/// check hashes, then check subtrees, then flag
/// bFlag: whether this node is still eligible to be flagged as a common root
/// bMarkIt: whether unseen nodes should be registered (vs unmarked) in the hash
static void FlagCommonSubtrees ( XQNode_t * pTree, CSubtreeHash & hSubTrees, XqTreeComparator_t & tCmp, bool bFlag, bool bMarkIt )
{
	if ( !IsAppropriate ( pTree ) )
		return;

	// we do not yet have any collisions stats,
	// but chances are we don't actually need IsEqualTo() at all
	uint64_t iHash = pTree->GetHash();
	if ( bFlag && hSubTrees.Exists ( iHash ) && tCmp.IsEqual ( hSubTrees [ iHash ].m_pTree, pTree ) )
	{
		// seen before and deep-equal: bump the refcount and flag as common
		hSubTrees[iHash].MarkIt ( true );

		// we just add all the children but do NOT mark them as common
		// so that only the subtree root is marked.
		// also we unmark all the cases which were eaten by bigger trees
		ARRAY_FOREACH ( i, pTree->m_dChildren )
			if ( !hSubTrees.Exists ( pTree->m_dChildren[i]->GetHash() ) )
				FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, false, bMarkIt );
			else
				FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, false, false );
	} else
	{
		// either first sighting, or swallowed by a bigger common tree:
		// register the node (bMarkIt) or take one occurrence back (!bMarkIt)
		if ( !bMarkIt )
			hSubTrees[iHash].MarkIt ( false );
		else
			hSubTrees.Add ( MarkedNode_t ( pTree ), iHash );

		ARRAY_FOREACH ( i, pTree->m_dChildren )
			FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, bFlag, bMarkIt );
	}
}
2495
2496
SignCommonSubtrees(XQNode_t * pTree,const CSubtreeHash & hSubTrees)2497 static void SignCommonSubtrees ( XQNode_t * pTree, const CSubtreeHash & hSubTrees )
2498 {
2499 if ( !pTree )
2500 return;
2501
2502 uint64_t iHash = pTree->GetHash();
2503 const MarkedNode_t * pCommon = hSubTrees ( iHash );
2504 if ( pCommon && pCommon->m_bMarked )
2505 pTree->TagAsCommon ( pCommon->m_iOrder, pCommon->m_iCounter );
2506
2507 ARRAY_FOREACH ( i, pTree->m_dChildren )
2508 SignCommonSubtrees ( pTree->m_dChildren[i], hSubTrees );
2509 }
2510
2511
sphMarkCommonSubtrees(int iXQ,const XQQuery_t * pXQ)2512 int sphMarkCommonSubtrees ( int iXQ, const XQQuery_t * pXQ )
2513 {
2514 if ( iXQ<=0 || !pXQ )
2515 return 0;
2516
2517 { // Optional reorganize tree to extract common parts
2518 RevealCommon_t ( SPH_QUERY_AND ).Transform ( iXQ, pXQ );
2519 RevealCommon_t ( SPH_QUERY_OR ).Transform ( iXQ, pXQ );
2520 }
2521
2522 // flag common subtrees and refcount them
2523 XqTreeComparator_t tCmp;
2524 CSubtreeHash hSubtrees;
2525 for ( int i=0; i<iXQ; i++ )
2526 FlagCommonSubtrees ( pXQ[i].m_pRoot, hSubtrees, tCmp, true, true );
2527
2528 // number marked subtrees and assign them order numbers.
2529 int iOrder = 0;
2530 hSubtrees.IterateStart();
2531 while ( hSubtrees.IterateNext() )
2532 if ( hSubtrees.IterateGet().m_bMarked )
2533 hSubtrees.IterateGet().m_iOrder = iOrder++;
2534
2535 // copy the flags and orders to original trees
2536 for ( int i=0; i<iXQ; i++ )
2537 SignCommonSubtrees ( pXQ[i].m_pRoot, hSubtrees );
2538
2539 return iOrder;
2540 }
2541
2542 struct CmpTermQPos_fn
2543 {
IsLessCmpTermQPos_fn2544 inline bool IsLess ( const XQKeyword_t * pA, const XQKeyword_t * pB ) const
2545 {
2546 assert ( pA && pB );
2547 return ( pA->m_iAtomPos<pB->m_iAtomPos );
2548 }
2549 };
2550
IsEqual(const XQNode_t * pNode1,const XQNode_t * pNode2)2551 bool XqTreeComparator_t::IsEqual ( const XQNode_t * pNode1, const XQNode_t * pNode2 )
2552 {
2553 // early out check to skip allocations
2554 if ( !pNode1 || !pNode2 || pNode1->GetHash()!=pNode2->GetHash() || pNode1->GetOp()!=pNode2->GetOp () )
2555 return false;
2556
2557 // need reset data from previous compare
2558 m_dTerms1.Resize ( 0 );
2559 m_dTerms2.Resize ( 0 );
2560 // need reserve some space for first compare
2561 m_dTerms1.Reserve ( 64 );
2562 m_dTerms2.Reserve ( 64 );
2563
2564 if ( !CheckCollectTerms ( pNode1, pNode2 ) )
2565 return false;
2566
2567 assert ( m_dTerms1.GetLength ()==m_dTerms2.GetLength () );
2568
2569 if ( !m_dTerms1.GetLength() )
2570 return true;
2571
2572 m_dTerms1.Sort ( CmpTermQPos_fn() );
2573 m_dTerms2.Sort ( CmpTermQPos_fn() );
2574
2575 if ( m_dTerms1[0]->m_sWord!=m_dTerms2[0]->m_sWord )
2576 return false;
2577
2578 for ( int i=1; i<m_dTerms1.GetLength(); i++ )
2579 {
2580 int iDelta1 = m_dTerms1[i]->m_iAtomPos - m_dTerms1[i-1]->m_iAtomPos;
2581 int iDelta2 = m_dTerms2[i]->m_iAtomPos - m_dTerms2[i-1]->m_iAtomPos;
2582 if ( iDelta1!=iDelta2 || m_dTerms1[i]->m_sWord!=m_dTerms2[i]->m_sWord )
2583 return false;
2584 }
2585
2586 return true;
2587 }
2588
CheckCollectTerms(const XQNode_t * pNode1,const XQNode_t * pNode2)2589 bool XqTreeComparator_t::CheckCollectTerms ( const XQNode_t * pNode1, const XQNode_t * pNode2 )
2590 {
2591 // early out
2592 if ( !pNode1 || !pNode2
2593 || pNode1->GetHash ()!=pNode2->GetHash () || pNode1->GetOp ()!=pNode2->GetOp ()
2594 || pNode1->m_dWords.GetLength ()!=pNode2->m_dWords.GetLength ()
2595 || pNode1->m_dChildren.GetLength ()!=pNode2->m_dChildren.GetLength () )
2596 return false;
2597
2598 // for plain nodes compare keywords
2599 ARRAY_FOREACH ( i, pNode1->m_dWords )
2600 m_dTerms1.Add ( pNode1->m_dWords.Begin() + i );
2601
2602 ARRAY_FOREACH ( i, pNode2->m_dWords )
2603 m_dTerms2.Add ( pNode2->m_dWords.Begin () + i );
2604
2605 // for non-plain nodes compare children
2606 ARRAY_FOREACH ( i, pNode1->m_dChildren )
2607 {
2608 if ( !CheckCollectTerms ( pNode1->m_dChildren[i], pNode2->m_dChildren[i] ) )
2609 return false;
2610 }
2611
2612 return true;
2613 }
2614
2615
// Query-tree optimizer: repeatedly applies a set of pattern-based rewrites
// (common NOT extraction, common sub-term extraction, phrase merging, etc.)
// until no transformation fires anymore. See Transform().
class CSphTransformation : public ISphNoncopyable
{
public:
	CSphTransformation ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords );
	void Transform ();
	inline void Dump ( const XQNode_t * pNode, const char * sHeader = "" );

private:

	// two-level grouping of candidate nodes: outer key is the "group" node hash,
	// inner key is the "subgroup" fuzzy hash; values are the similar nodes
	typedef CSphOrderedHash < CSphVector<XQNode_t*>, uint64_t, IdentityHash_fn, 32> HashSimilar_t;
	CSphOrderedHash < HashSimilar_t, uint64_t, IdentityHash_fn, 256 > m_hSimilar;
	CSphVector<XQNode_t *> m_dRelatedNodes;		// siblings of the matched nodes, filled by CollectRelatedNodes()
	const ISphKeywordsStat * m_pKeywords;		// dictionary stats, used for cost estimation in SetCosts()
	XQNode_t ** m_ppRoot;						// address of the root pointer (rewrites may replace the root)
	typedef bool ( *Checker_fn ) ( const XQNode_t * );

private:

	void Dump ();
	void SetCosts ( XQNode_t * pNode, const CSphVector<XQNode_t *> & dNodes );
	int GetWeakestIndex ( const CSphVector<XQNode_t *> & dNodes );

	template < typename Group, typename SubGroup >
	inline void TreeCollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker );

	template < typename Group, typename SubGroup >
	inline bool CollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker );

	template < typename Excluder, typename Parenter >
	inline bool CollectRelatedNodes ( const CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !N) | (B !N)) -> ((A|B) !N)
	static bool CheckCommonNot ( const XQNode_t * pNode );
	bool TransformCommonNot ();
	bool MakeTransformCommonNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !(N AA)) | (B !(N BB))) -> (((A|B) !N) | (A !AA) | (B !BB)) [ if cost(N) > cost(A) + cost(B) ]
	static bool CheckCommonCompoundNot ( const XQNode_t * pNode );
	bool TransformCommonCompoundNot ();
	bool MakeTransformCommonCompoundNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A (X | AA)) | (B (X | BB))) -> (((A|B) X) | (A AA) | (B BB)) [ if cost(X) > cost(A) + cost(B) ]
	static bool CheckCommonSubTerm ( const XQNode_t * pNode );
	bool TransformCommonSubTerm ();
	void MakeTransformCommonSubTerm ( CSphVector<XQNode_t *> & dX );

	// (A | "A B"~N) -> A ; ("A B" | "A B C") -> "A B" ; ("A B"~N | "A B C"~N) -> ("A B"~N)
	static bool CheckCommonKeywords ( const XQNode_t * pNode );
	bool TransformCommonKeywords ();

	// ("X A B" | "Y A B") -> (("X|Y") "A B")
	// ("A B X" | "A B Y") -> (("X|Y") "A B")
	static bool CheckCommonPhrase ( const XQNode_t * pNode );
	bool TransformCommonPhrase ();
	void MakeTransformCommonPhrase ( CSphVector<XQNode_t *> & dCommonNodes, int iCommonLen, bool bHeadIsCommon );

	// ((A !X) | (A !Y) | (A !Z)) -> (A !(X Y Z))
	static bool CheckCommonAndNotFactor ( const XQNode_t * pNode );
	bool TransformCommonAndNotFactor ();
	bool MakeTransformCommonAndNotFactor ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !(N | N1)) | (B !(N | N2))) -> (( (A !N1) | (B !N2) ) !N)
	static bool CheckCommonOrNot ( const XQNode_t * pNode );
	bool TransformCommonOrNot ();
	bool MakeTransformCommonOrNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// The main goal of transformations below is tree clarification and
	// further applying of standard transformations above.

	// "hung" operand ( AND(OR) node with only 1 child ) appears after an internal transformation
	static bool CheckHungOperand ( const XQNode_t * pNode );
	bool TransformHungOperand ();

	// ((A | B) | C) -> ( A | B | C )
	// ((A B) C) -> ( A B C )
	static bool CheckExcessBrackets ( const XQNode_t * pNode );
	bool TransformExcessBrackets ();

	// ((A !N1) !N2) -> (A !(N1 | N2))
	static bool CheckExcessAndNot ( const XQNode_t * pNode );
	bool TransformExcessAndNot ();

private:
	static const uint64_t CONST_GROUP_FACTOR;

	// Grouping policies for TreeCollectInfo: By() yields a hash key,
	// From() yields the node itself (or an ancestor) used as the group id.
	// NOTE(review): the ancestor-walking policies below dereference m_pParent
	// chains without null checks; the matching Check* predicate is expected to
	// have verified the required ancestors exist - confirm at each call site.
	struct NullNode
	{
		static inline uint64_t By ( XQNode_t * ) { return CONST_GROUP_FACTOR; } // NOLINT
		static inline const XQNode_t * From ( const XQNode_t * ) { return NULL; } // NOLINT
	};

	struct CurrentNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p; }
	};

	struct ParentNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent; }
	};

	struct GrandNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent; }
	};

	struct Grand2Node {
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent; }
	};

	struct Grand3Node
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->m_pParent; }
	};
};
2736
2737
// ctor only stores the root pointer address and the keyword-stats provider;
// all the actual work happens in Transform()
CSphTransformation::CSphTransformation ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords )
	: m_pKeywords ( pKeywords )
	, m_ppRoot ( ppRoot )
{
	assert ( m_pKeywords!=NULL );
}



// shared group key returned by NullNode::By(): all such nodes land in one bucket
const uint64_t CSphTransformation::CONST_GROUP_FACTOR = 0;
2748
2749 template < typename Group, typename SubGroup >
TreeCollectInfo(XQNode_t * pParent,Checker_fn pfnChecker)2750 void CSphTransformation::TreeCollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker )
2751 {
2752 if ( pParent )
2753 {
2754 if ( pfnChecker ( pParent ) )
2755 {
2756 // "Similar nodes" are nodes which are suited to a template (like 'COMMON NOT', 'COMMON COMPOND NOT', ...)
2757 uint64_t uGroup = (uint64_t)Group::From ( pParent );
2758 uint64_t uSubGroup = SubGroup::By ( pParent );
2759
2760 HashSimilar_t & hGroup = m_hSimilar.AddUnique ( uGroup );
2761 hGroup.AddUnique ( uSubGroup ).Add ( pParent );
2762 }
2763
2764 ARRAY_FOREACH ( iChild, pParent->m_dChildren )
2765 TreeCollectInfo<Group, SubGroup> ( pParent->m_dChildren[iChild], pfnChecker );
2766 }
2767 }
2768
2769
2770 template < typename Group, typename SubGroup >
CollectInfo(XQNode_t * pParent,Checker_fn pfnChecker)2771 bool CSphTransformation::CollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker )
2772 {
2773 ( *m_ppRoot )->Check ( true );
2774 m_hSimilar.Reset();
2775
2776 TreeCollectInfo<Group, SubGroup> ( pParent, pfnChecker );
2777 return ( m_hSimilar.GetLength()>0 );
2778 }
2779
2780
// Estimate the "cost" (total keyword document frequency) of pNode and dNodes,
// storing the per-node totals into XQNode_t::m_iUser.
// NOTE: dChildren is intentionally appended to while being iterated - this is
// a breadth-first expansion that relies on ARRAY_FOREACH re-reading the length
// every iteration; do not convert to a cached-length loop.
void CSphTransformation::SetCosts ( XQNode_t * pNode, const CSphVector<XQNode_t *> & dNodes )
{
	assert ( pNode || dNodes.GetLength() );

	// seed the worklist with all the roots, resetting their cost accumulators
	CSphVector<XQNode_t*> dChildren ( dNodes.GetLength() + 1 );
	dChildren[dNodes.GetLength()] = pNode;
	ARRAY_FOREACH ( i, dNodes )
	{
		dChildren[i] = dNodes[i];
		dChildren[i]->m_iUser = 0;
	}

	// collect unknown keywords from all children
	CSphVector<CSphKeywordInfo> dKeywords;
	SmallStringHash_T<int> hCosts;
	ARRAY_FOREACH ( i, dChildren )
	{
		XQNode_t * pChild = dChildren[i];
		ARRAY_FOREACH ( j, pChild->m_dChildren )
		{
			dChildren.Add ( pChild->m_dChildren[j] );
			dChildren.Last()->m_iUser = 0;
			assert ( dChildren.Last()->m_pParent==pChild );
		}
		ARRAY_FOREACH ( j, pChild->m_dWords )
		{
			const CSphString & sWord = pChild->m_dWords[j].m_sWord;
			int * pCost = hCosts ( sWord );
			if ( !pCost )
			{
				// first sighting of this keyword: queue it for a dictionary lookup
				Verify ( hCosts.Add ( 0, sWord ) );
				dKeywords.Add();
				dKeywords.Last().m_sTokenized = sWord;
				dKeywords.Last().m_iDocs = 0;
			}
		}
	}

	// get keywords info from index dictionary
	if ( dKeywords.GetLength() )
	{
		m_pKeywords->FillKeywords ( dKeywords );
		ARRAY_FOREACH ( i, dKeywords )
		{
			const CSphKeywordInfo & tKeyword = dKeywords[i];
			hCosts[tKeyword.m_sTokenized] = tKeyword.m_iDocs;
		}
	}

	// propagate cost bottom-up (from children to parents);
	// dChildren is in top-down discovery order, so walking it backwards
	// guarantees every child is summed before its parent is read
	for ( int i=dChildren.GetLength()-1; i>=0; i-- )
	{
		XQNode_t * pChild = dChildren[i];
		int iCost = 0;
		ARRAY_FOREACH ( j, pChild->m_dWords )
			iCost += hCosts [ pChild->m_dWords[j].m_sWord ];

		pChild->m_iUser += iCost;
		if ( pChild->m_pParent )
			pChild->m_pParent->m_iUser += pChild->m_iUser;
	}
}
2843
2844
2845 template < typename Excluder, typename Parenter >
CollectRelatedNodes(const CSphVector<XQNode_t * > & dSimilarNodes)2846 bool CSphTransformation::CollectRelatedNodes ( const CSphVector<XQNode_t *> & dSimilarNodes )
2847 {
2848 m_dRelatedNodes.Resize ( 0 );
2849 ARRAY_FOREACH ( i, dSimilarNodes )
2850 {
2851 // Eval node that should be excluded
2852 const XQNode_t * pExclude = Excluder::From ( dSimilarNodes[i] );
2853
2854 // Eval node that points to related nodes
2855 const XQNode_t * pParent = Parenter::From ( dSimilarNodes[i] );
2856
2857 assert ( &pParent->m_dChildren!=&m_dRelatedNodes );
2858 ARRAY_FOREACH ( j, pParent->m_dChildren )
2859 {
2860 if ( pParent->m_dChildren[j]!=pExclude )
2861 m_dRelatedNodes.Add ( pParent->m_dChildren[j] );
2862 }
2863 }
2864 return ( m_dRelatedNodes.GetLength()>1 );
2865 }
2866
2867
CheckCommonNot(const XQNode_t * pNode)2868 bool CSphTransformation::CheckCommonNot ( const XQNode_t * pNode )
2869 {
2870 if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
2871 pNode->m_pParent->GetOp()!=SPH_QUERY_NOT || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
2872 pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
2873 {
2874 //
2875 // NOLINT // NOT:
2876 // NOLINT // _______ OR (gGOr) ___________
2877 // NOLINT // / | |
2878 // NOLINT // ... AND NOT (grandAndNot) ...
2879 // NOLINT // / |
2880 // NOLINT // relatedNode NOT (parentNot)
2881 // NOLINT // |
2882 // NOLINT // pNode
2883 //
2884 return false;
2885 }
2886 return true;
2887 }
2888
2889
TransformCommonNot()2890 bool CSphTransformation::TransformCommonNot ()
2891 {
2892 bool bRecollect = false;
2893 m_hSimilar.IterateStart();
2894 while ( m_hSimilar.IterateNext () )
2895 {
2896 m_hSimilar.IterateGet().IterateStart();
2897 while ( m_hSimilar.IterateGet().IterateNext() )
2898 {
2899 // Nodes with the same iFuzzyHash
2900 CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
2901 if ( dSimilarNodes.GetLength()<2 )
2902 continue;
2903
2904 if ( CollectRelatedNodes < ParentNode, GrandNode > ( dSimilarNodes ) && MakeTransformCommonNot ( dSimilarNodes ) )
2905 {
2906 bRecollect = true;
2907 // Don't make transformation for other nodes from the same OR-node,
2908 // because query tree was changed and further transformations
2909 // might be invalid.
2910 break;
2911 }
2912 }
2913 }
2914
2915 return bRecollect;
2916 }
2917
2918
GetWeakestIndex(const CSphVector<XQNode_t * > & dNodes)2919 int CSphTransformation::GetWeakestIndex ( const CSphVector<XQNode_t *> & dNodes )
2920 {
2921 // Returns index of weakest node from the equal.
2922 // The example of equal nodes:
2923 // "aaa bbb" (PHRASE), "aaa bbb"~10 (PROXIMITY), "aaa bbb"~20 (PROXIMITY)
2924 // Such nodes have the same magic hash value.
2925 // The weakest is "aaa bbb"~20
2926
2927 int iWeakestIndex = 0;
2928 int iProximity = -1;
2929
2930 ARRAY_FOREACH ( i, dNodes )
2931 {
2932 XQNode_t * pNode = dNodes[i];
2933 if ( pNode->GetOp()==SPH_QUERY_PROXIMITY && pNode->m_iOpArg>iProximity )
2934 {
2935 iProximity = pNode->m_iOpArg;
2936 iWeakestIndex = i;
2937 }
2938 }
2939 return iWeakestIndex;
2940 }
2941
2942
// Actually rewrite ((A !N) | (B !N) | ...) into ((A|B|...) !N).
// dSimilarNodes holds the N instances; m_dRelatedNodes holds the matching
// A/B siblings. NOTE(review): the same index is used across both vectors -
// this relies on CollectRelatedNodes<ParentNode,GrandNode> producing exactly
// one related node per similar node (each ANDNOT has two children); the
// assert below guards that parallelism.
bool CSphTransformation::MakeTransformCommonNot ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );

	// the weakest node is new parent of transformed expression
	XQNode_t * pWeakestAndNot = m_dRelatedNodes[iWeakestIndex]->m_pParent;
	assert ( pWeakestAndNot->m_dChildren[0]==m_dRelatedNodes[iWeakestIndex] );
	XQNode_t * pCommonOr = pWeakestAndNot->m_pParent;
	assert ( pCommonOr->GetOp()==SPH_QUERY_OR && pCommonOr->m_dChildren.Contains ( pWeakestAndNot ) );
	XQNode_t * pGrandCommonOr = pCommonOr->m_pParent;

	// if the OR has extra children beyond the matched ANDNOTs, it must survive
	bool bKeepOr = ( pCommonOr->m_dChildren.GetLength()>2 );

	// reset ownership of related nodes
	ARRAY_FOREACH ( i, m_dRelatedNodes )
	{
		XQNode_t * pAnd = m_dRelatedNodes[i];
		XQNode_t * pAndNot = pAnd->m_pParent;
		assert ( pAndNot->m_pParent==pCommonOr );

		if ( i!=iWeakestIndex )
		{
			// detach the A/B part; its old ANDNOT wrapper is deleted right away
			// when the OR survives, or later together with pCommonOr otherwise
			Verify ( pAndNot->m_dChildren.RemoveValue ( pAnd ) );

			if ( bKeepOr )
			{
				pCommonOr->m_dChildren.RemoveValue ( pAndNot );
				SafeDelete ( pAndNot );
			}
		}
	}

	// move all related to new OR
	XQNode_t * pHubOr = new XQNode_t ( XQLimitSpec_t() );
	pHubOr->SetOp ( SPH_QUERY_OR, m_dRelatedNodes );

	// insert hub OR via hub AND to new parent ( AND NOT )
	XQNode_t * pHubAnd = new XQNode_t ( XQLimitSpec_t() );
	pHubAnd->SetOp ( SPH_QUERY_AND, pHubOr );
	// replace old AND at new parent ( AND NOT ) 0 already at OR children
	pHubAnd->m_pParent = pWeakestAndNot;
	pWeakestAndNot->m_dChildren[0] = pHubAnd;

	// in case common OR had only 2 children
	if ( !bKeepOr )
	{
		// replace old OR with AND_NOT at parent
		if ( !pGrandCommonOr )
		{
			// the OR was the tree root; the ANDNOT becomes the new root
			pWeakestAndNot->m_pParent = NULL;
			*m_ppRoot = pWeakestAndNot;
		} else
		{
			pWeakestAndNot->m_pParent = pGrandCommonOr;
			CSphVector<XQNode_t *> & dChildren = pGrandCommonOr->m_dChildren;
			ARRAY_FOREACH ( i, dChildren )
			{
				if ( dChildren[i]==pCommonOr )
				{
					dChildren[i] = pWeakestAndNot;
					break;
				}
			}
		}
		// remove new parent ( AND OR ) from OR children
		Verify ( pCommonOr->m_dChildren.RemoveValue ( pWeakestAndNot ) );
		// free OR and all children
		SafeDelete ( pCommonOr );
	}

	return true;
}
3018
3019
CheckCommonCompoundNot(const XQNode_t * pNode)3020 bool CSphTransformation::CheckCommonCompoundNot ( const XQNode_t * pNode )
3021 {
3022 if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3023 !pNode->m_pParent->m_pParent->m_pParent->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_AND ||
3024 pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_NOT || pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
3025 pNode->m_pParent->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
3026 {
3027 //
3028 // NOLINT // NOT:
3029 // NOLINT // __ OR (Grand3 = CommonOr) __
3030 // NOLINT // / | |
3031 // NOLINT // ... AND NOT (Grand2) ...
3032 // NOLINT // / |
3033 // NOLINT // relatedNode NOT (grandNot)
3034 // NOLINT // |
3035 // NOLINT // AND (parentAnd)
3036 // NOLINT // / |
3037 // NOLINT // pNode ...
3038 //
3039 return false;
3040 }
3041 return true;
3042 }
3043
3044
// Factor a node repeated inside several NOT-compounds under a common OR
// (shape checked by CheckCommonCompoundNot) out of those branches.
// Returns true when the tree was changed; the caller must then recollect
// transformable nodes before applying further transformations.
bool CSphTransformation::TransformCommonCompoundNot ()
{
	bool bRecollect = false;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
			if ( dSimilarNodes.GetLength()<2 )
				continue;

			if ( CollectRelatedNodes < GrandNode, Grand2Node > ( dSimilarNodes ) )
			{
				// Load cost of the first node from the group
				// of the common nodes. The cost of nodes from
				// TransformableNodes are the same.
				SetCosts ( dSimilarNodes[0], m_dRelatedNodes );
				int iCommon = dSimilarNodes[0]->m_iUser; // cost of the repeated node (identical for all dupes)
				int iRelated = 0;
				ARRAY_FOREACH ( i, m_dRelatedNodes )
					iRelated += m_dRelatedNodes[i]->m_iUser;

				// Check that optimization willl be useful.
				if ( iCommon>iRelated && MakeTransformCommonCompoundNot ( dSimilarNodes ) )
				{
					bRecollect = true;
					// Don't make transformation for other nodes from the same OR-node,
					// because qtree was changed and further transformations
					// might be invalid.
					break;
				}
			}
		}
	}

	return bRecollect;
}
3085
3086
// Apply the common-compound-NOT rewrite to a group of equal nodes: unlink all
// duplicates, keep the weakest one, and attach a new ANDNOT branch
// ( AND(OR(cloned related)) ANDNOT NOT(weakest) ) under the common OR.
bool CSphTransformation::MakeTransformCommonCompoundNot ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );
	assert ( iWeakestIndex!=-1 );
	XQNode_t * pWeakestSimilar = dSimilarNodes [ iWeakestIndex ];

	// Common OR node (that is Grand3Node::From)
	XQNode_t * pCommonOr = pWeakestSimilar->m_pParent->m_pParent->m_pParent->m_pParent;

	// Factor out and delete/unlink similar nodes ( except weakest )
	ARRAY_FOREACH ( i, dSimilarNodes )
	{
		XQNode_t * pParent = dSimilarNodes[i]->m_pParent;
		Verify ( pParent->m_dChildren.RemoveValue ( dSimilarNodes[i] ) );

		if ( i!=iWeakestIndex )
			SafeDelete ( dSimilarNodes[i] );
	}

	// Create yet another ANDNOT node
	// with related nodes and one common node
	XQNode_t * pNewNot = new XQNode_t ( XQLimitSpec_t() );
	pNewNot->SetOp ( SPH_QUERY_NOT, pWeakestSimilar );

	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR );
	pNewOr->m_dChildren.Resize ( m_dRelatedNodes.GetLength() );
	ARRAY_FOREACH ( i, m_dRelatedNodes )
	{
		// ANDNOT operation implies AND and NOT nodes.
		// The related nodes point to AND node that has one child node.
		assert ( m_dRelatedNodes[i]->m_dChildren.GetLength()==1 );
		// clone (not move): the original related child stays where it was
		pNewOr->m_dChildren[i] = m_dRelatedNodes[i]->m_dChildren[0]->Clone();
		pNewOr->m_dChildren[i]->m_pParent = pNewOr;
	}

	XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
	pNewAnd->SetOp ( SPH_QUERY_AND, pNewOr );
	XQNode_t * pNewAndNot = new XQNode_t ( XQLimitSpec_t() );
	pNewAndNot->SetOp ( SPH_QUERY_ANDNOT, pNewAnd, pNewNot );
	pCommonOr->m_dChildren.Add ( pNewAndNot );
	pNewAndNot->m_pParent = pCommonOr;
	return true;
}
3134
3135
CheckCommonSubTerm(const XQNode_t * pNode)3136 bool CSphTransformation::CheckCommonSubTerm ( const XQNode_t * pNode )
3137 {
3138 if ( !pNode || ( pNode->GetOp()==SPH_QUERY_PHRASE && pNode->m_dChildren.GetLength() )
3139 || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3140 pNode->m_pParent->GetOp()!=SPH_QUERY_OR || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_AND ||
3141 pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
3142 {
3143 //
3144 // NOLINT // NOT:
3145 // NOLINT // ________OR (gGOr)
3146 // NOLINT // / |
3147 // NOLINT // ...... AND (grandAnd)
3148 // NOLINT // / |
3149 // NOLINT // relatedNode OR (parentOr)
3150 // NOLINT // / |
3151 // NOLINT // pNode ...
3152 //
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158
// Factor a sub-term repeated in several OR-under-AND branches of a common OR
// out into a single branch (shape checked by CheckCommonSubTerm).
// Returns true when the tree was changed; caller must then recollect nodes.
bool CSphTransformation::TransformCommonSubTerm ()
{
	bool bRecollect = false;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dX = m_hSimilar.IterateGet().IterateGet();
			if ( dX.GetLength()<2 )
				continue;

			// skip common sub-terms from same tree
			// ( two candidates under the same parent OR can not be factored )
			bool bSame = false;
			for ( int i=0; i<dX.GetLength()-1 && !bSame; i++ )
			{
				for ( int j=i+1; j<dX.GetLength() && !bSame; j++ )
					bSame = ( dX[i]->m_pParent==dX[j]->m_pParent );
			}
			if ( bSame )
				continue;

			if ( CollectRelatedNodes < ParentNode, GrandNode > ( dX ) )
			{
				// Load cost of the first node from the group
				// of the common nodes. The cost of nodes from
				// TransformableNodes are the same.
				SetCosts ( dX[0], m_dRelatedNodes );
				int iCostCommonSubTermNode = dX[0]->m_iUser;
				int iCostRelatedNodes = 0;
				ARRAY_FOREACH ( i, m_dRelatedNodes )
					iCostRelatedNodes += m_dRelatedNodes[i]->m_iUser;

				// Check that optimization will be useful.
				if ( iCostCommonSubTermNode > iCostRelatedNodes )
				{
					MakeTransformCommonSubTerm ( dX );
					bRecollect = true;
					// Don't make transformation for other nodes from the same OR-node,
					// because query tree was changed and further transformations
					// might be invalid.
					break;
				}
			}
		}
	}

	return bRecollect;
}
3210
3211
3212 // remove nodes without children up the tree
SubtreeRemoveEmpty(XQNode_t * pNode)3213 static bool SubtreeRemoveEmpty ( XQNode_t * pNode )
3214 {
3215 if ( !pNode->IsEmpty() )
3216 return false;
3217
3218 // climb up
3219 XQNode_t * pParent = pNode->m_pParent;
3220 while ( pParent && pParent->m_dChildren.GetLength()<=1 && !pParent->m_dWords.GetLength() )
3221 {
3222 pNode = pParent;
3223 pParent = pParent->m_pParent;
3224 }
3225
3226 if ( pParent )
3227 pParent->m_dChildren.RemoveValue ( pNode );
3228
3229 // free subtree
3230 SafeDelete ( pNode );
3231 return true;
3232 }
3233
3234
// eliminate composite ( AND \ OR ) nodes with only one children
// replaces a single-child AND/OR chain with that child, climbing up while the
// ancestors are also single-child word-less AND/OR composites; when the chain
// reaches the tree root, *ppRoot is redirected to the surviving child
static void CompositeFixup ( XQNode_t * pNode, XQNode_t ** ppRoot )
{
	assert ( pNode && !pNode->m_dWords.GetLength() );
	if ( pNode->m_dChildren.GetLength()!=1 || !( pNode->GetOp()==SPH_QUERY_OR || pNode->GetOp()==SPH_QUERY_AND ) )
		return;

	// detach the only child; it survives the fixup
	XQNode_t * pChild = pNode->m_dChildren[0];
	pChild->m_pParent = NULL;
	pNode->m_dChildren.Resize ( 0 );

	// climb up
	XQNode_t * pParent = pNode->m_pParent;
	while ( pParent && pParent->m_dChildren.GetLength()==1 && !pParent->m_dWords.GetLength() &&
		( pParent->GetOp()==SPH_QUERY_OR || pParent->GetOp()==SPH_QUERY_AND ) )
	{
		pNode = pParent;
		pParent = pParent->m_pParent;
	}

	if ( pParent )
	{
		// splice the child into the first non-degenerate ancestor
		ARRAY_FOREACH ( i, pParent->m_dChildren )
		{
			if ( pParent->m_dChildren[i]!=pNode )
				continue;

			pParent->m_dChildren[i] = pChild;
			pChild->m_pParent = pParent;
			break;
		}
	} else
	{
		// the whole chain was degenerate; the child becomes the new root
		*ppRoot = pChild;
	}

	// free subtree
	SafeDelete ( pNode );
}
3274
3275
CleanupSubtree(XQNode_t * pNode,XQNode_t ** ppRoot)3276 static void CleanupSubtree ( XQNode_t * pNode, XQNode_t ** ppRoot )
3277 {
3278 if ( SubtreeRemoveEmpty ( pNode ) )
3279 return;
3280
3281 CompositeFixup ( pNode, ppRoot );
3282 }
3283
3284
// Apply the common-sub-term rewrite: factor the repeated term X out of
// several AND(..., OR(X, ...)) branches into a new AND ( OR(cloned related), X )
// under the common OR, then clean up the branches emptied by the removal.
void CSphTransformation::MakeTransformCommonSubTerm ( CSphVector<XQNode_t *> & dX )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dX );
	XQNode_t * pX = dX[iWeakestIndex];

	// common parents of X and AA \ BB need to be excluded
	CSphVector<XQNode_t *> dExcluded ( dX.GetLength() );

	// Factor out and delete/unlink similar nodes ( except weakest )
	ARRAY_FOREACH ( i, dX )
	{
		XQNode_t * pParent = dX[i]->m_pParent;
		Verify ( pParent->m_dChildren.RemoveValue ( dX[i] ) );
		if ( i!=iWeakestIndex )
			SafeDelete ( dX[i] );

		// temporarily unlink the parent OR so that it is not picked up
		// while cloning the related parents below
		dExcluded[i] = pParent;
		pParent->m_pParent->m_dChildren.RemoveValue ( pParent );
	}

	// de-duplicate the parents of the related nodes
	CSphVector<XQNode_t *> dRelatedParents;
	for ( int i=0; i<m_dRelatedNodes.GetLength(); i++ )
	{
		XQNode_t * pParent = m_dRelatedNodes[i]->m_pParent;
		if ( !dRelatedParents.Contains ( pParent ) )
			dRelatedParents.Add ( pParent );
	}

	// clone them; the clones go under the new OR, the originals stay in place
	ARRAY_FOREACH ( i, dRelatedParents )
		dRelatedParents[i] = dRelatedParents[i]->Clone();

	// push excluded children back
	ARRAY_FOREACH ( i, dExcluded )
	{
		XQNode_t * pChild = dExcluded[i];
		pChild->m_pParent->m_dChildren.Add ( pChild );
	}

	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR, dRelatedParents );

	// Create yet another AND node
	// with related nodes and one common dSimilar node
	XQNode_t * pCommonOr = pX->m_pParent->m_pParent->m_pParent;
	XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
	pNewAnd->SetOp ( SPH_QUERY_AND, pNewOr, pX );
	pCommonOr->m_dChildren.Add ( pNewAnd );
	pNewAnd->m_pParent = pCommonOr;

	// drop the branches that became empty, collapse single-child composites
	ARRAY_FOREACH ( i, dExcluded )
	{
		CleanupSubtree ( dExcluded[i], m_ppRoot );
	}
}
3342
3343
CheckCommonKeywords(const XQNode_t * pNode)3344 bool CSphTransformation::CheckCommonKeywords ( const XQNode_t * pNode )
3345 {
3346 if ( !pNode || !pNode->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR || !pNode->m_dWords.GetLength() )
3347 {
3348 //
3349 // NOLINT // NOT:
3350 // NOLINT // ______________________ OR (parentOr) _______
3351 // NOLINT // / | |
3352 // NOLINT // pNode (PHRASE|AND|PROXIMITY) ... ...
3353 //
3354 return false;
3355 }
3356 return true;
3357 }
3358
3359
// maps a (sub)phrase FNV64 hash to the list of phrase nodes that contain it
typedef CSphOrderedHash<CSphVector<XQNode_t *>, uint64_t, IdentityHash_fn, 128> BigramHash_t;
3361
3362
sphBigramAddNode(XQNode_t * pNode,int64_t uHash,BigramHash_t & hBirgam)3363 static int sphBigramAddNode ( XQNode_t * pNode, int64_t uHash, BigramHash_t & hBirgam )
3364 {
3365 CSphVector<XQNode_t *> * ppNodes = hBirgam ( uHash );
3366 if ( !ppNodes )
3367 {
3368 CSphVector<XQNode_t *> dNode ( 1 );
3369 dNode[0] = pNode;
3370 hBirgam.Add ( dNode, uHash );
3371 return 1;
3372 } else
3373 {
3374 (*ppNodes).Add ( pNode );
3375 return (*ppNodes).GetLength();
3376 }
3377 }
3378
// single delimiter byte hashed in between words, so that different word
// splits of the same character stream hash differently
static const BYTE g_sPhraseDelimiter[] = { 1 };
3380
sphHashPhrase(const XQNode_t * pNode)3381 static uint64_t sphHashPhrase ( const XQNode_t * pNode )
3382 {
3383 assert ( pNode );
3384 uint64_t uHash = SPH_FNV64_SEED;
3385 ARRAY_FOREACH ( i, pNode->m_dWords )
3386 {
3387 if ( i )
3388 uHash = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uHash );
3389 uHash = sphFNV64cont ( pNode->m_dWords[i].m_sWord.cstr(), uHash );
3390 }
3391
3392 return uHash;
3393 }
3394
3395
// Hash every proper contiguous sub-run of the node's words into hBirgam and
// recurse into children. The full phrase itself is deliberately excluded
// (its hash is produced separately by sphHashPhrase).
static void sphHashSubphrases ( XQNode_t * pNode, BigramHash_t & hBirgam )
{
	assert ( pNode );
	// skip whole phrase
	if ( pNode->m_dWords.GetLength()<=1 )
		return;

	const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
	int iLen = dWords.GetLength();
	for ( int i=0; i<iLen; i++ )
	{
		// single-word run starting at i
		uint64_t uSubPhrase = sphFNV64cont ( dWords[i].m_sWord.cstr(), SPH_FNV64_SEED );
		sphBigramAddNode ( pNode, uSubPhrase, hBirgam );

		// skip whole phrase
		// ( only for i==0 could the run [0..iLen) cover the full phrase,
		// hence stop one word short in that case )
		int iSubLen = ( i==0 ? iLen-1 : iLen );
		for ( int j=i+1; j<iSubLen; j++ )
		{
			// extend the run word by word, registering each extension
			uSubPhrase = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uSubPhrase );
			uSubPhrase = sphFNV64cont ( dWords[j].m_sWord.cstr(), uSubPhrase );
			sphBigramAddNode ( pNode, uSubPhrase, hBirgam );
		}
	}

	// loop all children
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		sphHashSubphrases ( pNode->m_dChildren[i], hBirgam );
}
3424
3425
// Decide whether pNode may stand in for every node in dSimilar, i.e. whether
// none of the pairings below falls into a "won't be optimized" case.
bool sphIsNodeStrongest ( const XQNode_t * pNode, const CSphVector<XQNode_t *> & dSimilar )
{
	//
	// The cases when query won't be optimized:
	// 1. Proximities with different distance: "A B C"~N | "A B C D"~M (N != M)
	// 2. Partial intersection in the middle: "A B C D" | "D B C E" (really they won't be found)
	// 3. Weaker phrase for proximity. Example: "A B C D"~N | "B C"
	//
	// The cases when query will be optimized:
	// 1. Found weaker term (phrase or proximity type) for sub-query with phrase type.
	// Examples:
	// "D A B C E" | "A B C" (weaker phrase) => "A B C"
	// "A B C D E" | "B C D"~N (weaker proximity) => "B C D"~N
	//
	// 2. Equal proximities with the different distance.
	// Example: "A B C"~N | "A B C"~M => "A B C"~min(M,N)
	//
	// 3. Found weaker term with proximity type with the same distance.
	// Example: "D A B C E"~N | "A B"~N => "A B"~N
	//

	assert ( pNode );
	XQOperator_e eNode = pNode->GetOp();
	int iWords = pNode->m_dWords.GetLength();

	ARRAY_FOREACH ( i, dSimilar )
	{
		XQOperator_e eSimilar = dSimilar[i]->GetOp();
		int iSimilarWords = dSimilar[i]->m_dWords.GetLength();

		// two proximities where pNode carries more words - reject
		if ( eNode==SPH_QUERY_PROXIMITY && eSimilar==SPH_QUERY_PROXIMITY && iWords>iSimilarWords )
			return false;
		// multi-word (or composite) phrase/AND against a proximity - reject
		if ( ( eNode==SPH_QUERY_PHRASE || eNode==SPH_QUERY_AND ) && ( eSimilar==SPH_QUERY_PROXIMITY && ( iWords>1 || pNode->m_dChildren.GetLength() ) ) )
			return false;

		// remaining allowed type pairings; for proximity vs proximity the
		// kept node's m_iOpArg (proximity distance, per "~N" above) must not be smaller
		bool bSimilar = ( ( eNode==SPH_QUERY_PHRASE && eSimilar==SPH_QUERY_PHRASE ) ||
			( ( eNode==SPH_QUERY_PHRASE || eNode==SPH_QUERY_AND ) && ( eSimilar==SPH_QUERY_PHRASE || eSimilar==SPH_QUERY_PROXIMITY ) ) ||
			( eNode==SPH_QUERY_PROXIMITY && ( eSimilar==SPH_QUERY_AND || eSimilar==SPH_QUERY_PHRASE ) ) ||
			( eNode==SPH_QUERY_PROXIMITY && eSimilar==SPH_QUERY_PROXIMITY && pNode->m_iOpArg>=dSimilar[i]->m_iOpArg ) );

		if ( !bSimilar )
			return false;
	}

	return true;
}
3472
3473
// Collapse OR-ed terms where one node's whole word sequence occurs as a
// sub-phrase of sibling nodes: when the containing siblings may be replaced
// by the contained node (see sphIsNodeStrongest), they get deleted.
// Returns true when any node was removed.
bool CSphTransformation::TransformCommonKeywords ()
{
	CSphVector <XQNode_t *> dPendingDel;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		BigramHash_t hBigrams;
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dPhrases = m_hSimilar.IterateGet().IterateGet();
			if ( dPhrases.GetLength()<2 )
				continue;

			// index every proper sub-phrase of every phrase in the group
			ARRAY_FOREACH ( i, dPhrases )
				sphHashSubphrases ( dPhrases[i], hBigrams );

			ARRAY_FOREACH ( i, dPhrases )
			{
				XQNode_t * pNode = dPhrases[i];
				uint64_t uPhraseHash = sphHashPhrase ( pNode );
				// nodes whose sub-phrases include this node's full word sequence
				CSphVector<XQNode_t *> * ppCommon = hBigrams ( uPhraseHash );
				if ( ppCommon && sphIsNodeStrongest ( pNode, *ppCommon ) )
				{
					ARRAY_FOREACH ( j, (*ppCommon) )
						dPendingDel.Add ( (*ppCommon)[j] );
				}
			}
		}
	}

	bool bTransformed = ( dPendingDel.GetLength()>0 );
	// sort by pointer so that duplicates become adjacent
	dPendingDel.Sort();
	// Delete stronger terms
	XQNode_t * pLast = NULL;
	ARRAY_FOREACH ( i, dPendingDel )
	{
		// skip dupes
		if ( pLast==dPendingDel[i] )
			continue;

		pLast = dPendingDel[i];
		Verify ( pLast->m_pParent->m_dChildren.RemoveValue ( pLast ) );
		// delete here (not SafeDelete) as later that pointer will be compared
		delete ( dPendingDel[i] );
	}

	return bTransformed;
}
3524
// NOTE: only phrases of two or more strictly adjacent words qualify for the
// common-phrase (shared head/tail) optimization below; see CheckCommonPhrase
3526
CheckCommonPhrase(const XQNode_t * pNode)3527 bool CSphTransformation::CheckCommonPhrase ( const XQNode_t * pNode )
3528 {
3529 if ( !pNode || !pNode->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR || pNode->GetOp()!=SPH_QUERY_PHRASE || pNode->m_dWords.GetLength()<2 )
3530 {
3531 //
3532 // NOLINT // NOT:
3533 // NOLINT // ______________________ OR (parentOr) ___
3534 // NOLINT // / | |
3535 // NOLINT // pNode (PHRASE) ... ...
3536 //
3537 return false;
3538 }
3539
3540 // single word phrase not allowed
3541 assert ( pNode->m_dWords.GetLength()>=2 );
3542
3543 // phrase there words not one after another not allowed
3544 for ( int i=1; i<pNode->m_dWords.GetLength(); i++ )
3545 {
3546 if ( pNode->m_dWords[i].m_iAtomPos-pNode->m_dWords[i-1].m_iAtomPos!=1 )
3547 return false;
3548 }
3549
3550 return true;
3551 }
3552
3553
// a group of phrases sharing a common word run at the head or the tail
struct CommonInfo_t
{
	CSphVector<XQNode_t *> * m_pPhrases;	// phrases of the group (not owned)
	int m_iCommonLen;						// length of the shared run, in words (>=2)
	bool m_bHead;							// true - run is at the phrase start; false - at the end
	bool m_bHasBetter;						// some other group covers a member phrase with a longer run
};
3561
3562
// links one phrase node to one of the common-run groups it belongs to
struct Node2Common_t
{
	XQNode_t * m_pNode;			// the phrase node
	CommonInfo_t * m_pCommon;	// the group that claims it
};
3568
3569
// sort order for duplicate elimination: entries for the same phrase node are
// grouped together, longest common run first, head variant first on a tie
struct CommonDupElemination_fn
{
	bool IsLess ( const Node2Common_t & a, const Node2Common_t & b ) const
	{
		// group by phrase node first
		if ( a.m_pNode!=b.m_pNode )
			return a.m_pNode<b.m_pNode;

		// longer common run sorts first
		if ( a.m_pCommon->m_iCommonLen!=b.m_pCommon->m_iCommonLen )
			return a.m_pCommon->m_iCommonLen>b.m_pCommon->m_iCommonLen;

		// equal length: head variant sorts first
		return a.m_pCommon->m_bHead;
	}
};
3583
3584
// orders nodes by ascending atom position
struct XQNodeAtomPos_fn
{
	bool IsLess ( const XQNode_t * a, const XQNode_t * b ) const
	{
		return a->m_iAtomPos<b->m_iAtomPos;
	}
};
3592
3593
// Find groups of OR-ed phrases sharing common words at the head or at the
// tail and factor the shared run out (via MakeTransformCommonPhrase).
// Returns true when any group got rewritten; caller must then recollect nodes.
bool CSphTransformation::TransformCommonPhrase ()
{
	bool bRecollect = false;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			if ( dNodes.GetLength()<2 )
				continue;

			bool bHasCommonPhrases = false;
			BigramHash_t tBigramHead;
			BigramHash_t tBigramTail;
			// 1st check only 2 words at head tail at phrases
			ARRAY_FOREACH ( iPhrase, dNodes )
			{
				const CSphVector<XQKeyword_t> & dWords = dNodes[iPhrase]->m_dWords;
				assert ( dWords.GetLength()>=2 );
				dNodes[iPhrase]->m_iAtomPos = dWords.Begin()->m_iAtomPos;

				// hash the first two words (head) and the last two words (tail)
				uint64_t uHead = sphFNV64cont ( dWords[0].m_sWord.cstr(), SPH_FNV64_SEED );
				uint64_t uTail = sphFNV64cont ( dWords [ dWords.GetLength() - 1 ].m_sWord.cstr(), SPH_FNV64_SEED );
				uHead = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uHead );
				uHead = sphFNV64cont ( dWords[1].m_sWord.cstr(), uHead );

				uTail = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uTail );
				uTail = sphFNV64cont ( dWords[dWords.GetLength()-2].m_sWord.cstr(), uTail );

				int iHeadLen = sphBigramAddNode ( dNodes[iPhrase], uHead, tBigramHead );
				int iTailLen = sphBigramAddNode ( dNodes[iPhrase], uTail, tBigramTail );
				// a bucket of length>1 means at least two phrases share that bigram
				bHasCommonPhrases |= ( iHeadLen>1 || iTailLen>1 );
			}

			if ( !bHasCommonPhrases )
				continue;

			// 2nd step find minimum for each phrases group
			CSphVector<CommonInfo_t> dCommon;
			tBigramHead.IterateStart();
			while ( tBigramHead.IterateNext() )
			{
				// only phrases that share same words at head
				if ( tBigramHead.IterateGet().GetLength()<2 )
					continue;

				CommonInfo_t & tElem = dCommon.Add();
				tElem.m_pPhrases = &tBigramHead.IterateGet();
				tElem.m_iCommonLen = 2;
				tElem.m_bHead = true;
				tElem.m_bHasBetter = false;
			}
			tBigramTail.IterateStart();
			while ( tBigramTail.IterateNext() )
			{
				// only phrases that share same words at tail
				if ( tBigramTail.IterateGet().GetLength()<2 )
					continue;

				CommonInfo_t & tElem = dCommon.Add();
				tElem.m_pPhrases = &tBigramTail.IterateGet();
				tElem.m_iCommonLen = 2;
				tElem.m_bHead = false;
				tElem.m_bHasBetter = false;
			}

			// for each set of phrases with common words at the head or tail
			// each word that is same at all phrases makes common length longer
			ARRAY_FOREACH ( i, dCommon )
			{
				CommonInfo_t & tCommon = dCommon[i];
				bool bHead = tCommon.m_bHead;
				const CSphVector<XQNode_t *> & dPhrases = *tCommon.m_pPhrases;
				// start from third word ( two words at each phrase already matched at bigram hashing )
				for ( int iCount=3; ; iCount++ )
				{
					// is shortest phrase words over
					if ( iCount>=dPhrases[0]->m_dWords.GetLength() )
						break;

					// reference word comes from the first phrase of the group
					int iWordRef = ( bHead ? iCount-1 : dPhrases[0]->m_dWords.GetLength() - iCount );
					uint64_t uHash = sphFNV64 ( dPhrases[0]->m_dWords[iWordRef].m_sWord.cstr() );

					bool bPhrasesMatch = false;
					bool bSomePhraseOver = false;
					for ( int iPhrase=1; iPhrase<dPhrases.GetLength(); iPhrase++ )
					{
						bSomePhraseOver = ( iCount>=dPhrases[iPhrase]->m_dWords.GetLength() );
						if ( bSomePhraseOver )
							break;

						int iWord = ( bHead ? iCount-1 : dPhrases[iPhrase]->m_dWords.GetLength() - iCount );
						bPhrasesMatch = ( uHash==sphFNV64 ( dPhrases[iPhrase]->m_dWords[iWord].m_sWord.cstr() ) );
						if ( !bPhrasesMatch )
							break;
					}

					// no need to check further in case shortest phrase has no more words or sequence over
					if ( bSomePhraseOver || !bPhrasesMatch )
						break;

					tCommon.m_iCommonLen = iCount;
				}
			}

			// mark all dupes (that has smaller common length) as deleted
			if ( dCommon.GetLength()>=2 )
			{
				CSphVector<Node2Common_t> dDups ( dCommon.GetLength()*2 );
				dDups.Resize ( 0 );
				ARRAY_FOREACH ( i, dCommon )
				{
					CommonInfo_t & tCommon = dCommon[i];
					CSphVector<XQNode_t *> & dPhrases = *tCommon.m_pPhrases;
					ARRAY_FOREACH ( j, dPhrases )
					{
						Node2Common_t & tDup = dDups.Add();
						tDup.m_pNode = dPhrases[j];
						tDup.m_pCommon = &tCommon;
					}
				}
				// after the sort, entries of the same phrase are adjacent;
				// every group but one per phrase gets marked as inferior
				dDups.Sort ( CommonDupElemination_fn() );
				for ( int i=0; i<dDups.GetLength()-1; i++ )
				{
					Node2Common_t & tCurr = dDups[i];
					Node2Common_t & tNext = dDups[i+1];
					if ( tCurr.m_pNode==tNext.m_pNode )
					{
						if ( tCurr.m_pCommon->m_iCommonLen<=tNext.m_pCommon->m_iCommonLen )
							tCurr.m_pCommon->m_bHasBetter = true;
						else
							tNext.m_pCommon->m_bHasBetter = true;
					}
				}
			}

			// rewrite the surviving groups
			ARRAY_FOREACH ( i, dCommon )
			{
				const CommonInfo_t & tElem = dCommon[i];
				if ( !tElem.m_bHasBetter )
				{
					tElem.m_pPhrases->Sort ( XQNodeAtomPos_fn() );
					MakeTransformCommonPhrase ( *tElem.m_pPhrases, tElem.m_iCommonLen, tElem.m_bHead );
					bRecollect = true;
				}
			}
		}
	}

	return bRecollect;
}
3748
3749
// Rewrite a group of phrases sharing iCommonLen words at the head (or tail)
// into PHRASE ( common, OR ( leftovers ) ) ( or the mirrored form for a common
// tail ), attached back under the parent OR. Phrases fully consumed by the
// common part are deleted.
void CSphTransformation::MakeTransformCommonPhrase ( CSphVector<XQNode_t *> & dCommonNodes, int iCommonLen, bool bHeadIsCommon )
{
	XQNode_t * pCommonPhrase = new XQNode_t ( XQLimitSpec_t() );
	pCommonPhrase->SetOp ( SPH_QUERY_PHRASE );

	XQNode_t * pGrandOr = dCommonNodes[0]->m_pParent;
	if ( bHeadIsCommon )
	{
		// fill up the common head words (shared prefix)
		XQNode_t * pPhrase = dCommonNodes[0];
		pCommonPhrase->m_iAtomPos = pPhrase->m_dWords[0].m_iAtomPos;
		for ( int i=0; i<iCommonLen; i++ )
			pCommonPhrase->m_dWords.Add ( pPhrase->m_dWords[i] );
	} else
	{
		XQNode_t * pPhrase = dCommonNodes[0];
		// set the farthest atom position
		int iAtomPos = pPhrase->m_dWords [ pPhrase->m_dWords.GetLength() - iCommonLen ].m_iAtomPos;
		for ( int i=1; i<dCommonNodes.GetLength(); i++ )
		{
			XQNode_t * pCur = dCommonNodes[i];
			int iCurAtomPos = pCur->m_dWords[pCur->m_dWords.GetLength() - iCommonLen].m_iAtomPos;
			if ( iAtomPos < iCurAtomPos )
			{
				pPhrase = pCur;
				iAtomPos = iCurAtomPos;
			}
		}
		pCommonPhrase->m_iAtomPos = iAtomPos;

		// copy the common tail words from the phrase with the farthest positions
		for ( int i=pPhrase->m_dWords.GetLength() - iCommonLen; i<pPhrase->m_dWords.GetLength(); i++ )
			pCommonPhrase->m_dWords.Add ( pPhrase->m_dWords[i] );
	}

	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR );

	ARRAY_FOREACH ( i, dCommonNodes )
	{
		XQNode_t * pPhrase = dCommonNodes[i];

		// remove phrase from parent and eliminate in case of common phrase duplication
		Verify ( pGrandOr->m_dChildren.RemoveValue ( pPhrase ) );
		if ( pPhrase->m_dWords.GetLength()==iCommonLen )
		{
			SafeDelete ( pPhrase );
			continue;
		}

		// move phrase to new OR
		pNewOr->m_dChildren.Add ( pPhrase );
		pPhrase->m_pParent = pNewOr;

		// shift down words and enumerate words atom positions
		if ( bHeadIsCommon )
		{
			int iEndCommonAtom = pCommonPhrase->m_dWords.Last().m_iAtomPos+1;
			for ( int j=iCommonLen; j<pPhrase->m_dWords.GetLength(); j++ )
			{
				int iTo = j-iCommonLen;
				pPhrase->m_dWords[iTo] = pPhrase->m_dWords[j];
				pPhrase->m_dWords[iTo].m_iAtomPos = iEndCommonAtom + iTo;
			}
		}
		pPhrase->m_dWords.Resize ( pPhrase->m_dWords.GetLength() - iCommonLen );
		if ( !bHeadIsCommon )
		{
			// renumber the leftover words to sit right before the common tail
			int iStartAtom = pCommonPhrase->m_dWords[0].m_iAtomPos - pPhrase->m_dWords.GetLength();
			ARRAY_FOREACH ( j, pPhrase->m_dWords )
				pPhrase->m_dWords[j].m_iAtomPos = iStartAtom + j;
		}

		// a single leftover word is a plain term, not a phrase
		if ( pPhrase->m_dWords.GetLength()==1 )
			pPhrase->SetOp ( SPH_QUERY_AND );
	}

	if ( pNewOr->m_dChildren.GetLength() )
	{
		// parent phrase need valid atom position of children
		pNewOr->m_iAtomPos = pNewOr->m_dChildren[0]->m_dWords[0].m_iAtomPos;

		XQNode_t * pNewPhrase = new XQNode_t ( XQLimitSpec_t() );
		if ( bHeadIsCommon )
			pNewPhrase->SetOp ( SPH_QUERY_PHRASE, pCommonPhrase, pNewOr );
		else
			pNewPhrase->SetOp ( SPH_QUERY_PHRASE, pNewOr, pCommonPhrase );

		pGrandOr->m_dChildren.Add ( pNewPhrase );
		pNewPhrase->m_pParent = pGrandOr;
	} else
	{
		// common phrases with same words elimination
		pGrandOr->m_dChildren.Add ( pCommonPhrase );
		pCommonPhrase->m_pParent = pGrandOr;
		SafeDelete ( pNewOr );
	}
}
3847
3848
CheckCommonAndNotFactor(const XQNode_t * pNode)3849 bool CSphTransformation::CheckCommonAndNotFactor ( const XQNode_t * pNode )
3850 {
3851 if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3852 pNode->m_pParent->GetOp()!=SPH_QUERY_AND || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
3853 pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR ||
3854 // FIXME!!! check performance with OR node at 2nd grand instead of regular not NOT
3855 pNode->m_pParent->m_pParent->m_dChildren.GetLength()<2 || pNode->m_pParent->m_pParent->m_dChildren[1]->GetOp()!=SPH_QUERY_NOT )
3856 {
3857 //
3858 // NOLINT // NOT:
3859 // NOLINT // _______ OR (gGOr) ________________
3860 // NOLINT // / | |
3861 // NOLINT // ... AND NOT (grandAndNot) ...
3862 // NOLINT // / |
3863 // NOLINT // AND NOT
3864 // NOLINT // | |
3865 // NOLINT // pNode relatedNode
3866 //
3867 return false;
3868 }
3869 return true;
3870 }
3871
3872
TransformCommonAndNotFactor()3873 bool CSphTransformation::TransformCommonAndNotFactor ()
3874 {
3875 bool bRecollect = false;
3876 m_hSimilar.IterateStart();
3877 while ( m_hSimilar.IterateNext () )
3878 {
3879 m_hSimilar.IterateGet().IterateStart();
3880 while ( m_hSimilar.IterateGet().IterateNext() )
3881 {
3882 // Nodes with the same iFuzzyHash
3883 CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
3884 if ( dSimilarNodes.GetLength()<2 )
3885 continue;
3886
3887 if ( MakeTransformCommonAndNotFactor ( dSimilarNodes ) )
3888 bRecollect = true;
3889 }
3890 }
3891 return bRecollect;
3892 }
3893
3894
// Merge OR-ed branches "X AND NOT A", "X AND NOT B", ...: keep the weakest X
// and gather all the NOT operands under its single NOT as one AND, deleting
// the other branches.
bool CSphTransformation::MakeTransformCommonAndNotFactor ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );

	XQNode_t * pFirstAndNot = dSimilarNodes [iWeakestIndex]->m_pParent->m_pParent;
	XQNode_t * pCommonOr = pFirstAndNot->m_pParent;

	assert ( pFirstAndNot->m_dChildren.GetLength()==2 );
	XQNode_t * pFirstNot = pFirstAndNot->m_dChildren[1];
	assert ( pFirstNot->m_dChildren.GetLength()==1 );

	// wrap the kept NOT operand into a new AND that will accumulate
	// the NOT operands of all the other branches
	XQNode_t * pAndNew = new XQNode_t ( XQLimitSpec_t() );
	pAndNew->SetOp ( SPH_QUERY_AND );
	pAndNew->m_dChildren.Reserve ( dSimilarNodes.GetLength() );
	pAndNew->m_dChildren.Add ( pFirstNot->m_dChildren[0] );
	pAndNew->m_dChildren.Last()->m_pParent = pAndNew;
	pFirstNot->m_dChildren[0] = pAndNew;
	pAndNew->m_pParent = pFirstNot;

	for ( int i=0; i<dSimilarNodes.GetLength(); ++i )
	{
		assert ( CheckCommonAndNotFactor ( dSimilarNodes[i] ) );
		if ( i==iWeakestIndex )
			continue;

		XQNode_t * pAndNot = dSimilarNodes[i]->m_pParent->m_pParent;
		assert ( pAndNot->m_dChildren.GetLength()==2 );
		XQNode_t * pNot = pAndNot->m_dChildren[1];
		assert ( pNot->m_dChildren.GetLength()==1 );
		assert ( &pAndNew->m_dChildren!=&pNot->m_dChildren );
		// move (not clone) the NOT operand, then null the original slot so
		// that deleting pAndNot below does not free the moved child
		pAndNew->m_dChildren.Add ( pNot->m_dChildren[0] );
		pAndNew->m_dChildren.Last()->m_pParent = pAndNew;
		pNot->m_dChildren[0] = NULL;

		Verify ( pCommonOr->m_dChildren.RemoveValue ( pAndNot ) );
		dSimilarNodes[i] = NULL;
		SafeDelete ( pAndNot );
	}

	return true;
}
3939
3940
CheckCommonOrNot(const XQNode_t * pNode)3941 bool CSphTransformation::CheckCommonOrNot ( const XQNode_t * pNode )
3942 {
3943 if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3944 !pNode->m_pParent->m_pParent->m_pParent->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR ||
3945 pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_NOT ||
3946 pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
3947 pNode->m_pParent->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
3948 {
3949 //
3950 // NOLINT // NOT:
3951 // NOLINT // __ OR (Grand3 = CommonOr) __
3952 // NOLINT // / | |
3953 // NOLINT // ... AND NOT (Grand2) ...
3954 // NOLINT // / |
3955 // NOLINT // relatedNode NOT (grandNot)
3956 // NOLINT // |
3957 // NOLINT // OR (parentOr)
3958 // NOLINT // / |
3959 // NOLINT // pNode ...
3960 //
3961 return false;
3962 }
3963 return true;
3964 }
3965
3966
TransformCommonOrNot()3967 bool CSphTransformation::TransformCommonOrNot ()
3968 {
3969 bool bRecollect = false;
3970 m_hSimilar.IterateStart();
3971 while ( m_hSimilar.IterateNext () )
3972 {
3973 m_hSimilar.IterateGet().IterateStart();
3974 while ( m_hSimilar.IterateGet().IterateNext() )
3975 {
3976 // Nodes with the same iFuzzyHash
3977 CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
3978 if ( dSimilarNodes.GetLength()<2 )
3979 continue;
3980
3981 if ( CollectRelatedNodes < GrandNode, Grand2Node > ( dSimilarNodes ) && MakeTransformCommonOrNot ( dSimilarNodes ) )
3982 {
3983 bRecollect = true;
3984 // Don't make transformation for other nodes from the same OR-node,
3985 // because query tree was changed and further transformations
3986 // might be invalid.
3987 break;
3988 }
3989 }
3990 }
3991
3992 return bRecollect;
3993 }
3994
3995
MakeTransformCommonOrNot(CSphVector<XQNode_t * > & dSimilarNodes)3996 bool CSphTransformation::MakeTransformCommonOrNot ( CSphVector<XQNode_t *> & dSimilarNodes )
3997 {
3998 // Pick weakest node from the equal
3999 // PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
4000 // so they are considered as equal nodes.
4001 int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );
4002 assert ( iWeakestIndex!=-1 );
4003 XQNode_t * pWeakestSimilar = dSimilarNodes [ iWeakestIndex ];
4004
4005 // Common OR node (that is Grand3Node::From)
4006 XQNode_t * pCommonOr = pWeakestSimilar->m_pParent->m_pParent->m_pParent->m_pParent;
4007
4008 // Delete/unlink similar nodes ( except weakest )
4009 ARRAY_FOREACH ( i, dSimilarNodes )
4010 {
4011 XQNode_t * pParent = dSimilarNodes[i]->m_pParent;
4012 Verify ( pParent->m_dChildren.RemoveValue ( dSimilarNodes[i] ) );
4013
4014 if ( i!=iWeakestIndex )
4015 SafeDelete ( dSimilarNodes[i] );
4016 }
4017
4018 XQNode_t * pNewAndNot = new XQNode_t ( XQLimitSpec_t() );
4019 XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
4020 XQNode_t * pNewNot = new XQNode_t ( XQLimitSpec_t() );
4021 if ( !pCommonOr->m_pParent )
4022 {
4023 *m_ppRoot = pNewAndNot;
4024 } else
4025 {
4026 pNewAndNot->m_pParent = pCommonOr->m_pParent;
4027 assert ( pCommonOr->m_pParent->m_dChildren.Contains ( pCommonOr ) );
4028 ARRAY_FOREACH ( i, pCommonOr->m_pParent->m_dChildren )
4029 {
4030 if ( pCommonOr->m_pParent->m_dChildren[i]==pCommonOr )
4031 pCommonOr->m_pParent->m_pParent->m_dChildren[i] = pNewAndNot;
4032 }
4033 }
4034 pNewAnd->SetOp ( SPH_QUERY_AND, pCommonOr );
4035 pNewNot->SetOp ( SPH_QUERY_NOT, pWeakestSimilar );
4036 pNewAndNot->SetOp ( SPH_QUERY_ANDNOT, pNewAnd, pNewNot );
4037
4038 return true;
4039 }
4040
4041
CheckHungOperand(const XQNode_t * pNode)4042 bool CSphTransformation::CheckHungOperand ( const XQNode_t * pNode )
4043 {
4044 if ( !pNode || !pNode->m_pParent ||
4045 ( pNode->m_pParent->GetOp()!=SPH_QUERY_OR && pNode->m_pParent->GetOp()!=SPH_QUERY_AND ) ||
4046 ( pNode->m_pParent->m_pParent && pNode->m_pParent->GetOp()==SPH_QUERY_AND &&
4047 pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_ANDNOT ) ||
4048 pNode->m_pParent->m_dChildren.GetLength()>1 || pNode->m_dWords.GetLength() )
4049 {
4050 //
4051 // NOLINT // NOT:
4052 // NOLINT // OR|AND (parent)
4053 // NOLINT // |
4054 // NOLINT // pNode\?
4055 //
4056 return false;
4057 }
4058 return true;
4059 }
4060
4061
TransformHungOperand()4062 bool CSphTransformation::TransformHungOperand ()
4063 {
4064 if ( !m_hSimilar.GetLength() || !m_hSimilar.Exists ( CONST_GROUP_FACTOR ) || !m_hSimilar[CONST_GROUP_FACTOR].Exists ( CONST_GROUP_FACTOR ) )
4065 return false;
4066
4067 CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar[CONST_GROUP_FACTOR][CONST_GROUP_FACTOR];
4068 ARRAY_FOREACH ( i, dSimilarNodes )
4069 {
4070 XQNode_t * pHungNode = dSimilarNodes[i];
4071 XQNode_t * pParent = pHungNode->m_pParent;
4072 XQNode_t * pGrand = pParent->m_pParent;
4073
4074 if ( !pGrand )
4075 {
4076 *m_ppRoot = pHungNode;
4077 pHungNode->m_pParent = NULL;
4078 } else
4079 {
4080 assert ( pGrand->m_dChildren.Contains ( pParent ) );
4081 ARRAY_FOREACH ( j, pGrand->m_dChildren )
4082 {
4083 if ( pGrand->m_dChildren[j]!=pParent )
4084 continue;
4085
4086 pGrand->m_dChildren[j] = pHungNode;
4087 pHungNode->m_pParent = pGrand;
4088 break;
4089 }
4090 }
4091
4092 pParent->m_dChildren[0] = NULL;
4093 SafeDelete ( pParent );
4094 }
4095
4096 return true;
4097 }
4098
4099
CheckExcessBrackets(const XQNode_t * pNode)4100 bool CSphTransformation::CheckExcessBrackets ( const XQNode_t * pNode )
4101 {
4102 if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent ||
4103 !( ( pNode->m_pParent->GetOp()==SPH_QUERY_AND && !pNode->m_pParent->m_dWords.GetLength() &&
4104 pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_AND ) ||
4105 ( pNode->m_pParent->GetOp()==SPH_QUERY_OR && pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_OR ) ) )
4106 {
4107 //
4108 // NOLINT // NOT:
4109 // NOLINT // OR|AND (grand)
4110 // NOLINT // / |
4111 // NOLINT // OR|AND (parent) ...
4112 // NOLINT // |
4113 // NOLINT // pNode
4114 //
4115 return false;
4116 }
4117 return true;
4118 }
4119
// Splices pNode's parent out of the tree: all of parent's children are moved
// up into the grandparent's child list at the parent's former position, then
// the now-empty parent node is deleted.
// Returns the deleted parent's pointer, which is ALREADY FREED on return; the
// caller uses it only as an opaque hash key and must never dereference it.
static XQNode_t * sphMoveSiblingsUp ( XQNode_t * pNode )
{
	XQNode_t * pParent = pNode->m_pParent;
	assert ( pParent );
	XQNode_t * pGrand = pParent->m_pParent;
	assert ( pGrand );

	assert ( pGrand->m_dChildren.Contains ( pParent ) );
	int iParent = GetNodeChildIndex ( pGrand, pParent );

	int iParentChildren = pParent->m_dChildren.GetLength();
	int iGrandChildren = pGrand->m_dChildren.GetLength();
	// parent is replaced by its own children, hence the -1
	int iTotalChildren = iParentChildren+iGrandChildren-1;

	// parent.children + grand.parent.children - parent itself
	CSphVector<XQNode_t *> dChildren ( iTotalChildren );

	// grand head prior parent
	for ( int i=0; i<iParent; i++ )
		dChildren[i] = pGrand->m_dChildren[i];

	// grand tail after parent, shifted right to leave room for parent's children
	for ( int i=0; i<iGrandChildren-iParent-1; i++ )
		dChildren[i+iParent+iParentChildren] = pGrand->m_dChildren[i+iParent+1];

	// all parent children, inserted at the parent's former slot and reparented
	for ( int i=0; i<iParentChildren; i++ )
	{
		XQNode_t * pChild = pParent->m_dChildren[i];
		pChild->m_pParent = pGrand;
		dChildren[i+iParent] = pChild;
	}

	pGrand->m_dChildren.SwapData ( dChildren );
	// all children at grand now
	// detach children first so the parent's destructor does not delete them
	pParent->m_dChildren.Resize(0);
	delete ( pParent );
	return pParent; // NOTE: dangling - valid only as a hash key
}
4159
4160
// Hash functor for CSphOrderedHash keyed by node pointers: the node's own
// address is used verbatim as the hash value (identity hash).
struct XQNodeHash_fn
{
	static inline uint64_t Hash ( XQNode_t * pNode )	{ return (uint64_t)pNode; }
};
4165
4166
TransformExcessBrackets()4167 bool CSphTransformation::TransformExcessBrackets ()
4168 {
4169 bool bRecollect = false;
4170 CSphOrderedHash<int, XQNode_t *, XQNodeHash_fn, 64> hDeleted;
4171 m_hSimilar.IterateStart();
4172 while ( m_hSimilar.IterateNext () )
4173 {
4174 m_hSimilar.IterateGet().IterateStart();
4175 while ( m_hSimilar.IterateGet().IterateNext() )
4176 {
4177 // Nodes with the same iFuzzyHash
4178 CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
4179 ARRAY_FOREACH ( i, dNodes )
4180 {
4181 XQNode_t * pNode = dNodes[i];
4182 // node environment might be changed due prior nodes transformations
4183 if ( !hDeleted.Exists ( pNode ) && CheckExcessBrackets ( pNode ) )
4184 {
4185 XQNode_t * pDel = sphMoveSiblingsUp ( pNode );
4186 hDeleted.Add ( 1, pDel );
4187 bRecollect = true;
4188 }
4189 }
4190 }
4191 }
4192
4193 return bRecollect;
4194 }
4195
4196
CheckExcessAndNot(const XQNode_t * pNode)4197 bool CSphTransformation::CheckExcessAndNot ( const XQNode_t * pNode )
4198 {
4199 if ( !pNode || !ParentNode::From ( pNode ) || !GrandNode::From ( pNode ) || !Grand2Node::From ( pNode ) || pNode->GetOp()!=SPH_QUERY_AND ||
4200 ( pNode->m_dChildren.GetLength()==1 && pNode->m_dChildren[0]->GetOp()==SPH_QUERY_ANDNOT ) ||
4201 ParentNode::From ( pNode )->GetOp()!=SPH_QUERY_ANDNOT || GrandNode::From ( pNode )->GetOp()!=SPH_QUERY_AND ||
4202 Grand2Node::From ( pNode )->GetOp()!=SPH_QUERY_ANDNOT ||
4203 // FIXME!!! check performance with OR node at 2nd grand instead of regular not NOT
4204 Grand2Node::From ( pNode )->m_dChildren.GetLength()<2 || Grand2Node::From ( pNode )->m_dChildren[1]->GetOp()!=SPH_QUERY_NOT )
4205 {
4206 //
4207 // NOLINT // NOT:
4208 // NOLINT // AND NOT
4209 // NOLINT // / |
4210 // NOLINT // AND NOT
4211 // NOLINT // |
4212 // NOLINT // AND NOT
4213 // NOLINT // / |
4214 // NOLINT // AND(pNode) NOT
4215 // NOLINT // | |
4216 // NOLINT // .. ...
4217 //
4218 return false;
4219 }
4220 return true;
4221 }
4222
4223
// Merge nested AND-NOT chains: the inner NOT's operands are OR-ed into the
// outer NOT, and the intermediate AND-NOT wrapper is removed.
// Returns true when at least one node was rewritten (caller must recollect).
bool CSphTransformation::TransformExcessAndNot ()
{
	bool bRecollect = false;
	CSphOrderedHash<int, XQNode_t *, XQNodeHash_fn, 64> hDeleted;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			ARRAY_FOREACH ( i, dNodes )
			{
				XQNode_t * pAnd = dNodes[i];
				XQNode_t * pParentAndNot = pAnd->m_pParent;

				// node environment might be changed due prior nodes transformations
				if ( hDeleted.Exists ( pParentAndNot ) || !CheckExcessAndNot ( pAnd ) )
					continue;

				// CheckExcessAndNot guarantees this exact ancestor shape
				assert ( pParentAndNot->m_dChildren.GetLength()==2 );
				XQNode_t * pNot = pParentAndNot->m_dChildren[1];
				XQNode_t * pGrandAnd = pParentAndNot->m_pParent;
				XQNode_t * pGrand2AndNot = pGrandAnd->m_pParent;
				assert ( pGrand2AndNot->m_dChildren.GetLength()==2 );
				XQNode_t * pGrand2Not = pGrand2AndNot->m_dChildren[1];

				assert ( pGrand2Not->m_dChildren.GetLength()==1 );
				XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );

				// combine the inner NOT's operands with the outer NOT's operand
				// under a new OR, and hang that OR under the outer NOT
				pNewOr->SetOp ( SPH_QUERY_OR, pNot->m_dChildren );
				pNewOr->m_dChildren.Add ( pGrand2Not->m_dChildren[0] );
				pNewOr->m_dChildren.Last()->m_pParent = pNewOr;
				pGrand2Not->m_dChildren[0] = pNewOr;
				pNewOr->m_pParent = pGrand2Not;

				// splice pAnd into the slot formerly held by its AND-NOT wrapper
				assert ( pGrandAnd->m_dChildren.Contains ( pParentAndNot ) );
				int iChild = GetNodeChildIndex ( pGrandAnd, pParentAndNot );
				pGrandAnd->m_dChildren[iChild] = pAnd;
				pAnd->m_pParent = pGrandAnd;

				// Delete excess nodes
				// detach reused children first so the dtor does not free them
				hDeleted.Add ( 1, pParentAndNot );
				pNot->m_dChildren.Resize ( 0 );
				pParentAndNot->m_dChildren[0] = NULL;
				SafeDelete ( pParentAndNot );
				bRecollect = true;
			}
		}
	}

	return bRecollect;
}
4278
4279
// Debug helper: print the collected m_hSimilar map (node hash -> grand hash
// -> similar parent nodes). Compiles to a no-op unless XQDEBUG is defined.
void CSphTransformation::Dump ()
{
#ifdef XQDEBUG
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext() )
	{
		printf ( "\nnode: hash 0x" UINT64_FMT "\n", m_hSimilar.IterateGetKey() );
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			printf ( "\tgrand: hash 0x" UINT64_FMT ", children %d\n", m_hSimilar.IterateGet().IterateGetKey(), dNodes.GetLength() );

			printf ( "\tparents:\n" );
			ARRAY_FOREACH ( i, dNodes )
			{
				uint64_t uParentHash = dNodes[i]->GetHash();
				printf ( "\t\thash 0x" UINT64_FMT "\n", uParentHash );
			}
		}
	}
#endif
}
4303
4304
4305 #ifdef XQDEBUG
Dump(const XQNode_t * pNode,const char * sHeader)4306 void CSphTransformation::Dump ( const XQNode_t * pNode, const char * sHeader )
4307 {
4308 printf ( sHeader );
4309 if ( pNode )
4310 {
4311 printf ( "%s\n", sphReconstructNode ( pNode, NULL ).cstr(), NULL );
4312 #ifdef XQ_DUMP_TRANSFORMED_TREE
4313 xqDump ( pNode, 0 );
4314 #endif
4315 }
4316 }
4317 #else
Dump(const XQNode_t *,const char *)4318 void CSphTransformation::Dump ( const XQNode_t * , const char * )
4319 {}
4320 #endif
4321
4322
// Run all boolean tree rewrites over *m_ppRoot.
// The one-shot keyword/phrase passes run first; the remaining rewrites loop
// until a fixed point, since each successful pass may enable another.
void CSphTransformation::Transform ()
{
	if ( CollectInfo <ParentNode, NullNode> ( *m_ppRoot, &CheckCommonKeywords ) )
	{
		bool bDump = TransformCommonKeywords ();
		if ( bDump )
			Dump ( *m_ppRoot, "\nAfter transformation of 'COMMON KEYWORDS'\n" );
	}

	if ( CollectInfo <ParentNode, NullNode> ( *m_ppRoot, &CheckCommonPhrase ) )
	{
		bool bDump = TransformCommonPhrase ();
		if ( bDump )
			Dump ( *m_ppRoot, "\nAfter transformation of 'COMMON PHRASES'\n" );
	}

	// each CollectInfo pass rebuilds m_hSimilar for the matching pattern,
	// then the corresponding Transform* consumes it
	bool bRecollect = false;
	do
	{
		bRecollect = false;

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonNot ) )
		{
			bool bDump = TransformCommonNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'COMMON NOT'\n" );
		}

		if ( CollectInfo <Grand3Node, CurrentNode> ( *m_ppRoot, &CheckCommonCompoundNot ) )
		{
			bool bDump = TransformCommonCompoundNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'COMMON COMPOUND NOT'\n" );
		}

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonSubTerm ) )
		{
			bool bDump = TransformCommonSubTerm ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'COMMON SUBTERM'\n" );
		}

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonAndNotFactor ) )
		{
			bool bDump = TransformCommonAndNotFactor ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'COMMON ANDNOT FACTOR'\n" );
		}

		if ( CollectInfo <Grand3Node, CurrentNode> ( *m_ppRoot, &CheckCommonOrNot ) )
		{
			bool bDump = TransformCommonOrNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'COMMON OR NOT'\n" );
		}

		// cleanup passes: remove degenerate wrappers left by the rewrites above
		if ( CollectInfo <NullNode, NullNode> ( *m_ppRoot, &CheckHungOperand ) )
		{
			bool bDump = TransformHungOperand ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'HUNG OPERAND'\n" );
		}

		if ( CollectInfo <NullNode, NullNode> ( *m_ppRoot, &CheckExcessBrackets ) )
		{
			bool bDump = TransformExcessBrackets ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'EXCESS BRACKETS'\n" );
		}

		if ( CollectInfo <ParentNode, CurrentNode> ( *m_ppRoot, &CheckExcessAndNot ) )
		{
			bool bDump = TransformExcessAndNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter transformation of 'EXCESS AND NOT'\n" );
		}
	} while ( bRecollect );

	// final consistency check of the rewritten tree
	// NOTE(review): assumes *m_ppRoot is non-NULL here - confirm for empty queries
	( *m_ppRoot )->Check ( true );
}
4403
4404
// Public entry point: run the boolean query tree optimizer over *ppRoot.
// In XQDEBUG builds, also reports runs that took longer than 10 microseconds.
void sphOptimizeBoolean ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords )
{
#ifdef XQDEBUG
	int64_t tmDelta = sphMicroTimer();
#endif

	CSphTransformation qInfo ( ppRoot, pKeywords );
	qInfo.Transform ();

#ifdef XQDEBUG
	tmDelta = sphMicroTimer() - tmDelta;
	if ( tmDelta>10 )
		printf ( "optimized boolean in %d.%03d msec", (int)(tmDelta/1000), (int)(tmDelta%1000) );
#endif
}
4420
4421
sphSetupQueryTokenizer(ISphTokenizer * pTokenizer)4422 void sphSetupQueryTokenizer ( ISphTokenizer * pTokenizer )
4423 {
4424 pTokenizer->AddSpecials ( "()|-!@~\"/^$<" );
4425 pTokenizer->AddPlainChar ( '?' );
4426 pTokenizer->AddPlainChar ( '%' );
4427 }
4428
4429
4430 //
4431 // $Id$
4432 //
4433