1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2001-2016, Andrew Aksyonoff
7 // Copyright (c) 2008-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #include "sphinx.h"
17 #include "sphinxquery.h"
18 #include "sphinxutils.h"
19 #include "sphinxplugin.h"
20 #include <stdarg.h>
21 
22 //////////////////////////////////////////////////////////////////////////
23 // EXTENDED PARSER RELOADED
24 //////////////////////////////////////////////////////////////////////////
25 class XQParser_t;
26 #ifdef CMAKE_GENERATED_GRAMMAR
27 	#include "bissphinxquery.h"
28 #else
29 	#include "yysphinxquery.h"
30 #endif
31 
32 // #define XQDEBUG 1
33 // #define XQ_DUMP_TRANSFORMED_TREE 1
34 // #define XQ_DUMP_NODE_ADDR 1
35 
36 //////////////////////////////////////////////////////////////////////////
37 
/// tracks a keyword node produced by a multiform (wordforms) head token,
/// together with the slice of destination forms that belong to it
/// (m_iDestCount is grown in GetToken() as destination tokens arrive;
/// the slice indexes into XQParser_t::m_dDestForms)
struct MultiformNode_t
{
	XQNode_t *	m_pNode;		// keyword node the forms get attached to
	int			m_iDestStart;	// first destination form index
	int			m_iDestCount;	// number of destination forms
};
44 
/// extended query parser
/// lexes the query string with the supplied tokenizer/dict/schema and
/// builds an XQNode_t tree inside an XQQuery_t
class XQParser_t
{
public:
					XQParser_t ();
					~XQParser_t ();

public:
	/// parse sQuery into tQuery; main entry point
	bool			Parse ( XQQuery_t & tQuery, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer, const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings );

	/// set "query error: ..." on the parsed query; always returns false,
	/// so call sites can do 'return Error(...)'
	bool			Error ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );
	/// set "query warning: ..." on the parsed query
	void			Warning ( const char * sTemplate, ... ) __attribute__ ( ( format ( printf, 2, 3 ) ) );

	/// lookup a field by name and set its bit in dFields
	bool			AddField ( FieldMask_t & dFields, const char * szField, int iLen );
	/// parse the tail of a '@' fields operator ('@field', '@!(f1,f2)', '@*', ...)
	bool			ParseFields ( FieldMask_t & uFields, int & iMaxFieldPos, bool & bIgnore );
	/// parse the tail of a ZONE:/ZONESPAN: operator; returns index into m_dZoneVecs, or -1 on error
	int				ParseZone ( const char * pZone );

	/// query-syntax special characters
	bool			IsSpecial ( char c );
	/// try to lex an int/float constant at p (proximity and quorum arguments)
	bool			GetNumber ( const char * p );
	/// lexer entry point, called by the generated grammar through yylex()
	int				GetToken ( YYSTYPE * lvalp );

	/// apply pending '^'/'$' modifiers to a keyword (see GetToken)
	void			HandleModifiers ( XQKeyword_t & tKeyword );

	void			AddQuery ( XQNode_t * pNode );
	XQNode_t *		AddKeyword ( const char * sKeyword, int iSkippedPosBeforeToken=0 );
	XQNode_t *		AddKeyword ( XQNode_t * pLeft, XQNode_t * pRight );
	XQNode_t *		AddOp ( XQOperator_e eOp, XQNode_t * pLeft, XQNode_t * pRight, int iOpArg=0 );
	void			SetPhrase ( XQNode_t * pNode, bool bSetExact );

	void			Cleanup ();
	XQNode_t *		SweepNulls ( XQNode_t * pNode );
	bool			FixupNots ( XQNode_t * pNode );
	void			FixupNulls ( XQNode_t * pNode );
	void			DeleteNodesWOFields ( XQNode_t * pNode );
	void			PhraseShiftQpos ( XQNode_t * pNode );
	void			FixupDestForms ();

	/// set field limits on the current (topmost) spec
	inline void SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
	{
		FixRefSpec();
		m_dStateSpec.Last()->SetFieldSpec ( uMask, iMaxPos );
	}
	/// set zone limits (iZoneVec indexes m_dZoneVecs) on the current spec
	inline void SetZoneVec ( int iZoneVec, bool bZoneSpan = false )
	{
		FixRefSpec();
		m_dStateSpec.Last()->SetZoneSpec ( m_dZoneVecs[iZoneVec], bZoneSpan );
	}

	/// copy-on-write for the spec stack: when the two topmost entries alias
	/// the same spec (a '(' pushed the enclosing scope's spec by pointer),
	/// replace the top with a private pool-owned copy, so modifications do
	/// not leak into the enclosing scope
	inline void FixRefSpec ()
	{
		bool bRef = ( m_dStateSpec.GetLength()>1 && ( m_dStateSpec[m_dStateSpec.GetLength()-1]==m_dStateSpec[m_dStateSpec.GetLength()-2] ) );
		if ( !bRef )
			return;

		XQLimitSpec_t * pSpec = m_dStateSpec.Pop();
		m_dSpecPool.Add ( new XQLimitSpec_t ( *pSpec ) );
		m_dStateSpec.Add ( m_dSpecPool.Last() );
	}

public:
	const CSphVector<int> & GetZoneVec ( int iZoneVec ) const
	{
		return m_dZoneVecs[iZoneVec];
	}

public:
	static const int MAX_TOKEN_BYTES = 3*SPH_MAX_WORD_LEN + 16;

	XQQuery_t *				m_pParsed;			// query currently being built

	BYTE *					m_sQuery;			// raw query buffer fed to the tokenizer
	int						m_iQueryLen;
	const char *			m_pLastTokenStart;	// used for "near '...'" error reporting
	const char *			m_pLastTokenEnd;

	const CSphSchema *		m_pSchema;
	ISphTokenizer *			m_pTokenizer;
	CSphDict *				m_pDict;

	CSphVector<XQNode_t*>	m_dSpawned;			// all nodes created so far; freed by Cleanup() on bail-out
	XQNode_t *				m_pRoot;

	bool					m_bStopOnInvalid;	// treat unknown field names as hard errors (see AddField)
	int						m_iAtomPos;			// current keyword position

	int						m_iPendingNulls;	// overshort positions to emit before the pending token
	int						m_iPendingType;		// already-lexed token type not yet returned (0 if none)
	YYSTYPE					m_tPendingToken;
	bool					m_bWasBlended;
	bool					m_bWasKeyword;

	bool					m_bEmpty;
	bool					m_bQuoted;			// inside double quotes (phrase) right now?
	bool					m_bEmptyStopword;
	int						m_iOvershortStep;

	int						m_iQuorumQuote;		// atom pos where the last phrase quote closed (quorum detection)
	int						m_iQuorumFSlash;	// atom pos of the last '/' (quorum detection)
	bool					m_bCheckNumber;		// set after '~' or '/', next token may be a bare number

	const PluginQueryTokenFilter_c * m_pPlugin;
	void *					m_pPluginData;

	CSphVector<CSphString>	m_dIntTokens;		// string forms of numeric tokens that double as keywords

	CSphVector < CSphVector<int> >	m_dZoneVecs;	// one zone-id list per ZONE:/ZONESPAN: operator
	CSphVector<XQLimitSpec_t *>		m_dStateSpec;	// spec stack, one entry per open paren scope
	CSphVector<XQLimitSpec_t *>		m_dSpecPool;	// owned spec instances, deleted in dtor
	CSphVector<int>					m_dPhraseStar;	// atom positions of standalone '*' inside a phrase

	CSphVector<CSphString>			m_dDestForms;	// flat pool of multiform destination tokens
	CSphVector<MultiformNode_t>		m_dMultiforms;	// per-head slices into m_dDestForms
};
157 
158 //////////////////////////////////////////////////////////////////////////
159 
/// bison-required lexer hook; forwards straight to the parser's own lexer
int yylex ( YYSTYPE * lvalp, XQParser_t * pParser )
{
	return pParser->GetToken ( lvalp );
}
164 
yyerror(XQParser_t * pParser,const char * sMessage)165 void yyerror ( XQParser_t * pParser, const char * sMessage )
166 {
167 	if ( pParser->m_pParsed->m_sParseError.IsEmpty() )
168 		pParser->m_pParsed->m_sParseError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastTokenStart );
169 }
170 
171 #ifdef CMAKE_GENERATED_GRAMMAR
172 	#include "bissphinxquery.c"
173 #else
174 	#include "yysphinxquery.c"
175 #endif
176 
177 //////////////////////////////////////////////////////////////////////////
178 
SetFieldSpec(const FieldMask_t & uMask,int iMaxPos)179 void XQLimitSpec_t::SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
180 {
181 	m_bFieldSpec = true;
182 	m_dFieldMask = uMask;
183 	m_iFieldMaxPos = iMaxPos;
184 }
185 
186 /// ctor
XQNode_t(const XQLimitSpec_t & dSpec)187 XQNode_t::XQNode_t ( const XQLimitSpec_t & dSpec )
188 : m_pParent ( NULL )
189 , m_eOp ( SPH_QUERY_AND )
190 , m_iOrder ( 0 )
191 , m_iCounter ( 0 )
192 , m_iMagicHash ( 0 )
193 , m_iFuzzyHash ( 0 )
194 , m_dSpec ( dSpec )
195 , m_iOpArg ( 0 )
196 , m_iAtomPos ( -1 )
197 , m_iUser ( 0 )
198 , m_bVirtuallyPlain ( false )
199 , m_bNotWeighted ( false )
200 , m_bPercentOp ( false )
201 {
202 #ifdef XQ_DUMP_NODE_ADDR
203 	printf ( "node new 0x%08x\n", this );
204 #endif
205 }
206 
207 /// dtor
~XQNode_t()208 XQNode_t::~XQNode_t ()
209 {
210 #ifdef XQ_DUMP_NODE_ADDR
211 	printf ( "node deleted %d 0x%08x\n", m_eOp, this );
212 #endif
213 	ARRAY_FOREACH ( i, m_dChildren )
214 		SafeDelete ( m_dChildren[i] );
215 }
216 
217 
SetFieldSpec(const FieldMask_t & uMask,int iMaxPos)218 void XQNode_t::SetFieldSpec ( const FieldMask_t& uMask, int iMaxPos )
219 {
220 	// set it, if we do not yet have one
221 	if ( !m_dSpec.m_bFieldSpec )
222 		m_dSpec.SetFieldSpec ( uMask, iMaxPos );
223 
224 	// some of the children might not yet have a spec, even if the node itself has
225 	// eg. 'hello @title world' (whole node has '@title' spec but 'hello' node does not have any!)
226 	ARRAY_FOREACH ( i, m_dChildren )
227 		m_dChildren[i]->SetFieldSpec ( uMask, iMaxPos );
228 }
229 
SetZoneSpec(const CSphVector<int> & dZones,bool bZoneSpan)230 void XQLimitSpec_t::SetZoneSpec ( const CSphVector<int> & dZones, bool bZoneSpan )
231 {
232 	m_dZones = dZones;
233 	m_bZoneSpan = bZoneSpan;
234 }
235 
236 
SetZoneSpec(const CSphVector<int> & dZones,bool bZoneSpan)237 void XQNode_t::SetZoneSpec ( const CSphVector<int> & dZones, bool bZoneSpan )
238 {
239 	// set it, if we do not yet have one
240 	if ( !m_dSpec.m_dZones.GetLength() )
241 		m_dSpec.SetZoneSpec ( dZones, bZoneSpan );
242 
243 	// some of the children might not yet have a spec, even if the node itself has
244 	ARRAY_FOREACH ( i, m_dChildren )
245 		m_dChildren[i]->SetZoneSpec ( dZones, bZoneSpan );
246 }
247 
CopySpecs(const XQNode_t * pSpecs)248 void XQNode_t::CopySpecs ( const XQNode_t * pSpecs )
249 {
250 	if ( !pSpecs )
251 		return;
252 
253 	if ( !m_dSpec.m_bFieldSpec )
254 		m_dSpec.SetFieldSpec ( pSpecs->m_dSpec.m_dFieldMask, pSpecs->m_dSpec.m_iFieldMaxPos );
255 
256 	if ( !m_dSpec.m_dZones.GetLength() )
257 		m_dSpec.SetZoneSpec ( pSpecs->m_dSpec.m_dZones, pSpecs->m_dSpec.m_bZoneSpan );
258 }
259 
260 
ClearFieldMask()261 void XQNode_t::ClearFieldMask ()
262 {
263 	m_dSpec.m_dFieldMask.SetAll();
264 
265 	ARRAY_FOREACH ( i, m_dChildren )
266 		m_dChildren[i]->ClearFieldMask();
267 }
268 
269 
/// compute (and memoize) a structural hash over operator, keywords, and children
/// NOTE(review): writes m_iMagicHash from a const method, so the member is
/// presumably declared mutable — confirm against the class declaration;
/// a cached non-zero value is returned without recomputation
uint64_t XQNode_t::GetHash () const
{
	if ( m_iMagicHash )
		return m_iMagicHash;

	// pack the operator into a zero-terminated array so sphFNV64 can hash it
	XQOperator_e dZeroOp[2];
	dZeroOp[0] = m_eOp;
	dZeroOp[1] = (XQOperator_e) 0;

	ARRAY_FOREACH ( i, m_dWords )
		m_iMagicHash = 100 + ( m_iMagicHash ^ sphFNV64 ( m_dWords[i].m_sWord.cstr() ) ); // +100 to make it non-transitive
	ARRAY_FOREACH ( j, m_dChildren )
		m_iMagicHash = 100 + ( m_iMagicHash ^ m_dChildren[j]->GetHash() ); // +100 to make it non-transitive
	m_iMagicHash += 1000000; // to immerse difference between parents and children
	m_iMagicHash ^= sphFNV64 ( dZeroOp );

	return m_iMagicHash;
}
288 
289 
/// like GetHash(), but "fuzzy": PHRASE is hashed as PROXIMITY, so the two
/// operator variants produce the same value; result memoized in m_iFuzzyHash
/// NOTE(review): also mutates a member from a const method — presumably mutable
uint64_t XQNode_t::GetFuzzyHash () const
{
	if ( m_iFuzzyHash )
		return m_iFuzzyHash;

	// pack the (phrase-collapsed) operator into a zero-terminated array for sphFNV64
	XQOperator_e dZeroOp[2];
	dZeroOp[0] = ( m_eOp==SPH_QUERY_PHRASE ? SPH_QUERY_PROXIMITY : m_eOp );
	dZeroOp[1] = (XQOperator_e) 0;

	ARRAY_FOREACH ( i, m_dWords )
		m_iFuzzyHash = 100 + ( m_iFuzzyHash ^ sphFNV64 ( m_dWords[i].m_sWord.cstr() ) ); // +100 to make it non-transitive
	ARRAY_FOREACH ( j, m_dChildren )
		m_iFuzzyHash = 100 + ( m_iFuzzyHash ^ m_dChildren[j]->GetFuzzyHash () ); // +100 to make it non-transitive
	m_iFuzzyHash += 1000000; // to immerse difference between parents and children
	m_iFuzzyHash ^= sphFNV64 ( dZeroOp );

	return m_iFuzzyHash;
}
308 
309 
SetOp(XQOperator_e eOp,XQNode_t * pArg1,XQNode_t * pArg2)310 void XQNode_t::SetOp ( XQOperator_e eOp, XQNode_t * pArg1, XQNode_t * pArg2 )
311 {
312 	m_eOp = eOp;
313 	m_dChildren.Reset();
314 	if ( pArg1 )
315 	{
316 		m_dChildren.Add ( pArg1 );
317 		pArg1->m_pParent = this;
318 	}
319 	if ( pArg2 )
320 	{
321 		m_dChildren.Add ( pArg2 );
322 		pArg2->m_pParent = this;
323 	}
324 }
325 
326 
/// squeeze atom positions inside a proximity node: whenever a keyword
/// skipped more than one position before it (m_iSkippedBefore>1), shift
/// that keyword and every following one back by the excess, so positions
/// become contiguous
/// @return the next free atom position (one past the last keyword)
int XQNode_t::FixupAtomPos()
{
	assert ( m_eOp==SPH_QUERY_PROXIMITY && m_dWords.GetLength()>0 );

	ARRAY_FOREACH ( i, m_dWords )
	{
		// excess gap before word i (a "skip" of 1 means adjacent, no shift)
		int iSub = m_dWords[i].m_iSkippedBefore-1;
		if ( iSub>0 )
		{
			// pull word i and everything after it back by the excess
			for ( int j = i; j < m_dWords.GetLength(); j++ )
				m_dWords[j].m_iAtomPos -= iSub;
		}
	}

	return m_dWords.Last().m_iAtomPos+1;
}
343 
344 
Clone()345 XQNode_t * XQNode_t::Clone ()
346 {
347 	XQNode_t * pRet = new XQNode_t ( m_dSpec );
348 	pRet->SetOp ( m_eOp );
349 	pRet->m_dWords = m_dWords;
350 	pRet->m_iOpArg = m_iOpArg;
351 	pRet->m_iAtomPos = m_iAtomPos;
352 	pRet->m_bVirtuallyPlain = m_bVirtuallyPlain;
353 	pRet->m_bNotWeighted = m_bNotWeighted;
354 	pRet->m_bPercentOp = m_bPercentOp;
355 
356 	if ( m_dChildren.GetLength()==0 )
357 		return pRet;
358 
359 	pRet->m_dChildren.Reserve ( m_dChildren.GetLength() );
360 	for ( int i = 0; i < m_dChildren.GetLength(); ++i )
361 	{
362 		pRet->m_dChildren.Add ( m_dChildren[i]->Clone() );
363 		pRet->m_dChildren.Last()->m_pParent = pRet;
364 	}
365 
366 	return pRet;
367 }
368 
369 
ResetHash()370 bool XQNode_t::ResetHash ()
371 {
372 	bool bAlreadyReset = ( m_iMagicHash==0 && m_iFuzzyHash==0 );
373 	m_iMagicHash = 0;
374 	m_iFuzzyHash = 0;
375 	return bAlreadyReset;
376 }
377 
378 
379 /// return either index of pNode among Parent.m_dChildren, or -1
GetNodeChildIndex(const XQNode_t * pParent,const XQNode_t * pNode)380 static int GetNodeChildIndex ( const XQNode_t * pParent, const XQNode_t * pNode )
381 {
382 	assert ( pParent && pNode );
383 	ARRAY_FOREACH ( i, pParent->m_dChildren )
384 		if ( pParent->m_dChildren[i]==pNode )
385 			return i;
386 
387 	return -1;
388 }
389 
390 //////////////////////////////////////////////////////////////////////////
391 
XQParser_t()392 XQParser_t::XQParser_t ()
393 	: m_pParsed ( NULL )
394 	, m_pLastTokenStart ( NULL )
395 	, m_pLastTokenEnd ( NULL )
396 	, m_pRoot ( NULL )
397 	, m_bStopOnInvalid ( true )
398 	, m_bWasBlended ( false )
399 	, m_bWasKeyword ( false )
400 	, m_bQuoted ( false )
401 	, m_bEmptyStopword ( false )
402 	, m_iQuorumQuote ( -1 )
403 	, m_iQuorumFSlash ( -1 )
404 	, m_bCheckNumber ( false )
405 	, m_pPlugin ( NULL )
406 	, m_pPluginData ( NULL )
407 {
408 	m_dSpecPool.Add ( new XQLimitSpec_t() );
409 	m_dStateSpec.Add ( m_dSpecPool.Last() );
410 }
411 
~XQParser_t()412 XQParser_t::~XQParser_t ()
413 {
414 	ARRAY_FOREACH ( i, m_dSpecPool )
415 		SafeDelete ( m_dSpecPool[i] );
416 }
417 
418 
419 /// cleanup spawned nodes (for bailing out on errors)
/// cleanup spawned nodes (for bailing out on errors)
void XQParser_t::Cleanup ()
{
	m_dSpawned.Uniq(); // FIXME! should eliminate this by testing

	ARRAY_FOREACH ( i, m_dSpawned )
	{
		// detach children before deleting: spawned nodes are presumably
		// tracked individually in m_dSpawned, so letting the parent's dtor
		// recurse into them would double-free — TODO confirm
		m_dSpawned[i]->m_dChildren.Reset ();
		SafeDelete ( m_dSpawned[i] );
	}
	m_dSpawned.Reset ();
	// drop the spec stack too; the specs themselves stay owned by m_dSpecPool
	m_dStateSpec.Reset();
}
432 
433 
434 
Error(const char * sTemplate,...)435 bool XQParser_t::Error ( const char * sTemplate, ... )
436 {
437 	assert ( m_pParsed );
438 	char sBuf[256];
439 
440 	const char * sPrefix = "query error: ";
441 	int iPrefix = strlen(sPrefix);
442 	memcpy ( sBuf, sPrefix, iPrefix );
443 
444 	va_list ap;
445 	va_start ( ap, sTemplate );
446 	vsnprintf ( sBuf+iPrefix, sizeof(sBuf)-iPrefix, sTemplate, ap );
447 	va_end ( ap );
448 
449 	m_pParsed->m_sParseError = sBuf;
450 	return false;
451 }
452 
453 
Warning(const char * sTemplate,...)454 void XQParser_t::Warning ( const char * sTemplate, ... )
455 {
456 	assert ( m_pParsed );
457 	char sBuf[256];
458 
459 	const char * sPrefix = "query warning: ";
460 	int iPrefix = strlen(sPrefix);
461 	memcpy ( sBuf, sPrefix, iPrefix );
462 
463 	va_list ap;
464 	va_start ( ap, sTemplate );
465 	vsnprintf ( sBuf+iPrefix, sizeof(sBuf)-iPrefix, sTemplate, ap );
466 	va_end ( ap );
467 
468 	m_pParsed->m_sParseWarning = sBuf;
469 }
470 
471 
472 /// my special chars
IsSpecial(char c)473 bool XQParser_t::IsSpecial ( char c )
474 {
475 	return c=='(' || c==')' || c=='|' || c=='-' || c=='!' || c=='@' || c=='~' || c=='"' || c=='/';
476 }
477 
478 
479 /// lookup field and add it into mask
AddField(FieldMask_t & dFields,const char * szField,int iLen)480 bool XQParser_t::AddField ( FieldMask_t & dFields, const char * szField, int iLen )
481 {
482 	CSphString sField;
483 	sField.SetBinary ( szField, iLen );
484 
485 	int iField = m_pSchema->GetFieldIndex ( sField.cstr () );
486 	if ( iField<0 )
487 	{
488 		if ( m_bStopOnInvalid )
489 			return Error ( "no field '%s' found in schema", sField.cstr () );
490 		else
491 			Warning ( "no field '%s' found in schema", sField.cstr () );
492 	} else
493 	{
494 		if ( iField>=SPH_MAX_FIELDS )
495 			return Error ( " max %d fields allowed", SPH_MAX_FIELDS );
496 
497 		dFields.Set ( iField );
498 	}
499 
500 	return true;
501 }
502 
503 
504 /// parse fields block
/// parse fields block
/// consumes the tail of a '@' operator: '@field', '@!field', '@(f1,f2)',
/// '@!(f1,f2)', '@*', and an optional '[N]' max-position suffix; advances
/// the tokenizer buffer pointer past everything it consumed
/// @param dFields      (out) resulting field mask
/// @param iMaxFieldPos (out) max field position from '[N]', 0 if absent
/// @param bIgnore      (out) true when the '@' should be dropped and the text re-parsed
/// @return false on error (message already set via Error()), true otherwise
bool XQParser_t::ParseFields ( FieldMask_t & dFields, int & iMaxFieldPos, bool & bIgnore )
{
	dFields.UnsetAll();
	iMaxFieldPos = 0;
	bIgnore = false;

	const char * pPtr = m_pTokenizer->GetBufferPtr ();
	const char * pLastPtr = m_pTokenizer->GetBufferEnd ();

	if ( pPtr==pLastPtr )
		return true; // silently ignore trailing field operator

	bool bNegate = false;
	bool bBlock = false;

	// handle special modifiers
	if ( *pPtr=='!' )
	{
		// handle @! and @!(
		bNegate = true; pPtr++;
		if ( *pPtr=='(' ) { bBlock = true; pPtr++; }

	} else if ( *pPtr=='*' )
	{
		// handle @*
		dFields.SetAll();
		m_pTokenizer->SetBufferPtr ( pPtr+1 );
		return true;

	} else if ( *pPtr=='(' )
	{
		// handle @(
		bBlock = true; pPtr++;
	}

	// handle invalid chars
	if ( !sphIsAlpha(*pPtr) )
	{
		bIgnore = true;
		m_pTokenizer->SetBufferPtr ( pPtr ); // ignore and re-parse (FIXME! maybe warn?)
		return true;
	}
	assert ( sphIsAlpha(*pPtr) ); // i think i'm paranoid

	// handle field specification
	if ( !bBlock )
	{
		// handle standalone field specification
		const char * pFieldStart = pPtr;
		while ( sphIsAlpha(*pPtr) && pPtr<pLastPtr )
			pPtr++;

		assert ( pPtr-pFieldStart>0 );
		if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
			return false;

		m_pTokenizer->SetBufferPtr ( pPtr );
		if ( bNegate )
			dFields.Negate();

	} else
	{
		// handle fields block specification
		assert ( sphIsAlpha(*pPtr) && bBlock ); // and complicated

		bool bOK = false;
		const char * pFieldStart = NULL;
		while ( pPtr<pLastPtr )
		{
			// accumulate field name, while we can
			if ( sphIsAlpha(*pPtr) )
			{
				if ( !pFieldStart )
					pFieldStart = pPtr;
				pPtr++;
				continue;
			}

			// separator found
			if ( pFieldStart==NULL )
			{
				// separator with no field name before it, eg. '@(foo,,bar)'
				CSphString sContext;
				sContext.SetBinary ( pPtr, (int)( pLastPtr-pPtr ) );
				return Error ( "invalid field block operator syntax near '%s'", sContext.cstr() ? sContext.cstr() : "" );

			} else if ( *pPtr==',' )
			{
				if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
					return false;

				pFieldStart = NULL;
				pPtr++;

			} else if ( *pPtr==')' )
			{
				if ( !AddField ( dFields, pFieldStart, pPtr-pFieldStart ) )
					return false;

				m_pTokenizer->SetBufferPtr ( ++pPtr );
				if ( bNegate )
					dFields.Negate();

				bOK = true;
				break;

			} else
			{
				return Error ( "invalid character '%c' in field block operator", *pPtr );
			}
		}
		if ( !bOK )
			return Error ( "missing closing ')' in field block operator" );
	}

	// handle optional position range modifier
	if ( pPtr[0]=='[' && isdigit ( pPtr[1] ) )
	{
		// skip '[' and digits
		const char * p = pPtr+1;
		while ( *p && isdigit(*p) ) p++;

		// check that the range ends with ']' (FIXME! maybe report an error if it does not?)
		if ( *p!=']' )
			return true;

		// fetch my value
		iMaxFieldPos = strtoul ( pPtr+1, NULL, 10 );
		m_pTokenizer->SetBufferPtr ( p+1 );
	}

	// well done
	return true;
}
638 
639 
640 /// helper find-or-add (make it generic and move to sphinxstd?)
GetZoneIndex(XQQuery_t * pQuery,const CSphString & sZone)641 static int GetZoneIndex ( XQQuery_t * pQuery, const CSphString & sZone )
642 {
643 	ARRAY_FOREACH ( i, pQuery->m_dZones )
644 		if ( pQuery->m_dZones[i]==sZone )
645 			return i;
646 
647 	pQuery->m_dZones.Add ( sZone );
648 	return pQuery->m_dZones.GetLength()-1;
649 }
650 
651 
652 /// parse zone
/// parse zone
/// consumes the tail of a ZONE:/ZONESPAN: operator, either a single name
/// ('ZONE:title') or a strict block ('ZONE:(h1,h2)'); registers each zone
/// name in the query and stores the id list as a new entry in m_dZoneVecs
/// @return index into m_dZoneVecs, or -1 on syntax error (Error() already set)
int XQParser_t::ParseZone ( const char * pZone )
{
	const char * p = pZone;

	// case one, just a single zone name
	if ( sphIsAlpha ( *pZone ) )
	{
		// find zone name
		while ( sphIsAlpha(*p) )
			p++;
		m_pTokenizer->SetBufferPtr ( p );

		// extract and lowercase it
		CSphString sZone;
		sZone.SetBinary ( pZone, p-pZone );
		sZone.ToLower();

		// register it in zones list
		int iZone = GetZoneIndex ( m_pParsed, sZone );

		// create new 1-zone vector
		m_dZoneVecs.Add().Add ( iZone );
		return m_dZoneVecs.GetLength()-1;
	}

	// case two, zone block
	// it must follow strict (name1,name2,...) syntax
	if ( *pZone=='(' )
	{
		// create new zone vector
		CSphVector<int> & dZones = m_dZoneVecs.Add();
		p = ++pZone;

		// scan names
		for ( ;; )
		{
			// syntax error, name expected!
			// (also reached when a separator other than ',' or ')' shows up,
			// since the loop below leaves p on that character)
			if ( !sphIsAlpha(*p) )
			{
				Error ( "unexpected character '%c' in zone block operator", *p );
				return -1;
			}

			// scan next name
			while ( sphIsAlpha(*p) )
				p++;

			// extract and lowercase it
			CSphString sZone;
			sZone.SetBinary ( pZone, p-pZone );
			sZone.ToLower();

			// register it in zones list
			dZones.Add ( GetZoneIndex ( m_pParsed, sZone ) );

			// must be either followed by comma, or closing paren
			// everything else will cause syntax error
			if ( *p==')' )
			{
				m_pTokenizer->SetBufferPtr ( p+1 );
				break;
			}

			if ( *p==',' )
				pZone = ++p;
		}

		return m_dZoneVecs.GetLength()-1;
	}

	// unhandled case
	Error ( "internal error, unhandled case in ParseZone()" );
	return -1;
}
727 
728 
/// try to lex an int or float constant starting at p
/// floats are only accepted as a quorum argument (the "quote just closed,
/// then '/'" state tracked by m_iQuorumQuote/m_iQuorumFSlash); on success,
/// fills m_tPendingToken / m_iPendingType (TOK_INT or TOK_FLOAT), advances
/// the tokenizer, and returns true; on failure leaves the buffer position
/// for normal tokenizing and returns false
bool XQParser_t::GetNumber ( const char * p )
{
	int iDots = 0;
	const char * sToken = p;
	const char * sEnd = m_pTokenizer->GetBufferEnd ();
	while ( p<sEnd && ( isdigit ( *(BYTE*)p ) || *p=='.' ) )
	{
		iDots += ( *p=='.' );
		p++;
	}

	// must be float number but got many dots or only dot
	if ( iDots && ( iDots>1 || p-sToken==iDots ) )
		p = sToken;

	// float as number allowed only as quorum argument and regular keywords stream otherwise
	if ( iDots==1 && ( m_iQuorumQuote!=m_iQuorumFSlash || m_iQuorumQuote!=m_iAtomPos ) )
		p = sToken;

	static const int NUMBER_BUF_LEN = 10; // max strlen of int32
	if ( p>sToken && p-sToken<NUMBER_BUF_LEN
		&& !( *p=='-' && !( p-sToken==1 && sphIsModifier ( p[-1] ) ) ) // !bDashInside copied over from arbitration
		&& ( *p=='\0' || sphIsSpace(*p) || IsSpecial(*p) ) )
	{
		// float as quorum argument has higher precedence than blended
		bool bQuorum = ( m_iQuorumQuote==m_iQuorumFSlash && m_iQuorumFSlash==m_iAtomPos );
		bool bQuorumPercent = ( bQuorum && iDots==1 );

		bool bTok = ( m_pTokenizer->GetToken()!=NULL );
		if ( bTok && m_pTokenizer->TokenIsBlended() && !( bQuorum || bQuorumPercent ) ) // number with blended should be tokenized as usual
		{
			m_pTokenizer->SkipBlended();
			m_pTokenizer->SetBufferPtr ( m_pLastTokenStart );
		} else if ( bTok && m_pTokenizer->WasTokenSynonym() && !( bQuorum || bQuorumPercent ) )
		{
			// synonyms also take precedence over the bare-number reading
			m_pTokenizer->SetBufferPtr ( m_pLastTokenStart );
		} else
		{
			// got not a very long number followed by a whitespace or special, handle it
			char sNumberBuf[NUMBER_BUF_LEN];

			int iNumberLen = Min ( (int)sizeof(sNumberBuf)-1, int(p-sToken) );
			memcpy ( sNumberBuf, sToken, iNumberLen );
			sNumberBuf[iNumberLen] = '\0';
			if ( iDots )
				m_tPendingToken.tInt.fValue = (float)strtod ( sNumberBuf, NULL );
			else
				m_tPendingToken.tInt.iValue = atoi ( sNumberBuf );

			// check if it can be used as a keyword too
			// (temporarily retarget the tokenizer at the small buffer, then restore)
			m_pTokenizer->SetBuffer ( (BYTE*)sNumberBuf, iNumberLen );
			sToken = (const char*) m_pTokenizer->GetToken();
			m_pTokenizer->SetBuffer ( m_sQuery, m_iQueryLen );
			m_pTokenizer->SetBufferPtr ( p );

			m_tPendingToken.tInt.iStrIndex = -1;
			if ( sToken )
			{
				// keep the string form only if the dict recognizes it as a word
				m_dIntTokens.Add ( sToken );
				m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
				if ( m_pDict->GetWordID ( (BYTE*)sToken ) )
					m_tPendingToken.tInt.iStrIndex = m_dIntTokens.GetLength()-1;
				else
					m_dIntTokens.Pop();
				m_iAtomPos++;
			}

			m_iPendingNulls = 0;
			m_iPendingType = iDots ? TOK_FLOAT : TOK_INT;
			return true;
		}
	}
	return false;
}
803 
804 
805 /// a lexer of my own
GetToken(YYSTYPE * lvalp)806 int XQParser_t::GetToken ( YYSTYPE * lvalp )
807 {
808 	bool bWasFrontModifier = false; // used to print warning
809 
810 	// what, noone's pending for a bending?!
811 	if ( !m_iPendingType )
812 		for ( ;; )
813 	{
814 		assert ( m_iPendingNulls==0 );
815 
816 		bool bWasKeyword = m_bWasKeyword;
817 		m_bWasKeyword = false;
818 
819 		int iSkippedPosBeforeToken = 0;
820 		if ( m_bWasBlended )
821 		{
822 			iSkippedPosBeforeToken = m_pTokenizer->SkipBlended();
823 			m_iAtomPos += iSkippedPosBeforeToken;
824 		}
825 
826 		// tricky stuff
827 		// we need to manually check for numbers in certain states (currently, just after proximity or quorum operator)
828 		// required because if 0-9 are not in charset_table, or min_word_len is too high,
829 		// the tokenizer will *not* return the number as a token!
830 		m_pLastTokenStart = m_pTokenizer->GetBufferPtr ();
831 		m_pLastTokenEnd = m_pTokenizer->GetTokenEnd();
832 		const char * sEnd = m_pTokenizer->GetBufferEnd ();
833 
834 		const char * p = m_pLastTokenStart;
835 		while ( p<sEnd && isspace ( *(BYTE*)p ) ) p++; // to avoid CRT assertions on Windows
836 
837 		if ( m_bCheckNumber )
838 		{
839 			m_bCheckNumber = false;
840 			if ( GetNumber(p) )
841 				break;
842 		}
843 
844 		// not a number, long number, or number not followed by a whitespace, so fallback to regular tokenizing
845 		const char * sToken = (const char *) m_pTokenizer->GetToken ();
846 		if ( !sToken )
847 		{
848 			m_iPendingNulls = m_pTokenizer->GetOvershortCount() * m_iOvershortStep;
849 			if ( !( m_iPendingNulls || m_pTokenizer->GetBufferPtr()-p>0 ) )
850 				return 0;
851 			m_iPendingNulls = 0;
852 			lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
853 			m_bWasKeyword = true;
854 			return TOK_KEYWORD;
855 		}
856 
857 		// now let's do some token post-processing
858 		m_bWasBlended = m_pTokenizer->TokenIsBlended();
859 		m_bEmpty = false;
860 
861 		m_iPendingNulls = m_pTokenizer->GetOvershortCount() * m_iOvershortStep;
862 		m_iAtomPos += 1+m_iPendingNulls;
863 
864 		// handle NEAR (must be case-sensitive, and immediately followed by slash and int)
865 		if ( sToken && p && !m_pTokenizer->m_bPhrase && strncmp ( p, "NEAR/", 5 )==0 && isdigit(p[5]) )
866 		{
867 			// extract that int
868 			int iVal = 0;
869 			for ( p=p+5; isdigit(*p); p++ )
870 				iVal = iVal*10 + (*p) - '0'; // FIXME! check for overflow?
871 			m_pTokenizer->SetBufferPtr ( p );
872 
873 			// we just lexed our next token
874 			m_iPendingType = TOK_NEAR;
875 			m_tPendingToken.tInt.iValue = iVal;
876 			m_tPendingToken.tInt.iStrIndex = -1;
877 			m_iAtomPos -= 1; // skip NEAR
878 			break;
879 		}
880 
881 		// handle SENTENCE
882 		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "sentence" ) && !strncmp ( p, "SENTENCE", 8 ) )
883 		{
884 			// we just lexed our next token
885 			m_iPendingType = TOK_SENTENCE;
886 			m_iAtomPos -= 1;
887 			break;
888 		}
889 
890 		// handle PARAGRAPH
891 		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "paragraph" ) && !strncmp ( p, "PARAGRAPH", 9 ) )
892 		{
893 			// we just lexed our next token
894 			m_iPendingType = TOK_PARAGRAPH;
895 			m_iAtomPos -= 1;
896 			break;
897 		}
898 
899 		// handle MAYBE
900 		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strcasecmp ( sToken, "maybe" ) && !strncmp ( p, "MAYBE", 5 ) )
901 		{
902 			// we just lexed our next token
903 			m_iPendingType = TOK_MAYBE;
904 			m_iAtomPos -= 1;
905 			break;
906 		}
907 
908 		// handle ZONE
909 		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strncmp ( p, "ZONE:", 5 )
910 			&& ( sphIsAlpha(p[5]) || p[5]=='(' ) )
911 		{
912 			// ParseZone() will update tokenizer buffer ptr as needed
913 			int iVec = ParseZone ( p+5 );
914 			if ( iVec<0 )
915 				return -1;
916 
917 			// we just lexed our next token
918 			m_iPendingType = TOK_ZONE;
919 			m_tPendingToken.iZoneVec = iVec;
920 			m_iAtomPos -= 1;
921 			break;
922 		}
923 
924 		// handle ZONESPAN
925 		if ( sToken && p && !m_pTokenizer->m_bPhrase && !strncmp ( p, "ZONESPAN:", 9 )
926 			&& ( sphIsAlpha(p[9]) || p[9]=='(' ) )
927 		{
928 			// ParseZone() will update tokenizer buffer ptr as needed
929 			int iVec = ParseZone ( p+9 );
930 			if ( iVec<0 )
931 				return -1;
932 
933 			// we just lexed our next token
934 			m_iPendingType = TOK_ZONESPAN;
935 			m_tPendingToken.iZoneVec = iVec;
936 			m_iAtomPos -= 1;
937 			break;
938 		}
939 
940 		// count [ * ] at phrase node for qpos shift
941 		// FIXME! RLP can return tokens from several buffers, all this pointer arithmetic will lead to crashes
942 		if ( m_pTokenizer->m_bPhrase && m_pLastTokenEnd )
943 		{
944 			if ( strncmp ( sToken, "*", 1 )==0 )
945 			{
946 				m_dPhraseStar.Add ( m_iAtomPos );
947 			} else
948 			{
949 				int iSpace = 0;
950 				int iStar = 0;
951 				const char * sCur = m_pLastTokenEnd;
952 				const char * sEnd = m_pTokenizer->GetTokenStart();
953 				for ( ; sCur<sEnd; sCur++ )
954 				{
955 					int iCur = sCur - m_pLastTokenEnd;
956 					switch ( *sCur )
957 					{
958 					case '*':
959 						iStar = sCur - m_pLastTokenEnd;
960 						break;
961 					case ' ':
962 						if ( iSpace+2==iCur && iStar+1==iCur ) // match only [ * ] (separate single star) as valid shift operator
963 							m_dPhraseStar.Add ( m_iAtomPos );
964 						iSpace = iCur;
965 						break;
966 					}
967 				}
968 			}
969 		}
970 
971 		// handle specials
972 		if ( m_pTokenizer->WasTokenSpecial() )
973 		{
974 			// specials must not affect pos
975 			m_iAtomPos--;
976 
977 			// some specials are especially special
978 			if ( sToken[0]=='@' )
979 			{
980 				bool bIgnore;
981 
982 				// parse fields operator
983 				if ( !ParseFields ( m_tPendingToken.tFieldLimit.dMask, m_tPendingToken.tFieldLimit.iMaxPos, bIgnore ) )
984 					return -1;
985 
986 				if ( bIgnore )
987 					continue;
988 
989 				m_iPendingType = TOK_FIELDLIMIT;
990 				break;
991 
992 			} else if ( sToken[0]=='<' )
993 			{
994 				if ( *m_pTokenizer->GetBufferPtr()=='<' )
995 				{
996 					// got "<<", aka operator BEFORE
997 					m_iPendingType = TOK_BEFORE;
998 					break;
999 				} else
1000 				{
1001 					// got stray '<', ignore
1002 					if ( m_iPendingNulls>0 )
1003 					{
1004 						m_iPendingNulls = 0;
1005 						lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
1006 						m_bWasKeyword = true;
1007 						return TOK_KEYWORD;
1008 					}
1009 					continue;
1010 				}
1011 			} else if ( sToken[0]=='^' )
1012 			{
1013 				const char * pTokEnd = m_pTokenizer->GetTokenEnd();
1014 				if ( pTokEnd<m_pTokenizer->GetBufferEnd() && !sphIsSpace ( pTokEnd[0] ) )
1015 					bWasFrontModifier = true;
1016 
1017 				// this special is handled in HandleModifiers()
1018 				continue;
1019 			} else if ( sToken[0]=='$' )
1020 			{
1021 				if ( bWasKeyword )
1022 					continue;
1023 				if ( sphIsSpace ( m_pTokenizer->GetTokenStart() [ -1 ] ) )
1024 					continue;
1025 
1026 				// right after overshort
1027 				if ( m_pTokenizer->GetOvershortCount()==1 )
1028 				{
1029 					m_iPendingNulls = 0;
1030 					lvalp->pNode = AddKeyword ( NULL, iSkippedPosBeforeToken );
1031 					return TOK_KEYWORD;
1032 				}
1033 
1034 				Warning ( "modifiers must be applied to keywords, not operators" );
1035 
1036 				// this special is handled in HandleModifiers()
1037 				continue;
1038 			} else
1039 			{
1040 				bool bWasQuoted = m_bQuoted;
1041 				// all the other specials are passed to parser verbatim
1042 				if ( sToken[0]=='"' )
1043 				{
1044 					m_bQuoted = !m_bQuoted;
1045 					if ( m_bQuoted )
1046 						m_dPhraseStar.Resize ( 0 );
1047 				}
1048 				m_iPendingType = sToken[0];
1049 				m_pTokenizer->m_bPhrase = m_bQuoted;
1050 
1051 				if ( sToken[0]=='(' )
1052 				{
1053 					XQLimitSpec_t * pLastField = m_dStateSpec.Last();
1054 					m_dStateSpec.Add ( pLastField );
1055 				} else if ( sToken[0]==')' && m_dStateSpec.GetLength()>1 )
1056 				{
1057 					m_dStateSpec.Pop();
1058 				}
1059 
1060 				if ( bWasQuoted && !m_bQuoted )
1061 					m_iQuorumQuote = m_iAtomPos;
1062 				else if ( sToken[0]=='/' )
1063 					m_iQuorumFSlash = m_iAtomPos;
1064 
1065 				if ( sToken[0]=='~' ||sToken[0]=='/' )
1066 					m_bCheckNumber = true;
1067 				break;
1068 			}
1069 		}
1070 
1071 		// check for stopword, and create that node
1072 		// temp buffer is required, because GetWordID() might expand (!) the keyword in-place
1073 		BYTE sTmp [ MAX_TOKEN_BYTES ];
1074 
1075 		strncpy ( (char*)sTmp, sToken, MAX_TOKEN_BYTES );
1076 		sTmp[MAX_TOKEN_BYTES-1] = '\0';
1077 
1078 		int iStopWord = 0;
1079 		if ( m_pPlugin && m_pPlugin->m_fnPreMorph )
1080 			m_pPlugin->m_fnPreMorph ( m_pPluginData, (char*)sTmp, &iStopWord );
1081 
1082 		m_pDict->SetApplyMorph ( m_pTokenizer->GetMorphFlag() );
1083 		SphWordID_t uWordId = iStopWord ? 0 : m_pDict->GetWordID ( sTmp );
1084 
1085 		if ( uWordId && m_pPlugin && m_pPlugin->m_fnPostMorph )
1086 		{
1087 			int iRes = m_pPlugin->m_fnPostMorph ( m_pPluginData, (char*)sTmp, &iStopWord );
1088 			if ( iStopWord )
1089 				uWordId = 0;
1090 			else if ( iRes )
1091 				uWordId = m_pDict->GetWordIDNonStemmed ( sTmp );
1092 		}
1093 
1094 		if ( !uWordId )
1095 		{
1096 			sToken = NULL;
1097 			// stopwords with step=0 must not affect pos
1098 			if ( m_bEmptyStopword )
1099 				m_iAtomPos--;
1100 		}
1101 
1102 		bool bMultiDestHead = false;
1103 		bool bMultiDest = false;
1104 		int iDestCount = 0;
1105 		// do nothing inside phrase
1106 		if ( !m_pTokenizer->m_bPhrase )
1107 			bMultiDest = m_pTokenizer->WasTokenMultiformDestination ( bMultiDestHead, iDestCount );
1108 
1109 		if ( bMultiDest && !bMultiDestHead )
1110 		{
1111 			assert ( m_dMultiforms.GetLength() );
1112 			m_dMultiforms.Last().m_iDestCount++;
1113 			m_dDestForms.Add ( sToken );
1114 			m_bWasKeyword = true;
1115 		} else
1116 		{
1117 			m_tPendingToken.pNode = AddKeyword ( sToken, iSkippedPosBeforeToken );
1118 			m_iPendingType = TOK_KEYWORD;
1119 		}
1120 
1121 		if ( bMultiDestHead )
1122 		{
1123 			MultiformNode_t & tMulti = m_dMultiforms.Add();
1124 			tMulti.m_pNode = m_tPendingToken.pNode;
1125 			tMulti.m_iDestStart = m_dDestForms.GetLength();
1126 			tMulti.m_iDestCount = 0;
1127 		}
1128 
1129 		if ( m_pTokenizer->TokenIsBlended() )
1130 			m_iAtomPos--;
1131 
1132 		if ( !bMultiDest || bMultiDestHead )
1133 			break;
1134 	}
1135 
1136 	if ( bWasFrontModifier && m_iPendingType!=TOK_KEYWORD )
1137 		Warning ( "modifiers must be applied to keywords, not operators" );
1138 
1139 	// someone must be pending now!
1140 	assert ( m_iPendingType );
1141 	m_bEmpty = false;
1142 
1143 	// ladies first, though
1144 	if ( m_iPendingNulls>0 )
1145 	{
1146 		m_iPendingNulls--;
1147 		lvalp->pNode = AddKeyword ( NULL );
1148 		m_bWasKeyword = true;
1149 		return TOK_KEYWORD;
1150 	}
1151 
1152 	// pending the offending
1153 	int iRes = m_iPendingType;
1154 	m_iPendingType = 0;
1155 
1156 	if ( iRes==TOK_KEYWORD )
1157 		m_bWasKeyword = true;
1158 	*lvalp = m_tPendingToken;
1159 	return iRes;
1160 }
1161 
1162 
// Install the fully parsed tree as the root of the current query.
// Called by the grammar once the top-level expression has been reduced.
void XQParser_t::AddQuery ( XQNode_t * pNode )
{
	m_pRoot = pNode;
}
1167 
1168 // Handle modifiers:
1169 // 1) ^ - field start
1170 // 2) $ - field end
1171 // 3) ^1.234 - keyword boost
1172 // keyword$^1.234 - field end with boost are on
1173 // keywords^1.234$ - only boost here, '$' it NOT modifier
HandleModifiers(XQKeyword_t & tKeyword)1174 void XQParser_t::HandleModifiers ( XQKeyword_t & tKeyword )
1175 {
1176 	const char * sTokStart = m_pTokenizer->GetTokenStart();
1177 	const char * sTokEnd = m_pTokenizer->GetTokenEnd();
1178 	if ( !sTokStart || !sTokEnd )
1179 		return;
1180 
1181 	const char * sQuery = reinterpret_cast<char *> ( m_sQuery );
1182 	tKeyword.m_bFieldStart = ( sTokStart-sQuery )>0 && sTokStart [ -1 ]=='^' &&
1183 		!( ( sTokStart-sQuery )>1 && sTokStart [ -2 ]=='\\' );
1184 
1185 	if ( sTokEnd[0]=='$' )
1186 	{
1187 		tKeyword.m_bFieldEnd = true;
1188 		++sTokEnd; // Skipping.
1189 	}
1190 
1191 	if ( sTokEnd[0]=='^' && ( sTokEnd[1]=='.' || isdigit ( sTokEnd[1] ) ) )
1192 	{
1193 		// Probably we have a boost, lets check.
1194 		char * pEnd;
1195 		float fBoost = (float)strtod ( sTokEnd+1, &pEnd );
1196 		if ( ( sTokEnd+1 )!=pEnd )
1197 		{
1198 			// We do have a boost.
1199 			// FIXME Handle ERANGE errno here.
1200 			tKeyword.m_fBoost = fBoost;
1201 			m_pTokenizer->SetBufferPtr ( pEnd );
1202 		}
1203 	}
1204 }
1205 
1206 
// Create a new single-keyword leaf node at the current atom position,
// inheriting the currently active field/zone limit spec.
// sKeyword may be NULL (a swallowed stopword placeholder).
// The node is tracked in m_dSpawned for cleanup on parse error.
XQNode_t * XQParser_t::AddKeyword ( const char * sKeyword, int iSkippedPosBeforeToken )
{
	XQKeyword_t tAW ( sKeyword, m_iAtomPos );
	tAW.m_iSkippedBefore = iSkippedPosBeforeToken;
	HandleModifiers ( tAW ); // pick up ^, $ and ^boost around the raw token
	XQNode_t * pNode = new XQNode_t ( *m_dStateSpec.Last() );
	pNode->m_dWords.Add ( tAW );
	m_dSpawned.Add ( pNode );
	return pNode;
}
1217 
1218 
AddKeyword(XQNode_t * pLeft,XQNode_t * pRight)1219 XQNode_t * XQParser_t::AddKeyword ( XQNode_t * pLeft, XQNode_t * pRight )
1220 {
1221 	if ( !pLeft || !pRight )
1222 		return pLeft ? pLeft : pRight;
1223 
1224 	assert ( pLeft->m_dWords.GetLength()>0 );
1225 	assert ( pRight->m_dWords.GetLength()==1 );
1226 
1227 	pLeft->m_dWords.Add ( pRight->m_dWords[0] );
1228 	m_dSpawned.RemoveValue ( pRight );
1229 	SafeDelete ( pRight );
1230 	return pLeft;
1231 }
1232 
1233 
HasMissedField(const XQLimitSpec_t & tSpec)1234 static bool HasMissedField ( const XQLimitSpec_t & tSpec )
1235 {
1236 	return ( tSpec.m_dFieldMask.TestAll ( false ) && tSpec.m_iFieldMaxPos==0 && !tSpec.m_bZoneSpan && tSpec.m_dZones.GetLength()==0 );
1237 }
1238 
1239 
// Build an operator node. SPH_QUERY_NOT is unary (wraps pLeft only);
// everything else is binary. For binary ops, a NULL operand simply
// yields the other operand; a left node already carrying the same
// op/arg is reused by appending pRight as one more child.
XQNode_t * XQParser_t::AddOp ( XQOperator_e eOp, XQNode_t * pLeft, XQNode_t * pRight, int iOpArg )
{
	/////////
	// unary
	/////////

	if ( eOp==SPH_QUERY_NOT )
	{
		XQNode_t * pNode = new XQNode_t ( *m_dStateSpec.Last() );
		pNode->SetOp ( SPH_QUERY_NOT, pLeft );
		m_dSpawned.Add ( pNode );
		return pNode;
	}

	//////////
	// binary
	//////////

	if ( !pLeft || !pRight )
		return pLeft ? pLeft : pRight;

	// build a new node
	XQNode_t * pResult = NULL;
	if ( pLeft->m_dChildren.GetLength() && pLeft->GetOp()==eOp && pLeft->m_iOpArg==iOpArg )
	{
		// same chained operator: flatten by appending to the existing node
		pLeft->m_dChildren.Add ( pRight );
		pRight->m_pParent = pLeft;
		pResult = pLeft;
	} else
	{
		// however-2, beside all however below, [@@relaxed ((@title hello) | (@missed world)) @body other terms]
		// we should use valid (left) field mask for complex (OR) node
		// as right node in this case has m_bFieldSpec==true but m_dFieldMask==0
		const XQLimitSpec_t & tSpec = HasMissedField ( pRight->m_dSpec ) ? pLeft->m_dSpec : pRight->m_dSpec;

		// however, it's right (!) spec which is chosen for the resulting node,
		// eg. '@title hello' + 'world @body program'
		XQNode_t * pNode = new XQNode_t ( tSpec );
		pNode->SetOp ( eOp, pLeft, pRight );
		pNode->m_iOpArg = iOpArg;
		m_dSpawned.Add ( pNode );
		pResult = pNode;
	}
	return pResult;
}
1285 
1286 
SetPhrase(XQNode_t * pNode,bool bSetExact)1287 void XQParser_t::SetPhrase ( XQNode_t * pNode, bool bSetExact )
1288 {
1289 	if ( !pNode )
1290 		return;
1291 
1292 	assert ( pNode->m_dWords.GetLength() );
1293 	if ( bSetExact )
1294 	{
1295 		ARRAY_FOREACH ( iWord, pNode->m_dWords )
1296 		{
1297 			if ( !pNode->m_dWords[iWord].m_sWord.IsEmpty() )
1298 				pNode->m_dWords[iWord].m_sWord.SetSprintf ( "=%s", pNode->m_dWords[iWord].m_sWord.cstr() );
1299 		}
1300 	}
1301 	pNode->SetOp ( SPH_QUERY_PHRASE );
1302 
1303 	PhraseShiftQpos ( pNode );
1304 }
1305 
1306 
// Recursively remove NULL keywords (swallowed stopwords) and the nodes that
// become empty as a result. Returns the surviving node, a replacement node
// (when a single-child op node collapses), or NULL if the subtree vanished.
XQNode_t * XQParser_t::SweepNulls ( XQNode_t * pNode )
{
	if ( !pNode )
		return NULL;

	// sweep plain node
	if ( pNode->m_dWords.GetLength() )
	{
		// drop keywords whose text was nulled out
		ARRAY_FOREACH ( i, pNode->m_dWords )
			if ( pNode->m_dWords[i].m_sWord.cstr()==NULL )
				pNode->m_dWords.Remove ( i-- );

		// no keywords left at all: delete the whole node
		if ( pNode->m_dWords.GetLength()==0 )
		{
			m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
			SafeDelete ( pNode );
			return NULL;
		}

		return pNode;
	}

	// sweep op node
	ARRAY_FOREACH ( i, pNode->m_dChildren )
	{
		pNode->m_dChildren[i] = SweepNulls ( pNode->m_dChildren[i] );
		if ( pNode->m_dChildren[i]==NULL )
		{
			pNode->m_dChildren.Remove ( i-- );
			// use non-null iOpArg as a flag indicating that the sweeping happened.
			++pNode->m_iOpArg;
		}
	}

	// all children swept away: this op node dies too
	if ( pNode->m_dChildren.GetLength()==0 )
	{
		m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
		SafeDelete ( pNode );
		return NULL;
	}

	// remove redundancies if needed
	if ( pNode->GetOp()!=SPH_QUERY_NOT && pNode->m_dChildren.GetLength()==1 )
	{
		// collapse the single-child op node onto its child
		XQNode_t * pRet = pNode->m_dChildren[0];
		pNode->m_dChildren.Reset ();
		pRet->m_pParent = pNode->m_pParent;
		// expressions like 'la !word' (having min_word_len>len(la)) became a 'null' node.
		if ( pNode->m_iOpArg && pRet->GetOp()==SPH_QUERY_NOT )
		{
			pRet->SetOp ( SPH_QUERY_NULL );
			ARRAY_FOREACH ( i, pRet->m_dChildren )
			{
				m_dSpawned.RemoveValue ( pRet->m_dChildren[i] );
				SafeDelete ( pRet->m_dChildren[i] )
			}
			pRet->m_dChildren.Reset();
		}
		pRet->m_iOpArg = pNode->m_iOpArg;

		m_dSpawned.RemoveValue ( pNode ); // OPTIMIZE!
		SafeDelete ( pNode );
		return SweepNulls ( pRet ); // the promoted child may itself be collapsible
	}

	// done
	return pNode;
}
1375 
// Propagate SPH_QUERY_NULL markers (left behind by SweepNulls) up the tree:
// under OR a null child is simply discarded (smth OR null == smth),
// under AND any null child nulls out the whole node (smth AND null == null).
void XQParser_t::FixupNulls ( XQNode_t * pNode )
{
	if ( !pNode )
		return;

	// bottom-up: fix the children first
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		FixupNulls ( pNode->m_dChildren[i] );

	// smth OR null == smth.
	if ( pNode->GetOp()==SPH_QUERY_OR )
	{
		// keep only non-null children, delete the null ones
		CSphVector<XQNode_t*> dNotNulls;
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			XQNode_t* pChild = pNode->m_dChildren[i];
			if ( pChild->GetOp()!=SPH_QUERY_NULL )
				dNotNulls.Add ( pChild );
			else
			{
				m_dSpawned.RemoveValue ( pChild );
				SafeDelete ( pChild );
			}
		}
		pNode->m_dChildren.SwapData ( dNotNulls );
		dNotNulls.Reset();
	// smth AND null = null.
	} else if ( pNode->GetOp()==SPH_QUERY_AND )
	{
		bool bHasNull = ARRAY_ANY ( bHasNull, pNode->m_dChildren, pNode->m_dChildren[_any]->GetOp()==SPH_QUERY_NULL );
		if ( bHasNull )
		{
			// the whole node becomes null; free all of its children
			pNode->SetOp ( SPH_QUERY_NULL );
			ARRAY_FOREACH ( i, pNode->m_dChildren )
			{
				m_dSpawned.RemoveValue ( pNode->m_dChildren[i] );
				SafeDelete ( pNode->m_dChildren[i] )
			}
			pNode->m_dChildren.Reset();
		}
	}
}
1417 
// Rewrite NOT operators into computable form. NOT children of an AND node
// get converted into a single ANDNOT (AND-part vs OR-of-NOTs); NOT anywhere
// else (alone, under OR/MAYBE/NEAR/BEFORE) is a parse error.
// Returns false and sets m_pParsed->m_sParseError on failure.
bool XQParser_t::FixupNots ( XQNode_t * pNode )
{
	// no processing for plain nodes
	if ( !pNode || pNode->m_dWords.GetLength() )
		return true;

	// process 'em children
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		if ( !FixupNots ( pNode->m_dChildren[i] ) )
			return false;

	// extract NOT subnodes
	CSphVector<XQNode_t*> dNots;
	ARRAY_FOREACH ( i, pNode->m_dChildren )
		if ( pNode->m_dChildren[i]->GetOp()==SPH_QUERY_NOT )
	{
		dNots.Add ( pNode->m_dChildren[i] );
		pNode->m_dChildren.RemoveFast ( i-- );
	}

	// no NOTs? we're square
	if ( !dNots.GetLength() )
		return true;

	// nothing but NOTs? we can't compute that
	if ( !pNode->m_dChildren.GetLength() )
	{
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (node consists of NOT operators only)" );
		return false;
	}

	// NOT within OR or MAYBE? we can't compute that
	if ( pNode->GetOp()==SPH_QUERY_OR || pNode->GetOp()==SPH_QUERY_MAYBE || pNode->GetOp()==SPH_QUERY_NEAR )
	{
		XQOperator_e eOp = pNode->GetOp();
		const char * sOp = ( eOp==SPH_QUERY_OR ? "OR" : ( eOp==SPH_QUERY_MAYBE ? "MAYBE" : "NEAR" ) );
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (NOT is not allowed within %s)", sOp );
		return false;
	}

	// NOT used in before operator
	if ( pNode->GetOp()==SPH_QUERY_BEFORE )
	{
		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (NOT cannot be used as before operand)" );
		return false;
	}

	// must be some NOTs within AND at this point, convert this node to ANDNOT
	assert ( pNode->GetOp()==SPH_QUERY_AND && pNode->m_dChildren.GetLength() && dNots.GetLength() );

	// positive part: AND over the remaining (non-NOT) children
	XQNode_t * pAnd = new XQNode_t ( pNode->m_dSpec );
	pAnd->SetOp ( SPH_QUERY_AND, pNode->m_dChildren );
	m_dSpawned.Add ( pAnd );

	// negative part: the single NOT, or an OR over all of them
	XQNode_t * pNot = NULL;
	if ( dNots.GetLength()==1 )
	{
		pNot = dNots[0];
	} else
	{
		pNot = new XQNode_t ( pNode->m_dSpec );
		pNot->SetOp ( SPH_QUERY_OR, dNots );
		m_dSpawned.Add ( pNot );
	}

	pNode->SetOp ( SPH_QUERY_ANDNOT, pAnd, pNot );
	return true;
}
1486 
1487 
CollectChildren(XQNode_t * pNode,CSphVector<XQNode_t * > & dChildren)1488 static void CollectChildren ( XQNode_t * pNode, CSphVector<XQNode_t *> & dChildren )
1489 {
1490 	if ( !pNode->m_dChildren.GetLength() )
1491 		return;
1492 
1493 	dChildren.Add ( pNode );
1494 	ARRAY_FOREACH ( i, dChildren )
1495 	{
1496 		const XQNode_t * pNode = dChildren[i];
1497 		ARRAY_FOREACH ( j, pNode->m_dChildren )
1498 			dChildren.Add ( pNode->m_dChildren[j] );
1499 	}
1500 }
1501 
1502 
// Delete any child subtree whose field mask is completely empty (ie. the
// query part cannot match in any field), recursing into subtrees that are
// kept. Removed nodes are untracked from m_dSpawned before deletion.
void XQParser_t::DeleteNodesWOFields ( XQNode_t * pNode )
{
	if ( !pNode )
		return;

	for ( int i = 0; i < pNode->m_dChildren.GetLength (); )
	{
		if ( pNode->m_dChildren[i]->m_dSpec.m_dFieldMask.TestAll ( false ) )
		{
			XQNode_t * pChild = pNode->m_dChildren[i];
			CSphVector<XQNode_t *> dChildren;
			CollectChildren ( pChild, dChildren );
#ifndef NDEBUG
			// every collected descendant must have an empty mask too
			bool bAllEmpty = ARRAY_ALL ( bAllEmpty, dChildren, dChildren[_all]->m_dSpec.m_dFieldMask.TestAll ( false ) );
			assert ( pChild->m_dChildren.GetLength()==0 || ( dChildren.GetLength() && bAllEmpty ) );
#endif
			// untrack the whole subtree (or just the leaf) from m_dSpawned
			if ( dChildren.GetLength() )
			{
				ARRAY_FOREACH ( i, dChildren )
					m_dSpawned.RemoveValue ( dChildren[i] );
			} else
			{
				m_dSpawned.RemoveValue ( pChild );
			}

			// this should be a leaf node
			SafeDelete ( pNode->m_dChildren[i] );
			pNode->m_dChildren.RemoveFast ( i );

		} else
		{
			DeleteNodesWOFields ( pNode->m_dChildren[i] );
			i++;
		}
	}
}
1539 
1540 
// Fold '*' placeholders inside a phrase (positions recorded in m_dPhraseStar)
// into query-position shifts of the following keywords, removing duplicated
// raw-star terms along the way.
void XQParser_t::PhraseShiftQpos ( XQNode_t * pNode )
{
	if ( !m_dPhraseStar.GetLength() )
		return;

	const int * pLast = m_dPhraseStar.Begin();
	const int * pEnd = m_dPhraseStar.Begin() + m_dPhraseStar.GetLength();
	int iQposShiftStart = *pLast;	// no shifts apply before the first star
	int iQposShift = 0;				// accumulated shift so far
	int iLastStarPos = *pLast;		// position of the most recently folded star

	ARRAY_FOREACH ( iWord, pNode->m_dWords )
	{
		XQKeyword_t & tWord = pNode->m_dWords[iWord];

		// fold stars in phrase till current term position
		while ( pLast<pEnd && *(pLast)<=tWord.m_iAtomPos )
		{
			iLastStarPos = *pLast;
			pLast++;
			iQposShift++;
		}

		// star dictionary passes raw star however regular dictionary suppress it
		// raw star also might be suppressed by min_word_len option
		// so remove qpos shift from duplicated raw star term
		// however not stopwords that is also term with empty word
		if ( tWord.m_sWord=="*" || ( tWord.m_sWord.IsEmpty() && tWord.m_iAtomPos==iLastStarPos ) )
		{
			pNode->m_dWords.Remove ( iWord-- );
			iQposShift--;
			continue;
		}

		// apply the accumulated shift to keywords at/after the first star
		if ( iQposShiftStart<=tWord.m_iAtomPos )
			tWord.m_iAtomPos += iQposShift;
	}
}
1579 
1580 
// Recursively validate quorum and proximity thresholds in the tree.
// Quorum arg must be positive, and at most 100 for the percent form;
// proximity arg must be at least 1. Returns false and sets pError otherwise.
static bool CheckQuorumProximity ( XQNode_t * pNode, CSphString * pError )
{
	assert ( pError );
	if ( !pNode )
		return true;

	bool bQuorumPassed = ( pNode->GetOp()!=SPH_QUERY_QUORUM ||
		( pNode->m_iOpArg>0 && ( !pNode->m_bPercentOp || pNode->m_iOpArg<=100 ) ) );
	if ( !bQuorumPassed )
	{
		if ( pNode->m_bPercentOp )
			pError->SetSprintf ( "quorum threshold out of bounds 0.0 and 1.0f (%f)", 1.0f / 100.0f * pNode->m_iOpArg );
		else
			pError->SetSprintf ( "quorum threshold too low (%d)", pNode->m_iOpArg );
		return false;
	}

	if ( pNode->GetOp()==SPH_QUERY_PROXIMITY && pNode->m_iOpArg<1 )
	{
		pError->SetSprintf ( "proximity threshold too low (%d)", pNode->m_iOpArg );
		return false;
	}

	// all children must pass too
	bool bValid = ARRAY_ALL ( bValid, pNode->m_dChildren, CheckQuorumProximity ( pNode->m_dChildren[_all], pError ) );
	return bValid;
}
1607 
1608 
FixupDegenerates(XQNode_t * pNode,CSphString & sWarning)1609 static void FixupDegenerates ( XQNode_t * pNode, CSphString & sWarning )
1610 {
1611 	if ( !pNode )
1612 		return;
1613 
1614 	if ( pNode->m_dWords.GetLength()==1 &&
1615 		( pNode->GetOp()==SPH_QUERY_PHRASE || pNode->GetOp()==SPH_QUERY_PROXIMITY || pNode->GetOp()==SPH_QUERY_QUORUM ) )
1616 	{
1617 		if ( pNode->GetOp()==SPH_QUERY_QUORUM && !pNode->m_bPercentOp && pNode->m_iOpArg>1 )
1618 			sWarning.SetSprintf ( "quorum threshold too high (words=%d, thresh=%d); replacing quorum operator with AND operator", pNode->m_dWords.GetLength(), pNode->m_iOpArg );
1619 
1620 		pNode->SetOp ( SPH_QUERY_AND );
1621 		return;
1622 	}
1623 
1624 	ARRAY_FOREACH ( i, pNode->m_dChildren )
1625 		FixupDegenerates ( pNode->m_dChildren[i], sWarning );
1626 }
1627 
// Expand multiform (wordforms) destination tokens collected during
// tokenizing: each multiform head node becomes an AND over the head keyword
// plus its destination forms, with exact-form and field start/end modifiers
// carried over appropriately.
void XQParser_t::FixupDestForms ()
{
	if ( !m_dMultiforms.GetLength() )
		return;

	CSphVector<XQNode_t *> dForms;

	ARRAY_FOREACH ( iNode, m_dMultiforms )
	{
		const MultiformNode_t & tDesc = m_dMultiforms[iNode];
		XQNode_t * pMultiParent = tDesc.m_pNode;
		assert ( pMultiParent->m_dWords.GetLength()==1 && pMultiParent->m_dChildren.GetLength()==0 );

		// detach the head keyword; pMultiParent turns into an op node below
		XQKeyword_t tKeyword;
		Swap ( pMultiParent->m_dWords[0], tKeyword );
		pMultiParent->m_dWords.Reset();

		// FIXME: what about whildcard?
		bool bExact = ( tKeyword.m_sWord.Length()>1 && tKeyword.m_sWord.cstr()[0]=='=' );
		bool bFieldStart = tKeyword.m_bFieldStart;
		bool bFieldEnd = tKeyword.m_bFieldEnd;
		tKeyword.m_bFieldStart = false;
		tKeyword.m_bFieldEnd = false;

		// first form: the head keyword itself
		XQNode_t * pMultiHead = new XQNode_t ( pMultiParent->m_dSpec );
		pMultiHead->m_dWords.Add ( tKeyword );
		m_dSpawned.Add ( pMultiHead );
		dForms.Add ( pMultiHead );

		// then one node per destination form, at consecutive positions
		for ( int iForm=0; iForm<tDesc.m_iDestCount; iForm++ )
		{
			tKeyword.m_iAtomPos++;
			tKeyword.m_sWord = m_dDestForms [ tDesc.m_iDestStart + iForm ];

			// propagate exact word flag to all destination forms
			if ( bExact )
				tKeyword.m_sWord.SetSprintf ( "=%s", tKeyword.m_sWord.cstr() );

			XQNode_t * pMulti = new XQNode_t ( pMultiParent->m_dSpec );
			pMulti->m_dWords.Add ( tKeyword );
			m_dSpawned.Add ( pMulti );
			dForms.Add ( pMulti );
		}

		// fix-up field start\end modifier
		dForms[0]->m_dWords[0].m_bFieldStart = bFieldStart;
		dForms.Last()->m_dWords[0].m_bFieldEnd = bFieldEnd;

		pMultiParent->SetOp ( SPH_QUERY_AND, dForms );
		dForms.Resize ( 0 );
	}
}
1680 
1681 
Parse(XQQuery_t & tParsed,const char * sQuery,const CSphQuery * pQuery,const ISphTokenizer * pTokenizer,const CSphSchema * pSchema,CSphDict * pDict,const CSphIndexSettings & tSettings)1682 bool XQParser_t::Parse ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer,
1683 	const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings )
1684 {
1685 	// FIXME? might wanna verify somehow that pTokenizer has all the specials etc from sphSetupQueryTokenizer
1686 	CSphScopedPtr<ISphTokenizer> pMyTokenizer ( NULL );
1687 	CSphTightVector<char> dRlpBuf;
1688 
1689 #if USE_RLP
1690 	if ( !pTokenizer->GetRLPContext() )
1691 	{
1692 #endif
1693 		pMyTokenizer = pTokenizer->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
1694 #if USE_RLP
1695 	} else
1696 	{
1697 		const char * sProcessed = NULL;
1698 		if ( !ISphTokenizer::ProcessQueryRLP ( tSettings.m_sRLPContext.cstr(), sQuery, &sProcessed, dRlpBuf, tParsed.m_sParseError ) )
1699 			return false;
1700 
1701 		pMyTokenizer = pTokenizer->GetEmbeddedTokenizer()->Clone ( SPH_CLONE_QUERY_LIGHTWEIGHT );
1702 		sQuery = sProcessed;
1703 	}
1704 #endif
1705 
1706 	// most outcomes are errors
1707 	SafeDelete ( tParsed.m_pRoot );
1708 
1709 	// check for relaxed syntax
1710 	const char * OPTION_RELAXED = "@@relaxed";
1711 	const int OPTION_RELAXED_LEN = strlen ( OPTION_RELAXED );
1712 
1713 	m_bStopOnInvalid = true;
1714 	if ( sQuery && strncmp ( sQuery, OPTION_RELAXED, OPTION_RELAXED_LEN )==0 && !sphIsAlpha ( sQuery[OPTION_RELAXED_LEN] ) )
1715 	{
1716 		sQuery += OPTION_RELAXED_LEN;
1717 		m_bStopOnInvalid = false;
1718 	}
1719 
1720 	m_pPlugin = NULL;
1721 	m_pPluginData = NULL;
1722 
1723 	if ( pQuery && !pQuery->m_sQueryTokenFilterName.IsEmpty() )
1724 	{
1725 		CSphString sError;
1726 		m_pPlugin = static_cast < PluginQueryTokenFilter_c * > ( sphPluginAcquire ( pQuery->m_sQueryTokenFilterLib.cstr(),
1727 			PLUGIN_QUERY_TOKEN_FILTER, pQuery->m_sQueryTokenFilterName.cstr(), tParsed.m_sParseError ) );
1728 		if ( !m_pPlugin )
1729 			return false;
1730 
1731 		char szError [ SPH_UDF_ERROR_LEN ];
1732 		if ( m_pPlugin->m_fnInit && m_pPlugin->m_fnInit ( &m_pPluginData, MAX_TOKEN_BYTES, pQuery->m_sQueryTokenFilterOpts.cstr(), szError )!=0 )
1733 		{
1734 			tParsed.m_sParseError = sError;
1735 			m_pPlugin->Release();
1736 			m_pPlugin = NULL;
1737 			m_pPluginData = NULL;
1738 			return false;
1739 		}
1740 	}
1741 
1742 	// setup parser
1743 	m_pParsed = &tParsed;
1744 	m_sQuery = (BYTE*) sQuery;
1745 	m_iQueryLen = sQuery ? strlen(sQuery) : 0;
1746 	m_pTokenizer = pMyTokenizer.Ptr();
1747 	m_pSchema = pSchema;
1748 	m_iAtomPos = 0;
1749 	m_iPendingNulls = 0;
1750 	m_iPendingType = 0;
1751 	m_pRoot = NULL;
1752 	m_bEmpty = true;
1753 	m_bEmptyStopword = ( tSettings.m_iStopwordStep==0 );
1754 	m_iOvershortStep = tSettings.m_iOvershortStep;
1755 
1756 	CSphScopedPtr<CSphDict> tDictCloned ( NULL );
1757 	m_pDict = pDict;
1758 	if ( pDict->HasState() )
1759 		tDictCloned = m_pDict = pDict->Clone();
1760 
1761 	m_pTokenizer->SetBuffer ( m_sQuery, m_iQueryLen );
1762 	int iRes = yyparse ( this );
1763 
1764 	if ( m_pPlugin )
1765 	{
1766 		if ( m_pPlugin->m_fnDeinit )
1767 			m_pPlugin->m_fnDeinit ( m_pPluginData );
1768 
1769 		m_pPlugin->Release();
1770 		m_pPlugin = NULL;
1771 		m_pPluginData = NULL;
1772 	}
1773 
1774 	if ( ( iRes || !m_pParsed->m_sParseError.IsEmpty() ) && !m_bEmpty )
1775 	{
1776 		Cleanup ();
1777 		return false;
1778 	}
1779 
1780 	FixupDestForms ();
1781 	DeleteNodesWOFields ( m_pRoot );
1782 	m_pRoot = SweepNulls ( m_pRoot );
1783 	FixupDegenerates ( m_pRoot, m_pParsed->m_sParseWarning );
1784 	FixupNulls ( m_pRoot );
1785 
1786 	if ( !FixupNots ( m_pRoot ) )
1787 	{
1788 		Cleanup ();
1789 		return false;
1790 	}
1791 
1792 	if ( !CheckQuorumProximity ( m_pRoot, &m_pParsed->m_sParseError ) )
1793 	{
1794 		Cleanup();
1795 		return false;
1796 	}
1797 
1798 	if ( m_pRoot && m_pRoot->GetOp()==SPH_QUERY_NOT && !m_pRoot->m_iOpArg )
1799 	{
1800 		Cleanup ();
1801 		m_pParsed->m_sParseError.SetSprintf ( "query is non-computable (single NOT operator)" );
1802 		return false;
1803 	}
1804 
1805 	// all ok; might want to create a dummy node to indicate that
1806 	m_dSpawned.Reset();
1807 
1808 	if ( m_pRoot && m_pRoot->GetOp()==SPH_QUERY_NULL )
1809 	{
1810 		delete m_pRoot;
1811 		m_pRoot = NULL;
1812 	}
1813 
1814 	tParsed.m_pRoot = m_pRoot ? m_pRoot : new XQNode_t ( *m_dStateSpec.Last() );
1815 	return true;
1816 }
1817 
1818 //////////////////////////////////////////////////////////////////////////
1819 
1820 
1821 #ifdef XQDEBUG
// Print the "|-" indentation prefix for one dump line (two marks per level).
static void xqIndent ( int iIndent )
{
	for ( int i=0; i<iIndent*2; i++ )
		printf ( "|-" );
}
1828 
xqDump(const XQNode_t * pNode,int iIndent)1829 static void xqDump ( const XQNode_t * pNode, int iIndent )
1830 {
1831 #ifdef XQ_DUMP_NODE_ADDR
1832 	printf ( "0x%08x ", pNode );
1833 #endif
1834 	if ( pNode->m_dChildren.GetLength() )
1835 	{
1836 		xqIndent ( iIndent );
1837 		switch ( pNode->GetOp() )
1838 		{
1839 			case SPH_QUERY_AND: printf ( "AND:" ); break;
1840 			case SPH_QUERY_OR: printf ( "OR:" ); break;
1841 			case SPH_QUERY_MAYBE: printf ( "MAYBE:" ); break;
1842 			case SPH_QUERY_NOT: printf ( "NOT:" ); break;
1843 			case SPH_QUERY_ANDNOT: printf ( "ANDNOT:" ); break;
1844 			case SPH_QUERY_BEFORE: printf ( "BEFORE:" ); break;
1845 			case SPH_QUERY_PHRASE: printf ( "PHRASE:" ); break;
1846 			case SPH_QUERY_PROXIMITY: printf ( "PROXIMITY:" ); break;
1847 			case SPH_QUERY_QUORUM: printf ( "QUORUM:" ); break;
1848 			case SPH_QUERY_NEAR: printf ( "NEAR:" ); break;
1849 			case SPH_QUERY_SENTENCE: printf ( "SENTENCE:" ); break;
1850 			case SPH_QUERY_PARAGRAPH: printf ( "PARAGRAPH:" ); break;
1851 			default: printf ( "unknown-op-%d:", pNode->GetOp() ); break;
1852 		}
1853 		printf ( " (%d)\n", pNode->m_dChildren.GetLength() );
1854 		ARRAY_FOREACH ( i, pNode->m_dChildren )
1855 		{
1856 			assert ( pNode->m_dChildren[i]->m_pParent==pNode );
1857 			xqDump ( pNode->m_dChildren[i], iIndent+1 );
1858 		}
1859 	} else
1860 	{
1861 		xqIndent ( iIndent );
1862 		printf ( "MATCH(%d,%d):", pNode->m_dSpec.m_dFieldMask.GetMask32(), pNode->m_iOpArg );
1863 
1864 		ARRAY_FOREACH ( i, pNode->m_dWords )
1865 		{
1866 			const XQKeyword_t & tWord = pNode->m_dWords[i];
1867 
1868 			const char * sLocTag = "";
1869 			if ( tWord.m_bFieldStart ) sLocTag = ", start";
1870 			if ( tWord.m_bFieldEnd ) sLocTag = ", end";
1871 
1872 			printf ( " %s (qpos %d%s)", tWord.m_sWord.cstr(), tWord.m_iAtomPos, sLocTag );
1873 		}
1874 		printf ( "\n" );
1875 	}
1876 }
1877 #endif
1878 
1879 
// Reconstruct (approximate) query text from a parsed tree node. Leaf nodes
// emit their keywords wrapped per operator ("..." phrase, "..."~N proximity,
// "..."/N quorum, "..."NEAR/N) plus an @field prefix when a field mask is
// set; op nodes recurse and join children with the operator's text form.
CSphString sphReconstructNode ( const XQNode_t * pNode, const CSphSchema * pSchema )
{
	CSphString sRes ( "" );

	if ( !pNode )
		return sRes;

	if ( pNode->m_dWords.GetLength() )
	{
		// say just words to me
		const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
		ARRAY_FOREACH ( i, dWords )
			sRes.SetSprintf ( "%s %s", sRes.cstr(), dWords[i].m_sWord.cstr() );
		sRes.Trim ();

		// wrap the bag of words per operator
		switch ( pNode->GetOp() )
		{
		case SPH_QUERY_AND:			break;
		case SPH_QUERY_PHRASE:		sRes.SetSprintf ( "\"%s\"", sRes.cstr() ); break;
		case SPH_QUERY_PROXIMITY:	sRes.SetSprintf ( "\"%s\"~%d", sRes.cstr(), pNode->m_iOpArg ); break;
		case SPH_QUERY_QUORUM:		sRes.SetSprintf ( "\"%s\"/%d", sRes.cstr(), pNode->m_iOpArg ); break;
		case SPH_QUERY_NEAR:		sRes.SetSprintf ( "\"%s\"NEAR/%d", sRes.cstr(), pNode->m_iOpArg ); break;
		default:					assert ( 0 && "unexpected op in ReconstructNode()" ); break;
		}

		if ( !pNode->m_dSpec.m_dFieldMask.TestAll(true) )
		{
			// build a comma-separated field list (names when a schema is given)
			CSphString sFields ( "" );
			for ( int i=0; i<SPH_MAX_FIELDS; i++ )
			{
				if ( !pNode->m_dSpec.m_dFieldMask.Test(i) )
					continue;

				if ( pSchema )
					sFields.SetSprintf ( "%s,%s", sFields.cstr(), pSchema->m_dFields[i].m_sName.cstr() );
				else
					sFields.SetSprintf ( "%s,%d", sFields.cstr(), pNode->m_dSpec.m_dFieldMask.GetMask32() );
			}

			// +1 skips the leading comma accumulated above
			sRes.SetSprintf ( "( @%s: %s )", sFields.cstr()+1, sRes.cstr() );
		} else
		{
			if ( pNode->GetOp()==SPH_QUERY_AND && dWords.GetLength()>1 )
				sRes.SetSprintf ( "( %s )", sRes.cstr() ); // wrap bag of words
		}

	} else
	{
		ARRAY_FOREACH ( i, pNode->m_dChildren )
		{
			if ( !i )
			{
				sRes = sphReconstructNode ( pNode->m_dChildren[i], pSchema );
			} else
			{
				const char * sOp = "(unknown-op)";
				switch ( pNode->GetOp() )
				{
				case SPH_QUERY_AND:		sOp = " "; break;
				case SPH_QUERY_OR:		sOp = "|"; break;
				case SPH_QUERY_MAYBE:	sOp = "MAYBE"; break;
				case SPH_QUERY_NOT:		sOp = "NOT"; break;
				case SPH_QUERY_ANDNOT:	sOp = "AND NOT"; break;
				case SPH_QUERY_BEFORE:	sOp = "BEFORE"; break;
				case SPH_QUERY_NEAR:	sOp = "NEAR"; break;
				case SPH_QUERY_PHRASE:	sOp = ""; break;
				default:				assert ( 0 && "unexpected op in ReconstructNode()" ); break;
				}
				if ( pNode->GetOp()==SPH_QUERY_PHRASE )
					sRes.SetSprintf ( "\"%s %s\"", sRes.cstr(), sphReconstructNode ( pNode->m_dChildren[i], pSchema ).cstr() );
				else
					sRes.SetSprintf ( "%s %s %s", sRes.cstr(), sOp, sphReconstructNode ( pNode->m_dChildren[i], pSchema ).cstr() );
			}
		}

		if ( pNode->m_dChildren.GetLength()>1 )
			sRes.SetSprintf ( "( %s )", sRes.cstr() );
	}

	return sRes;
}
1961 
1962 
// Public entry point: parse an extended-syntax query via a fresh XQParser_t,
// optionally self-check and debug-dump the resulting tree, and precompute
// the single-word flag used later by the rankers.
bool sphParseExtendedQuery ( XQQuery_t & tParsed, const char * sQuery, const CSphQuery * pQuery, const ISphTokenizer * pTokenizer,
	const CSphSchema * pSchema, CSphDict * pDict, const CSphIndexSettings & tSettings )
{
	XQParser_t qp;
	bool bRes = qp.Parse ( tParsed, sQuery, pQuery, pTokenizer, pSchema, pDict, tSettings );

#ifndef NDEBUG
	// debug builds: verify tree invariants (parent links etc)
	if ( bRes && tParsed.m_pRoot )
		tParsed.m_pRoot->Check ( true );
#endif

#ifdef XQDEBUG
	if ( bRes )
	{
		printf ( "\n--- query ---\n" );
		printf ( "%s\n", sQuery );
		xqDump ( tParsed.m_pRoot, 0 );
		printf ( "---\n" );
	}
#endif

	// moved here from ranker creation
	// as at that point term expansion could produce many terms from expanded term and this condition got failed
	tParsed.m_bSingleWord = ( tParsed.m_pRoot && tParsed.m_pRoot->m_dChildren.GetLength()==0 && tParsed.m_pRoot->m_dWords.GetLength()==1 );

	return bRes;
}
1990 
1991 //////////////////////////////////////////////////////////////////////////
1992 // COMMON SUBTREES DETECTION
1993 //////////////////////////////////////////////////////////////////////////
1994 
1995 /// Decides if given pTree is appropriate for caching or not. Currently we don't cache
1996 /// the end values (leafs).
IsAppropriate(XQNode_t * pTree)1997 static bool IsAppropriate ( XQNode_t * pTree )
1998 {
1999 	if ( !pTree ) return false;
2000 
2001 	// skip nodes that actually are leaves (eg. "AND smth" node instead of merely "smth")
2002 	return !( pTree->m_dWords.GetLength()==1 && pTree->GetOp()!=SPH_QUERY_NOT );
2003 }
2004 
// hash of DWORD values keyed by 64-bit node hashes (identity hash function)
typedef CSphOrderedHash < DWORD, uint64_t, IdentityHash_fn, 128 > CDwordHash;
2006 
2007 // stores the pair of a tree, and the bitmask of common nodes
2008 // which contains the tree.
2009 class BitMask_t
2010 {
2011 	XQNode_t *		m_pTree;
2012 	uint64_t		m_uMask;
2013 
2014 public:
BitMask_t()2015 	BitMask_t ()
2016 		: m_pTree ( NULL )
2017 		, m_uMask ( 0ull )
2018 	{}
2019 
Init(XQNode_t * pTree,uint64_t uMask)2020 	void Init ( XQNode_t * pTree, uint64_t uMask )
2021 	{
2022 		m_pTree = pTree;
2023 		m_uMask = uMask;
2024 	}
2025 
GetMask() const2026 	inline uint64_t GetMask() const { return m_uMask; }
GetTree() const2027 	inline XQNode_t * GetTree() const { return m_pTree; }
2028 };
2029 
2030 // a list of unique values.
2031 class Associations_t : public CDwordHash
2032 {
2033 public:
2034 
2035 	// returns true when add the second member.
2036 	// The reason is that only one is not interesting for us,
2037 	// but more than two will flood the caller.
Associate2nd(uint64_t uTree)2038 	bool Associate2nd ( uint64_t uTree )
2039 	{
2040 		if ( Exists ( uTree ) )
2041 			return false;
2042 		Add ( 0, uTree );
2043 		return GetLength()==2;
2044 	}
2045 
2046 	// merge with another similar
Merge(const Associations_t & parents)2047 	void Merge ( const Associations_t& parents )
2048 	{
2049 		parents.IterateStart();
2050 		while ( parents.IterateNext() )
2051 			Associate2nd ( parents.IterateGetKey() );
2052 	}
2053 };
2054 
2055 // associate set of nodes, common bitmask for these nodes,
2056 // and gives the < to compare different pairs
2057 class BitAssociation_t
2058 {
2059 private:
2060 	const Associations_t *	m_pAssociations;
2061 	mutable int				m_iBits;
2062 
2063 	// The key method of subtree selection.
2064 	// Most 'heavy' subtrees will be extracted first.
GetWeight() const2065 	inline int GetWeight() const
2066 	{
2067 		assert ( m_pAssociations );
2068 		int iNodes = m_pAssociations->GetLength();
2069 		if ( m_iBits==0 && m_uMask!=0 )
2070 		{
2071 			for ( uint64_t dMask = m_uMask; dMask; dMask >>=1 )
2072 				m_iBits += (int)( dMask & 1 );
2073 		}
2074 
2075 		// current working formula is num_nodes^2 * num_hits
2076 		return iNodes * iNodes * m_iBits;
2077 	}
2078 
2079 public:
2080 	uint64_t			m_uMask;
2081 
BitAssociation_t()2082 	BitAssociation_t()
2083 		: m_pAssociations ( NULL )
2084 		, m_iBits ( 0 )
2085 		, m_uMask ( 0 )
2086 	{}
2087 
Init(uint64_t uMask,const Associations_t * dNodes)2088 	void Init ( uint64_t uMask, const Associations_t* dNodes )
2089 	{
2090 		m_uMask = uMask;
2091 		m_pAssociations = dNodes;
2092 		m_iBits = 0;
2093 	}
2094 
operator <(const BitAssociation_t & second) const2095 	bool operator< (const BitAssociation_t& second) const
2096 	{
2097 		return GetWeight() < second.GetWeight();
2098 	}
2099 };
2100 
2101 // for pairs of values builds and stores the association "key -> list of values"
2102 class CAssociations_t
2103 	: public CSphOrderedHash < Associations_t, uint64_t, IdentityHash_fn, 128 >
2104 {
2105 	int		m_iBits;			// number of non-unique associations
2106 public:
2107 
CAssociations_t()2108 	CAssociations_t() : m_iBits ( 0 ) {}
2109 
2110 	// Add the given pTree into the list of pTrees, associated with given uHash
Associate(XQNode_t * pTree,uint64_t uHash)2111 	int Associate ( XQNode_t * pTree, uint64_t uHash )
2112 	{
2113 		if ( !Exists ( uHash ) )
2114 			Add ( Associations_t(), uHash );
2115 		if ( operator[]( uHash ).Associate2nd ( pTree->GetHash() ) )
2116 			m_iBits++;
2117 		return m_iBits;
2118 	}
2119 
2120 	// merge the existing association of uHash with given chain
MergeAssociations(const Associations_t & chain,uint64_t uHash)2121 	void MergeAssociations ( const Associations_t & chain, uint64_t uHash )
2122 	{
2123 		if ( !Exists ( uHash ) )
2124 			Add ( chain, uHash );
2125 		else
2126 			operator[]( uHash ).Merge ( chain );
2127 	}
2128 
GetBits() const2129 	inline int GetBits() const { return m_iBits; }
2130 };
2131 
// The main class for working with common subtrees
// (detects subexpressions shared across several query trees and rearranges
// the trees so those subexpressions become standalone, cacheable nodes)
class RevealCommon_t : ISphNoncopyable
{
private:
	static const int			MAX_MULTINODES = 64;	// one bit per common node in a 64-bit mask
	CSphVector<BitMask_t>		m_dBitmasks;		// all bitmasks for all the nodes
	CSphVector<uint64_t>		m_dSubQueries;		// final vector with roadmap for tree division.
	CAssociations_t				m_hNodes;			// initial accumulator for nodes
	CAssociations_t				m_hInterSections;	// initial accumulator for nodes
	CDwordHash					m_hBitOrders;		// order numbers for found common subnodes
	XQOperator_e				m_eOp;				// my operator which I process

private:

	// returns the order for given uHash (if any).
	inline int GetBitOrder ( uint64_t uHash ) const
	{
		if ( !m_hBitOrders.Exists ( uHash ) )
			return -1;
		return m_hBitOrders[uHash];
	}

	// recursively scans the whole tree and builds the maps
	// where a list of parents associated with every "leaf" nodes (i.e. with children)
	// returns false (caller gives up on the whole transform) once the number of
	// distinct non-unique nodes reaches MAX_MULTINODES
	bool BuildAssociations ( XQNode_t * pTree )
	{
		if ( IsAppropriate ( pTree ) )
		{
			ARRAY_FOREACH ( i, pTree->m_dChildren )
			if ( ( !BuildAssociations ( pTree->m_dChildren[i] ) )
				|| ( ( m_eOp==pTree->GetOp() )
				&& ( m_hNodes.Associate ( pTree, pTree->m_dChildren[i]->GetHash() )>=MAX_MULTINODES ) ) )
			{
				return false;
			}
		}
		return true;
	}

	// Find all leafs, non-unique across the tree,
	// and associate the order number with every of them
	bool CalcCommonNodes ()
	{
		if ( !m_hNodes.GetBits() )
			return false; // there is totally no non-unique leaves
		int iBit = 0;
		m_hNodes.IterateStart();
		while ( m_hNodes.IterateNext() )
			if ( m_hNodes.IterateGet().GetLength() > 1 )
				m_hBitOrders.Add ( iBit++, m_hNodes.IterateGetKey() );
		assert ( m_hNodes.GetBits()==m_hBitOrders.GetLength() );
		m_hNodes.Reset(); ///< since from now we don't need this data anymore
		return true;
	}

	// recursively builds for every node the bitmaks
	// of common nodes it has as children
	void BuildBitmasks ( XQNode_t * pTree )
	{
		if ( !IsAppropriate ( pTree ) )
			return;

		if ( m_eOp==pTree->GetOp() )
		{
			// calculate the bitmask
			int iOrder;
			uint64_t dMask = 0;
			ARRAY_FOREACH ( i, pTree->m_dChildren )
			{
				iOrder = GetBitOrder ( pTree->m_dChildren[i]->GetHash() );
				if ( iOrder>=0 )
					dMask |= 1ull << iOrder;
			}

			// add the bitmask into the array
			if ( dMask )
				m_dBitmasks.Add().Init( pTree, dMask );
		}

		// recursively process all the children
		ARRAY_FOREACH ( i, pTree->m_dChildren )
			BuildBitmasks ( pTree->m_dChildren[i] );
	}

	// Collect all possible intersections of Bitmasks.
	// For every non-zero intersection we collect the list of trees which contain it.
	void CalcIntersections ()
	{
		// Round 1. Intersect all content of bitmasks one-by-one.
		ARRAY_FOREACH ( i, m_dBitmasks )
			for ( int j = i+1; j<m_dBitmasks.GetLength(); j++ )
			{
				// intersect one-by-one and group (grouping is done by nature of a hash)
				uint64_t uMask = m_dBitmasks[i].GetMask() & m_dBitmasks[j].GetMask();
				if ( uMask )
				{
					m_hInterSections.Associate ( m_dBitmasks[i].GetTree(), uMask );
					m_hInterSections.Associate ( m_dBitmasks[j].GetTree(), uMask );
				}
			}

		// Round 2. Intersect again all collected intersection one-by-one - until zero.
		// NOTE(review): this iterates m_hInterSections via raw position pointers while
		// MergeAssociations may insert new keys into the same hash — presumably the
		// Iterate*(void**) API tolerates concurrent inserts; verify against the container.
		void *p1=NULL, *p2;
		uint64_t uMask1, uMask2;
		while ( m_hInterSections.IterateNext ( &p1 ) )
		{
			p2 = p1;
			while ( m_hInterSections.IterateNext ( &p2 ) )
			{
				uMask1 = CAssociations_t::IterateGetKey ( &p1 );
				uMask2 = CAssociations_t::IterateGetKey ( &p2 );
				assert ( uMask1!=uMask2 );
				uMask1 &= uMask2;
				if ( uMask1 )
				{
					m_hInterSections.MergeAssociations ( CAssociations_t::IterateGet ( &p1 ), uMask1 );
					m_hInterSections.MergeAssociations ( CAssociations_t::IterateGet ( &p2 ), uMask1 );
				}
			}
		}
	}

	// create the final kit of common-subsets
	// which we will actually reveal (extract) from original trees
	void MakeQueries()
	{
		CSphVector<BitAssociation_t> dSubnodes; // masks for our selected subnodes
		dSubnodes.Reserve ( m_hInterSections.GetLength() );
		m_hInterSections.IterateStart();
		while ( m_hInterSections.IterateNext() )
			dSubnodes.Add().Init( m_hInterSections.IterateGetKey(), &m_hInterSections.IterateGet() );

		// sort by weight descending (weight sorting is hold by operator <)
		dSubnodes.RSort();
		m_dSubQueries.Reset();

		// make the final subtrees vector: get one-by-one from the beginning,
		// intresect with all the next and throw out zeros.
		// The final subqueries will not be intersected between each other.
		int j;
		uint64_t uMask;
		ARRAY_FOREACH ( i, dSubnodes )
		{
			uMask = dSubnodes[i].m_uMask;
			m_dSubQueries.Add ( uMask );
			j = i+1;
			// strip the already-claimed bits from all following masks;
			// masks that become empty are dropped right here
			while ( j < dSubnodes.GetLength() )
			{
				if ( !( dSubnodes[j].m_uMask &= ~uMask ) )
					dSubnodes.Remove(j);
				else
					j++;
			}
		}
	}

	// Now we finally extract the common subtrees from original tree
	// and (recursively) from it's children
	void Reorganize ( XQNode_t * pTree )
	{
		if ( !IsAppropriate ( pTree ) )
			return;

		if ( m_eOp==pTree->GetOp() )
		{
			// pBranch is for common subset of children, pOtherChildren is for the rest.
			CSphOrderedHash < XQNode_t*, int, IdentityHash_fn, 64 > hBranches;
			XQNode_t * pOtherChildren = NULL;
			int iBit;
			int iOptimizations = 0;
			ARRAY_FOREACH ( i, pTree->m_dChildren )
			{
				iBit = GetBitOrder ( pTree->m_dChildren[i]->GetHash() );

				// works only with children which are actually common with somebody else
				if ( iBit>=0 )
				{
					// since subqueries doesn't intersected between each other,
					// the first hit we found in this loop is exactly what we searched.
					ARRAY_FOREACH ( j, m_dSubQueries )
						if ( ( 1ull << iBit ) & m_dSubQueries[j] )
						{
							XQNode_t * pNode;
							if ( !hBranches.Exists(j) )
							{
								pNode = new XQNode_t ( pTree->m_dSpec );
								pNode->SetOp ( m_eOp, pTree->m_dChildren[i] );
								hBranches.Add ( pNode, j );
							} else
							{
								pNode = hBranches[j];
								pNode->m_dChildren.Add ( pTree->m_dChildren[i] );

								// Count essential subtrees (with at least 2 children)
								if ( pNode->m_dChildren.GetLength()==2 )
									iOptimizations++;
							}
							break;
						}
					// another nodes add to the set of "other" children
				} else
				{
					if ( !pOtherChildren )
					{
						pOtherChildren = new XQNode_t ( pTree->m_dSpec );
						pOtherChildren->SetOp ( m_eOp, pTree->m_dChildren[i] );
					} else
						pOtherChildren->m_dChildren.Add ( pTree->m_dChildren[i] );
				}
			}

			// we don't reorganize explicit simple case - as no "others" and only one common.
			// Also reject optimization if there is nothing to optimize.
			// NOTE(review): bitwise '|' joins two bool operands here; the value is the
			// same as logical '||' (only short-circuiting differs) — confirm intentional.
			if ( ( iOptimizations==0 )
				| ( !pOtherChildren && ( hBranches.GetLength()==1 ) ) )
			{
				// rollback: detach children from the temp nodes before deleting them
				// so the original tree's nodes are not freed
				if ( pOtherChildren )
					pOtherChildren->m_dChildren.Reset();
				hBranches.IterateStart();
				while ( hBranches.IterateNext() )
				{
					assert ( hBranches.IterateGet() );
					hBranches.IterateGet()->m_dChildren.Reset();
					SafeDelete ( hBranches.IterateGet() );
				}
			} else
			{
				// reorganize the tree: replace the common subset to explicit node with
				// only common members inside. This will give the the possibility
				// to cache the node.
				pTree->m_dChildren.Reset();
				if ( pOtherChildren )
					pTree->m_dChildren.SwapData ( pOtherChildren->m_dChildren );

				hBranches.IterateStart();
				while ( hBranches.IterateNext() )
				{
					// single-child branches are collapsed back into the parent
					if ( hBranches.IterateGet()->m_dChildren.GetLength()==1 )
					{
						pTree->m_dChildren.Add ( hBranches.IterateGet()->m_dChildren[0] );
						hBranches.IterateGet()->m_dChildren.Reset();
						SafeDelete ( hBranches.IterateGet() );
					} else
						pTree->m_dChildren.Add ( hBranches.IterateGet() );
				}
			}
			SafeDelete ( pOtherChildren );
		}

		// recursively process all the children
		ARRAY_FOREACH ( i, pTree->m_dChildren )
			Reorganize ( pTree->m_dChildren[i] );
	}

public:
	explicit RevealCommon_t ( XQOperator_e eOp )
		: m_eOp ( eOp )
	{}

	// actual method for processing tree and reveal (extract) common subtrees
	void Transform ( int iXQ, const XQQuery_t * pXQ )
	{
		// collect all non-unique nodes
		for ( int i=0; i<iXQ; i++ )
			if ( !BuildAssociations ( pXQ[i].m_pRoot ) )
				return;

		// count and order all non-unique nodes
		if ( !CalcCommonNodes() )
			return;

		// create and collect bitmask for every node
		for ( int i=0; i<iXQ; i++ )
			BuildBitmasks ( pXQ[i].m_pRoot );

		// intersect all bitmasks one-by-one, and also intersect all intersections
		CalcIntersections();

		// the die-hard: actually select the set of subtrees which we'll process
		MakeQueries();

		// ... and finally - process all our trees.
		for ( int i=0; i<iXQ; i++ )
			Reorganize ( pXQ[i].m_pRoot );
	}
};
2418 
2419 
2420 struct MarkedNode_t
2421 {
2422 	int			m_iCounter;
2423 	XQNode_t *	m_pTree;
2424 	bool		m_bMarked;
2425 	int			m_iOrder;
2426 
MarkedNode_tMarkedNode_t2427 	explicit MarkedNode_t ( XQNode_t * pTree=NULL )
2428 		: m_iCounter ( 1 )
2429 		, m_pTree ( pTree )
2430 		, m_bMarked ( false )
2431 		, m_iOrder ( 0 )
2432 	{}
2433 
MarkItMarkedNode_t2434 	void MarkIt ( bool bMark )
2435 	{
2436 		// mark
2437 		if ( bMark )
2438 		{
2439 			m_iCounter++;
2440 			m_bMarked = true;
2441 			return;
2442 		}
2443 
2444 		// unmark
2445 		if ( m_bMarked && m_iCounter>1 )
2446 			m_iCounter--;
2447 		if ( m_iCounter<2 )
2448 			m_bMarked = false;
2449 	}
2450 };
2451 
2452 typedef CSphOrderedHash < MarkedNode_t, uint64_t, IdentityHash_fn, 128 > CSubtreeHash;
2453 
/// deep tree comparator; verifies that two trees with equal hashes are
/// actually identical (guards against hash collisions)
struct XqTreeComparator_t
{
	CSphVector<const XQKeyword_t *>		m_dTerms1;	// keywords collected from the 1st tree
	CSphVector<const XQKeyword_t *>		m_dTerms2;	// keywords collected from the 2nd tree

	// returns true when the trees are structurally equal and their keywords
	// match (same words, same relative atom positions)
	bool IsEqual ( const XQNode_t * pNode1, const XQNode_t * pNode2 );
	// recursively compares structure and collects keywords into m_dTerms1/m_dTerms2
	bool CheckCollectTerms ( const XQNode_t * pNode1, const XQNode_t * pNode2 );
};
2462 
/// check hashes, then check subtrees, then flag
static void FlagCommonSubtrees ( XQNode_t * pTree, CSubtreeHash & hSubTrees, XqTreeComparator_t & tCmp, bool bFlag, bool bMarkIt )
{
	if ( !IsAppropriate ( pTree ) )
		return;

	// we do not yet have any collisions stats,
	// but chances are we don't actually need IsEqualTo() at all
	uint64_t iHash = pTree->GetHash();
	if ( bFlag && hSubTrees.Exists ( iHash ) && tCmp.IsEqual ( hSubTrees [ iHash ].m_pTree, pTree ) )
	{
		hSubTrees[iHash].MarkIt ( true );

		// we just add all the children but do NOT mark them as common
		// so that only the subtree root is marked.
		// also we unmark all the cases which were eaten by bigger trees
		ARRAY_FOREACH ( i, pTree->m_dChildren )
			if ( !hSubTrees.Exists ( pTree->m_dChildren[i]->GetHash() ) )
				FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, false, bMarkIt );
			else
				FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, false, false );
	} else
	{
		// NOTE(review): with bMarkIt=false, operator[] is used on a key that may be
		// absent — presumably CSphOrderedHash default-constructs an entry then; and
		// with bMarkIt=true, Add may see an already-existing key after a hash
		// collision — verify both against the container's contract.
		if ( !bMarkIt )
			hSubTrees[iHash].MarkIt ( false );
		else
			hSubTrees.Add ( MarkedNode_t ( pTree ), iHash );

		ARRAY_FOREACH ( i, pTree->m_dChildren )
			FlagCommonSubtrees ( pTree->m_dChildren[i], hSubTrees, tCmp, bFlag, bMarkIt );
	}
}
2495 
2496 
SignCommonSubtrees(XQNode_t * pTree,const CSubtreeHash & hSubTrees)2497 static void SignCommonSubtrees ( XQNode_t * pTree, const CSubtreeHash & hSubTrees )
2498 {
2499 	if ( !pTree )
2500 		return;
2501 
2502 	uint64_t iHash = pTree->GetHash();
2503 	const MarkedNode_t * pCommon = hSubTrees ( iHash );
2504 	if ( pCommon && pCommon->m_bMarked )
2505 		pTree->TagAsCommon ( pCommon->m_iOrder, pCommon->m_iCounter );
2506 
2507 	ARRAY_FOREACH ( i, pTree->m_dChildren )
2508 		SignCommonSubtrees ( pTree->m_dChildren[i], hSubTrees );
2509 }
2510 
2511 
sphMarkCommonSubtrees(int iXQ,const XQQuery_t * pXQ)2512 int sphMarkCommonSubtrees ( int iXQ, const XQQuery_t * pXQ )
2513 {
2514 	if ( iXQ<=0 || !pXQ )
2515 		return 0;
2516 
2517 	{ // Optional reorganize tree to extract common parts
2518 		RevealCommon_t ( SPH_QUERY_AND ).Transform ( iXQ, pXQ );
2519 		RevealCommon_t ( SPH_QUERY_OR ).Transform ( iXQ, pXQ );
2520 	}
2521 
2522 	// flag common subtrees and refcount them
2523 	XqTreeComparator_t tCmp;
2524 	CSubtreeHash hSubtrees;
2525 	for ( int i=0; i<iXQ; i++ )
2526 		FlagCommonSubtrees ( pXQ[i].m_pRoot, hSubtrees, tCmp, true, true );
2527 
2528 	// number marked subtrees and assign them order numbers.
2529 	int iOrder = 0;
2530 	hSubtrees.IterateStart();
2531 	while ( hSubtrees.IterateNext() )
2532 		if ( hSubtrees.IterateGet().m_bMarked )
2533 			hSubtrees.IterateGet().m_iOrder = iOrder++;
2534 
2535 	// copy the flags and orders to original trees
2536 	for ( int i=0; i<iXQ; i++ )
2537 		SignCommonSubtrees ( pXQ[i].m_pRoot, hSubtrees );
2538 
2539 	return iOrder;
2540 }
2541 
2542 struct CmpTermQPos_fn
2543 {
IsLessCmpTermQPos_fn2544 	inline bool IsLess ( const XQKeyword_t * pA, const XQKeyword_t * pB ) const
2545 	{
2546 		assert ( pA && pB );
2547 		return ( pA->m_iAtomPos<pB->m_iAtomPos );
2548 	}
2549 };
2550 
IsEqual(const XQNode_t * pNode1,const XQNode_t * pNode2)2551 bool XqTreeComparator_t::IsEqual ( const XQNode_t * pNode1, const XQNode_t * pNode2 )
2552 {
2553 	// early out check to skip allocations
2554 	if ( !pNode1 || !pNode2 || pNode1->GetHash()!=pNode2->GetHash() || pNode1->GetOp()!=pNode2->GetOp () )
2555 		return false;
2556 
2557 	// need reset data from previous compare
2558 	m_dTerms1.Resize ( 0 );
2559 	m_dTerms2.Resize ( 0 );
2560 	// need reserve some space for first compare
2561 	m_dTerms1.Reserve ( 64 );
2562 	m_dTerms2.Reserve ( 64 );
2563 
2564 	if ( !CheckCollectTerms ( pNode1, pNode2 ) )
2565 		return false;
2566 
2567 	assert ( m_dTerms1.GetLength ()==m_dTerms2.GetLength () );
2568 
2569 	if ( !m_dTerms1.GetLength() )
2570 		return true;
2571 
2572 	m_dTerms1.Sort ( CmpTermQPos_fn() );
2573 	m_dTerms2.Sort ( CmpTermQPos_fn() );
2574 
2575 	if ( m_dTerms1[0]->m_sWord!=m_dTerms2[0]->m_sWord )
2576 		return false;
2577 
2578 	for ( int i=1; i<m_dTerms1.GetLength(); i++ )
2579 	{
2580 		int iDelta1 = m_dTerms1[i]->m_iAtomPos - m_dTerms1[i-1]->m_iAtomPos;
2581 		int iDelta2 = m_dTerms2[i]->m_iAtomPos - m_dTerms2[i-1]->m_iAtomPos;
2582 		if ( iDelta1!=iDelta2 || m_dTerms1[i]->m_sWord!=m_dTerms2[i]->m_sWord )
2583 			return false;
2584 	}
2585 
2586 	return true;
2587 }
2588 
CheckCollectTerms(const XQNode_t * pNode1,const XQNode_t * pNode2)2589 bool XqTreeComparator_t::CheckCollectTerms ( const XQNode_t * pNode1, const XQNode_t * pNode2 )
2590 {
2591 	// early out
2592 	if ( !pNode1 || !pNode2
2593 			|| pNode1->GetHash ()!=pNode2->GetHash () || pNode1->GetOp ()!=pNode2->GetOp ()
2594 			|| pNode1->m_dWords.GetLength ()!=pNode2->m_dWords.GetLength ()
2595 			|| pNode1->m_dChildren.GetLength ()!=pNode2->m_dChildren.GetLength () )
2596 		return false;
2597 
2598 	// for plain nodes compare keywords
2599 	ARRAY_FOREACH ( i, pNode1->m_dWords )
2600 		m_dTerms1.Add ( pNode1->m_dWords.Begin() + i );
2601 
2602 	ARRAY_FOREACH ( i, pNode2->m_dWords )
2603 		m_dTerms2.Add ( pNode2->m_dWords.Begin () + i );
2604 
2605 	// for non-plain nodes compare children
2606 	ARRAY_FOREACH ( i, pNode1->m_dChildren )
2607 	{
2608 		if ( !CheckCollectTerms ( pNode1->m_dChildren[i], pNode2->m_dChildren[i] ) )
2609 			return false;
2610 	}
2611 
2612 	return true;
2613 }
2614 
2615 
/// query-tree boolean simplifier: repeatedly applies the pattern-based
/// transformations listed below until no more apply
class CSphTransformation : public ISphNoncopyable
{
public:
	CSphTransformation ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords );
	void Transform ();
	inline void Dump ( const XQNode_t * pNode, const char * sHeader = "" );

private:

	// two-level grouping: outer key = Group node hash, inner key = SubGroup hash,
	// value = all nodes matching the same pattern inside that group
	typedef CSphOrderedHash < CSphVector<XQNode_t*>, uint64_t, IdentityHash_fn, 32> HashSimilar_t;
	CSphOrderedHash < HashSimilar_t, uint64_t, IdentityHash_fn, 256 >	m_hSimilar;
	CSphVector<XQNode_t *>		m_dRelatedNodes;	// siblings collected by CollectRelatedNodes()
	const ISphKeywordsStat *	m_pKeywords;		// dictionary stats source for SetCosts()
	XQNode_t **					m_ppRoot;			// address of the tree root (may be replaced)
	typedef bool ( *Checker_fn ) ( const XQNode_t * );	// per-pattern node predicate

private:

	void		Dump ();
	void		SetCosts ( XQNode_t * pNode, const CSphVector<XQNode_t *> & dNodes );
	int			GetWeakestIndex ( const CSphVector<XQNode_t *> & dNodes );

	template < typename Group, typename SubGroup >
	inline void TreeCollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker );

	template < typename Group, typename SubGroup >
	inline bool CollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker );

	template < typename Excluder, typename Parenter >
	inline bool	CollectRelatedNodes ( const CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !N) | (B !N)) -> ((A|B) !N)
	static bool CheckCommonNot ( const XQNode_t * pNode );
	bool		TransformCommonNot ();
	bool		MakeTransformCommonNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !(N AA)) | (B !(N BB))) -> (((A|B) !N) | (A !AA) | (B !BB)) [ if cost(N) > cost(A) + cost(B) ]
	static bool	CheckCommonCompoundNot ( const XQNode_t * pNode );
	bool		TransformCommonCompoundNot ();
	bool		MakeTransformCommonCompoundNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A (X | AA)) | (B (X | BB))) -> (((A|B) X) | (A AA) | (B BB)) [ if cost(X) > cost(A) + cost(B) ]
	static bool	CheckCommonSubTerm ( const XQNode_t * pNode );
	bool		TransformCommonSubTerm ();
	void		MakeTransformCommonSubTerm ( CSphVector<XQNode_t *> & dX );

	// (A | "A B"~N) -> A ; ("A B" | "A B C") -> "A B" ; ("A B"~N | "A B C"~N) -> ("A B"~N)
	static bool CheckCommonKeywords ( const XQNode_t * pNode );
	bool		TransformCommonKeywords ();

	// ("X A B" | "Y A B") -> (("X|Y") "A B")
	// ("A B X" | "A B Y") -> (("X|Y") "A B")
	static bool CheckCommonPhrase ( const XQNode_t * pNode );
	bool 		TransformCommonPhrase ();
	void		MakeTransformCommonPhrase ( CSphVector<XQNode_t *> & dCommonNodes, int iCommonLen, bool bHeadIsCommon );

	// ((A !X) | (A !Y) | (A !Z)) -> (A !(X Y Z))
	static bool CheckCommonAndNotFactor ( const XQNode_t * pNode );
	bool		TransformCommonAndNotFactor ();
	bool		MakeTransformCommonAndNotFactor ( CSphVector<XQNode_t *> & dSimilarNodes );

	// ((A !(N | N1)) | (B !(N | N2))) -> (( (A !N1) | (B !N2) ) !N)
	static bool CheckCommonOrNot ( const XQNode_t * pNode );
	bool 		TransformCommonOrNot ();
	bool		MakeTransformCommonOrNot ( CSphVector<XQNode_t *> & dSimilarNodes );

	// The main goal of transformations below is tree clarification and
	// further applying of standard transformations above.

	// "hung" operand ( AND(OR) node with only 1 child ) appears after an internal transformation
	static bool CheckHungOperand ( const XQNode_t * pNode );
	bool		TransformHungOperand ();

	// ((A | B) | C) -> ( A | B | C )
	// ((A B) C) -> ( A B C )
	static bool CheckExcessBrackets ( const XQNode_t * pNode );
	bool 		TransformExcessBrackets ();

	// ((A !N1) !N2) -> (A !(N1 | N2))
	static bool CheckExcessAndNot ( const XQNode_t * pNode );
	bool		TransformExcessAndNot ();

private:
	static const uint64_t CONST_GROUP_FACTOR;

	// the structs below are the Group/SubGroup/Excluder/Parenter policies used by
	// the templates above; each maps a node to the ancestor used as a grouping key
	// (By = hash key, From = the ancestor node itself)

	struct NullNode
	{
		static inline uint64_t By ( XQNode_t * ) { return CONST_GROUP_FACTOR; } // NOLINT
		static inline const XQNode_t * From ( const XQNode_t * ) { return NULL; } // NOLINT
	};

	struct CurrentNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p; }
	};

	struct ParentNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent; }
	};

	struct GrandNode
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent; }
	};

	struct Grand2Node {
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent; }
	};

	struct Grand3Node
	{
		static inline uint64_t By ( XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->m_pParent->GetFuzzyHash(); }
		static inline const XQNode_t * From ( const XQNode_t * p ) { return p->m_pParent->m_pParent->m_pParent->m_pParent; }
	};
};
2736 
2737 
/// ctor; remembers the root pointer to rewrite and the keyword-stats provider
/// (pKeywords must be non-NULL — it is required by SetCosts())
CSphTransformation::CSphTransformation ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords )
	: m_pKeywords ( pKeywords )
	, m_ppRoot ( ppRoot )
{
	assert ( m_pKeywords!=NULL );
}
2744 
2745 
2746 
2747 const uint64_t CSphTransformation::CONST_GROUP_FACTOR = 0;
2748 
2749 template < typename Group, typename SubGroup >
TreeCollectInfo(XQNode_t * pParent,Checker_fn pfnChecker)2750 void CSphTransformation::TreeCollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker )
2751 {
2752 	if ( pParent )
2753 	{
2754 		if ( pfnChecker ( pParent ) )
2755 		{
2756 			// "Similar nodes" are nodes which are suited to a template (like 'COMMON NOT', 'COMMON COMPOND NOT', ...)
2757 			uint64_t uGroup = (uint64_t)Group::From ( pParent );
2758 			uint64_t uSubGroup = SubGroup::By ( pParent );
2759 
2760 			HashSimilar_t & hGroup = m_hSimilar.AddUnique ( uGroup );
2761 			hGroup.AddUnique ( uSubGroup ).Add ( pParent );
2762 		}
2763 
2764 		ARRAY_FOREACH ( iChild, pParent->m_dChildren )
2765 			TreeCollectInfo<Group, SubGroup> ( pParent->m_dChildren[iChild], pfnChecker );
2766 	}
2767 }
2768 
2769 
2770 template < typename Group, typename SubGroup >
CollectInfo(XQNode_t * pParent,Checker_fn pfnChecker)2771 bool CSphTransformation::CollectInfo ( XQNode_t * pParent, Checker_fn pfnChecker )
2772 {
2773 	( *m_ppRoot )->Check ( true );
2774 	m_hSimilar.Reset();
2775 
2776 	TreeCollectInfo<Group, SubGroup> ( pParent, pfnChecker );
2777 	return ( m_hSimilar.GetLength()>0 );
2778 }
2779 
2780 
/// compute per-node "cost" for pNode and every node in dNodes (and all their
/// descendants), storing the accumulated cost in each node's m_iUser field;
/// keyword costs are the document counts fetched from the index dictionary
void CSphTransformation::SetCosts ( XQNode_t * pNode, const CSphVector<XQNode_t *> & dNodes )
{
	assert ( pNode || dNodes.GetLength() );

	// seed the worklist with all the roots; m_iUser is zeroed before accumulation
	CSphVector<XQNode_t*> dChildren ( dNodes.GetLength() + 1 );
	dChildren[dNodes.GetLength()] = pNode;
	ARRAY_FOREACH ( i, dNodes )
	{
		dChildren[i] = dNodes[i];
		dChildren[i]->m_iUser = 0;
	}

	// collect unknown keywords from all children
	// NOTE: dChildren is appended to while being iterated — this assumes
	// ARRAY_FOREACH re-evaluates the length each pass so the appended
	// descendants are visited too (verify the macro's definition)
	CSphVector<CSphKeywordInfo> dKeywords;
	SmallStringHash_T<int>	hCosts;
	ARRAY_FOREACH ( i, dChildren )
	{
		XQNode_t * pChild = dChildren[i];
		ARRAY_FOREACH ( j, pChild->m_dChildren )
		{
			dChildren.Add ( pChild->m_dChildren[j] );
			dChildren.Last()->m_iUser = 0;
			assert ( dChildren.Last()->m_pParent==pChild );
		}
		ARRAY_FOREACH ( j, pChild->m_dWords )
		{
			// register each distinct keyword once, with a zero placeholder cost
			const CSphString & sWord = pChild->m_dWords[j].m_sWord;
			int * pCost = hCosts ( sWord );
			if ( !pCost )
			{
				Verify ( hCosts.Add ( 0, sWord ) );
				dKeywords.Add();
				dKeywords.Last().m_sTokenized = sWord;
				dKeywords.Last().m_iDocs = 0;
			}
		}
	}

	// get keywords info from index dictionary
	if ( dKeywords.GetLength() )
	{
		m_pKeywords->FillKeywords ( dKeywords );
		ARRAY_FOREACH ( i, dKeywords )
		{
			const CSphKeywordInfo & tKeyword = dKeywords[i];
			hCosts[tKeyword.m_sTokenized] = tKeyword.m_iDocs;
		}
	}

	// propagate cost bottom-up (from children to parents)
	// dChildren was filled parents-before-children, so the reverse walk
	// processes every child before its parent
	for ( int i=dChildren.GetLength()-1; i>=0; i-- )
	{
		XQNode_t * pChild = dChildren[i];
		int iCost = 0;
		ARRAY_FOREACH ( j, pChild->m_dWords )
			iCost += hCosts [ pChild->m_dWords[j].m_sWord ];

		pChild->m_iUser += iCost;
		if ( pChild->m_pParent )
			pChild->m_pParent->m_iUser += pChild->m_iUser;
	}
}
2843 
2844 
2845 template < typename Excluder, typename Parenter >
CollectRelatedNodes(const CSphVector<XQNode_t * > & dSimilarNodes)2846 bool CSphTransformation::CollectRelatedNodes ( const CSphVector<XQNode_t *> & dSimilarNodes )
2847 {
2848 	m_dRelatedNodes.Resize ( 0 );
2849 	ARRAY_FOREACH ( i, dSimilarNodes )
2850 	{
2851 		// Eval node that should be excluded
2852 		const XQNode_t * pExclude = Excluder::From ( dSimilarNodes[i] );
2853 
2854 		// Eval node that points to related nodes
2855 		const XQNode_t * pParent = Parenter::From ( dSimilarNodes[i] );
2856 
2857 		assert ( &pParent->m_dChildren!=&m_dRelatedNodes );
2858 		ARRAY_FOREACH ( j, pParent->m_dChildren )
2859 		{
2860 			if ( pParent->m_dChildren[j]!=pExclude )
2861 				m_dRelatedNodes.Add ( pParent->m_dChildren[j] );
2862 		}
2863 	}
2864 	return ( m_dRelatedNodes.GetLength()>1 );
2865 }
2866 
2867 
CheckCommonNot(const XQNode_t * pNode)2868 bool CSphTransformation::CheckCommonNot ( const XQNode_t * pNode )
2869 {
2870 	if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
2871 			pNode->m_pParent->GetOp()!=SPH_QUERY_NOT || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
2872 			pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
2873 	{
2874 //
2875 // NOLINT		//  NOT:
2876 // NOLINT		//		 _______ OR (gGOr) ___________
2877 // NOLINT		// 		/          |                   |
2878 // NOLINT		// 	 ...        AND NOT (grandAndNot)  ...
2879 // NOLINT		//                 /       |
2880 // NOLINT		//         relatedNode    NOT (parentNot)
2881 // NOLINT		//         	               |
2882 // NOLINT		//         	             pNode
2883 //
2884 		return false;
2885 	}
2886 	return true;
2887 }
2888 
2889 
TransformCommonNot()2890 bool CSphTransformation::TransformCommonNot ()
2891 {
2892 	bool bRecollect = false;
2893 	m_hSimilar.IterateStart();
2894 	while ( m_hSimilar.IterateNext () )
2895 	{
2896 		m_hSimilar.IterateGet().IterateStart();
2897 		while ( m_hSimilar.IterateGet().IterateNext() )
2898 		{
2899 			// Nodes with the same iFuzzyHash
2900 			CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
2901 			if ( dSimilarNodes.GetLength()<2 )
2902 				continue;
2903 
2904 			if ( CollectRelatedNodes < ParentNode, GrandNode > ( dSimilarNodes ) && MakeTransformCommonNot ( dSimilarNodes ) )
2905 			{
2906 				bRecollect = true;
2907 				// Don't make transformation for other nodes from the same OR-node,
2908 				// because query tree was changed and further transformations
2909 				// might be invalid.
2910 				break;
2911 			}
2912 		}
2913 	}
2914 
2915 	return bRecollect;
2916 }
2917 
2918 
GetWeakestIndex(const CSphVector<XQNode_t * > & dNodes)2919 int CSphTransformation::GetWeakestIndex ( const CSphVector<XQNode_t *> & dNodes )
2920 {
2921 	// Returns index of weakest node from the equal.
2922 	// The example of equal nodes:
2923 	// "aaa bbb" (PHRASE), "aaa bbb"~10 (PROXIMITY), "aaa bbb"~20 (PROXIMITY)
2924 	// Such nodes have the same magic hash value.
2925 	// The weakest is "aaa bbb"~20
2926 
2927 	int iWeakestIndex = 0;
2928 	int iProximity = -1;
2929 
2930 	ARRAY_FOREACH ( i, dNodes )
2931 	{
2932 		XQNode_t * pNode = dNodes[i];
2933 		if ( pNode->GetOp()==SPH_QUERY_PROXIMITY && pNode->m_iOpArg>iProximity )
2934 		{
2935 			iProximity = pNode->m_iOpArg;
2936 			iWeakestIndex = i;
2937 		}
2938 	}
2939 	return iWeakestIndex;
2940 }
2941 
2942 
// Rewrite a group of equal NOT-subtrees hanging off one common OR:
//   OR ( ANDNOT ( A, NOT(X) ), ANDNOT ( B, NOT(X) ) )
// is collapsed so a single surviving ANDNOT keeps NOT(X) while A and B are
// merged under a hub OR. dSimilarNodes are the equal X nodes;
// m_dRelatedNodes (filled by CollectRelatedNodes beforehand) are the A/B
// siblings. Always returns true: the tree is always rewritten.
bool CSphTransformation::MakeTransformCommonNot ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	// NOTE(review): the index is computed over dSimilarNodes but used below
	// to address m_dRelatedNodes - this assumes a 1:1 positional
	// correspondence between the two arrays; verify against CollectRelatedNodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );

	// the weakest node is new parent of transformed expression
	XQNode_t * pWeakestAndNot = m_dRelatedNodes[iWeakestIndex]->m_pParent;
	assert ( pWeakestAndNot->m_dChildren[0]==m_dRelatedNodes[iWeakestIndex] );
	XQNode_t * pCommonOr = pWeakestAndNot->m_pParent;
	assert ( pCommonOr->GetOp()==SPH_QUERY_OR && pCommonOr->m_dChildren.Contains ( pWeakestAndNot ) );
	XQNode_t * pGrandCommonOr = pCommonOr->m_pParent;

	// the common OR survives only when it has other, unrelated children left
	bool bKeepOr = ( pCommonOr->m_dChildren.GetLength()>2 );

	// reset ownership of related nodes
	ARRAY_FOREACH ( i, m_dRelatedNodes )
	{
		XQNode_t * pAnd = m_dRelatedNodes[i];
		XQNode_t * pAndNot = pAnd->m_pParent;
		assert ( pAndNot->m_pParent==pCommonOr );

		if ( i!=iWeakestIndex )
		{
			Verify ( pAndNot->m_dChildren.RemoveValue ( pAnd ) );

			// the now-redundant ANDNOT wrapper (with its NOT subtree) goes away
			if ( bKeepOr )
			{
				pCommonOr->m_dChildren.RemoveValue ( pAndNot );
				SafeDelete ( pAndNot );
			}
		}
	}

	// move all related to new OR
	XQNode_t * pHubOr = new XQNode_t ( XQLimitSpec_t() );
	pHubOr->SetOp ( SPH_QUERY_OR, m_dRelatedNodes );

	// insert hub OR via hub AND to new parent ( AND NOT )
	XQNode_t * pHubAnd = new XQNode_t ( XQLimitSpec_t() );
	pHubAnd->SetOp ( SPH_QUERY_AND, pHubOr );
	// replace old AND at new parent ( AND NOT ) 0 already at OR children
	pHubAnd->m_pParent = pWeakestAndNot;
	pWeakestAndNot->m_dChildren[0] = pHubAnd;

	// in case common OR had only 2 children
	if ( !bKeepOr )
	{
		// replace old OR with AND_NOT at parent
		if ( !pGrandCommonOr )
		{
			// the OR was the tree root; the surviving ANDNOT becomes the new root
			pWeakestAndNot->m_pParent = NULL;
			*m_ppRoot = pWeakestAndNot;
		} else
		{
			pWeakestAndNot->m_pParent = pGrandCommonOr;
			CSphVector<XQNode_t *> & dChildren = pGrandCommonOr->m_dChildren;
			ARRAY_FOREACH ( i, dChildren )
			{
				if ( dChildren[i]==pCommonOr )
				{
					dChildren[i] = pWeakestAndNot;
					break;
				}
			}
		}
		// remove new parent ( AND OR ) from OR children
		Verify ( pCommonOr->m_dChildren.RemoveValue ( pWeakestAndNot ) );
		// free OR and all children
		SafeDelete ( pCommonOr );
	}

	return true;
}
3018 
3019 
CheckCommonCompoundNot(const XQNode_t * pNode)3020 bool CSphTransformation::CheckCommonCompoundNot ( const XQNode_t * pNode )
3021 {
3022 	if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3023 			!pNode->m_pParent->m_pParent->m_pParent->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_AND ||
3024 			pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_NOT || pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
3025 			pNode->m_pParent->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
3026 	{
3027 //
3028 // NOLINT		//  NOT:
3029 // NOLINT		//		 __ OR (Grand3 = CommonOr) __
3030 // NOLINT		//		/    |                       |
3031 // NOLINT		//	 ...  AND NOT (Grand2)          ...
3032 // NOLINT		//          /        |
3033 // NOLINT		//     relatedNode  NOT (grandNot)
3034 // NOLINT		//        	         |
3035 // NOLINT		//        	      AND (parentAnd)
3036 // NOLINT		//                 /    |
3037 // NOLINT		//               pNode  ...
3038 //
3039 		return false;
3040 	}
3041 	return true;
3042 }
3043 
3044 
// Try the common-compound-NOT transformation on every group of similar
// nodes. A group qualifies when the factored-out common node is costlier
// (by keyword doc counts) than its related siblings. Returns true if the
// tree changed and node collection must be re-run.
bool CSphTransformation::TransformCommonCompoundNot ()
{
	bool bRecollect = false;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
			if ( dSimilarNodes.GetLength()<2 )
				continue;

			if ( CollectRelatedNodes < GrandNode, Grand2Node > ( dSimilarNodes ) )
			{
				// Load cost of the first node from the group
				// of the common nodes. The cost of nodes from
				// TransformableNodes are the same.
				SetCosts ( dSimilarNodes[0], m_dRelatedNodes );
				int iCommon = dSimilarNodes[0]->m_iUser;
				int iRelated = 0;
				ARRAY_FOREACH ( i, m_dRelatedNodes )
					iRelated += m_dRelatedNodes[i]->m_iUser;

				// Check that optimization will be useful.
				if ( iCommon>iRelated && MakeTransformCommonCompoundNot ( dSimilarNodes ) )
				{
					bRecollect = true;
					// Don't make transformation for other nodes from the same OR-node,
					// because qtree was changed and further transformations
					// might be invalid.
					break;
				}
			}
		}
	}

	return bRecollect;
}
3085 
3086 
// Factor the common node out of several compound-NOT subtrees: the equal
// nodes (dSimilarNodes) are unlinked from their parent ANDs, and a new
// ANDNOT ( AND ( OR ( clones of related nodes ) ), NOT ( weakest common ) )
// branch is attached to the common OR. Always returns true.
bool CSphTransformation::MakeTransformCommonCompoundNot ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );
	assert ( iWeakestIndex!=-1 );
	XQNode_t * pWeakestSimilar = dSimilarNodes [ iWeakestIndex ];

	// Common OR node (that is Grand3Node::From)
	XQNode_t * pCommonOr = pWeakestSimilar->m_pParent->m_pParent->m_pParent->m_pParent;

	// Factor out and delete/unlink similar nodes ( except weakest )
	ARRAY_FOREACH ( i, dSimilarNodes )
	{
		XQNode_t * pParent = dSimilarNodes[i]->m_pParent;
		Verify ( pParent->m_dChildren.RemoveValue ( dSimilarNodes[i] ) );

		if ( i!=iWeakestIndex )
			SafeDelete ( dSimilarNodes[i] );
	}

	// Create yet another ANDNOT node
	// with related nodes and one common node
	XQNode_t * pNewNot = new XQNode_t ( XQLimitSpec_t() );
	pNewNot->SetOp ( SPH_QUERY_NOT, pWeakestSimilar );

	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR );
	pNewOr->m_dChildren.Resize ( m_dRelatedNodes.GetLength() );
	ARRAY_FOREACH ( i, m_dRelatedNodes )
	{
		// ANDNOT operation implies AND and NOT nodes.
		// The related nodes point to AND node that has one child node.
		assert ( m_dRelatedNodes[i]->m_dChildren.GetLength()==1 );
		// clone (not move): the originals stay in their own subtrees
		pNewOr->m_dChildren[i] = m_dRelatedNodes[i]->m_dChildren[0]->Clone();
		pNewOr->m_dChildren[i]->m_pParent = pNewOr;
	}

	XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
	pNewAnd->SetOp ( SPH_QUERY_AND, pNewOr );
	XQNode_t * pNewAndNot = new XQNode_t ( XQLimitSpec_t() );
	pNewAndNot->SetOp ( SPH_QUERY_ANDNOT, pNewAnd, pNewNot );
	pCommonOr->m_dChildren.Add ( pNewAndNot );
	pNewAndNot->m_pParent = pCommonOr;
	return true;
}
3134 
3135 
CheckCommonSubTerm(const XQNode_t * pNode)3136 bool CSphTransformation::CheckCommonSubTerm ( const XQNode_t * pNode )
3137 {
3138 	if ( !pNode || ( pNode->GetOp()==SPH_QUERY_PHRASE && pNode->m_dChildren.GetLength() )
3139 		|| !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3140 			pNode->m_pParent->GetOp()!=SPH_QUERY_OR || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_AND ||
3141 			pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
3142 	{
3143 //
3144 // NOLINT		//  NOT:
3145 // NOLINT		//        ________OR (gGOr)
3146 // NOLINT		//		/           |
3147 // NOLINT		//	......	  AND (grandAnd)
3148 // NOLINT		//                /     |
3149 // NOLINT		//      relatedNode    OR (parentOr)
3150 // NOLINT		//        	            /   |
3151 // NOLINT		//                   pNode  ...
3152 //
3153 		return false;
3154 	}
3155 	return true;
3156 }
3157 
3158 
TransformCommonSubTerm()3159 bool CSphTransformation::TransformCommonSubTerm ()
3160 {
3161 	bool bRecollect = false;
3162 	m_hSimilar.IterateStart();
3163 	while ( m_hSimilar.IterateNext () )
3164 	{
3165 		m_hSimilar.IterateGet().IterateStart();
3166 		while ( m_hSimilar.IterateGet().IterateNext() )
3167 		{
3168 			// Nodes with the same iFuzzyHash
3169 			CSphVector<XQNode_t *> & dX = m_hSimilar.IterateGet().IterateGet();
3170 			if ( dX.GetLength()<2 )
3171 				continue;
3172 
3173 			// skip common sub-terms from same tree
3174 			bool bSame = false;
3175 			for ( int i=0; i<dX.GetLength()-1 && !bSame; i++ )
3176 			{
3177 				for ( int j=i+1; j<dX.GetLength() && !bSame; j++ )
3178 					bSame = ( dX[i]->m_pParent==dX[j]->m_pParent );
3179 			}
3180 			if ( bSame )
3181 				continue;
3182 
3183 			if ( CollectRelatedNodes < ParentNode, GrandNode > ( dX ) )
3184 			{
3185 				// Load cost of the first node from the group
3186 				// of the common nodes. The cost of nodes from
3187 				// TransformableNodes are the same.
3188 				SetCosts ( dX[0], m_dRelatedNodes );
3189 				int iCostCommonSubTermNode = dX[0]->m_iUser;
3190 				int iCostRelatedNodes = 0;
3191 				ARRAY_FOREACH ( i, m_dRelatedNodes )
3192 					iCostRelatedNodes += m_dRelatedNodes[i]->m_iUser;
3193 
3194 				// Check that optimization will be useful.
3195 				if ( iCostCommonSubTermNode > iCostRelatedNodes )
3196 				{
3197 					MakeTransformCommonSubTerm ( dX );
3198 					bRecollect = true;
3199 					// Don't make transformation for other nodes from the same OR-node,
3200 					// because query tree was changed and further transformations
3201 					// might be invalid.
3202 					break;
3203 				}
3204 			}
3205 		}
3206 	}
3207 
3208 	return bRecollect;
3209 }
3210 
3211 
3212 // remove nodes without children up the tree
SubtreeRemoveEmpty(XQNode_t * pNode)3213 static bool SubtreeRemoveEmpty ( XQNode_t * pNode )
3214 {
3215 	if ( !pNode->IsEmpty() )
3216 		return false;
3217 
3218 	// climb up
3219 	XQNode_t * pParent = pNode->m_pParent;
3220 	while ( pParent && pParent->m_dChildren.GetLength()<=1 && !pParent->m_dWords.GetLength() )
3221 	{
3222 		pNode = pParent;
3223 		pParent = pParent->m_pParent;
3224 	}
3225 
3226 	if ( pParent )
3227 		pParent->m_dChildren.RemoveValue ( pNode );
3228 
3229 	// free subtree
3230 	SafeDelete ( pNode );
3231 	return true;
3232 }
3233 
3234 
3235 // eliminate composite ( AND \ OR ) nodes with only one children
CompositeFixup(XQNode_t * pNode,XQNode_t ** ppRoot)3236 static void CompositeFixup ( XQNode_t * pNode, XQNode_t ** ppRoot )
3237 {
3238 	assert ( pNode && !pNode->m_dWords.GetLength() );
3239 	if ( pNode->m_dChildren.GetLength()!=1 || !( pNode->GetOp()==SPH_QUERY_OR || pNode->GetOp()==SPH_QUERY_AND ) )
3240 		return;
3241 
3242 	XQNode_t * pChild = pNode->m_dChildren[0];
3243 	pChild->m_pParent = NULL;
3244 	pNode->m_dChildren.Resize ( 0 );
3245 
3246 	// climb up
3247 	XQNode_t * pParent = pNode->m_pParent;
3248 	while ( pParent && pParent->m_dChildren.GetLength()==1 && !pParent->m_dWords.GetLength() &&
3249 		( pParent->GetOp()==SPH_QUERY_OR || pParent->GetOp()==SPH_QUERY_AND ) )
3250 	{
3251 		pNode = pParent;
3252 		pParent = pParent->m_pParent;
3253 	}
3254 
3255 	if ( pParent )
3256 	{
3257 		ARRAY_FOREACH ( i, pParent->m_dChildren )
3258 		{
3259 			if ( pParent->m_dChildren[i]!=pNode )
3260 				continue;
3261 
3262 			pParent->m_dChildren[i] = pChild;
3263 			pChild->m_pParent = pParent;
3264 			break;
3265 		}
3266 	} else
3267 	{
3268 		*ppRoot = pChild;
3269 	}
3270 
3271 	// free subtree
3272 	SafeDelete ( pNode );
3273 }
3274 
3275 
CleanupSubtree(XQNode_t * pNode,XQNode_t ** ppRoot)3276 static void CleanupSubtree ( XQNode_t * pNode, XQNode_t ** ppRoot )
3277 {
3278 	if ( SubtreeRemoveEmpty ( pNode ) )
3279 		return;
3280 
3281 	CompositeFixup ( pNode, ppRoot );
3282 }
3283 
3284 
// Factor the common sub-term out of several OR branches: the equal nodes
// (dX) are unlinked, their parent ORs are temporarily detached, clones of
// the related parents are merged under a new OR, and a new
// AND ( newOR, weakest common ) branch is attached to the common OR.
// Emptied subtrees are cleaned up at the end.
void CSphTransformation::MakeTransformCommonSubTerm ( CSphVector<XQNode_t *> & dX )
{
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dX );
	XQNode_t * pX = dX[iWeakestIndex];

	// common parents of X and AA \ BB need to be excluded
	CSphVector<XQNode_t *> dExcluded ( dX.GetLength() );

	// Factor out and delete/unlink similar nodes ( except weakest )
	ARRAY_FOREACH ( i, dX )
	{
		XQNode_t * pParent = dX[i]->m_pParent;
		Verify ( pParent->m_dChildren.RemoveValue ( dX[i] ) );
		if ( i!=iWeakestIndex )
			SafeDelete ( dX[i] );

		// detach the parent OR from its grand AND for the clone step below
		dExcluded[i] = pParent;
		pParent->m_pParent->m_dChildren.RemoveValue ( pParent );
	}

	// deduplicate the related nodes' parents (grand ANDs)
	CSphVector<XQNode_t *> dRelatedParents;
	for ( int i=0; i<m_dRelatedNodes.GetLength(); i++ )
	{
		XQNode_t * pParent = m_dRelatedNodes[i]->m_pParent;
		if ( !dRelatedParents.Contains ( pParent ) )
			dRelatedParents.Add ( pParent );
	}

	// clone them; the originals remain owned by their own subtrees
	ARRAY_FOREACH ( i, dRelatedParents )
		dRelatedParents[i] = dRelatedParents[i]->Clone();

	// push excluded children back
	ARRAY_FOREACH ( i, dExcluded )
	{
		XQNode_t * pChild = dExcluded[i];
		pChild->m_pParent->m_dChildren.Add ( pChild );
	}

	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR, dRelatedParents );

	// Create yet another AND node
	// with related nodes and one common dSimilar node
	XQNode_t * pCommonOr = pX->m_pParent->m_pParent->m_pParent;
	XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
	pNewAnd->SetOp ( SPH_QUERY_AND, pNewOr, pX );
	pCommonOr->m_dChildren.Add ( pNewAnd );
	pNewAnd->m_pParent = pCommonOr;

	// remove subtrees emptied by the factoring
	ARRAY_FOREACH ( i, dExcluded )
	{
		CleanupSubtree ( dExcluded[i], m_ppRoot );
	}
}
3342 
3343 
CheckCommonKeywords(const XQNode_t * pNode)3344 bool CSphTransformation::CheckCommonKeywords ( const XQNode_t * pNode )
3345 {
3346 	if ( !pNode || !pNode->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR || !pNode->m_dWords.GetLength() )
3347 	{
3348 //
3349 // NOLINT		//	NOT:
3350 // NOLINT		// 		 ______________________ OR (parentOr) _______
3351 // NOLINT		//		/                            |               |
3352 // NOLINT		//	  pNode (PHRASE|AND|PROXIMITY)  ...	            ...
3353 //
3354 		return false;
3355 	}
3356 	return true;
3357 }
3358 
3359 
3360 typedef CSphOrderedHash<CSphVector<XQNode_t *>, uint64_t, IdentityHash_fn, 128> BigramHash_t;
3361 
3362 
sphBigramAddNode(XQNode_t * pNode,int64_t uHash,BigramHash_t & hBirgam)3363 static int sphBigramAddNode ( XQNode_t * pNode, int64_t uHash, BigramHash_t & hBirgam )
3364 {
3365 	CSphVector<XQNode_t *> * ppNodes = hBirgam ( uHash );
3366 	if ( !ppNodes )
3367 	{
3368 		CSphVector<XQNode_t *> dNode ( 1 );
3369 		dNode[0] = pNode;
3370 		hBirgam.Add ( dNode, uHash );
3371 		return 1;
3372 	} else
3373 	{
3374 		(*ppNodes).Add ( pNode );
3375 		return (*ppNodes).GetLength();
3376 	}
3377 }
3378 
3379 static const BYTE g_sPhraseDelimiter[] = { 1 };
3380 
sphHashPhrase(const XQNode_t * pNode)3381 static uint64_t sphHashPhrase ( const XQNode_t * pNode )
3382 {
3383 	assert ( pNode );
3384 	uint64_t uHash = SPH_FNV64_SEED;
3385 	ARRAY_FOREACH ( i, pNode->m_dWords )
3386 	{
3387 		if ( i )
3388 			uHash = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uHash );
3389 		uHash = sphFNV64cont ( pNode->m_dWords[i].m_sWord.cstr(), uHash );
3390 	}
3391 
3392 	return uHash;
3393 }
3394 
3395 
sphHashSubphrases(XQNode_t * pNode,BigramHash_t & hBirgam)3396 static void sphHashSubphrases ( XQNode_t * pNode, BigramHash_t & hBirgam )
3397 {
3398 	assert ( pNode );
3399 	// skip whole phrase
3400 	if ( pNode->m_dWords.GetLength()<=1 )
3401 		return;
3402 
3403 	const CSphVector<XQKeyword_t> & dWords = pNode->m_dWords;
3404 	int iLen = dWords.GetLength();
3405 	for ( int i=0; i<iLen; i++ )
3406 	{
3407 		uint64_t uSubPhrase = sphFNV64cont ( dWords[i].m_sWord.cstr(), SPH_FNV64_SEED );
3408 		sphBigramAddNode ( pNode, uSubPhrase, hBirgam );
3409 
3410 		// skip whole phrase
3411 		int iSubLen = ( i==0 ? iLen-1 : iLen );
3412 		for ( int j=i+1; j<iSubLen; j++ )
3413 		{
3414 			uSubPhrase = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uSubPhrase );
3415 			uSubPhrase = sphFNV64cont ( dWords[j].m_sWord.cstr(), uSubPhrase );
3416 			sphBigramAddNode ( pNode, uSubPhrase, hBirgam );
3417 		}
3418 	}
3419 
3420 	// loop all children
3421 	ARRAY_FOREACH ( i, pNode->m_dChildren )
3422 		sphHashSubphrases ( pNode->m_dChildren[i], hBirgam );
3423 }
3424 
3425 
// Decide whether pNode is "strongest" versus every node in dSimilar, i.e.
// whether each dSimilar node may safely replace pNode in an OR group.
bool sphIsNodeStrongest ( const XQNode_t * pNode, const CSphVector<XQNode_t *> & dSimilar )
{
	//
	// The cases when query won't be optimized:
	// 1. Proximities with different distance: "A B C"~N | "A B C D"~M (N != M)
	// 2. Partial intersection in the middle: "A B C D" | "D B C E" (really they won't be found)
	// 3. Weaker phrase for proximity. Example: "A B C D"~N | "B C"
	//
	// The cases when query will be optimized:
	// 1. Found weaker term (phrase or proximity type) for sub-query with phrase type.
	// Examples:
	// "D A B C E" | "A B C" (weaker phrase) => "A B C"
	// "A B C D E" | "B C D"~N (weaker proximity) => "B C D"~N
	//
	// 2. Equal proximities with the different distance.
	// Example: "A B C"~N | "A B C"~M => "A B C"~min(M,N)
	//
	// 3. Found weaker term with proximity type with the same distance.
	// Example: "D A B C E"~N | "A B"~N => "A B"~N
	//

	assert ( pNode );
	XQOperator_e eNode = pNode->GetOp();
	int iWords = pNode->m_dWords.GetLength();

	ARRAY_FOREACH ( i, dSimilar )
	{
		XQOperator_e eSimilar = dSimilar[i]->GetOp();
		int iSimilarWords = dSimilar[i]->m_dWords.GetLength();

		// longer proximity can not be replaced by a shorter one
		if ( eNode==SPH_QUERY_PROXIMITY && eSimilar==SPH_QUERY_PROXIMITY && iWords>iSimilarWords )
			return false;
		// multi-word phrase / AND can not be replaced by a proximity
		if ( ( eNode==SPH_QUERY_PHRASE || eNode==SPH_QUERY_AND ) && ( eSimilar==SPH_QUERY_PROXIMITY && ( iWords>1 || pNode->m_dChildren.GetLength() ) ) )
			return false;

		// remaining operator combinations that do allow the replacement;
		// proximity-vs-proximity additionally requires pNode's distance
		// to be no smaller than the similar node's
		bool bSimilar = ( ( eNode==SPH_QUERY_PHRASE && eSimilar==SPH_QUERY_PHRASE ) ||
			( ( eNode==SPH_QUERY_PHRASE || eNode==SPH_QUERY_AND ) && ( eSimilar==SPH_QUERY_PHRASE || eSimilar==SPH_QUERY_PROXIMITY ) ) ||
			( eNode==SPH_QUERY_PROXIMITY && ( eSimilar==SPH_QUERY_AND || eSimilar==SPH_QUERY_PHRASE ) ) ||
			( eNode==SPH_QUERY_PROXIMITY && eSimilar==SPH_QUERY_PROXIMITY && pNode->m_iOpArg>=dSimilar[i]->m_iOpArg ) );

		if ( !bSimilar )
			return false;
	}

	return true;
}
3472 
3473 
// Drop stronger (more restrictive) phrases in favor of weaker ones that
// fully contain them as a sub-phrase, e.g. "A B C" | "A B" -> "A B".
// Returns true if any phrase node was removed from the tree.
bool CSphTransformation::TransformCommonKeywords ()
{
	CSphVector <XQNode_t *> dPendingDel;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		BigramHash_t hBigrams;
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dPhrases = m_hSimilar.IterateGet().IterateGet();
			if ( dPhrases.GetLength()<2 )
				continue;

			// index every proper sub-phrase of every phrase in the group
			ARRAY_FOREACH ( i, dPhrases )
				sphHashSubphrases ( dPhrases[i], hBigrams );

			// a phrase whose whole-phrase hash matches a sub-phrase of other
			// nodes makes those (stronger) nodes redundant
			ARRAY_FOREACH ( i, dPhrases )
			{
				XQNode_t * pNode = dPhrases[i];
				uint64_t uPhraseHash = sphHashPhrase ( pNode );
				CSphVector<XQNode_t *> * ppCommon = hBigrams ( uPhraseHash );
				if ( ppCommon && sphIsNodeStrongest ( pNode, *ppCommon ) )
				{
					ARRAY_FOREACH ( j, (*ppCommon) )
						dPendingDel.Add ( (*ppCommon)[j] );
				}
			}
		}
	}

	bool bTransformed = ( dPendingDel.GetLength()>0 );
	// sort by pointer value so duplicates become adjacent
	dPendingDel.Sort();
	// Delete stronger terms
	XQNode_t * pLast = NULL;
	ARRAY_FOREACH ( i, dPendingDel )
	{
		// skip dupes
		if ( pLast==dPendingDel[i] )
			continue;

		pLast = dPendingDel[i];
		Verify ( pLast->m_pParent->m_dChildren.RemoveValue ( pLast ) );
		// delete here (not SafeDelete) as later that pointer will be compared
		delete ( dPendingDel[i] );
	}

	return bTransformed;
}
3524 
3525 // minimum words per phrase that might be optimized by CommonSuffix optimization
3526 
CheckCommonPhrase(const XQNode_t * pNode)3527 bool CSphTransformation::CheckCommonPhrase ( const XQNode_t * pNode )
3528 {
3529 	if ( !pNode || !pNode->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR || pNode->GetOp()!=SPH_QUERY_PHRASE || pNode->m_dWords.GetLength()<2 )
3530 	{
3531 		//
3532 		// NOLINT		//  NOT:
3533 		// NOLINT		//		 ______________________ OR (parentOr) ___
3534 		// NOLINT		// 		/                        |               |
3535 		// NOLINT		// 	  pNode (PHRASE)            ...	             ...
3536 		//
3537 		return false;
3538 	}
3539 
3540 	// single word phrase not allowed
3541 	assert ( pNode->m_dWords.GetLength()>=2 );
3542 
3543 	// phrase there words not one after another not allowed
3544 	for ( int i=1; i<pNode->m_dWords.GetLength(); i++ )
3545 	{
3546 		if ( pNode->m_dWords[i].m_iAtomPos-pNode->m_dWords[i-1].m_iAtomPos!=1 )
3547 			return false;
3548 	}
3549 
3550 	return true;
3551 }
3552 
3553 
// a group of phrases sharing m_iCommonLen common words at head or tail
struct CommonInfo_t
{
	CSphVector<XQNode_t *> *	m_pPhrases;		// the phrases in this group
	int							m_iCommonLen;	// number of shared words
	bool						m_bHead;		// true: shared at head; false: shared at tail
	bool						m_bHasBetter;	// a phrase of this group is covered by a group with a longer common part
};
3561 
3562 
// links a phrase node to the common-part group it belongs to
struct Node2Common_t
{
	XQNode_t * m_pNode;			// the phrase node
	CommonInfo_t * m_pCommon;	// the group containing that node
};
3568 
3569 
// sort order used to detect duplicate nodes across groups: group entries by
// node pointer, then prefer the longer common part, heads before tails on a tie
struct CommonDupElemination_fn
{
	bool IsLess ( const Node2Common_t & a, const Node2Common_t & b ) const
	{
		if ( a.m_pNode!=b.m_pNode )
			return a.m_pNode<b.m_pNode;

		// longer common part sorts first
		if ( a.m_pCommon->m_iCommonLen!=b.m_pCommon->m_iCommonLen )
			return a.m_pCommon->m_iCommonLen>b.m_pCommon->m_iCommonLen;

		// on equal length, head-groups sort before tail-groups
		return a.m_pCommon->m_bHead;
	}
};
3583 
3584 
// orders nodes by their atom (word) position within the query
struct XQNodeAtomPos_fn
{
	bool IsLess ( const XQNode_t * a, const XQNode_t * b ) const
	{
		return a->m_iAtomPos<b->m_iAtomPos;
	}
};
3592 
3593 
// Find groups of sibling phrases under one OR that share common words at
// their head or tail, pick the longest shared run per group, and factor it
// out via MakeTransformCommonPhrase, e.g. "A B C" | "A B D" -> "A B (C|D)".
// Returns true if the tree changed and node collection must be re-run.
bool CSphTransformation::TransformCommonPhrase ()
{
	bool bRecollect = false;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			if ( dNodes.GetLength()<2 )
				continue;

			bool bHasCommonPhrases = false;
			BigramHash_t tBigramHead;
			BigramHash_t tBigramTail;
			// 1st check only 2 words at head tail at phrases
			ARRAY_FOREACH ( iPhrase, dNodes )
			{
				const CSphVector<XQKeyword_t> & dWords = dNodes[iPhrase]->m_dWords;
				assert ( dWords.GetLength()>=2 );
				// cache the node's starting atom position for later sorting
				dNodes[iPhrase]->m_iAtomPos = dWords.Begin()->m_iAtomPos;

				// head bigram: first two words; tail bigram: last two words
				// (tail is hashed back-to-front: last word first)
				uint64_t uHead = sphFNV64cont ( dWords[0].m_sWord.cstr(), SPH_FNV64_SEED );
				uint64_t uTail = sphFNV64cont ( dWords [ dWords.GetLength() - 1 ].m_sWord.cstr(), SPH_FNV64_SEED );
				uHead = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uHead );
				uHead = sphFNV64cont ( dWords[1].m_sWord.cstr(), uHead );

				uTail = sphFNV64 ( g_sPhraseDelimiter, sizeof(g_sPhraseDelimiter), uTail );
				uTail = sphFNV64cont ( dWords[dWords.GetLength()-2].m_sWord.cstr(), uTail );

				int iHeadLen = sphBigramAddNode ( dNodes[iPhrase], uHead, tBigramHead );
				int iTailLen = sphBigramAddNode ( dNodes[iPhrase], uTail, tBigramTail );
				bHasCommonPhrases |= ( iHeadLen>1 || iTailLen>1 );
			}

			if ( !bHasCommonPhrases )
				continue;

			// 2nd step find minimum for each phrases group
			CSphVector<CommonInfo_t> dCommon;
			tBigramHead.IterateStart();
			while ( tBigramHead.IterateNext() )
			{
				// only phrases that share same words at head
				if ( tBigramHead.IterateGet().GetLength()<2 )
					continue;

				CommonInfo_t & tElem = dCommon.Add();
				tElem.m_pPhrases = &tBigramHead.IterateGet();
				tElem.m_iCommonLen = 2;
				tElem.m_bHead = true;
				tElem.m_bHasBetter = false;
			}
			tBigramTail.IterateStart();
			while ( tBigramTail.IterateNext() )
			{
				// only phrases that share same words at tail
				if ( tBigramTail.IterateGet().GetLength()<2 )
					continue;

				CommonInfo_t & tElem = dCommon.Add();
				tElem.m_pPhrases = &tBigramTail.IterateGet();
				tElem.m_iCommonLen = 2;
				tElem.m_bHead = false;
				tElem.m_bHasBetter = false;
			}

			// for each set of phrases with common words at the head or tail
			// each word that is same at all phrases makes common length longer
			ARRAY_FOREACH ( i, dCommon )
			{
				CommonInfo_t & tCommon = dCommon[i];
				bool bHead = tCommon.m_bHead;
				const CSphVector<XQNode_t *> & dPhrases = *tCommon.m_pPhrases;
				// start from third word ( two words at each phrase already matched at bigram hashing )
				for ( int iCount=3; ; iCount++ )
				{
					// is shortest phrase words over
					if ( iCount>=dPhrases[0]->m_dWords.GetLength() )
						break;

					// reference word taken from the first phrase of the group
					int iWordRef = ( bHead ? iCount-1 : dPhrases[0]->m_dWords.GetLength() - iCount );
					uint64_t uHash = sphFNV64 ( dPhrases[0]->m_dWords[iWordRef].m_sWord.cstr() );

					bool bPhrasesMatch = false;
					bool bSomePhraseOver = false;
					for ( int iPhrase=1; iPhrase<dPhrases.GetLength(); iPhrase++ )
					{
						bSomePhraseOver = ( iCount>=dPhrases[iPhrase]->m_dWords.GetLength() );
						if ( bSomePhraseOver )
							break;

						int iWord = ( bHead ? iCount-1 : dPhrases[iPhrase]->m_dWords.GetLength() - iCount );
						bPhrasesMatch = ( uHash==sphFNV64 ( dPhrases[iPhrase]->m_dWords[iWord].m_sWord.cstr() ) );
						if ( !bPhrasesMatch )
							break;
					}

					// no need to check further in case shortest phrase has no more words or sequence over
					if ( bSomePhraseOver || !bPhrasesMatch )
						break;

					tCommon.m_iCommonLen = iCount;
				}
			}

			// mark all dupes (that has smaller common length) as deleted
			if ( dCommon.GetLength()>=2 )
			{
				CSphVector<Node2Common_t> dDups ( dCommon.GetLength()*2 );
				dDups.Resize ( 0 );
				ARRAY_FOREACH ( i, dCommon )
				{
					CommonInfo_t & tCommon = dCommon[i];
					CSphVector<XQNode_t *> & dPhrases = *tCommon.m_pPhrases;
					ARRAY_FOREACH ( j, dPhrases )
					{
						Node2Common_t & tDup = dDups.Add();
						tDup.m_pNode = dPhrases[j];
						tDup.m_pCommon = &tCommon;
					}
				}
				// adjacent entries with the same node mark the shorter group as worse
				dDups.Sort ( CommonDupElemination_fn() );
				for ( int i=0; i<dDups.GetLength()-1; i++ )
				{
					Node2Common_t & tCurr = dDups[i];
					Node2Common_t & tNext = dDups[i+1];
					if ( tCurr.m_pNode==tNext.m_pNode )
					{
						if ( tCurr.m_pCommon->m_iCommonLen<=tNext.m_pCommon->m_iCommonLen )
							tCurr.m_pCommon->m_bHasBetter = true;
						else
							tNext.m_pCommon->m_bHasBetter = true;
					}
				}
			}

			// transform every group that was not superseded by a better one
			ARRAY_FOREACH ( i, dCommon )
			{
				const CommonInfo_t & tElem = dCommon[i];
				if ( !tElem.m_bHasBetter )
				{
					tElem.m_pPhrases->Sort ( XQNodeAtomPos_fn() );
					MakeTransformCommonPhrase ( *tElem.m_pPhrases, tElem.m_iCommonLen, tElem.m_bHead );
					bRecollect = true;
				}
			}
		}
	}

	return bRecollect;
}
3748 
3749 
void CSphTransformation::MakeTransformCommonPhrase ( CSphVector<XQNode_t *> & dCommonNodes, int iCommonLen, bool bHeadIsCommon )
{
	// Factor a run of shared words out of sibling phrases under one OR node,
	// conceptually turning ( "a b c" | "a b d" ) into ( "a b" ( c | d ) ).
	// dCommonNodes - PHRASE nodes (all under the same OR parent) that share
	// iCommonLen words; bHeadIsCommon - whether the shared run is the phrase
	// head (prefix) or tail (suffix).
	XQNode_t * pCommonPhrase = new XQNode_t ( XQLimitSpec_t() );
	pCommonPhrase->SetOp ( SPH_QUERY_PHRASE );

	XQNode_t * pGrandOr = dCommonNodes[0]->m_pParent;
	if ( bHeadIsCommon )
	{
		// fill up common head (prefix) words from the first phrase
		XQNode_t * pPhrase = dCommonNodes[0];
		pCommonPhrase->m_iAtomPos = pPhrase->m_dWords[0].m_iAtomPos;
		for ( int i=0; i<iCommonLen; i++ )
			pCommonPhrase->m_dWords.Add ( pPhrase->m_dWords[i] );
	} else
	{
		XQNode_t * pPhrase = dCommonNodes[0];
		// set the farthest atom position
		// (pick the donor phrase whose common tail starts at the largest atom position)
		int iAtomPos = pPhrase->m_dWords [ pPhrase->m_dWords.GetLength() - iCommonLen ].m_iAtomPos;
		for ( int i=1; i<dCommonNodes.GetLength(); i++ )
		{
			XQNode_t * pCur = dCommonNodes[i];
			int iCurAtomPos = pCur->m_dWords[pCur->m_dWords.GetLength() - iCommonLen].m_iAtomPos;
			if ( iAtomPos < iCurAtomPos )
			{
				pPhrase = pCur;
				iAtomPos = iCurAtomPos;
			}
		}
		pCommonPhrase->m_iAtomPos = iAtomPos;

		// fill up common tail (suffix) words from the chosen donor phrase
		for ( int i=pPhrase->m_dWords.GetLength() - iCommonLen; i<pPhrase->m_dWords.GetLength(); i++ )
			pCommonPhrase->m_dWords.Add ( pPhrase->m_dWords[i] );
	}

	// OR node that will hold the per-phrase leftover words
	XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );
	pNewOr->SetOp ( SPH_QUERY_OR );

	ARRAY_FOREACH ( i, dCommonNodes )
	{
		XQNode_t * pPhrase = dCommonNodes[i];

		// remove phrase from parent and eliminate in case of common phrase duplication
		Verify ( pGrandOr->m_dChildren.RemoveValue ( pPhrase ) );
		if ( pPhrase->m_dWords.GetLength()==iCommonLen )
		{
			// phrase consists solely of the common words; fully covered, drop it
			SafeDelete ( pPhrase );
			continue;
		}

		// move phrase to new OR
		pNewOr->m_dChildren.Add ( pPhrase );
		pPhrase->m_pParent = pNewOr;

		// shift down words and enumerate words atom positions
		if ( bHeadIsCommon )
		{
			int iEndCommonAtom = pCommonPhrase->m_dWords.Last().m_iAtomPos+1;
			for ( int j=iCommonLen; j<pPhrase->m_dWords.GetLength(); j++ )
			{
				int iTo = j-iCommonLen;
				pPhrase->m_dWords[iTo] = pPhrase->m_dWords[j];
				pPhrase->m_dWords[iTo].m_iAtomPos = iEndCommonAtom + iTo;
			}
		}
		pPhrase->m_dWords.Resize ( pPhrase->m_dWords.GetLength() - iCommonLen );
		if ( !bHeadIsCommon )
		{
			// renumber leftover words so they end right before the common tail
			int iStartAtom = pCommonPhrase->m_dWords[0].m_iAtomPos - pPhrase->m_dWords.GetLength();
			ARRAY_FOREACH ( j, pPhrase->m_dWords )
				pPhrase->m_dWords[j].m_iAtomPos = iStartAtom + j;
		}

		// a single leftover word is a plain keyword, not a phrase
		if ( pPhrase->m_dWords.GetLength()==1 )
			pPhrase->SetOp ( SPH_QUERY_AND );
	}

	if ( pNewOr->m_dChildren.GetLength() )
	{
		// parent phrase need valid atom position of children
		pNewOr->m_iAtomPos = pNewOr->m_dChildren[0]->m_dWords[0].m_iAtomPos;

		// combine common words and leftovers in source order
		XQNode_t * pNewPhrase = new XQNode_t ( XQLimitSpec_t() );
		if ( bHeadIsCommon )
			pNewPhrase->SetOp ( SPH_QUERY_PHRASE, pCommonPhrase, pNewOr );
		else
			pNewPhrase->SetOp ( SPH_QUERY_PHRASE, pNewOr, pCommonPhrase );

		pGrandOr->m_dChildren.Add ( pNewPhrase );
		pNewPhrase->m_pParent = pGrandOr;
	} else
	{
		// common phrases with same words elimination
		// (every phrase was fully covered; keep just the single common phrase)
		pGrandOr->m_dChildren.Add ( pCommonPhrase );
		pCommonPhrase->m_pParent = pGrandOr;
		SafeDelete ( pNewOr );
	}
}
3847 
3848 
CheckCommonAndNotFactor(const XQNode_t * pNode)3849 bool CSphTransformation::CheckCommonAndNotFactor ( const XQNode_t * pNode )
3850 {
3851 	if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
3852 			pNode->m_pParent->GetOp()!=SPH_QUERY_AND || pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
3853 			pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR ||
3854 			// FIXME!!! check performance with OR node at 2nd grand instead of regular not NOT
3855 			pNode->m_pParent->m_pParent->m_dChildren.GetLength()<2 || pNode->m_pParent->m_pParent->m_dChildren[1]->GetOp()!=SPH_QUERY_NOT )
3856 	{
3857 //
3858 // NOLINT		//  NOT:
3859 // NOLINT		//		 _______ OR (gGOr) ________________
3860 // NOLINT		// 		/          |                       |
3861 // NOLINT		// 	 ...        AND NOT (grandAndNot)     ...
3862 // NOLINT		//                /       |
3863 // NOLINT		//               AND     NOT
3864 // NOLINT		//                |       |
3865 // NOLINT		//              pNode  relatedNode
3866 //
3867 		return false;
3868 	}
3869 	return true;
3870 }
3871 
3872 
TransformCommonAndNotFactor()3873 bool CSphTransformation::TransformCommonAndNotFactor ()
3874 {
3875 	bool bRecollect = false;
3876 	m_hSimilar.IterateStart();
3877 	while ( m_hSimilar.IterateNext () )
3878 	{
3879 		m_hSimilar.IterateGet().IterateStart();
3880 		while ( m_hSimilar.IterateGet().IterateNext() )
3881 		{
3882 			// Nodes with the same iFuzzyHash
3883 			CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
3884 			if ( dSimilarNodes.GetLength()<2 )
3885 				continue;
3886 
3887 			if ( MakeTransformCommonAndNotFactor ( dSimilarNodes ) )
3888 				bRecollect = true;
3889 		}
3890 	}
3891 	return bRecollect;
3892 }
3893 
3894 
bool CSphTransformation::MakeTransformCommonAndNotFactor ( CSphVector<XQNode_t *> & dSimilarNodes )
{
	// Merge several "X AND NOT Ni" branches of a common OR that share the same
	// positive operand X, rewriting them into a single "X AND NOT (N1 AND N2 ...)"
	// branch; the NOT operands of the removed branches are re-parented under a
	// new AND node inside the surviving branch's NOT.
	// Pick weakest node from the equal
	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
	// so they are considered as equal nodes.
	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );

	XQNode_t * pFirstAndNot = dSimilarNodes [iWeakestIndex]->m_pParent->m_pParent;
	XQNode_t * pCommonOr = pFirstAndNot->m_pParent;

	assert ( pFirstAndNot->m_dChildren.GetLength()==2 );
	XQNode_t * pFirstNot = pFirstAndNot->m_dChildren[1];
	assert ( pFirstNot->m_dChildren.GetLength()==1 );

	// new AND collects all NOT operands; it replaces the surviving NOT's child
	XQNode_t * pAndNew = new XQNode_t ( XQLimitSpec_t() );
	pAndNew->SetOp ( SPH_QUERY_AND );
	pAndNew->m_dChildren.Reserve ( dSimilarNodes.GetLength() );
	pAndNew->m_dChildren.Add ( pFirstNot->m_dChildren[0] );
	pAndNew->m_dChildren.Last()->m_pParent = pAndNew;
	pFirstNot->m_dChildren[0] = pAndNew;
	pAndNew->m_pParent = pFirstNot;

	for ( int i=0; i<dSimilarNodes.GetLength(); ++i )
	{
		assert ( CheckCommonAndNotFactor ( dSimilarNodes[i] ) );
		// the weakest branch survives; all others are dismantled
		if ( i==iWeakestIndex )
			continue;

		XQNode_t * pAndNot = dSimilarNodes[i]->m_pParent->m_pParent;
		assert ( pAndNot->m_dChildren.GetLength()==2 );
		XQNode_t * pNot = pAndNot->m_dChildren[1];
		assert ( pNot->m_dChildren.GetLength()==1 );
		assert ( &pAndNew->m_dChildren!=&pNot->m_dChildren );
		// steal the NOT operand, then detach it so deleting pAndNot below
		// does not free the re-parented subtree
		pAndNew->m_dChildren.Add ( pNot->m_dChildren[0] );
		pAndNew->m_dChildren.Last()->m_pParent = pAndNew;
		pNot->m_dChildren[0] = NULL;

		Verify ( pCommonOr->m_dChildren.RemoveValue ( pAndNot ) );
		dSimilarNodes[i] = NULL;
		SafeDelete ( pAndNot );
	}

	return true;
}
3939 
3940 
bool CSphTransformation::CheckCommonOrNot ( const XQNode_t * pNode )
{
	// Predicate: pNode must sit four levels deep in exactly the
	// OR -> ANDNOT -> NOT -> OR -> pNode chain shown in the diagram below;
	// returns false on any missing ancestor or operator mismatch.
	if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent || !pNode->m_pParent->m_pParent->m_pParent ||
			!pNode->m_pParent->m_pParent->m_pParent->m_pParent || pNode->m_pParent->GetOp()!=SPH_QUERY_OR ||
			pNode->m_pParent->m_pParent->GetOp()!=SPH_QUERY_NOT ||
			pNode->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_ANDNOT ||
			pNode->m_pParent->m_pParent->m_pParent->m_pParent->GetOp()!=SPH_QUERY_OR )
	{
//
// NOLINT		//  NOT:
// NOLINT		//		 __ OR (Grand3 = CommonOr) __
// NOLINT		//		/    |                       |
// NOLINT		//	 ...  AND NOT (Grand2)          ...
// NOLINT		//          /        |
// NOLINT		//     relatedNode  NOT (grandNot)
// NOLINT		//        	         |
// NOLINT		//        	      OR (parentOr)
// NOLINT		//                 /    |
// NOLINT		//               pNode  ...
//
		return false;
	}
	return true;
}
3965 
3966 
TransformCommonOrNot()3967 bool CSphTransformation::TransformCommonOrNot ()
3968 {
3969 	bool bRecollect = false;
3970 	m_hSimilar.IterateStart();
3971 	while ( m_hSimilar.IterateNext () )
3972 	{
3973 		m_hSimilar.IterateGet().IterateStart();
3974 		while ( m_hSimilar.IterateGet().IterateNext() )
3975 		{
3976 			// Nodes with the same iFuzzyHash
3977 			CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar.IterateGet().IterateGet();
3978 			if ( dSimilarNodes.GetLength()<2 )
3979 				continue;
3980 
3981 			if ( CollectRelatedNodes < GrandNode, Grand2Node > ( dSimilarNodes ) && MakeTransformCommonOrNot ( dSimilarNodes ) )
3982 			{
3983 				bRecollect = true;
3984 				// Don't make transformation for other nodes from the same OR-node,
3985 				// because query tree was changed and further transformations
3986 				// might be invalid.
3987 				break;
3988 			}
3989 		}
3990 	}
3991 
3992 	return bRecollect;
3993 }
3994 
3995 
MakeTransformCommonOrNot(CSphVector<XQNode_t * > & dSimilarNodes)3996 bool CSphTransformation::MakeTransformCommonOrNot ( CSphVector<XQNode_t *> & dSimilarNodes )
3997 {
3998 	// Pick weakest node from the equal
3999 	// PROXIMITY and PHRASE nodes with same keywords have an equal magic hash
4000 	// so they are considered as equal nodes.
4001 	int iWeakestIndex = GetWeakestIndex ( dSimilarNodes );
4002 	assert ( iWeakestIndex!=-1 );
4003 	XQNode_t * pWeakestSimilar = dSimilarNodes [ iWeakestIndex ];
4004 
4005 	// Common OR node (that is Grand3Node::From)
4006 	XQNode_t * pCommonOr = pWeakestSimilar->m_pParent->m_pParent->m_pParent->m_pParent;
4007 
4008 	// Delete/unlink similar nodes ( except weakest )
4009 	ARRAY_FOREACH ( i, dSimilarNodes )
4010 	{
4011 		XQNode_t * pParent = dSimilarNodes[i]->m_pParent;
4012 		Verify ( pParent->m_dChildren.RemoveValue ( dSimilarNodes[i] ) );
4013 
4014 		if ( i!=iWeakestIndex )
4015 			SafeDelete ( dSimilarNodes[i] );
4016 	}
4017 
4018 	XQNode_t * pNewAndNot = new XQNode_t ( XQLimitSpec_t() );
4019 	XQNode_t * pNewAnd = new XQNode_t ( XQLimitSpec_t() );
4020 	XQNode_t * pNewNot = new XQNode_t ( XQLimitSpec_t() );
4021 	if ( !pCommonOr->m_pParent )
4022 	{
4023 		*m_ppRoot = pNewAndNot;
4024 	} else
4025 	{
4026 		pNewAndNot->m_pParent = pCommonOr->m_pParent;
4027 		assert ( pCommonOr->m_pParent->m_dChildren.Contains ( pCommonOr ) );
4028 		ARRAY_FOREACH ( i, pCommonOr->m_pParent->m_dChildren )
4029 		{
4030 			if ( pCommonOr->m_pParent->m_dChildren[i]==pCommonOr )
4031 				pCommonOr->m_pParent->m_pParent->m_dChildren[i] = pNewAndNot;
4032 		}
4033 	}
4034 	pNewAnd->SetOp ( SPH_QUERY_AND, pCommonOr );
4035 	pNewNot->SetOp ( SPH_QUERY_NOT, pWeakestSimilar );
4036 	pNewAndNot->SetOp ( SPH_QUERY_ANDNOT, pNewAnd, pNewNot );
4037 
4038 	return true;
4039 }
4040 
4041 
CheckHungOperand(const XQNode_t * pNode)4042 bool CSphTransformation::CheckHungOperand ( const XQNode_t * pNode )
4043 {
4044 	if ( !pNode || !pNode->m_pParent ||
4045 			( pNode->m_pParent->GetOp()!=SPH_QUERY_OR && pNode->m_pParent->GetOp()!=SPH_QUERY_AND ) ||
4046 			( pNode->m_pParent->m_pParent && pNode->m_pParent->GetOp()==SPH_QUERY_AND &&
4047 				pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_ANDNOT ) ||
4048 				pNode->m_pParent->m_dChildren.GetLength()>1 || pNode->m_dWords.GetLength() )
4049 	{
4050 //
4051 // NOLINT		//  NOT:
4052 // NOLINT		//	OR|AND (parent)
4053 // NOLINT		//		  |
4054 // NOLINT		//	    pNode\?
4055 //
4056 		return false;
4057 	}
4058 	return true;
4059 }
4060 
4061 
bool CSphTransformation::TransformHungOperand ()
{
	// Remove single-child OR/AND wrapper nodes collected under the constant
	// group-factor bucket: each hung child is re-linked directly to its
	// grandparent (or becomes the root) and the wrapper is deleted.
	if ( !m_hSimilar.GetLength() || !m_hSimilar.Exists ( CONST_GROUP_FACTOR ) || !m_hSimilar[CONST_GROUP_FACTOR].Exists ( CONST_GROUP_FACTOR ) )
		return false;

	CSphVector<XQNode_t *> & dSimilarNodes = m_hSimilar[CONST_GROUP_FACTOR][CONST_GROUP_FACTOR];
	ARRAY_FOREACH ( i, dSimilarNodes )
	{
		XQNode_t * pHungNode = dSimilarNodes[i];
		XQNode_t * pParent = pHungNode->m_pParent;
		XQNode_t * pGrand = pParent->m_pParent;

		if ( !pGrand )
		{
			// wrapper was the root; the child takes its place
			*m_ppRoot = pHungNode;
			pHungNode->m_pParent = NULL;
		} else
		{
			// replace the wrapper with its child in the grandparent's list
			assert ( pGrand->m_dChildren.Contains ( pParent ) );
			ARRAY_FOREACH ( j, pGrand->m_dChildren )
			{
				if ( pGrand->m_dChildren[j]!=pParent )
					continue;

				pGrand->m_dChildren[j] = pHungNode;
				pHungNode->m_pParent = pGrand;
				break;
			}
		}

		// detach the child before deleting the wrapper so it isn't freed too
		pParent->m_dChildren[0] = NULL;
		SafeDelete ( pParent );
	}

	return true;
}
4098 
4099 
bool CSphTransformation::CheckExcessBrackets ( const XQNode_t * pNode )
{
	// Predicate: true when pNode's parent is a redundant nesting level, i.e.
	// a wordless AND under an AND, or an OR under an OR (see diagram below);
	// such parents can be flattened into the grandparent.
	if ( !pNode || !pNode->m_pParent || !pNode->m_pParent->m_pParent ||
			!( ( pNode->m_pParent->GetOp()==SPH_QUERY_AND && !pNode->m_pParent->m_dWords.GetLength() &&
					pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_AND ) ||
					( pNode->m_pParent->GetOp()==SPH_QUERY_OR && pNode->m_pParent->m_pParent->GetOp()==SPH_QUERY_OR ) ) )
	{
//
// NOLINT		//  NOT:
// NOLINT		//	         OR|AND (grand)
// NOLINT		//            /     |
// NOLINT		//	   OR|AND (parent) ...
// NOLINT		//		  |
// NOLINT		//	    pNode
//
		return false;
	}
	return true;
}
4119 
// Flatten one nesting level: splice all children of pNode's parent into the
// grandparent's child list at the parent's former position (preserving order),
// then delete the parent. Returns the deleted parent pointer, which the caller
// uses ONLY as a hash key to mark the node as gone - it must never be
// dereferenced, since the memory has already been freed.
static XQNode_t * sphMoveSiblingsUp ( XQNode_t * pNode )
{
	XQNode_t * pParent = pNode->m_pParent;
	assert ( pParent );
	XQNode_t * pGrand = pParent->m_pParent;
	assert ( pGrand );

	assert ( pGrand->m_dChildren.Contains ( pParent ) );
	int iParent = GetNodeChildIndex ( pGrand, pParent );

	int iParentChildren = pParent->m_dChildren.GetLength();
	int iGrandChildren = pGrand->m_dChildren.GetLength();
	int iTotalChildren = iParentChildren+iGrandChildren-1;

	// parent.children + grand.parent.children - parent itself
	CSphVector<XQNode_t *> dChildren ( iTotalChildren );

	// grand head prior parent
	for ( int i=0; i<iParent; i++ )
		dChildren[i] = pGrand->m_dChildren[i];

	// grand tail after parent
	for ( int i=0; i<iGrandChildren-iParent-1; i++ )
		dChildren[i+iParent+iParentChildren] = pGrand->m_dChildren[i+iParent+1];

	// all parent children, re-parented to grand, in the parent's old slot
	for ( int i=0; i<iParentChildren; i++ )
	{
		XQNode_t * pChild = pParent->m_dChildren[i];
		pChild->m_pParent = pGrand;
		dChildren[i+iParent] = pChild;
	}

	pGrand->m_dChildren.SwapData ( dChildren );
	// all children at grand now; clear the list so delete doesn't free them
	pParent->m_dChildren.Resize(0);
	delete ( pParent );
	return pParent;
}
4159 
4160 
// Identity hash for tracking node pointers in CSphOrderedHash:
// the node's address itself serves as the hash value.
struct XQNodeHash_fn
{
	static inline uint64_t Hash ( XQNode_t * pNode )	{ return (uint64_t)pNode; }
};
4165 
4166 
TransformExcessBrackets()4167 bool CSphTransformation::TransformExcessBrackets ()
4168 {
4169 	bool bRecollect = false;
4170 	CSphOrderedHash<int, XQNode_t *, XQNodeHash_fn, 64> hDeleted;
4171 	m_hSimilar.IterateStart();
4172 	while ( m_hSimilar.IterateNext () )
4173 	{
4174 		m_hSimilar.IterateGet().IterateStart();
4175 		while ( m_hSimilar.IterateGet().IterateNext() )
4176 		{
4177 			// Nodes with the same iFuzzyHash
4178 			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
4179 			ARRAY_FOREACH ( i, dNodes )
4180 			{
4181 				XQNode_t * pNode = dNodes[i];
4182 				// node environment might be changed due prior nodes transformations
4183 				if ( !hDeleted.Exists ( pNode ) && CheckExcessBrackets ( pNode ) )
4184 				{
4185 					XQNode_t * pDel = sphMoveSiblingsUp ( pNode );
4186 					hDeleted.Add ( 1, pDel );
4187 					bRecollect = true;
4188 				}
4189 			}
4190 		}
4191 	}
4192 
4193 	return bRecollect;
4194 }
4195 
4196 
bool CSphTransformation::CheckExcessAndNot ( const XQNode_t * pNode )
{
	// Predicate: pNode must be an AND nested inside the two-level ANDNOT chain
	// shown below (ANDNOT -> AND -> ANDNOT -> AND(pNode)), with both ANDNOTs
	// carrying a proper NOT as their second child; pNode must not itself be a
	// bare wrapper around another ANDNOT.
	if ( !pNode || !ParentNode::From ( pNode ) || !GrandNode::From ( pNode ) || !Grand2Node::From ( pNode ) || pNode->GetOp()!=SPH_QUERY_AND ||
			( pNode->m_dChildren.GetLength()==1 && pNode->m_dChildren[0]->GetOp()==SPH_QUERY_ANDNOT ) ||
			ParentNode::From ( pNode )->GetOp()!=SPH_QUERY_ANDNOT || GrandNode::From ( pNode )->GetOp()!=SPH_QUERY_AND ||
			Grand2Node::From ( pNode )->GetOp()!=SPH_QUERY_ANDNOT ||
			// FIXME!!! check performance with OR node at 2nd grand instead of regular not NOT
			Grand2Node::From ( pNode )->m_dChildren.GetLength()<2 || Grand2Node::From ( pNode )->m_dChildren[1]->GetOp()!=SPH_QUERY_NOT )
	{
//
// NOLINT		//  NOT:
// NOLINT		//	                      AND NOT
// NOLINT		//	                       /   |
// NOLINT		//	                     AND  NOT
// NOLINT		//	                     |
// NOLINT		//	                AND NOT
// NOLINT		//	              /      |
// NOLINT		//	         AND(pNode) NOT
// NOLINT		//            |          |
// NOLINT		//           ..         ...
//
		return false;
	}
	return true;
}
4222 
4223 
bool CSphTransformation::TransformExcessAndNot ()
{
	// Collapse nested ANDNOT chains: the inner "A AND NOT N1" wrapped inside
	// an outer "... AND NOT N2" is rewritten so that A replaces the inner
	// ANDNOT, and N1 is merged into the outer NOT as "NOT (N1 OR N2)".
	// Returns true if the tree was modified (caller must recollect info).
	bool bRecollect = false;
	CSphOrderedHash<int, XQNode_t *, XQNodeHash_fn, 64> hDeleted;
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext () )
	{
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			// Nodes with the same iFuzzyHash
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			ARRAY_FOREACH ( i, dNodes )
			{
				XQNode_t * pAnd = dNodes[i];
				XQNode_t * pParentAndNot = pAnd->m_pParent;

				// node environment might be changed due prior nodes transformations
				if ( hDeleted.Exists ( pParentAndNot ) || !CheckExcessAndNot ( pAnd ) )
					continue;

				assert ( pParentAndNot->m_dChildren.GetLength()==2 );
				XQNode_t * pNot = pParentAndNot->m_dChildren[1];
				XQNode_t * pGrandAnd = pParentAndNot->m_pParent;
				XQNode_t * pGrand2AndNot = pGrandAnd->m_pParent;
				assert ( pGrand2AndNot->m_dChildren.GetLength()==2 );
				XQNode_t * pGrand2Not = pGrand2AndNot->m_dChildren[1];

				assert ( pGrand2Not->m_dChildren.GetLength()==1 );
				// merge inner NOT operands with the outer NOT operand under OR
				XQNode_t * pNewOr = new XQNode_t ( XQLimitSpec_t() );

				pNewOr->SetOp ( SPH_QUERY_OR, pNot->m_dChildren );
				pNewOr->m_dChildren.Add ( pGrand2Not->m_dChildren[0] );
				pNewOr->m_dChildren.Last()->m_pParent = pNewOr;
				pGrand2Not->m_dChildren[0] = pNewOr;
				pNewOr->m_pParent = pGrand2Not;

				// replace the inner ANDNOT with its positive arm (pAnd)
				assert ( pGrandAnd->m_dChildren.Contains ( pParentAndNot ) );
				int iChild = GetNodeChildIndex ( pGrandAnd, pParentAndNot );
				pGrandAnd->m_dChildren[iChild] = pAnd;
				pAnd->m_pParent = pGrandAnd;

				// Delete excess nodes
				// (detach re-parented subtrees first so SafeDelete can't free them)
				hDeleted.Add ( 1, pParentAndNot );
				pNot->m_dChildren.Resize ( 0 );
				pParentAndNot->m_dChildren[0] = NULL;
				SafeDelete ( pParentAndNot );
				bRecollect = true;
			}
		}
	}

	return bRecollect;
}
4278 
4279 
void CSphTransformation::Dump ()
{
	// Debug helper: print the collected node-hash -> grand-hash -> parents
	// mapping; compiles to a no-op unless XQDEBUG is defined.
#ifdef XQDEBUG
	m_hSimilar.IterateStart();
	while ( m_hSimilar.IterateNext() )
	{
		printf ( "\nnode: hash 0x" UINT64_FMT "\n", m_hSimilar.IterateGetKey() );
		m_hSimilar.IterateGet().IterateStart();
		while ( m_hSimilar.IterateGet().IterateNext() )
		{
			CSphVector<XQNode_t *> & dNodes = m_hSimilar.IterateGet().IterateGet();
			printf ( "\tgrand: hash 0x" UINT64_FMT ", children %d\n", m_hSimilar.IterateGet().IterateGetKey(), dNodes.GetLength() );

			printf ( "\tparents:\n" );
			ARRAY_FOREACH ( i, dNodes )
			{
				uint64_t uParentHash = dNodes[i]->GetHash();
				printf ( "\t\thash 0x" UINT64_FMT "\n", uParentHash );
			}
		}
	}
#endif
}
4303 
4304 
4305 #ifdef XQDEBUG
Dump(const XQNode_t * pNode,const char * sHeader)4306 void CSphTransformation::Dump ( const XQNode_t * pNode, const char * sHeader )
4307 {
4308 	printf ( sHeader );
4309 	if ( pNode )
4310 	{
4311 		printf ( "%s\n", sphReconstructNode ( pNode, NULL ).cstr(), NULL );
4312 #ifdef XQ_DUMP_TRANSFORMED_TREE
4313 		xqDump ( pNode, 0 );
4314 #endif
4315 	}
4316 }
4317 #else
Dump(const XQNode_t *,const char *)4318 void CSphTransformation::Dump ( const XQNode_t * , const char * )
4319 {}
4320 #endif
4321 
4322 
void CSphTransformation::Transform ()
{
	// Driver for all boolean-tree simplification passes. The two common
	// keyword/phrase factorings run once up front; the remaining passes are
	// iterated to a fixpoint, since one rewrite can expose opportunities for
	// another. Each pass first recollects candidate nodes via CollectInfo with
	// the pass-specific shape predicate, then transforms and (optionally)
	// dumps the tree for debugging.
	if ( CollectInfo <ParentNode, NullNode> ( *m_ppRoot, &CheckCommonKeywords ) )
	{
		bool bDump = TransformCommonKeywords ();
		if ( bDump )
			Dump ( *m_ppRoot, "\nAfter  transformation of 'COMMON KEYWORDS'\n" );
	}

	if ( CollectInfo <ParentNode, NullNode> ( *m_ppRoot, &CheckCommonPhrase ) )
	{
		bool bDump = TransformCommonPhrase ();
		if ( bDump )
			Dump ( *m_ppRoot, "\nAfter  transformation of 'COMMON PHRASES'\n" );
	}

	bool bRecollect = false;
	do
	{
		bRecollect = false;

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonNot ) )
		{
			bool bDump = TransformCommonNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'COMMON NOT'\n" );
		}

		if ( CollectInfo <Grand3Node, CurrentNode> ( *m_ppRoot, &CheckCommonCompoundNot ) )
		{
			bool bDump = TransformCommonCompoundNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'COMMON COMPOUND NOT'\n" );
		}

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonSubTerm ) )
		{
			bool bDump = TransformCommonSubTerm ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'COMMON SUBTERM'\n" );
		}

		if ( CollectInfo <Grand2Node, CurrentNode> ( *m_ppRoot, &CheckCommonAndNotFactor ) )
		{
			bool bDump = TransformCommonAndNotFactor ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'COMMON ANDNOT FACTOR'\n" );
		}

		if ( CollectInfo <Grand3Node, CurrentNode> ( *m_ppRoot, &CheckCommonOrNot ) )
		{
			bool bDump = TransformCommonOrNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'COMMON OR NOT'\n" );
		}

		if ( CollectInfo <NullNode, NullNode> ( *m_ppRoot, &CheckHungOperand ) )
		{
			bool bDump = TransformHungOperand ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'HUNG OPERAND'\n" );
		}

		if ( CollectInfo <NullNode, NullNode> ( *m_ppRoot, &CheckExcessBrackets ) )
		{
			bool bDump = TransformExcessBrackets ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'EXCESS BRACKETS'\n" );
		}

		if ( CollectInfo <ParentNode, CurrentNode> ( *m_ppRoot, &CheckExcessAndNot ) )
		{
			bool bDump = TransformExcessAndNot ();
			bRecollect |= bDump;
			Dump ( bDump ? *m_ppRoot : NULL, "\nAfter  transformation of 'EXCESS AND NOT'\n" );
		}
	} while ( bRecollect );

	// validate the resulting tree (debug-time consistency check)
	( *m_ppRoot )->Check ( true );
}
4403 
4404 
// Public entry point: simplify the boolean structure of a parsed query tree
// in place (ppRoot may be updated to a new root). pKeywords supplies keyword
// statistics used to pick the "weakest" of equal nodes during rewrites.
void sphOptimizeBoolean ( XQNode_t ** ppRoot, const ISphKeywordsStat * pKeywords )
{
#ifdef XQDEBUG
	int64_t tmDelta = sphMicroTimer();
#endif

	CSphTransformation qInfo ( ppRoot, pKeywords );
	qInfo.Transform ();

#ifdef XQDEBUG
	// report only transformations that took a noticeable time (>10 usec)
	tmDelta = sphMicroTimer() - tmDelta;
	if ( tmDelta>10 )
		printf ( "optimized boolean in %d.%03d msec", (int)(tmDelta/1000), (int)(tmDelta%1000) );
#endif
}
4420 
4421 
sphSetupQueryTokenizer(ISphTokenizer * pTokenizer)4422 void sphSetupQueryTokenizer ( ISphTokenizer * pTokenizer )
4423 {
4424 	pTokenizer->AddSpecials ( "()|-!@~\"/^$<" );
4425 	pTokenizer->AddPlainChar ( '?' );
4426 	pTokenizer->AddPlainChar ( '%' );
4427 }
4428 
4429 
4430 //
4431 // $Id$
4432 //
4433