1 #ifndef OLIGOFAR_CQUERYHASH__HPP
2 #define OLIGOFAR_CQUERYHASH__HPP
3 
4 #include "uinth.hpp"
5 #include "cquery.hpp"
6 #include "fourplanes.hpp"
7 #include "cpermutator8b.hpp"
8 #include "cbitmaskaccess.hpp"
9 #include "array_set.hpp"
10 #include "cwordhash.hpp"
11 #include "chashparam.hpp"
12 #include "cintron.hpp"
13 
14 #include <deque>
15 
16 BEGIN_OLIGOFAR_SCOPES
17 
18 ////////////////////////////////////////////////////////////////////////
19 // 0....|....1....|....2
20 // 111010111111110111111 18 bits
21 // aaaaaaaaaaa            9 bits
22 //           bbbbbbbbbbb 10 bits
23 class CHashPopulator;
24 class CQueryHash
25 {
26 public:
27     typedef array_set<Uint1> TSkipPositions;
28     typedef vector<CHashAtom> TMatchSet;
29 
30     enum EStatus {
31         eStatus_empty,
32         eStatus_dirty,
33         eStatus_ready
34     };
35 
36     enum EIndelLocation { eIndel_anywhere = 0 };
37 
38     ~CQueryHash();
39     CQueryHash( int maxm = 2, int maxa = 4 );
40 
SetHashWordBitmask(CBitmaskAccess * wbm)41     void SetHashWordBitmask( CBitmaskAccess * wbm ) { m_wbmAccess = wbm; }
42     void SetWindowSize( int winSize );
43     void SetWindowStep( int winStep );
44     void SetWindowStart( int winStart );
45     void SetWordSize( int wordSz );
46     void SetStrideSize( int stride );
47     void SetMaxMismatches( int mismatches );
48     void SetMaxAmbiguities( int ambiguities );
49     void SetMaxInsertions( int ins );
50     void SetMaxDeletions( int del );
51     void SetMaxDistance( int dist );
52     void SetIndelPosition( int pos );
53     void SetIndexBits( int bits );
54     void SetStrands( int strands );
55     void SetMaxSimplicity( double simpl );
56     void SetMaxWindowCount( int windows );
57     void SetNaHSO3mode( bool on = true );
SetHaveSplices(bool on=true)58     void SetHaveSplices( bool on = true ) { m_haveSplices = on; }
59 
60     void SetParam( const CHashParam& param );
61     void GetParam( CHashParam& dest ) const;
62 
SetExpectedReadCount(int count)63     void SetExpectedReadCount( int count ) { m_expectedCount = count; }
64 
GetHashWordBitmask() const65     CBitmaskAccess * GetHashWordBitmask() const { return m_wbmAccess; }
GetScannerWindowLength() const66     int  GetScannerWindowLength() const { return m_scannerWindowLength; }
GetMaxWindowCount() const67     int  GetMaxWindowCount() const { return m_maxWindowCount; }
GetWindowSize() const68     int  GetWindowSize() const { return m_windowSize; }
GetWindowStep() const69     int  GetWindowStep() const { return m_windowStep; }
GetWindowStart() const70     int  GetWindowStart() const { return m_windowStart; }
GetWordSize() const71     int  GetWordSize() const { return m_wordSize; }
GetWordOffset() const72     int  GetWordOffset() const { return m_wordOffset; }
GetStrideSize() const73     int  GetStrideSize() const { return m_strideSize; }
GetMaxMismatches() const74     int  GetMaxMismatches() const { return m_maxMism; }
GetMaxAmbiguities() const75     int  GetMaxAmbiguities() const { return m_maxAmb; }
GetMaxInsertions() const76     int  GetMaxInsertions() const { return m_maxIns; }
GetMaxDeletions() const77     int  GetMaxDeletions() const { return m_maxDel; }
GetMaxDistance() const78     int  GetMaxDistance() const { return m_maxDist; }
GetIndelPosition() const79     int  GetIndelPosition() const { return m_indelPos; }
GetNaHSO3mode() const80     bool GetNaHSO3mode() const { return m_nahso3mode; }
GetIndexBits() const81     int  GetIndexBits() const { return m_hashTable.GetIndexBits(); }
GetStrands() const82     int  GetStrands() const { return m_strands; }
GetExpectedReadCount() const83     int  GetExpectedReadCount() const { return m_expectedCount; }
GetHashedQueriesCount() const84     int  GetHashedQueriesCount() const { return m_hashedCount; }
GetMaxSimplicity() const85     double GetMaxSimplicity() const { return m_maxSimplicity; }
HaveSplices() const86     bool HaveSplices() const { return m_haveSplices; }
87 
GetAbsoluteMaxMism() const88     int GetAbsoluteMaxMism() const { return m_permutators.size() - 1; }
CanOptimizeOffset() const89     bool CanOptimizeOffset() const { return m_skipPositions.size() == 0 && m_haveSplices == false; }
90 
91     template<class iterator>  void SetSkipPositions( iterator begin, iterator end );
92     template<class container> void SetSkipPositions( const container& c );
93 
HasSkipPositions() const94     bool HasSkipPositions() const { return !m_skipPositions.empty(); }
GetSkipPositions() const95     const TSkipPositions& GetSkipPositions() const { return m_skipPositions; }
96 
Clear()97     void Clear() { m_status = eStatus_empty; m_hashedCount = 0; m_hashTable.Clear(); }
98     void Freeze();
99     void CheckConstraints();
100 
101     int AddQuery( CQuery * query );
102     int AddQuery( CQuery * query, int component, int position, int wordcnt = 1, int wstep = 1 );
103 
104     /* ========================================================================
105      * This code is not used and may be outdated and even have errors
106      * ========================================================================
107     template<class Callback> void ForEach4gnl( const UintH& ncbi4na, Callback& cbk ) const;
108     template<class Callback> void ForEach4( const UintH& ncbi4na, Callback& cbk ) const;
109     template<class Callback> void ForEach4( UintH hash, Callback& callback, Uint1 mask, Uint1 flags ) const;
110     template<class Callback> void ForEach4( const UintH& ncbi4na, Callback& cbk ) {
111         if( m_status == eStatus_dirty ) Freeze();
112         ForEach4gnl( ncbi4na, cbk );
113     }
114     * ======================================================================== */
115     template<class Callback> void ForEach2gnl( const Uint8& ncbi2na, Callback& cbk ) const;
116     template<class Callback> void ForEach2str( const Uint8& ncbi2na, Callback& cbk ) const;
117     template<class Callback> void ForEach2int( Uint8 hash, Callback& callback, Uint1 mask, Uint1 flags ) const;
ForEach2(const Uint8 & ncbi2na,Callback & cbk) const118     template<class Callback> void ForEach2( const Uint8& ncbi2na, Callback& cbk ) const {
119         ForEach2gnl( ncbi2na, cbk );
120     }
ForEach2(const Uint8 & ncbi2na,Callback & cbk)121     template<class Callback> void ForEach2( const Uint8& ncbi2na, Callback& cbk ) {
122         if( m_status == eStatus_dirty ) Freeze();
123         ForEach2gnl( ncbi2na, cbk );
124     }
125 
126     Uint2  ComputeHashPreallocSz() const;
127     Uint8  ComputeEntryCountPerReadWord( Uint2 w, bool removeDupes = false ) const; // no ambiguities are allowed
128     Uint8  ComputeEntryCountPerReadMmOnly( Uint2 w, int maxMm ) const;
129     Uint8  ComputeEntryCountPerRead( bool removeDupes = false ) const;
130     int    ComputeHasherWindowLength();
131     int    ComputeHasherWindowStep();
132     int    ComputeScannerWindowLength();
GetHasherWindowLength() const133     int    GetHasherWindowLength() const { return m_windowLength; }
GetHasherWindowStep() const134     int    GetHasherWindowStep() const { return m_windowStep > 0 ? m_windowStep : GetHasherWindowLength(); }
135     UintH  ComputeAlternatives( UintH w, int l ) const; // used to optimize window
136     int    ComputeAmbiguitiesNcbi8na( UintH w, int len ) const;
137     double ComputeComplexityNcbi8na( UintH w, int len, int& amb ) const;
138     double ComputeComplexityNcbi2na( Uint8 w, int len ) const;
139 
140     template<class Uint, int bpb> void CompressFwd( Uint& window ) const;
141     template<class Uint, int bpb> void CompressRev( Uint& window ) const;
142 
SetNcbipnaToNcbi4naScore(Uint2 score)143     void SetNcbipnaToNcbi4naScore( Uint2 score ) { m_ncbipnaToNcbi4naScore = score; }
SetNcbiqnaToNcbi4naScore(Uint2 score)144     void SetNcbiqnaToNcbi4naScore( Uint2 score ) { m_ncbiqnaToNcbi4naScore = score; }
145 
146 protected:
147 
148     template<class Callback>
149     class C_ScanCallback
150     {
151     public:
C_ScanCallback(const CQueryHash & caller,Callback & cbk,Uint1 mask,Uint1 flags)152         C_ScanCallback( const CQueryHash& caller, Callback& cbk, Uint1 mask, Uint1 flags ) :
153             m_caller( caller ), m_callback( cbk ), m_mask( mask ), m_flags( flags ) {}
154         void operator () ( Uint8 hash, int mism, unsigned amb ) const;
155     protected:
156         const CQueryHash& m_caller;
157         Callback& m_callback;
158         Uint1 m_mask;
159         Uint1 m_flags;
160     };
161 
162     class C_ListInserter
163     {
164     public:
C_ListInserter(TMatchSet & ms)165         C_ListInserter( TMatchSet& ms ) : m_matchSet( ms ) {}
operator ()(const CHashAtom & atom)166         void operator () ( const CHashAtom& atom ) { m_matchSet.push_back( atom ); }
167     protected:
168         TMatchSet& m_matchSet;
169     };
170 
171     typedef UintH TCvt( const unsigned char *, int, unsigned short );
172     typedef Uint2 TDecr( Uint2 );
173 
174     bool CheckWordConstraints();
175     bool CheckWordConstraints() const;
176 
177     int  x_AddQuery( CQuery * query, int component );
178     int  x_AddQuery( CQuery * query, int component, int off, int maxWinCnt, int wstep );
179     int  PopulateWindow( CHashPopulator& dest, UintH fwindow, int offset, int ambiguities );
180     void PopulateInsertion2( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
181     void PopulateInsertion1( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
182     void PopulateInsertions( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow );
183     void PopulateDeletions ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow );
184     void PopulateDeletion2 ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
185     void PopulateDeletion1 ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
186     int  GetNcbi4na( UintH& window, CSeqCoding::ECoding, const unsigned char * data, unsigned length );
187 
188     int x_GetNcbi4na_ncbi8na( UintH& window, const unsigned char * data, unsigned length );
189     int x_GetNcbi4na_colorsp( UintH& window, const unsigned char * data, unsigned length );
190     int x_GetNcbi4na_ncbipna( UintH& window, const unsigned char * data, unsigned length );
191     int x_GetNcbi4na_ncbiqna( UintH& window, const unsigned char * data, unsigned length );
192     int x_GetNcbi4na_quality( UintH& window, const unsigned char * data, unsigned length, TCvt * fun, int incr, unsigned short score, TDecr * decr );
193 
x_UpdateNcbipnaScore(Uint2 score)194     static Uint2 x_UpdateNcbipnaScore( Uint2 score ) { return score /= 2; }
x_UpdateNcbiqnaScore(Uint2 score)195     static Uint2 x_UpdateNcbiqnaScore( Uint2 score ) { return score - 1; }
196     static TCvt  x_Ncbipna2Ncbi4na;
197     static TCvt  x_Ncbiqna2Ncbi4na;
198 
199     //static Uint8 x_Convert2( Uint8 word, unsigned len, Uint8 from, Uint8 to );
200     static Uint8 x_Convert2( Uint8 word, unsigned len, const unsigned char * tbl );
201     static UintH x_Convert4( UintH word, unsigned len, UintH mask, int shr );
202     static UintH x_Convert4tc( UintH word, unsigned len );
203     static UintH x_Convert4ag( UintH word, unsigned len );
204 
205 //    template<class Callback> void ForEach( Uint8 ncbi2na, Callback& cbk, Uint1 mask, Uint1 flags );
206 
207 protected:
208 
209     EStatus m_status;
210     int m_windowLength;
211     int m_windowStep;
212     int m_windowStart;
213     int m_scannerWindowLength;
214     int m_windowSize;
215     int m_strideSize;
216     int m_wordSize;
217     int m_maxWindowCount;
218     int m_strands;
219     int m_expectedCount;
220     int m_hashedCount;
221     int m_maxAmb;
222     int m_maxMism;
223     int m_maxIns;
224     int m_maxDel;
225     int m_maxDist;
226     int m_indelPos;
227     bool m_nahso3mode;
228     bool m_haveSplices;
229     double m_maxSimplicity;
230     Uint2 m_ncbipnaToNcbi4naScore;
231     Uint2 m_ncbiqnaToNcbi4naScore;
232     TSkipPositions m_skipPositions;
233     CWordHash m_hashTable;
234     vector<CPermutator8b*> m_permutators;
235     // all masks are ncbi2na, i.e. 2 bits per base
236     Uint8 m_wordMask;
237     int m_wordOffset;
238     CBitmaskAccess * m_wbmAccess;
239 
240     mutable Uint8 m_hashPopulatorReverve;
241     mutable TMatchSet m_listA, m_listB; // to keep memory space reserved between ForEach calls
242     static const unsigned char * s_convert2tc;
243     static const unsigned char * s_convert2ag;
244     static const unsigned char * s_convert2eq;
245 };
246 
247 ////////////////////////////////////////////////////////////////////////
248 // inline implementation follows
249 
~CQueryHash()250 inline CQueryHash::~CQueryHash()
251 {
252     for( int i = 0; i < int( m_permutators.size() ); ++i )
253         delete m_permutators[i];
254 }
255 
CQueryHash(int maxm,int maxa)256 inline CQueryHash::CQueryHash( int maxm, int maxa ) :
257     m_status( eStatus_empty ),
258     m_windowLength( 22 ),
259     m_windowStep( 1 ),
260     m_windowStart( 0 ),
261     m_scannerWindowLength( 22 ),
262     m_windowSize( 22 ),
263     m_strideSize( 1 ),
264     m_wordSize( 11 ),
265     m_maxWindowCount( 1 ),
266     m_strands( 3 ),
267     m_expectedCount( 1000000 ),
268     m_hashedCount( 0 ),
269     m_maxAmb( maxa ),
270     m_maxMism( min(1, maxm) ),
271     m_maxIns( 0 ),
272     m_maxDel( 0 ),
273     m_maxDist( 0 ),
274     m_indelPos( eIndel_anywhere ),
275     m_nahso3mode( false ),
276     m_haveSplices( false ),
277     m_maxSimplicity( 2.0 ),
278     m_ncbipnaToNcbi4naScore( 0x7f ),
279     m_ncbiqnaToNcbi4naScore( 3 ),
280     m_hashTable( m_wordSize*2 ),
281     m_wordMask( CBitHacks::WordFootprint<Uint8>( m_wordSize*2 ) ),
282     m_wordOffset( m_windowSize - m_wordSize ),
283     m_wbmAccess( 0 )
284 {
285     SetMaxMismatches( maxm );
286 }
287 
288 template<class iterator>
SetSkipPositions(iterator begin,iterator end)289 inline void CQueryHash::SetSkipPositions( iterator begin, iterator end )
290 {
291     m_skipPositions.clear();
292     for( ; begin != end ; ++begin ) {
293         if( *begin <= 0 ) continue;
294         m_skipPositions.insert( *begin );
295     }
296     if( m_maxWindowCount > 1 && m_skipPositions.size() )
297         THROW( logic_error, "Skip positions are not supported with max word count > 1" );
298     ComputeScannerWindowLength();
299 }
300 
301 template<class container>
SetSkipPositions(const container & c)302 inline void CQueryHash::SetSkipPositions( const container& c )
303 {
304     SetSkipPositions( c.begin(), c.end() );
305 }
306 
SetWindowSize(int winSize)307 inline void CQueryHash::SetWindowSize( int winSize )
308 {
309     ASSERT( m_status == eStatus_empty );
310     m_windowSize = winSize;
311     if( m_wordSize > m_windowSize ) {
312         m_wordSize = m_windowSize;
313         m_wordMask = CBitHacks::WordFootprint<Uint8>( 2*m_wordSize );
314         m_wordOffset = 0;
315     } else m_wordOffset = m_windowSize - m_wordSize;
316 //    ComputeScannerWindowLength();
317 }
318 
SetWindowStep(int step)319 inline void CQueryHash::SetWindowStep( int step )
320 {
321     ASSERT( m_status == eStatus_empty );
322     ASSERT( step >= 0 );
323     m_windowStep = step;
324 }
325 
SetWindowStart(int start)326 inline void CQueryHash::SetWindowStart( int start )
327 {
328     ASSERT( m_status == eStatus_empty );
329     ASSERT( start >= 0 );
330     m_windowStart = start;
331 }
332 
SetMaxWindowCount(int winCnt)333 inline void CQueryHash::SetMaxWindowCount( int winCnt )
334 {
335     ASSERT( m_status == eStatus_empty );
336     m_maxWindowCount = winCnt;
337     if( m_maxWindowCount > 1 && m_skipPositions.size() )
338         THROW( logic_error, "Skip positions are not supported with max word count > 1" );
339     m_wordOffset = m_windowSize - m_wordSize;
340 }
341 
SetWordSize(int wordSize)342 inline void CQueryHash::SetWordSize( int wordSize )
343 {
344     ASSERT( m_status == eStatus_empty );
345     m_wordSize = wordSize;
346     m_wordMask = CBitHacks::WordFootprint<Uint8>( 2*m_wordSize );
347     if( m_wordSize > m_windowSize ) m_windowSize = m_wordSize;
348     m_wordOffset = m_windowSize - m_wordSize;
349     ComputeScannerWindowLength();
350 }
351 
AddQuery(CQuery * query)352 inline int CQueryHash::AddQuery( CQuery * query )
353 {
354     if( m_status == eStatus_empty ) CheckWordConstraints();
355 
356     m_status = eStatus_dirty;
357     int ret = x_AddQuery( query, 0 );
358     if( query->HasComponent( 1 ) ) ret += x_AddQuery( query, 1 );
359     if( ret ) ++m_hashedCount;
360     return ret;
361 }
362 
AddQuery(CQuery * query,int component,int woffset,int wcount,int wstep)363 inline int CQueryHash::AddQuery( CQuery * query, int component, int woffset, int wcount, int wstep )
364 {
365     if( m_status == eStatus_empty ) CheckWordConstraints();
366     m_status = eStatus_dirty;
367 
368     return x_AddQuery( query, component, woffset, wcount, wstep );
369 }
370 
371 
SetStrideSize(int stride)372 inline void CQueryHash::SetStrideSize( int stride )
373 {
374     ASSERT( m_status == eStatus_empty );
375     m_strideSize = stride;
376 }
377 
SetMaxMismatches(int mismatches)378 inline void CQueryHash::SetMaxMismatches( int mismatches )
379 {
380     ASSERT( m_status == eStatus_empty );
381     if( int(m_permutators.size()) <= mismatches ) {
382         int oldsz = m_permutators.size();
383         m_permutators.resize( mismatches + 1 );
384         for( int i = oldsz; i <= mismatches; ++i )
385             m_permutators[i] = new CPermutator8b( i, m_nahso3mode ? 32 : m_maxAmb );
386     }
387     m_maxMism = mismatches;
388     if( m_maxDist < mismatches ) m_maxDist = mismatches;
389 }
390 
SetMaxAmbiguities(int ambiguities)391 inline void CQueryHash::SetMaxAmbiguities( int ambiguities )
392 {
393     ASSERT( m_status == eStatus_empty );
394     unsigned amb = m_nahso3mode ? 32 : ambiguities;
395     m_maxAmb = ambiguities;
396     for( unsigned i = 0; i < m_permutators.size(); ++i ) {
397         if( m_permutators[i]->GetMaxAmbiguities() != amb ) {
398             delete m_permutators[i];
399             m_permutators[i] = new CPermutator8b( i, m_maxAmb );
400         }
401     }
402 }
403 
SetMaxInsertions(int ins)404 inline void CQueryHash::SetMaxInsertions( int ins )
405 {
406     ASSERT( m_status == eStatus_empty );
407     m_maxIns = ins;
408     if( m_maxDist < ins ) m_maxDist = ins;
409 }
410 
SetMaxDeletions(int del)411 inline void CQueryHash::SetMaxDeletions( int del )
412 {
413     ASSERT( m_status == eStatus_empty );
414     m_maxDel = del;
415     if( m_maxDist < del ) m_maxDist = del;
416 }
417 
SetMaxDistance(int dist)418 inline void CQueryHash::SetMaxDistance( int dist )
419 {
420     ASSERT( m_status == eStatus_empty );
421     m_maxDist = max( max( m_maxIns, m_maxDel ), max( m_maxMism, dist ) );
422 }
423 
SetIndelPosition(int pos)424 inline void CQueryHash::SetIndelPosition( int pos )
425 {
426     ASSERT( m_status == eStatus_empty );
427     m_indelPos = pos;
428 }
429 
SetNaHSO3mode(bool on)430 inline void CQueryHash::SetNaHSO3mode( bool on )
431 {
432     ASSERT( m_status == eStatus_empty );
433     m_nahso3mode = on;
434     SetMaxAmbiguities( GetMaxAmbiguities() );
435 }
436 
SetIndexBits(int bits)437 inline void CQueryHash::SetIndexBits( int bits )
438 {
439     ASSERT( m_status == eStatus_empty );
440     m_hashTable.SetIndexBits( bits );
441 }
442 
SetStrands(int strands)443 inline void CQueryHash::SetStrands( int strands )
444 {
445     ASSERT( m_status == eStatus_empty );
446     m_strands = strands;
447 }
448 
SetMaxSimplicity(double simpl)449 inline void CQueryHash::SetMaxSimplicity( double simpl )
450 {
451     ASSERT( m_status == eStatus_empty );
452     m_maxSimplicity = simpl;
453 }
454 
ComputeEntryCountPerReadMmOnly(Uint2 w,int maxMm) const455 inline Uint8 CQueryHash::ComputeEntryCountPerReadMmOnly( Uint2 w, int maxMm ) const
456 {
457     Uint8 ret = 1;
458     for( int i = 0; i < maxMm; ++i ) ret *= (w - i)*3;
459     return ret;
460 }
461 
ComputeEntryCountPerReadWord(Uint2 w,bool removeDupes) const462 inline Uint8 CQueryHash::ComputeEntryCountPerReadWord( Uint2 w, bool removeDupes ) const
463 {
464     Uint8 ret = ComputeEntryCountPerReadMmOnly( w, GetMaxMismatches() );
465     if( GetMaxInsertions() == 2 ) {
466         Uint8 i2 = 16;
467         if( m_indelPos == eIndel_anywhere ) {
468             i2 *= (w - 2);
469             if( removeDupes ) i2 -= (w - 3)*4 + 1;
470         }
471         i2 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 2 ) );
472         ret += i2;
473     }
474     if( GetMaxInsertions() ) {
475         Uint8 i1 = 4;
476         if( m_indelPos == eIndel_anywhere ) {
477             i1 *= (w - 1);
478             if( removeDupes ) i1 -= (w - 2) + 1;
479         }
480         i1 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 1 ) );
481         ret += i1;
482     }
483     if( GetMaxDeletions() == 2 ) {
484         Uint8 d2 = 1;
485         if( m_indelPos == eIndel_anywhere ) d2 *= (w - 1);
486         d2 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 2 ) );
487         ret += d2;
488     }
489     if( GetMaxDeletions() ) {
490         Uint8 d1 = 1;
491         if( m_indelPos == eIndel_anywhere ) d1 *= w;
492         d1 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 1 ) );
493         ret += d1;
494     }
495     return ret;
496 }
497 
ComputeEntryCountPerRead(bool removeDupes) const498 inline Uint8 CQueryHash::ComputeEntryCountPerRead( bool removeDupes ) const
499 {
500     Uint8 ret = ComputeEntryCountPerReadWord( m_wordSize/*m_windowSize + m_strideSize - 1*/, removeDupes ); // TODO: check what is used an wsize
501     if( m_wordOffset ) ret *= 2; // for each word
502     if( (m_strands&3) == 3 ) ret *= 2; // for each strand
503     ret *= m_maxWindowCount;
504     return ret;
505 }
506 
ComputeHasherWindowLength()507 inline int CQueryHash::ComputeHasherWindowLength()
508 {
509     ASSERT( m_strideSize == 1 || m_skipPositions.size() == 0 );
510     m_windowLength = m_windowSize + m_strideSize - 1 + GetMaxDeletions();
511     ITERATE( TSkipPositions, p, m_skipPositions ) {
512         if( *p <= m_windowLength ) ++m_windowLength; // m_skipPositions are sorted, unique and > 0 (1-based)
513         else break;
514     }
515     return m_windowLength;
516 }
517 
ComputeHasherWindowStep()518 inline int CQueryHash::ComputeHasherWindowStep()
519 {
520     return m_windowStep > 0 ? m_windowStep : ComputeHasherWindowLength();
521 }
522 
ComputeScannerWindowLength()523 inline int CQueryHash::ComputeScannerWindowLength()
524 {
525     ASSERT( m_strideSize == 1 || m_skipPositions.size() == 0 );
526     int winlen = m_windowSize; // indel and strides are taken into account in hash
527     ITERATE( TSkipPositions, p, m_skipPositions ) { // just extend window to allow skipping
528         if( *p <= winlen ) ++winlen; // m_skipPositions are sorted, unique and > 0 (1-based)
529         else break;
530     }
531     return m_scannerWindowLength = winlen;
532 }
533 
ComputeHashPreallocSz() const534 inline Uint2 CQueryHash::ComputeHashPreallocSz() const{ return 1; }
535 
CheckWordConstraints()536 inline bool CQueryHash::CheckWordConstraints()
537 {
538     ComputeHasherWindowLength();
539     ComputeScannerWindowLength();
540     m_hashTable.SetPreallocSz( ComputeHashPreallocSz() );
541     m_hashPopulatorReverve = ComputeEntryCountPerRead();
542     return ((const CQueryHash*)this)->CQueryHash::CheckWordConstraints();
543 }
544 
CheckWordConstraints() const545 inline bool CQueryHash::CheckWordConstraints() const
546 {
547     ASSERT( m_skipPositions.size() == 0 || m_strideSize == 1 );
548 //    ASSERT( m_strideSize < m_wordSize );
549 //    ASSERT( m_strideSize < m_wordOffset + 1|| m_wordOffset == 0 );
550     ASSERT( m_wordSize <= m_windowSize );
551     ASSERT( m_wordSize * 2 >= m_windowSize );
552     ASSERT( m_wordSize * 2 - m_hashTable.GetIndexBits() <= 16 ); // requirement is based on that CHashAtom may store only 16 bits
553     ASSERT( m_windowLength <= 32 );
554     return true;
555 }
556 
557 template <typename Uint, int bpb>
CompressFwd(Uint & w) const558 inline void CQueryHash::CompressFwd( Uint& w ) const
559 {
560     int x = 0;
561     ITERATE( TSkipPositions, p, m_skipPositions ) {
562         int pp = *p - m_windowStart;
563         if( pp < 0 ) continue;
564         if( pp > x ) w = CBitHacks::DeleteBits<Uint,bpb>( w, pp - ++x );
565         if( pp > m_windowLength ) break;
566     }
567 }
568 
569 template <typename Uint, int bpb>
CompressRev(Uint & w) const570 void CQueryHash::CompressRev( Uint& w ) const
571 {
572     int x = m_scannerWindowLength;
573     ITERATE( TSkipPositions, p, m_skipPositions ) {
574         int pp = *p - m_windowStart;
575         if( pp < 0 ) continue;
576         if( pp < x ) w = CBitHacks::DeleteBits<Uint,bpb>( w, x - pp );
577         if( pp > m_windowLength ) break;
578     }
579 }
580 
581 #if 0
582 template<class Callback>
583 void CQueryHash::ForEach4gnl( const UintH& hash, Callback& callback ) const
584 {
585     if( m_nahso3mode ) {
586         ForEach4<Callback>( this->x_Convert4tc( hash, m_windowSize ), callback );
587         ForEach4<Callback>( this->x_Convert4ag( hash, m_windowSize ), callback );
588     } else {
589         ForEach4<Callback>( hash, callback );
590     }
591 }
592 
593 template<class Callback>
594 void CQueryHash::ForEach4( const UintH& hash, Callback& callback ) const
595 {
596     /* ........................................................................
597        We change strategy here: instead of putting logic on varying hash window
598        by seqscanner, we to it in hash - so that hash is a black box
599 
600        Remember, that scanner WindowLength = either
601         (a) WindowSize + 0 * (StrideSize + IndelAllowed)
602        or
603         (b) WindowSize + (number of skip positions - self conjugated)
604        because it is not allowed to have strides and skip positions
605 
606        That's the window scanner should use.
607        ........................................................................ */
608     ASSERT( m_status != eStatus_dirty );
609 
610     if( m_strands == 3 & !HasSkipPositions() )
611         ForEach4( hash, callback, 0, 0 );
612     else {
613         if( m_strands & 1 ) {
614             UintH _hash = hash;
615             if( HasSkipPositions() ) CompressFwd<UintH,4>( _hash );
616             ForEach4( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandFwd );
617         }
618         if( m_strands & 2 ) {
619             UintH _hash = hash;
620             if( HasSkipPositions() ) CompressRev<UintH,4>( _hash );
621             ForEach4( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandRev );
622         }
623     }
624 }
625 template<class Callback>
626 void CQueryHash::ForEach4( UintH hash, Callback& callback, Uint1 mask, Uint1 flags ) const
627 {
628     THROW( logic_error, "ForEach4 should be modified in the way ForEach2 works in " << __FILE__ << ":" << __LINE__ );
629     //Compress4fwd( hash );
630     C_ScanCallback<Callback> cbk( *this, callback, mask, flags );
631     m_permutators[0]->ForEach( m_windowSize, hash, cbk );
632 }
633 
634 #endif
635 template<class Callback>
ForEach2gnl(const Uint8 & hash,Callback & callback) const636 void CQueryHash::ForEach2gnl( const Uint8& hash, Callback& callback ) const
637 {
638     if( m_nahso3mode ) {
639         ForEach2str<Callback>( x_Convert2( hash, m_windowSize, s_convert2tc ), callback );
640         ForEach2str<Callback>( x_Convert2( hash, m_windowSize, s_convert2ag ), callback );
641     } else {
642         ForEach2str<Callback>( hash, callback );
643     }
644 }
645 
646 template<class Callback>
ForEach2str(const Uint8 & hash,Callback & callback) const647 void CQueryHash::ForEach2str( const Uint8& hash, Callback& callback ) const
648 {
649     /* ........................................................................
650        We change strategy here: instead of putting logic on varying hash window
651        by seqscanner, we to it in hash - so that hash is a black box
652 
653        Remember, that scanner WindowLength = either
654         (a) WindowSize + 0 * (StrideSize + IndelAllowed)
655        or
656         (b) WindowSize + (number of skip positions - self conjugated)
657        because it is not allowed to have strides and skip positions
658 
659        That's the window scanner should use.
660        ........................................................................ */
661     ASSERT( m_status != eStatus_dirty );
662 
663     if( m_strands == 3 && !HasSkipPositions() ) {
664         ForEach2int( hash, callback, 0, 0 );
665     } else {
666         if( m_strands & 1 ) {
667             Uint8 _hash = hash;
668             if( HasSkipPositions() ) CompressFwd<Uint8,2>( _hash );
669             ForEach2int( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandFwd );
670         }
671         if( m_strands & 2 ) {
672             Uint8 _hash = hash;
673             if( HasSkipPositions() ) CompressRev<Uint8,2>( _hash );
674             ForEach2int( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandRev );
675         }
676     }
677 }
678 
679 template<class Callback>
operator ()(Uint8 hash,int,unsigned) const680 void CQueryHash::C_ScanCallback<Callback>::operator () ( Uint8 hash, int, unsigned ) const
681 {
682     m_caller.ForEach2( hash, m_callback, m_mask, m_flags );
683 }
684 
685 template<class Callback>
ForEach2int(Uint8 hash,Callback & callback,Uint1 mask,Uint1 flags) const686 void CQueryHash::ForEach2int( Uint8 hash, Callback& callback, Uint1 mask, Uint1 flags ) const
687 {
688     if( GetWordOffset() == 0 ) {
689         m_hashTable.ForEach( hash, callback, mask, flags );
690     } else {
691         mask |= CHashAtom::fMask_wordId;
692 
693         Uint1 flags0 = flags | CHashAtom::fFlag_wordId0;
694         Uint1 flags1 = flags | CHashAtom::fFlag_wordId1;
695 
696         TMatchSet& listA( m_listA );
697         TMatchSet& listB( m_listB );
698 
699         listA.resize(0);
700         listB.resize(0);
701 
702         Uint8 hashA = Uint8( hash >> ( GetWordOffset() * 2 ) );
703         Uint8 hashB = Uint8( hash & CBitHacks::WordFootprint<Uint8>( 2*GetWordSize() ) ); //( ( Uint8(1) << ( 2 * GetWordSize() )) - 1 ) );
704 
705         C_ListInserter iA( listA );
706         C_ListInserter iB( listB );
707 
708         m_hashTable.ForEach( hashA, iA, mask, flags0);
709         m_hashTable.ForEach( hashB, iB, mask, flags1);
710 
711         if( listA.size() == 0 || listB.size() == 0 ) return;
712 
713         sort( listA.begin(), listA.end(), CHashAtom::LessQueryCoiterate );
714         sort( listB.begin(), listB.end(), CHashAtom::LessQueryCoiterate );
715 
716         /*
717         cerr << __FUNCTION__ << DISPLAY( listA.size() ) << DISPLAY( listB.size() ) << "\n";
718         ITERATE( TMatchSet, a, listA ) cerr << "A: " << *a << "\n";
719         ITERATE( TMatchSet, b, listB ) cerr << "B: " << *b << "\n";
720         */
721 
722         TMatchSet::const_iterator a = listA.begin(), A = listA.end();
723         TMatchSet::const_iterator b = listB.begin(), B = listB.end();
724 
725         while( a != A && b != B ) {
726 
727             if( CHashAtom::LessQueryCoiterate( *a, *b ) ) { ++a; continue; }
728             if( CHashAtom::LessQueryCoiterate( *b, *a ) ) { ++b; continue; }
729 
730             //callback( a->GetStrandId() == CHashAtom::fFlag_strandFwd ? *b : *a ); // one time is enough
731             CHashAtom w( *a, *b );
732             callback( w );
733 
734             TMatchSet::const_iterator x = a;
735 
736             do if( ! ( ++a < A ) ) return; while( *a == *x );
737             do if( ! ( ++b < B ) ) return; while( *b == *x );
738         }
739     }
740 }
741 
742 /*
743 inline Uint8 CQueryHash::x_Convert2( Uint8 word, unsigned len, Uint8 from, Uint8 to )
744 {
745     Uint8 mask = 3;
746     for( unsigned i = 0; i < len; ++i, (mask <<= 2) ) {
747         if( ( word & mask ) == ( from & mask ) ) {
748             word = (word & ~mask) | ( to & mask );
749         }
750     }
751     return word;
752 }
753 */
754 
x_Convert2(Uint8 word,unsigned len,const unsigned char * tbl)755 inline Uint8 CQueryHash::x_Convert2( Uint8 word, unsigned len, const unsigned char * tbl )
756 {
757     //THROW( logic_error, "Please check usage of this function!!! it is not supposed to be used at all (2009.05.04)" );
758     Uint8 ret = 0;
759     for( int i = 0, I = len*2; i < I; i += 8 )
760         ret |= tbl[(unsigned char)(word >> i)] << i;
761     return ret;
762 }
763 
x_Convert4(UintH word,unsigned len,UintH mask,int shr)764 inline UintH CQueryHash::x_Convert4( UintH word, unsigned len, UintH mask, int shr )
765 {
766     //THROW( logic_error, "Please check usage of this function!!! it is not supposed to be used at all (2009.05.04)" );
767     if( shr > 0 )
768         word = (( word & mask ) >> (+shr) ) | ( word & ~mask );
769     else
770         word = (( word & mask ) << (-shr) ) | ( word & ~mask );
771     return word;
772 }
773 
x_Convert4tc(UintH word,unsigned len)774 inline UintH CQueryHash::x_Convert4tc( UintH word, unsigned len )
775 {
776     static UintH allT( 0x8888888888888888ULL, 0x8888888888888888ULL );
777     return (( word & allT ) >> 2 ) | ( word & ~allT );
778 }
779 
x_Convert4ag(UintH word,unsigned len)780 inline UintH CQueryHash::x_Convert4ag( UintH word, unsigned len )
781 {
782     static UintH allA( 0x1111111111111111ULL, 0x1111111111111111ULL );
783     return (( word & allA ) << 2 ) | ( word & ~allA );
784 }
785 
SetParam(const CHashParam & param)786 inline void CQueryHash::SetParam( const CHashParam& param )
787 {
788     SetWordSize( param.GetWordSize() );
789     SetWindowSize( param.GetWindowSize() );
790     SetWindowStep( param.GetWindowStep() );
791     SetWindowStart( param.GetWindowStart() );
792     SetMaxWindowCount( param.GetWindowCount() );
793     SetStrideSize( param.GetStrideSize() );
794     SetIndexBits( param.GetHashBits() );
795     SetMaxMismatches( param.GetHashMismatches() );
796     SetMaxInsertions( param.GetHashInsertions() );
797     SetMaxDeletions( param.GetHashDeletions() );
798     SetMaxDistance( param.GetHashMaxDistance() );
799     SetIndelPosition( param.GetHashIndelPosition() );
800     SetSkipPositions( param.GetSkipPositions() );
801     SetNcbipnaToNcbi4naScore( Uint2( 255 * pow( 10.0, double( param.GetPhrapCutoff())/10) ) );
802     SetNcbiqnaToNcbi4naScore( param.GetPhrapCutoff() );
803     SetMaxSimplicity( param.GetMaxSimplicity() );
804     if( m_wbmAccess ) m_wbmAccess->SetRqWordSize( param.GetWindowSize() );
805     ComputeHasherWindowLength();
806     CheckConstraints();
807 }
808 
GetParam(CHashParam & param) const809 inline void CQueryHash::GetParam( CHashParam& param ) const
810 {
811     param.SetWindowSize( GetWindowSize() );
812     param.SetWindowStep( GetWindowStep() );
813     param.SetWindowStart( GetWindowStart() );
814     param.SetWindowCount( GetMaxWindowCount() );
815     param.SetStrideSize( GetStrideSize() );
816     param.SetWordSize( GetWordSize() );
817     param.SetHashBits( GetIndexBits() );
818     param.SetHashMismatches( GetMaxMismatches() );
819     param.SetHashInsertions( GetMaxInsertions() );
820     param.SetHashDeletions( GetMaxDeletions() );
821     param.SetMaxHashDistance( GetMaxDistance() );
822     param.SetHashIndelPosition( GetIndelPosition() );
823     param.SetSkipPositions( GetSkipPositions() );
824     param.SetPhrapCutoff( m_ncbiqnaToNcbi4naScore );
825     param.SetMaxSimplicity( m_maxSimplicity );
826 }
827 
828 END_OLIGOFAR_SCOPES
829 
830 #endif
831