1 #ifndef OLIGOFAR_CQUERYHASH__HPP
2 #define OLIGOFAR_CQUERYHASH__HPP
3
4 #include "uinth.hpp"
5 #include "cquery.hpp"
6 #include "fourplanes.hpp"
7 #include "cpermutator8b.hpp"
8 #include "cbitmaskaccess.hpp"
9 #include "array_set.hpp"
10 #include "cwordhash.hpp"
11 #include "chashparam.hpp"
12 #include "cintron.hpp"
13
14 #include <deque>
15
16 BEGIN_OLIGOFAR_SCOPES
17
18 ////////////////////////////////////////////////////////////////////////
19 // 0....|....1....|....2
20 // 111010111111110111111 18 bits
21 // aaaaaaaaaaa 9 bits
22 // bbbbbbbbbbb 10 bits
23 class CHashPopulator;
24 class CQueryHash
25 {
26 public:
27 typedef array_set<Uint1> TSkipPositions;
28 typedef vector<CHashAtom> TMatchSet;
29
30 enum EStatus {
31 eStatus_empty,
32 eStatus_dirty,
33 eStatus_ready
34 };
35
36 enum EIndelLocation { eIndel_anywhere = 0 };
37
38 ~CQueryHash();
39 CQueryHash( int maxm = 2, int maxa = 4 );
40
SetHashWordBitmask(CBitmaskAccess * wbm)41 void SetHashWordBitmask( CBitmaskAccess * wbm ) { m_wbmAccess = wbm; }
42 void SetWindowSize( int winSize );
43 void SetWindowStep( int winStep );
44 void SetWindowStart( int winStart );
45 void SetWordSize( int wordSz );
46 void SetStrideSize( int stride );
47 void SetMaxMismatches( int mismatches );
48 void SetMaxAmbiguities( int ambiguities );
49 void SetMaxInsertions( int ins );
50 void SetMaxDeletions( int del );
51 void SetMaxDistance( int dist );
52 void SetIndelPosition( int pos );
53 void SetIndexBits( int bits );
54 void SetStrands( int strands );
55 void SetMaxSimplicity( double simpl );
56 void SetMaxWindowCount( int windows );
57 void SetNaHSO3mode( bool on = true );
SetHaveSplices(bool on=true)58 void SetHaveSplices( bool on = true ) { m_haveSplices = on; }
59
60 void SetParam( const CHashParam& param );
61 void GetParam( CHashParam& dest ) const;
62
SetExpectedReadCount(int count)63 void SetExpectedReadCount( int count ) { m_expectedCount = count; }
64
GetHashWordBitmask() const65 CBitmaskAccess * GetHashWordBitmask() const { return m_wbmAccess; }
GetScannerWindowLength() const66 int GetScannerWindowLength() const { return m_scannerWindowLength; }
GetMaxWindowCount() const67 int GetMaxWindowCount() const { return m_maxWindowCount; }
GetWindowSize() const68 int GetWindowSize() const { return m_windowSize; }
GetWindowStep() const69 int GetWindowStep() const { return m_windowStep; }
GetWindowStart() const70 int GetWindowStart() const { return m_windowStart; }
GetWordSize() const71 int GetWordSize() const { return m_wordSize; }
GetWordOffset() const72 int GetWordOffset() const { return m_wordOffset; }
GetStrideSize() const73 int GetStrideSize() const { return m_strideSize; }
GetMaxMismatches() const74 int GetMaxMismatches() const { return m_maxMism; }
GetMaxAmbiguities() const75 int GetMaxAmbiguities() const { return m_maxAmb; }
GetMaxInsertions() const76 int GetMaxInsertions() const { return m_maxIns; }
GetMaxDeletions() const77 int GetMaxDeletions() const { return m_maxDel; }
GetMaxDistance() const78 int GetMaxDistance() const { return m_maxDist; }
GetIndelPosition() const79 int GetIndelPosition() const { return m_indelPos; }
GetNaHSO3mode() const80 bool GetNaHSO3mode() const { return m_nahso3mode; }
GetIndexBits() const81 int GetIndexBits() const { return m_hashTable.GetIndexBits(); }
GetStrands() const82 int GetStrands() const { return m_strands; }
GetExpectedReadCount() const83 int GetExpectedReadCount() const { return m_expectedCount; }
GetHashedQueriesCount() const84 int GetHashedQueriesCount() const { return m_hashedCount; }
GetMaxSimplicity() const85 double GetMaxSimplicity() const { return m_maxSimplicity; }
HaveSplices() const86 bool HaveSplices() const { return m_haveSplices; }
87
GetAbsoluteMaxMism() const88 int GetAbsoluteMaxMism() const { return m_permutators.size() - 1; }
CanOptimizeOffset() const89 bool CanOptimizeOffset() const { return m_skipPositions.size() == 0 && m_haveSplices == false; }
90
91 template<class iterator> void SetSkipPositions( iterator begin, iterator end );
92 template<class container> void SetSkipPositions( const container& c );
93
HasSkipPositions() const94 bool HasSkipPositions() const { return !m_skipPositions.empty(); }
GetSkipPositions() const95 const TSkipPositions& GetSkipPositions() const { return m_skipPositions; }
96
Clear()97 void Clear() { m_status = eStatus_empty; m_hashedCount = 0; m_hashTable.Clear(); }
98 void Freeze();
99 void CheckConstraints();
100
101 int AddQuery( CQuery * query );
102 int AddQuery( CQuery * query, int component, int position, int wordcnt = 1, int wstep = 1 );
103
104 /* ========================================================================
105 * This code is not used and may be outdated and even have errors
106 * ========================================================================
107 template<class Callback> void ForEach4gnl( const UintH& ncbi4na, Callback& cbk ) const;
108 template<class Callback> void ForEach4( const UintH& ncbi4na, Callback& cbk ) const;
109 template<class Callback> void ForEach4( UintH hash, Callback& callback, Uint1 mask, Uint1 flags ) const;
110 template<class Callback> void ForEach4( const UintH& ncbi4na, Callback& cbk ) {
111 if( m_status == eStatus_dirty ) Freeze();
112 ForEach4gnl( ncbi4na, cbk );
113 }
114 * ======================================================================== */
115 template<class Callback> void ForEach2gnl( const Uint8& ncbi2na, Callback& cbk ) const;
116 template<class Callback> void ForEach2str( const Uint8& ncbi2na, Callback& cbk ) const;
117 template<class Callback> void ForEach2int( Uint8 hash, Callback& callback, Uint1 mask, Uint1 flags ) const;
ForEach2(const Uint8 & ncbi2na,Callback & cbk) const118 template<class Callback> void ForEach2( const Uint8& ncbi2na, Callback& cbk ) const {
119 ForEach2gnl( ncbi2na, cbk );
120 }
ForEach2(const Uint8 & ncbi2na,Callback & cbk)121 template<class Callback> void ForEach2( const Uint8& ncbi2na, Callback& cbk ) {
122 if( m_status == eStatus_dirty ) Freeze();
123 ForEach2gnl( ncbi2na, cbk );
124 }
125
126 Uint2 ComputeHashPreallocSz() const;
127 Uint8 ComputeEntryCountPerReadWord( Uint2 w, bool removeDupes = false ) const; // no ambiguities are allowed
128 Uint8 ComputeEntryCountPerReadMmOnly( Uint2 w, int maxMm ) const;
129 Uint8 ComputeEntryCountPerRead( bool removeDupes = false ) const;
130 int ComputeHasherWindowLength();
131 int ComputeHasherWindowStep();
132 int ComputeScannerWindowLength();
GetHasherWindowLength() const133 int GetHasherWindowLength() const { return m_windowLength; }
GetHasherWindowStep() const134 int GetHasherWindowStep() const { return m_windowStep > 0 ? m_windowStep : GetHasherWindowLength(); }
135 UintH ComputeAlternatives( UintH w, int l ) const; // used to optimize window
136 int ComputeAmbiguitiesNcbi8na( UintH w, int len ) const;
137 double ComputeComplexityNcbi8na( UintH w, int len, int& amb ) const;
138 double ComputeComplexityNcbi2na( Uint8 w, int len ) const;
139
140 template<class Uint, int bpb> void CompressFwd( Uint& window ) const;
141 template<class Uint, int bpb> void CompressRev( Uint& window ) const;
142
SetNcbipnaToNcbi4naScore(Uint2 score)143 void SetNcbipnaToNcbi4naScore( Uint2 score ) { m_ncbipnaToNcbi4naScore = score; }
SetNcbiqnaToNcbi4naScore(Uint2 score)144 void SetNcbiqnaToNcbi4naScore( Uint2 score ) { m_ncbiqnaToNcbi4naScore = score; }
145
146 protected:
147
148 template<class Callback>
149 class C_ScanCallback
150 {
151 public:
C_ScanCallback(const CQueryHash & caller,Callback & cbk,Uint1 mask,Uint1 flags)152 C_ScanCallback( const CQueryHash& caller, Callback& cbk, Uint1 mask, Uint1 flags ) :
153 m_caller( caller ), m_callback( cbk ), m_mask( mask ), m_flags( flags ) {}
154 void operator () ( Uint8 hash, int mism, unsigned amb ) const;
155 protected:
156 const CQueryHash& m_caller;
157 Callback& m_callback;
158 Uint1 m_mask;
159 Uint1 m_flags;
160 };
161
162 class C_ListInserter
163 {
164 public:
C_ListInserter(TMatchSet & ms)165 C_ListInserter( TMatchSet& ms ) : m_matchSet( ms ) {}
operator ()(const CHashAtom & atom)166 void operator () ( const CHashAtom& atom ) { m_matchSet.push_back( atom ); }
167 protected:
168 TMatchSet& m_matchSet;
169 };
170
171 typedef UintH TCvt( const unsigned char *, int, unsigned short );
172 typedef Uint2 TDecr( Uint2 );
173
174 bool CheckWordConstraints();
175 bool CheckWordConstraints() const;
176
177 int x_AddQuery( CQuery * query, int component );
178 int x_AddQuery( CQuery * query, int component, int off, int maxWinCnt, int wstep );
179 int PopulateWindow( CHashPopulator& dest, UintH fwindow, int offset, int ambiguities );
180 void PopulateInsertion2( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
181 void PopulateInsertion1( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
182 void PopulateInsertions( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow );
183 void PopulateDeletions ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow );
184 void PopulateDeletion2 ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
185 void PopulateDeletion1 ( CHashPopulator& dest, const UintH& maskH, const UintH& fwindow, int pos );
186 int GetNcbi4na( UintH& window, CSeqCoding::ECoding, const unsigned char * data, unsigned length );
187
188 int x_GetNcbi4na_ncbi8na( UintH& window, const unsigned char * data, unsigned length );
189 int x_GetNcbi4na_colorsp( UintH& window, const unsigned char * data, unsigned length );
190 int x_GetNcbi4na_ncbipna( UintH& window, const unsigned char * data, unsigned length );
191 int x_GetNcbi4na_ncbiqna( UintH& window, const unsigned char * data, unsigned length );
192 int x_GetNcbi4na_quality( UintH& window, const unsigned char * data, unsigned length, TCvt * fun, int incr, unsigned short score, TDecr * decr );
193
x_UpdateNcbipnaScore(Uint2 score)194 static Uint2 x_UpdateNcbipnaScore( Uint2 score ) { return score /= 2; }
x_UpdateNcbiqnaScore(Uint2 score)195 static Uint2 x_UpdateNcbiqnaScore( Uint2 score ) { return score - 1; }
196 static TCvt x_Ncbipna2Ncbi4na;
197 static TCvt x_Ncbiqna2Ncbi4na;
198
199 //static Uint8 x_Convert2( Uint8 word, unsigned len, Uint8 from, Uint8 to );
200 static Uint8 x_Convert2( Uint8 word, unsigned len, const unsigned char * tbl );
201 static UintH x_Convert4( UintH word, unsigned len, UintH mask, int shr );
202 static UintH x_Convert4tc( UintH word, unsigned len );
203 static UintH x_Convert4ag( UintH word, unsigned len );
204
205 // template<class Callback> void ForEach( Uint8 ncbi2na, Callback& cbk, Uint1 mask, Uint1 flags );
206
207 protected:
208
209 EStatus m_status;
210 int m_windowLength;
211 int m_windowStep;
212 int m_windowStart;
213 int m_scannerWindowLength;
214 int m_windowSize;
215 int m_strideSize;
216 int m_wordSize;
217 int m_maxWindowCount;
218 int m_strands;
219 int m_expectedCount;
220 int m_hashedCount;
221 int m_maxAmb;
222 int m_maxMism;
223 int m_maxIns;
224 int m_maxDel;
225 int m_maxDist;
226 int m_indelPos;
227 bool m_nahso3mode;
228 bool m_haveSplices;
229 double m_maxSimplicity;
230 Uint2 m_ncbipnaToNcbi4naScore;
231 Uint2 m_ncbiqnaToNcbi4naScore;
232 TSkipPositions m_skipPositions;
233 CWordHash m_hashTable;
234 vector<CPermutator8b*> m_permutators;
235 // all masks are ncbi2na, i.e. 2 bits per base
236 Uint8 m_wordMask;
237 int m_wordOffset;
238 CBitmaskAccess * m_wbmAccess;
239
240 mutable Uint8 m_hashPopulatorReverve;
241 mutable TMatchSet m_listA, m_listB; // to keep memory space reserved between ForEach calls
242 static const unsigned char * s_convert2tc;
243 static const unsigned char * s_convert2ag;
244 static const unsigned char * s_convert2eq;
245 };
246
247 ////////////////////////////////////////////////////////////////////////
248 // inline implementation follows
249
~CQueryHash()250 inline CQueryHash::~CQueryHash()
251 {
252 for( int i = 0; i < int( m_permutators.size() ); ++i )
253 delete m_permutators[i];
254 }
255
CQueryHash(int maxm,int maxa)256 inline CQueryHash::CQueryHash( int maxm, int maxa ) :
257 m_status( eStatus_empty ),
258 m_windowLength( 22 ),
259 m_windowStep( 1 ),
260 m_windowStart( 0 ),
261 m_scannerWindowLength( 22 ),
262 m_windowSize( 22 ),
263 m_strideSize( 1 ),
264 m_wordSize( 11 ),
265 m_maxWindowCount( 1 ),
266 m_strands( 3 ),
267 m_expectedCount( 1000000 ),
268 m_hashedCount( 0 ),
269 m_maxAmb( maxa ),
270 m_maxMism( min(1, maxm) ),
271 m_maxIns( 0 ),
272 m_maxDel( 0 ),
273 m_maxDist( 0 ),
274 m_indelPos( eIndel_anywhere ),
275 m_nahso3mode( false ),
276 m_haveSplices( false ),
277 m_maxSimplicity( 2.0 ),
278 m_ncbipnaToNcbi4naScore( 0x7f ),
279 m_ncbiqnaToNcbi4naScore( 3 ),
280 m_hashTable( m_wordSize*2 ),
281 m_wordMask( CBitHacks::WordFootprint<Uint8>( m_wordSize*2 ) ),
282 m_wordOffset( m_windowSize - m_wordSize ),
283 m_wbmAccess( 0 )
284 {
285 SetMaxMismatches( maxm );
286 }
287
288 template<class iterator>
SetSkipPositions(iterator begin,iterator end)289 inline void CQueryHash::SetSkipPositions( iterator begin, iterator end )
290 {
291 m_skipPositions.clear();
292 for( ; begin != end ; ++begin ) {
293 if( *begin <= 0 ) continue;
294 m_skipPositions.insert( *begin );
295 }
296 if( m_maxWindowCount > 1 && m_skipPositions.size() )
297 THROW( logic_error, "Skip positions are not supported with max word count > 1" );
298 ComputeScannerWindowLength();
299 }
300
301 template<class container>
SetSkipPositions(const container & c)302 inline void CQueryHash::SetSkipPositions( const container& c )
303 {
304 SetSkipPositions( c.begin(), c.end() );
305 }
306
SetWindowSize(int winSize)307 inline void CQueryHash::SetWindowSize( int winSize )
308 {
309 ASSERT( m_status == eStatus_empty );
310 m_windowSize = winSize;
311 if( m_wordSize > m_windowSize ) {
312 m_wordSize = m_windowSize;
313 m_wordMask = CBitHacks::WordFootprint<Uint8>( 2*m_wordSize );
314 m_wordOffset = 0;
315 } else m_wordOffset = m_windowSize - m_wordSize;
316 // ComputeScannerWindowLength();
317 }
318
SetWindowStep(int step)319 inline void CQueryHash::SetWindowStep( int step )
320 {
321 ASSERT( m_status == eStatus_empty );
322 ASSERT( step >= 0 );
323 m_windowStep = step;
324 }
325
SetWindowStart(int start)326 inline void CQueryHash::SetWindowStart( int start )
327 {
328 ASSERT( m_status == eStatus_empty );
329 ASSERT( start >= 0 );
330 m_windowStart = start;
331 }
332
SetMaxWindowCount(int winCnt)333 inline void CQueryHash::SetMaxWindowCount( int winCnt )
334 {
335 ASSERT( m_status == eStatus_empty );
336 m_maxWindowCount = winCnt;
337 if( m_maxWindowCount > 1 && m_skipPositions.size() )
338 THROW( logic_error, "Skip positions are not supported with max word count > 1" );
339 m_wordOffset = m_windowSize - m_wordSize;
340 }
341
SetWordSize(int wordSize)342 inline void CQueryHash::SetWordSize( int wordSize )
343 {
344 ASSERT( m_status == eStatus_empty );
345 m_wordSize = wordSize;
346 m_wordMask = CBitHacks::WordFootprint<Uint8>( 2*m_wordSize );
347 if( m_wordSize > m_windowSize ) m_windowSize = m_wordSize;
348 m_wordOffset = m_windowSize - m_wordSize;
349 ComputeScannerWindowLength();
350 }
351
AddQuery(CQuery * query)352 inline int CQueryHash::AddQuery( CQuery * query )
353 {
354 if( m_status == eStatus_empty ) CheckWordConstraints();
355
356 m_status = eStatus_dirty;
357 int ret = x_AddQuery( query, 0 );
358 if( query->HasComponent( 1 ) ) ret += x_AddQuery( query, 1 );
359 if( ret ) ++m_hashedCount;
360 return ret;
361 }
362
AddQuery(CQuery * query,int component,int woffset,int wcount,int wstep)363 inline int CQueryHash::AddQuery( CQuery * query, int component, int woffset, int wcount, int wstep )
364 {
365 if( m_status == eStatus_empty ) CheckWordConstraints();
366 m_status = eStatus_dirty;
367
368 return x_AddQuery( query, component, woffset, wcount, wstep );
369 }
370
371
SetStrideSize(int stride)372 inline void CQueryHash::SetStrideSize( int stride )
373 {
374 ASSERT( m_status == eStatus_empty );
375 m_strideSize = stride;
376 }
377
SetMaxMismatches(int mismatches)378 inline void CQueryHash::SetMaxMismatches( int mismatches )
379 {
380 ASSERT( m_status == eStatus_empty );
381 if( int(m_permutators.size()) <= mismatches ) {
382 int oldsz = m_permutators.size();
383 m_permutators.resize( mismatches + 1 );
384 for( int i = oldsz; i <= mismatches; ++i )
385 m_permutators[i] = new CPermutator8b( i, m_nahso3mode ? 32 : m_maxAmb );
386 }
387 m_maxMism = mismatches;
388 if( m_maxDist < mismatches ) m_maxDist = mismatches;
389 }
390
SetMaxAmbiguities(int ambiguities)391 inline void CQueryHash::SetMaxAmbiguities( int ambiguities )
392 {
393 ASSERT( m_status == eStatus_empty );
394 unsigned amb = m_nahso3mode ? 32 : ambiguities;
395 m_maxAmb = ambiguities;
396 for( unsigned i = 0; i < m_permutators.size(); ++i ) {
397 if( m_permutators[i]->GetMaxAmbiguities() != amb ) {
398 delete m_permutators[i];
399 m_permutators[i] = new CPermutator8b( i, m_maxAmb );
400 }
401 }
402 }
403
SetMaxInsertions(int ins)404 inline void CQueryHash::SetMaxInsertions( int ins )
405 {
406 ASSERT( m_status == eStatus_empty );
407 m_maxIns = ins;
408 if( m_maxDist < ins ) m_maxDist = ins;
409 }
410
SetMaxDeletions(int del)411 inline void CQueryHash::SetMaxDeletions( int del )
412 {
413 ASSERT( m_status == eStatus_empty );
414 m_maxDel = del;
415 if( m_maxDist < del ) m_maxDist = del;
416 }
417
SetMaxDistance(int dist)418 inline void CQueryHash::SetMaxDistance( int dist )
419 {
420 ASSERT( m_status == eStatus_empty );
421 m_maxDist = max( max( m_maxIns, m_maxDel ), max( m_maxMism, dist ) );
422 }
423
SetIndelPosition(int pos)424 inline void CQueryHash::SetIndelPosition( int pos )
425 {
426 ASSERT( m_status == eStatus_empty );
427 m_indelPos = pos;
428 }
429
SetNaHSO3mode(bool on)430 inline void CQueryHash::SetNaHSO3mode( bool on )
431 {
432 ASSERT( m_status == eStatus_empty );
433 m_nahso3mode = on;
434 SetMaxAmbiguities( GetMaxAmbiguities() );
435 }
436
SetIndexBits(int bits)437 inline void CQueryHash::SetIndexBits( int bits )
438 {
439 ASSERT( m_status == eStatus_empty );
440 m_hashTable.SetIndexBits( bits );
441 }
442
SetStrands(int strands)443 inline void CQueryHash::SetStrands( int strands )
444 {
445 ASSERT( m_status == eStatus_empty );
446 m_strands = strands;
447 }
448
SetMaxSimplicity(double simpl)449 inline void CQueryHash::SetMaxSimplicity( double simpl )
450 {
451 ASSERT( m_status == eStatus_empty );
452 m_maxSimplicity = simpl;
453 }
454
ComputeEntryCountPerReadMmOnly(Uint2 w,int maxMm) const455 inline Uint8 CQueryHash::ComputeEntryCountPerReadMmOnly( Uint2 w, int maxMm ) const
456 {
457 Uint8 ret = 1;
458 for( int i = 0; i < maxMm; ++i ) ret *= (w - i)*3;
459 return ret;
460 }
461
ComputeEntryCountPerReadWord(Uint2 w,bool removeDupes) const462 inline Uint8 CQueryHash::ComputeEntryCountPerReadWord( Uint2 w, bool removeDupes ) const
463 {
464 Uint8 ret = ComputeEntryCountPerReadMmOnly( w, GetMaxMismatches() );
465 if( GetMaxInsertions() == 2 ) {
466 Uint8 i2 = 16;
467 if( m_indelPos == eIndel_anywhere ) {
468 i2 *= (w - 2);
469 if( removeDupes ) i2 -= (w - 3)*4 + 1;
470 }
471 i2 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 2 ) );
472 ret += i2;
473 }
474 if( GetMaxInsertions() ) {
475 Uint8 i1 = 4;
476 if( m_indelPos == eIndel_anywhere ) {
477 i1 *= (w - 1);
478 if( removeDupes ) i1 -= (w - 2) + 1;
479 }
480 i1 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 1 ) );
481 ret += i1;
482 }
483 if( GetMaxDeletions() == 2 ) {
484 Uint8 d2 = 1;
485 if( m_indelPos == eIndel_anywhere ) d2 *= (w - 1);
486 d2 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 2 ) );
487 ret += d2;
488 }
489 if( GetMaxDeletions() ) {
490 Uint8 d1 = 1;
491 if( m_indelPos == eIndel_anywhere ) d1 *= w;
492 d1 *= ComputeEntryCountPerReadMmOnly( w, min( GetMaxMismatches(), GetMaxDistance() - 1 ) );
493 ret += d1;
494 }
495 return ret;
496 }
497
ComputeEntryCountPerRead(bool removeDupes) const498 inline Uint8 CQueryHash::ComputeEntryCountPerRead( bool removeDupes ) const
499 {
500 Uint8 ret = ComputeEntryCountPerReadWord( m_wordSize/*m_windowSize + m_strideSize - 1*/, removeDupes ); // TODO: check what is used an wsize
501 if( m_wordOffset ) ret *= 2; // for each word
502 if( (m_strands&3) == 3 ) ret *= 2; // for each strand
503 ret *= m_maxWindowCount;
504 return ret;
505 }
506
ComputeHasherWindowLength()507 inline int CQueryHash::ComputeHasherWindowLength()
508 {
509 ASSERT( m_strideSize == 1 || m_skipPositions.size() == 0 );
510 m_windowLength = m_windowSize + m_strideSize - 1 + GetMaxDeletions();
511 ITERATE( TSkipPositions, p, m_skipPositions ) {
512 if( *p <= m_windowLength ) ++m_windowLength; // m_skipPositions are sorted, unique and > 0 (1-based)
513 else break;
514 }
515 return m_windowLength;
516 }
517
ComputeHasherWindowStep()518 inline int CQueryHash::ComputeHasherWindowStep()
519 {
520 return m_windowStep > 0 ? m_windowStep : ComputeHasherWindowLength();
521 }
522
ComputeScannerWindowLength()523 inline int CQueryHash::ComputeScannerWindowLength()
524 {
525 ASSERT( m_strideSize == 1 || m_skipPositions.size() == 0 );
526 int winlen = m_windowSize; // indel and strides are taken into account in hash
527 ITERATE( TSkipPositions, p, m_skipPositions ) { // just extend window to allow skipping
528 if( *p <= winlen ) ++winlen; // m_skipPositions are sorted, unique and > 0 (1-based)
529 else break;
530 }
531 return m_scannerWindowLength = winlen;
532 }
533
ComputeHashPreallocSz() const534 inline Uint2 CQueryHash::ComputeHashPreallocSz() const{ return 1; }
535
CheckWordConstraints()536 inline bool CQueryHash::CheckWordConstraints()
537 {
538 ComputeHasherWindowLength();
539 ComputeScannerWindowLength();
540 m_hashTable.SetPreallocSz( ComputeHashPreallocSz() );
541 m_hashPopulatorReverve = ComputeEntryCountPerRead();
542 return ((const CQueryHash*)this)->CQueryHash::CheckWordConstraints();
543 }
544
CheckWordConstraints() const545 inline bool CQueryHash::CheckWordConstraints() const
546 {
547 ASSERT( m_skipPositions.size() == 0 || m_strideSize == 1 );
548 // ASSERT( m_strideSize < m_wordSize );
549 // ASSERT( m_strideSize < m_wordOffset + 1|| m_wordOffset == 0 );
550 ASSERT( m_wordSize <= m_windowSize );
551 ASSERT( m_wordSize * 2 >= m_windowSize );
552 ASSERT( m_wordSize * 2 - m_hashTable.GetIndexBits() <= 16 ); // requirement is based on that CHashAtom may store only 16 bits
553 ASSERT( m_windowLength <= 32 );
554 return true;
555 }
556
557 template <typename Uint, int bpb>
CompressFwd(Uint & w) const558 inline void CQueryHash::CompressFwd( Uint& w ) const
559 {
560 int x = 0;
561 ITERATE( TSkipPositions, p, m_skipPositions ) {
562 int pp = *p - m_windowStart;
563 if( pp < 0 ) continue;
564 if( pp > x ) w = CBitHacks::DeleteBits<Uint,bpb>( w, pp - ++x );
565 if( pp > m_windowLength ) break;
566 }
567 }
568
569 template <typename Uint, int bpb>
CompressRev(Uint & w) const570 void CQueryHash::CompressRev( Uint& w ) const
571 {
572 int x = m_scannerWindowLength;
573 ITERATE( TSkipPositions, p, m_skipPositions ) {
574 int pp = *p - m_windowStart;
575 if( pp < 0 ) continue;
576 if( pp < x ) w = CBitHacks::DeleteBits<Uint,bpb>( w, x - pp );
577 if( pp > m_windowLength ) break;
578 }
579 }
580
581 #if 0
582 template<class Callback>
583 void CQueryHash::ForEach4gnl( const UintH& hash, Callback& callback ) const
584 {
585 if( m_nahso3mode ) {
586 ForEach4<Callback>( this->x_Convert4tc( hash, m_windowSize ), callback );
587 ForEach4<Callback>( this->x_Convert4ag( hash, m_windowSize ), callback );
588 } else {
589 ForEach4<Callback>( hash, callback );
590 }
591 }
592
593 template<class Callback>
594 void CQueryHash::ForEach4( const UintH& hash, Callback& callback ) const
595 {
596 /* ........................................................................
597 We change strategy here: instead of putting logic on varying hash window
598 by seqscanner, we to it in hash - so that hash is a black box
599
600 Remember, that scanner WindowLength = either
601 (a) WindowSize + 0 * (StrideSize + IndelAllowed)
602 or
603 (b) WindowSize + (number of skip positions - self conjugated)
604 because it is not allowed to have strides and skip positions
605
606 That's the window scanner should use.
607 ........................................................................ */
608 ASSERT( m_status != eStatus_dirty );
609
610 if( m_strands == 3 & !HasSkipPositions() )
611 ForEach4( hash, callback, 0, 0 );
612 else {
613 if( m_strands & 1 ) {
614 UintH _hash = hash;
615 if( HasSkipPositions() ) CompressFwd<UintH,4>( _hash );
616 ForEach4( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandFwd );
617 }
618 if( m_strands & 2 ) {
619 UintH _hash = hash;
620 if( HasSkipPositions() ) CompressRev<UintH,4>( _hash );
621 ForEach4( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandRev );
622 }
623 }
624 }
625 template<class Callback>
626 void CQueryHash::ForEach4( UintH hash, Callback& callback, Uint1 mask, Uint1 flags ) const
627 {
628 THROW( logic_error, "ForEach4 should be modified in the way ForEach2 works in " << __FILE__ << ":" << __LINE__ );
629 //Compress4fwd( hash );
630 C_ScanCallback<Callback> cbk( *this, callback, mask, flags );
631 m_permutators[0]->ForEach( m_windowSize, hash, cbk );
632 }
633
634 #endif
635 template<class Callback>
ForEach2gnl(const Uint8 & hash,Callback & callback) const636 void CQueryHash::ForEach2gnl( const Uint8& hash, Callback& callback ) const
637 {
638 if( m_nahso3mode ) {
639 ForEach2str<Callback>( x_Convert2( hash, m_windowSize, s_convert2tc ), callback );
640 ForEach2str<Callback>( x_Convert2( hash, m_windowSize, s_convert2ag ), callback );
641 } else {
642 ForEach2str<Callback>( hash, callback );
643 }
644 }
645
646 template<class Callback>
ForEach2str(const Uint8 & hash,Callback & callback) const647 void CQueryHash::ForEach2str( const Uint8& hash, Callback& callback ) const
648 {
649 /* ........................................................................
650 We change strategy here: instead of putting logic on varying hash window
651 by seqscanner, we to it in hash - so that hash is a black box
652
653 Remember, that scanner WindowLength = either
654 (a) WindowSize + 0 * (StrideSize + IndelAllowed)
655 or
656 (b) WindowSize + (number of skip positions - self conjugated)
657 because it is not allowed to have strides and skip positions
658
659 That's the window scanner should use.
660 ........................................................................ */
661 ASSERT( m_status != eStatus_dirty );
662
663 if( m_strands == 3 && !HasSkipPositions() ) {
664 ForEach2int( hash, callback, 0, 0 );
665 } else {
666 if( m_strands & 1 ) {
667 Uint8 _hash = hash;
668 if( HasSkipPositions() ) CompressFwd<Uint8,2>( _hash );
669 ForEach2int( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandFwd );
670 }
671 if( m_strands & 2 ) {
672 Uint8 _hash = hash;
673 if( HasSkipPositions() ) CompressRev<Uint8,2>( _hash );
674 ForEach2int( _hash, callback, CHashAtom::fMask_strand, CHashAtom::fFlag_strandRev );
675 }
676 }
677 }
678
679 template<class Callback>
operator ()(Uint8 hash,int,unsigned) const680 void CQueryHash::C_ScanCallback<Callback>::operator () ( Uint8 hash, int, unsigned ) const
681 {
682 m_caller.ForEach2( hash, m_callback, m_mask, m_flags );
683 }
684
685 template<class Callback>
ForEach2int(Uint8 hash,Callback & callback,Uint1 mask,Uint1 flags) const686 void CQueryHash::ForEach2int( Uint8 hash, Callback& callback, Uint1 mask, Uint1 flags ) const
687 {
688 if( GetWordOffset() == 0 ) {
689 m_hashTable.ForEach( hash, callback, mask, flags );
690 } else {
691 mask |= CHashAtom::fMask_wordId;
692
693 Uint1 flags0 = flags | CHashAtom::fFlag_wordId0;
694 Uint1 flags1 = flags | CHashAtom::fFlag_wordId1;
695
696 TMatchSet& listA( m_listA );
697 TMatchSet& listB( m_listB );
698
699 listA.resize(0);
700 listB.resize(0);
701
702 Uint8 hashA = Uint8( hash >> ( GetWordOffset() * 2 ) );
703 Uint8 hashB = Uint8( hash & CBitHacks::WordFootprint<Uint8>( 2*GetWordSize() ) ); //( ( Uint8(1) << ( 2 * GetWordSize() )) - 1 ) );
704
705 C_ListInserter iA( listA );
706 C_ListInserter iB( listB );
707
708 m_hashTable.ForEach( hashA, iA, mask, flags0);
709 m_hashTable.ForEach( hashB, iB, mask, flags1);
710
711 if( listA.size() == 0 || listB.size() == 0 ) return;
712
713 sort( listA.begin(), listA.end(), CHashAtom::LessQueryCoiterate );
714 sort( listB.begin(), listB.end(), CHashAtom::LessQueryCoiterate );
715
716 /*
717 cerr << __FUNCTION__ << DISPLAY( listA.size() ) << DISPLAY( listB.size() ) << "\n";
718 ITERATE( TMatchSet, a, listA ) cerr << "A: " << *a << "\n";
719 ITERATE( TMatchSet, b, listB ) cerr << "B: " << *b << "\n";
720 */
721
722 TMatchSet::const_iterator a = listA.begin(), A = listA.end();
723 TMatchSet::const_iterator b = listB.begin(), B = listB.end();
724
725 while( a != A && b != B ) {
726
727 if( CHashAtom::LessQueryCoiterate( *a, *b ) ) { ++a; continue; }
728 if( CHashAtom::LessQueryCoiterate( *b, *a ) ) { ++b; continue; }
729
730 //callback( a->GetStrandId() == CHashAtom::fFlag_strandFwd ? *b : *a ); // one time is enough
731 CHashAtom w( *a, *b );
732 callback( w );
733
734 TMatchSet::const_iterator x = a;
735
736 do if( ! ( ++a < A ) ) return; while( *a == *x );
737 do if( ! ( ++b < B ) ) return; while( *b == *x );
738 }
739 }
740 }
741
742 /*
743 inline Uint8 CQueryHash::x_Convert2( Uint8 word, unsigned len, Uint8 from, Uint8 to )
744 {
745 Uint8 mask = 3;
746 for( unsigned i = 0; i < len; ++i, (mask <<= 2) ) {
747 if( ( word & mask ) == ( from & mask ) ) {
748 word = (word & ~mask) | ( to & mask );
749 }
750 }
751 return word;
752 }
753 */
754
x_Convert2(Uint8 word,unsigned len,const unsigned char * tbl)755 inline Uint8 CQueryHash::x_Convert2( Uint8 word, unsigned len, const unsigned char * tbl )
756 {
757 //THROW( logic_error, "Please check usage of this function!!! it is not supposed to be used at all (2009.05.04)" );
758 Uint8 ret = 0;
759 for( int i = 0, I = len*2; i < I; i += 8 )
760 ret |= tbl[(unsigned char)(word >> i)] << i;
761 return ret;
762 }
763
x_Convert4(UintH word,unsigned len,UintH mask,int shr)764 inline UintH CQueryHash::x_Convert4( UintH word, unsigned len, UintH mask, int shr )
765 {
766 //THROW( logic_error, "Please check usage of this function!!! it is not supposed to be used at all (2009.05.04)" );
767 if( shr > 0 )
768 word = (( word & mask ) >> (+shr) ) | ( word & ~mask );
769 else
770 word = (( word & mask ) << (-shr) ) | ( word & ~mask );
771 return word;
772 }
773
x_Convert4tc(UintH word,unsigned len)774 inline UintH CQueryHash::x_Convert4tc( UintH word, unsigned len )
775 {
776 static UintH allT( 0x8888888888888888ULL, 0x8888888888888888ULL );
777 return (( word & allT ) >> 2 ) | ( word & ~allT );
778 }
779
x_Convert4ag(UintH word,unsigned len)780 inline UintH CQueryHash::x_Convert4ag( UintH word, unsigned len )
781 {
782 static UintH allA( 0x1111111111111111ULL, 0x1111111111111111ULL );
783 return (( word & allA ) << 2 ) | ( word & ~allA );
784 }
785
SetParam(const CHashParam & param)786 inline void CQueryHash::SetParam( const CHashParam& param )
787 {
788 SetWordSize( param.GetWordSize() );
789 SetWindowSize( param.GetWindowSize() );
790 SetWindowStep( param.GetWindowStep() );
791 SetWindowStart( param.GetWindowStart() );
792 SetMaxWindowCount( param.GetWindowCount() );
793 SetStrideSize( param.GetStrideSize() );
794 SetIndexBits( param.GetHashBits() );
795 SetMaxMismatches( param.GetHashMismatches() );
796 SetMaxInsertions( param.GetHashInsertions() );
797 SetMaxDeletions( param.GetHashDeletions() );
798 SetMaxDistance( param.GetHashMaxDistance() );
799 SetIndelPosition( param.GetHashIndelPosition() );
800 SetSkipPositions( param.GetSkipPositions() );
801 SetNcbipnaToNcbi4naScore( Uint2( 255 * pow( 10.0, double( param.GetPhrapCutoff())/10) ) );
802 SetNcbiqnaToNcbi4naScore( param.GetPhrapCutoff() );
803 SetMaxSimplicity( param.GetMaxSimplicity() );
804 if( m_wbmAccess ) m_wbmAccess->SetRqWordSize( param.GetWindowSize() );
805 ComputeHasherWindowLength();
806 CheckConstraints();
807 }
808
GetParam(CHashParam & param) const809 inline void CQueryHash::GetParam( CHashParam& param ) const
810 {
811 param.SetWindowSize( GetWindowSize() );
812 param.SetWindowStep( GetWindowStep() );
813 param.SetWindowStart( GetWindowStart() );
814 param.SetWindowCount( GetMaxWindowCount() );
815 param.SetStrideSize( GetStrideSize() );
816 param.SetWordSize( GetWordSize() );
817 param.SetHashBits( GetIndexBits() );
818 param.SetHashMismatches( GetMaxMismatches() );
819 param.SetHashInsertions( GetMaxInsertions() );
820 param.SetHashDeletions( GetMaxDeletions() );
821 param.SetMaxHashDistance( GetMaxDistance() );
822 param.SetHashIndelPosition( GetIndelPosition() );
823 param.SetSkipPositions( GetSkipPositions() );
824 param.SetPhrapCutoff( m_ncbiqnaToNcbi4naScore );
825 param.SetMaxSimplicity( m_maxSimplicity );
826 }
827
828 END_OLIGOFAR_SCOPES
829
830 #endif
831