1 #include <ncbi_pch.hpp>
2 #include "csamalignments.hpp"
3 #include "csammdformatter.hpp"
4 #include "cprogressindicator.hpp"
5 #include "string-util.hpp"
6 #include "ialigner.hpp"
7 
8 USING_OLIGOFAR_SCOPES;
9 
CSRead(const string & id,CSeqCoding::ECoding coding,const char * sequence,int length,EPairType intended)10 CSRead::CSRead( const string& id, CSeqCoding::ECoding coding, const char * sequence, int length, EPairType intended )
11 {
12     ++s_count;
13     ASSERT( coding == CSeqCoding::eCoding_ncbi8na || coding == CSeqCoding::eCoding_ncbiqna );
14     m_data = new char[x_GetDataSize( id.length(), length, eRead_single )];
15     m_data[0] = id.length();
16     m_data[1] = length;
17     m_data[2] = eRead_single | ( intended << 2 );
18     m_data[3] = coding; // quality.length() ? CSeqCoding::eCoding_ncbiqna : CSeqCoding::eCoding_ncbi8na ;
19     strcpy( m_data + x_GetIdOffset(), id.c_str() );
20     char * d = m_data + x_GetSequenceOffset();
21     copy( sequence, sequence + length, d );
22 }
23 
CSRead(const string & id,const string & sequence,const string & quality,EPairType intended)24 CSRead::CSRead( const string& id, const string& sequence, const string& quality, EPairType intended )
25 {
26     ++s_count;
27     if( quality.length() && quality.length() != sequence.length() )
28         THROW( runtime_error, "Sequence lendgth and quality length differ (" << sequence << ", " << quality << ") !" );
29     m_data = new char[x_GetDataSize( id.length(), sequence.length(), eRead_single )];
30     m_data[0] = id.length();
31     m_data[1] = sequence.length();
32     m_data[2] = eRead_single | ( intended << 2 );
33     m_data[3] = quality.length() ? CSeqCoding::eCoding_ncbiqna : CSeqCoding::eCoding_ncbi8na ;
34     strcpy( m_data + x_GetIdOffset(), id.c_str() );
35     char * d = m_data + x_GetSequenceOffset();
36     if( quality.length() ) {
37         for( const char * s = sequence.c_str(), * q = quality.c_str(); *s; ++s, ++q ) {
38             if( *s != '-' ) *d++ = CNcbiqnaBase( CIupacnaBase( *s ), *q - 33 );
39         }
40     } else {
41         for( const char * s = sequence.c_str(); *s; ++s ) {
42             if( *s != '-' ) *d++ = CNcbi8naBase( CIupacnaBase( *s ) );
43         }
44     }
45 }
46 
CSRead(const CSRead & r)47 CSRead::CSRead( const CSRead& r )
48 {
49     ++s_count;
50     int ds = x_GetDataSize( strlen(r.GetId()), r.x_GetSequenceLength(), r.x_GetPairType() );
51     m_data = new char[ds];
52     memcpy( m_data, r.m_data, ds );
53 }
54 
operator =(const CSRead & r)55 CSRead& CSRead::operator = ( const CSRead& r )
56 {
57     delete[] m_data;
58     int ds = x_GetDataSize( strlen(r.GetId()), r.x_GetSequenceLength(), r.x_GetPairType() );
59     m_data = new char[ds];
60     memcpy( m_data, r.m_data, ds );
61     return *this;
62 }
63 
SetMate(CSRead * other)64 void CSRead::SetMate( CSRead * other )
65 {
66     if( !other ) return;
67     if( x_GetPairType() != eRead_single || other->x_GetPairType() != eRead_single ) {
68         if( other->x_GetMate() == this ) return;
69         THROW( logic_error, "Can't make read paired: it is already paired" );
70     }
71     if( GetIntendedPairType() != eRead_first || other->GetIntendedPairType() != eRead_second ) {
72         THROW( logic_error, "Intender pair type of the read differs from what is getting to be actual" );
73     }
74     char * newdata = new char[x_GetDataSize( x_GetIdLength(), x_GetSequenceLength(), eRead_first )];
75     const char * id = m_data + x_GetIdOffset();
76     const char * seq = m_data + x_GetSequenceOffset();
77     newdata[0] = x_GetIdLength();
78     newdata[1] = x_GetSequenceLength();
79     newdata[2] = eRead_first | (GetIntendedPairType() << 2);
80     newdata[3] = x_GetSequenceCoding();
81     swap( newdata, m_data );
82     copy( id, id + x_GetIdLength() + 1, m_data + x_GetIdOffset() );
83     copy( seq, seq + x_GetSequenceLength() + 1, m_data + x_GetSequenceOffset() );
84     x_SetMate() = other;
85     delete[] newdata;
86     newdata = new char[other->x_GetDataSize( other->x_GetIdLength(), other->x_GetSequenceLength(), eRead_second )];
87     id = other->m_data + other->x_GetIdOffset();
88     seq = other->m_data + other->x_GetSequenceOffset();
89     newdata[0] = 0; // other->x_GetIdLength();
90     newdata[1] = other->x_GetSequenceLength();
91     newdata[2] = eRead_second | (other->GetIntendedPairType() << 2);
92     newdata[3] = other->x_GetSequenceCoding();
93     swap( newdata, other->m_data );
94     copy( seq, seq + other->x_GetSequenceLength() + 1, other->m_data + other->x_GetSequenceOffset() );
95     other->x_SetMate() = this;
96     delete[] newdata;
97 }
98 
CContig(const string & name)99 CContig::CContig( const string& name ) :
100     m_id( name ), m_data( 0 ), m_size( 0 ), m_offset( 0 ), m_coding( CSeqCoding::eCoding_ncbi8na ), m_owns( false )
101 {
102     ++s_count;
103 }
104 
CContig(const CContig & other)105 CContig::CContig( const CContig& other ) :
106     m_id( other.m_id ), m_data( 0 ), m_size( other.m_size ), m_offset( other.m_offset ), m_coding( other.m_coding ), m_owns( other.m_owns )
107 {
108     ++s_count;
109     if( other.m_owns ) {
110         m_data = new char[ other.m_size - other.m_offset ];
111         memcpy( m_data, other.m_data, other.m_size - other.m_offset );
112     } else {
113         m_data = other.m_data;
114     }
115 }
116 
117 int CSRead::s_count = 0;
118 int CContig::s_count = 0;
119 int CMappedAlignment::s_count = 0;
120 bool CMappedAlignment::s_validate_consistency = true;
121 
CMappedAlignment(CSRead * query,CContig * subject,int from,const TTrVector & cigar,CSeqCoding::EStrand strand,int flags,TTags * tags)122 CMappedAlignment::CMappedAlignment( CSRead * query, CContig * subject, int from, const TTrVector& cigar, CSeqCoding::EStrand strand, int flags, TTags * tags ) :
123     m_mate(0), m_flags( flags ), m_tags( tags )
124 {
125     ++s_count;
126     m_query = query;
127     m_subject = subject;
128     m_sstart = from;
129     m_slength = cigar.ComputeSubjectLength();
130     m_qstart = 0;
131     m_qlength = query->GetSequenceLength();
132     m_cigar.Assign( cigar );
133     switch( strand ) {
134     case CSeqCoding::eStrand_pos:
135         if( cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ) {
136             m_qstart += cigar.front().GetCount();
137             m_qlength -= cigar.front().GetCount();
138         }
139         if( cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ) {
140             m_qlength -= cigar.back().GetCount();
141         }
142         break;
143     case CSeqCoding::eStrand_neg:
144         if( cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ) {
145             m_qstart += cigar.back().GetCount();
146             m_qlength -= cigar.back().GetCount();
147         }
148         if( cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ) {
149             m_qlength -= cigar.front().GetCount();
150         }
151         m_qstart += m_qlength - 1;
152         m_qlength *= -1;
153         break;
154         //m_qstart = query->GetSequenceLength() - m_qstart - m_qlength;
155         //m_qstart += m_qlength - 1;
156         //m_qlength *= -1;
157     }
158     if( s_validate_consistency ) ValidateConsistency( "CMappedAlignment constructor" );
159 //    cerr << __PRETTY_FUNCTION__ << DISPLAY( m_qstart ) << DISPLAY( m_qlength ) << "\n";
160 }
161 
ScoreAlignment(double id,double mm,double go,double ge) const162 double CMappedAlignment::ScoreAlignment( double id, double mm, double go, double ge ) const
163 {
164     return ScoreAlignment( m_cigar, id, mm, go, ge );
165 }
166 
ScoreAlignment(const TTrSequence & tr,double id,double mm,double go,double ge)167 double CMappedAlignment::ScoreAlignment( const TTrSequence& tr, double id, double mm, double go, double ge )
168 {
169     double score = 0;
170     for( TTrSequence::const_iterator x = tr.begin(); x != tr.end(); ++x ) {
171         switch( x->GetEvent() ) {
172             case CTrBase::eEvent_Changed:
173             case CTrBase::eEvent_Match: score += id * x->GetCount(); break;
174             case CTrBase::eEvent_Replaced: score += mm * x->GetCount(); break;
175             case CTrBase::eEvent_Deletion:
176             case CTrBase::eEvent_Insertion: score += go + (x->GetCount() - 1) * ge; break;
177             default:;
178         }
179     }
180     return score;
181 }
182 
SetMate(CMappedAlignment * other)183 void CMappedAlignment::SetMate( CMappedAlignment * other )
184 {
185     if( other == 0 ) return;
186     ASSERT( m_mate == 0 || m_mate == other );
187     ASSERT( other->GetMate() == 0 || other->GetMate() == this );
188     m_mate = other;
189     other->m_mate = this;
190 }
191 
~CSamSource()192 CSamSource::~CSamSource()
193 {
194     ITERATE( TAlignmentMap, a, m_alignmentsByContig )
195         ITERATE( TAlignmentList, l, a->second )
196             delete *l;
197     ITERATE( TReads, r , m_reads ) {
198         if( CSRead * mate = dynamic_cast<CSRead*>( const_cast<IShortRead*>( r->second->GetMate() ) ) )
199             delete mate;
200         delete r->second;
201     }
202     ITERATE( TContigs, c, m_contigs ) delete c->second;
203 }
204 
ParseSamLine(const string & line,int reg)205 CMappedAlignment * CSamSource::ParseSamLine( const string& line, int reg )
206 {
207     // This function should parse the line, create CShortRead and CContig if necessary,
208     // making sure that these objects are placed in maps, and create and store reference to
209     // new CMappedAlignment object
210 
211     vector<string> fields;
212     Split( line, "\t", back_inserter( fields ) );
213     if( fields.size() < 6 )
214         THROW( runtime_error, "SAM is supposed to have at least 6 columns, got " <<
215                 fields.size() << " in [" << line << "] in line" );
216 
217     return ParseSamLine( fields, reg );
218 }
219 
ParseSamLine(const vector<string> & fields,int reg)220 CMappedAlignment * CSamSource::ParseSamLine( const vector<string>& fields, int reg )
221 {
222     if( reg & fRegisterAlignmentByQuery ) reg |= fRegisterQuery;
223     if( reg & fRegisterAlignmentBySubject ) reg |= fRegisterSubject;
224     unsigned flags = NStr::StringToInt( fields[eCol_flags] );
225     int pos1 = NStr::StringToInt( fields[eCol_pos] ) - 1;
226     const string& id = fields[eCol_qname];
227     const string& ctg = fields[eCol_rname];
228     const string& cigar = fields[eCol_cigar];
229     string  seq = fields[eCol_seq];
230     string qual = fields[eCol_qual];
231     if( qual == "*" ) qual = "";
232     CSRead::EPairType ipt = flags & fPairedQuery ? flags & fSeqIsFirst ? CSRead::eRead_first : CSRead::eRead_second : CSRead::eRead_single;
233     CSeqCoding::EStrand strand = flags & fSeqReverse ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
234     if( flags & fSeqReverse ) { // since mapped reads are represented on genomic strand
235         for( int i = 0, j = seq.length() - 1; i < j; ++i, --j ) {
236             char x = CIupacnaBase( seq[i] ).Complement();
237             seq[i] = CIupacnaBase( seq[j] ).Complement();
238             seq[j] = x;
239             if( qual.length() ) swap( qual[i], qual[j] );
240         }
241         if( seq.length() & 1 ) {
242             int i = seq.length()/2;
243             seq[i] = CIupacnaBase( seq[i] ).Complement();
244         }
245     }
246 
247     CContig * contig = 0;
248     if( reg & fRegisterSubject ) {
249         TContigs::iterator ictg = m_contigs.find( ctg );
250         if( ictg == m_contigs.end() ) {
251             ictg = m_contigs.insert( ictg, make_pair( ctg, contig = new CContig( ctg ) ) );
252         }
253     } else contig = new CContig( ctg );
254     CSRead * read = 0;
255     if( reg & fRegisterQuery ) {
256         TReads::iterator iread = m_reads.find( id );
257         if( iread == m_reads.end() ) {
258             iread = m_reads.insert( iread, make_pair( id, read = new CSRead( id, seq, qual, ipt ) ) );
259         } else  {
260             read = iread->second;
261             if( read->GetPairType() != ipt ) {
262                 if( const IShortRead * rm = read->GetMate() ) {
263                     if( rm->GetPairType() != ipt )
264                         THROW( runtime_error, "Some strange problem with SAM flags: " << DISPLAY( read->GetPairType() ) << DISPLAY( rm->GetPairType() ) << DISPLAY( ipt ) );
265                     else read = dynamic_cast<CSRead*>( const_cast<IShortRead*>( rm ) );
266                 } else
267                     THROW( runtime_error, "Some problem with hit SAM flags: sequence is marked both as paired and not paired" );
268             } else if( read->GetIntendedPairType() != ipt ) {
269                 CSRead * mate = new CSRead( id, seq, qual, ipt );
270                 switch( ipt ) {
271                     case CSRead::eRead_single: THROW( logic_error, "Really weird!" );
272                     case CSRead::eRead_first: mate->SetMate( read ); iread->second = mate; read = mate; break;
273                     case CSRead::eRead_second: read->SetMate( mate ); read = mate; break;
274                 }
275             } else { /* nothing to do - we've found it */ }
276         }
277     } else read = new CSRead( id, seq, qual, ipt );
278 
279     string mism = "";
280 
281     map<string,string> * tags = 0;
282     for( unsigned i = eCol_tags; i < fields.size(); ++i ) {
283         // spaces are allowed in tags (see SAM 1.2)
284         if( fields[i].substr( 0, 5 ) == "MD:Z:" ) mism = fields[i].substr( 5 );
285         else {
286             if( tags == 0 ) tags = new CMappedAlignment::TTags();
287             const string& tag = fields[i];
288             size_t col = tag.find( ':' );
289             if( col == string::npos ) THROW( runtime_error, "Tag does not have `:' in " << Join( "\t", fields ) );
290             tags->insert( make_pair( tag.substr( 0, col ), tag.substr( col + 1 ) ) );
291         }
292     }
293     TTrVector fullcigar = GetFullCigar( cigar, mism );
294 #ifndef CORRECT_BAD_INPUT_CIGAR
295 #define CORRECT_BAD_INPUT_CIGAR 1
296 #endif
297 #if CORRECT_BAD_INPUT_CIGAR
298     int al = fullcigar.ComputeLengths().first;
299     int pl = fullcigar.front().GetEvent() == CTrBase::eEvent_SoftMask ? fullcigar.front().GetCount() : 0;
300     int sl = fullcigar.back().GetEvent() == CTrBase::eEvent_SoftMask ? fullcigar.back().GetCount() : 0;
301     int ql = seq.length();
302     if( mism == "" && ((al + pl + sl - 1) == ql)) {
303         TTrVector other;
304         TTrVector::iterator x = fullcigar.begin();
305         if( pl == 1 || sl == 1 ) {
306             other.AppendItem( *x++ );
307             if( pl == 1 ) {
308                 other.AppendItem( x->GetEvent(), x->GetCount() - 1 ); ++x;
309                 for( ; x != fullcigar.end(); ++x ) other.AppendItem( *x );
310             }
311             else {
312                 TTrVector::iterator y = x; ++y; ++y;
313                 for( ; y != fullcigar.end(); ++y, ++x ) other.AppendItem( *x );
314                 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
315                 other.AppendItem( *++x );
316             }
317         } else if( pl == 0 || sl == 0 ) {
318             if( pl ) {
319                 other.AppendItem( *x++ );
320                 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
321                 for( ; x != fullcigar.end(); ++x ) other.AppendItem( *x );
322             } else {
323                 TTrVector::iterator y = x; ++y; ++y;
324                 for( ; y != fullcigar.end(); ++y, ++x ) other.AppendItem( *x );
325                 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
326                 other.AppendItem( *++x );
327             }
328         }
329         cerr << "WARNING: Correcting " << fullcigar.ToString() << " to " << other.ToString() << " for " << id
330             << " " << seq << " on " << ctg << " at " << pos1 << " " << (strand == CSeqCoding::eStrand_pos ? "fwd" : "rev" ) << "\n";
331         fullcigar.swap( other );
332     }
333 #endif
334     CMappedAlignment * align = 0;
335     try {
336         align = new CMappedAlignment( read, contig, pos1, fullcigar, strand, 0, tags );
337     } catch( exception& e ) {
338         cerr << "ERROR: " << e.what() << " while parsing " << Join( "\t", fields ) << ": ignoring\n";
339         delete align;
340         delete read;
341         delete contig;
342         return 0;
343     }
344     if( reg & fRegisterQuery )   ; else align->SetFlags( CMappedAlignment::fOwnQuery );
345     if( reg & fRegisterSubject ) ; else align->SetFlags( CMappedAlignment::fOwnSubject );
346     // TODO: pairing of alignments is not ready yet
347 
348     const string& ctg2 = fields[eCol_mrnm];
349     int pos2 = NStr::StringToInt( fields[eCol_mpos] );
350 
351     if( flags & fPairedHit ) {
352         if( fields.size() < 7 )
353             THROW( runtime_error, "OligoFAR needs mate position in SAM file for paired reads" );
354         TPendingHits::iterator i = m_pendingHits.find( make_pair( id, make_pair( ctg2, pos2 ) ) );
355         if( i == m_pendingHits.end() ) m_pendingHits.insert( make_pair( make_pair( id, make_pair( ctg, pos1 ) ), align ) );
356         else {
357             if( ipt == CSRead::eRead_second ) i->second->SetMate( align );
358             else align->SetMate( i->second );
359             m_pendingHits.erase( i );
360         }
361     }
362     if( reg & fRegisterAlignmentBySubject ) m_alignmentsByContig[ctg].push_back( align );
363     if( reg & fRegisterAlignmentByQuery )   m_alignmentsByRead[id].push_back( align );
364     return align;
365 }
366 
ValidateConsistency(const string & context) const367 bool CMappedAlignment::ValidateConsistency( const string& context ) const
368 {
369     pair<int,int> lengths = m_cigar.ComputeLengths();
370     int fullLength = GetQuery()->GetSequenceLength();
371     int prefix = m_cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ? m_cigar.front().GetCount() : 0;
372     int suffix = m_cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ? m_cigar.back().GetCount() : 0;
373     if( IsReverseComplement() ) swap( prefix, suffix );
374     try {
375         ASSERT( lengths.first == GetQueryBounding().GetLength() );
376         ASSERT( lengths.second == GetSubjectBounding().GetLength() );
377         ASSERT( prefix == GetQueryBounding().GetFrom() );
378         ASSERT( fullLength - suffix - 1 == GetQueryBounding().GetTo() );
379     } catch( exception& e ) {
380         cerr
381             << "\x1b[34;42m" << context << "\x1b[0m\n"
382             << "\x1b[33;41m"
383             << GetQuery()->GetId() << "\t"
384             << GetSubject()->GetId() << "\t"
385             << m_query->GetIupacna() << "\t"
386             << m_qstart << (m_qlength > 0 ? "+" : "") << m_qlength << "\t"
387             << m_sstart << "+" << m_slength << "\t"
388             << m_cigar.ToString() << "\t"
389             << GetQueryBounding() << "\t"
390             << GetSubjectBounding() << "\t"
391             << (IsReverseComplement() ? "-" : "+")
392             << "\x1b[0m"
393             << "\n";
394         cerr << "\x1b[32m"
395             << DISPLAY( lengths.first )
396             << DISPLAY( lengths.second )
397             << DISPLAY( IsReverseComplement() )
398             << DISPLAY( fullLength )
399             << DISPLAY( prefix )
400             << DISPLAY( suffix )
401             << "\x1b[0m"
402             << "\n";
403         WriteAsSam( cerr );
404         throw;
405     }
406     return true;
407 }
408 
409 
GetFullCigar(const TTrVector & cigar,const string & mismatches)410 TTrVector CSamSource::GetFullCigar( const TTrVector& cigar, const string& mismatches )
411 {
412     // cigar  = 10A5^C10
413     // misms  = 16M1D4M1I6M
414     // result = 10M1R5M1D4M1I6M
415 
416     TTrVector fullcigar;
417     const char * misms = mismatches.c_str();
418     if( misms[0] ) {
419         const char * m = misms;
420         TTrVector::const_iterator c = cigar.begin();
421 
422         while( c != cigar.end() && c->GetEvent() == CTrBase::eEvent_SoftMask || c->GetEvent() == CTrBase::eEvent_HardMask )
423             fullcigar.AppendItem( *c++ );
424 
425         int acc = 0;
426 
427         string errmsgbase;
428         do {
429             ostringstream s;
430             s << "Inconsistent cigar " << cigar.ToString() << " and mismatch string " << misms;
431             errmsgbase = s.str();
432         } while( 0 );
433 
434         while( *m ) {
435             if( c == cigar.end() )
436                 THROW( runtime_error, errmsgbase << ": cigar is too short" );
437             if( isdigit( *m ) ) {
438                 int count = strtol( m, (char**)&m, 10 );
439                 if( c->GetEvent() != CTrBase::eEvent_Match )
440                     THROW( runtime_error, errmsgbase << ": cigar M is expected" );
441                 acc -= c->GetCount() - count;
442                 fullcigar.AppendItem( CTrBase::eEvent_Match, min( (int) c->GetCount(), count ) );
443                 if( acc == 0 ) { ++c; continue; }
444                 else if( acc < 0 ) {
445                     ++c;
446                     // NB: overlap is not supported here... it is not clear what to do according to SAM specs
447                     if( c == cigar.end() || c->GetEvent() != CTrBase::eEvent_Insertion )
448                         THROW( runtime_error, errmsgbase << ": cigar I is expected" );
449                     fullcigar.AppendItem( CTrBase::eEvent_Insertion, c->GetCount() );
450                     ++c;
451                 } else { // acc > 0
452                     // do nothing
453                 }
454             } else if( strchr( "ACGTN", *m ) ) {
455                 fullcigar.AppendItem( CTrBase::eEvent_Replaced, 1 );
456                 ++c;
457             } else if( *m == '^' ) {
458                 if( acc != 0 )
459                     THROW( runtime_error, errmsgbase << ": misplaced deletion" );
460                 int k = c->GetCount();
461                 if( c->GetEvent() != CTrBase::eEvent_Deletion && c->GetEvent() != CTrBase::eEvent_Splice )
462                     THROW( runtime_error, errmsgbase << ": N or D is expected" );
463                 while( strchr( "ACGTN", *++m ) ) {
464                     fullcigar.AppendItem( c->GetEvent(), 1 );
465                     --k;
466                 }
467                 if( k != 0 )
468                     THROW( runtime_error, errmsgbase << ": different sizes of deletion or splice" );
469                 ++c;
470             }
471         }
472         while( c != cigar.end() ) {
473             if( c->GetEvent() != CTrBase::eEvent_SoftMask && c->GetEvent() != CTrBase::eEvent_HardMask )
474                 THROW( runtime_error, errmsgbase << ": cigar is too long" );
475             fullcigar.AppendItem( *c++ );
476         }
477     } else {
478         fullcigar = cigar;
479     }
480     return fullcigar;
481 }
482 
IndexFile(const string & samFile,unsigned start,unsigned limit)483 void CSamSource::IndexFile( const string& samFile, unsigned start, unsigned limit )
484 {
485     ifstream in( samFile.c_str() );
486     string buff;
487     string lastId;
488     CProgressIndicator p( "Indexing " + samFile + " by contig" );
489     unsigned cnt = 0;
490     limit += start;
491     for( Uint8 offset = 0; getline( in, buff ); offset = in.tellg() ) {
492         if( buff[0] == '@' ) continue;
493         if( cnt++ < start ) continue;
494         if( cnt > limit ) break;
495         vector<string> fields;
496         Split( buff, "\t", back_inserter( fields ) );
497         if( fields.size() < 6 )
498             THROW( runtime_error, "SAM is supposed to have at least 6 columns, got " <<
499                     fields.size() << " in [" << buff << "] in line" );
500         if( fields[eCol_rname] != lastId ) {
501             m_alignmentOffsets[lastId = fields[eCol_rname]].push_back( offset );
502         }
503         p.Increment();
504     }
505     p.Summary();
506 }
507 
GetIupacna(CSeqCoding::EStrand strand) const508 string CSRead::GetIupacna( CSeqCoding::EStrand strand ) const
509 {
510     string iupac( x_GetSequenceLength(), '.' );
511     const char * seqdata = m_data + x_GetSequenceOffset();
512     int i = 0;
513     int I = iupac.length();
514     int j = strand == CSeqCoding::eStrand_pos ? 0 : iupac.size() - 1;
515     int d = strand == CSeqCoding::eStrand_pos ? 1 : -1;
516     switch( x_GetSequenceCoding() ) {
517         case CSeqCoding::eCoding_ncbi8na:
518             for( ; i < I; ++i, (j += d) ) { iupac[i] = CIupacnaBase( CNcbi8naBase( seqdata[j], strand ) ); }
519             break;
520         default:
521             THROW( logic_error, __PRETTY_FUNCTION__ << " does not support sequence coding other then NCBI-8na" );
522             break;
523     }
524     return iupac;
525 }
526 
WriteAsSam(ostream & out) const527 void CMappedAlignment::WriteAsSam( ostream& out ) const
528 {
529     CSeqCoding::EStrand strand = IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
530     int flags = IsReverseComplement() ? CSamBase::fSeqReverse : 0;
531     string iupac = m_query->GetIupacna( strand );
532     TTags * tt = m_tags;
533     if( GetSubject()->GetSequenceData() != 0 && GetSubject()->GetSequenceCoding() == CSeqCoding::eCoding_ncbi8na ) {
534         CSamMdFormatter md(  GetSubject()->GetSequenceData() + GetSubjectBounding().GetFrom(), iupac.c_str(), GetCigar() );
535         if( md.Format() ) {
536             tt = new TTags();
537             if( m_tags ) copy( m_tags->begin(), m_tags->end(), inserter( *tt, tt->end() ) );
538             tt->insert( make_pair( "MD:Z", md.GetString() ) );
539         }
540     }
541     out
542         << GetQuery()->GetId() << "\t"
543         << flags << "\t"
544         << GetSubject()->GetId() << "\t"
545         << (GetSubjectBounding().GetFrom() + 1) << "\t"
546         << 255 << "\t"
547         << GetCigar().ToString() << "\t"
548         << "*\t0\t0\t" << iupac << "\t*";
549     if( tt ) {
550         ITERATE( TTags, tag, (*tt) ) {
551             out << "\t" << tag->first << ":" << tag->second;
552         }
553         if( tt != m_tags ) delete tt;
554     }
555     out << "\n";
556 }
557 
CMappedAlignment(const CMappedAlignment & other)558 CMappedAlignment::CMappedAlignment( const CMappedAlignment& other ) :
559     m_query( other.m_flags & fOwnQuery ? new CSRead( *other.m_query ) : m_query ),
560     m_subject( other.m_flags & fOwnSubject ? new CContig( *other.m_subject ) : m_subject ),
561     m_mate( 0 ),
562     m_sstart( other.m_sstart ),
563     m_slength( other.m_slength ),
564     m_qstart( other.m_qstart ),
565     m_qlength( other.m_qlength ),
566     m_flags( other.m_flags ),
567     m_cigar( other.m_cigar ),
568     m_tags( other.m_tags ? new TTags( *other.m_tags ) : 0 )
569 {
570     ++s_count;
571 }
572 
Assign(const CMappedAlignment * other)573 void CMappedAlignment::Assign( const CMappedAlignment * other )
574 {
575     if( this == other ) { return; }
576     ASSERT( other );
577     if( m_query != other->m_query ) {
578         if( m_flags & fOwnQuery ) delete m_query;
579         if( other->m_flags & fOwnQuery ) {
580             m_query = new CSRead( *other->m_query );
581             m_flags |= fOwnQuery;
582         } else {
583             m_query = other->m_query;
584             m_flags &= ~fOwnQuery;
585         }
586     }
587     if( m_subject != other->m_subject ) {
588         if( m_flags & fOwnSubject ) delete m_subject;
589         if( other->m_flags & fOwnSubject ) {
590             m_subject = new CContig( *other->m_subject );
591             m_flags |= fOwnSubject;
592         } else {
593             m_subject = other->m_subject;
594             m_flags &= ~fOwnSubject;
595         }
596     }
597     if( m_tags ) delete m_tags;
598     m_mate = 0;
599     m_sstart = other->m_sstart;
600     m_slength = other->m_slength;
601     m_qstart = other->m_qstart;
602     m_qlength = other->m_qlength;
603     m_flags = other->m_flags; //(other->m_flags&~fOwnSeqs) | (m_flags&fOwnSeqs);
604     m_cigar = other->m_cigar;
605     m_tags = other->m_tags ? new TTags( *other->m_tags ) : 0;
606 }
607 
MakeExtended(IAligner * aligner) const608 CMappedAlignment * CMappedAlignment::MakeExtended( IAligner * aligner ) const
609 {
610     if( GetCigar().front().GetEvent() != CTrBase::eEvent_SoftMask &&
611             GetCigar().back().GetEvent() != CTrBase::eEvent_SoftMask ) return 0;
612     CSeqCoding::EStrand strand = IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
613     TRange qbb( GetQueryBounding() );
614     TRange sbb( GetSubjectBounding() );
615 
616     aligner->SetSubjectStrand( strand );
617     aligner->SetWordDistance( ComputeDistance( GetCigar() ) );
618     aligner->SetQuery( m_query->GetSequenceData(), m_query->GetSequenceLength() );
619     aligner->SetQueryCoding( m_query->GetSequenceCoding() );
620     aligner->SetQueryAnchor( qbb.GetFrom(), qbb.GetTo() );
621     aligner->SetSubjectAnchor( sbb.GetFrom(), sbb.GetTo() );
622     aligner->SetSubjectGuideTranscript( GetCigar() );
623     if( aligner->Align() )
624         if( ScoreAlignment() <= ScoreAlignment( aligner->GetSubjectTranscript() ) ) // ( aligner->GetSubjectTo() - aligner->GetSubjectFrom() + 1 ) >= sbb.GetLength() )
625             return new CMappedAlignment(
626                 m_flags & fOwnQuery ? new CSRead( *m_query ) : m_query,
627                 m_flags & fOwnSubject ? new CContig( *m_subject ) : m_subject,
628                 aligner->GetSubjectFrom(), aligner->GetSubjectTranscript(), strand,
629                 m_flags, m_tags ? new TTags( *m_tags ) : 0 );
630         else
631             cerr << "WARNING: " << m_cigar.ToString()
632                 << " scores " << ScoreAlignment() << ", " << aligner->GetSubjectTranscript().ToString()
633                 << " scores " << ScoreAlignment( aligner->GetSubjectTranscript() ) << "\n";
634     else cerr << "WARNING: alignment failed (distance = " << ComputeDistance( GetCigar() ) << ")\n";
635 
636     return 0;
637 }
638 
AdjustAlignment(EAdjustMode amode,int fromBy,int toBy)639 void CMappedAlignment::AdjustAlignment( EAdjustMode amode, int fromBy, int toBy )
640     // NB: this procedure is well debugged (for case when there are no indels) on Oct 15 2009 for all combinations of cases where fromBy * toBy = 0
641     // Also versions which increase alignment length should work fine with indels (since indels are not touched)
642     // Tested all versions, should not throw an exception, but will warn to stderr and overadjust in subj coords alignments like 1M2D10M3D2M
643     // TODO: a version of this function may use subject sequence to mark some 'replacement' positions as 'match' where there is actual match
644 {
645     if( s_validate_consistency ) ValidateConsistency( "AdjustAlignment start" );
646     enum EFlags {
647         fFromGt0 = 0x01,
648         fFromLt0 = 0x02,
649         fToLess0 = 0x04,
650         fToMore0 = 0x08,
651         fQryMode = 0x10,
652         fSbjMode = 0x00,
653         fReverse = 0x20,
654         fForward = 0x00,
655         fNULL = 0
656     };
657     //ASSERT(fromBy == 0 || toBy == 0);
658     unsigned flags = 0;
659     if( fromBy > 0 ) flags |= fFromGt0; else if( fromBy < 0 ) flags |= fFromLt0;
660     if( toBy > 0 ) flags |= fToMore0; else if( toBy < 0 ) flags |= fToLess0;
661     if( amode == eAdjust_query ) flags |= fQryMode;
662     if( IsReverseComplement() ) flags |= fReverse;
663     int qlength = GetQuery()->GetSequenceLength();
664     switch( flags ) {
665         case fSbjMode|fForward:
666         case fSbjMode|fReverse:
667         case fQryMode|fForward:
668         case fQryMode|fReverse: return; // do nothing - no offsets
669         case fSbjMode|fForward|fFromLt0:
670         case fSbjMode|fReverse|fFromLt0:
671         case fQryMode|fForward|fFromLt0:
672         case fQryMode|fReverse|fToMore0:
673             do {
674                 int by = toBy - fromBy; // >0
675                 switch( flags ) {
676                 case fSbjMode|fForward|fFromLt0:
677                 case fQryMode|fForward|fFromLt0: if( by > GetQueryBounding().GetFrom() ) by = GetQueryBounding().GetFrom(); break;
678                 case fSbjMode|fReverse|fFromLt0:
679                 case fQryMode|fReverse|fToMore0: if( by > qlength - 1 - GetQueryBounding().GetTo() ) by = qlength - 1 - GetQueryBounding().GetTo(); break;
680                 }
681                 if( by == 0 ) break;
682                 ASSERT( by > 0 );
683                 TTrSequence result;
684                 TTrSequence::const_iterator x = m_cigar.begin();
685                 ASSERT( x != m_cigar.end() );
686                 if( x->GetEvent() != CTrBase::eEvent_SoftMask ) {
687                     cerr << "Warning: attempt to extend alignment with no soft mask, ignoring\n";
688                     by = 0;
689                 } else if( (int)x->GetCount() < by ) {
690                     cerr << "Warning: by of " << by << " is too large for CIGAR " << m_cigar.ToString() << ", reducing\n";
691                     by = x->GetCount();
692                 }
693                 result.AppendItem( CTrBase::eEvent_SoftMask, x->GetCount() - by );
694                 result.AppendItem( CTrBase::eEvent_Replaced, by );
695                 for( ++x; x != m_cigar.end(); ++x ) result.AppendItem( *x );
696                 m_cigar = result;
697                 m_sstart -= by;
698                 m_slength += by;
699                 if( IsReverseComplement() ) by = -by;
700                 m_qstart -= by;
701                 m_qlength += by;
702             } while(0);
703             break;
704         case fSbjMode|fForward|fToMore0:
705         case fSbjMode|fReverse|fToMore0:
706         case fQryMode|fForward|fToMore0:
707         case fQryMode|fReverse|fFromLt0:
708             do {
709                 int by = toBy - fromBy; // >0
710                 switch( flags ) {
711                 case fSbjMode|fForward|fToMore0:
712                 case fQryMode|fForward|fToMore0: if( by > qlength - 1 - GetQueryBounding().GetTo() ) by = qlength - 1 - GetQueryBounding().GetTo(); break;
713                 case fSbjMode|fReverse|fToMore0:
714                 case fQryMode|fReverse|fFromLt0: if( by > GetQueryBounding().GetFrom() ) by = GetQueryBounding().GetFrom(); break;
715                 }
716                 if( by == 0 ) break;
717                 ASSERT( by > 0 );
718                 TTrSequence result;
719                 TTrSequence::const_iterator x = m_cigar.begin();
720                 ASSERT( x != m_cigar.end() );
721                 result.AppendItem( *x );
722                 for( ++x ; x != m_cigar.end(); ++x ) {
723                     if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
724                         if( (int)x->GetCount() < by ) {
725                             cerr << "Warning: by of " << by << " is too large for CIGAR " << m_cigar.ToString() << ", reducing\n";
726                             by = x->GetCount();
727                         }
728                         result.AppendItem( CTrBase::eEvent_Replaced, by );
729                         result.AppendItem( CTrBase::eEvent_SoftMask, x->GetCount() - by );
730                     } else result.AppendItem( *x );
731                 }
732                 m_cigar = result;
733                 m_slength += by;
734                 if( IsReverseComplement() ) by = -by;
735                 m_qlength += by;
736             } while(0);
737             break;
738         case fSbjMode|fForward|fFromGt0:
739         case fSbjMode|fReverse|fFromGt0:
740         case fQryMode|fForward|fFromGt0:
741         case fQryMode|fReverse|fToLess0:
742             do {
743                 int by = fromBy - toBy; // >0
744                 ASSERT( by > 0 );
745                 pair<int,int> ll = m_cigar.ComputeLengths();
746                 if( ll.first <= by || ll.second <= by ) {
747                     cerr << "Warning: too short alignment to clip it by " << by << ": " << m_cigar.ToString() << "\n";
748                     return;
749                 }
750                 vector<CTrBase::EEvent> result;
751                 result.reserve( by );
752                 TTrSequence::const_iterator x = m_cigar.begin();
753                 ASSERT( x != m_cigar.end() );
754                 if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
755                     for( int i = 0; i < x->GetCount(); ++i )
756                         result.push_back( x->GetEvent() );
757                     ++x;
758                 }
759                 int count = x->GetCount();
760                 ASSERT( count > 0 );
761                 int d = IsReverseComplement() ? -1 : 1;
762                 for( ; by > 0 ; ) {
763                     switch( x->GetEvent() ) {
764                         case CTrBase::eEvent_Match:
765                         case CTrBase::eEvent_Replaced:
766                         case CTrBase::eEvent_Changed:
767                             --by;
768                             result.push_back( CTrBase::eEvent_SoftMask );
769                             m_sstart ++;
770                             m_slength --;
771                             m_qstart += d;
772                             m_qlength -= d;
773                             break;
774                         case CTrBase::eEvent_Insertion:
775                             if( amode == eAdjust_query ) --by;
776                             result.push_back( CTrBase::eEvent_SoftMask );
777                             m_qstart += d;
778                             m_qlength -= d;
779                             break;
780                         case CTrBase::eEvent_Deletion:
781                             if( amode == eAdjust_subj ) --by;
782                             m_sstart ++;
783                             m_slength --;
784                             break;
785                         case CTrBase::eEvent_Padding:
786                             break;
787                         default: THROW( logic_error, "Procedure does not support CIGAR like " << m_cigar.ToString() << " in " __FILE__ ":" << __LINE__ );
788                     }
789                     if( --count == 0 ) count = (++x)->GetCount();
790                     if( x == m_cigar.end() ) THROW( logic_error, "Oops... unexpected CIGAR end (forward) " << m_cigar.ToString() );
791                 }
792                 switch( x->GetEvent() ) {
793                     case CTrBase::eEvent_Insertion:
794                         if( amode == eAdjust_subj ) {
795                             for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_SoftMask );
796                             m_qstart += d * count;
797                             m_qlength -= d * count;
798                         } else {
799                             for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_Replaced );
800                             m_slength += count;
801                             m_sstart -= count;
802                         }
803                         count = 0;
804                         break;
805                     case CTrBase::eEvent_Deletion:
806                         if( amode == eAdjust_subj ) {
807                             if( (int)result.size() < count ) {
808                                 m_sstart += count - result.size();
809                                 m_slength -= count - result.size();
810                                 count = result.size();
811                                 cerr << "WARNING: "__FILE__":" << __LINE__ << " can't adjust precisely strange alignment of " << m_cigar.ToString() << "\n";
812                             }
813                             //ASSERT( (int)result.size() >= count );
814                             for( int i = (int)result.size() - count; i < (int)result.size(); ++i )
815                                 result[i] = CTrBase::eEvent_Replaced;
816                             m_qstart -= d * count;
817                             m_qlength += d * count;
818                         } else {
819                             m_slength -= count;
820                             m_sstart += count;
821                         }
822                         count = 0;
823                         break;
824                     default:;
825                 }
826                 TTrSequence tmp;
827                 ITERATE( vector<CTrBase::EEvent>, r, result ) tmp.AppendItem( *r );
828                 while( count-- > 0 ) tmp.AppendItem( x->GetEvent() );
829                 for( ++x ; x != m_cigar.end(); ++x ) tmp.AppendItem( *x );
830                 m_cigar = tmp;
831             } while(0);
832             break;
833         case fSbjMode|fForward|fToLess0:
834         case fSbjMode|fReverse|fToLess0:
835         case fQryMode|fForward|fToLess0:
836         case fQryMode|fReverse|fFromGt0:
837             do {
838                 int by = fromBy - toBy; // >0
839                 ASSERT( by > 0 );
840                 pair<int,int> ll = m_cigar.ComputeLengths();
841                 if( ll.first <= by || ll.second <= by ) {
842                     cerr << "Warning: too short alignment to clip it by " << by << ": " << m_cigar.ToString() << "\n";
843                     return;
844                 }
845                 vector<CTrBase::EEvent> result;
846                 result.reserve( by );
847                 TTrSequence::reverse_iterator x = m_cigar.rbegin();
848                 ASSERT( x != m_cigar.rend() );
849                 if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
850                     for( int i = 0; i < x->GetCount(); ++i )
851                         result.push_back( x->GetEvent() );
852                     ++x;
853                 }
854                 int count = x->GetCount();
855                 ASSERT( count > 0 );
856                 int d = IsReverseComplement() ? -1 : 1;
857                 for( ; by > 0 ; ) {
858                     switch( x->GetEvent() ) {
859                         case CTrBase::eEvent_Match:
860                         case CTrBase::eEvent_Replaced:
861                         case CTrBase::eEvent_Changed:
862                             --by;
863                             result.push_back( CTrBase::eEvent_SoftMask );
864                             m_slength --;
865                             m_qlength -= d;
866                             break;
867                         case CTrBase::eEvent_Insertion:
868                             if( amode == eAdjust_query ) --by;
869                             result.push_back( CTrBase::eEvent_SoftMask );
870                             m_qlength -= d;
871                             break;
872                         case CTrBase::eEvent_Deletion:
873                             if( amode == eAdjust_subj ) --by;
874                             m_slength --;
875                             break;
876                         case CTrBase::eEvent_Padding:
877                             break;
878                         default: THROW( logic_error, "Procedure does not support CIGAR like " << m_cigar.ToString() << " in " __FILE__ ":" << __LINE__ );
879                     }
880                     if( --count == 0 ) count = (++x)->GetCount();
881                     ASSERT( x != m_cigar.rend() );
882                 }
883                 switch( x->GetEvent() ) {
884                     case CTrBase::eEvent_Insertion:
885                         if( amode == eAdjust_subj ) {
886                             for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_SoftMask );
887                             m_qlength -= d * count;
888                         } else {
889                             for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_Replaced );
890                             m_slength += count;
891                         }
892                         count = 0;
893                         break;
894                     case CTrBase::eEvent_Deletion:
895                         if( amode == eAdjust_subj ) {
896                             if( (int)result.size() < count ) {
897                                 m_slength -= count - result.size();
898                                 count = result.size();
899                                 cerr << "WARNING: "__FILE__":" << __LINE__ << " can't adjust precisely strange alignment of " << m_cigar.ToString() << "\n";
900                             }
901                             // ASSERT( (int)result.size() >= count );
902                             for( int i = (int)result.size() - count; i < (int)result.size(); ++i )
903                                 result[i] = CTrBase::eEvent_Replaced;
904                             m_qlength += d * count;
905                         } else {
906                             m_slength -= count;
907                         }
908                         count = 0;
909                         break;
910                     default:;
911                 }
912                 ASSERT( x != m_cigar.rend() );
913                 TTrSequence tmp;
914                 ITERATE( vector<CTrBase::EEvent>, r, result ) tmp.AppendItem( *r );
915                 while( count-- > 0 ) tmp.AppendItem( x->GetEvent() );
916                 for( ++x; x != m_cigar.rend(); ++x ) tmp.AppendItem( *x );
917                 m_cigar.clear();
918                 for( x = tmp.rbegin(); x != tmp.rend(); ++x ) m_cigar.AppendItem( *x );
919             } while(0);
920             break;
921         default:
922             THROW( logic_error, "flags " << hex << flags << dec << "h can not be processed in " __FILE__ ":" << __LINE__ );
923     }
924     if( s_validate_consistency ) {
925         ostringstream o;
926         o << hex << flags;
927         ValidateConsistency( "AdjustAlignment end " + o.str() );
928     }
929 }
930 
GetTagType(const string & name) const931 char CMappedAlignment::GetTagType( const string& name ) const
932 {
933     if( m_tags == 0 ) return 0;
934     TTags::const_iterator i = m_tags->find( name );
935     if( i == m_tags->end() ) return 0;
936     return i->second[0];
937 }
938 
SetTagS(const string & name,const string & value)939 void CMappedAlignment::SetTagS( const string& name, const string& value )
940 {
941     if( value.length() ) {
942         if( m_tags == 0 ) m_tags = new TTags();
943         (*m_tags)[name] = "Z:" + value;
944     } else RemoveTag( name );
945 }
946 
RemoveTag(const string & name)947 void CMappedAlignment::RemoveTag( const string& name )
948 {
949     if( m_tags ) {
950         m_tags->erase( name );
951         if( m_tags->size() == 0 ) SetTags( 0 );
952     }
953 }
954 
SetTagI(const string & name,int value)955 void CMappedAlignment::SetTagI( const string& name, int value )
956 {
957     if( m_tags == 0 ) m_tags = new TTags();
958     (*m_tags)[name] = "i:" + NStr::IntToString( value );
959 }
960 
SetTagF(const string & name,float value)961 void CMappedAlignment::SetTagF( const string& name, float value )
962 {
963     if( m_tags == 0 ) m_tags = new TTags();
964     (*m_tags)[name] = "f:" + NStr::DoubleToString( value );
965 }
966 
GetTagS(const string & name,char type) const967 string CMappedAlignment::GetTagS( const string& name, char type ) const
968 {
969     if( m_tags == 0 ) THROW( runtime_error, "No tag " << name << " is found" );
970     TTags::const_iterator i = m_tags->find( name );
971     if( i == m_tags->end() ) THROW( runtime_error, "No tag " << name << " is found" );
972     if( i->second[1] != ':' ) THROW( runtime_error, "Bad tag format for " << name << ":" << i->second );
973     if( i->second[0] != type ) THROW( runtime_error, "Bad tag type: " << i->second[0] << " while " << type << " is expected" );
974     return i->second.substr(2);
975 }
976 
GetTagI(const string & name) const977 int CMappedAlignment::GetTagI( const string& name ) const
978 {
979     return NStr::StringToInt( GetTagS( name, 'i' ) );
980 }
981 
GetTagF(const string & name) const982 float CMappedAlignment::GetTagF( const string& name ) const
983 {
984     return NStr::StringToDouble( GetTagS( name, 'f' ) );
985 }
986 
987