1 #include <ncbi_pch.hpp>
2 #include "csamalignments.hpp"
3 #include "csammdformatter.hpp"
4 #include "cprogressindicator.hpp"
5 #include "string-util.hpp"
6 #include "ialigner.hpp"
7
8 USING_OLIGOFAR_SCOPES;
9
CSRead(const string & id,CSeqCoding::ECoding coding,const char * sequence,int length,EPairType intended)10 CSRead::CSRead( const string& id, CSeqCoding::ECoding coding, const char * sequence, int length, EPairType intended )
11 {
12 ++s_count;
13 ASSERT( coding == CSeqCoding::eCoding_ncbi8na || coding == CSeqCoding::eCoding_ncbiqna );
14 m_data = new char[x_GetDataSize( id.length(), length, eRead_single )];
15 m_data[0] = id.length();
16 m_data[1] = length;
17 m_data[2] = eRead_single | ( intended << 2 );
18 m_data[3] = coding; // quality.length() ? CSeqCoding::eCoding_ncbiqna : CSeqCoding::eCoding_ncbi8na ;
19 strcpy( m_data + x_GetIdOffset(), id.c_str() );
20 char * d = m_data + x_GetSequenceOffset();
21 copy( sequence, sequence + length, d );
22 }
23
CSRead(const string & id,const string & sequence,const string & quality,EPairType intended)24 CSRead::CSRead( const string& id, const string& sequence, const string& quality, EPairType intended )
25 {
26 ++s_count;
27 if( quality.length() && quality.length() != sequence.length() )
28 THROW( runtime_error, "Sequence lendgth and quality length differ (" << sequence << ", " << quality << ") !" );
29 m_data = new char[x_GetDataSize( id.length(), sequence.length(), eRead_single )];
30 m_data[0] = id.length();
31 m_data[1] = sequence.length();
32 m_data[2] = eRead_single | ( intended << 2 );
33 m_data[3] = quality.length() ? CSeqCoding::eCoding_ncbiqna : CSeqCoding::eCoding_ncbi8na ;
34 strcpy( m_data + x_GetIdOffset(), id.c_str() );
35 char * d = m_data + x_GetSequenceOffset();
36 if( quality.length() ) {
37 for( const char * s = sequence.c_str(), * q = quality.c_str(); *s; ++s, ++q ) {
38 if( *s != '-' ) *d++ = CNcbiqnaBase( CIupacnaBase( *s ), *q - 33 );
39 }
40 } else {
41 for( const char * s = sequence.c_str(); *s; ++s ) {
42 if( *s != '-' ) *d++ = CNcbi8naBase( CIupacnaBase( *s ) );
43 }
44 }
45 }
46
CSRead(const CSRead & r)47 CSRead::CSRead( const CSRead& r )
48 {
49 ++s_count;
50 int ds = x_GetDataSize( strlen(r.GetId()), r.x_GetSequenceLength(), r.x_GetPairType() );
51 m_data = new char[ds];
52 memcpy( m_data, r.m_data, ds );
53 }
54
operator =(const CSRead & r)55 CSRead& CSRead::operator = ( const CSRead& r )
56 {
57 delete[] m_data;
58 int ds = x_GetDataSize( strlen(r.GetId()), r.x_GetSequenceLength(), r.x_GetPairType() );
59 m_data = new char[ds];
60 memcpy( m_data, r.m_data, ds );
61 return *this;
62 }
63
SetMate(CSRead * other)64 void CSRead::SetMate( CSRead * other )
65 {
66 if( !other ) return;
67 if( x_GetPairType() != eRead_single || other->x_GetPairType() != eRead_single ) {
68 if( other->x_GetMate() == this ) return;
69 THROW( logic_error, "Can't make read paired: it is already paired" );
70 }
71 if( GetIntendedPairType() != eRead_first || other->GetIntendedPairType() != eRead_second ) {
72 THROW( logic_error, "Intender pair type of the read differs from what is getting to be actual" );
73 }
74 char * newdata = new char[x_GetDataSize( x_GetIdLength(), x_GetSequenceLength(), eRead_first )];
75 const char * id = m_data + x_GetIdOffset();
76 const char * seq = m_data + x_GetSequenceOffset();
77 newdata[0] = x_GetIdLength();
78 newdata[1] = x_GetSequenceLength();
79 newdata[2] = eRead_first | (GetIntendedPairType() << 2);
80 newdata[3] = x_GetSequenceCoding();
81 swap( newdata, m_data );
82 copy( id, id + x_GetIdLength() + 1, m_data + x_GetIdOffset() );
83 copy( seq, seq + x_GetSequenceLength() + 1, m_data + x_GetSequenceOffset() );
84 x_SetMate() = other;
85 delete[] newdata;
86 newdata = new char[other->x_GetDataSize( other->x_GetIdLength(), other->x_GetSequenceLength(), eRead_second )];
87 id = other->m_data + other->x_GetIdOffset();
88 seq = other->m_data + other->x_GetSequenceOffset();
89 newdata[0] = 0; // other->x_GetIdLength();
90 newdata[1] = other->x_GetSequenceLength();
91 newdata[2] = eRead_second | (other->GetIntendedPairType() << 2);
92 newdata[3] = other->x_GetSequenceCoding();
93 swap( newdata, other->m_data );
94 copy( seq, seq + other->x_GetSequenceLength() + 1, other->m_data + other->x_GetSequenceOffset() );
95 other->x_SetMate() = this;
96 delete[] newdata;
97 }
98
CContig(const string & name)99 CContig::CContig( const string& name ) :
100 m_id( name ), m_data( 0 ), m_size( 0 ), m_offset( 0 ), m_coding( CSeqCoding::eCoding_ncbi8na ), m_owns( false )
101 {
102 ++s_count;
103 }
104
CContig(const CContig & other)105 CContig::CContig( const CContig& other ) :
106 m_id( other.m_id ), m_data( 0 ), m_size( other.m_size ), m_offset( other.m_offset ), m_coding( other.m_coding ), m_owns( other.m_owns )
107 {
108 ++s_count;
109 if( other.m_owns ) {
110 m_data = new char[ other.m_size - other.m_offset ];
111 memcpy( m_data, other.m_data, other.m_size - other.m_offset );
112 } else {
113 m_data = other.m_data;
114 }
115 }
116
117 int CSRead::s_count = 0;
118 int CContig::s_count = 0;
119 int CMappedAlignment::s_count = 0;
120 bool CMappedAlignment::s_validate_consistency = true;
121
CMappedAlignment(CSRead * query,CContig * subject,int from,const TTrVector & cigar,CSeqCoding::EStrand strand,int flags,TTags * tags)122 CMappedAlignment::CMappedAlignment( CSRead * query, CContig * subject, int from, const TTrVector& cigar, CSeqCoding::EStrand strand, int flags, TTags * tags ) :
123 m_mate(0), m_flags( flags ), m_tags( tags )
124 {
125 ++s_count;
126 m_query = query;
127 m_subject = subject;
128 m_sstart = from;
129 m_slength = cigar.ComputeSubjectLength();
130 m_qstart = 0;
131 m_qlength = query->GetSequenceLength();
132 m_cigar.Assign( cigar );
133 switch( strand ) {
134 case CSeqCoding::eStrand_pos:
135 if( cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ) {
136 m_qstart += cigar.front().GetCount();
137 m_qlength -= cigar.front().GetCount();
138 }
139 if( cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ) {
140 m_qlength -= cigar.back().GetCount();
141 }
142 break;
143 case CSeqCoding::eStrand_neg:
144 if( cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ) {
145 m_qstart += cigar.back().GetCount();
146 m_qlength -= cigar.back().GetCount();
147 }
148 if( cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ) {
149 m_qlength -= cigar.front().GetCount();
150 }
151 m_qstart += m_qlength - 1;
152 m_qlength *= -1;
153 break;
154 //m_qstart = query->GetSequenceLength() - m_qstart - m_qlength;
155 //m_qstart += m_qlength - 1;
156 //m_qlength *= -1;
157 }
158 if( s_validate_consistency ) ValidateConsistency( "CMappedAlignment constructor" );
159 // cerr << __PRETTY_FUNCTION__ << DISPLAY( m_qstart ) << DISPLAY( m_qlength ) << "\n";
160 }
161
ScoreAlignment(double id,double mm,double go,double ge) const162 double CMappedAlignment::ScoreAlignment( double id, double mm, double go, double ge ) const
163 {
164 return ScoreAlignment( m_cigar, id, mm, go, ge );
165 }
166
ScoreAlignment(const TTrSequence & tr,double id,double mm,double go,double ge)167 double CMappedAlignment::ScoreAlignment( const TTrSequence& tr, double id, double mm, double go, double ge )
168 {
169 double score = 0;
170 for( TTrSequence::const_iterator x = tr.begin(); x != tr.end(); ++x ) {
171 switch( x->GetEvent() ) {
172 case CTrBase::eEvent_Changed:
173 case CTrBase::eEvent_Match: score += id * x->GetCount(); break;
174 case CTrBase::eEvent_Replaced: score += mm * x->GetCount(); break;
175 case CTrBase::eEvent_Deletion:
176 case CTrBase::eEvent_Insertion: score += go + (x->GetCount() - 1) * ge; break;
177 default:;
178 }
179 }
180 return score;
181 }
182
SetMate(CMappedAlignment * other)183 void CMappedAlignment::SetMate( CMappedAlignment * other )
184 {
185 if( other == 0 ) return;
186 ASSERT( m_mate == 0 || m_mate == other );
187 ASSERT( other->GetMate() == 0 || other->GetMate() == this );
188 m_mate = other;
189 other->m_mate = this;
190 }
191
~CSamSource()192 CSamSource::~CSamSource()
193 {
194 ITERATE( TAlignmentMap, a, m_alignmentsByContig )
195 ITERATE( TAlignmentList, l, a->second )
196 delete *l;
197 ITERATE( TReads, r , m_reads ) {
198 if( CSRead * mate = dynamic_cast<CSRead*>( const_cast<IShortRead*>( r->second->GetMate() ) ) )
199 delete mate;
200 delete r->second;
201 }
202 ITERATE( TContigs, c, m_contigs ) delete c->second;
203 }
204
ParseSamLine(const string & line,int reg)205 CMappedAlignment * CSamSource::ParseSamLine( const string& line, int reg )
206 {
207 // This function should parse the line, create CShortRead and CContig if necessary,
208 // making sure that these objects are placed in maps, and create and store reference to
209 // new CMappedAlignment object
210
211 vector<string> fields;
212 Split( line, "\t", back_inserter( fields ) );
213 if( fields.size() < 6 )
214 THROW( runtime_error, "SAM is supposed to have at least 6 columns, got " <<
215 fields.size() << " in [" << line << "] in line" );
216
217 return ParseSamLine( fields, reg );
218 }
219
ParseSamLine(const vector<string> & fields,int reg)220 CMappedAlignment * CSamSource::ParseSamLine( const vector<string>& fields, int reg )
221 {
222 if( reg & fRegisterAlignmentByQuery ) reg |= fRegisterQuery;
223 if( reg & fRegisterAlignmentBySubject ) reg |= fRegisterSubject;
224 unsigned flags = NStr::StringToInt( fields[eCol_flags] );
225 int pos1 = NStr::StringToInt( fields[eCol_pos] ) - 1;
226 const string& id = fields[eCol_qname];
227 const string& ctg = fields[eCol_rname];
228 const string& cigar = fields[eCol_cigar];
229 string seq = fields[eCol_seq];
230 string qual = fields[eCol_qual];
231 if( qual == "*" ) qual = "";
232 CSRead::EPairType ipt = flags & fPairedQuery ? flags & fSeqIsFirst ? CSRead::eRead_first : CSRead::eRead_second : CSRead::eRead_single;
233 CSeqCoding::EStrand strand = flags & fSeqReverse ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
234 if( flags & fSeqReverse ) { // since mapped reads are represented on genomic strand
235 for( int i = 0, j = seq.length() - 1; i < j; ++i, --j ) {
236 char x = CIupacnaBase( seq[i] ).Complement();
237 seq[i] = CIupacnaBase( seq[j] ).Complement();
238 seq[j] = x;
239 if( qual.length() ) swap( qual[i], qual[j] );
240 }
241 if( seq.length() & 1 ) {
242 int i = seq.length()/2;
243 seq[i] = CIupacnaBase( seq[i] ).Complement();
244 }
245 }
246
247 CContig * contig = 0;
248 if( reg & fRegisterSubject ) {
249 TContigs::iterator ictg = m_contigs.find( ctg );
250 if( ictg == m_contigs.end() ) {
251 ictg = m_contigs.insert( ictg, make_pair( ctg, contig = new CContig( ctg ) ) );
252 }
253 } else contig = new CContig( ctg );
254 CSRead * read = 0;
255 if( reg & fRegisterQuery ) {
256 TReads::iterator iread = m_reads.find( id );
257 if( iread == m_reads.end() ) {
258 iread = m_reads.insert( iread, make_pair( id, read = new CSRead( id, seq, qual, ipt ) ) );
259 } else {
260 read = iread->second;
261 if( read->GetPairType() != ipt ) {
262 if( const IShortRead * rm = read->GetMate() ) {
263 if( rm->GetPairType() != ipt )
264 THROW( runtime_error, "Some strange problem with SAM flags: " << DISPLAY( read->GetPairType() ) << DISPLAY( rm->GetPairType() ) << DISPLAY( ipt ) );
265 else read = dynamic_cast<CSRead*>( const_cast<IShortRead*>( rm ) );
266 } else
267 THROW( runtime_error, "Some problem with hit SAM flags: sequence is marked both as paired and not paired" );
268 } else if( read->GetIntendedPairType() != ipt ) {
269 CSRead * mate = new CSRead( id, seq, qual, ipt );
270 switch( ipt ) {
271 case CSRead::eRead_single: THROW( logic_error, "Really weird!" );
272 case CSRead::eRead_first: mate->SetMate( read ); iread->second = mate; read = mate; break;
273 case CSRead::eRead_second: read->SetMate( mate ); read = mate; break;
274 }
275 } else { /* nothing to do - we've found it */ }
276 }
277 } else read = new CSRead( id, seq, qual, ipt );
278
279 string mism = "";
280
281 map<string,string> * tags = 0;
282 for( unsigned i = eCol_tags; i < fields.size(); ++i ) {
283 // spaces are allowed in tags (see SAM 1.2)
284 if( fields[i].substr( 0, 5 ) == "MD:Z:" ) mism = fields[i].substr( 5 );
285 else {
286 if( tags == 0 ) tags = new CMappedAlignment::TTags();
287 const string& tag = fields[i];
288 size_t col = tag.find( ':' );
289 if( col == string::npos ) THROW( runtime_error, "Tag does not have `:' in " << Join( "\t", fields ) );
290 tags->insert( make_pair( tag.substr( 0, col ), tag.substr( col + 1 ) ) );
291 }
292 }
293 TTrVector fullcigar = GetFullCigar( cigar, mism );
294 #ifndef CORRECT_BAD_INPUT_CIGAR
295 #define CORRECT_BAD_INPUT_CIGAR 1
296 #endif
297 #if CORRECT_BAD_INPUT_CIGAR
298 int al = fullcigar.ComputeLengths().first;
299 int pl = fullcigar.front().GetEvent() == CTrBase::eEvent_SoftMask ? fullcigar.front().GetCount() : 0;
300 int sl = fullcigar.back().GetEvent() == CTrBase::eEvent_SoftMask ? fullcigar.back().GetCount() : 0;
301 int ql = seq.length();
302 if( mism == "" && ((al + pl + sl - 1) == ql)) {
303 TTrVector other;
304 TTrVector::iterator x = fullcigar.begin();
305 if( pl == 1 || sl == 1 ) {
306 other.AppendItem( *x++ );
307 if( pl == 1 ) {
308 other.AppendItem( x->GetEvent(), x->GetCount() - 1 ); ++x;
309 for( ; x != fullcigar.end(); ++x ) other.AppendItem( *x );
310 }
311 else {
312 TTrVector::iterator y = x; ++y; ++y;
313 for( ; y != fullcigar.end(); ++y, ++x ) other.AppendItem( *x );
314 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
315 other.AppendItem( *++x );
316 }
317 } else if( pl == 0 || sl == 0 ) {
318 if( pl ) {
319 other.AppendItem( *x++ );
320 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
321 for( ; x != fullcigar.end(); ++x ) other.AppendItem( *x );
322 } else {
323 TTrVector::iterator y = x; ++y; ++y;
324 for( ; y != fullcigar.end(); ++y, ++x ) other.AppendItem( *x );
325 other.AppendItem( x->GetEvent(), x->GetCount() - 1 );
326 other.AppendItem( *++x );
327 }
328 }
329 cerr << "WARNING: Correcting " << fullcigar.ToString() << " to " << other.ToString() << " for " << id
330 << " " << seq << " on " << ctg << " at " << pos1 << " " << (strand == CSeqCoding::eStrand_pos ? "fwd" : "rev" ) << "\n";
331 fullcigar.swap( other );
332 }
333 #endif
334 CMappedAlignment * align = 0;
335 try {
336 align = new CMappedAlignment( read, contig, pos1, fullcigar, strand, 0, tags );
337 } catch( exception& e ) {
338 cerr << "ERROR: " << e.what() << " while parsing " << Join( "\t", fields ) << ": ignoring\n";
339 delete align;
340 delete read;
341 delete contig;
342 return 0;
343 }
344 if( reg & fRegisterQuery ) ; else align->SetFlags( CMappedAlignment::fOwnQuery );
345 if( reg & fRegisterSubject ) ; else align->SetFlags( CMappedAlignment::fOwnSubject );
346 // TODO: pairing of alignments is not ready yet
347
348 const string& ctg2 = fields[eCol_mrnm];
349 int pos2 = NStr::StringToInt( fields[eCol_mpos] );
350
351 if( flags & fPairedHit ) {
352 if( fields.size() < 7 )
353 THROW( runtime_error, "OligoFAR needs mate position in SAM file for paired reads" );
354 TPendingHits::iterator i = m_pendingHits.find( make_pair( id, make_pair( ctg2, pos2 ) ) );
355 if( i == m_pendingHits.end() ) m_pendingHits.insert( make_pair( make_pair( id, make_pair( ctg, pos1 ) ), align ) );
356 else {
357 if( ipt == CSRead::eRead_second ) i->second->SetMate( align );
358 else align->SetMate( i->second );
359 m_pendingHits.erase( i );
360 }
361 }
362 if( reg & fRegisterAlignmentBySubject ) m_alignmentsByContig[ctg].push_back( align );
363 if( reg & fRegisterAlignmentByQuery ) m_alignmentsByRead[id].push_back( align );
364 return align;
365 }
366
ValidateConsistency(const string & context) const367 bool CMappedAlignment::ValidateConsistency( const string& context ) const
368 {
369 pair<int,int> lengths = m_cigar.ComputeLengths();
370 int fullLength = GetQuery()->GetSequenceLength();
371 int prefix = m_cigar.front().GetEvent() == CTrBase::eEvent_SoftMask ? m_cigar.front().GetCount() : 0;
372 int suffix = m_cigar.back().GetEvent() == CTrBase::eEvent_SoftMask ? m_cigar.back().GetCount() : 0;
373 if( IsReverseComplement() ) swap( prefix, suffix );
374 try {
375 ASSERT( lengths.first == GetQueryBounding().GetLength() );
376 ASSERT( lengths.second == GetSubjectBounding().GetLength() );
377 ASSERT( prefix == GetQueryBounding().GetFrom() );
378 ASSERT( fullLength - suffix - 1 == GetQueryBounding().GetTo() );
379 } catch( exception& e ) {
380 cerr
381 << "\x1b[34;42m" << context << "\x1b[0m\n"
382 << "\x1b[33;41m"
383 << GetQuery()->GetId() << "\t"
384 << GetSubject()->GetId() << "\t"
385 << m_query->GetIupacna() << "\t"
386 << m_qstart << (m_qlength > 0 ? "+" : "") << m_qlength << "\t"
387 << m_sstart << "+" << m_slength << "\t"
388 << m_cigar.ToString() << "\t"
389 << GetQueryBounding() << "\t"
390 << GetSubjectBounding() << "\t"
391 << (IsReverseComplement() ? "-" : "+")
392 << "\x1b[0m"
393 << "\n";
394 cerr << "\x1b[32m"
395 << DISPLAY( lengths.first )
396 << DISPLAY( lengths.second )
397 << DISPLAY( IsReverseComplement() )
398 << DISPLAY( fullLength )
399 << DISPLAY( prefix )
400 << DISPLAY( suffix )
401 << "\x1b[0m"
402 << "\n";
403 WriteAsSam( cerr );
404 throw;
405 }
406 return true;
407 }
408
409
GetFullCigar(const TTrVector & cigar,const string & mismatches)410 TTrVector CSamSource::GetFullCigar( const TTrVector& cigar, const string& mismatches )
411 {
412 // cigar = 10A5^C10
413 // misms = 16M1D4M1I6M
414 // result = 10M1R5M1D4M1I6M
415
416 TTrVector fullcigar;
417 const char * misms = mismatches.c_str();
418 if( misms[0] ) {
419 const char * m = misms;
420 TTrVector::const_iterator c = cigar.begin();
421
422 while( c != cigar.end() && c->GetEvent() == CTrBase::eEvent_SoftMask || c->GetEvent() == CTrBase::eEvent_HardMask )
423 fullcigar.AppendItem( *c++ );
424
425 int acc = 0;
426
427 string errmsgbase;
428 do {
429 ostringstream s;
430 s << "Inconsistent cigar " << cigar.ToString() << " and mismatch string " << misms;
431 errmsgbase = s.str();
432 } while( 0 );
433
434 while( *m ) {
435 if( c == cigar.end() )
436 THROW( runtime_error, errmsgbase << ": cigar is too short" );
437 if( isdigit( *m ) ) {
438 int count = strtol( m, (char**)&m, 10 );
439 if( c->GetEvent() != CTrBase::eEvent_Match )
440 THROW( runtime_error, errmsgbase << ": cigar M is expected" );
441 acc -= c->GetCount() - count;
442 fullcigar.AppendItem( CTrBase::eEvent_Match, min( (int) c->GetCount(), count ) );
443 if( acc == 0 ) { ++c; continue; }
444 else if( acc < 0 ) {
445 ++c;
446 // NB: overlap is not supported here... it is not clear what to do according to SAM specs
447 if( c == cigar.end() || c->GetEvent() != CTrBase::eEvent_Insertion )
448 THROW( runtime_error, errmsgbase << ": cigar I is expected" );
449 fullcigar.AppendItem( CTrBase::eEvent_Insertion, c->GetCount() );
450 ++c;
451 } else { // acc > 0
452 // do nothing
453 }
454 } else if( strchr( "ACGTN", *m ) ) {
455 fullcigar.AppendItem( CTrBase::eEvent_Replaced, 1 );
456 ++c;
457 } else if( *m == '^' ) {
458 if( acc != 0 )
459 THROW( runtime_error, errmsgbase << ": misplaced deletion" );
460 int k = c->GetCount();
461 if( c->GetEvent() != CTrBase::eEvent_Deletion && c->GetEvent() != CTrBase::eEvent_Splice )
462 THROW( runtime_error, errmsgbase << ": N or D is expected" );
463 while( strchr( "ACGTN", *++m ) ) {
464 fullcigar.AppendItem( c->GetEvent(), 1 );
465 --k;
466 }
467 if( k != 0 )
468 THROW( runtime_error, errmsgbase << ": different sizes of deletion or splice" );
469 ++c;
470 }
471 }
472 while( c != cigar.end() ) {
473 if( c->GetEvent() != CTrBase::eEvent_SoftMask && c->GetEvent() != CTrBase::eEvent_HardMask )
474 THROW( runtime_error, errmsgbase << ": cigar is too long" );
475 fullcigar.AppendItem( *c++ );
476 }
477 } else {
478 fullcigar = cigar;
479 }
480 return fullcigar;
481 }
482
IndexFile(const string & samFile,unsigned start,unsigned limit)483 void CSamSource::IndexFile( const string& samFile, unsigned start, unsigned limit )
484 {
485 ifstream in( samFile.c_str() );
486 string buff;
487 string lastId;
488 CProgressIndicator p( "Indexing " + samFile + " by contig" );
489 unsigned cnt = 0;
490 limit += start;
491 for( Uint8 offset = 0; getline( in, buff ); offset = in.tellg() ) {
492 if( buff[0] == '@' ) continue;
493 if( cnt++ < start ) continue;
494 if( cnt > limit ) break;
495 vector<string> fields;
496 Split( buff, "\t", back_inserter( fields ) );
497 if( fields.size() < 6 )
498 THROW( runtime_error, "SAM is supposed to have at least 6 columns, got " <<
499 fields.size() << " in [" << buff << "] in line" );
500 if( fields[eCol_rname] != lastId ) {
501 m_alignmentOffsets[lastId = fields[eCol_rname]].push_back( offset );
502 }
503 p.Increment();
504 }
505 p.Summary();
506 }
507
GetIupacna(CSeqCoding::EStrand strand) const508 string CSRead::GetIupacna( CSeqCoding::EStrand strand ) const
509 {
510 string iupac( x_GetSequenceLength(), '.' );
511 const char * seqdata = m_data + x_GetSequenceOffset();
512 int i = 0;
513 int I = iupac.length();
514 int j = strand == CSeqCoding::eStrand_pos ? 0 : iupac.size() - 1;
515 int d = strand == CSeqCoding::eStrand_pos ? 1 : -1;
516 switch( x_GetSequenceCoding() ) {
517 case CSeqCoding::eCoding_ncbi8na:
518 for( ; i < I; ++i, (j += d) ) { iupac[i] = CIupacnaBase( CNcbi8naBase( seqdata[j], strand ) ); }
519 break;
520 default:
521 THROW( logic_error, __PRETTY_FUNCTION__ << " does not support sequence coding other then NCBI-8na" );
522 break;
523 }
524 return iupac;
525 }
526
WriteAsSam(ostream & out) const527 void CMappedAlignment::WriteAsSam( ostream& out ) const
528 {
529 CSeqCoding::EStrand strand = IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
530 int flags = IsReverseComplement() ? CSamBase::fSeqReverse : 0;
531 string iupac = m_query->GetIupacna( strand );
532 TTags * tt = m_tags;
533 if( GetSubject()->GetSequenceData() != 0 && GetSubject()->GetSequenceCoding() == CSeqCoding::eCoding_ncbi8na ) {
534 CSamMdFormatter md( GetSubject()->GetSequenceData() + GetSubjectBounding().GetFrom(), iupac.c_str(), GetCigar() );
535 if( md.Format() ) {
536 tt = new TTags();
537 if( m_tags ) copy( m_tags->begin(), m_tags->end(), inserter( *tt, tt->end() ) );
538 tt->insert( make_pair( "MD:Z", md.GetString() ) );
539 }
540 }
541 out
542 << GetQuery()->GetId() << "\t"
543 << flags << "\t"
544 << GetSubject()->GetId() << "\t"
545 << (GetSubjectBounding().GetFrom() + 1) << "\t"
546 << 255 << "\t"
547 << GetCigar().ToString() << "\t"
548 << "*\t0\t0\t" << iupac << "\t*";
549 if( tt ) {
550 ITERATE( TTags, tag, (*tt) ) {
551 out << "\t" << tag->first << ":" << tag->second;
552 }
553 if( tt != m_tags ) delete tt;
554 }
555 out << "\n";
556 }
557
CMappedAlignment(const CMappedAlignment & other)558 CMappedAlignment::CMappedAlignment( const CMappedAlignment& other ) :
559 m_query( other.m_flags & fOwnQuery ? new CSRead( *other.m_query ) : m_query ),
560 m_subject( other.m_flags & fOwnSubject ? new CContig( *other.m_subject ) : m_subject ),
561 m_mate( 0 ),
562 m_sstart( other.m_sstart ),
563 m_slength( other.m_slength ),
564 m_qstart( other.m_qstart ),
565 m_qlength( other.m_qlength ),
566 m_flags( other.m_flags ),
567 m_cigar( other.m_cigar ),
568 m_tags( other.m_tags ? new TTags( *other.m_tags ) : 0 )
569 {
570 ++s_count;
571 }
572
Assign(const CMappedAlignment * other)573 void CMappedAlignment::Assign( const CMappedAlignment * other )
574 {
575 if( this == other ) { return; }
576 ASSERT( other );
577 if( m_query != other->m_query ) {
578 if( m_flags & fOwnQuery ) delete m_query;
579 if( other->m_flags & fOwnQuery ) {
580 m_query = new CSRead( *other->m_query );
581 m_flags |= fOwnQuery;
582 } else {
583 m_query = other->m_query;
584 m_flags &= ~fOwnQuery;
585 }
586 }
587 if( m_subject != other->m_subject ) {
588 if( m_flags & fOwnSubject ) delete m_subject;
589 if( other->m_flags & fOwnSubject ) {
590 m_subject = new CContig( *other->m_subject );
591 m_flags |= fOwnSubject;
592 } else {
593 m_subject = other->m_subject;
594 m_flags &= ~fOwnSubject;
595 }
596 }
597 if( m_tags ) delete m_tags;
598 m_mate = 0;
599 m_sstart = other->m_sstart;
600 m_slength = other->m_slength;
601 m_qstart = other->m_qstart;
602 m_qlength = other->m_qlength;
603 m_flags = other->m_flags; //(other->m_flags&~fOwnSeqs) | (m_flags&fOwnSeqs);
604 m_cigar = other->m_cigar;
605 m_tags = other->m_tags ? new TTags( *other->m_tags ) : 0;
606 }
607
MakeExtended(IAligner * aligner) const608 CMappedAlignment * CMappedAlignment::MakeExtended( IAligner * aligner ) const
609 {
610 if( GetCigar().front().GetEvent() != CTrBase::eEvent_SoftMask &&
611 GetCigar().back().GetEvent() != CTrBase::eEvent_SoftMask ) return 0;
612 CSeqCoding::EStrand strand = IsReverseComplement() ? CSeqCoding::eStrand_neg : CSeqCoding::eStrand_pos;
613 TRange qbb( GetQueryBounding() );
614 TRange sbb( GetSubjectBounding() );
615
616 aligner->SetSubjectStrand( strand );
617 aligner->SetWordDistance( ComputeDistance( GetCigar() ) );
618 aligner->SetQuery( m_query->GetSequenceData(), m_query->GetSequenceLength() );
619 aligner->SetQueryCoding( m_query->GetSequenceCoding() );
620 aligner->SetQueryAnchor( qbb.GetFrom(), qbb.GetTo() );
621 aligner->SetSubjectAnchor( sbb.GetFrom(), sbb.GetTo() );
622 aligner->SetSubjectGuideTranscript( GetCigar() );
623 if( aligner->Align() )
624 if( ScoreAlignment() <= ScoreAlignment( aligner->GetSubjectTranscript() ) ) // ( aligner->GetSubjectTo() - aligner->GetSubjectFrom() + 1 ) >= sbb.GetLength() )
625 return new CMappedAlignment(
626 m_flags & fOwnQuery ? new CSRead( *m_query ) : m_query,
627 m_flags & fOwnSubject ? new CContig( *m_subject ) : m_subject,
628 aligner->GetSubjectFrom(), aligner->GetSubjectTranscript(), strand,
629 m_flags, m_tags ? new TTags( *m_tags ) : 0 );
630 else
631 cerr << "WARNING: " << m_cigar.ToString()
632 << " scores " << ScoreAlignment() << ", " << aligner->GetSubjectTranscript().ToString()
633 << " scores " << ScoreAlignment( aligner->GetSubjectTranscript() ) << "\n";
634 else cerr << "WARNING: alignment failed (distance = " << ComputeDistance( GetCigar() ) << ")\n";
635
636 return 0;
637 }
638
AdjustAlignment(EAdjustMode amode,int fromBy,int toBy)639 void CMappedAlignment::AdjustAlignment( EAdjustMode amode, int fromBy, int toBy )
640 // NB: this procedure is well debugged (for case when there are no indels) on Oct 15 2009 for all combinations of cases where fromBy * toBy = 0
641 // Also versions which increase alignment length should work fine with indels (since indels are not touched)
642 // Tested all versions, should not throw an exception, but will warn to stderr and overadjust in subj coords alignments like 1M2D10M3D2M
643 // TODO: a version of this function may use subject sequence to mark some 'replacement' positions as 'match' where there is actual match
644 {
645 if( s_validate_consistency ) ValidateConsistency( "AdjustAlignment start" );
646 enum EFlags {
647 fFromGt0 = 0x01,
648 fFromLt0 = 0x02,
649 fToLess0 = 0x04,
650 fToMore0 = 0x08,
651 fQryMode = 0x10,
652 fSbjMode = 0x00,
653 fReverse = 0x20,
654 fForward = 0x00,
655 fNULL = 0
656 };
657 //ASSERT(fromBy == 0 || toBy == 0);
658 unsigned flags = 0;
659 if( fromBy > 0 ) flags |= fFromGt0; else if( fromBy < 0 ) flags |= fFromLt0;
660 if( toBy > 0 ) flags |= fToMore0; else if( toBy < 0 ) flags |= fToLess0;
661 if( amode == eAdjust_query ) flags |= fQryMode;
662 if( IsReverseComplement() ) flags |= fReverse;
663 int qlength = GetQuery()->GetSequenceLength();
664 switch( flags ) {
665 case fSbjMode|fForward:
666 case fSbjMode|fReverse:
667 case fQryMode|fForward:
668 case fQryMode|fReverse: return; // do nothing - no offsets
669 case fSbjMode|fForward|fFromLt0:
670 case fSbjMode|fReverse|fFromLt0:
671 case fQryMode|fForward|fFromLt0:
672 case fQryMode|fReverse|fToMore0:
673 do {
674 int by = toBy - fromBy; // >0
675 switch( flags ) {
676 case fSbjMode|fForward|fFromLt0:
677 case fQryMode|fForward|fFromLt0: if( by > GetQueryBounding().GetFrom() ) by = GetQueryBounding().GetFrom(); break;
678 case fSbjMode|fReverse|fFromLt0:
679 case fQryMode|fReverse|fToMore0: if( by > qlength - 1 - GetQueryBounding().GetTo() ) by = qlength - 1 - GetQueryBounding().GetTo(); break;
680 }
681 if( by == 0 ) break;
682 ASSERT( by > 0 );
683 TTrSequence result;
684 TTrSequence::const_iterator x = m_cigar.begin();
685 ASSERT( x != m_cigar.end() );
686 if( x->GetEvent() != CTrBase::eEvent_SoftMask ) {
687 cerr << "Warning: attempt to extend alignment with no soft mask, ignoring\n";
688 by = 0;
689 } else if( (int)x->GetCount() < by ) {
690 cerr << "Warning: by of " << by << " is too large for CIGAR " << m_cigar.ToString() << ", reducing\n";
691 by = x->GetCount();
692 }
693 result.AppendItem( CTrBase::eEvent_SoftMask, x->GetCount() - by );
694 result.AppendItem( CTrBase::eEvent_Replaced, by );
695 for( ++x; x != m_cigar.end(); ++x ) result.AppendItem( *x );
696 m_cigar = result;
697 m_sstart -= by;
698 m_slength += by;
699 if( IsReverseComplement() ) by = -by;
700 m_qstart -= by;
701 m_qlength += by;
702 } while(0);
703 break;
704 case fSbjMode|fForward|fToMore0:
705 case fSbjMode|fReverse|fToMore0:
706 case fQryMode|fForward|fToMore0:
707 case fQryMode|fReverse|fFromLt0:
708 do {
709 int by = toBy - fromBy; // >0
710 switch( flags ) {
711 case fSbjMode|fForward|fToMore0:
712 case fQryMode|fForward|fToMore0: if( by > qlength - 1 - GetQueryBounding().GetTo() ) by = qlength - 1 - GetQueryBounding().GetTo(); break;
713 case fSbjMode|fReverse|fToMore0:
714 case fQryMode|fReverse|fFromLt0: if( by > GetQueryBounding().GetFrom() ) by = GetQueryBounding().GetFrom(); break;
715 }
716 if( by == 0 ) break;
717 ASSERT( by > 0 );
718 TTrSequence result;
719 TTrSequence::const_iterator x = m_cigar.begin();
720 ASSERT( x != m_cigar.end() );
721 result.AppendItem( *x );
722 for( ++x ; x != m_cigar.end(); ++x ) {
723 if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
724 if( (int)x->GetCount() < by ) {
725 cerr << "Warning: by of " << by << " is too large for CIGAR " << m_cigar.ToString() << ", reducing\n";
726 by = x->GetCount();
727 }
728 result.AppendItem( CTrBase::eEvent_Replaced, by );
729 result.AppendItem( CTrBase::eEvent_SoftMask, x->GetCount() - by );
730 } else result.AppendItem( *x );
731 }
732 m_cigar = result;
733 m_slength += by;
734 if( IsReverseComplement() ) by = -by;
735 m_qlength += by;
736 } while(0);
737 break;
738 case fSbjMode|fForward|fFromGt0:
739 case fSbjMode|fReverse|fFromGt0:
740 case fQryMode|fForward|fFromGt0:
741 case fQryMode|fReverse|fToLess0:
742 do {
743 int by = fromBy - toBy; // >0
744 ASSERT( by > 0 );
745 pair<int,int> ll = m_cigar.ComputeLengths();
746 if( ll.first <= by || ll.second <= by ) {
747 cerr << "Warning: too short alignment to clip it by " << by << ": " << m_cigar.ToString() << "\n";
748 return;
749 }
750 vector<CTrBase::EEvent> result;
751 result.reserve( by );
752 TTrSequence::const_iterator x = m_cigar.begin();
753 ASSERT( x != m_cigar.end() );
754 if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
755 for( int i = 0; i < x->GetCount(); ++i )
756 result.push_back( x->GetEvent() );
757 ++x;
758 }
759 int count = x->GetCount();
760 ASSERT( count > 0 );
761 int d = IsReverseComplement() ? -1 : 1;
762 for( ; by > 0 ; ) {
763 switch( x->GetEvent() ) {
764 case CTrBase::eEvent_Match:
765 case CTrBase::eEvent_Replaced:
766 case CTrBase::eEvent_Changed:
767 --by;
768 result.push_back( CTrBase::eEvent_SoftMask );
769 m_sstart ++;
770 m_slength --;
771 m_qstart += d;
772 m_qlength -= d;
773 break;
774 case CTrBase::eEvent_Insertion:
775 if( amode == eAdjust_query ) --by;
776 result.push_back( CTrBase::eEvent_SoftMask );
777 m_qstart += d;
778 m_qlength -= d;
779 break;
780 case CTrBase::eEvent_Deletion:
781 if( amode == eAdjust_subj ) --by;
782 m_sstart ++;
783 m_slength --;
784 break;
785 case CTrBase::eEvent_Padding:
786 break;
787 default: THROW( logic_error, "Procedure does not support CIGAR like " << m_cigar.ToString() << " in " __FILE__ ":" << __LINE__ );
788 }
789 if( --count == 0 ) count = (++x)->GetCount();
790 if( x == m_cigar.end() ) THROW( logic_error, "Oops... unexpected CIGAR end (forward) " << m_cigar.ToString() );
791 }
792 switch( x->GetEvent() ) {
793 case CTrBase::eEvent_Insertion:
794 if( amode == eAdjust_subj ) {
795 for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_SoftMask );
796 m_qstart += d * count;
797 m_qlength -= d * count;
798 } else {
799 for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_Replaced );
800 m_slength += count;
801 m_sstart -= count;
802 }
803 count = 0;
804 break;
805 case CTrBase::eEvent_Deletion:
806 if( amode == eAdjust_subj ) {
807 if( (int)result.size() < count ) {
808 m_sstart += count - result.size();
809 m_slength -= count - result.size();
810 count = result.size();
811 cerr << "WARNING: "__FILE__":" << __LINE__ << " can't adjust precisely strange alignment of " << m_cigar.ToString() << "\n";
812 }
813 //ASSERT( (int)result.size() >= count );
814 for( int i = (int)result.size() - count; i < (int)result.size(); ++i )
815 result[i] = CTrBase::eEvent_Replaced;
816 m_qstart -= d * count;
817 m_qlength += d * count;
818 } else {
819 m_slength -= count;
820 m_sstart += count;
821 }
822 count = 0;
823 break;
824 default:;
825 }
826 TTrSequence tmp;
827 ITERATE( vector<CTrBase::EEvent>, r, result ) tmp.AppendItem( *r );
828 while( count-- > 0 ) tmp.AppendItem( x->GetEvent() );
829 for( ++x ; x != m_cigar.end(); ++x ) tmp.AppendItem( *x );
830 m_cigar = tmp;
831 } while(0);
832 break;
833 case fSbjMode|fForward|fToLess0:
834 case fSbjMode|fReverse|fToLess0:
835 case fQryMode|fForward|fToLess0:
836 case fQryMode|fReverse|fFromGt0:
837 do {
838 int by = fromBy - toBy; // >0
839 ASSERT( by > 0 );
840 pair<int,int> ll = m_cigar.ComputeLengths();
841 if( ll.first <= by || ll.second <= by ) {
842 cerr << "Warning: too short alignment to clip it by " << by << ": " << m_cigar.ToString() << "\n";
843 return;
844 }
845 vector<CTrBase::EEvent> result;
846 result.reserve( by );
847 TTrSequence::reverse_iterator x = m_cigar.rbegin();
848 ASSERT( x != m_cigar.rend() );
849 if( x->GetEvent() == CTrBase::eEvent_SoftMask ) {
850 for( int i = 0; i < x->GetCount(); ++i )
851 result.push_back( x->GetEvent() );
852 ++x;
853 }
854 int count = x->GetCount();
855 ASSERT( count > 0 );
856 int d = IsReverseComplement() ? -1 : 1;
857 for( ; by > 0 ; ) {
858 switch( x->GetEvent() ) {
859 case CTrBase::eEvent_Match:
860 case CTrBase::eEvent_Replaced:
861 case CTrBase::eEvent_Changed:
862 --by;
863 result.push_back( CTrBase::eEvent_SoftMask );
864 m_slength --;
865 m_qlength -= d;
866 break;
867 case CTrBase::eEvent_Insertion:
868 if( amode == eAdjust_query ) --by;
869 result.push_back( CTrBase::eEvent_SoftMask );
870 m_qlength -= d;
871 break;
872 case CTrBase::eEvent_Deletion:
873 if( amode == eAdjust_subj ) --by;
874 m_slength --;
875 break;
876 case CTrBase::eEvent_Padding:
877 break;
878 default: THROW( logic_error, "Procedure does not support CIGAR like " << m_cigar.ToString() << " in " __FILE__ ":" << __LINE__ );
879 }
880 if( --count == 0 ) count = (++x)->GetCount();
881 ASSERT( x != m_cigar.rend() );
882 }
883 switch( x->GetEvent() ) {
884 case CTrBase::eEvent_Insertion:
885 if( amode == eAdjust_subj ) {
886 for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_SoftMask );
887 m_qlength -= d * count;
888 } else {
889 for( int i = 0; i < count; ++i ) result.push_back( CTrBase::eEvent_Replaced );
890 m_slength += count;
891 }
892 count = 0;
893 break;
894 case CTrBase::eEvent_Deletion:
895 if( amode == eAdjust_subj ) {
896 if( (int)result.size() < count ) {
897 m_slength -= count - result.size();
898 count = result.size();
899 cerr << "WARNING: "__FILE__":" << __LINE__ << " can't adjust precisely strange alignment of " << m_cigar.ToString() << "\n";
900 }
901 // ASSERT( (int)result.size() >= count );
902 for( int i = (int)result.size() - count; i < (int)result.size(); ++i )
903 result[i] = CTrBase::eEvent_Replaced;
904 m_qlength += d * count;
905 } else {
906 m_slength -= count;
907 }
908 count = 0;
909 break;
910 default:;
911 }
912 ASSERT( x != m_cigar.rend() );
913 TTrSequence tmp;
914 ITERATE( vector<CTrBase::EEvent>, r, result ) tmp.AppendItem( *r );
915 while( count-- > 0 ) tmp.AppendItem( x->GetEvent() );
916 for( ++x; x != m_cigar.rend(); ++x ) tmp.AppendItem( *x );
917 m_cigar.clear();
918 for( x = tmp.rbegin(); x != tmp.rend(); ++x ) m_cigar.AppendItem( *x );
919 } while(0);
920 break;
921 default:
922 THROW( logic_error, "flags " << hex << flags << dec << "h can not be processed in " __FILE__ ":" << __LINE__ );
923 }
924 if( s_validate_consistency ) {
925 ostringstream o;
926 o << hex << flags;
927 ValidateConsistency( "AdjustAlignment end " + o.str() );
928 }
929 }
930
GetTagType(const string & name) const931 char CMappedAlignment::GetTagType( const string& name ) const
932 {
933 if( m_tags == 0 ) return 0;
934 TTags::const_iterator i = m_tags->find( name );
935 if( i == m_tags->end() ) return 0;
936 return i->second[0];
937 }
938
SetTagS(const string & name,const string & value)939 void CMappedAlignment::SetTagS( const string& name, const string& value )
940 {
941 if( value.length() ) {
942 if( m_tags == 0 ) m_tags = new TTags();
943 (*m_tags)[name] = "Z:" + value;
944 } else RemoveTag( name );
945 }
946
RemoveTag(const string & name)947 void CMappedAlignment::RemoveTag( const string& name )
948 {
949 if( m_tags ) {
950 m_tags->erase( name );
951 if( m_tags->size() == 0 ) SetTags( 0 );
952 }
953 }
954
SetTagI(const string & name,int value)955 void CMappedAlignment::SetTagI( const string& name, int value )
956 {
957 if( m_tags == 0 ) m_tags = new TTags();
958 (*m_tags)[name] = "i:" + NStr::IntToString( value );
959 }
960
SetTagF(const string & name,float value)961 void CMappedAlignment::SetTagF( const string& name, float value )
962 {
963 if( m_tags == 0 ) m_tags = new TTags();
964 (*m_tags)[name] = "f:" + NStr::DoubleToString( value );
965 }
966
GetTagS(const string & name,char type) const967 string CMappedAlignment::GetTagS( const string& name, char type ) const
968 {
969 if( m_tags == 0 ) THROW( runtime_error, "No tag " << name << " is found" );
970 TTags::const_iterator i = m_tags->find( name );
971 if( i == m_tags->end() ) THROW( runtime_error, "No tag " << name << " is found" );
972 if( i->second[1] != ':' ) THROW( runtime_error, "Bad tag format for " << name << ":" << i->second );
973 if( i->second[0] != type ) THROW( runtime_error, "Bad tag type: " << i->second[0] << " while " << type << " is expected" );
974 return i->second.substr(2);
975 }
976
GetTagI(const string & name) const977 int CMappedAlignment::GetTagI( const string& name ) const
978 {
979 return NStr::StringToInt( GetTagS( name, 'i' ) );
980 }
981
GetTagF(const string & name) const982 float CMappedAlignment::GetTagF( const string& name ) const
983 {
984 return NStr::StringToDouble( GetTagS( name, 'f' ) );
985 }
986
987