1 /* $Id: splign.cpp 592158 2019-08-27 16:34:30Z vakatov $
2 * ===========================================================================
3 *
4 * public DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Yuri Kapustin, Boris Kiryutin
27 *
28 * File Description:
29 * CSplign class implementation
30 *
31 */
32
33
34 #include <ncbi_pch.hpp>
35
36 #include "splign_util.hpp"
37 #include "splign_exon_trim.hpp"
38 #include "messages.hpp"
39
40 #include <algo/align/util/hit_comparator.hpp>
41 #include <algo/align/util/compartment_finder.hpp>
42 #include <algo/align/nw/nw_band_aligner.hpp>
43 #include <algo/align/nw/nw_spliced_aligner16.hpp>
44 #include <algo/align/nw/nw_formatter.hpp>
45 #include <algo/align/nw/align_exception.hpp>
46 #include <algo/align/splign/splign.hpp>
47
48 #include <algo/sequence/orf.hpp>
49
50 #include <objmgr/scope.hpp>
51 #include <objmgr/bioseq_handle.hpp>
52 #include <objmgr/seq_vector.hpp>
53 #include <objmgr/util/seq_loc_util.hpp>
54
55 #include <objtools/alnmgr/score_builder_base.hpp>
56
57 #include <objects/seqloc/Seq_interval.hpp>
58 #include <objects/seqalign/Seq_align.hpp>
59 #include <objects/seqalign/Seq_align_set.hpp>
60 #include <objects/seqalign/Spliced_seg.hpp>
61 #include <objects/seqalign/Spliced_exon.hpp>
62 #include <objects/seqalign/Spliced_exon_chunk.hpp>
63 #include <objects/seqalign/Product_pos.hpp>
64 #include <objects/seqalign/Splice_site.hpp>
65 #include <objects/seqalign/Score.hpp>
66 #include <objects/seqalign/Score_set.hpp>
67 #include <objects/general/Object_id.hpp>
68
69 #include <math.h>
70 #include <algorithm>
71 #include <iostream>
72
73 BEGIN_NCBI_SCOPE
74
75 USING_SCOPE(objects);
76
77 namespace {
78
79 // define cut-off strategy at the terminii:
80
81 // (1) - pre-processing
82 // For non-covered ends longer than kNonCoveredEndThreshold use
83 // m_max_genomic_ext. For shorter ends use k * query_len^(1/kPower)
84
85 const Uint4 kNonCoveredEndThreshold (55);
86 const double kPower (2.5);
87
88 // (2) - post-processing
89 // term exons shorter than kMinTermExonSize with identity lower than
90 // kMinTermExonIdty will be converted to gaps
91 const size_t kMinTermExonSize (28);
92 const double kMinTermExonIdty (0.9);
93
94 //if flanking exon is closer than kFlankExonProx to the beggining/(end or polyA) of mRNA, it is NOT treated as adjecent to a gap
95 const int kFlankExonProx (20);
96
97 //maximim length to cut to splice (exons near gaps)
98 const int kMaxCutToSplice (6);
99
100 const CSplign::THit::TCoord
101 kMaxCoord (numeric_limits<CSplign::THit::TCoord>::max());
102
103
104 //original Yuri's EST scores
105
106 const int kEstMatchScore (1000);
107 const int kEstMismatchScore (-1011);
108 const int kEstGapOpeningScore(-1460);
109 const int kEstGapExtensionScore(-464);
110
111 const int kEstGtAgSpliceScore(-4988);
112 const int kEstGcAgSpliceScore(-5999);
113 const int kEstAtAcSpliceScore(-7010);
114 const int kEstNonConsensusSpliceScore(-13060);
115 }
116
117
118
119 //original Yuri's mRNA scores
120
s_GetDefaultScoringType(void)121 CSplign::EScoringType CSplign::s_GetDefaultScoringType(void) {
122 return eMrnaScoring;
123 }
124
s_GetDefaultMatchScore(void)125 int CSplign::s_GetDefaultMatchScore(void) {
126 return 1000;
127 }
128
s_GetDefaultMismatchScore(void)129 int CSplign::s_GetDefaultMismatchScore(void) {
130 return -1044;
131 }
132
s_GetDefaultGapOpeningScore(void)133 int CSplign::s_GetDefaultGapOpeningScore(void) {
134 return -3070;
135 }
136
s_GetDefaultGapExtensionScore(void)137 int CSplign::s_GetDefaultGapExtensionScore(void) {
138 return -173;
139 }
140
s_GetDefaultGtAgSpliceScore(void)141 int CSplign::s_GetDefaultGtAgSpliceScore(void) {
142 return -4270;
143 }
144
s_GetDefaultGcAgSpliceScore(void)145 int CSplign::s_GetDefaultGcAgSpliceScore(void) {
146 return -5314;
147 }
148
s_GetDefaultAtAcSpliceScore(void)149 int CSplign::s_GetDefaultAtAcSpliceScore(void) {
150 return -6358;
151 }
152
s_GetDefaultNonConsensusSpliceScore(void)153 int CSplign::s_GetDefaultNonConsensusSpliceScore(void) {
154 return -7395;
155 }
156
CSplign(void)157 CSplign::CSplign(void):
158 m_CanResetHistory (false),
159 //basic scores
160 m_ScoringType(s_GetDefaultScoringType()),
161 m_MatchScore(s_GetDefaultMatchScore()),
162 m_MismatchScore(s_GetDefaultMismatchScore()),
163 m_GapOpeningScore(s_GetDefaultGapOpeningScore()),
164 m_GapExtensionScore(s_GetDefaultGapExtensionScore()),
165 m_GtAgSpliceScore(s_GetDefaultGtAgSpliceScore()),
166 m_GcAgSpliceScore(s_GetDefaultGcAgSpliceScore()),
167 m_AtAcSpliceScore(s_GetDefaultAtAcSpliceScore()),
168 m_NonConsensusSpliceScore(s_GetDefaultNonConsensusSpliceScore()),
169 //end of basic scores
170 m_MinExonIdty(s_GetDefaultMinExonIdty()),
171 m_MinPolyaExtIdty(s_GetDefaultPolyaExtIdty()),
172 m_MinPolyaLen(s_GetDefaultMinPolyaLen()),
173 m_MinHoleLen(s_GetDefaultMinHoleLen()),
174 m_TrimToCodons(s_GetDefaultTrimToCodons()),
175 m_CompartmentPenalty(s_GetDefaultCompartmentPenalty()),
176 m_MinCompartmentIdty(s_GetDefaultMinCompartmentIdty()),
177 m_MinSingletonIdty(s_GetDefaultMinCompartmentIdty()),
178 m_MinSingletonIdtyBps(numeric_limits<size_t>::max()),
179 m_TestType(kTestType_production_default),
180 m_endgaps (true),
181 m_strand (true),
182 m_nopolya (false),
183 m_cds_start (0),
184 m_cds_stop (0),
185 m_max_genomic_ext (s_GetDefaultMaxGenomicExtent()),
186 m_MaxIntron (CCompartmentFinder<THit>::s_GetDefaultMaxIntron()),
187 m_MaxPartExonIdentDrop (s_GetDefaultMaxPartExonIdentDrop()),
188 m_model_id (0),
189 m_MaxCompsPerQuery (0),
190 m_MinPatternHitLength (13)
191 {
192 }
193
~CSplign()194 CSplign::~CSplign()
195 {
196 }
197
s_CreateVersion(void)198 static CVersionAPI* s_CreateVersion(void)
199 {
200 return new CVersion(CVersionInfo(2, 1, 0));
201 };
202
203
s_GetVersion(void)204 CVersionAPI& CSplign::s_GetVersion(void)
205 {
206 static CSafeStatic<CVersionAPI> s_Version(s_CreateVersion, 0);
207 return s_Version.Get();
208 }
209
210
SetAligner(void)211 CRef<CSplign::TAligner>& CSplign::SetAligner( void ) {
212 return m_aligner;
213 }
214
215
GetAligner(void) const216 CConstRef<CSplign::TAligner> CSplign::GetAligner( void ) const {
217 return m_aligner;
218 }
219
SetAlignerScores(void)220 void CSplign::SetAlignerScores(void) {
221 CRef<TAligner>& aligner = SetAligner();
222 aligner->SetWm (GetMatchScore());
223 aligner->SetWms (GetMismatchScore());
224 aligner->SetWg (GetGapOpeningScore());
225 aligner->SetWs (GetGapExtensionScore());
226 aligner->SetScoreMatrix(NULL);
227 aligner->SetWi(0, GetGtAgSpliceScore());
228 aligner->SetWi(1, GetGcAgSpliceScore());
229 aligner->SetWi(2, GetAtAcSpliceScore());
230 aligner->SetWi(3, GetNonConsensusSpliceScore());
231 }
232
s_CreateDefaultAligner(void)233 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(void) {
234 CRef<CSplicedAligner> aligner(
235 static_cast<CSplicedAligner*>(new CSplicedAligner16));
236 return aligner;
237 }
238
239
s_CreateDefaultAligner(bool low_query_quality)240 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(bool low_query_quality)
241 {
242 CRef<CSplicedAligner> aligner(s_CreateDefaultAligner());
243
244 if(low_query_quality) {
245 aligner->SetWm (kEstMatchScore);
246 aligner->SetWms (kEstMismatchScore);
247 aligner->SetWg (kEstGapOpeningScore);
248 aligner->SetWs (kEstGapExtensionScore);
249 aligner->SetScoreMatrix(NULL);
250 aligner->SetWi(0, kEstGtAgSpliceScore);
251 aligner->SetWi(1, kEstGcAgSpliceScore);
252 aligner->SetWi(2, kEstAtAcSpliceScore);
253 aligner->SetWi(3, kEstNonConsensusSpliceScore);
254 }
255 else {
256 aligner->SetWm (s_GetDefaultMatchScore());
257 aligner->SetWms (s_GetDefaultMismatchScore());
258 aligner->SetWg (s_GetDefaultGapOpeningScore());
259 aligner->SetWs (s_GetDefaultGapExtensionScore());
260 aligner->SetScoreMatrix(NULL);
261 aligner->SetWi(0, s_GetDefaultGtAgSpliceScore());
262 aligner->SetWi(1, s_GetDefaultGcAgSpliceScore());
263 aligner->SetWi(2, s_GetDefaultAtAcSpliceScore());
264 aligner->SetWi(3, s_GetDefaultNonConsensusSpliceScore());
265 }
266
267 return aligner;
268 }
269
270 //note: SetScoringType call with mRNA or EST type is going to switch basic scores to preset values
271
SetScoringType(EScoringType type)272 void CSplign::SetScoringType(EScoringType type)
273 {
274 m_ScoringType = type;
275 if(type == eMrnaScoring) {
276 SetMatchScore(s_GetDefaultMatchScore());
277 SetMismatchScore(s_GetDefaultMismatchScore());
278 SetGapOpeningScore(s_GetDefaultGapOpeningScore());
279 SetGapExtensionScore(s_GetDefaultGapExtensionScore());
280 SetGtAgSpliceScore(s_GetDefaultGtAgSpliceScore());
281 SetGcAgSpliceScore(s_GetDefaultGcAgSpliceScore());
282 SetAtAcSpliceScore(s_GetDefaultAtAcSpliceScore());
283 SetNonConsensusSpliceScore(s_GetDefaultNonConsensusSpliceScore());
284 } else if(type == eEstScoring) {
285 SetMatchScore(kEstMatchScore);
286 SetMismatchScore(kEstMismatchScore);
287 SetGapOpeningScore(kEstGapOpeningScore);
288 SetGapExtensionScore(kEstGapExtensionScore);
289 SetGtAgSpliceScore(kEstGtAgSpliceScore);
290 SetGcAgSpliceScore(kEstGcAgSpliceScore);
291 SetAtAcSpliceScore(kEstAtAcSpliceScore);
292 SetNonConsensusSpliceScore(kEstNonConsensusSpliceScore);
293 }
294 }
295
GetScoringType(void) const296 CSplign::EScoringType CSplign::GetScoringType(void) const {
297 return m_ScoringType;
298 }
299
SetMatchScore(int score)300 void CSplign::SetMatchScore(int score) {
301 m_MatchScore = score;
302 }
303
GetMatchScore(void) const304 int CSplign::GetMatchScore(void) const {
305 return m_MatchScore;
306 }
307
SetMismatchScore(int score)308 void CSplign::SetMismatchScore(int score) {
309 m_MismatchScore = score;
310 }
311
GetMismatchScore(void) const312 int CSplign::GetMismatchScore(void) const {
313 return m_MismatchScore;
314 }
315
SetGapOpeningScore(int score)316 void CSplign::SetGapOpeningScore(int score) {
317 m_GapOpeningScore = score;
318 }
319
GetGapOpeningScore(void) const320 int CSplign::GetGapOpeningScore(void) const {
321 return m_GapOpeningScore;
322 }
323
SetGapExtensionScore(int score)324 void CSplign::SetGapExtensionScore(int score) {
325 m_GapExtensionScore = score;
326 }
327
GetGapExtensionScore(void) const328 int CSplign::GetGapExtensionScore(void) const {
329 return m_GapExtensionScore;
330 }
331 //
SetGtAgSpliceScore(int score)332 void CSplign::SetGtAgSpliceScore(int score) {
333 m_GtAgSpliceScore = score;
334 }
335
GetGtAgSpliceScore(void) const336 int CSplign::GetGtAgSpliceScore(void) const {
337 return m_GtAgSpliceScore;
338 }
339
SetGcAgSpliceScore(int score)340 void CSplign::SetGcAgSpliceScore(int score) {
341 m_GcAgSpliceScore = score;
342 }
343
GetGcAgSpliceScore(void) const344 int CSplign::GetGcAgSpliceScore(void) const {
345 return m_GcAgSpliceScore;
346 }
347
SetAtAcSpliceScore(int score)348 void CSplign::SetAtAcSpliceScore(int score) {
349 m_AtAcSpliceScore = score;
350 }
351
GetAtAcSpliceScore(void) const352 int CSplign::GetAtAcSpliceScore(void) const {
353 return m_AtAcSpliceScore;
354 }
355
SetNonConsensusSpliceScore(int score)356 void CSplign::SetNonConsensusSpliceScore(int score) {
357 m_NonConsensusSpliceScore = score;
358 }
359
GetNonConsensusSpliceScore(void) const360 int CSplign::GetNonConsensusSpliceScore(void) const {
361 return m_NonConsensusSpliceScore;
362 }
363
SetEndGapDetection(bool on)364 void CSplign::SetEndGapDetection( bool on ) {
365 m_endgaps = on;
366 }
367
GetEndGapDetection(void) const368 bool CSplign::GetEndGapDetection( void ) const {
369 return m_endgaps;
370 }
371
SetPolyaDetection(bool on)372 void CSplign::SetPolyaDetection( bool on ) {
373 m_nopolya = !on;
374 }
375
GetPolyaDetection(void) const376 bool CSplign::GetPolyaDetection( void ) const {
377 return !m_nopolya;
378 }
379
SetStrand(bool strand)380 void CSplign::SetStrand( bool strand ) {
381 m_strand = strand;
382 }
383
GetStrand(void) const384 bool CSplign::GetStrand( void ) const {
385 return m_strand;
386 }
387
SetMinExonIdentity(double idty)388 void CSplign::SetMinExonIdentity( double idty )
389 {
390 if(!(0 <= idty && idty <= 1)) {
391 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
392 }
393 else {
394 m_MinExonIdty = idty;
395 }
396 }
397
SetPolyaExtIdentity(double idty)398 void CSplign::SetPolyaExtIdentity( double idty )
399 {
400 if(!(0 <= idty && idty <= 1)) {
401 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
402 }
403 else {
404 m_MinPolyaExtIdty = idty;
405 }
406 }
407
SetMinPolyaLen(size_t len)408 void CSplign::SetMinPolyaLen(size_t len) {
409 m_MinPolyaLen = len;
410 }
411
SetMinHoleLen(size_t len)412 void CSplign::SetMinHoleLen(size_t len) {
413 m_MinHoleLen = len;
414 }
415
SetTrimToCodons(bool val)416 void CSplign::SetTrimToCodons(bool val) {
417 m_TrimToCodons = val;
418 }
419
SetMinCompartmentIdentity(double idty)420 void CSplign::SetMinCompartmentIdentity( double idty )
421 {
422 if(!(0 <= idty && idty <= 1)) {
423 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
424 }
425 else {
426 m_MinCompartmentIdty = idty;
427 }
428 }
429
SetMinSingletonIdentity(double idty)430 void CSplign::SetMinSingletonIdentity(double idty)
431 {
432 if(!(0 <= idty && idty <= 1)) {
433 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
434 }
435 else {
436 m_MinSingletonIdty = idty;
437 }
438 }
439
SetMinSingletonIdentityBps(size_t idty_bps)440 void CSplign::SetMinSingletonIdentityBps(size_t idty_bps)
441 {
442 m_MinSingletonIdtyBps = idty_bps;
443 }
444
s_GetDefaultMaxGenomicExtent(void)445 size_t CSplign::s_GetDefaultMaxGenomicExtent(void)
446 {
447 return 35000;
448 }
449
450
SetMaxGenomicExtent(size_t mge)451 void CSplign::SetMaxGenomicExtent(size_t mge)
452 {
453 m_max_genomic_ext = mge;
454 }
455
456
GetMaxGenomicExtent(void) const457 size_t CSplign::GetMaxGenomicExtent(void) const
458 {
459 return m_max_genomic_ext;
460 }
461
462
SetMaxIntron(size_t max_intron)463 void CSplign::SetMaxIntron(size_t max_intron)
464 {
465 m_MaxIntron = max_intron;
466 }
467
468
GetMaxIntron(void) const469 size_t CSplign::GetMaxIntron(void) const
470 {
471 return m_MaxIntron;
472 }
473
474
GetMinExonIdentity(void) const475 double CSplign::GetMinExonIdentity( void ) const {
476 return m_MinExonIdty;
477 }
478
s_GetDefaultMinExonIdty(void)479 double CSplign::s_GetDefaultMinExonIdty(void)
480 {
481 return 0.75;
482 }
483
GetPolyaExtIdentity(void) const484 double CSplign::GetPolyaExtIdentity( void ) const {
485 return m_MinPolyaExtIdty;
486 }
487
s_GetDefaultPolyaExtIdty(void)488 double CSplign::s_GetDefaultPolyaExtIdty(void)
489 {
490 return 1.;
491 }
492
GetMinPolyaLen(void) const493 size_t CSplign::GetMinPolyaLen( void ) const {
494 return m_MinPolyaLen;
495 }
496
s_GetDefaultMinPolyaLen(void)497 size_t CSplign::s_GetDefaultMinPolyaLen(void)
498 {
499 return 1;
500 }
501
GetMinHoleLen(void) const502 size_t CSplign::GetMinHoleLen( void ) const {
503 return m_MinHoleLen;
504 }
505
s_GetDefaultMinHoleLen(void)506 size_t CSplign::s_GetDefaultMinHoleLen(void)
507 {
508 return 0;
509 }
510
GetTrimToCodons(void) const511 bool CSplign::GetTrimToCodons( void ) const {
512 return m_TrimToCodons;
513 }
514
s_GetDefaultTrimToCodons(void)515 bool CSplign::s_GetDefaultTrimToCodons(void)
516 {
517 return false;
518 }
519
GetMinCompartmentIdentity(void) const520 double CSplign::GetMinCompartmentIdentity(void) const {
521 return m_MinCompartmentIdty;
522 }
523
GetMinSingletonIdentity(void) const524 double CSplign::GetMinSingletonIdentity(void) const {
525 return m_MinSingletonIdty;
526 }
527
GetMinSingletonIdentityBps(void) const528 size_t CSplign::GetMinSingletonIdentityBps(void) const {
529 return m_MinSingletonIdtyBps;
530 }
531
s_GetDefaultMinCompartmentIdty(void)532 double CSplign::s_GetDefaultMinCompartmentIdty(void)
533 {
534 return 0.70;
535 }
536
SetMaxPartExonIdentDrop(double ident)537 void CSplign::SetMaxPartExonIdentDrop(double ident) {
538 m_MaxPartExonIdentDrop = ident;
539 }
540
GetMaxPartExonIdentDrop(void) const541 double CSplign::GetMaxPartExonIdentDrop(void) const {
542 return m_MaxPartExonIdentDrop;
543 }
544
s_GetDefaultMaxPartExonIdentDrop(void)545 double CSplign::s_GetDefaultMaxPartExonIdentDrop(void)
546 {
547 return 0.25;
548 }
549
SetTestType(const string & test_type)550 void CSplign::SetTestType(const string& test_type) {
551 m_TestType = test_type;
552 }
553
GetTestType(void) const554 string CSplign::GetTestType(void) const {
555 return m_TestType;
556 }
557
SetMaxCompsPerQuery(size_t m)558 void CSplign::SetMaxCompsPerQuery(size_t m) {
559 m_MaxCompsPerQuery = m;
560 }
561
GetMaxCompsPerQuery(void) const562 size_t CSplign::GetMaxCompsPerQuery(void) const {
563 return m_MaxCompsPerQuery;
564 }
565
566
567
GetScope(void) const568 CRef<objects::CScope> CSplign::GetScope(void) const
569 {
570 return m_Scope;
571 }
572
573
SetScope(void)574 CRef<objects::CScope>& CSplign::SetScope(void)
575 {
576 return m_Scope;
577 }
578
579
PreserveScope(bool preserve_scope)580 void CSplign::PreserveScope(bool preserve_scope)
581 {
582 m_CanResetHistory = ! preserve_scope;
583 }
584
585
SetCompartmentPenalty(double penalty)586 void CSplign::SetCompartmentPenalty(double penalty)
587 {
588 if(penalty < 0 || penalty > 1) {
589 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_QueryCoverageOutOfRange);
590 }
591 m_CompartmentPenalty = penalty;
592 }
593
s_GetDefaultCompartmentPenalty(void)594 double CSplign::s_GetDefaultCompartmentPenalty(void)
595 {
596 return 0.55;
597 }
598
GetCompartmentPenalty(void) const599 double CSplign::GetCompartmentPenalty( void ) const
600 {
601 return m_CompartmentPenalty;
602 }
603
x_IsInGap(THit::TCoord pos)604 bool CSplign::x_IsInGap(THit::TCoord pos)
605 {
606 if( pos+1 == 0 || pos >= m_genomic.size() ) return true;//outside genome boundaries
607 if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(), pos, 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) {//gap
608 return true;
609 }
610 return false;
611 }
612
613
x_LoadSequence(vector<char> * seq,const CSeq_id & seqid,THit::TCoord start,THit::TCoord finish,bool retain,bool is_genomic,bool genomic_strand)614 void CSplign::x_LoadSequence(vector<char>* seq,
615 const CSeq_id& seqid,
616 THit::TCoord start,
617 THit::TCoord finish,
618 bool retain, bool is_genomic, bool genomic_strand)
619 {
620
621 try {
622
623 if(m_Scope.IsNull()) {
624 NCBI_THROW(CAlgoAlignException, eInternal, "Splign scope not set");
625 }
626
627 CBioseq_Handle bh (m_Scope->GetBioseqHandle(seqid));
628
629 if( !is_genomic ) m_mrna_bio_handle = bh;
630
631 if(retain && m_CanResetHistory) {
632 m_Scope->ResetHistory(); // this does not remove the sequence
633 // referenced to by 'bh'
634 }
635
636 if(bh) {
637
638 CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
639 const TSeqPos dim (sv.size());
640 if(dim == 0) {
641 NCBI_THROW(CAlgoAlignException,
642 eNoSeqData,
643 string("Sequence is empty: ")
644 + seqid.AsFastaString());
645 }
646 if(finish >= dim) {
647 finish = dim - 1;
648 }
649
650 if(start > finish) {
651 CNcbiOstrstream ostr;
652 ostr << "Invalid sequence interval requested for "
653 << seqid.GetSeqIdString(true) << ":\t"
654 << start << '\t' << finish;
655 const string err = CNcbiOstrstreamToString(ostr);
656 NCBI_THROW(CAlgoAlignException, eNoSeqData, err);
657 }
658
659 string s;
660 sv.GetSeqData(start, finish + 1, s);
661 if(is_genomic) {//get SeqMap data
662 ENa_strand strand = eNa_strand_minus;
663 if(genomic_strand) strand = eNa_strand_plus;
664 CRef<CSeq_id> tmp_id(new CSeq_id());
665 tmp_id->Assign(seqid);
666 CSeq_loc tmp_loc(*tmp_id, start, finish, strand);
667 m_GenomicSeqMap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
668 }
669 seq->resize(1 + finish - start);
670 copy(s.begin(), s.end(), seq->begin());
671 }
672 else {
673 NCBI_THROW(CAlgoAlignException, eNoSeqData,
674 string("ID not found: ") + seqid.AsFastaString());
675 }
676
677 if(!retain && m_CanResetHistory) {
678 m_Scope->RemoveFromHistory(bh);
679 }
680 }
681
682 catch(CAlgoAlignException& e) {
683 NCBI_RETHROW_SAME(e, "CSplign::x_LoadSequence(): Sequence data problem");
684 }
685
686 if(!is_genomic) {
687 if(seq == &m_mrna) {
688 m_mrna_polya.clear();
689 m_mrna_polya.resize(seq->size());
690 copy(seq->begin(), seq->end(), m_mrna_polya.begin());
691 }
692 CSeq_id_Handle idh_key = CSeq_id_Handle::GetHandle(seqid);
693 if(m_MaskMap.find(idh_key) != m_MaskMap.end()) {
694 const TSeqRangeColl& mask_ranges = m_MaskMap.find(idh_key)->second;
695 x_MaskSequence(seq, mask_ranges, start, finish);
696 }
697 }
698 }
699
700
x_MaskSequence(vector<char> * seq,const TSeqRangeColl & mask_ranges,THit::TCoord start,THit::TCoord finish)701 void CSplign::x_MaskSequence(vector<char>* seq,
702 const TSeqRangeColl& mask_ranges,
703 THit::TCoord start,
704 THit::TCoord finish)
705 {
706 //cerr<<start<<"\t"<<finish<<endl;
707 TSeqRange loop_range;
708 for(TSeqPos loop = start; loop <= finish; loop++) {
709 loop_range.SetFrom(loop);
710 loop_range.SetLength(1);
711 if(mask_ranges.IntersectingWith(loop_range)) {
712 //cerr<<loop<<"\t"<<loop_range<<endl;
713 (*seq)[loop] = 'N';
714 }
715 }
716 //ITERATE(vector<char>, ci, (*seq)) {
717 // cerr << *ci;
718 //}
719 //cerr<<endl;
720 }
721
722
723
ClearMem(void)724 void CSplign::ClearMem(void)
725 {
726 m_Scope.Reset(NULL);
727 m_pattern.clear();
728 m_alnmap.clear();
729 m_genomic.clear();
730 m_mrna.clear();
731 m_mrna_polya.clear();
732 }
733
734
sx_NewHit(THit::TCoord q0,THit::TCoord q,THit::TCoord s0,THit::TCoord s)735 CSplign::THitRef CSplign::sx_NewHit(THit::TCoord q0, THit::TCoord q,
736 THit::TCoord s0, THit::TCoord s)
737 {
738 THitRef hr (new THit);
739 hr->SetQueryStart(q0);
740 hr->SetSubjStart(s0);
741 hr->SetQueryStop(q - 1);
742 hr->SetSubjStop(s - 1);
743 hr->SetLength(q - q0);
744 hr->SetMismatches(0);
745 hr->SetGaps(0);
746 hr->SetEValue(0);
747 hr->SetScore(2*(q - q0));
748 hr->SetIdentity(1);
749 return hr;
750 }
751
752
x_SplitQualifyingHits(THitRefs * phitrefs)753 void CSplign::x_SplitQualifyingHits(THitRefs* phitrefs)
754 {
755 const char * Seq1 (&m_mrna.front());
756 const char * Seq2 (&m_genomic.front());
757 THitRefs rv;
758 ITERATE(THitRefs, ii, (*phitrefs)) {
759
760 const THitRef & h (*ii);
761 const double idty (h->GetIdentity());
762 const bool diag (h->GetGaps() == 0 && h->GetQuerySpan() == h->GetSubjSpan());
763 if(idty == 1 || idty < .95 || h->GetLength() < 100 || !diag) {
764 rv.push_back(h);
765 }
766 else {
767
768 int q0 (-1), s0 (-1), q1 (h->GetQueryMax());
769 int q (h->GetQueryMin()), s (h->GetSubjMin());
770 size_t new_hits (0);
771 while(q <= q1) {
772 if(Seq1[q++] != Seq2[s++]) {
773 if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
774 THitRef hr (sx_NewHit(q0, q, s0, s));
775 hr->SetQueryId(h->GetQueryId());
776 hr->SetSubjId(h->GetSubjId());
777 rv.push_back(hr);
778 ++new_hits;
779 }
780 q0 = s0 = -1;
781 }
782 else {
783 if(q0 == -1) {
784 q0 = q;
785 s0 = s;
786 }
787 }
788 }
789
790 if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
791 THitRef hr (sx_NewHit(q0, q, s0, s));
792 hr->SetQueryId(h->GetQueryId());
793 hr->SetSubjId(h->GetSubjId());
794 rv.push_back(hr);
795 ++new_hits;
796 }
797
798 if(new_hits == 0) {
799 rv.push_back(h);
800 }
801 }
802 }
803
804 *phitrefs = rv;
805 }
806
807
x_SetPattern(THitRefs * phitrefs)808 void CSplign::x_SetPattern(THitRefs* phitrefs)
809 {
810 m_alnmap.clear();
811 m_pattern.clear();
812
813 // sort the input by min query coordinate
814 typedef CHitComparator<THit> THitComparator;
815 THitComparator sorter (THitComparator::eQueryMin);
816 stable_sort(phitrefs->begin(), phitrefs->end(), sorter);
817
818 // check that no two consecutive hits are farther than the max intron
819 // (extra short hits skipped)
820 // (throw out a hit if it intersects with previous on subject (genome), )
821 size_t prev (0);
822 TSeqPos prevSmin, prevSmax;
823 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
824
825 THitRef& h (*ii);
826
827 if(h->GetQuerySpan() < m_MinPatternHitLength) {
828 h.Reset(0);
829 continue;
830 }
831
832 if(prev > 0) {
833
834 const bool non_intersect = ( prevSmax < h->GetSubjMin() ) || ( prevSmin > h->GetSubjMax() );
835 if(!non_intersect) {//throw out intersecting hit
836 h.Reset(0);
837 continue;
838 }
839
840 const bool consistent (h->GetSubjStrand()?
841 (h->GetSubjStart() < prev + m_MaxIntron):
842 (h->GetSubjStart() + m_MaxIntron > prev));
843
844 if(!consistent) {
845 const string errmsg (g_msg_CompartmentInconsistent
846 + string(" (extra long introns)"));
847 NCBI_THROW(CAlgoAlignException, eIntronTooLong, errmsg);
848 }
849 }
850
851 prev = h->GetSubjStop();
852 prevSmin = h->GetSubjMin();
853 prevSmax = h->GetSubjMax();
854 }
855
856 phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
857 CHitFilter<THit>::s_PNullRef),
858 phitrefs->end());
859
860 // save each hit longer than the minimum and test whether the hit is perfect
861 vector<size_t> pattern0;
862 vector<pair<bool,double> > imperfect;
863 double max_idty (0.0);
864 for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
865
866 const THitRef & h ((*phitrefs)[i]);
867 const bool valid (true);
868 if(valid) {
869
870 pattern0.push_back(h->GetQueryMin());
871 pattern0.push_back(h->GetQueryMax());
872 pattern0.push_back(h->GetSubjMin());
873 pattern0.push_back(h->GetSubjMax());
874 const double idty (h->GetIdentity());
875 const bool imprf (idty < 1.00
876 || h->GetQuerySpan() != h->GetSubjSpan()
877 || h->GetMismatches() > 0
878 || h->GetGaps() > 0);
879 imperfect.push_back(pair<bool,double>(imprf, idty));
880 if(idty > max_idty) {
881 max_idty = idty;
882 }
883 }
884 }
885
886 if(max_idty < .85 && pattern0.size() >= 4) {
887 m_BoundingRange = pair<size_t, size_t>(pattern0[2], pattern0.back());
888 }
889 else {
890 m_BoundingRange = pair<size_t, size_t>(0, 0);
891 }
892
893 const size_t dim (pattern0.size());
894
895 const char* Seq1 (&m_mrna.front());
896 const size_t SeqLen1 (m_polya_start < kMax_UInt? m_polya_start: m_mrna.size());
897 const char* Seq2 (&m_genomic.front());
898 const size_t SeqLen2 (m_genomic.size());
899
900 // verify conditions on the input hit pattern
901 CNcbiOstrstream ostr_err;
902 bool some_error (false), bad_input (false);
903 if(dim % 4 == 0) {
904
905 for(size_t i (0); i < dim; i += 4) {
906
907 if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) {
908 ostr_err << "Pattern hits must be specified in plus strand";
909 some_error = bad_input = true;
910 break;
911 }
912
913 if(i > 4) {
914 if(pattern0[i] <= pattern0[i-3] || pattern0[i+2] <= pattern0[i-1]) {
915 ostr_err << g_msg_CompartmentInconsistent
916 << string(" (hits not sorted)");
917 some_error = true;
918 break;
919 }
920 }
921
922 const bool br1 (pattern0[i+1] >= SeqLen1);
923 const bool br2 (pattern0[i+3] >= SeqLen2);
924 if(br1 || br2) {
925
926 ostr_err << "Pattern hits out of range ("
927 << "query = "
928 << phitrefs->front()->GetQueryId()->GetSeqIdString(true)
929 << "subj = "
930 << phitrefs->front()->GetSubjId()->GetSeqIdString(true)
931 << "):" << endl;
932
933 if(br1) {
934 ostr_err << "\tquery_pattern_max = " << pattern0[i+1]
935 << "; query_len = " << SeqLen1 << endl;
936 }
937
938 if(br2) {
939 ostr_err << "\tsubj_pattern_max = " << pattern0[i+3]
940 << "; subj_len = " << SeqLen2 << endl;
941 }
942
943 some_error= true;
944 break;
945 }
946 }
947
948 }
949 else {
950 ostr_err << "Pattern dimension must be a multiple of four";
951 some_error = bad_input = true;
952 }
953
954 if(some_error) {
955 ostr_err << " (query = "
956 << phitrefs->front()->GetQueryId()->AsFastaString()
957 << " , subj = "
958 << phitrefs->front()->GetSubjId()->AsFastaString() << ')'
959 << endl;
960 }
961
962 const string err = CNcbiOstrstreamToString(ostr_err);
963 if(err.size() > 0) {
964 if(bad_input) {
965 NCBI_THROW(CAlgoAlignException, eBadParameter, err);
966 }
967 else {
968 NCBI_THROW(CAlgoAlignException, ePattern, err);
969 }
970 }
971
972 SAlnMapElem map_elem;
973 map_elem.m_box[0] = map_elem.m_box[2] = 0;
974 map_elem.m_pattern_start = map_elem.m_pattern_end = -1;
975
976 // build the alignment map
977 CBandAligner nwa;
978 for(size_t i = 0; i < dim; i += 4) {
979
980 size_t L1, R1, L2, R2;
981 size_t max_seg_size (0);
982
983 const bool imprf (imperfect[i/4].first);
984 if(imprf) {
985
986 // TODO:
987 // a better approach is to find indels and mismatches
988 // and split at the SplitQualifyingHits() stage
989 // to pass here only perfect diags
990 const size_t len1 (pattern0[i+1] - pattern0[i] + 1);
991 const size_t len2 (pattern0[i+3] - pattern0[i+2] + 1);
992 const size_t maxlen (max(len1, len2));
993 const size_t lendif (len1 < len2? len2 - len1: len1 - len2);
994 size_t band (size_t((1 - imperfect[i/4].second) * maxlen) + 2);
995 if(band < lendif) band += lendif;
996 nwa.SetBand(band);
997 nwa.SetSequences(Seq1 + pattern0[i], len1,
998 Seq2 + pattern0[i+2], len2,
999 false);
1000 nwa.Run();
1001 max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2);
1002 }
1003 else {
1004
1005 L1 = 1;
1006 R1 = pattern0[i+1] - pattern0[i] - 1;
1007 L2 = 1;
1008 R2 = pattern0[i+3] - pattern0[i+2] - 1;
1009 max_seg_size = R1 - L1 + 1;
1010 }
1011
1012 if(max_seg_size) {
1013
1014 // make the core
1015 {{
1016 size_t cut ((1 + R1 - L1) / 5);
1017 if(cut > 20) cut = 20;
1018
1019 const size_t l1 (L1 + cut), l2 (L2 + cut);
1020 const size_t r1 (R1 - cut), r2 (R2 - cut);
1021 if(l1 < r1 && l2 < r2) {
1022 L1 = l1; L2 = l2;
1023 R1 = r1; R2 = r2;
1024 }
1025 }}
1026
1027 size_t q0 (pattern0[i] + L1);
1028 size_t s0 (pattern0[i+2] + L2);
1029 size_t q1 (pattern0[i] + R1);
1030 size_t s1 (pattern0[i+2] + R2);
1031
1032 if(imprf) {
1033
1034 const size_t hitlen_q (pattern0[i + 1] - pattern0[i] + 1);
1035 const size_t sh (size_t(hitlen_q / 4));
1036
1037 size_t delta (sh > L1? sh - L1: 0);
1038 q0 += delta;
1039 s0 += delta;
1040
1041 const size_t h2s_right (hitlen_q - R1 - 1);
1042 delta = sh > h2s_right? sh - h2s_right: 0;
1043 q1 -= delta;
1044 s1 -= delta;
1045
1046 if(q0 > q1 || s0 > s1) {
1047
1048 // the longest segment too short
1049 q0 = pattern0[i] + L1;
1050 s0 = pattern0[i+2] + L2;
1051 q1 = pattern0[i] + R1;
1052 s1 = pattern0[i+2] + R2;
1053 }
1054 }
1055
1056 m_pattern.push_back(q0); m_pattern.push_back(q1);
1057 m_pattern.push_back(s0); m_pattern.push_back(s1);
1058
1059 const size_t pattern_dim = m_pattern.size();
1060 if(map_elem.m_pattern_start == -1) {
1061 map_elem.m_pattern_start = pattern_dim - 4;
1062 }
1063 map_elem.m_pattern_end = pattern_dim - 1;
1064 }
1065
1066 map_elem.m_box[1] = pattern0[i+1];
1067 map_elem.m_box[3] = pattern0[i+3];
1068 }
1069
1070 map_elem.m_box[1] = SeqLen1 - 1;
1071 map_elem.m_box[3] = SeqLen2 - 1;
1072 m_alnmap.push_back(map_elem);
1073 }
1074
1075
GetCds(const THit::TId & id,const vector<char> * seq_data)1076 CSplign::TOrfPair CSplign::GetCds(const THit::TId& id, const vector<char> * seq_data)
1077 {
1078 TOrfPair rv (TOrf(0, 0), TOrf(0, 0));
1079
1080 TStrIdToOrfs::const_iterator ie (m_OrfMap.end());
1081 const string strid (id->AsFastaString());
1082 TStrIdToOrfs::const_iterator ii (m_OrfMap.find(strid));
1083
1084 if(ii != ie) {
1085 rv = ii->second;
1086 }
1087 else {
1088
1089
1090 vector<char> seq;
1091 if(seq_data == 0) {
1092 x_LoadSequence(&seq, *id, 0, kMaxCoord, false);
1093 seq_data = & seq;
1094 }
1095
1096 // Assign CDS to the max ORF longer than 90 bps and starting from ATG
1097 //
1098 vector<CRef<CSeq_loc> > orfs;
1099 vector<string> start_codon;
1100 start_codon.push_back("ATG");
1101
1102 COrf::FindOrfs(*seq_data, orfs, 90, 1, start_codon);
1103 TSeqPos max_len_plus (0), max_len_minus (0);
1104 TSeqPos max_from_plus (0), max_from_minus (0);
1105 TSeqPos max_to_plus (0), max_to_minus (0);
1106 ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
1107
1108 const TSeqPos len (sequence::GetLength(**orf, NULL));
1109 const ENa_strand orf_strand ((*orf)->GetInt().GetStrand());
1110 const bool antisense (orf_strand == eNa_strand_minus);
1111
1112 if(antisense) {
1113 if(len > max_len_minus) {
1114 max_len_minus = len;
1115 max_from_minus = (*orf)->GetInt().GetTo();
1116 max_to_minus = (*orf)->GetInt().GetFrom();
1117 }
1118 }
1119 else {
1120 if(len > max_len_plus) {
1121 max_len_plus = len;
1122 max_from_plus = (*orf)->GetInt().GetFrom();
1123 max_to_plus = (*orf)->GetInt().GetTo();
1124 }
1125 }
1126 }
1127
1128 if(max_len_plus > 0) {
1129 rv.first = TOrf(max_from_plus, max_to_plus);
1130 }
1131
1132 if(max_len_minus > 0) {
1133 rv.second = TOrf(max_from_minus, max_to_minus);
1134 }
1135
1136 m_OrfMap[strid] = rv;
1137 }
1138
1139 return rv;
1140 }
1141
1142
x_FinalizeAlignedCompartment(SAlignedCompartment & ac)1143 void CSplign::x_FinalizeAlignedCompartment(SAlignedCompartment & ac)
1144 {
1145 ac.m_Id = ++m_model_id;
1146 ac.m_Segments = m_segments;
1147 ac.m_Status = SAlignedCompartment::eStatus_Ok;
1148 ac.m_Msg = "Ok";
1149 ac.m_Cds_start = m_cds_start;
1150 ac.m_Cds_stop = m_cds_stop;
1151 ac.m_QueryLen = m_mrna.size();
1152 ac.m_PolyA = (m_polya_start < kMax_UInt? m_polya_start : 0);
1153 }
1154
1155
1156 // PRE: Input Blast hits.
1157 // POST: TResults - a vector of aligned compartments.
Run(THitRefs * phitrefs)1158 void CSplign::Run(THitRefs* phitrefs)
1159 {
1160 if(!phitrefs) {
1161 NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected NULL pointers");
1162 }
1163
1164 THitRefs& hitrefs = *phitrefs;
1165
1166 // make sure query hit is in plus strand
1167 NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1168
1169 THitRef& h = *ii;
1170 if(h.NotNull() && h->GetQueryStrand() == false) {
1171 h->FlipStrands();
1172 }
1173 }
1174
1175 if(m_aligner.IsNull()) {
1176 NCBI_THROW(CAlgoAlignException, eNotInitialized, g_msg_AlignedNotSpecified);
1177 }
1178
1179 if(hitrefs.size() == 0) {
1180 NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_EmptyHitVectorPassed);
1181 }
1182
1183 m_result.clear();
1184
1185 THit::TId id_query (hitrefs.front()->GetQueryId());
1186
1187 const THit::TCoord mrna_size (objects::sequence::GetLength(*id_query, m_Scope));
1188 if(mrna_size == kMaxCoord) {
1189 NCBI_THROW(CAlgoAlignException, eNoSeqData,
1190 string("Sequence not found: ") + id_query->AsFastaString());
1191 }
1192
1193 // iterate through compartments
1194 const THit::TCoord min_singleton_idty_final (
1195 min(size_t(m_MinSingletonIdty * mrna_size),
1196 m_MinSingletonIdtyBps));
1197
1198 CCompartmentAccessor<THit> comps (THit::TCoord(m_CompartmentPenalty * mrna_size),
1199 THit::TCoord(m_MinCompartmentIdty * mrna_size),
1200 min_singleton_idty_final,
1201 true);
1202 comps.SetMaxIntron(m_MaxIntron);
1203 if( GetTestType() == kTestType_20_28 || GetTestType() == kTestType_20_28_plus ) {
1204 comps.Run(hitrefs.begin(), hitrefs.end(), GetScope());
1205 } else {
1206 comps.Run(hitrefs.begin(), hitrefs.end());
1207 }
1208
1209 pair<size_t,size_t> dim (comps.GetCounts()); // (count_total, count_unmasked)
1210 if(dim.second > 0) {
1211
1212 // pre-load cDNA
1213 m_mrna.clear();
1214
1215 x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1216
1217 const TOrfPair orfs (GetCds(id_query, & m_mrna));
1218 if(m_strand) {
1219 m_cds_start = orfs.first.first;
1220 m_cds_stop = orfs.first.second;
1221 }
1222 else {
1223 m_cds_start = orfs.second.first;
1224 m_cds_stop = orfs.second.second;
1225 }
1226
1227 if(!m_strand) {
1228 // make a reverse complimentary
1229 reverse (m_mrna.begin(), m_mrna.end());
1230 transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1231
1232 reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1233 transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1234 }
1235
1236 // compartments share the space between them
1237 size_t smin (0), smax (kMax_UInt);
1238 bool same_strand (false);
1239
1240 const THit::TCoord* box (comps.GetBox(0));
1241 if(m_MaxCompsPerQuery > 0 && dim.second > m_MaxCompsPerQuery) {
1242 dim.second = m_MaxCompsPerQuery;
1243 }
1244
1245 for(size_t i (0); i < dim.first; ++i, box += 4) {
1246
1247 if(i + 1 == dim.first) {
1248 smax = kMax_UInt;
1249 same_strand = false;
1250 }
1251 else {
1252 bool strand_this (comps.GetStrand(i));
1253 bool strand_next (comps.GetStrand(i+1));
1254 same_strand = strand_this == strand_next;
1255 smax = same_strand? (box + 4)[2]: kMax_UInt;
1256 }
1257
1258 try {
1259
1260 if(smax < box[3]) {
1261 // alert if not ordered by lower subject coordinate
1262 NCBI_THROW(CAlgoAlignException, eInternal,
1263 "Unexpected order of compartments");
1264 }
1265
1266 if(comps.GetStatus(i)) {
1267 THitRefs comp_hits;
1268 comps.Get(i, comp_hits);
1269
1270 if(smax < box[3]) smax = box[3];
1271 if(smin > box[2]) smin = box[2];
1272
1273 SAlignedCompartment ac (x_RunOnCompartment(&comp_hits, smin,smax));
1274 x_FinalizeAlignedCompartment(ac);
1275 m_result.push_back(ac);
1276 }
1277 }
1278
1279 catch(CAlgoAlignException& e) {
1280
1281 if(e.GetSeverity() == eDiag_Fatal) {
1282 throw;
1283 }
1284
1285 m_result.push_back(SAlignedCompartment(0, e.GetMsg().c_str()));
1286
1287 const CException::TErrCode errcode (e.GetErrCode());
1288 if(errcode != CAlgoAlignException::eNoAlignment) {
1289 m_result.back().m_Status = SAlignedCompartment::eStatus_Error;
1290 }
1291
1292 ++m_model_id;
1293 }
1294
1295 smin = same_strand? box[3]: 0;
1296 }
1297 }
1298 }
1299
1300
AlignSingleCompartment(CRef<objects::CSeq_align> compartment,SAlignedCompartment * result)1301 bool CSplign::AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
1302 SAlignedCompartment* result)
1303 {
1304 const CRef<CSeq_loc> seqloc (compartment->GetBounds().front());
1305 const size_t subj_min (seqloc->GetStart(eExtreme_Positional));
1306 const size_t subj_max (seqloc->GetStop(eExtreme_Positional));
1307
1308 THitRefs hitrefs;
1309 ITERATE(CSeq_align_set::Tdata, ii, compartment->GetSegs().GetDisc().Get()) {
1310 }
1311
1312 return AlignSingleCompartment(&hitrefs, subj_min, subj_max, result);
1313 }
1314
1315
AlignSingleCompartment(THitRefs * phitrefs,size_t subj_min,size_t subj_max,SAlignedCompartment * result)1316 bool CSplign::AlignSingleCompartment(THitRefs* phitrefs,
1317 size_t subj_min, size_t subj_max,
1318 SAlignedCompartment* result)
1319 {
1320 m_mrna.resize(0);
1321
1322 THit::TId id_query (phitrefs->front()->GetQueryId());
1323
1324 x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1325
1326 const TOrfPair orfs (GetCds(id_query, & m_mrna));
1327 if(m_strand) {
1328 m_cds_start = orfs.first.first;
1329 m_cds_stop = orfs.first.second;
1330 }
1331 else {
1332 m_cds_start = orfs.second.first;
1333 m_cds_stop = orfs.second.second;
1334 }
1335
1336 if(!m_strand) {
1337 reverse (m_mrna.begin(), m_mrna.end());
1338 transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1339
1340 reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1341 transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1342 }
1343
1344 bool rv (true);
1345 try {
1346
1347 SAlignedCompartment ac (x_RunOnCompartment(phitrefs, subj_min, subj_max));
1348 x_FinalizeAlignedCompartment(ac);
1349 *result = ac;
1350 m_mrna.resize(0);
1351 }
1352
1353 catch(CAlgoAlignException& e) {
1354
1355 m_mrna.resize(0);
1356
1357 if(e.GetSeverity() == eDiag_Fatal) {
1358 throw;
1359 }
1360
1361 *result = SAlignedCompartment(0, e.GetMsg().c_str());
1362
1363 const CException::TErrCode errcode (e.GetErrCode());
1364 if(errcode != CAlgoAlignException::eNoAlignment) {
1365 result->m_Status = SAlignedCompartment::eStatus_Error;
1366 }
1367
1368 ++m_model_id;
1369 rv = false;
1370 }
1371
1372 return rv;
1373 }
1374
IsPolyA(const char * seq,size_t polya_start,size_t dim)1375 bool CSplign::IsPolyA(const char * seq, size_t polya_start, size_t dim) {
1376 const double kMinPercAInPolya (0.80);
1377 if( polya_start + GetMinPolyaLen() > dim ) return false;
1378 size_t cnt = 0;
1379 for(size_t i = polya_start; i<dim; ++i) {
1380 if(seq[i] == 'A') ++cnt;
1381 }
1382 if(cnt >= (dim - polya_start)*kMinPercAInPolya) return true;
1383 return false;
1384 }
1385
1386 // naive polya detection; sense direction assumed
s_TestPolyA(const char * seq,size_t dim,size_t cds_stop)1387 size_t CSplign::s_TestPolyA(const char * seq, size_t dim, size_t cds_stop)
1388 {
1389 const size_t kMaxNonA (3), kMinAstreak (5);
1390 int i (dim - 1), i0 (dim);
1391 for(size_t count_non_a (0), astreak (0); i >= 0 && count_non_a < kMaxNonA; --i) {
1392
1393 if(seq[i] != 'A') {
1394 ++count_non_a;
1395 astreak = 0;
1396 }
1397 else {
1398 if(++astreak >= kMinAstreak) {
1399 i0 = i;
1400 }
1401 }
1402 }
1403
1404 const size_t len (dim - i0);
1405 size_t rv;
1406 if(len >= kMinAstreak) {
1407 rv = i0;
1408 if(0 < cds_stop && cds_stop < dim && rv <= cds_stop) {
1409 rv = cds_stop + 1;
1410 }
1411 }
1412 else {
1413 rv = kMax_UInt;
1414 }
1415
1416 return rv;
1417 }
1418
1419
1420 // PRE: Hits (initial, not transformed) representing the compartment;
1421 // maximum genomic sequence span;
1422 // pre-loaded and appropriately transformed query sequence.
1423 // POST: A set of segments packed into the aligned compartment.
1424
x_RunOnCompartment(THitRefs * phitrefs,size_t range_left,size_t range_right)1425 CSplign::SAlignedCompartment CSplign::x_RunOnCompartment(THitRefs* phitrefs,
1426 size_t range_left,
1427 size_t range_right)
1428 {
1429 SAlignedCompartment rv;
1430
1431 try {
1432 m_segments.clear();
1433
1434 if(range_left > range_right) {
1435 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange);
1436 }
1437
1438 if(phitrefs->size() == 0) {
1439 NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_NoHitsAfterFiltering);
1440 }
1441
1442 const size_t mrna_size (m_mrna.size());
1443
1444 if(m_strand == false) {
1445
1446 // adjust the hits
1447 for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
1448
1449 THitRef& h ((*phitrefs)[i]);
1450 THit::TCoord a0 (mrna_size - h->GetQueryMin() - 1);
1451 THit::TCoord a1 (mrna_size - h->GetQueryMax() - 1);
1452 const bool new_strand (!(h->GetSubjStrand()));
1453 h->SetQueryStart(a1);
1454 h->SetQueryStop(a0);
1455 h->SetSubjStrand(new_strand);
1456 }
1457 }
1458
1459 m_polya_start = m_nopolya?
1460 kMax_UInt:
1461 s_TestPolyA(&m_mrna_polya.front(), m_mrna_polya.size(), m_cds_stop);
1462
1463 // cleave off hits beyond polya
1464 if(m_polya_start < kMax_UInt) {
1465 CleaveOffByTail(phitrefs, m_polya_start);
1466 }
1467
1468 // keep short terminal hits out of the pattern
1469 THitRefs::iterator ii (phitrefs->begin()), jj (phitrefs->end() - 1);
1470 const size_t min_termhitlen1 (m_MinPatternHitLength);
1471 const size_t min_termhitlen2 (2*m_MinPatternHitLength);
1472 bool b0 (true), b1 (true);
1473 while(b0 && b1 && ii < jj) {
1474
1475 while(ii->IsNull() && ii < jj) ++ii;
1476 while(jj->IsNull() && ii < jj) --jj;
1477
1478 if(ii < jj) {
1479
1480 const double hit_idty ((*ii)->GetIdentity());
1481 const size_t min_termhitlen (
1482 hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1483
1484 if((*ii)->GetQuerySpan() < min_termhitlen) {
1485 ii++ -> Reset(0);
1486 }
1487 else {
1488 b0 = false;
1489 }
1490 }
1491
1492 if(ii < jj) {
1493
1494 const double hit_idty ((*jj)->GetIdentity());
1495 const size_t min_termhitlen (
1496 hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1497
1498 if((*jj)->GetQuerySpan() < min_termhitlen) {
1499 jj-- -> Reset(0);
1500 }
1501 else {
1502 b1 = false;
1503 }
1504 }
1505 }
1506
1507 phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
1508 CHitFilter<THit>::s_PNullRef),
1509 phitrefs->end());
1510
1511 if(phitrefs->size() == 0) {
1512 NCBI_THROW(CAlgoAlignException, eNoHits,
1513 g_msg_NoHitsAfterFiltering);
1514 }
1515
1516
1517 // find regions of interest on mRna (query) and contig (subj)
1518 THit::TCoord span [4];
1519 CHitFilter<THit>::s_GetSpan(*phitrefs, span);
1520 THit::TCoord qmin (span[0]), qmax (span[1]), smin (span[2]), smax (span[3]);
1521
1522 const bool ctg_strand (phitrefs->front()->GetSubjStrand());
1523
1524 // m1: estimate terminal genomic extents based on uncovered end sizes
1525 const THit::TCoord extent_left_m1 (x_GetGenomicExtent(qmin));
1526 const THit::TCoord rspace ((m_polya_start < kMax_UInt?
1527 m_polya_start: mrna_size) - qmax - 1 );
1528 const THit::TCoord extent_right_m1 (x_GetGenomicExtent(rspace));
1529
1530 // m2: estimate genomic extents using compartment hits
1531 THit::TCoord fixed_left (kMaxCoord / 2), fixed_right(fixed_left);
1532
1533 const size_t kTermLenCutOff_m2 (10);
1534 const bool fix_left (qmin <= kTermLenCutOff_m2);
1535 const bool fix_right (rspace <= kTermLenCutOff_m2);
1536 if(fix_left || fix_right) {
1537
1538 if(phitrefs->size() > 1) {
1539
1540 // select based on the max intron length
1541 THit::TCoord max_intron (0);
1542 THit::TCoord prev_start (phitrefs->front()->GetSubjStart());
1543
1544 ITERATE(THitRefs, ii, (*phitrefs)) {
1545
1546 const THit::TCoord cur_start ((*ii)->GetSubjStart());
1547 const THit::TCoord intron (cur_start >= prev_start?
1548 cur_start - prev_start:
1549 prev_start - cur_start);
1550 if(intron > max_intron) {
1551 max_intron = intron;
1552 }
1553 prev_start = cur_start;
1554 }
1555
1556 const double factor (2.5);
1557 if(fix_left) { fixed_left = THit::TCoord(max_intron * factor); }
1558 if(fix_right) { fixed_right = THit::TCoord(max_intron * factor); }
1559 }
1560 else {
1561 // stay conservative for single-hit compartments
1562 const THit::TCoord single_hit_extent (300);
1563 if(fix_left) { fixed_left = single_hit_extent; }
1564 if(fix_right) { fixed_right = single_hit_extent; }
1565 }
1566 }
1567
1568 const THit::TCoord extent_left_m2 (100 + max(fixed_left, qmin));
1569 const THit::TCoord extent_right_m2 (100 + max(fixed_right, rspace));
1570
1571 const THit::TCoord extent_left (min(extent_left_m1, extent_left_m2));
1572 THit::TCoord extent_right (min(extent_right_m1, extent_right_m2));
1573
1574 //add polya length to extent
1575 THit::TCoord poly_length = m_polya_start < kMax_UInt ? mrna_size - m_polya_start : 0;
1576 if(extent_right < poly_length) extent_right = poly_length;
1577
1578 if(ctg_strand) {
1579 smin = max(0, int(smin - extent_left));
1580 smax += extent_right;
1581 }
1582 else {
1583 smin = max(0, int(smin - extent_right));
1584 smax += extent_left;
1585 }
1586
1587 // regardless of hits, entire cDNA is aligned (without the tail, if any)
1588 qmin = 0;
1589 qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1;
1590
1591 // make sure to obey the genomic range specified
1592 if(smin < range_left) {
1593 smin = range_left;
1594 }
1595 if(smax > range_right) {
1596 smax = range_right;
1597 }
1598
1599 //prohibit extension to go over over non-bridgeable gaps
1600 if(phitrefs->size() > 1) {
1601 THit::TId id_query (phitrefs->front()->GetSubjId());
1602 CRef<CSeq_id> tmp_id(new CSeq_id());
1603 tmp_id->Assign(*id_query);
1604 TSeqPos hitmin(span[2]);
1605 TSeqPos hitmax(span[3]);
1606
1607 //left
1608 if(hitmin > smin) {
1609 CSeq_loc tmp_loc(*tmp_id, smin, hitmin - 1, eNa_strand_plus);
1610 CConstRef<CSeqMap> smap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
1611 if(smap) {
1612 TSeqPos tmplen = hitmin - smin;
1613 CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(), 0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1614 for(;smit; ++smit) {
1615 if(smit.GetType() == CSeqMap::eSeqGap) {
1616 CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
1617 if(slit && slit->IsBridgeable() == CSeq_literal::e_NotBridgeable) {
1618 TSeqPos pos = smit.GetEndPosition();//exclusive
1619 _ASSERT( smin + pos <= hitmin );
1620 smin += pos;
1621 }
1622 }
1623 }
1624 }
1625 }
1626 //right
1627 if(smax > hitmax) {
1628 CSeq_loc tmp_loc(*tmp_id, hitmax + 1, smax, eNa_strand_plus);
1629 CConstRef<CSeqMap> smap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
1630 if(smap) {
1631 TSeqPos tmplen = smax - hitmax;
1632 CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(), 0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1633 for(;smit; ++smit) {
1634 if(smit.GetType() == CSeqMap::eSeqGap) {
1635 CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
1636 if(slit && slit->IsBridgeable() == CSeq_literal::e_NotBridgeable) {
1637 TSeqPos pos = smit.GetPosition();
1638 _ASSERT( hitmax + pos < smax );
1639 smax = hitmax + pos;
1640 }
1641 }
1642 }
1643 }
1644 }
1645
1646 }
1647
1648
1649 m_genomic.clear();
1650 x_LoadSequence(&m_genomic, *(phitrefs->front()->GetSubjId()),
1651 smin, smax, true, true, ctg_strand);
1652
1653 // adjust smax if beyond the end
1654 const THit::TCoord ctg_end (smin + m_genomic.size());
1655 if(smax >= ctg_end) {
1656 smax = ctg_end > 0? ctg_end - 1: 0;
1657 }
1658
1659 if(ctg_strand == false) {
1660
1661 // make reverse complementary
1662 // for the contig's area of interest
1663 reverse (m_genomic.begin(), m_genomic.end());
1664 transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(),
1665 SCompliment());
1666 }
1667
1668 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1669
1670 THitRef& h (*ii);
1671
1672 const THit::TCoord hsmin (h->GetSubjMin());
1673 const THit::TCoord hsmax (h->GetSubjMax());
1674 if(!(smin <= hsmin && hsmax <= smax)) {
1675 CNcbiOstrstream ostr;
1676 ostr << "\nOne of compartment hits:\n" << *h
1677 << "\n goes outside the genome range = (" << smin+1 << ", " << smax+1 << ')'
1678 <<"\n allowed for the compartment";
1679 const string errmsg = CNcbiOstrstreamToString(ostr);
1680 NCBI_THROW(CAlgoAlignException, ePattern, errmsg);
1681 }
1682
1683 if(ctg_strand == false) {
1684
1685 THit::TCoord a2 (smax - (hsmax - smin));
1686 THit::TCoord a3 (smax - (hsmin - smin));
1687 h->SetSubjStart(a2);
1688 h->SetSubjStop(a3);
1689 }
1690 }
1691
1692 rv.m_QueryStrand = m_strand;
1693 rv.m_SubjStrand = ctg_strand;
1694
1695 // shift hits so that they originate from zero
1696 NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1697 (*ii)->Shift(-(Int4)qmin, -(Int4)smin);
1698 }
1699
1700 x_SplitQualifyingHits(phitrefs);
1701 x_SetPattern(phitrefs);
1702 rv.m_Score = x_Run(&m_mrna.front(), &m_genomic.front());
1703
1704 const size_t seg_dim (m_segments.size());
1705 if(seg_dim == 0) {
1706 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
1707 }
1708
1709 //trim holes to codons
1710 if( GetTrimToCodons() ) {
1711 CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
1712 trim.TrimHolesToCodons(m_segments, m_mrna_bio_handle, m_strand, m_mrna.size());
1713 }
1714
1715 //look for the last exon
1716 int last_exon = -1;
1717 for(int i = m_segments.size(); i > 0; ) {
1718 --i;
1719 if(m_segments[i].m_exon) {
1720 last_exon = i;
1721 break;
1722 }
1723 }
1724
1725 if(last_exon == -1) {//no exons found
1726 NCBI_THROW(CAlgoAlignException, eNoAlignment,g_msg_NoExonsAboveIdtyLimit);
1727 }
1728
1729
1730 // try to extend the last exon as long as it's a good match (min exon identity at the end required)
1731 TSegment& s (const_cast<TSegment&>(m_segments[last_exon]));
1732
1733 const char* p0 = &m_mrna.front() + s.m_box[1] + 1;
1734 const char* q0 = &m_genomic.front() + s.m_box[3] + 1;
1735 const char* p = p0;
1736 const char* q = q0;
1737 const char* pe = &m_mrna.front() + mrna_size;
1738 const char* qe = &m_genomic.front() + m_genomic.size();
1739
1740 int match_num = 0;
1741 size_t sh = 0, ct =0;
1742 for(; p < pe && q < qe; ++p, ++q, ++ct) {
1743 if(toupper(*p) != 'N' && *p == *q) {
1744 ++match_num;
1745 if( match_num >= (ct+1)*GetMinExonIdentity() ) { // % ident
1746 sh = ct+1;
1747 }
1748 }
1749 }
1750
1751 // cut low identity flank region in extention
1752 const double kMinExonFlankIdty (GetPolyaExtIdentity());
1753 if(sh) {
1754 p = p0+(sh-1);
1755 q = q0+(sh-1);
1756 ct = 1;
1757 match_num = 0;
1758 for(;p>=p0;--p,--q,++ct) {
1759 if(toupper(*p) != 'N' && *p == *q) {
1760 ++match_num;
1761 } else {
1762 if( match_num < ct*kMinExonFlankIdty) {//cut flank
1763 sh = p - p0;
1764 ct = 1;
1765 match_num = 0;
1766 }
1767 }
1768 }
1769 }
1770
1771 if(sh) {
1772 // resize
1773 s.m_box[1] += sh;
1774 s.m_box[3] += sh;
1775 for(ct = 0,p = p0, q = q0; ct < sh; ++p, ++q, ++ct) {
1776 if(toupper(*p) != 'N' && *p == *q) {
1777 s.m_details.append(1, 'M');
1778 } else {
1779 s.m_details.append(1, 'R');
1780 }
1781 }
1782 s.Update(m_aligner);
1783
1784 // fix annotation
1785 const size_t ann_dim = s.m_annot.size();
1786 if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') {
1787 s.m_annot[ann_dim - 2] = q < qe? *q: ' ';
1788 s.m_annot[ann_dim - 1] = q < (qe-1)? *(q+1): ' ';
1789 }
1790 }
1791
1792 m_segments.resize(last_exon + 1);
1793
1794 //check if the rest is polya or a gap
1795 THit::TCoord coord = s.m_box[1] + 1;
1796 m_polya_start = kMax_UInt;
1797 if(coord < mrna_size ) {//there is unaligned flanking part of mRNA
1798 if(!m_nopolya && IsPolyA(&m_mrna_polya.front(), coord, m_mrna_polya.size())) {//polya
1799 m_polya_start = coord;
1800 } else {//gap
1801 if( GetTestType() == kTestType_20_28 || GetTestType() == kTestType_20_28_plus ) {
1802 if( ( (int)mrna_size - (int)s.m_box[1] - 1 ) >= kFlankExonProx &&
1803 ! x_IsInGap( s.m_box[3] + 1) ) {//a sequence gap, but not a genomic gap, cut to splice
1804 int seq1_pos = (int)s.m_box[1];
1805 int seq2_pos = (int)s.m_box[3];
1806 int det_pos = s.m_details.size() - 1;
1807 int min_det_pos = det_pos - kMaxCutToSplice;
1808 int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
1809 while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
1810 if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
1811 toupper(m_genomic[seq2_pos+1]) == 'G' && toupper(m_genomic[seq2_pos+2]) == 'T' ) {//GT point
1812 if( (size_t)det_pos + 1 < s.m_details.size() ) {//resize
1813 s.m_box[1] = seq1_pos;
1814 s.m_box[3] = seq2_pos;
1815 s.m_details.resize(det_pos + 1);
1816 s.Update(m_aligner);
1817 // update the last two annotation symbols
1818 size_t adim = s.m_annot.size();
1819 if(adim > 0 && s.m_annot[adim-1] == '>') {
1820 s.m_annot += "GT";
1821 } else if(adim > 2 && s.m_annot[adim-3] == '>') {
1822 s.m_annot[adim-2] = 'G';
1823 s.m_annot[adim-1] = 'T';
1824 }
1825 coord = seq1_pos+1;
1826 }
1827 break;
1828 }
1829 switch(s.m_details[det_pos]) {
1830 case 'M' :
1831 --seq1_pos;
1832 --seq2_pos;
1833 break;
1834 case 'R' :
1835 --seq1_pos;
1836 --seq2_pos;
1837 break;
1838 case 'I' :
1839 --seq2_pos;
1840 break;
1841 case 'D' :
1842 --seq1_pos;
1843 break;
1844 }
1845 --det_pos;
1846 }
1847 }
1848 }
1849
1850 TSegment ss;
1851 ss.m_box[0] = coord;
1852 ss.m_box[1] = mrna_size - 1;
1853 ss.SetToGap();
1854 m_segments.push_back(ss);
1855 }
1856 }
1857
1858
1859
1860 // scratch it if the total coverage is too low
1861 double mcount (0);
1862 ITERATE(TSegments, jj, m_segments) {
1863 if(jj->m_exon) {
1864 mcount += jj->m_idty * jj->m_len;
1865 }
1866 }
1867
1868 const THit::TCoord min_singleton_idty_final (
1869 min(size_t(m_MinSingletonIdty * qmax), m_MinSingletonIdtyBps));
1870
1871 if(mcount < min_singleton_idty_final) {
1872 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
1873 }
1874
1875 // convert coordinates back to original
1876 NON_CONST_ITERATE(TSegments, jj, m_segments) {
1877
1878 if(rv.m_QueryStrand) {
1879 jj->m_box[0] += qmin;
1880 jj->m_box[1] += qmin;
1881 }
1882 else {
1883 jj->m_box[0] = mrna_size - jj->m_box[0] - 1;
1884 jj->m_box[1] = mrna_size - jj->m_box[1] - 1;
1885 }
1886
1887 if(jj->m_exon) {
1888 if(rv.m_SubjStrand) {
1889 jj->m_box[2] += smin;
1890 jj->m_box[3] += smin;
1891 }
1892 else {
1893 jj->m_box[2] = smax - jj->m_box[2];
1894 jj->m_box[3] = smax - jj->m_box[3];
1895 }
1896 }
1897 }
1898
1899 if(!rv.m_QueryStrand && m_polya_start > 0 && m_polya_start < mrna_size) {
1900 m_polya_start = mrna_size - m_polya_start - 1;
1901 }
1902 } // try
1903
1904 catch(CAlgoAlignException& e) {
1905
1906 const CException::TErrCode errcode (e.GetErrCode());
1907 bool severe (true);
1908 switch(errcode) {
1909 case CAlgoAlignException::eNoAlignment:
1910 case CAlgoAlignException::eMemoryLimit:
1911 case CAlgoAlignException::eNoHits:
1912 case CAlgoAlignException::eIntronTooLong:
1913 // case CAlgoAlignException::ePattern:
1914 severe = false;
1915 break;
1916 }
1917
1918 if(severe) {
1919 e.SetSeverity(eDiag_Fatal);
1920 }
1921 throw;
1922 }
1923
1924 return rv;
1925 }
1926
1927 // at this level and below, plus strand is assumed for both sequences
x_Run(const char * Seq1,const char * Seq2)1928 float CSplign::x_Run(const char* Seq1, const char* Seq2)
1929 {
1930 typedef TSegments TSegmentVector;
1931 TSegmentVector segments;
1932
1933 //#define DBG_DUMP_PATTERN
1934 #ifdef DBG_DUMP_PATTERN
1935 cerr << "Pattern:" << endl;
1936 #endif
1937
1938 const size_t map_dim (m_alnmap.size());
1939 if(map_dim != 1) {
1940 NCBI_THROW(CAlgoAlignException, eInternal, "Multiple maps not supported");
1941 }
1942
1943 float rv (0);
1944 size_t cds_start (0), cds_stop (0);
1945 for(size_t i (0); i < map_dim; ++i) {
1946
1947 const SAlnMapElem& zone (m_alnmap[i]);
1948
1949 // setup sequences
1950 const size_t len1 (zone.m_box[1] - zone.m_box[0] + 1);
1951 const size_t len2 (zone.m_box[3] - zone.m_box[2] + 1);
1952
1953 // remap cds if antisense
1954 if(m_strand) {
1955 cds_start = m_cds_start;
1956 cds_stop = m_cds_stop;
1957 }
1958 else {
1959 cds_start = len1 - m_cds_start - 1;
1960 cds_stop = len1 - m_cds_stop - 1;
1961 }
1962
1963 m_aligner->SetSequences(Seq1 + zone.m_box[0], len1,
1964 Seq2 + zone.m_box[2], len2,
1965 false);
1966
1967 // prepare the pattern
1968 vector<size_t> pattern;
1969 if(m_pattern.size() > 0) {
1970
1971 if(zone.m_pattern_start < 0) {
1972 NCBI_THROW(CAlgoAlignException, eInternal,
1973 "CSplign::x_Run(): Invalid alignment pattern");
1974 }
1975
1976 copy(m_pattern.begin() + zone.m_pattern_start,
1977 m_pattern.begin() + zone.m_pattern_end + 1,
1978 back_inserter(pattern));
1979 }
1980
1981 for(size_t j (0), pt_dim (pattern.size()); j < pt_dim; j += 4) {
1982
1983 #ifdef DBG_DUMP_PATTERN
1984 cerr << (1 + pattern[j]) << '\t' << (1 + pattern[j+1]) << '\t'
1985 << "(len = " << (pattern[j+1] - pattern[j] + 1) << ")\t"
1986 << (1 + pattern[j+2]) << '\t' << (1 + pattern[j+3])
1987 << "(len = " << (pattern[j+3] - pattern[j+2] + 1) << ")\t"
1988 << endl;
1989 #undef DBG_DUMP_PATTERN
1990 #endif
1991
1992 pattern[j] -= zone.m_box[0];
1993 pattern[j+1] -= zone.m_box[0];
1994 pattern[j+2] -= zone.m_box[2];
1995 pattern[j+3] -= zone.m_box[2];
1996 }
1997
1998 // setup the aligner
1999 m_aligner->SetPattern(pattern);
2000 m_aligner->SetEndSpaceFree(true, true, true, true);
2001 m_aligner->SetCDS(cds_start, cds_stop);
2002
2003 rv += m_aligner->Run();
2004 m_aligner->CheckPreferences();
2005
2006 // #define DBG_DUMP_TYPE2
2007 #ifdef DBG_DUMP_TYPE2
2008 {{
2009 CNWFormatter fmt (*m_aligner);
2010 string txt;
2011 fmt.AsText(&txt, CNWFormatter::eFormatType2);
2012 cerr << txt;
2013 }}
2014 #undef DBG_DUMP_TYPE2
2015 #endif
2016
2017 CNWFormatter formatter (*m_aligner);
2018 formatter.MakeSegments(&segments);
2019
2020 // append a gap
2021 if(i + 1 < map_dim) {
2022 segments.push_back(TSegment());
2023 TSegment& g (segments.back());
2024 g.m_box[0] = zone.m_box[1] + 1;
2025 g.m_box[1] = m_alnmap[i+1].m_box[0] - 1;
2026 g.m_box[2] = zone.m_box[3] + 1;
2027 g.m_box[3] = m_alnmap[i+1].m_box[2] - 1;
2028 g.SetToGap();
2029 }
2030 } // zone iterations end
2031
2032
2033 //#define DUMP_ORIG_SEGS
2034 #ifdef DUMP_ORIG_SEGS
2035 cerr << "Orig segments:" << endl;
2036 ITERATE(TSegmentVector, ii, segments) {
2037 cerr << ii->m_exon << '\t' << ii->m_idty << '\t' << ii->m_len << '\t'
2038 << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2039 << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2040 << ii->m_annot << '\t' << ii->m_score << endl;
2041 }
2042 #endif
2043
2044 if(segments.size() == 0) {
2045 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
2046 }
2047
2048 // segment-level postprocessing
2049
2050 const size_t SeqLen2 (m_genomic.size());
2051 const size_t SeqLen1 (m_polya_start == kMax_UInt?
2052 m_mrna.size():
2053 m_polya_start);
2054
2055 // if the limiting range is set, clear all segments beyond the range
2056 if(m_BoundingRange.second > 0) {
2057
2058 NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2059 if(ii->m_exon &&
2060 (ii->m_box[3] < m_BoundingRange.first
2061 || ii->m_box[2] > m_BoundingRange.second))
2062 {
2063 ii->SetToGap();
2064 }
2065 }
2066 }
2067
2068 m_segments.resize(0);
2069
2070 /// postprocessing starts here
2071
2072 bool is_test = false;
2073 bool is_test_plus = false;
2074 if( GetTestType() == kTestType_20_28 ) {
2075 is_test = true;
2076 } else if( GetTestType() == kTestType_20_28_plus ) {
2077 is_test = true;
2078 is_test_plus = true;
2079 }
2080
2081 CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
2082
2083 //partial trimming near sequence gaps
2084 {{
2085 if (is_test) { // test mode
2086 bool first = true;
2087 TSegmentVector::iterator prev;
2088 NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2089 if(ii->m_exon == false) continue;
2090 if(first) {
2091 first = false;
2092 } else {
2093 if(prev->m_exon) {//if not exon, it will be trimmed later anyway
2094 /* old logic
2095 TSeqPos from = prev->m_box[3];
2096 TSeqPos length = ii->m_box[2] - prev->m_box[3] + 1;
2097 if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(), from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) { //gap, trim.
2098
2099 // TEST OUTPUT
2100 CSeqMap_CI smit = m_GenomicSeqMap->ResolvedRangeIterator(GetScope(), from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
2101 CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
2102 string type = "not_gap";
2103 if(smit.GetType() == CSeqMap::eSeqGap) type = "gap";
2104 cout<<"Type: "<<type;
2105 if(slit) {
2106 cout<<" Position: "<<smit.GetPosition()+1;
2107 cout<<" Length: "<<smit.GetLength();
2108 cout<<" End Position: "<<smit.GetEndPosition()+1;
2109 }
2110 cout<<endl;
2111
2112 if(is_test_plus) {
2113 trim.ImproveFromRight(*prev);
2114 trim.ImproveFromLeft(*ii);
2115 } else {
2116 prev->ImproveFromRight1(Seq1, Seq2, m_aligner);
2117 ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2118 }
2119 */
2120
2121 //trim only if one exon abuts a gap, and the other one not
2122 bool prev_abuts_gap = x_IsInGap(prev->m_box[3] + 1);
2123 bool abuts_gap = x_IsInGap( ii->m_box[2] - 1 );
2124 if( abuts_gap && !prev_abuts_gap ) trim.ImproveFromRight(*prev);
2125 if( !abuts_gap && prev_abuts_gap ) trim.ImproveFromLeft(*ii);
2126
2127 //add an alignment gap if needed
2128 if( ii->m_box[0] > prev->m_box[1] + 1) {
2129 TSegment sgap;
2130 sgap.m_box[0] = prev->m_box[1] + 1;
2131 sgap.m_box[2] = prev->m_box[3] + 1;
2132 sgap.m_box[1] = ii->m_box[0] - 1;
2133 sgap.m_box[3] = ii->m_box[2] - 1;
2134 sgap.SetToGap();
2135 ii = segments.insert(ii, sgap);
2136 ++ii;
2137 }
2138 //} end of old logic
2139 }
2140 }
2141 prev = ii;
2142 }
2143 if(first) NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons found
2144 }
2145 }}
2146
2147 /// trimming iterations
2148 while(true) {
2149
2150 bool continue_iterations = false;
2151
2152 if(m_segments.size() > 0) {
2153 segments.resize(m_segments.size());
2154 copy(m_segments.begin(), m_segments.end(), segments.begin());
2155 m_segments.resize(0);
2156 }
2157
2158 if(segments.size() == 0) {
2159 return 0;
2160 }
2161
2162 size_t exon_count0 (0);
2163 ITERATE(TSegmentVector, ii, segments) {
2164 if(ii->m_exon) ++exon_count0;
2165 }
2166
2167 //extend 100% near gaps and flanks
2168 if ( is_test ) { // test mode
2169 bool first_exon = true;
2170 TSegment *last_exon = NULL;
2171 for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2172 TSegment& s = segments[k0];
2173 int ext_len = 0;
2174 int ext_max = 0;
2175 if(s.m_exon) {
2176 if(first_exon) {
2177 first_exon = false;
2178 ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2179 s.ExtendLeft(m_mrna, m_genomic, ext_len, m_aligner);
2180 } else if( !segments[k0-1].m_exon ) {//extend near gap
2181 //extend previous exon to right
2182 ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2183 //exons should not intersect
2184 ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2185 last_exon->ExtendRight(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2186 //extend current exon to left
2187 ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2188 ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2189 s.ExtendLeft(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2190 }
2191 last_exon = &s;
2192 }
2193 }
2194 if(last_exon == NULL) {
2195 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2196 } else {
2197 int ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2198 last_exon->ExtendRight(m_mrna, m_genomic, ext_len, m_aligner);
2199 }
2200
2201 //extension sometimes leads to gap dissapearance
2202 //remove/correct gaps after extension of exons
2203 {{
2204 TSegmentVector tmp_segments;
2205 TSegment g;
2206 g.SetToGap();
2207 int prev_exon_index = -1;
2208 for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2209 if(segments[k0].m_exon) {
2210 if(prev_exon_index == -1) {//first exon
2211 if(segments[k0].m_box[0] > 0) {//gap at the beginning
2212 g.m_box[0] = 0;
2213 g.m_box[1] = segments[k0].m_box[0] - 1;
2214 g.m_box[2] = 0;
2215 g.m_box[3] = segments[k0].m_box[2] - 1;
2216 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2217 tmp_segments.push_back(g);
2218 }
2219
2220 } else {
2221 if( segments[prev_exon_index].m_box[1] + 1 < segments[k0].m_box[0] ) {// is there a gap?
2222 g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2223 g.m_box[1] = segments[k0].m_box[0] - 1;
2224 g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2225 g.m_box[3] = segments[k0].m_box[2] - 1;
2226 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2227 tmp_segments.push_back(g);
2228 }
2229 }
2230 prev_exon_index = (int)k0;
2231 tmp_segments.push_back(segments[k0]);
2232 }
2233 }
2234
2235 //check right end
2236 if(prev_exon_index >= 0) {
2237 if(segments[prev_exon_index].m_box[1] + 1 < SeqLen1) {
2238 g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2239 g.m_box[1] = SeqLen1 - 1;
2240 g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2241 g.m_box[3] = SeqLen2 - 1;
2242 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2243 tmp_segments.push_back(g);
2244 }
2245 } else {
2246 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2247 }
2248 segments.swap(tmp_segments);
2249 }}
2250 }
2251
2252
2253
2254 //PARTIAL TRIMMING OF TERMINAL EXONS
2255 if( !is_test ) {//default production
2256 // Go from the ends and see if we can improve term exons
2257 //note that it continue trimming of exons until s.m_idty is high
2258 size_t k0 (0);
2259 while(k0 < segments.size()) {
2260
2261 TSegment& s = segments[k0];
2262 if(s.m_exon) {
2263
2264 const size_t len (1 + s.m_box[1] - s.m_box[0]);
2265 const double min_idty (len >= kMinTermExonSize?
2266 m_MinExonIdty:
2267 max(m_MinExonIdty, kMinTermExonIdty));
2268 s.ImproveFromLeft(Seq1, Seq2, m_aligner);
2269 if(s.m_idty >= min_idty) {
2270 break;
2271 }
2272 }
2273 ++k0;
2274 }
2275
2276 int k1 (segments.size() - 1);
2277 while(k1 >= int(k0)) {
2278
2279 TSegment& s (segments[k1]);
2280 if(s.m_exon) {
2281
2282 const size_t len (1 + s.m_box[1] - s.m_box[0]);
2283 const double min_idty (len >= kMinTermExonSize?
2284 m_MinExonIdty:
2285 max(m_MinExonIdty, kMinTermExonIdty));
2286 s.ImproveFromRight(Seq1, Seq2, m_aligner);
2287 if(s.m_idty >= min_idty) {
2288 break;
2289 }
2290 }
2291 --k1;
2292 }
2293 } else { // test mode
2294 //trim terminal exons only
2295 //first exon
2296 NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2297 if(ii->m_exon) {
2298 if(is_test_plus) {
2299 trim.Cut50FromLeft(*ii);
2300 } else {
2301 ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2302 }
2303 break;
2304 }
2305 }
2306 //last exon
2307 NON_CONST_REVERSE_ITERATE(TSegmentVector, ii, segments) {
2308 if(ii->m_exon) {
2309 if(is_test_plus) {
2310 trim.Cut50FromRight(*ii);
2311 } else {
2312 ii->ImproveFromRight1(Seq1, Seq2, m_aligner);
2313 }
2314 break;
2315 }
2316 }
2317 }
2318
2319 //partial trimming, exons near <GAP>s, terminal exons are trimmed already
2320 for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2321 if(!segments[k0].m_exon) {
2322 if( k0 > 0 && segments[k0-1].m_exon) {
2323 if(is_test) {
2324 if(is_test_plus) {
2325 if( x_IsInGap(segments[k0-1].m_box[3] + 1) ||
2326 CSplignTrim::HasAbuttingExonOnRight(segments, k0-1) ) {
2327 //abuting an exon or a sequence gap on the genome, do not trim
2328 } else {
2329 if( ( (int)SeqLen1 - (int)segments[k0-1].m_box[1] - 1 ) >= kFlankExonProx ) {
2330 trim.ImproveFromRight(segments[k0-1]);
2331 } else {
2332 trim.Cut50FromRight(segments[k0-1]);
2333 }
2334 }
2335 } else {
2336 segments[k0-1].ImproveFromRight1(Seq1, Seq2, m_aligner);
2337 }
2338 } else {
2339 segments[k0-1].ImproveFromRight(Seq1, Seq2, m_aligner);
2340 }
2341 }
2342 if( k0 + 1 < segments.size() && segments[k0+1].m_exon) {
2343 if(is_test) {
2344 if(is_test_plus) {
2345 if( x_IsInGap(segments[k0+1].m_box[2] - 1) ||
2346 CSplignTrim::HasAbuttingExonOnLeft(segments, k0+1) ) {
2347 //abuting an exon or a sequence gap on the genome, do not trim
2348 } else {
2349 if( (int)segments[k0+1].m_box[0] >= kFlankExonProx ) {
2350 trim.ImproveFromLeft(segments[k0+1]);
2351 } else {
2352 trim.Cut50FromLeft(segments[k0+1]);
2353 }
2354 }
2355 } else {
2356 segments[k0+1].ImproveFromLeft1(Seq1, Seq2, m_aligner);
2357 }
2358 } else {
2359 segments[k0+1].ImproveFromLeft(Seq1, Seq2, m_aligner);
2360 }
2361 }
2362 }
2363 }
2364
2365
2366 //CORRECTIONS AFTER PARTIAL TRIMMING
2367
2368 if( segments.size() == 0 ) {
2369 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2370 }
2371
2372 // indicate any slack space on the left
2373 if(segments[0].m_box[0] > 0) {
2374
2375 TSegment g;
2376 g.m_box[0] = 0;
2377 g.m_box[1] = segments[0].m_box[0] - 1;
2378 g.m_box[2] = 0;
2379 g.m_box[3] = segments[0].m_box[2] - 1;
2380 g.SetToGap();
2381 segments.insert(segments.begin(), g);
2382 }
2383
2384 // same on the right
2385 TSegment& seg_last (segments.back());
2386 if(seg_last.m_box[1] + 1 < SeqLen1) {
2387
2388 TSegment g;
2389 g.m_box[0] = seg_last.m_box[1] + 1;
2390 g.m_box[1] = SeqLen1 - 1;
2391 g.m_box[2] = seg_last.m_box[3] + 1;
2392 g.m_box[3] = SeqLen2 - 1;
2393 g.SetToGap();
2394 segments.push_back(g);
2395 }
2396
2397 //WHOLE EXON TRIMMING
2398
2399 //drop low complexity terminal exons
2400 {{
2401 bool first_exon = true;
2402 TSegment *last_exon = NULL;
2403 NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2404 if(ii->m_exon) {
2405 //first exon
2406 if(first_exon) {
2407 if(ii->IsLowComplexityExon(Seq1) ) {
2408 ii->SetToGap();
2409 }
2410 first_exon = false;
2411 } else {//make sure that if exon is single, it is checked once
2412 last_exon = &*ii;
2413 }
2414 }
2415 }
2416 //last exon
2417 if( last_exon != 0 ) {
2418 if(last_exon->IsLowComplexityExon(Seq1) ) {
2419 last_exon->SetToGap();
2420 }
2421 }
2422 }}
2423
2424
2425 //throw away low identity exons
2426 NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2427 if(ii->m_exon == false) continue;
2428 if(ii->m_idty < m_MinExonIdty) {
2429 if( is_test ) {//try to trim it first
2430 TSegment sl(*ii), sr(*ii);
2431 if(is_test_plus) {
2432 trim.ImproveFromLeft(sl);
2433 trim.ImproveFromRight(sr);
2434 } else {
2435 sl.ImproveFromLeft1(Seq1, Seq2, m_aligner);
2436 sr.ImproveFromRight1(Seq1, Seq2, m_aligner);
2437 }
2438 if( sl.m_details == ii->m_details && sr.m_details == ii->m_details ) {//did not help
2439 ii->SetToGap();
2440 } else {
2441 //pick better one
2442 if(is_test_plus) {
2443 //there is a splice on left, but no splice on right
2444 if( sr.m_details != ii->m_details && ii != segments.begin() && (ii-1)->m_exon && ( (ii+1) == segments.end() || !(ii+1)->m_exon ) ) {
2445 *ii = sr;
2446 //splice on right, no splice on left
2447 } else if( sl.m_details != ii->m_details && (ii+1) != segments.end() && (ii+1)->m_exon && ( ii == segments.begin() || !(ii-1)->m_exon) ) {
2448 *ii = sl;
2449 } else if(sl.m_details == ii->m_details ||
2450 (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2451 *ii = sr;
2452 } else {
2453 *ii = sl;
2454 }
2455 } else {
2456 if(sl.m_details != ii->m_details || sr.m_details != ii->m_details){
2457 if(sl.m_details == ii->m_details ||
2458 (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2459 *ii = sr;
2460 } else {
2461 *ii = sl;
2462 }
2463 }
2464 }
2465 //add gaps if needed
2466 if(ii != segments.begin() && (ii)->m_box[0] > (ii - 1)->m_box[1] + 1) {
2467 TSegment sgap;
2468 sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2469 sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2470 sgap.m_box[1] = ii->m_box[0] - 1;
2471 sgap.m_box[3] = ii->m_box[2] - 1;
2472 sgap.SetToGap();
2473 ii = segments.insert(ii, sgap);
2474 continue_iterations = true;
2475 ++ii;
2476 }
2477 if( (ii+1) != segments.end() && (ii+1)->m_box[0] > ii->m_box[1] + 1 ) {
2478 TSegment sgap;
2479 ++ii;
2480 sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2481 sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2482 sgap.m_box[1] = ii->m_box[0] - 1;
2483 sgap.m_box[3] = ii->m_box[2] - 1;
2484 sgap.SetToGap();
2485 ii = segments.insert(ii, sgap);
2486 continue_iterations = true;
2487 }
2488 }
2489 } else {// end of test mode
2490 //old style, just throw away
2491 ii->SetToGap();
2492 }
2493 } else if(ii->m_idty < .9 && ii->m_len < 20) {
2494 // 20_90 rule for short exons preceded/followed by non-consensus splices
2495 bool nc_prev (false), nc_next (false);
2496 if(ii != segments.begin() && (ii - 1)->m_exon) {
2497 nc_prev = ! TSegment::s_IsConsensusSplice(
2498 (ii - 1)->GetDonor(),
2499 ii->GetAcceptor());
2500 }
2501 if( (ii+1) != segments.end() && (ii + 1)->m_exon) {
2502 nc_next = ! TSegment::s_IsConsensusSplice(
2503 ii->GetDonor(),
2504 (ii + 1)->GetAcceptor());
2505 }
2506 if( nc_prev || nc_next ) {
2507 ii->SetToGap();
2508 }
2509 }
2510
2511 }
2512
2513
2514 //GP-12069
2515 //apply 40/85 rule for all 'stand alone' exons
2516 if( is_test ) {//test mode
2517 for(size_t k (0); k < segments.size(); ++k) {
2518 TSegment& s (segments[k]);
2519 if(s.m_exon == false) continue;
2520 //the first exon or gap on left
2521 if( ( k == 0 ) || ( ! segments[k-1].m_exon ) ) {
2522 //the last exon or gap on right
2523 if( ( k + 1 == segments.size() ) || ( ! segments[k+1].m_exon ) ) {
2524 //stand alone
2525 if (s.m_idty < .85 && s.m_len < 40) {
2526 s.SetToGap();
2527 }
2528 }
2529 }
2530 }
2531 }
2532
2533
2534 // turn to gaps exons with low identity
2535 //20_28_90 TEST MODE
2536 // turn to gaps exons with combination of shortness and low identity
2537 // deal with exons adjacent to gaps
2538 if ( is_test ) { // test mode
2539 for(size_t k (0); k < segments.size(); ++k) {
2540 TSegment& s (segments[k]);
2541 if(s.m_exon == false) continue;
2542
2543 bool drop (false);
2544
2545 enum EAdjustExon {
2546 eNo,
2547 eSoft, //for exons close to mRNA edge
2548 eHard
2549 } adj;
2550 adj = eNo;
2551
2552 if(k == 0) {//the first segment is an exon
2553 if( (int)s.m_box[0] >= kFlankExonProx ) {
2554 adj = eHard;
2555 } else {
2556 if(adj == eNo) adj = eSoft;
2557 }
2558 }
2559
2560 if( k + 1 == segments.size() ) {//the last segment is an exon
2561 if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2562 adj = eHard;
2563 } else {
2564 if(adj == eNo) adj = eSoft;//prevent switch from Hard to Soft
2565 }
2566 }
2567
2568 if(k > 0 && ( ! segments[k-1].m_exon ) ) {//gap on left
2569 if( (int)s.m_box[0] >= kFlankExonProx ) {
2570 adj = eHard;
2571 } else {
2572 if(adj == eNo) adj = eSoft;
2573 }
2574 }
2575
2576 if(k + 1 < segments.size() && (! segments[k+1].m_exon ) ) {//gap on right
2577 if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2578 adj = eHard;
2579 } else {
2580 if(adj == eNo) adj = eSoft;
2581 }
2582 }
2583
2584 if(adj == eSoft) {//20_90 rule
2585 if (s.m_idty < .9 && s.m_len < 20) {
2586 drop = true;
2587 }
2588 } else if(adj == eHard) {
2589 if( s.m_len < 20 ) {// 20 rule
2590 drop = true;
2591 }
2592 if ( s.m_idty < kMinTermExonIdty && s.m_len < kMinTermExonSize ) {// 28_90 rule
2593 drop = true;
2594 }
2595 }
2596 if(drop) {
2597 s.SetToGap();
2598 }
2599 }
2600 }
2601
2602
2603 // turn to gaps short weak terminal exons
2604 {{
2605 // find the two leftmost exons
2606 size_t exon_count (0);
2607 TSegment* term_segs[] = {0, 0};
2608 for(size_t i = 0; i < segments.size(); ++i) {
2609 TSegment& s = segments[i];
2610 if(s.m_exon) {
2611 term_segs[exon_count] = &s;
2612 if(++exon_count == 2) {
2613 break;
2614 }
2615 }
2616 }
2617
2618 if(exon_count == 2) {
2619 x_ProcessTermSegm(term_segs, 0);
2620 }
2621 }}
2622
2623 {{
2624 // find the two rightmost exons
2625 size_t exon_count (0);
2626 TSegment* term_segs[] = {0, 0};
2627 for(int i = segments.size() - 1; i >= 0; --i) {
2628 TSegment& s = segments[i];
2629 if(s.m_exon) {
2630 term_segs[exon_count] = &s;
2631 if(++exon_count == 2) {
2632 break;
2633 }
2634 }
2635 }
2636
2637 if(exon_count == 2) {
2638 x_ProcessTermSegm(term_segs, 1);
2639 }
2640 }}
2641
2642 // turn to gaps extra-short exons preceded/followed by gaps
2643 bool gap_prev (false);
2644 for(size_t k (0); k < segments.size(); ++k) {
2645
2646 TSegment& s (segments[k]);
2647 if(s.m_exon == false) {
2648 gap_prev = true;
2649 }
2650 else {
2651 size_t length (s.m_box[1] - s.m_box[0] + 1);
2652 bool gap_next (false);
2653 if(k + 1 < segments.size()) {
2654 gap_next = !segments[k+1].m_exon;
2655 }
2656 if(length <= 10 && (gap_prev || gap_next)) {
2657 s.SetToGap();
2658 }
2659 gap_prev = false;
2660 }
2661 }
2662
2663
2664 // merge all adjacent gaps
2665 int gap_start_idx (-1);
2666 if(segments.size() && segments[0].m_exon == false) {
2667 gap_start_idx = 0;
2668 }
2669
2670 for(size_t k (0); k < segments.size(); ++k) {
2671 TSegment& s (segments[k]);
2672 if(!s.m_exon) {
2673 if(gap_start_idx == -1) {
2674 gap_start_idx = int(k);
2675 if(k > 0) {
2676 s.m_box[0] = segments[k-1].m_box[1] + 1;
2677 s.m_box[2] = segments[k-1].m_box[3] + 1;
2678 }
2679 }
2680 }
2681 else {
2682 if(gap_start_idx >= 0) {
2683 TSegment& g = segments[gap_start_idx];
2684 g.m_box[1] = s.m_box[0] - 1;
2685 g.m_box[3] = s.m_box[2] - 1;
2686 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2687 g.m_details.resize(0);
2688 m_segments.push_back(g);
2689 gap_start_idx = -1;
2690 }
2691 m_segments.push_back(s);
2692 }
2693 }
2694
2695 if(gap_start_idx >= 0) {
2696 TSegment& g (segments[gap_start_idx]);
2697 g.m_box[1] = segments[segments.size()-1].m_box[1];
2698 g.m_box[3] = segments[segments.size()-1].m_box[3];
2699 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2700 g.m_details.resize(0);
2701 m_segments.push_back(g);
2702 }
2703
2704 size_t exon_count1 (0);
2705 ITERATE(TSegments, ii, m_segments) {
2706 if(ii->m_exon) ++exon_count1;
2707 }
2708
2709 if(exon_count1 == 0 ) {
2710 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2711 }
2712
2713 if(exon_count0 == exon_count1 && continue_iterations == false) break;
2714
2715 } // end of trimming iterations
2716
2717 //cut to AG/GT
2718 {{
2719
2720
2721 if ( is_test ) { // test mode
2722 bool first_exon = true;
2723 size_t sdim = m_segments.size();
2724 int last_exon_index = -1;
2725 for(size_t k0 = 0; k0 < sdim; ++k0) {
2726 if(m_segments[k0].m_exon) {
2727 last_exon_index = (int)k0;
2728 }
2729 }
2730 for(size_t k0 = 0; k0 < sdim; ++k0) {
2731 TSegment& s = m_segments[k0];
2732 bool cut_from_left = false;
2733 bool cut_from_right = false;
2734 if(s.m_exon) {
2735 //check left
2736 if( s.m_box[2] == 0 || x_IsInGap(s.m_box[2] - 1) ||
2737 CSplignTrim::HasAbuttingExonOnLeft(m_segments, k0) ) {
2738 //abuting an exon or a contig border or a sequence gap on the genome, do not cut
2739 first_exon = false;
2740 } else {
2741 if(first_exon) {
2742 if( (int)s.m_box[0] >= kFlankExonProx ) {//gap on left
2743 cut_from_left = true;
2744 }
2745 first_exon = false;
2746 } else if( ! m_segments[k0-1].m_exon ) {//gap on left
2747 cut_from_left = true;
2748 }
2749 }
2750 //check right
2751 if( x_IsInGap(s.m_box[3] + 1) ||
2752 CSplignTrim::HasAbuttingExonOnRight(m_segments, k0) ) {
2753 //abuting an exon or a sequence gap on the genome, do not cut
2754 } else {
2755 if( last_exon_index == (int)k0 ) {
2756 if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {//gap on right
2757 cut_from_right = true;
2758 }
2759 } else if(k0 + 1 < sdim && (! m_segments[k0+1].m_exon ) ) {//gap on right
2760 cut_from_right = true;
2761 }
2762 }
2763 //try to cut from left
2764 if(cut_from_left) {
2765 int seq1_pos = (int)s.m_box[0];
2766 int seq2_pos = (int)s.m_box[2];
2767 int det_pos = 0;
2768 int max_pos = (int)s.m_box[1] - 8;//exon should not be too short
2769 while(seq1_pos <= max_pos && det_pos <= kMaxCutToSplice) {
2770 if( seq2_pos > 1 && s.m_details[det_pos] == 'M' &&
2771 toupper(Seq2[seq2_pos-2]) == 'A' && toupper(Seq2[seq2_pos-1]) == 'G' ) {//AG point
2772 if(det_pos > 0) {//resize
2773 s.m_box[0] = seq1_pos;
2774 s.m_box[2] = seq2_pos;
2775 s.m_details.erase(0, det_pos);
2776 s.Update(m_aligner);
2777 // update the first two annotation symbols
2778 if(s.m_annot.size() > 0 && s.m_annot[0] == '<') {
2779 s.m_annot = "AG" + s.m_annot;
2780 } else if(s.m_annot.size() > 2 && s.m_annot[2] == '<') {
2781 s.m_annot[0] = 'A';
2782 s.m_annot[1] = 'G';
2783 }
2784 if( k0>0 && ( !m_segments[k0-1].m_exon ) ) {//adjust previos gap
2785 TSegment& g = m_segments[k0-1];
2786 g.m_box[1] = s.m_box[0] - 1;
2787 g.m_box[3] = s.m_box[2] - 1;
2788 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2789 }
2790 }
2791 break;
2792 }
2793 switch(s.m_details[det_pos]) {
2794 case 'M' :
2795 ++seq1_pos;
2796 ++seq2_pos;
2797 break;
2798 case 'R' :
2799 ++seq1_pos;
2800 ++seq2_pos;
2801 break;
2802 case 'I' :
2803 ++seq2_pos;
2804 break;
2805 case 'D' :
2806 ++seq1_pos;
2807 break;
2808 }
2809 ++det_pos;
2810 }
2811 }
2812
2813 //try to cut from right
2814 if(cut_from_right) {
2815 int seq1_pos = (int)s.m_box[1];
2816 int seq2_pos = (int)s.m_box[3];
2817 int det_pos = s.m_details.size() - 1;
2818 int min_det_pos = det_pos - kMaxCutToSplice;
2819 int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
2820 while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
2821 if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
2822 toupper(Seq2[seq2_pos+1]) == 'G' && toupper(Seq2[seq2_pos+2]) == 'T' ) {//GT point
2823 if( (size_t)det_pos + 1 < s.m_details.size() ) {//resize
2824 s.m_box[1] = seq1_pos;
2825 s.m_box[3] = seq2_pos;
2826 s.m_details.resize(det_pos + 1);
2827 s.Update(m_aligner);
2828 // update the last two annotation symbols
2829 size_t adim = s.m_annot.size();
2830 if(adim > 0 && s.m_annot[adim-1] == '>') {
2831 s.m_annot += "GT";
2832 } else if(adim > 2 && s.m_annot[adim-3] == '>') {
2833 s.m_annot[adim-2] = 'G';
2834 s.m_annot[adim-1] = 'T';
2835 }
2836 if( k0 + 1 < sdim && ( !m_segments[k0+1].m_exon ) ) {//adjust next gap
2837 TSegment& g = m_segments[k0+1];
2838 g.m_box[0] = s.m_box[1] + 1;
2839 g.m_box[2] = s.m_box[3] + 1;
2840 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2841 }
2842 }
2843 break;
2844 }
2845 switch(s.m_details[det_pos]) {
2846 case 'M' :
2847 --seq1_pos;
2848 --seq2_pos;
2849 break;
2850 case 'R' :
2851 --seq1_pos;
2852 --seq2_pos;
2853 break;
2854 case 'I' :
2855 --seq2_pos;
2856 break;
2857 case 'D' :
2858 --seq1_pos;
2859 break;
2860 }
2861 --det_pos;
2862 }
2863 }
2864 }
2865 }
2866 }
2867 }}
2868
2869 //throw away bad stand alone exons
2870 {{
2871 if(is_test_plus) {
2872 bool adjust = false;
2873 bool prev_exon = false;
2874 size_t ssize = m_segments.size();
2875 for(size_t pp = 0; pp <ssize ; ++pp) {
2876 if(m_segments[pp].m_exon) {
2877 if( !prev_exon && ( pp == ssize - 1 || !m_segments[pp+1].m_exon ) ) { //stand alone exon
2878 if(trim.ThrowAway20_28_90(m_segments[pp])) {
2879 adjust = true;
2880 }
2881 }
2882 prev_exon = true;
2883 } else {
2884 prev_exon = false;
2885 }
2886 }
2887 if(adjust) {
2888 trim.AdjustGaps(m_segments);
2889 }
2890 }
2891 }}
2892
2893 // stich small holes
2894 {{
2895 size_t min_hole_len = GetMinHoleLen();
2896 if( min_hole_len > 0) { //find small holes and stich
2897 trim.AdjustGaps(m_segments);//make sure there is no adjacent gaps
2898 size_t pos1 = 0, pos2 = 2;
2899 for(; pos2 < m_segments.size(); ++pos1, ++pos2) {
2900 if( m_segments[pos1].m_exon && !m_segments[pos1+1].m_exon && m_segments[pos2].m_exon &&
2901 m_segments[pos1].m_box[1] + min_hole_len >= m_segments[pos2].m_box[0] &&
2902 m_segments[pos1].m_box[3] + min_hole_len >= m_segments[pos2].m_box[2] ) {
2903
2904 trim.JoinExons(m_segments, pos1, pos2);//note: m_segments changes here!
2905 }
2906 }
2907 }
2908 }}
2909
2910 // cut gaps to codons
2911 {{
2912 bool cut_to_codons = true;
2913 if( cut_to_codons ) {
2914 }
2915 }}
2916
2917 /// there is more postprocessing to follow
2918
2919 if( m_segments.size() == 0 ) {
2920 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2921 }
2922
2923
2924 //#define DUMP_PROCESSED_SEGS
2925 #ifdef DUMP_PROCESSED_SEGS
2926 cerr << "Processed segments:" << endl;
2927 ITERATE(TSegments, ii, m_segments) {
2928 cerr << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2929 << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2930 << ii->m_annot << '\t' << ii->m_score << endl;
2931 }
2932 #endif
2933
2934 rv /= m_aligner->GetWm();
2935 return rv;
2936 }
2937
2938
GetIdentity() const2939 double CSplign::SAlignedCompartment::GetIdentity() const
2940 {
2941 string trans;
2942 for(size_t i (0), dim (m_Segments.size()); i < dim; ++i) {
2943 const TSegment & s (m_Segments[i]);
2944 if(s.m_exon) {
2945 trans.append(s.m_details);
2946 }
2947 else {
2948 trans.append(s.m_len, 'D');
2949 }
2950 }
2951 size_t matches = 0;
2952 ITERATE(string, ii, trans) {
2953 if(*ii == 'M') {
2954 ++matches;
2955 }
2956 }
2957 return double(matches) / trans.size();
2958 }
2959
2960
2961
GetBox(Uint4 * box) const2962 void CSplign::SAlignedCompartment::GetBox(Uint4* box) const
2963 {
2964 box[0] = box[2] = kMax_UInt;
2965 box[1] = box[3] = 0;
2966 ITERATE(TSegments, ii, m_Segments) {
2967 const TSegment& s (*ii);
2968 if(s.m_exon) {
2969
2970 Uint4 a, b;
2971 if(s.m_box[0] <= s.m_box[1]) {
2972 a = s.m_box[0];
2973 b = s.m_box[1];
2974 }
2975 else {
2976 b = s.m_box[0];
2977 a = s.m_box[1];
2978 }
2979 if(a < box[0]) {
2980 box[0] = a;
2981 }
2982 if(b > box[1]) {
2983 box[1] = b;
2984 }
2985
2986 if(s.m_box[2] <= s.m_box[3]) {
2987 a = s.m_box[2];
2988 b = s.m_box[3];
2989 }
2990 else {
2991 b = s.m_box[2];
2992 a = s.m_box[3];
2993 }
2994 if(a < box[2]) {
2995 box[2] = a;
2996 }
2997 if(b > box[3]) {
2998 box[3] = b;
2999 }
3000 }
3001 }
3002 }
3003
3004
x_ProcessTermSegm(TSegment ** term_segs,Uint1 side) const3005 bool CSplign::x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const
3006 {
3007 bool turn2gap (false);
3008
3009 const size_t exon_size (1 + term_segs[0]->m_box[1] -
3010 term_segs[0]->m_box[0]);
3011
3012 const double idty (term_segs[0]->m_idty);
3013
3014
3015 if( GetTestType() == kTestType_production_default ) {//default production
3016 if(exon_size < kMinTermExonSize && idty < kMinTermExonIdty ) {
3017 turn2gap = true;
3018 }
3019 }
3020
3021 if(exon_size < kMinTermExonSize) {
3022
3023 // verify that the intron is not too long
3024
3025 size_t a, b;
3026 const char *dnr, *acc;
3027 if(side == 0) {
3028 a = term_segs[0]->m_box[3];
3029 b = term_segs[1]->m_box[2];
3030 dnr = term_segs[0]->GetDonor();
3031 acc = term_segs[1]->GetAcceptor();
3032 }
3033 else {
3034 a = term_segs[1]->m_box[3];
3035 b = term_segs[0]->m_box[2];
3036 dnr = term_segs[1]->GetDonor();
3037 acc = term_segs[0]->GetAcceptor();
3038 }
3039
3040 const size_t intron_len (b - a);
3041
3042 const bool consensus (TSegment::s_IsConsensusSplice(dnr, acc));
3043
3044 size_t max_ext ((idty < .96 || !consensus || exon_size < 16)?
3045 m_max_genomic_ext: (5000 * kMinTermExonSize));
3046
3047 if(consensus) {
3048 if(exon_size < 8) {
3049 max_ext = 10 * exon_size;
3050 }
3051 }
3052 else if(exon_size < 16) {
3053 max_ext = 1;
3054 }
3055
3056 const size_t max_intron_len (x_GetGenomicExtent(exon_size, max_ext));
3057 if(intron_len > max_intron_len) {
3058 turn2gap = true;
3059 }
3060 }
3061
3062 if(turn2gap) {
3063
3064 // turn the segment into a gap
3065 TSegment& s = *(term_segs[0]);
3066 s.SetToGap();
3067 s.m_len = exon_size;
3068 }
3069
3070 return turn2gap;
3071 }
3072
3073
x_GetGenomicExtent(const Uint4 query_len,Uint4 max_ext) const3074 Uint4 CSplign::x_GetGenomicExtent(const Uint4 query_len, Uint4 max_ext) const
3075 {
3076 if(max_ext == 0) {
3077 max_ext = m_max_genomic_ext;
3078 }
3079
3080 Uint4 rv (0);
3081 if(query_len >= kNonCoveredEndThreshold) {
3082 rv = m_max_genomic_ext;
3083 }
3084 else {
3085 const double k (pow(kNonCoveredEndThreshold, - 1. / kPower) * max_ext);
3086 const double drv (k * pow(query_len, 1. / kPower));
3087 rv = Uint4(drv);
3088 }
3089
3090 return rv;
3091 }
3092
3093
3094 ////////////////////////////////////
3095
3096 namespace splign_local {
3097
3098 template<typename T>
ElemToBuffer(const T & n,char * & p)3099 void ElemToBuffer(const T& n, char*& p)
3100 {
3101 *(reinterpret_cast<T*>(p)) = n;
3102 p += sizeof(n);
3103 }
3104
3105 template<>
ElemToBuffer(const string & s,char * & p)3106 void ElemToBuffer(const string& s, char*& p)
3107 {
3108 copy(s.begin(), s.end(), p);
3109 p += s.size();
3110 *p++ = 0;
3111 }
3112
3113 template<typename T>
ElemFromBuffer(T & n,const char * & p)3114 void ElemFromBuffer(T& n, const char*& p)
3115 {
3116 n = *(reinterpret_cast<const T*>(p));
3117 p += sizeof(n);
3118 }
3119
3120 template<>
ElemFromBuffer(string & s,const char * & p)3121 void ElemFromBuffer(string& s, const char*& p)
3122 {
3123 s = p;
3124 p += s.size() + 1;
3125 }
3126 }
3127
3128
ToBuffer(TNetCacheBuffer * target) const3129 void CNWFormatter::SSegment::ToBuffer(TNetCacheBuffer* target) const
3130 {
3131 using namespace splign_local;
3132
3133 if(target == 0) {
3134 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed );
3135 }
3136
3137 const size_t total_size = sizeof m_exon + sizeof m_idty +
3138 sizeof m_len + sizeof m_box + m_annot.size() + 1 +
3139 m_details.size() + 1 + sizeof m_score;
3140
3141 target->resize(total_size);
3142
3143 char* p = &target->front();
3144 ElemToBuffer(m_exon, p);
3145 ElemToBuffer(m_idty, p);
3146 ElemToBuffer(m_len, p);
3147 for(size_t i = 0; i < 4; ++i) {
3148 ElemToBuffer(m_box[i], p);
3149 }
3150 ElemToBuffer(m_annot, p);
3151 ElemToBuffer(m_details, p);
3152 ElemToBuffer(m_score, p);
3153 }
3154
3155
FromBuffer(const TNetCacheBuffer & source)3156 void CNWFormatter::SSegment::FromBuffer(const TNetCacheBuffer& source)
3157 {
3158 using namespace splign_local;
3159
3160 const size_t min_size = sizeof m_exon + sizeof m_idty + sizeof m_len +
3161 + sizeof m_box + 1 + 1 + sizeof m_score;
3162
3163 if(source.size() < min_size) {
3164 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
3165 }
3166
3167 const char* p = &source.front();
3168 ElemFromBuffer(m_exon, p);
3169 ElemFromBuffer(m_idty, p);
3170 ElemFromBuffer(m_len, p);
3171
3172 for(size_t i = 0; i < 4; ++i) {
3173 ElemFromBuffer(m_box[i], p);
3174 }
3175
3176 ElemFromBuffer(m_annot, p);
3177 ElemFromBuffer(m_details, p);
3178 ElemFromBuffer(m_score, p);
3179 }
3180
3181
ToBuffer(TNetCacheBuffer * target) const3182 void CSplign::SAlignedCompartment::ToBuffer(TNetCacheBuffer* target) const
3183 {
3184 using namespace splign_local;
3185
3186 if(target == 0) {
3187 NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed);
3188 }
3189
3190 const size_t core_size (
3191 sizeof m_Id + sizeof m_Status + m_Msg.size() + 1
3192 + sizeof m_QueryStrand + sizeof m_SubjStrand
3193 + sizeof m_Cds_start + sizeof m_Cds_stop
3194 + sizeof m_QueryLen
3195 + sizeof m_PolyA
3196 + sizeof m_Score);
3197
3198 vector<char> core (core_size);
3199
3200 char* p = &core.front();
3201 ElemToBuffer(m_Id, p);
3202 ElemToBuffer(m_Status, p);
3203 ElemToBuffer(m_Msg, p);
3204 ElemToBuffer(m_QueryStrand, p);
3205 ElemToBuffer(m_SubjStrand, p);
3206 ElemToBuffer(m_Cds_start, p);
3207 ElemToBuffer(m_Cds_stop, p);
3208 ElemToBuffer(m_QueryLen, p);
3209 ElemToBuffer(m_PolyA, p);
3210 ElemToBuffer(m_Score, p);
3211
3212 typedef vector<TNetCacheBuffer> TBuffers;
3213 TBuffers vb (m_Segments.size());
3214 size_t ibuf (0);
3215 ITERATE(TSegments, ii, m_Segments) {
3216 ii->ToBuffer(&vb[ibuf++]);
3217 }
3218
3219 size_t total_size (core_size + sizeof(size_t) * m_Segments.size());
3220 ITERATE(TBuffers, ii, vb) {
3221 total_size += ii->size();
3222 }
3223
3224 target->resize(total_size);
3225 TNetCacheBuffer::iterator it = target->begin();
3226 copy(core.begin(), core.end(), it);
3227 it += core_size;
3228
3229 ITERATE(TBuffers, ii, vb) {
3230 char* p = &(*it);
3231 const size_t seg_buf_size = ii->size();
3232 *((size_t*)p) = seg_buf_size;
3233 it += sizeof (size_t);
3234 copy(ii->begin(), ii->end(), it);
3235 it += seg_buf_size;
3236 }
3237 }
3238
3239
FromBuffer(const TNetCacheBuffer & source)3240 void CSplign::SAlignedCompartment::FromBuffer(const TNetCacheBuffer& source)
3241 {
3242 using namespace splign_local;
3243
3244 const size_t min_size (
3245 sizeof m_Id
3246 + sizeof m_Status
3247 + 1
3248 + sizeof m_QueryStrand + sizeof m_SubjStrand
3249 + sizeof m_Cds_start + sizeof m_Cds_stop
3250 + sizeof m_QueryLen
3251 + sizeof m_PolyA
3252 + sizeof m_Score );
3253
3254 if(source.size() < min_size) {
3255 NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
3256 }
3257
3258 const char* p (&source.front());
3259 ElemFromBuffer(m_Id, p);
3260 ElemFromBuffer(m_Status, p);
3261 ElemFromBuffer(m_Msg, p);
3262 ElemFromBuffer(m_QueryStrand, p);
3263 ElemFromBuffer(m_SubjStrand, p);
3264 ElemFromBuffer(m_Cds_start, p);
3265 ElemFromBuffer(m_Cds_stop, p);
3266 ElemFromBuffer(m_QueryLen, p);
3267 ElemFromBuffer(m_PolyA, p);
3268 ElemFromBuffer(m_Score, p);
3269
3270 const char* pe (&source.back());
3271 while(p <= pe) {
3272 size_t seg_buf_size (0);
3273 ElemFromBuffer(seg_buf_size, p);
3274 m_Segments.push_back(TSegment());
3275 TSegment& seg (m_Segments.back());
3276 seg.FromBuffer(TNetCacheBuffer(p, p + seg_buf_size));
3277 p += seg_buf_size;
3278 }
3279 }
3280
s_ComputeStats(CRef<objects::CSeq_align_set> sas,TScoreSets * output_stats,TOrf cds,EStatFlags flags)3281 size_t CSplign::s_ComputeStats(CRef<objects::CSeq_align_set> sas,
3282 TScoreSets * output_stats,
3283 TOrf cds,
3284 EStatFlags flags)
3285 {
3286
3287 const
3288 bool valid_input (sas.GetPointer() && sas->CanGet() && sas->Get().size()
3289 && sas->Get().front()->CanGetSegs()
3290 && sas->Get().front()->GetSegs().IsSpliced()
3291 && sas->Get().front()->GetSegs().GetSpliced().GetProduct_type()
3292 == CSpliced_seg::eProduct_type_transcript
3293 && output_stats);
3294
3295 if(!valid_input) {
3296 NCBI_THROW(CAlgoAlignException, eBadParameter,
3297 "CSplign::s_ComputeStats(): Invalid input");
3298 }
3299
3300 output_stats->resize(0);
3301
3302 ITERATE(CSeq_align_set::Tdata, ii1, sas->Get()) {
3303 CRef<CScore_set> ss (s_ComputeStats(*ii1, false, cds, flags));
3304 output_stats->push_back(ss);
3305 }
3306
3307 return output_stats->size();
3308 }
3309
3310
3311 namespace {
3312 const int kFrame_not_set (-10);
3313 const int kFrame_end (-5);
3314 const int kFrame_lost (-20);
3315 }
3316
3317
s_ComputeStats(CRef<objects::CSeq_align> sa,bool embed_scoreset,TOrf cds,EStatFlags flags)3318 CRef<objects::CScore_set> CSplign::s_ComputeStats(CRef<objects::CSeq_align> sa,
3319 bool embed_scoreset,
3320 TOrf cds,
3321 EStatFlags flags)
3322 {
3323 if(!(flags & (eSF_BasicNonCds | eSF_BasicCds))) {
3324 NCBI_THROW(CException, eUnknown,
3325 "CSplign::s_ComputeStats(): mode not yet supported.");
3326 }
3327
3328 const bool cds_stats ((flags & eSF_BasicCds) && (cds.first + cds.second > 0));
3329
3330 // set individual scores
3331 CRef<CScore_set> ss (new CScore_set);
3332 CScore_set::Tdata & scores (ss->Set());
3333
3334 if(flags & eSF_BasicNonCds) {
3335 CSeq_align::TScore score_vec;
3336 CScoreBuilderBase().AddSplignScores(*sa, score_vec);
3337 scores.assign(score_vec.begin(), score_vec.end());
3338 }
3339
3340 if(cds_stats) {
3341
3342 typedef CSeq_align::TSegs::TSpliced TSpliced;
3343 const TSpliced & spliced (sa->GetSegs().GetSpliced());
3344 if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
3345 NCBI_THROW(CAlgoAlignException, eBadParameter,
3346 "CSplign::s_ComputeStats(): Unsupported product type");
3347 }
3348
3349 const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
3350 if(cds_stats) {
3351 const bool cds_strand (cds.first < cds.second);
3352 if(qstrand ^ cds_strand) {
3353 NCBI_THROW(CAlgoAlignException, eBadParameter,
3354 "CSplign::s_ComputeStats(): Transcript orientation not "
3355 "matching specified CDS orientation.");
3356 }
3357 }
3358
3359 typedef TSpliced::TExons TExons;
3360 const TExons & exons (spliced.GetExons());
3361
3362 size_t matches (0),
3363 aligned_query_bases (0), // matches, mismatches and indels
3364 aln_length_exons (0),
3365 aln_length_gaps (0);
3366
3367 const TSeqPos qlen (spliced.GetProduct_length());
3368 const TSeqPos polya (spliced.CanGetPoly_a()?
3369 spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
3370
3371 typedef CSpliced_exon TExon;
3372 TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
3373 string xcript;
3374 ITERATE(TExons, ii2, exons) {
3375
3376 const TExon & exon (**ii2);
3377 const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
3378 qmax (exon.GetProduct_end().GetNucpos());
3379
3380 const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
3381
3382 if(qgap > 0) {
3383 aln_length_gaps += qgap;
3384 if(cds_stats) xcript.append(qgap, 'X');
3385 }
3386
3387 typedef TExon::TParts TParts;
3388 const TParts & parts (exon.GetParts());
3389 string errmsg;
3390 ITERATE(TParts, ii3, parts) {
3391 const CSpliced_exon_chunk & part (**ii3);
3392 const CSpliced_exon_chunk::E_Choice choice (part.Which());
3393 TSeqPos len (0);
3394 switch(choice) {
3395 case CSpliced_exon_chunk::e_Match:
3396 len = part.GetMatch();
3397 matches += len;
3398 aligned_query_bases += len;
3399 if(cds_stats) xcript.append(len, 'M');
3400 break;
3401 case CSpliced_exon_chunk::e_Mismatch:
3402 len = part.GetMismatch();
3403 aligned_query_bases += len;
3404 if(cds_stats) xcript.append(len, 'R');
3405 break;
3406 case CSpliced_exon_chunk::e_Product_ins:
3407 len = part.GetProduct_ins();
3408 aligned_query_bases += len;
3409 if(cds_stats) xcript.append(len, 'D');
3410 break;
3411 case CSpliced_exon_chunk::e_Genomic_ins:
3412 len = part.GetGenomic_ins();
3413 if(cds_stats) xcript.append(len, 'I');
3414 break;
3415 default:
3416 errmsg = "Unexpected spliced exon chunk part: "
3417 + part.SelectionName(choice);
3418 NCBI_THROW(CAlgoAlignException, eBadParameter, errmsg);
3419 }
3420 aln_length_exons += len;
3421 }
3422
3423 qprev = qstrand? qmax: qmin;
3424 } // TExons
3425
3426 const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
3427 aln_length_gaps += qgap;
3428 if(cds_stats) xcript.append(qgap, 'X');
3429
3430 if(!qstrand && qlen <= 0) {
3431 NCBI_THROW(CAlgoAlignException, eBadParameter,
3432 "CSplign::s_ComputeStats(): Cannot compute "
3433 "inframe stats - transcript length not set.");
3434 }
3435
3436 int qpos (qstrand? -1: int(qlen));
3437 int qinc (qstrand? +1: -1);
3438 int frame (kFrame_not_set);
3439 size_t aln_length_cds (0);
3440 size_t matches_frame[] = {0, 0, 0, 0, 0};
3441 const int cds_start (cds.first), cds_stop (cds.second);
3442 for(string::const_iterator ie (xcript.end()), ii(xcript.begin());
3443 ii != ie && frame != kFrame_end; ++ii)
3444 {
3445
3446 switch(*ii) {
3447
3448 case 'M':
3449 qpos += qinc;
3450 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3451 if(qpos == cds_stop) frame = kFrame_end;
3452 if(frame >= -2) {
3453 ++aln_length_cds;
3454 ++matches_frame[frame + 2];
3455 }
3456 break;
3457
3458 case 'R':
3459 qpos += qinc;
3460 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3461 if(qpos == cds_stop) frame = kFrame_end;
3462 if(frame >= -2) ++aln_length_cds;
3463 break;
3464
3465 case 'D':
3466 qpos += qinc;
3467 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3468 if(qpos == cds_stop) frame = kFrame_end;
3469 if(frame >= -2) {
3470 ++aln_length_cds;
3471 frame = (frame + 1) % 3;
3472 }
3473 break;
3474
3475 case 'I':
3476 if(frame >= -2) {
3477 ++aln_length_cds;
3478 frame = (frame - 1) % 3;
3479 }
3480 break;
3481
3482 case 'X':
3483 qpos += qinc;
3484 if( (qstrand && cds_start <= qpos && qpos < cds_stop) ||
3485 (!qstrand && cds_start >= qpos && qpos > cds_stop) )
3486 {
3487 frame = kFrame_lost;
3488 ++aln_length_cds;
3489 }
3490 break;
3491 }
3492 }
3493
3494 {
3495 CRef<CScore> score_matches_inframe (new CScore());
3496 score_matches_inframe->SetId().SetId(eCS_InframeMatches);
3497 score_matches_inframe->SetValue().SetInt(matches_frame[2]);
3498 scores.push_back(score_matches_inframe);
3499 }
3500
3501 {
3502 CRef<CScore> score_inframe_identity (new CScore());
3503 score_inframe_identity->SetId().SetId(eCS_InframeIdentity);
3504 score_inframe_identity->SetValue().
3505 SetReal(double(matches_frame[2]) / aln_length_cds);
3506 scores.push_back(score_inframe_identity);
3507 }
3508 }
3509
3510
3511 if(embed_scoreset) {
3512 CSeq_align::TScore & sa_score (sa->SetScore());
3513 sa_score.resize(scores.size());
3514 copy(scores.begin(), scores.end(), sa_score.begin());
3515 }
3516
3517 return ss;
3518 }
3519
3520
3521 END_NCBI_SCOPE
3522
3523
3524