1 /* $Id: splign.cpp 592158 2019-08-27 16:34:30Z vakatov $
2 * ===========================================================================
3 *
4 *                            public DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Yuri Kapustin, Boris Kiryutin
27 *
28 * File Description:
29 *   CSplign class implementation
30 *
31 */
32 
33 
34 #include <ncbi_pch.hpp>
35 
36 #include "splign_util.hpp"
37 #include "splign_exon_trim.hpp"
38 #include "messages.hpp"
39 
40 #include <algo/align/util/hit_comparator.hpp>
41 #include <algo/align/util/compartment_finder.hpp>
42 #include <algo/align/nw/nw_band_aligner.hpp>
43 #include <algo/align/nw/nw_spliced_aligner16.hpp>
44 #include <algo/align/nw/nw_formatter.hpp>
45 #include <algo/align/nw/align_exception.hpp>
46 #include <algo/align/splign/splign.hpp>
47 
48 #include <algo/sequence/orf.hpp>
49 
50 #include <objmgr/scope.hpp>
51 #include <objmgr/bioseq_handle.hpp>
52 #include <objmgr/seq_vector.hpp>
53 #include <objmgr/util/seq_loc_util.hpp>
54 
55 #include <objtools/alnmgr/score_builder_base.hpp>
56 
57 #include <objects/seqloc/Seq_interval.hpp>
58 #include <objects/seqalign/Seq_align.hpp>
59 #include <objects/seqalign/Seq_align_set.hpp>
60 #include <objects/seqalign/Spliced_seg.hpp>
61 #include <objects/seqalign/Spliced_exon.hpp>
62 #include <objects/seqalign/Spliced_exon_chunk.hpp>
63 #include <objects/seqalign/Product_pos.hpp>
64 #include <objects/seqalign/Splice_site.hpp>
65 #include <objects/seqalign/Score.hpp>
66 #include <objects/seqalign/Score_set.hpp>
67 #include <objects/general/Object_id.hpp>
68 
69 #include <math.h>
70 #include <algorithm>
71 #include <iostream>
72 
73 BEGIN_NCBI_SCOPE
74 
75 USING_SCOPE(objects);
76 
77 namespace {
78 
79     // define cut-off strategy at the terminii:
80 
81     // (1) - pre-processing
82     // For non-covered ends longer than kNonCoveredEndThreshold use
83     // m_max_genomic_ext. For shorter ends use k * query_len^(1/kPower)
84 
85     const Uint4  kNonCoveredEndThreshold (55);
86     const double kPower (2.5);
87 
88     // (2) - post-processing
89     // term exons shorter than kMinTermExonSize with identity lower than
90     // kMinTermExonIdty will be converted to gaps
91     const size_t kMinTermExonSize (28);
92     const double kMinTermExonIdty (0.9);
93 
94     //if flanking exon is closer than kFlankExonProx to the beggining/(end or polyA) of mRNA, it is NOT treated as adjecent to a gap
95     const int kFlankExonProx (20);
96 
97     //maximim length to cut to splice (exons near gaps)
98     const int kMaxCutToSplice (6);
99 
100     const CSplign::THit::TCoord
101                  kMaxCoord (numeric_limits<CSplign::THit::TCoord>::max());
102 
103 
104     //original Yuri's EST scores
105 
106     const int kEstMatchScore (1000);
107     const int kEstMismatchScore (-1011);
108     const int kEstGapOpeningScore(-1460);
109     const int kEstGapExtensionScore(-464);
110 
111     const int kEstGtAgSpliceScore(-4988);
112     const int kEstGcAgSpliceScore(-5999);
113     const int kEstAtAcSpliceScore(-7010);
114     const int kEstNonConsensusSpliceScore(-13060);
115 }
116 
117 
118 
119 //original Yuri's mRNA scores
120 
s_GetDefaultScoringType(void)121 CSplign::EScoringType CSplign::s_GetDefaultScoringType(void) {
122     return eMrnaScoring;
123 }
124 
s_GetDefaultMatchScore(void)125 int CSplign::s_GetDefaultMatchScore(void) {
126     return 1000;
127 }
128 
s_GetDefaultMismatchScore(void)129 int CSplign::s_GetDefaultMismatchScore(void) {
130     return -1044;
131 }
132 
s_GetDefaultGapOpeningScore(void)133 int CSplign::s_GetDefaultGapOpeningScore(void) {
134     return -3070;
135 }
136 
s_GetDefaultGapExtensionScore(void)137 int CSplign::s_GetDefaultGapExtensionScore(void) {
138     return -173;
139 }
140 
s_GetDefaultGtAgSpliceScore(void)141 int CSplign::s_GetDefaultGtAgSpliceScore(void) {
142     return -4270;
143 }
144 
s_GetDefaultGcAgSpliceScore(void)145 int CSplign::s_GetDefaultGcAgSpliceScore(void) {
146     return -5314;
147 }
148 
s_GetDefaultAtAcSpliceScore(void)149 int CSplign::s_GetDefaultAtAcSpliceScore(void) {
150     return -6358;
151 }
152 
s_GetDefaultNonConsensusSpliceScore(void)153 int CSplign::s_GetDefaultNonConsensusSpliceScore(void) {
154     return -7395;
155 }
156 
CSplign(void)157 CSplign::CSplign(void):
158     m_CanResetHistory (false),
159     //basic scores
160     m_ScoringType(s_GetDefaultScoringType()),
161     m_MatchScore(s_GetDefaultMatchScore()),
162     m_MismatchScore(s_GetDefaultMismatchScore()),
163     m_GapOpeningScore(s_GetDefaultGapOpeningScore()),
164     m_GapExtensionScore(s_GetDefaultGapExtensionScore()),
165     m_GtAgSpliceScore(s_GetDefaultGtAgSpliceScore()),
166     m_GcAgSpliceScore(s_GetDefaultGcAgSpliceScore()),
167     m_AtAcSpliceScore(s_GetDefaultAtAcSpliceScore()),
168     m_NonConsensusSpliceScore(s_GetDefaultNonConsensusSpliceScore()),
169     //end of basic scores
170     m_MinExonIdty(s_GetDefaultMinExonIdty()),
171     m_MinPolyaExtIdty(s_GetDefaultPolyaExtIdty()),
172     m_MinPolyaLen(s_GetDefaultMinPolyaLen()),
173     m_MinHoleLen(s_GetDefaultMinHoleLen()),
174     m_TrimToCodons(s_GetDefaultTrimToCodons()),
175     m_CompartmentPenalty(s_GetDefaultCompartmentPenalty()),
176     m_MinCompartmentIdty(s_GetDefaultMinCompartmentIdty()),
177     m_MinSingletonIdty(s_GetDefaultMinCompartmentIdty()),
178     m_MinSingletonIdtyBps(numeric_limits<size_t>::max()),
179     m_TestType(kTestType_production_default),
180     m_endgaps (true),
181     m_strand (true),
182     m_nopolya (false),
183     m_cds_start (0),
184     m_cds_stop (0),
185     m_max_genomic_ext (s_GetDefaultMaxGenomicExtent()),
186     m_MaxIntron (CCompartmentFinder<THit>::s_GetDefaultMaxIntron()),
187     m_MaxPartExonIdentDrop (s_GetDefaultMaxPartExonIdentDrop()),
188     m_model_id (0),
189     m_MaxCompsPerQuery (0),
190     m_MinPatternHitLength (13)
191 {
192 }
193 
~CSplign()194 CSplign::~CSplign()
195 {
196 }
197 
s_CreateVersion(void)198 static CVersionAPI* s_CreateVersion(void)
199 {
200     return new CVersion(CVersionInfo(2, 1, 0));
201 };
202 
203 
s_GetVersion(void)204 CVersionAPI& CSplign::s_GetVersion(void)
205 {
206     static CSafeStatic<CVersionAPI> s_Version(s_CreateVersion, 0);
207     return s_Version.Get();
208 }
209 
210 
SetAligner(void)211 CRef<CSplign::TAligner>& CSplign::SetAligner( void ) {
212     return m_aligner;
213 }
214 
215 
GetAligner(void) const216 CConstRef<CSplign::TAligner> CSplign::GetAligner( void ) const {
217     return m_aligner;
218 }
219 
SetAlignerScores(void)220 void CSplign::SetAlignerScores(void) {
221     CRef<TAligner>& aligner = SetAligner();
222     aligner->SetWm  (GetMatchScore());
223     aligner->SetWms (GetMismatchScore());
224     aligner->SetWg  (GetGapOpeningScore());
225     aligner->SetWs  (GetGapExtensionScore());
226     aligner->SetScoreMatrix(NULL);
227     aligner->SetWi(0, GetGtAgSpliceScore());
228     aligner->SetWi(1, GetGcAgSpliceScore());
229     aligner->SetWi(2, GetAtAcSpliceScore());
230     aligner->SetWi(3, GetNonConsensusSpliceScore());
231 }
232 
s_CreateDefaultAligner(void)233 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(void) {
234     CRef<CSplicedAligner> aligner(
235         static_cast<CSplicedAligner*>(new CSplicedAligner16));
236     return aligner;
237 }
238 
239 
s_CreateDefaultAligner(bool low_query_quality)240 CRef<CSplicedAligner> CSplign::s_CreateDefaultAligner(bool low_query_quality)
241 {
242     CRef<CSplicedAligner> aligner(s_CreateDefaultAligner());
243 
244     if(low_query_quality) {
245         aligner->SetWm  (kEstMatchScore);
246         aligner->SetWms (kEstMismatchScore);
247         aligner->SetWg  (kEstGapOpeningScore);
248         aligner->SetWs  (kEstGapExtensionScore);
249         aligner->SetScoreMatrix(NULL);
250         aligner->SetWi(0, kEstGtAgSpliceScore);
251         aligner->SetWi(1, kEstGcAgSpliceScore);
252         aligner->SetWi(2, kEstAtAcSpliceScore);
253         aligner->SetWi(3, kEstNonConsensusSpliceScore);
254     }
255     else {
256         aligner->SetWm  (s_GetDefaultMatchScore());
257         aligner->SetWms (s_GetDefaultMismatchScore());
258         aligner->SetWg  (s_GetDefaultGapOpeningScore());
259         aligner->SetWs  (s_GetDefaultGapExtensionScore());
260         aligner->SetScoreMatrix(NULL);
261         aligner->SetWi(0, s_GetDefaultGtAgSpliceScore());
262         aligner->SetWi(1, s_GetDefaultGcAgSpliceScore());
263         aligner->SetWi(2, s_GetDefaultAtAcSpliceScore());
264         aligner->SetWi(3, s_GetDefaultNonConsensusSpliceScore());
265     }
266 
267     return aligner;
268 }
269 
270 //note: SetScoringType call with mRNA or EST type is going to switch basic scores to preset values
271 
SetScoringType(EScoringType type)272 void CSplign::SetScoringType(EScoringType type)
273 {
274     m_ScoringType = type;
275     if(type == eMrnaScoring) {
276         SetMatchScore(s_GetDefaultMatchScore());
277         SetMismatchScore(s_GetDefaultMismatchScore());
278         SetGapOpeningScore(s_GetDefaultGapOpeningScore());
279         SetGapExtensionScore(s_GetDefaultGapExtensionScore());
280         SetGtAgSpliceScore(s_GetDefaultGtAgSpliceScore());
281         SetGcAgSpliceScore(s_GetDefaultGcAgSpliceScore());
282         SetAtAcSpliceScore(s_GetDefaultAtAcSpliceScore());
283         SetNonConsensusSpliceScore(s_GetDefaultNonConsensusSpliceScore());
284     } else if(type == eEstScoring) {
285         SetMatchScore(kEstMatchScore);
286         SetMismatchScore(kEstMismatchScore);
287         SetGapOpeningScore(kEstGapOpeningScore);
288         SetGapExtensionScore(kEstGapExtensionScore);
289         SetGtAgSpliceScore(kEstGtAgSpliceScore);
290         SetGcAgSpliceScore(kEstGcAgSpliceScore);
291         SetAtAcSpliceScore(kEstAtAcSpliceScore);
292         SetNonConsensusSpliceScore(kEstNonConsensusSpliceScore);
293     }
294 }
295 
GetScoringType(void) const296 CSplign::EScoringType CSplign::GetScoringType(void) const {
297     return m_ScoringType;
298 }
299 
SetMatchScore(int score)300 void   CSplign::SetMatchScore(int score) {
301     m_MatchScore = score;
302 }
303 
GetMatchScore(void) const304 int CSplign::GetMatchScore(void) const {
305     return m_MatchScore;
306 }
307 
SetMismatchScore(int score)308 void   CSplign::SetMismatchScore(int score) {
309     m_MismatchScore = score;
310 }
311 
GetMismatchScore(void) const312 int CSplign::GetMismatchScore(void) const {
313     return m_MismatchScore;
314 }
315 
SetGapOpeningScore(int score)316 void   CSplign::SetGapOpeningScore(int score) {
317     m_GapOpeningScore = score;
318 }
319 
GetGapOpeningScore(void) const320 int CSplign::GetGapOpeningScore(void) const {
321     return m_GapOpeningScore;
322 }
323 
SetGapExtensionScore(int score)324 void   CSplign::SetGapExtensionScore(int score) {
325     m_GapExtensionScore = score;
326 }
327 
GetGapExtensionScore(void) const328 int CSplign::GetGapExtensionScore(void) const {
329     return m_GapExtensionScore;
330 }
331 //
SetGtAgSpliceScore(int score)332 void   CSplign::SetGtAgSpliceScore(int score) {
333     m_GtAgSpliceScore = score;
334 }
335 
GetGtAgSpliceScore(void) const336 int CSplign::GetGtAgSpliceScore(void) const {
337     return m_GtAgSpliceScore;
338 }
339 
SetGcAgSpliceScore(int score)340 void   CSplign::SetGcAgSpliceScore(int score) {
341     m_GcAgSpliceScore = score;
342 }
343 
GetGcAgSpliceScore(void) const344 int CSplign::GetGcAgSpliceScore(void) const {
345     return m_GcAgSpliceScore;
346 }
347 
SetAtAcSpliceScore(int score)348 void   CSplign::SetAtAcSpliceScore(int score) {
349     m_AtAcSpliceScore = score;
350 }
351 
GetAtAcSpliceScore(void) const352 int CSplign::GetAtAcSpliceScore(void) const {
353     return m_AtAcSpliceScore;
354 }
355 
SetNonConsensusSpliceScore(int score)356 void   CSplign::SetNonConsensusSpliceScore(int score) {
357     m_NonConsensusSpliceScore = score;
358 }
359 
GetNonConsensusSpliceScore(void) const360 int CSplign::GetNonConsensusSpliceScore(void) const {
361     return m_NonConsensusSpliceScore;
362 }
363 
SetEndGapDetection(bool on)364 void CSplign::SetEndGapDetection( bool on ) {
365     m_endgaps = on;
366 }
367 
GetEndGapDetection(void) const368 bool CSplign::GetEndGapDetection( void ) const {
369     return m_endgaps;
370 }
371 
SetPolyaDetection(bool on)372 void CSplign::SetPolyaDetection( bool on ) {
373     m_nopolya = !on;
374 }
375 
GetPolyaDetection(void) const376 bool CSplign::GetPolyaDetection( void ) const {
377     return !m_nopolya;
378 }
379 
SetStrand(bool strand)380 void CSplign::SetStrand( bool strand ) {
381     m_strand = strand;
382 }
383 
GetStrand(void) const384 bool CSplign::GetStrand( void ) const {
385     return m_strand;
386 }
387 
SetMinExonIdentity(double idty)388 void CSplign::SetMinExonIdentity( double idty )
389 {
390     if(!(0 <= idty && idty <= 1)) {
391         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
392     }
393     else {
394         m_MinExonIdty = idty;
395     }
396 }
397 
SetPolyaExtIdentity(double idty)398 void CSplign::SetPolyaExtIdentity( double idty )
399 {
400     if(!(0 <= idty && idty <= 1)) {
401         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
402     }
403     else {
404         m_MinPolyaExtIdty = idty;
405     }
406 }
407 
SetMinPolyaLen(size_t len)408 void CSplign::SetMinPolyaLen(size_t len) {
409     m_MinPolyaLen = len;
410 }
411 
SetMinHoleLen(size_t len)412 void CSplign::SetMinHoleLen(size_t len) {
413     m_MinHoleLen = len;
414 }
415 
SetTrimToCodons(bool val)416 void CSplign::SetTrimToCodons(bool val) {
417     m_TrimToCodons = val;
418 }
419 
SetMinCompartmentIdentity(double idty)420 void CSplign::SetMinCompartmentIdentity( double idty )
421 {
422     if(!(0 <= idty && idty <= 1)) {
423         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
424     }
425     else {
426         m_MinCompartmentIdty = idty;
427     }
428 }
429 
SetMinSingletonIdentity(double idty)430 void CSplign::SetMinSingletonIdentity(double idty)
431 {
432     if(!(0 <= idty && idty <= 1)) {
433         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_BadIdentityThreshold);
434     }
435     else {
436         m_MinSingletonIdty = idty;
437     }
438 }
439 
SetMinSingletonIdentityBps(size_t idty_bps)440 void CSplign::SetMinSingletonIdentityBps(size_t idty_bps)
441 {
442     m_MinSingletonIdtyBps = idty_bps;
443 }
444 
s_GetDefaultMaxGenomicExtent(void)445 size_t CSplign::s_GetDefaultMaxGenomicExtent(void)
446 {
447     return 35000;
448 }
449 
450 
SetMaxGenomicExtent(size_t mge)451 void CSplign::SetMaxGenomicExtent(size_t mge)
452 {
453     m_max_genomic_ext = mge;
454 }
455 
456 
GetMaxGenomicExtent(void) const457 size_t CSplign::GetMaxGenomicExtent(void) const
458 {
459     return m_max_genomic_ext;
460 }
461 
462 
SetMaxIntron(size_t max_intron)463 void CSplign::SetMaxIntron(size_t max_intron)
464 {
465     m_MaxIntron = max_intron;
466 }
467 
468 
GetMaxIntron(void) const469 size_t CSplign::GetMaxIntron(void) const
470 {
471     return m_MaxIntron;
472 }
473 
474 
GetMinExonIdentity(void) const475 double CSplign::GetMinExonIdentity( void ) const {
476     return m_MinExonIdty;
477 }
478 
s_GetDefaultMinExonIdty(void)479 double CSplign::s_GetDefaultMinExonIdty(void)
480 {
481     return 0.75;
482 }
483 
GetPolyaExtIdentity(void) const484 double CSplign::GetPolyaExtIdentity( void ) const {
485     return m_MinPolyaExtIdty;
486 }
487 
s_GetDefaultPolyaExtIdty(void)488 double CSplign::s_GetDefaultPolyaExtIdty(void)
489 {
490     return 1.;
491 }
492 
GetMinPolyaLen(void) const493 size_t CSplign::GetMinPolyaLen( void ) const {
494     return m_MinPolyaLen;
495 }
496 
s_GetDefaultMinPolyaLen(void)497 size_t CSplign::s_GetDefaultMinPolyaLen(void)
498 {
499     return 1;
500 }
501 
GetMinHoleLen(void) const502 size_t CSplign::GetMinHoleLen( void ) const {
503     return m_MinHoleLen;
504 }
505 
s_GetDefaultMinHoleLen(void)506 size_t CSplign::s_GetDefaultMinHoleLen(void)
507 {
508     return 0;
509 }
510 
GetTrimToCodons(void) const511 bool CSplign::GetTrimToCodons( void ) const {
512     return m_TrimToCodons;
513 }
514 
s_GetDefaultTrimToCodons(void)515 bool CSplign::s_GetDefaultTrimToCodons(void)
516 {
517     return false;
518 }
519 
GetMinCompartmentIdentity(void) const520 double CSplign::GetMinCompartmentIdentity(void) const {
521     return m_MinCompartmentIdty;
522 }
523 
GetMinSingletonIdentity(void) const524 double CSplign::GetMinSingletonIdentity(void) const {
525     return m_MinSingletonIdty;
526 }
527 
GetMinSingletonIdentityBps(void) const528 size_t CSplign::GetMinSingletonIdentityBps(void) const {
529     return m_MinSingletonIdtyBps;
530 }
531 
s_GetDefaultMinCompartmentIdty(void)532 double CSplign::s_GetDefaultMinCompartmentIdty(void)
533 {
534     return 0.70;
535 }
536 
SetMaxPartExonIdentDrop(double ident)537 void CSplign::SetMaxPartExonIdentDrop(double ident) {
538     m_MaxPartExonIdentDrop = ident;
539 }
540 
GetMaxPartExonIdentDrop(void) const541 double CSplign::GetMaxPartExonIdentDrop(void) const {
542     return m_MaxPartExonIdentDrop;
543 }
544 
s_GetDefaultMaxPartExonIdentDrop(void)545 double CSplign::s_GetDefaultMaxPartExonIdentDrop(void)
546 {
547     return 0.25;
548 }
549 
SetTestType(const string & test_type)550 void CSplign::SetTestType(const string& test_type) {
551     m_TestType = test_type;
552 }
553 
GetTestType(void) const554 string CSplign::GetTestType(void) const {
555     return m_TestType;
556 }
557 
SetMaxCompsPerQuery(size_t m)558 void CSplign::SetMaxCompsPerQuery(size_t m) {
559     m_MaxCompsPerQuery = m;
560 }
561 
GetMaxCompsPerQuery(void) const562 size_t CSplign::GetMaxCompsPerQuery(void) const {
563     return m_MaxCompsPerQuery;
564 }
565 
566 
567 
GetScope(void) const568 CRef<objects::CScope> CSplign::GetScope(void) const
569 {
570     return m_Scope;
571 }
572 
573 
SetScope(void)574 CRef<objects::CScope>& CSplign::SetScope(void)
575 {
576     return m_Scope;
577 }
578 
579 
PreserveScope(bool preserve_scope)580 void CSplign::PreserveScope(bool preserve_scope)
581 {
582     m_CanResetHistory = ! preserve_scope;
583 }
584 
585 
SetCompartmentPenalty(double penalty)586 void CSplign::SetCompartmentPenalty(double penalty)
587 {
588     if(penalty < 0 || penalty > 1) {
589         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_QueryCoverageOutOfRange);
590     }
591     m_CompartmentPenalty = penalty;
592 }
593 
s_GetDefaultCompartmentPenalty(void)594 double CSplign::s_GetDefaultCompartmentPenalty(void)
595 {
596     return 0.55;
597 }
598 
GetCompartmentPenalty(void) const599 double CSplign::GetCompartmentPenalty( void ) const
600 {
601     return m_CompartmentPenalty;
602 }
603 
x_IsInGap(THit::TCoord pos)604 bool CSplign::x_IsInGap(THit::TCoord pos)
605 {
606     if( pos+1 == 0 || pos >= m_genomic.size()  ) return true;//outside genome boundaries
607     if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(),  pos, 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) {//gap
608         return true;
609     }
610     return false;
611 }
612 
613 
x_LoadSequence(vector<char> * seq,const CSeq_id & seqid,THit::TCoord start,THit::TCoord finish,bool retain,bool is_genomic,bool genomic_strand)614 void CSplign::x_LoadSequence(vector<char>* seq,
615                              const CSeq_id& seqid,
616                              THit::TCoord start,
617                              THit::TCoord finish,
618                              bool retain, bool is_genomic, bool genomic_strand)
619 {
620 
621     try {
622 
623         if(m_Scope.IsNull()) {
624             NCBI_THROW(CAlgoAlignException, eInternal, "Splign scope not set");
625         }
626 
627         CBioseq_Handle bh (m_Scope->GetBioseqHandle(seqid));
628 
629         if( !is_genomic ) m_mrna_bio_handle = bh;
630 
631         if(retain && m_CanResetHistory) {
632             m_Scope->ResetHistory(); // this does not remove the sequence
633                                      // referenced to by 'bh'
634         }
635 
636         if(bh) {
637 
638             CSeqVector sv (bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
639             const TSeqPos dim (sv.size());
640             if(dim == 0) {
641                 NCBI_THROW(CAlgoAlignException,
642                            eNoSeqData,
643                            string("Sequence is empty: ")
644                            + seqid.AsFastaString());
645             }
646             if(finish >= dim) {
647                 finish = dim - 1;
648             }
649 
650             if(start > finish) {
651                 CNcbiOstrstream ostr;
652                 ostr << "Invalid sequence interval requested for "
653                      << seqid.GetSeqIdString(true) << ":\t"
654                      << start << '\t' << finish;
655                 const string err = CNcbiOstrstreamToString(ostr);
656                 NCBI_THROW(CAlgoAlignException, eNoSeqData, err);
657             }
658 
659             string s;
660             sv.GetSeqData(start, finish + 1, s);
661             if(is_genomic) {//get SeqMap data
662                 ENa_strand strand = eNa_strand_minus;
663                 if(genomic_strand) strand = eNa_strand_plus;
664                 CRef<CSeq_id> tmp_id(new CSeq_id());
665                 tmp_id->Assign(seqid);
666                 CSeq_loc tmp_loc(*tmp_id, start, finish, strand);
667                 m_GenomicSeqMap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
668             }
669             seq->resize(1 + finish - start);
670             copy(s.begin(), s.end(), seq->begin());
671         }
672         else {
673             NCBI_THROW(CAlgoAlignException, eNoSeqData,
674                        string("ID not found: ") + seqid.AsFastaString());
675         }
676 
677         if(!retain && m_CanResetHistory) {
678             m_Scope->RemoveFromHistory(bh);
679         }
680     }
681 
682     catch(CAlgoAlignException& e) {
683         NCBI_RETHROW_SAME(e, "CSplign::x_LoadSequence(): Sequence data problem");
684     }
685 
686     if(!is_genomic) {
687         if(seq == &m_mrna) {
688             m_mrna_polya.clear();
689             m_mrna_polya.resize(seq->size());
690             copy(seq->begin(), seq->end(), m_mrna_polya.begin());
691         }
692         CSeq_id_Handle idh_key = CSeq_id_Handle::GetHandle(seqid);
693         if(m_MaskMap.find(idh_key) != m_MaskMap.end()) {
694             const TSeqRangeColl& mask_ranges = m_MaskMap.find(idh_key)->second;
695             x_MaskSequence(seq, mask_ranges, start, finish);
696         }
697     }
698 }
699 
700 
x_MaskSequence(vector<char> * seq,const TSeqRangeColl & mask_ranges,THit::TCoord start,THit::TCoord finish)701 void  CSplign::x_MaskSequence(vector<char>* seq,
702                           const TSeqRangeColl& mask_ranges,
703                           THit::TCoord start,
704                           THit::TCoord finish)
705 {
706     //cerr<<start<<"\t"<<finish<<endl;
707     TSeqRange loop_range;
708     for(TSeqPos loop = start; loop <= finish; loop++) {
709         loop_range.SetFrom(loop);
710         loop_range.SetLength(1);
711         if(mask_ranges.IntersectingWith(loop_range)) {
712             //cerr<<loop<<"\t"<<loop_range<<endl;
713             (*seq)[loop] = 'N';
714         }
715     }
716     //ITERATE(vector<char>, ci, (*seq)) {
717     //    cerr << *ci;
718     //}
719     //cerr<<endl;
720 }
721 
722 
723 
ClearMem(void)724 void CSplign::ClearMem(void)
725 {
726     m_Scope.Reset(NULL);
727     m_pattern.clear();
728     m_alnmap.clear();
729     m_genomic.clear();
730     m_mrna.clear();
731     m_mrna_polya.clear();
732 }
733 
734 
sx_NewHit(THit::TCoord q0,THit::TCoord q,THit::TCoord s0,THit::TCoord s)735 CSplign::THitRef CSplign::sx_NewHit(THit::TCoord q0, THit::TCoord q,
736                                     THit::TCoord s0, THit::TCoord s)
737 {
738     THitRef hr (new THit);
739     hr->SetQueryStart(q0);
740     hr->SetSubjStart(s0);
741     hr->SetQueryStop(q - 1);
742     hr->SetSubjStop(s - 1);
743     hr->SetLength(q - q0);
744     hr->SetMismatches(0);
745     hr->SetGaps(0);
746     hr->SetEValue(0);
747     hr->SetScore(2*(q - q0));
748     hr->SetIdentity(1);
749     return hr;
750 }
751 
752 
x_SplitQualifyingHits(THitRefs * phitrefs)753 void CSplign::x_SplitQualifyingHits(THitRefs* phitrefs)
754 {
755     const char * Seq1 (&m_mrna.front());
756     const char * Seq2 (&m_genomic.front());
757     THitRefs rv;
758     ITERATE(THitRefs, ii, (*phitrefs)) {
759 
760         const THitRef & h (*ii);
761         const double idty (h->GetIdentity());
762         const bool diag (h->GetGaps() == 0 && h->GetQuerySpan() == h->GetSubjSpan());
763         if(idty == 1 || idty < .95 || h->GetLength() < 100 || !diag) {
764             rv.push_back(h);
765         }
766         else {
767 
768             int q0 (-1), s0 (-1), q1 (h->GetQueryMax());
769             int q (h->GetQueryMin()), s (h->GetSubjMin());
770             size_t new_hits (0);
771             while(q <= q1) {
772                 if(Seq1[q++] != Seq2[s++]) {
773                     if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
774                         THitRef hr (sx_NewHit(q0, q, s0, s));
775                         hr->SetQueryId(h->GetQueryId());
776                         hr->SetSubjId(h->GetSubjId());
777                         rv.push_back(hr);
778                         ++new_hits;
779                     }
780                     q0 = s0 = -1;
781                 }
782                 else {
783                     if(q0 == -1) {
784                         q0 = q;
785                         s0 = s;
786                     }
787                 }
788             }
789 
790             if(q0 != -1 && q >= q0 + int(m_MinPatternHitLength)) {
791                 THitRef hr (sx_NewHit(q0, q, s0, s));
792                 hr->SetQueryId(h->GetQueryId());
793                 hr->SetSubjId(h->GetSubjId());
794                 rv.push_back(hr);
795                 ++new_hits;
796             }
797 
798             if(new_hits == 0) {
799                 rv.push_back(h);
800             }
801         }
802     }
803 
804     *phitrefs = rv;
805 }
806 
807 
x_SetPattern(THitRefs * phitrefs)808 void CSplign::x_SetPattern(THitRefs* phitrefs)
809 {
810     m_alnmap.clear();
811     m_pattern.clear();
812 
813     // sort the input by min query coordinate
814     typedef CHitComparator<THit> THitComparator;
815     THitComparator sorter (THitComparator::eQueryMin);
816     stable_sort(phitrefs->begin(), phitrefs->end(), sorter);
817 
818     // check that no two consecutive hits are farther than the max intron
819     // (extra short hits skipped)
820     // (throw out a hit if it intersects with previous on subject (genome), )
821     size_t prev (0);
822     TSeqPos prevSmin, prevSmax;
823     NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
824 
825         THitRef& h (*ii);
826 
827         if(h->GetQuerySpan() < m_MinPatternHitLength) {
828             h.Reset(0);
829             continue;
830         }
831 
832         if(prev > 0) {
833 
834         const bool non_intersect = ( prevSmax < h->GetSubjMin() ) || ( prevSmin > h->GetSubjMax() );
835         if(!non_intersect) {//throw out intersecting hit
836             h.Reset(0);
837             continue;
838         }
839 
840             const bool consistent (h->GetSubjStrand()?
841                                   (h->GetSubjStart() < prev + m_MaxIntron):
842                                   (h->GetSubjStart() + m_MaxIntron > prev));
843 
844             if(!consistent) {
845                 const string errmsg (g_msg_CompartmentInconsistent
846                                      + string(" (extra long introns)"));
847                 NCBI_THROW(CAlgoAlignException, eIntronTooLong, errmsg);
848             }
849         }
850 
851         prev = h->GetSubjStop();
852         prevSmin = h->GetSubjMin();
853         prevSmax = h->GetSubjMax();
854     }
855 
856     phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
857                               CHitFilter<THit>::s_PNullRef),
858                     phitrefs->end());
859 
860     // save each hit longer than the minimum and test whether the hit is perfect
861     vector<size_t> pattern0;
862     vector<pair<bool,double> > imperfect;
863     double max_idty (0.0);
864     for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
865 
866         const THitRef & h ((*phitrefs)[i]);
867         const bool valid (true);
868         if(valid) {
869 
870             pattern0.push_back(h->GetQueryMin());
871             pattern0.push_back(h->GetQueryMax());
872             pattern0.push_back(h->GetSubjMin());
873             pattern0.push_back(h->GetSubjMax());
874             const double idty (h->GetIdentity());
875             const bool imprf  (idty < 1.00
876                                || h->GetQuerySpan() != h->GetSubjSpan()
877                                || h->GetMismatches() > 0
878                                || h->GetGaps() > 0);
879             imperfect.push_back(pair<bool,double>(imprf, idty));
880             if(idty > max_idty) {
881                 max_idty = idty;
882             }
883         }
884     }
885 
886     if(max_idty < .85 && pattern0.size() >= 4) {
887         m_BoundingRange = pair<size_t, size_t>(pattern0[2], pattern0.back());
888     }
889     else {
890         m_BoundingRange = pair<size_t, size_t>(0, 0);
891     }
892 
893     const size_t dim (pattern0.size());
894 
895     const char* Seq1 (&m_mrna.front());
896     const size_t SeqLen1 (m_polya_start < kMax_UInt? m_polya_start: m_mrna.size());
897     const char* Seq2 (&m_genomic.front());
898     const size_t SeqLen2 (m_genomic.size());
899 
900     // verify conditions on the input hit pattern
901     CNcbiOstrstream ostr_err;
902     bool some_error (false), bad_input (false);
903     if(dim % 4 == 0) {
904 
905         for(size_t i (0); i < dim; i += 4) {
906 
907             if(pattern0[i] > pattern0[i+1] || pattern0[i+2] > pattern0[i+3]) {
908                 ostr_err << "Pattern hits must be specified in plus strand";
909                 some_error = bad_input = true;
910                 break;
911             }
912 
913             if(i > 4) {
914                 if(pattern0[i] <= pattern0[i-3] || pattern0[i+2] <= pattern0[i-1]) {
915                     ostr_err << g_msg_CompartmentInconsistent
916                              << string(" (hits not sorted)");
917                     some_error = true;
918                     break;
919                 }
920             }
921 
922             const bool br1 (pattern0[i+1] >= SeqLen1);
923             const bool br2 (pattern0[i+3] >= SeqLen2);
924             if(br1 || br2) {
925 
926                 ostr_err << "Pattern hits out of range ("
927                          << "query = "
928                          << phitrefs->front()->GetQueryId()->GetSeqIdString(true)
929                          << "subj = "
930                          << phitrefs->front()->GetSubjId()->GetSeqIdString(true)
931                          << "):" << endl;
932 
933                 if(br1) {
934                     ostr_err << "\tquery_pattern_max = " << pattern0[i+1]
935                              << "; query_len = " << SeqLen1 << endl;
936                 }
937 
938                 if(br2) {
939                     ostr_err << "\tsubj_pattern_max = " << pattern0[i+3]
940                              << "; subj_len = " << SeqLen2 << endl;
941                 }
942 
943                 some_error= true;
944                 break;
945             }
946         }
947 
948     }
949     else {
950         ostr_err << "Pattern dimension must be a multiple of four";
951         some_error = bad_input = true;
952     }
953 
954     if(some_error) {
955         ostr_err << " (query = "
956                  << phitrefs->front()->GetQueryId()->AsFastaString()
957                  << " , subj = "
958                  << phitrefs->front()->GetSubjId()->AsFastaString() << ')'
959                  << endl;
960     }
961 
962     const string err = CNcbiOstrstreamToString(ostr_err);
963     if(err.size() > 0) {
964         if(bad_input) {
965             NCBI_THROW(CAlgoAlignException, eBadParameter, err);
966         }
967         else {
968             NCBI_THROW(CAlgoAlignException, ePattern, err);
969         }
970     }
971 
972     SAlnMapElem map_elem;
973     map_elem.m_box[0] = map_elem.m_box[2] = 0;
974     map_elem.m_pattern_start = map_elem.m_pattern_end = -1;
975 
976     // build the alignment map
977     CBandAligner nwa;
978     for(size_t i = 0; i < dim; i += 4) {
979 
980         size_t L1, R1, L2, R2;
981         size_t max_seg_size (0);
982 
983         const bool imprf (imperfect[i/4].first);
984         if(imprf) {
985 
986             // TODO:
987             // a better approach is to find indels and mismatches
988             // and split at the SplitQualifyingHits() stage
989             // to pass here only perfect diags
990             const size_t len1 (pattern0[i+1] - pattern0[i] + 1);
991             const size_t len2 (pattern0[i+3] - pattern0[i+2] + 1);
992             const size_t maxlen (max(len1, len2));
993             const size_t lendif (len1 < len2? len2 - len1: len1 - len2);
994             size_t band (size_t((1 - imperfect[i/4].second) * maxlen) + 2);
995             if(band < lendif) band += lendif;
996             nwa.SetBand(band);
997             nwa.SetSequences(Seq1 + pattern0[i],   len1,
998                              Seq2 + pattern0[i+2], len2,
999                              false);
1000             nwa.Run();
1001             max_seg_size = nwa.GetLongestSeg(&L1, &R1, &L2, &R2);
1002         }
1003         else {
1004 
1005             L1 = 1;
1006             R1 = pattern0[i+1] - pattern0[i] - 1;
1007             L2 = 1;
1008             R2 = pattern0[i+3] - pattern0[i+2] - 1;
1009             max_seg_size = R1 - L1 + 1;
1010         }
1011 
1012         if(max_seg_size) {
1013 
1014             // make the core
1015             {{
1016                 size_t cut ((1 + R1 - L1) / 5);
1017                 if(cut > 20) cut = 20;
1018 
1019                 const size_t l1 (L1 + cut), l2 (L2 + cut);
1020                 const size_t r1 (R1 - cut), r2 (R2 - cut);
1021                 if(l1 < r1 && l2 < r2) {
1022                     L1 = l1; L2 = l2;
1023                     R1 = r1; R2 = r2;
1024                 }
1025             }}
1026 
1027             size_t q0 (pattern0[i] + L1);
1028             size_t s0 (pattern0[i+2] + L2);
1029             size_t q1 (pattern0[i] + R1);
1030             size_t s1 (pattern0[i+2] + R2);
1031 
1032             if(imprf) {
1033 
1034                 const size_t hitlen_q (pattern0[i + 1] - pattern0[i] + 1);
1035                 const size_t sh (size_t(hitlen_q / 4));
1036 
1037                 size_t delta (sh > L1? sh - L1: 0);
1038                 q0 += delta;
1039                 s0 += delta;
1040 
1041                 const size_t h2s_right (hitlen_q - R1 - 1);
1042                 delta = sh > h2s_right? sh - h2s_right: 0;
1043                 q1 -= delta;
1044                 s1 -= delta;
1045 
1046                 if(q0 > q1 || s0 > s1) {
1047 
1048                     // the longest segment too short
1049                     q0 = pattern0[i] + L1;
1050                     s0 = pattern0[i+2] + L2;
1051                     q1 = pattern0[i] + R1;
1052                     s1 = pattern0[i+2] + R2;
1053                 }
1054             }
1055 
1056             m_pattern.push_back(q0); m_pattern.push_back(q1);
1057             m_pattern.push_back(s0); m_pattern.push_back(s1);
1058 
1059             const size_t pattern_dim = m_pattern.size();
1060             if(map_elem.m_pattern_start == -1) {
1061                 map_elem.m_pattern_start = pattern_dim - 4;
1062             }
1063             map_elem.m_pattern_end = pattern_dim - 1;
1064         }
1065 
1066         map_elem.m_box[1] = pattern0[i+1];
1067         map_elem.m_box[3] = pattern0[i+3];
1068     }
1069 
1070     map_elem.m_box[1] = SeqLen1 - 1;
1071     map_elem.m_box[3] = SeqLen2 - 1;
1072     m_alnmap.push_back(map_elem);
1073 }
1074 
1075 
GetCds(const THit::TId & id,const vector<char> * seq_data)1076 CSplign::TOrfPair CSplign::GetCds(const THit::TId& id, const vector<char> * seq_data)
1077 {
1078     TOrfPair rv (TOrf(0, 0), TOrf(0, 0));
1079 
1080     TStrIdToOrfs::const_iterator ie (m_OrfMap.end());
1081     const string strid (id->AsFastaString());
1082     TStrIdToOrfs::const_iterator ii (m_OrfMap.find(strid));
1083 
1084     if(ii != ie) {
1085         rv = ii->second;
1086     }
1087     else {
1088 
1089 
1090         vector<char> seq;
1091         if(seq_data == 0) {
1092             x_LoadSequence(&seq, *id, 0, kMaxCoord, false);
1093             seq_data = & seq;
1094         }
1095 
1096         // Assign CDS to the max ORF longer than 90 bps and starting from ATG
1097         //
1098         vector<CRef<CSeq_loc> > orfs;
1099         vector<string> start_codon;
1100         start_codon.push_back("ATG");
1101 
1102         COrf::FindOrfs(*seq_data, orfs, 90, 1, start_codon);
1103         TSeqPos max_len_plus  (0), max_len_minus  (0);
1104         TSeqPos max_from_plus (0), max_from_minus (0);
1105         TSeqPos max_to_plus   (0), max_to_minus   (0);
1106         ITERATE (vector<CRef<CSeq_loc> >, orf, orfs) {
1107 
1108             const TSeqPos len           (sequence::GetLength(**orf, NULL));
1109             const ENa_strand orf_strand ((*orf)->GetInt().GetStrand());
1110             const bool antisense        (orf_strand == eNa_strand_minus);
1111 
1112             if(antisense) {
1113                 if(len > max_len_minus) {
1114                     max_len_minus    = len;
1115                     max_from_minus   = (*orf)->GetInt().GetTo();
1116                     max_to_minus     = (*orf)->GetInt().GetFrom();
1117                 }
1118             }
1119             else {
1120                 if(len > max_len_plus) {
1121                     max_len_plus    = len;
1122                     max_from_plus   = (*orf)->GetInt().GetFrom();
1123                     max_to_plus     = (*orf)->GetInt().GetTo();
1124                 }
1125             }
1126         }
1127 
1128         if(max_len_plus > 0) {
1129             rv.first = TOrf(max_from_plus, max_to_plus);
1130         }
1131 
1132         if(max_len_minus > 0) {
1133             rv.second = TOrf(max_from_minus, max_to_minus);
1134         }
1135 
1136         m_OrfMap[strid] = rv;
1137     }
1138 
1139     return rv;
1140 }
1141 
1142 
x_FinalizeAlignedCompartment(SAlignedCompartment & ac)1143 void CSplign::x_FinalizeAlignedCompartment(SAlignedCompartment & ac)
1144 {
1145     ac.m_Id         = ++m_model_id;
1146     ac.m_Segments   = m_segments;
1147     ac.m_Status     = SAlignedCompartment::eStatus_Ok;
1148     ac.m_Msg        = "Ok";
1149     ac.m_Cds_start  = m_cds_start;
1150     ac.m_Cds_stop   = m_cds_stop;
1151     ac.m_QueryLen   = m_mrna.size();
1152     ac.m_PolyA      = (m_polya_start < kMax_UInt? m_polya_start : 0);
1153 }
1154 
1155 
1156 // PRE:  Input Blast hits.
1157 // POST: TResults - a vector of aligned compartments.
Run(THitRefs * phitrefs)1158 void CSplign::Run(THitRefs* phitrefs)
1159 {
1160     if(!phitrefs) {
1161         NCBI_THROW(CAlgoAlignException, eInternal, "Unexpected NULL pointers");
1162     }
1163 
1164     THitRefs& hitrefs = *phitrefs;
1165 
1166     // make sure query hit is in plus strand
1167     NON_CONST_ITERATE(THitRefs, ii, hitrefs) {
1168 
1169         THitRef& h = *ii;
1170         if(h.NotNull() && h->GetQueryStrand() == false) {
1171             h->FlipStrands();
1172         }
1173     }
1174 
1175     if(m_aligner.IsNull()) {
1176         NCBI_THROW(CAlgoAlignException, eNotInitialized, g_msg_AlignedNotSpecified);
1177     }
1178 
1179     if(hitrefs.size() == 0) {
1180         NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_EmptyHitVectorPassed);
1181     }
1182 
1183     m_result.clear();
1184 
1185     THit::TId id_query (hitrefs.front()->GetQueryId());
1186 
1187     const THit::TCoord mrna_size (objects::sequence::GetLength(*id_query, m_Scope));
1188     if(mrna_size == kMaxCoord) {
1189         NCBI_THROW(CAlgoAlignException, eNoSeqData,
1190                    string("Sequence not found: ") + id_query->AsFastaString());
1191     }
1192 
1193     // iterate through compartments
1194     const THit::TCoord min_singleton_idty_final (
1195                        min(size_t(m_MinSingletonIdty * mrna_size),
1196                            m_MinSingletonIdtyBps));
1197 
1198     CCompartmentAccessor<THit> comps (THit::TCoord(m_CompartmentPenalty * mrna_size),
1199                                       THit::TCoord(m_MinCompartmentIdty * mrna_size),
1200                                       min_singleton_idty_final,
1201                                       true);
1202     comps.SetMaxIntron(m_MaxIntron);
1203      if( GetTestType() == kTestType_20_28 ||  GetTestType() == kTestType_20_28_plus ) {
1204          comps.Run(hitrefs.begin(), hitrefs.end(), GetScope());
1205      } else {
1206          comps.Run(hitrefs.begin(), hitrefs.end());
1207      }
1208 
1209     pair<size_t,size_t> dim (comps.GetCounts()); // (count_total, count_unmasked)
1210     if(dim.second > 0) {
1211 
1212         // pre-load cDNA
1213         m_mrna.clear();
1214 
1215         x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1216 
1217         const TOrfPair orfs (GetCds(id_query, & m_mrna));
1218         if(m_strand) {
1219             m_cds_start  = orfs.first.first;
1220             m_cds_stop = orfs.first.second;
1221         }
1222         else {
1223             m_cds_start  = orfs.second.first;
1224             m_cds_stop = orfs.second.second;
1225         }
1226 
1227         if(!m_strand) {
1228             // make a reverse complimentary
1229             reverse (m_mrna.begin(), m_mrna.end());
1230             transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1231 
1232             reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1233             transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1234         }
1235 
1236         // compartments share the space between them
1237         size_t smin (0), smax (kMax_UInt);
1238         bool same_strand (false);
1239 
1240         const THit::TCoord* box (comps.GetBox(0));
1241         if(m_MaxCompsPerQuery > 0 && dim.second > m_MaxCompsPerQuery) {
1242             dim.second = m_MaxCompsPerQuery;
1243         }
1244 
1245         for(size_t i (0); i < dim.first; ++i, box += 4) {
1246 
1247             if(i + 1 == dim.first) {
1248                 smax = kMax_UInt;
1249                 same_strand = false;
1250             }
1251             else {
1252                 bool strand_this (comps.GetStrand(i));
1253                 bool strand_next (comps.GetStrand(i+1));
1254                 same_strand = strand_this == strand_next;
1255                 smax = same_strand? (box + 4)[2]: kMax_UInt;
1256             }
1257 
1258             try {
1259 
1260                 if(smax < box[3]) {
1261                     // alert if not ordered by lower subject coordinate
1262                     NCBI_THROW(CAlgoAlignException, eInternal,
1263                                "Unexpected order of compartments");
1264                 }
1265 
1266                 if(comps.GetStatus(i)) {
1267                     THitRefs comp_hits;
1268                     comps.Get(i, comp_hits);
1269 
1270                     if(smax < box[3]) smax = box[3];
1271                     if(smin > box[2]) smin = box[2];
1272 
1273                     SAlignedCompartment ac (x_RunOnCompartment(&comp_hits, smin,smax));
1274                     x_FinalizeAlignedCompartment(ac);
1275                     m_result.push_back(ac);
1276                 }
1277             }
1278 
1279             catch(CAlgoAlignException& e) {
1280 
1281                 if(e.GetSeverity() == eDiag_Fatal) {
1282                     throw;
1283                 }
1284 
1285                 m_result.push_back(SAlignedCompartment(0, e.GetMsg().c_str()));
1286 
1287                 const CException::TErrCode errcode (e.GetErrCode());
1288                 if(errcode != CAlgoAlignException::eNoAlignment) {
1289                     m_result.back().m_Status = SAlignedCompartment::eStatus_Error;
1290                 }
1291 
1292                 ++m_model_id;
1293             }
1294 
1295             smin = same_strand? box[3]: 0;
1296         }
1297     }
1298 }
1299 
1300 
AlignSingleCompartment(CRef<objects::CSeq_align> compartment,SAlignedCompartment * result)1301 bool CSplign::AlignSingleCompartment(CRef<objects::CSeq_align> compartment,
1302                                      SAlignedCompartment* result)
1303 {
1304     const CRef<CSeq_loc> seqloc (compartment->GetBounds().front());
1305     const size_t subj_min (seqloc->GetStart(eExtreme_Positional));
1306     const size_t subj_max (seqloc->GetStop(eExtreme_Positional));
1307 
1308     THitRefs hitrefs;
1309     ITERATE(CSeq_align_set::Tdata, ii, compartment->GetSegs().GetDisc().Get()) {
1310     }
1311 
1312     return AlignSingleCompartment(&hitrefs, subj_min, subj_max, result);
1313 }
1314 
1315 
AlignSingleCompartment(THitRefs * phitrefs,size_t subj_min,size_t subj_max,SAlignedCompartment * result)1316 bool CSplign::AlignSingleCompartment(THitRefs* phitrefs,
1317                                      size_t subj_min, size_t subj_max,
1318                                      SAlignedCompartment* result)
1319 {
1320     m_mrna.resize(0);
1321 
1322     THit::TId id_query (phitrefs->front()->GetQueryId());
1323 
1324     x_LoadSequence(&m_mrna, *id_query, 0, kMaxCoord, false);
1325 
1326     const TOrfPair orfs (GetCds(id_query, & m_mrna));
1327     if(m_strand) {
1328         m_cds_start  = orfs.first.first;
1329         m_cds_stop   = orfs.first.second;
1330     }
1331     else {
1332         m_cds_start  = orfs.second.first;
1333         m_cds_stop   = orfs.second.second;
1334     }
1335 
1336     if(!m_strand) {
1337         reverse (m_mrna.begin(), m_mrna.end());
1338         transform(m_mrna.begin(), m_mrna.end(), m_mrna.begin(), SCompliment());
1339 
1340         reverse (m_mrna_polya.begin(), m_mrna_polya.end());
1341         transform(m_mrna_polya.begin(), m_mrna_polya.end(), m_mrna_polya.begin(), SCompliment());
1342     }
1343 
1344     bool rv (true);
1345     try {
1346 
1347         SAlignedCompartment ac (x_RunOnCompartment(phitrefs, subj_min, subj_max));
1348         x_FinalizeAlignedCompartment(ac);
1349         *result = ac;
1350         m_mrna.resize(0);
1351     }
1352 
1353     catch(CAlgoAlignException& e) {
1354 
1355         m_mrna.resize(0);
1356 
1357         if(e.GetSeverity() == eDiag_Fatal) {
1358             throw;
1359         }
1360 
1361         *result = SAlignedCompartment(0, e.GetMsg().c_str());
1362 
1363         const CException::TErrCode errcode (e.GetErrCode());
1364         if(errcode != CAlgoAlignException::eNoAlignment) {
1365             result->m_Status = SAlignedCompartment::eStatus_Error;
1366         }
1367 
1368         ++m_model_id;
1369         rv = false;
1370     }
1371 
1372     return rv;
1373 }
1374 
IsPolyA(const char * seq,size_t polya_start,size_t dim)1375 bool CSplign::IsPolyA(const char * seq, size_t polya_start, size_t dim) {
1376     const double kMinPercAInPolya (0.80);
1377     if( polya_start + GetMinPolyaLen() > dim ) return false;
1378     size_t cnt = 0;
1379     for(size_t i = polya_start; i<dim; ++i) {
1380         if(seq[i] == 'A') ++cnt;
1381     }
1382     if(cnt >= (dim - polya_start)*kMinPercAInPolya) return true;
1383     return false;
1384 }
1385 
1386 // naive polya detection; sense direction assumed
s_TestPolyA(const char * seq,size_t dim,size_t cds_stop)1387 size_t CSplign::s_TestPolyA(const char * seq, size_t dim, size_t cds_stop)
1388 {
1389     const size_t kMaxNonA (3), kMinAstreak (5);
1390     int i (dim - 1), i0 (dim);
1391     for(size_t count_non_a (0), astreak (0); i >= 0 && count_non_a < kMaxNonA; --i) {
1392 
1393         if(seq[i] != 'A') {
1394             ++count_non_a;
1395             astreak = 0;
1396         }
1397         else {
1398             if(++astreak >= kMinAstreak) {
1399                 i0 = i;
1400             }
1401         }
1402     }
1403 
1404     const size_t len (dim - i0);
1405     size_t rv;
1406     if(len >= kMinAstreak) {
1407         rv = i0;
1408         if(0 < cds_stop && cds_stop < dim && rv <= cds_stop) {
1409             rv = cds_stop + 1;
1410         }
1411     }
1412     else {
1413         rv = kMax_UInt;
1414     }
1415 
1416     return rv;
1417 }
1418 
1419 
1420 // PRE:  Hits (initial, not transformed) representing the compartment;
1421 //       maximum genomic sequence span;
1422 //       pre-loaded and appropriately transformed query sequence.
1423 // POST: A set of segments packed into the aligned compartment.
1424 
x_RunOnCompartment(THitRefs * phitrefs,size_t range_left,size_t range_right)1425 CSplign::SAlignedCompartment CSplign::x_RunOnCompartment(THitRefs* phitrefs,
1426                                                          size_t range_left,
1427                                                          size_t range_right)
1428 {
1429     SAlignedCompartment rv;
1430 
1431     try {
1432         m_segments.clear();
1433 
1434         if(range_left > range_right) {
1435             NCBI_THROW(CAlgoAlignException, eInternal, g_msg_InvalidRange);
1436         }
1437 
1438         if(phitrefs->size() == 0) {
1439             NCBI_THROW(CAlgoAlignException, eNoHits, g_msg_NoHitsAfterFiltering);
1440         }
1441 
1442         const size_t mrna_size (m_mrna.size());
1443 
1444         if(m_strand == false) {
1445 
1446             // adjust the hits
1447             for(size_t i (0), n (phitrefs->size()); i < n; ++i) {
1448 
1449                 THitRef& h ((*phitrefs)[i]);
1450                 THit::TCoord a0 (mrna_size - h->GetQueryMin() - 1);
1451                 THit::TCoord a1 (mrna_size - h->GetQueryMax() - 1);
1452                 const bool new_strand (!(h->GetSubjStrand()));
1453                 h->SetQueryStart(a1);
1454                 h->SetQueryStop(a0);
1455                 h->SetSubjStrand(new_strand);
1456             }
1457         }
1458 
1459         m_polya_start = m_nopolya?
1460             kMax_UInt:
1461             s_TestPolyA(&m_mrna_polya.front(), m_mrna_polya.size(), m_cds_stop);
1462 
1463         // cleave off hits beyond polya
1464         if(m_polya_start < kMax_UInt) {
1465             CleaveOffByTail(phitrefs, m_polya_start);
1466         }
1467 
1468         // keep short terminal hits out of the pattern
1469         THitRefs::iterator ii (phitrefs->begin()), jj (phitrefs->end() - 1);
1470         const size_t min_termhitlen1 (m_MinPatternHitLength);
1471         const size_t min_termhitlen2 (2*m_MinPatternHitLength);
1472         bool b0 (true), b1 (true);
1473         while(b0 && b1 && ii < jj) {
1474 
1475             while(ii->IsNull() && ii < jj) ++ii;
1476             while(jj->IsNull() && ii < jj) --jj;
1477 
1478             if(ii < jj) {
1479 
1480                 const double hit_idty ((*ii)->GetIdentity());
1481                 const size_t min_termhitlen (
1482                    hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1483 
1484                 if((*ii)->GetQuerySpan() < min_termhitlen) {
1485                     ii++ -> Reset(0);
1486                 }
1487                 else {
1488                     b0 = false;
1489                 }
1490             }
1491 
1492             if(ii < jj) {
1493 
1494                 const double hit_idty ((*jj)->GetIdentity());
1495                 const size_t min_termhitlen (
1496                    hit_idty < .9999? min_termhitlen2: min_termhitlen1);
1497 
1498                 if((*jj)->GetQuerySpan() < min_termhitlen) {
1499                     jj-- -> Reset(0);
1500                 }
1501                 else {
1502                     b1 = false;
1503                 }
1504             }
1505         }
1506 
1507         phitrefs->erase(remove_if(phitrefs->begin(), phitrefs->end(),
1508                                   CHitFilter<THit>::s_PNullRef),
1509                         phitrefs->end());
1510 
1511         if(phitrefs->size() == 0) {
1512             NCBI_THROW(CAlgoAlignException, eNoHits,
1513                        g_msg_NoHitsAfterFiltering);
1514         }
1515 
1516 
1517         // find regions of interest on mRna (query) and contig (subj)
1518         THit::TCoord span [4];
1519         CHitFilter<THit>::s_GetSpan(*phitrefs, span);
1520         THit::TCoord qmin (span[0]), qmax (span[1]), smin (span[2]), smax (span[3]);
1521 
1522         const bool ctg_strand (phitrefs->front()->GetSubjStrand());
1523 
1524         // m1: estimate terminal genomic extents based on uncovered end sizes
1525         const THit::TCoord extent_left_m1 (x_GetGenomicExtent(qmin));
1526         const THit::TCoord rspace   ((m_polya_start < kMax_UInt?
1527                                       m_polya_start: mrna_size) - qmax - 1 );
1528         const THit::TCoord extent_right_m1 (x_GetGenomicExtent(rspace));
1529 
1530         // m2: estimate genomic extents using compartment hits
1531         THit::TCoord fixed_left (kMaxCoord / 2), fixed_right(fixed_left);
1532 
1533         const size_t kTermLenCutOff_m2 (10);
1534         const bool fix_left  (qmin   <= kTermLenCutOff_m2);
1535         const bool fix_right (rspace <= kTermLenCutOff_m2);
1536         if(fix_left || fix_right) {
1537 
1538             if(phitrefs->size() > 1) {
1539 
1540                 // select based on the max intron length
1541                 THit::TCoord max_intron (0);
1542                 THit::TCoord prev_start (phitrefs->front()->GetSubjStart());
1543 
1544                 ITERATE(THitRefs, ii, (*phitrefs)) {
1545 
1546                     const THit::TCoord cur_start ((*ii)->GetSubjStart());
1547                     const THit::TCoord intron    (cur_start >= prev_start?
1548                                                   cur_start - prev_start:
1549                                                   prev_start - cur_start);
1550                     if(intron > max_intron) {
1551                         max_intron = intron;
1552                     }
1553                     prev_start = cur_start;
1554                 }
1555 
1556                 const double factor (2.5);
1557                 if(fix_left)  { fixed_left  = THit::TCoord(max_intron * factor); }
1558                 if(fix_right) { fixed_right = THit::TCoord(max_intron * factor); }
1559             }
1560             else {
1561                 // stay conservative for single-hit compartments
1562                 const THit::TCoord single_hit_extent (300);
1563                 if(fix_left)  { fixed_left  = single_hit_extent; }
1564                 if(fix_right) { fixed_right = single_hit_extent; }
1565             }
1566         }
1567 
1568         const THit::TCoord extent_left_m2  (100 + max(fixed_left,  qmin));
1569         const THit::TCoord extent_right_m2 (100 + max(fixed_right, rspace));
1570 
1571         const THit::TCoord extent_left (min(extent_left_m1, extent_left_m2));
1572         THit::TCoord extent_right (min(extent_right_m1, extent_right_m2));
1573 
1574 		//add polya length to extent
1575 		THit::TCoord poly_length = m_polya_start < kMax_UInt ? mrna_size - m_polya_start : 0;
1576 		if(extent_right < poly_length) extent_right = poly_length;
1577 
1578         if(ctg_strand) {
1579             smin = max(0, int(smin - extent_left));
1580             smax += extent_right;
1581         }
1582         else {
1583             smin = max(0, int(smin - extent_right));
1584             smax += extent_left;
1585         }
1586 
1587         // regardless of hits, entire cDNA is aligned (without the tail, if any)
1588         qmin = 0;
1589         qmax = m_polya_start < kMax_UInt? m_polya_start - 1: mrna_size - 1;
1590 
1591         // make sure to obey the genomic range specified
1592         if(smin < range_left) {
1593             smin = range_left;
1594         }
1595         if(smax > range_right) {
1596             smax = range_right;
1597         }
1598 
1599         //prohibit extension to go over over non-bridgeable gaps
1600         if(phitrefs->size() > 1) {
1601             THit::TId id_query (phitrefs->front()->GetSubjId());
1602             CRef<CSeq_id> tmp_id(new CSeq_id());
1603             tmp_id->Assign(*id_query);
1604             TSeqPos hitmin(span[2]);
1605             TSeqPos hitmax(span[3]);
1606 
1607             //left
1608             if(hitmin > smin) {
1609                 CSeq_loc tmp_loc(*tmp_id, smin, hitmin - 1, eNa_strand_plus);
1610                 CConstRef<CSeqMap> smap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
1611                 if(smap) {
1612                     TSeqPos tmplen = hitmin - smin;
1613                     CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(),   0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1614                     for(;smit; ++smit) {
1615                         if(smit.GetType() == CSeqMap::eSeqGap) {
1616                             CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
1617                             if(slit && slit->IsBridgeable() == CSeq_literal::e_NotBridgeable) {
1618                                 TSeqPos pos = smit.GetEndPosition();//exclusive
1619                                 _ASSERT( smin + pos <= hitmin );
1620                                 smin += pos;
1621                             }
1622                         }
1623                     }
1624                 }
1625             }
1626             //right
1627             if(smax > hitmax) {
1628                 CSeq_loc tmp_loc(*tmp_id, hitmax + 1, smax, eNa_strand_plus);
1629                 CConstRef<CSeqMap> smap = CSeqMap::GetSeqMapForSeq_loc(tmp_loc, GetScope());
1630                 if(smap) {
1631                     TSeqPos tmplen = smax - hitmax;
1632                     CSeqMap_CI smit = smap->ResolvedRangeIterator(GetScope(),   0, tmplen, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
1633                     for(;smit; ++smit) {
1634                         if(smit.GetType() == CSeqMap::eSeqGap) {
1635                             CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
1636                             if(slit && slit->IsBridgeable() == CSeq_literal::e_NotBridgeable) {
1637                                 TSeqPos pos = smit.GetPosition();
1638                                 _ASSERT( hitmax + pos < smax );
1639                                 smax = hitmax + pos;
1640                             }
1641                         }
1642                     }
1643                 }
1644             }
1645 
1646         }
1647 
1648 
1649         m_genomic.clear();
1650         x_LoadSequence(&m_genomic, *(phitrefs->front()->GetSubjId()),
1651                        smin, smax, true, true, ctg_strand);
1652 
1653         // adjust smax if beyond the end
1654         const THit::TCoord ctg_end (smin + m_genomic.size());
1655         if(smax >= ctg_end) {
1656             smax = ctg_end > 0? ctg_end - 1: 0;
1657         }
1658 
1659         if(ctg_strand == false) {
1660 
1661             // make reverse complementary
1662             // for the contig's area of interest
1663             reverse (m_genomic.begin(), m_genomic.end());
1664             transform(m_genomic.begin(), m_genomic.end(), m_genomic.begin(),
1665                       SCompliment());
1666         }
1667 
1668         NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1669 
1670             THitRef& h (*ii);
1671 
1672             const THit::TCoord hsmin (h->GetSubjMin());
1673             const THit::TCoord hsmax (h->GetSubjMax());
1674             if(!(smin <= hsmin && hsmax <= smax)) {
1675                 CNcbiOstrstream ostr;
1676                 ostr << "\nOne of compartment hits:\n" << *h
1677                      << "\n goes outside the genome range = (" << smin+1 << ", " << smax+1 << ')'
1678                      <<"\n allowed for the compartment";
1679                 const string errmsg = CNcbiOstrstreamToString(ostr);
1680                 NCBI_THROW(CAlgoAlignException, ePattern, errmsg);
1681             }
1682 
1683             if(ctg_strand == false) {
1684 
1685                 THit::TCoord a2 (smax - (hsmax - smin));
1686                 THit::TCoord a3 (smax - (hsmin - smin));
1687                 h->SetSubjStart(a2);
1688                 h->SetSubjStop(a3);
1689             }
1690         }
1691 
1692         rv.m_QueryStrand = m_strand;
1693         rv.m_SubjStrand  = ctg_strand;
1694 
1695         // shift hits so that they originate from zero
1696         NON_CONST_ITERATE(THitRefs, ii, *phitrefs) {
1697             (*ii)->Shift(-(Int4)qmin, -(Int4)smin);
1698         }
1699 
1700         x_SplitQualifyingHits(phitrefs);
1701         x_SetPattern(phitrefs);
1702         rv.m_Score = x_Run(&m_mrna.front(), &m_genomic.front());
1703 
1704         const size_t seg_dim (m_segments.size());
1705         if(seg_dim == 0) {
1706             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
1707         }
1708 
1709         //trim holes to codons
1710         if( GetTrimToCodons() ) {
1711             CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
1712             trim.TrimHolesToCodons(m_segments, m_mrna_bio_handle, m_strand, m_mrna.size());
1713         }
1714 
1715 		//look for the last exon
1716         int last_exon = -1;
1717         for(int i = m_segments.size(); i > 0;  ) {
1718 			--i;
1719             if(m_segments[i].m_exon) {
1720                 last_exon = i;
1721 				break;
1722             }
1723         }
1724 
1725         if(last_exon == -1) {//no exons found
1726             NCBI_THROW(CAlgoAlignException, eNoAlignment,g_msg_NoExonsAboveIdtyLimit);
1727         }
1728 
1729 
1730 		// try to extend the last exon as long as it's a good match (min exon identity at the end required)
1731 		TSegment& s (const_cast<TSegment&>(m_segments[last_exon]));
1732 
1733         const char* p0 = &m_mrna.front() + s.m_box[1] + 1;
1734         const char* q0 = &m_genomic.front() + s.m_box[3] + 1;
1735         const char* p = p0;
1736 		const char* q = q0;
1737         const char* pe = &m_mrna.front() + mrna_size;
1738         const char* qe = &m_genomic.front() + m_genomic.size();
1739 
1740         int match_num = 0;
1741 		size_t sh = 0, ct =0;
1742 		for(; p < pe && q < qe; ++p, ++q, ++ct) {
1743             if(toupper(*p) != 'N' && *p == *q) {
1744         		++match_num;
1745 		    	if( match_num >= (ct+1)*GetMinExonIdentity() ) { // % ident
1746 	    	    	sh = ct+1;
1747     			}
1748             }
1749         }
1750 
1751         // cut low identity flank region in extention
1752         const double kMinExonFlankIdty (GetPolyaExtIdentity());
1753         if(sh) {
1754             p = p0+(sh-1);
1755             q = q0+(sh-1);
1756             ct = 1;
1757             match_num = 0;
1758             for(;p>=p0;--p,--q,++ct) {
1759                 if(toupper(*p) != 'N' && *p == *q) {
1760             		++match_num;
1761                 } else {
1762                     if( match_num < ct*kMinExonFlankIdty) {//cut flank
1763                         sh = p - p0;
1764                         ct = 1;
1765                         match_num = 0;
1766                     }
1767                 }
1768             }
1769         }
1770 
1771         if(sh) {
1772             // resize
1773             s.m_box[1] += sh;
1774             s.m_box[3] += sh;
1775     		for(ct = 0,p = p0, q = q0; ct < sh; ++p, ++q, ++ct) {
1776     		if(toupper(*p) != 'N' && *p == *q) {
1777     			s.m_details.append(1, 'M');
1778 				} else {
1779 					s.m_details.append(1, 'R');
1780 				}
1781 			}
1782             s.Update(m_aligner);
1783 
1784             // fix annotation
1785             const size_t ann_dim = s.m_annot.size();
1786             if(ann_dim > 2 && s.m_annot[ann_dim - 3] == '>') {
1787                 s.m_annot[ann_dim - 2] = q < qe? *q: ' ';
1788                 s.m_annot[ann_dim - 1] = q < (qe-1)? *(q+1): ' ';
1789             }
1790         }
1791 
1792         m_segments.resize(last_exon + 1);
1793 
1794         //check if the rest is polya or a gap
1795         THit::TCoord coord = s.m_box[1] + 1;
1796         m_polya_start = kMax_UInt;
1797         if(coord < mrna_size ) {//there is unaligned flanking part of mRNA
1798             if(!m_nopolya && IsPolyA(&m_mrna_polya.front(), coord, m_mrna_polya.size())) {//polya
1799                 m_polya_start = coord;
1800             } else {//gap
1801                 if( GetTestType() == kTestType_20_28 || GetTestType() == kTestType_20_28_plus ) {
1802                     if(  ( (int)mrna_size - (int)s.m_box[1] - 1 ) >= kFlankExonProx &&
1803                          ! x_IsInGap( s.m_box[3] + 1) ) {//a sequence gap, but not a genomic gap, cut to splice
1804                         int seq1_pos = (int)s.m_box[1];
1805                         int seq2_pos = (int)s.m_box[3];
1806                         int det_pos = s.m_details.size() - 1;
1807                         int min_det_pos = det_pos - kMaxCutToSplice;
1808                         int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
1809                         while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
1810                             if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
1811                                    toupper(m_genomic[seq2_pos+1]) == 'G' &&  toupper(m_genomic[seq2_pos+2]) == 'T'  ) {//GT point
1812                                 if( (size_t)det_pos + 1 < s.m_details.size() ) {//resize
1813                                     s.m_box[1] = seq1_pos;
1814                                     s.m_box[3] = seq2_pos;
1815                                     s.m_details.resize(det_pos + 1);
1816                                     s.Update(m_aligner);
1817                                     // update the last two annotation symbols
1818                                     size_t adim = s.m_annot.size();
1819                                     if(adim > 0 && s.m_annot[adim-1] == '>') {
1820                                         s.m_annot += "GT";
1821                                     } else if(adim > 2 && s.m_annot[adim-3] == '>') {
1822                                         s.m_annot[adim-2] = 'G';
1823                                         s.m_annot[adim-1] = 'T';
1824                                     }
1825                                     coord = seq1_pos+1;
1826                                 }
1827                                 break;
1828                             }
1829                             switch(s.m_details[det_pos]) {
1830                             case 'M' :
1831                                 --seq1_pos;
1832                                 --seq2_pos;
1833                                 break;
1834                             case 'R' :
1835                                 --seq1_pos;
1836                                 --seq2_pos;
1837                                 break;
1838                             case 'I' :
1839                                 --seq2_pos;
1840                                 break;
1841                             case 'D' :
1842                                 --seq1_pos;
1843                                 break;
1844                             }
1845                             --det_pos;
1846                         }
1847                     }
1848                 }
1849 
1850                 TSegment ss;
1851                 ss.m_box[0] = coord;
1852                 ss.m_box[1] = mrna_size - 1;
1853                 ss.SetToGap();
1854                 m_segments.push_back(ss);
1855             }
1856         }
1857 
1858 
1859 
1860         // scratch it if the total coverage is too low
1861         double mcount (0);
1862         ITERATE(TSegments, jj, m_segments) {
1863             if(jj->m_exon) {
1864                 mcount += jj->m_idty * jj->m_len;
1865             }
1866         }
1867 
1868         const THit::TCoord min_singleton_idty_final (
1869                  min(size_t(m_MinSingletonIdty * qmax), m_MinSingletonIdtyBps));
1870 
1871         if(mcount < min_singleton_idty_final) {
1872             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
1873         }
1874 
1875         // convert coordinates back to original
1876         NON_CONST_ITERATE(TSegments, jj, m_segments) {
1877 
1878             if(rv.m_QueryStrand) {
1879                 jj->m_box[0] += qmin;
1880                 jj->m_box[1] += qmin;
1881             }
1882             else {
1883                 jj->m_box[0] = mrna_size - jj->m_box[0] - 1;
1884                 jj->m_box[1] = mrna_size - jj->m_box[1] - 1;
1885             }
1886 
1887             if(jj->m_exon) {
1888                 if(rv.m_SubjStrand) {
1889                     jj->m_box[2] += smin;
1890                     jj->m_box[3] += smin;
1891                 }
1892                 else {
1893                     jj->m_box[2] = smax - jj->m_box[2];
1894                     jj->m_box[3] = smax - jj->m_box[3];
1895                 }
1896             }
1897         }
1898 
1899         if(!rv.m_QueryStrand && m_polya_start > 0 && m_polya_start < mrna_size) {
1900             m_polya_start = mrna_size - m_polya_start - 1;
1901         }
1902     } // try
1903 
1904     catch(CAlgoAlignException& e) {
1905 
1906         const CException::TErrCode errcode (e.GetErrCode());
1907         bool severe (true);
1908         switch(errcode) {
1909         case CAlgoAlignException::eNoAlignment:
1910         case CAlgoAlignException::eMemoryLimit:
1911         case CAlgoAlignException::eNoHits:
1912         case CAlgoAlignException::eIntronTooLong:
1913         // case CAlgoAlignException::ePattern:
1914             severe = false;
1915             break;
1916         }
1917 
1918         if(severe) {
1919             e.SetSeverity(eDiag_Fatal);
1920         }
1921         throw;
1922     }
1923 
1924     return rv;
1925 }
1926 
1927 // at this level and below, plus strand is assumed for both sequences
x_Run(const char * Seq1,const char * Seq2)1928 float CSplign::x_Run(const char* Seq1, const char* Seq2)
1929 {
1930     typedef TSegments TSegmentVector;
1931     TSegmentVector segments;
1932 
1933 //#define DBG_DUMP_PATTERN
1934 #ifdef  DBG_DUMP_PATTERN
1935     cerr << "Pattern:" << endl;
1936 #endif
1937 
1938     const size_t map_dim (m_alnmap.size());
1939     if(map_dim != 1) {
1940         NCBI_THROW(CAlgoAlignException, eInternal, "Multiple maps not supported");
1941     }
1942 
1943     float rv (0);
1944     size_t cds_start (0), cds_stop (0);
1945     for(size_t i (0); i < map_dim; ++i) {
1946 
1947         const SAlnMapElem& zone (m_alnmap[i]);
1948 
1949         // setup sequences
1950         const size_t len1 (zone.m_box[1] - zone.m_box[0] + 1);
1951         const size_t len2 (zone.m_box[3] - zone.m_box[2] + 1);
1952 
1953         // remap cds if antisense
1954         if(m_strand) {
1955             cds_start = m_cds_start;
1956             cds_stop  = m_cds_stop;
1957         }
1958         else {
1959             cds_start = len1 - m_cds_start - 1;
1960             cds_stop  = len1 - m_cds_stop - 1;
1961         }
1962 
1963         m_aligner->SetSequences(Seq1 + zone.m_box[0], len1,
1964                                 Seq2 + zone.m_box[2], len2,
1965                                 false);
1966 
1967         // prepare the pattern
1968         vector<size_t> pattern;
1969         if(m_pattern.size() > 0) {
1970 
1971             if(zone.m_pattern_start < 0) {
1972                 NCBI_THROW(CAlgoAlignException, eInternal,
1973                            "CSplign::x_Run(): Invalid alignment pattern");
1974             }
1975 
1976             copy(m_pattern.begin() + zone.m_pattern_start,
1977                  m_pattern.begin() + zone.m_pattern_end + 1,
1978                  back_inserter(pattern));
1979         }
1980 
1981         for(size_t j (0), pt_dim (pattern.size()); j < pt_dim; j += 4) {
1982 
1983 #ifdef  DBG_DUMP_PATTERN
1984             cerr << (1 + pattern[j]) << '\t' << (1 + pattern[j+1]) << '\t'
1985                  << "(len = " << (pattern[j+1] - pattern[j] + 1) << ")\t"
1986                  << (1 + pattern[j+2]) << '\t' << (1 + pattern[j+3])
1987                  << "(len = " << (pattern[j+3] - pattern[j+2] + 1) << ")\t"
1988                  << endl;
1989 #undef DBG_DUMP_PATTERN
1990 #endif
1991 
1992             pattern[j]   -= zone.m_box[0];
1993             pattern[j+1] -= zone.m_box[0];
1994             pattern[j+2] -= zone.m_box[2];
1995             pattern[j+3] -= zone.m_box[2];
1996         }
1997 
1998         // setup the aligner
1999         m_aligner->SetPattern(pattern);
2000         m_aligner->SetEndSpaceFree(true, true, true, true);
2001         m_aligner->SetCDS(cds_start, cds_stop);
2002 
2003         rv += m_aligner->Run();
2004         m_aligner->CheckPreferences();
2005 
2006 // #define DBG_DUMP_TYPE2
2007 #ifdef  DBG_DUMP_TYPE2
2008         {{
2009             CNWFormatter fmt (*m_aligner);
2010             string txt;
2011             fmt.AsText(&txt, CNWFormatter::eFormatType2);
2012             cerr << txt;
2013         }}
2014 #undef DBG_DUMP_TYPE2
2015 #endif
2016 
2017         CNWFormatter formatter (*m_aligner);
2018         formatter.MakeSegments(&segments);
2019 
2020         // append a gap
2021         if(i + 1 < map_dim) {
2022             segments.push_back(TSegment());
2023             TSegment& g (segments.back());
2024             g.m_box[0] = zone.m_box[1] + 1;
2025             g.m_box[1] = m_alnmap[i+1].m_box[0] - 1;
2026             g.m_box[2] = zone.m_box[3] + 1;
2027             g.m_box[3] = m_alnmap[i+1].m_box[2] - 1;
2028             g.SetToGap();
2029         }
2030     } // zone iterations end
2031 
2032 
2033 //#define DUMP_ORIG_SEGS
2034 #ifdef DUMP_ORIG_SEGS
2035     cerr << "Orig segments:" << endl;
2036     ITERATE(TSegmentVector, ii, segments) {
2037         cerr << ii->m_exon << '\t' << ii->m_idty << '\t' << ii->m_len << '\t'
2038              << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2039              << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2040              << ii->m_annot  << '\t' << ii->m_score  << endl;
2041     }
2042 #endif
2043 
2044     if(segments.size() == 0) {
2045         NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);
2046     }
2047 
2048     // segment-level postprocessing
2049 
2050     const size_t SeqLen2 (m_genomic.size());
2051     const size_t SeqLen1 (m_polya_start == kMax_UInt?
2052                           m_mrna.size():
2053                           m_polya_start);
2054 
2055     // if the limiting range is set, clear all segments beyond the range
2056     if(m_BoundingRange.second > 0) {
2057 
2058         NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2059             if(ii->m_exon  &&
2060                (ii->m_box[3] < m_BoundingRange.first
2061                 || ii->m_box[2] > m_BoundingRange.second))
2062             {
2063                 ii->SetToGap();
2064             }
2065         }
2066     }
2067 
2068     m_segments.resize(0);
2069 
2070     /// postprocessing starts here
2071 
2072     bool is_test = false;
2073     bool is_test_plus = false;
2074     if( GetTestType() == kTestType_20_28 ) {
2075         is_test = true;
2076     } else if( GetTestType() == kTestType_20_28_plus ) {
2077         is_test = true;
2078         is_test_plus = true;
2079     }
2080 
2081     CSplignTrim trim(&m_genomic.front(), (int)m_genomic.size(), m_aligner, m_MaxPartExonIdentDrop);
2082 
2083     //partial trimming near sequence gaps
2084     {{
2085         if (is_test)  { // test mode
2086             bool first = true;
2087             TSegmentVector::iterator prev;
2088             NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2089                 if(ii->m_exon == false) continue;
2090                 if(first) {
2091                     first = false;
2092                 } else {
2093                     if(prev->m_exon) {//if not exon, it will be trimmed later anyway
2094                         /*  old logic
2095                         TSeqPos from =  prev->m_box[3];
2096                         TSeqPos length = ii->m_box[2] - prev->m_box[3] + 1;
2097                         if(m_GenomicSeqMap && m_GenomicSeqMap->ResolvedRangeIterator(GetScope(),  from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap)) { //gap, trim.
2098 
2099                             // TEST OUTPUT
2100                                CSeqMap_CI smit = m_GenomicSeqMap->ResolvedRangeIterator(GetScope(),   from, length, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
2101                                CConstRef<CSeq_literal> slit = smit.GetRefGapLiteral();
2102                                string type = "not_gap";
2103                                if(smit.GetType() == CSeqMap::eSeqGap) type = "gap";
2104                                cout<<"Type: "<<type;
2105                                if(slit) {
2106                                cout<<" Position: "<<smit.GetPosition()+1;
2107                                cout<<" Length: "<<smit.GetLength();
2108                                cout<<" End Position: "<<smit.GetEndPosition()+1;
2109                                }
2110                                cout<<endl;
2111 
2112                             if(is_test_plus) {
2113                                 trim.ImproveFromRight(*prev);
2114                                 trim.ImproveFromLeft(*ii);
2115                             } else {
2116                                 prev->ImproveFromRight1(Seq1, Seq2, m_aligner);
2117                                 ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2118                             }
2119                             */
2120 
2121                        //trim only if one exon abuts a gap, and the other one not
2122                        bool prev_abuts_gap = x_IsInGap(prev->m_box[3] + 1);
2123                        bool abuts_gap = x_IsInGap( ii->m_box[2] - 1 );
2124                        if( abuts_gap && !prev_abuts_gap ) trim.ImproveFromRight(*prev);
2125                        if( !abuts_gap && prev_abuts_gap ) trim.ImproveFromLeft(*ii);
2126 
2127                             //add an alignment gap if needed
2128                             if( ii->m_box[0] > prev->m_box[1] + 1) {
2129                                 TSegment sgap;
2130                                 sgap.m_box[0] = prev->m_box[1] + 1;
2131                                 sgap.m_box[2] = prev->m_box[3] + 1;
2132                                 sgap.m_box[1] = ii->m_box[0] - 1;
2133                                 sgap.m_box[3] = ii->m_box[2] - 1;
2134                                 sgap.SetToGap();
2135                                 ii = segments.insert(ii, sgap);
2136                                 ++ii;
2137                             }
2138                        //} end of old logic
2139                     }
2140                 }
2141                 prev = ii;
2142             }
2143             if(first) NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons found
2144         }
2145     }}
2146 
2147     /// trimming iterations
2148     while(true) {
2149 
2150         bool continue_iterations = false;
2151 
2152         if(m_segments.size() > 0) {
2153             segments.resize(m_segments.size());
2154             copy(m_segments.begin(), m_segments.end(), segments.begin());
2155             m_segments.resize(0);
2156         }
2157 
2158         if(segments.size() == 0) {
2159             return 0;
2160         }
2161 
2162         size_t exon_count0 (0);
2163         ITERATE(TSegmentVector, ii, segments) {
2164             if(ii->m_exon) ++exon_count0;
2165         }
2166 
2167         //extend 100% near gaps and flanks
2168         if ( is_test )  { // test mode
2169             bool first_exon = true;
2170             TSegment *last_exon = NULL;
2171             for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2172                 TSegment& s = segments[k0];
2173                 int ext_len = 0;
2174                 int ext_max = 0;
2175                 if(s.m_exon) {
2176                     if(first_exon) {
2177                         first_exon = false;
2178                         ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2179                         s.ExtendLeft(m_mrna, m_genomic, ext_len, m_aligner);
2180                     } else if( !segments[k0-1].m_exon ) {//extend near gap
2181                       //extend previous exon to right
2182                         ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2183                         //exons should not intersect
2184                         ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2185                         last_exon->ExtendRight(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2186                       //extend current exon to left
2187                         ext_len = s.CanExtendLeft(m_mrna, m_genomic);
2188                         ext_max = min(s.m_box[0] - last_exon->m_box[1], s.m_box[2] - last_exon->m_box[3]) - 1;
2189                         s.ExtendLeft(m_mrna, m_genomic, min(ext_len, ext_max), m_aligner);
2190                     }
2191                     last_exon = &s;
2192                 }
2193             }
2194             if(last_exon == NULL) {
2195                 NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2196             } else {
2197                 int ext_len = last_exon->CanExtendRight(m_mrna, m_genomic);
2198                 last_exon->ExtendRight(m_mrna, m_genomic, ext_len, m_aligner);
2199             }
2200 
2201             //extension sometimes leads to gap dissapearance
2202             //remove/correct gaps after extension of exons
2203             {{
2204                 TSegmentVector tmp_segments;
2205                 TSegment g;
2206                 g.SetToGap();
2207                 int prev_exon_index = -1;
2208                 for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2209                     if(segments[k0].m_exon) {
2210                         if(prev_exon_index == -1) {//first exon
2211                             if(segments[k0].m_box[0] > 0) {//gap at the beginning
2212                                 g.m_box[0] = 0;
2213                                 g.m_box[1] = segments[k0].m_box[0] - 1;
2214                                 g.m_box[2] = 0;
2215                                 g.m_box[3] = segments[k0].m_box[2] - 1;
2216                                 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2217                                 tmp_segments.push_back(g);
2218                             }
2219 
2220                         } else {
2221                             if( segments[prev_exon_index].m_box[1] + 1 < segments[k0].m_box[0] ) {// is there a gap?
2222                                 g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2223                                 g.m_box[1] = segments[k0].m_box[0] - 1;
2224                                 g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2225                                 g.m_box[3] = segments[k0].m_box[2] - 1;
2226                                 g.m_len = g.m_box[1] - g.m_box[0] + 1;
2227                                 tmp_segments.push_back(g);
2228                             }
2229                         }
2230                         prev_exon_index = (int)k0;
2231                         tmp_segments.push_back(segments[k0]);
2232                     }
2233                 }
2234 
2235                 //check right end
2236                 if(prev_exon_index >= 0) {
2237                     if(segments[prev_exon_index].m_box[1] + 1 < SeqLen1) {
2238                         g.m_box[0] = segments[prev_exon_index].m_box[1] + 1;
2239                         g.m_box[1] = SeqLen1 - 1;
2240                         g.m_box[2] = segments[prev_exon_index].m_box[3] + 1;
2241                         g.m_box[3] = SeqLen2 - 1;
2242                         g.m_len = g.m_box[1] - g.m_box[0] + 1;
2243                         tmp_segments.push_back(g);
2244                     }
2245                 } else {
2246                     NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2247                 }
2248                 segments.swap(tmp_segments);
2249             }}
2250         }
2251 
2252 
2253 
2254         //PARTIAL TRIMMING OF TERMINAL EXONS
2255         if( !is_test ) {//default production
2256             // Go from the ends and see if we can improve term exons
2257             //note that it continue trimming of exons until s.m_idty is high
2258             size_t k0 (0);
2259             while(k0 < segments.size()) {
2260 
2261                 TSegment& s = segments[k0];
2262                 if(s.m_exon) {
2263 
2264                     const size_t len (1 + s.m_box[1] - s.m_box[0]);
2265                     const double min_idty (len >= kMinTermExonSize?
2266                                            m_MinExonIdty:
2267                                            max(m_MinExonIdty, kMinTermExonIdty));
2268                     s.ImproveFromLeft(Seq1, Seq2, m_aligner);
2269                     if(s.m_idty >= min_idty) {
2270                         break;
2271                     }
2272                 }
2273                 ++k0;
2274             }
2275 
2276             int k1 (segments.size() - 1);
2277             while(k1 >= int(k0)) {
2278 
2279                 TSegment& s (segments[k1]);
2280                 if(s.m_exon) {
2281 
2282                     const size_t len (1 + s.m_box[1] - s.m_box[0]);
2283                     const double min_idty (len >= kMinTermExonSize?
2284                                            m_MinExonIdty:
2285                                            max(m_MinExonIdty, kMinTermExonIdty));
2286                     s.ImproveFromRight(Seq1, Seq2, m_aligner);
2287                      if(s.m_idty >= min_idty) {
2288                         break;
2289                     }
2290                 }
2291                 --k1;
2292             }
2293         } else { // test mode
2294             //trim terminal exons only
2295             //first exon
2296             NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2297                 if(ii->m_exon) {
2298                     if(is_test_plus) {
2299                         trim.Cut50FromLeft(*ii);
2300                     } else {
2301                         ii->ImproveFromLeft1(Seq1, Seq2, m_aligner);
2302                     }
2303                     break;
2304                 }
2305             }
2306             //last exon
2307             NON_CONST_REVERSE_ITERATE(TSegmentVector, ii, segments) {
2308                 if(ii->m_exon) {
2309                     if(is_test_plus) {
2310                         trim.Cut50FromRight(*ii);
2311                     } else {
2312                         ii->ImproveFromRight1(Seq1, Seq2, m_aligner);
2313                     }
2314                     break;
2315                 }
2316             }
2317         }
2318 
2319         //partial trimming, exons near <GAP>s, terminal exons are trimmed already
2320         for(size_t k0 = 0; k0 < segments.size(); ++k0) {
2321             if(!segments[k0].m_exon) {
2322                 if( k0 > 0 && segments[k0-1].m_exon) {
2323                     if(is_test) {
2324                         if(is_test_plus) {
2325                             if( x_IsInGap(segments[k0-1].m_box[3] + 1) ||
2326                                 CSplignTrim::HasAbuttingExonOnRight(segments, k0-1) ) {
2327                                 //abuting an exon or a sequence gap on the genome, do not trim
2328                             } else {
2329                                 if( ( (int)SeqLen1 - (int)segments[k0-1].m_box[1] - 1 ) >= kFlankExonProx ) {
2330                                     trim.ImproveFromRight(segments[k0-1]);
2331                                 } else {
2332                                     trim.Cut50FromRight(segments[k0-1]);
2333                                 }
2334                             }
2335                         } else {
2336                             segments[k0-1].ImproveFromRight1(Seq1, Seq2, m_aligner);
2337                         }
2338                     } else {
2339                             segments[k0-1].ImproveFromRight(Seq1, Seq2, m_aligner);
2340                     }
2341                 }
2342                 if( k0 + 1 < segments.size() && segments[k0+1].m_exon) {
2343                     if(is_test) {
2344                             if(is_test_plus) {
2345                                 if( x_IsInGap(segments[k0+1].m_box[2] - 1) ||
2346                                     CSplignTrim::HasAbuttingExonOnLeft(segments, k0+1) ) {
2347                                     //abuting an exon or a sequence gap on the genome, do not trim
2348                                 } else {
2349                                     if( (int)segments[k0+1].m_box[0] >= kFlankExonProx ) {
2350                                         trim.ImproveFromLeft(segments[k0+1]);
2351                                     } else {
2352                                         trim.Cut50FromLeft(segments[k0+1]);
2353                                     }
2354                                 }
2355                             } else {
2356                                 segments[k0+1].ImproveFromLeft1(Seq1, Seq2, m_aligner);
2357                             }
2358                     } else {
2359                             segments[k0+1].ImproveFromLeft(Seq1, Seq2, m_aligner);
2360                     }
2361                 }
2362             }
2363         }
2364 
2365 
2366         //CORRECTIONS AFTER PARTIAL TRIMMING
2367 
2368         if( segments.size() == 0 ) {
2369             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2370         }
2371 
2372         // indicate any slack space on the left
2373         if(segments[0].m_box[0] > 0) {
2374 
2375             TSegment g;
2376             g.m_box[0] = 0;
2377             g.m_box[1] = segments[0].m_box[0] - 1;
2378             g.m_box[2] = 0;
2379             g.m_box[3] = segments[0].m_box[2] - 1;
2380             g.SetToGap();
2381             segments.insert(segments.begin(), g);
2382         }
2383 
2384         // same on the right
2385         TSegment& seg_last (segments.back());
2386         if(seg_last.m_box[1] + 1 < SeqLen1) {
2387 
2388             TSegment g;
2389             g.m_box[0] = seg_last.m_box[1] + 1;
2390             g.m_box[1] = SeqLen1 - 1;
2391             g.m_box[2] = seg_last.m_box[3] + 1;
2392             g.m_box[3] = SeqLen2 - 1;
2393             g.SetToGap();
2394             segments.push_back(g);
2395         }
2396 
2397         //WHOLE EXON TRIMMING
2398 
2399         //drop low complexity terminal exons
2400         {{
2401             bool first_exon = true;
2402             TSegment *last_exon = NULL;
2403             NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2404                 if(ii->m_exon) {
2405                     //first exon
2406                     if(first_exon) {
2407                         if(ii->IsLowComplexityExon(Seq1) ) {
2408                             ii->SetToGap();
2409                         }
2410                         first_exon = false;
2411                     } else {//make sure that if exon is single, it is checked once
2412                         last_exon = &*ii;
2413                     }
2414                 }
2415             }
2416             //last exon
2417             if( last_exon != 0 ) {
2418                 if(last_exon->IsLowComplexityExon(Seq1) ) {
2419                     last_exon->SetToGap();
2420                 }
2421             }
2422         }}
2423 
2424 
2425         //throw away low identity exons
2426         NON_CONST_ITERATE(TSegmentVector, ii, segments) {
2427             if(ii->m_exon == false) continue;
2428             if(ii->m_idty < m_MinExonIdty) {
2429                 if( is_test ) {//try to trim it first
2430                     TSegment sl(*ii), sr(*ii);
2431                     if(is_test_plus) {
2432                         trim.ImproveFromLeft(sl);
2433                         trim.ImproveFromRight(sr);
2434                     } else {
2435                         sl.ImproveFromLeft1(Seq1, Seq2, m_aligner);
2436                         sr.ImproveFromRight1(Seq1, Seq2, m_aligner);
2437                     }
2438                     if( sl.m_details == ii->m_details && sr.m_details == ii->m_details ) {//did not help
2439                         ii->SetToGap();
2440                     } else {
2441                         //pick better one
2442                         if(is_test_plus) {
2443                             //there is a splice on left, but no splice on right
2444                             if( sr.m_details != ii->m_details && ii != segments.begin() && (ii-1)->m_exon && ( (ii+1) == segments.end() || !(ii+1)->m_exon ) ) {
2445                                 *ii = sr;
2446                                 //splice on right, no splice on left
2447                             } else if( sl.m_details != ii->m_details && (ii+1) != segments.end() && (ii+1)->m_exon && ( ii == segments.begin() || !(ii-1)->m_exon) ) {
2448                                 *ii = sl;
2449                             } else if(sl.m_details == ii->m_details ||
2450                                       (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2451                                 *ii = sr;
2452                             } else {
2453                                 *ii = sl;
2454                             }
2455                         } else {
2456                             if(sl.m_details != ii->m_details || sr.m_details != ii->m_details){
2457                                 if(sl.m_details == ii->m_details ||
2458                                    (sr.m_details != ii->m_details && sr.m_score > sl.m_score ) ) {
2459                                     *ii = sr;
2460                                 } else {
2461                                     *ii = sl;
2462                                 }
2463                             }
2464                         }
2465                         //add gaps if needed
2466                         if(ii != segments.begin() && (ii)->m_box[0] > (ii - 1)->m_box[1] + 1) {
2467                             TSegment sgap;
2468                             sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2469                             sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2470                             sgap.m_box[1] = ii->m_box[0] - 1;
2471                             sgap.m_box[3] = ii->m_box[2] - 1;
2472                             sgap.SetToGap();
2473                             ii = segments.insert(ii, sgap);
2474                             continue_iterations = true;
2475                             ++ii;
2476                         }
2477                         if( (ii+1) != segments.end() && (ii+1)->m_box[0] > ii->m_box[1] + 1 ) {
2478                             TSegment sgap;
2479                             ++ii;
2480                             sgap.m_box[0] = (ii - 1)->m_box[1] + 1;
2481                             sgap.m_box[2] = (ii - 1)->m_box[3] + 1;
2482                             sgap.m_box[1] = ii->m_box[0] - 1;
2483                             sgap.m_box[3] = ii->m_box[2] - 1;
2484                             sgap.SetToGap();
2485                             ii = segments.insert(ii, sgap);
2486                             continue_iterations = true;
2487                         }
2488                     }
2489                 } else {// end of test mode
2490                     //old style, just throw away
2491                     ii->SetToGap();
2492                 }
2493             } else if(ii->m_idty < .9 && ii->m_len < 20) {
2494                 // 20_90 rule for short exons preceded/followed by non-consensus splices
2495                 bool nc_prev (false), nc_next (false);
2496                 if(ii != segments.begin() && (ii - 1)->m_exon) {
2497                     nc_prev = ! TSegment::s_IsConsensusSplice(
2498                                     (ii - 1)->GetDonor(),
2499                                      ii->GetAcceptor());
2500                 }
2501                 if( (ii+1) != segments.end() && (ii + 1)->m_exon) {
2502                     nc_next = ! TSegment::s_IsConsensusSplice(
2503                                      ii->GetDonor(),
2504                                     (ii + 1)->GetAcceptor());
2505                 }
2506                 if( nc_prev || nc_next ) {
2507                      ii->SetToGap();
2508                 }
2509             }
2510 
2511         }
2512 
2513 
2514         //GP-12069
2515         //apply 40/85 rule for all 'stand alone' exons
2516         if( is_test ) {//test mode
2517             for(size_t k (0); k < segments.size(); ++k) {
2518                 TSegment& s (segments[k]);
2519                 if(s.m_exon == false) continue;
2520                 //the first exon or gap on left
2521                 if( ( k == 0 ) || ( ! segments[k-1].m_exon ) ) {
2522                     //the last exon or gap on right
2523                     if( ( k + 1 == segments.size() ) ||  ( ! segments[k+1].m_exon ) ) {
2524                         //stand alone
2525                         if (s.m_idty < .85 && s.m_len < 40) {
2526                             s.SetToGap();
2527                         }
2528                     }
2529                 }
2530             }
2531         }
2532 
2533 
2534         // turn to gaps exons with low identity
2535         //20_28_90 TEST MODE
2536         // turn to gaps exons with combination of shortness and low identity
2537                 // deal with exons adjacent to gaps
2538         if ( is_test )  { // test mode
2539             for(size_t k (0); k < segments.size(); ++k) {
2540                 TSegment& s (segments[k]);
2541                 if(s.m_exon == false) continue;
2542 
2543                 bool drop (false);
2544 
2545                 enum EAdjustExon {
2546                     eNo,
2547                     eSoft, //for exons close to mRNA edge
2548                     eHard
2549                 } adj;
2550                 adj = eNo;
2551 
2552                 if(k == 0) {//the first segment is an exon
2553                     if( (int)s.m_box[0] >= kFlankExonProx ) {
2554                         adj = eHard;
2555                     } else {
2556                         if(adj == eNo) adj = eSoft;
2557                     }
2558                 }
2559 
2560                 if( k + 1 == segments.size() ) {//the last segment is an exon
2561                     if( ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2562                         adj = eHard;
2563                     } else {
2564                         if(adj == eNo) adj = eSoft;//prevent switch from Hard to Soft
2565                     }
2566                 }
2567 
2568                 if(k > 0 && ( ! segments[k-1].m_exon ) ) {//gap on left
2569                     if( (int)s.m_box[0] >= kFlankExonProx ) {
2570                         adj = eHard;
2571                     } else {
2572                         if(adj == eNo) adj = eSoft;
2573                     }
2574                 }
2575 
2576                 if(k + 1 < segments.size() && (! segments[k+1].m_exon ) ) {//gap on right
2577                     if(  ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {
2578                         adj = eHard;
2579                     } else {
2580                         if(adj == eNo) adj = eSoft;
2581                     }
2582                 }
2583 
2584                 if(adj == eSoft) {//20_90 rule
2585                     if (s.m_idty < .9 && s.m_len < 20) {
2586                         drop = true;
2587                     }
2588                 } else if(adj == eHard) {
2589                     if( s.m_len < 20 ) {// 20 rule
2590                         drop = true;
2591                     }
2592                     if ( s.m_idty < kMinTermExonIdty && s.m_len < kMinTermExonSize  ) {// 28_90 rule
2593                         drop = true;
2594                     }
2595                 }
2596                 if(drop) {
2597                     s.SetToGap();
2598                 }
2599             }
2600         }
2601 
2602 
2603         // turn to gaps short weak terminal exons
2604         {{
2605             // find the two leftmost exons
2606             size_t exon_count (0);
2607             TSegment* term_segs[] = {0, 0};
2608             for(size_t i = 0; i < segments.size(); ++i) {
2609                 TSegment& s = segments[i];
2610                 if(s.m_exon) {
2611                     term_segs[exon_count] = &s;
2612                     if(++exon_count == 2) {
2613                         break;
2614                     }
2615                 }
2616             }
2617 
2618             if(exon_count == 2) {
2619                 x_ProcessTermSegm(term_segs, 0);
2620             }
2621         }}
2622 
2623         {{
2624             // find the two rightmost exons
2625             size_t exon_count (0);
2626             TSegment* term_segs[] = {0, 0};
2627             for(int i = segments.size() - 1; i >= 0; --i) {
2628                 TSegment& s = segments[i];
2629                 if(s.m_exon) {
2630                     term_segs[exon_count] = &s;
2631                     if(++exon_count == 2) {
2632                         break;
2633                     }
2634                 }
2635             }
2636 
2637             if(exon_count == 2) {
2638                 x_ProcessTermSegm(term_segs, 1);
2639             }
2640         }}
2641 
2642         // turn to gaps extra-short exons preceded/followed by gaps
2643           bool gap_prev (false);
2644           for(size_t k (0); k < segments.size(); ++k) {
2645 
2646               TSegment& s (segments[k]);
2647               if(s.m_exon == false) {
2648                   gap_prev = true;
2649               }
2650               else {
2651                   size_t length (s.m_box[1] - s.m_box[0] + 1);
2652                   bool gap_next (false);
2653                   if(k + 1 < segments.size()) {
2654                       gap_next = !segments[k+1].m_exon;
2655                   }
2656                   if(length <= 10 && (gap_prev || gap_next)) {
2657                       s.SetToGap();
2658                   }
2659                   gap_prev = false;
2660               }
2661           }
2662 
2663 
2664         // merge all adjacent gaps
2665         int gap_start_idx (-1);
2666         if(segments.size() && segments[0].m_exon == false) {
2667             gap_start_idx = 0;
2668         }
2669 
2670         for(size_t k (0); k < segments.size(); ++k) {
2671             TSegment& s (segments[k]);
2672             if(!s.m_exon) {
2673                 if(gap_start_idx == -1) {
2674                     gap_start_idx = int(k);
2675                     if(k > 0) {
2676                         s.m_box[0] = segments[k-1].m_box[1] + 1;
2677                         s.m_box[2] = segments[k-1].m_box[3] + 1;
2678                     }
2679                 }
2680             }
2681             else {
2682                 if(gap_start_idx >= 0) {
2683                     TSegment& g = segments[gap_start_idx];
2684                     g.m_box[1] = s.m_box[0] - 1;
2685                     g.m_box[3] = s.m_box[2] - 1;
2686                     g.m_len = g.m_box[1] - g.m_box[0] + 1;
2687                     g.m_details.resize(0);
2688                     m_segments.push_back(g);
2689                     gap_start_idx = -1;
2690                 }
2691                 m_segments.push_back(s);
2692             }
2693         }
2694 
2695         if(gap_start_idx >= 0) {
2696             TSegment& g (segments[gap_start_idx]);
2697             g.m_box[1] = segments[segments.size()-1].m_box[1];
2698             g.m_box[3] = segments[segments.size()-1].m_box[3];
2699             g.m_len = g.m_box[1] - g.m_box[0] + 1;
2700             g.m_details.resize(0);
2701             m_segments.push_back(g);
2702         }
2703 
2704         size_t exon_count1 (0);
2705         ITERATE(TSegments, ii, m_segments) {
2706             if(ii->m_exon) ++exon_count1;
2707         }
2708 
2709         if(exon_count1 == 0 ) {
2710             NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2711         }
2712 
2713         if(exon_count0 == exon_count1 && continue_iterations == false) break;
2714 
2715     } // end of trimming iterations
2716 
2717     //cut to AG/GT
2718     {{
2719 
2720 
2721         if ( is_test )  { // test mode
2722             bool first_exon = true;
2723             size_t sdim = m_segments.size();
2724             int last_exon_index = -1;
2725             for(size_t k0 = 0; k0 < sdim; ++k0) {
2726                 if(m_segments[k0].m_exon) {
2727                     last_exon_index = (int)k0;
2728                 }
2729             }
2730             for(size_t k0 = 0; k0 < sdim; ++k0) {
2731                 TSegment& s = m_segments[k0];
2732                 bool cut_from_left = false;
2733                 bool cut_from_right = false;
2734                 if(s.m_exon) {
2735                     //check left
2736                     if( s.m_box[2] == 0 || x_IsInGap(s.m_box[2] - 1) ||
2737                         CSplignTrim::HasAbuttingExonOnLeft(m_segments, k0) ) {
2738                         //abuting an exon or a contig border or a sequence gap on the genome, do not cut
2739                         first_exon = false;
2740                     } else {
2741                         if(first_exon) {
2742                             if( (int)s.m_box[0] >= kFlankExonProx ) {//gap on left
2743                                 cut_from_left = true;
2744                             }
2745                             first_exon = false;
2746                         } else if( ! m_segments[k0-1].m_exon ) {//gap on left
2747                             cut_from_left = true;
2748                         }
2749                     }
2750                     //check right
2751                     if( x_IsInGap(s.m_box[3] + 1) ||
2752                         CSplignTrim::HasAbuttingExonOnRight(m_segments, k0) ) {
2753                         //abuting an exon or a sequence gap on the genome, do not cut
2754                     } else {
2755                         if( last_exon_index == (int)k0 ) {
2756                             if(  ( (int)SeqLen1 - (int)s.m_box[1] - 1 ) >= kFlankExonProx ) {//gap on right
2757                                 cut_from_right = true;
2758                             }
2759                         } else if(k0 + 1 < sdim && (! m_segments[k0+1].m_exon ) ) {//gap on right
2760                             cut_from_right = true;
2761                         }
2762                     }
2763                     //try to cut from left
2764                     if(cut_from_left) {
2765                         int seq1_pos = (int)s.m_box[0];
2766                         int seq2_pos = (int)s.m_box[2];
2767                         int det_pos = 0;
2768                         int max_pos = (int)s.m_box[1] - 8;//exon should not be too short
2769                         while(seq1_pos <= max_pos && det_pos <= kMaxCutToSplice) {
2770                             if( seq2_pos > 1 && s.m_details[det_pos] == 'M' &&
2771                                    toupper(Seq2[seq2_pos-2]) == 'A' &&  toupper(Seq2[seq2_pos-1]) == 'G'  ) {//AG point
2772                                 if(det_pos > 0) {//resize
2773                                     s.m_box[0] = seq1_pos;
2774                                     s.m_box[2] = seq2_pos;
2775                                     s.m_details.erase(0, det_pos);
2776                                     s.Update(m_aligner);
2777                                     // update the first two annotation symbols
2778                                     if(s.m_annot.size() > 0 && s.m_annot[0] == '<') {
2779                                         s.m_annot = "AG" + s.m_annot;
2780                                     } else if(s.m_annot.size() > 2 && s.m_annot[2] == '<') {
2781                                         s.m_annot[0] = 'A';
2782                                         s.m_annot[1] = 'G';
2783                                     }
2784                                     if( k0>0 && ( !m_segments[k0-1].m_exon ) ) {//adjust previos gap
2785                                         TSegment& g = m_segments[k0-1];
2786                                         g.m_box[1] = s.m_box[0] - 1;
2787                                         g.m_box[3] = s.m_box[2] - 1;
2788                                         g.m_len = g.m_box[1] - g.m_box[0] + 1;
2789                                     }
2790                                 }
2791                                 break;
2792                             }
2793                             switch(s.m_details[det_pos]) {
2794                             case 'M' :
2795                                 ++seq1_pos;
2796                                 ++seq2_pos;
2797                                 break;
2798                             case 'R' :
2799                                 ++seq1_pos;
2800                                 ++seq2_pos;
2801                                 break;
2802                             case 'I' :
2803                                 ++seq2_pos;
2804                                 break;
2805                             case 'D' :
2806                                 ++seq1_pos;
2807                                 break;
2808                             }
2809                             ++det_pos;
2810                         }
2811                     }
2812 
2813                     //try to cut from right
2814                     if(cut_from_right) {
2815                         int seq1_pos = (int)s.m_box[1];
2816                         int seq2_pos = (int)s.m_box[3];
2817                         int det_pos = s.m_details.size() - 1;
2818                         int min_det_pos = det_pos - kMaxCutToSplice;
2819                         int min_pos = (int)s.m_box[0] + 8;//exon should not be too short
2820                         while(seq1_pos >= min_pos && det_pos >= min_det_pos) {
2821                             if( (size_t)(seq2_pos + 2) < m_genomic.size() && s.m_details[det_pos] == 'M' &&
2822                                    toupper(Seq2[seq2_pos+1]) == 'G' &&  toupper(Seq2[seq2_pos+2]) == 'T'  ) {//GT point
2823                                 if( (size_t)det_pos + 1 < s.m_details.size() ) {//resize
2824                                     s.m_box[1] = seq1_pos;
2825                                     s.m_box[3] = seq2_pos;
2826                                     s.m_details.resize(det_pos + 1);
2827                                     s.Update(m_aligner);
2828                                     // update the last two annotation symbols
2829                                     size_t adim = s.m_annot.size();
2830                                     if(adim > 0 && s.m_annot[adim-1] == '>') {
2831                                         s.m_annot += "GT";
2832                                     } else if(adim > 2 && s.m_annot[adim-3] == '>') {
2833                                         s.m_annot[adim-2] = 'G';
2834                                         s.m_annot[adim-1] = 'T';
2835                                     }
2836                                     if( k0 + 1 < sdim && ( !m_segments[k0+1].m_exon ) ) {//adjust next gap
2837                                         TSegment& g = m_segments[k0+1];
2838                                         g.m_box[0] = s.m_box[1] + 1;
2839                                         g.m_box[2] = s.m_box[3] + 1;
2840                                         g.m_len = g.m_box[1] - g.m_box[0] + 1;
2841                                     }
2842                                 }
2843                                 break;
2844                             }
2845                             switch(s.m_details[det_pos]) {
2846                             case 'M' :
2847                                 --seq1_pos;
2848                                 --seq2_pos;
2849                                 break;
2850                             case 'R' :
2851                                 --seq1_pos;
2852                                 --seq2_pos;
2853                                 break;
2854                             case 'I' :
2855                                 --seq2_pos;
2856                                 break;
2857                             case 'D' :
2858                                 --seq1_pos;
2859                                 break;
2860                             }
2861                             --det_pos;
2862                         }
2863                     }
2864                 }
2865             }
2866         }
2867     }}
2868 
2869     //throw away bad stand alone exons
2870     {{
2871       if(is_test_plus) {
2872           bool adjust = false;
2873           bool prev_exon = false;
2874           size_t ssize = m_segments.size();
2875           for(size_t pp = 0; pp <ssize ; ++pp) {
2876               if(m_segments[pp].m_exon) {
2877                   if( !prev_exon && ( pp == ssize - 1 || !m_segments[pp+1].m_exon ) ) { //stand alone exon
2878                       if(trim.ThrowAway20_28_90(m_segments[pp])) {
2879                           adjust = true;
2880                       }
2881                   }
2882                   prev_exon = true;
2883               } else {
2884                   prev_exon = false;
2885               }
2886           }
2887           if(adjust) {
2888               trim.AdjustGaps(m_segments);
2889           }
2890       }
2891     }}
2892 
2893     // stich small holes
2894     {{
2895         size_t min_hole_len = GetMinHoleLen();
2896         if( min_hole_len > 0) { //find small holes and stich
2897             trim.AdjustGaps(m_segments);//make sure there is no adjacent gaps
2898             size_t pos1 = 0, pos2 = 2;
2899             for(; pos2 < m_segments.size(); ++pos1, ++pos2) {
2900                 if( m_segments[pos1].m_exon && !m_segments[pos1+1].m_exon && m_segments[pos2].m_exon &&
2901                     m_segments[pos1].m_box[1] + min_hole_len >= m_segments[pos2].m_box[0] &&
2902                     m_segments[pos1].m_box[3] + min_hole_len >= m_segments[pos2].m_box[2] ) {
2903 
2904                     trim.JoinExons(m_segments, pos1, pos2);//note: m_segments changes here!
2905                 }
2906             }
2907         }
2908     }}
2909 
2910     // cut gaps to codons
2911     {{
2912         bool cut_to_codons = true;
2913         if( cut_to_codons ) {
2914         }
2915     }}
2916 
2917     /// there is more postprocessing to follow
2918 
2919     if( m_segments.size() == 0 ) {
2920         NCBI_THROW(CAlgoAlignException, eNoAlignment, g_msg_NoAlignment);//no exons
2921     }
2922 
2923 
2924 //#define DUMP_PROCESSED_SEGS
2925 #ifdef DUMP_PROCESSED_SEGS
2926     cerr << "Processed segments:" << endl;
2927     ITERATE(TSegments, ii, m_segments) {
2928         cerr << ii->m_box[0] << '\t' << ii->m_box[1] << '\t'
2929              << ii->m_box[2] << '\t' << ii->m_box[3] << '\t'
2930              << ii->m_annot << '\t' << ii->m_score << endl;
2931     }
2932 #endif
2933 
2934     rv /= m_aligner->GetWm();
2935     return rv;
2936 }
2937 
2938 
GetIdentity() const2939 double CSplign::SAlignedCompartment::GetIdentity() const
2940 {
2941     string trans;
2942     for(size_t i (0), dim (m_Segments.size()); i < dim; ++i) {
2943         const TSegment & s (m_Segments[i]);
2944         if(s.m_exon) {
2945             trans.append(s.m_details);
2946         }
2947         else {
2948             trans.append(s.m_len, 'D');
2949         }
2950     }
2951     size_t matches = 0;
2952     ITERATE(string, ii, trans) {
2953         if(*ii == 'M') {
2954             ++matches;
2955         }
2956     }
2957     return double(matches) / trans.size();
2958 }
2959 
2960 
2961 
GetBox(Uint4 * box) const2962 void CSplign::SAlignedCompartment::GetBox(Uint4* box) const
2963 {
2964     box[0] = box[2] = kMax_UInt;
2965     box[1] = box[3] = 0;
2966     ITERATE(TSegments, ii, m_Segments) {
2967         const TSegment& s (*ii);
2968         if(s.m_exon) {
2969 
2970             Uint4 a, b;
2971             if(s.m_box[0] <= s.m_box[1]) {
2972                 a = s.m_box[0];
2973                 b = s.m_box[1];
2974             }
2975             else {
2976                 b = s.m_box[0];
2977                 a = s.m_box[1];
2978             }
2979             if(a < box[0]) {
2980                 box[0] = a;
2981             }
2982             if(b > box[1]) {
2983                 box[1] = b;
2984             }
2985 
2986             if(s.m_box[2] <= s.m_box[3]) {
2987                 a = s.m_box[2];
2988                 b = s.m_box[3];
2989             }
2990             else {
2991                 b = s.m_box[2];
2992                 a = s.m_box[3];
2993             }
2994             if(a < box[2]) {
2995                 box[2] = a;
2996             }
2997             if(b > box[3]) {
2998                 box[3] = b;
2999             }
3000         }
3001     }
3002 }
3003 
3004 
x_ProcessTermSegm(TSegment ** term_segs,Uint1 side) const3005 bool CSplign::x_ProcessTermSegm(TSegment** term_segs, Uint1 side) const
3006 {
3007     bool turn2gap (false);
3008 
3009     const size_t exon_size (1 + term_segs[0]->m_box[1] -
3010                             term_segs[0]->m_box[0]);
3011 
3012     const double idty (term_segs[0]->m_idty);
3013 
3014 
3015     if( GetTestType() == kTestType_production_default ) {//default production
3016         if(exon_size < kMinTermExonSize && idty < kMinTermExonIdty ) {
3017             turn2gap = true;
3018         }
3019     }
3020 
3021     if(exon_size < kMinTermExonSize) {
3022 
3023             // verify that the intron is not too long
3024 
3025             size_t a, b;
3026             const char *dnr, *acc;
3027             if(side == 0) {
3028                 a = term_segs[0]->m_box[3];
3029                 b = term_segs[1]->m_box[2];
3030                 dnr = term_segs[0]->GetDonor();
3031                 acc = term_segs[1]->GetAcceptor();
3032             }
3033             else {
3034                 a = term_segs[1]->m_box[3];
3035                 b = term_segs[0]->m_box[2];
3036                 dnr = term_segs[1]->GetDonor();
3037                 acc = term_segs[0]->GetAcceptor();
3038             }
3039 
3040             const size_t intron_len (b - a);
3041 
3042             const bool consensus (TSegment::s_IsConsensusSplice(dnr, acc));
3043 
3044             size_t max_ext ((idty < .96 || !consensus || exon_size < 16)?
3045                             m_max_genomic_ext: (5000 *  kMinTermExonSize));
3046 
3047             if(consensus) {
3048                 if(exon_size < 8) {
3049                     max_ext = 10 * exon_size;
3050                 }
3051             }
3052             else if(exon_size < 16) {
3053                 max_ext = 1;
3054             }
3055 
3056             const size_t max_intron_len (x_GetGenomicExtent(exon_size, max_ext));
3057             if(intron_len > max_intron_len) {
3058                 turn2gap = true;
3059             }
3060     }
3061 
3062     if(turn2gap) {
3063 
3064         // turn the segment into a gap
3065         TSegment& s = *(term_segs[0]);
3066         s.SetToGap();
3067         s.m_len = exon_size;
3068     }
3069 
3070     return turn2gap;
3071 }
3072 
3073 
x_GetGenomicExtent(const Uint4 query_len,Uint4 max_ext) const3074 Uint4 CSplign::x_GetGenomicExtent(const Uint4 query_len, Uint4 max_ext) const
3075 {
3076     if(max_ext == 0) {
3077         max_ext = m_max_genomic_ext;
3078     }
3079 
3080     Uint4 rv (0);
3081     if(query_len >= kNonCoveredEndThreshold) {
3082         rv = m_max_genomic_ext;
3083     }
3084     else {
3085         const double k (pow(kNonCoveredEndThreshold, - 1. / kPower) * max_ext);
3086         const double drv (k * pow(query_len, 1. / kPower));
3087         rv = Uint4(drv);
3088     }
3089 
3090     return rv;
3091 }
3092 
3093 
3094 ////////////////////////////////////
3095 
3096 namespace splign_local {
3097 
3098     template<typename T>
ElemToBuffer(const T & n,char * & p)3099     void ElemToBuffer(const T& n, char*& p)
3100     {
3101         *(reinterpret_cast<T*>(p)) = n;
3102         p += sizeof(n);
3103     }
3104 
3105     template<>
ElemToBuffer(const string & s,char * & p)3106     void ElemToBuffer(const string& s, char*& p)
3107     {
3108         copy(s.begin(), s.end(), p);
3109         p += s.size();
3110         *p++ = 0;
3111     }
3112 
3113     template<typename T>
ElemFromBuffer(T & n,const char * & p)3114     void ElemFromBuffer(T& n, const char*& p)
3115     {
3116         n = *(reinterpret_cast<const T*>(p));
3117         p += sizeof(n);
3118     }
3119 
3120     template<>
ElemFromBuffer(string & s,const char * & p)3121     void ElemFromBuffer(string& s, const char*& p)
3122     {
3123         s = p;
3124         p += s.size() + 1;
3125     }
3126 }
3127 
3128 
ToBuffer(TNetCacheBuffer * target) const3129 void CNWFormatter::SSegment::ToBuffer(TNetCacheBuffer* target) const
3130 {
3131     using namespace splign_local;
3132 
3133     if(target == 0) {
3134         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed );
3135     }
3136 
3137     const size_t total_size = sizeof m_exon + sizeof m_idty +
3138         sizeof m_len + sizeof m_box + m_annot.size() + 1 +
3139         m_details.size() + 1 + sizeof m_score;
3140 
3141     target->resize(total_size);
3142 
3143     char* p = &target->front();
3144     ElemToBuffer(m_exon, p);
3145     ElemToBuffer(m_idty, p);
3146     ElemToBuffer(m_len, p);
3147     for(size_t i = 0; i < 4; ++i) {
3148         ElemToBuffer(m_box[i], p);
3149     }
3150     ElemToBuffer(m_annot, p);
3151     ElemToBuffer(m_details, p);
3152     ElemToBuffer(m_score, p);
3153 }
3154 
3155 
FromBuffer(const TNetCacheBuffer & source)3156 void CNWFormatter::SSegment::FromBuffer(const TNetCacheBuffer& source)
3157 {
3158     using namespace splign_local;
3159 
3160     const size_t min_size = sizeof m_exon + sizeof m_idty + sizeof m_len +
3161         + sizeof m_box + 1 + 1 + sizeof m_score;
3162 
3163     if(source.size() < min_size) {
3164         NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
3165     }
3166 
3167     const char* p = &source.front();
3168     ElemFromBuffer(m_exon, p);
3169     ElemFromBuffer(m_idty, p);
3170     ElemFromBuffer(m_len, p);
3171 
3172     for(size_t i = 0; i < 4; ++i) {
3173         ElemFromBuffer(m_box[i], p);
3174     }
3175 
3176     ElemFromBuffer(m_annot, p);
3177     ElemFromBuffer(m_details, p);
3178     ElemFromBuffer(m_score, p);
3179 }
3180 
3181 
ToBuffer(TNetCacheBuffer * target) const3182 void CSplign::SAlignedCompartment::ToBuffer(TNetCacheBuffer* target) const
3183 {
3184     using namespace splign_local;
3185 
3186     if(target == 0) {
3187         NCBI_THROW(CAlgoAlignException, eBadParameter, g_msg_NullPointerPassed);
3188     }
3189 
3190     const size_t core_size (
3191         sizeof m_Id + sizeof m_Status + m_Msg.size() + 1
3192         + sizeof m_QueryStrand + sizeof m_SubjStrand
3193         + sizeof m_Cds_start + sizeof m_Cds_stop
3194         + sizeof m_QueryLen
3195         + sizeof m_PolyA
3196         + sizeof m_Score);
3197 
3198     vector<char> core (core_size);
3199 
3200     char* p = &core.front();
3201     ElemToBuffer(m_Id, p);
3202     ElemToBuffer(m_Status, p);
3203     ElemToBuffer(m_Msg, p);
3204     ElemToBuffer(m_QueryStrand, p);
3205     ElemToBuffer(m_SubjStrand, p);
3206     ElemToBuffer(m_Cds_start, p);
3207     ElemToBuffer(m_Cds_stop, p);
3208     ElemToBuffer(m_QueryLen, p);
3209     ElemToBuffer(m_PolyA, p);
3210     ElemToBuffer(m_Score, p);
3211 
3212     typedef vector<TNetCacheBuffer> TBuffers;
3213     TBuffers vb (m_Segments.size());
3214     size_t ibuf (0);
3215     ITERATE(TSegments, ii, m_Segments) {
3216         ii->ToBuffer(&vb[ibuf++]);
3217     }
3218 
3219     size_t total_size (core_size + sizeof(size_t) * m_Segments.size());
3220     ITERATE(TBuffers, ii, vb) {
3221         total_size += ii->size();
3222     }
3223 
3224     target->resize(total_size);
3225     TNetCacheBuffer::iterator it = target->begin();
3226     copy(core.begin(), core.end(), it);
3227     it += core_size;
3228 
3229     ITERATE(TBuffers, ii, vb) {
3230         char* p = &(*it);
3231         const size_t seg_buf_size = ii->size();
3232         *((size_t*)p) = seg_buf_size;
3233         it += sizeof (size_t);
3234         copy(ii->begin(), ii->end(), it);
3235         it += seg_buf_size;
3236     }
3237 }
3238 
3239 
FromBuffer(const TNetCacheBuffer & source)3240 void CSplign::SAlignedCompartment::FromBuffer(const TNetCacheBuffer& source)
3241 {
3242     using namespace splign_local;
3243 
3244     const size_t min_size (
3245           sizeof m_Id
3246           + sizeof m_Status
3247           + 1
3248           + sizeof m_QueryStrand + sizeof m_SubjStrand
3249           + sizeof m_Cds_start + sizeof m_Cds_stop
3250           + sizeof m_QueryLen
3251           + sizeof m_PolyA
3252           + sizeof m_Score );
3253 
3254     if(source.size() < min_size) {
3255         NCBI_THROW(CAlgoAlignException, eInternal, g_msg_NetCacheBufferIncomplete);
3256     }
3257 
3258     const char* p (&source.front());
3259     ElemFromBuffer(m_Id, p);
3260     ElemFromBuffer(m_Status, p);
3261     ElemFromBuffer(m_Msg, p);
3262     ElemFromBuffer(m_QueryStrand, p);
3263     ElemFromBuffer(m_SubjStrand, p);
3264     ElemFromBuffer(m_Cds_start, p);
3265     ElemFromBuffer(m_Cds_stop, p);
3266     ElemFromBuffer(m_QueryLen, p);
3267     ElemFromBuffer(m_PolyA, p);
3268     ElemFromBuffer(m_Score, p);
3269 
3270     const char* pe (&source.back());
3271     while(p <= pe) {
3272         size_t seg_buf_size (0);
3273         ElemFromBuffer(seg_buf_size, p);
3274         m_Segments.push_back(TSegment());
3275         TSegment& seg (m_Segments.back());
3276         seg.FromBuffer(TNetCacheBuffer(p, p + seg_buf_size));
3277         p += seg_buf_size;
3278     }
3279 }
3280 
s_ComputeStats(CRef<objects::CSeq_align_set> sas,TScoreSets * output_stats,TOrf cds,EStatFlags flags)3281 size_t CSplign::s_ComputeStats(CRef<objects::CSeq_align_set> sas,
3282                                TScoreSets * output_stats,
3283                                TOrf cds,
3284                                EStatFlags flags)
3285 {
3286 
3287     const
3288     bool valid_input (sas.GetPointer() && sas->CanGet() && sas->Get().size()
3289                       && sas->Get().front()->CanGetSegs()
3290                       && sas->Get().front()->GetSegs().IsSpliced()
3291                       && sas->Get().front()->GetSegs().GetSpliced().GetProduct_type()
3292                       == CSpliced_seg::eProduct_type_transcript
3293                       && output_stats);
3294 
3295     if(!valid_input) {
3296         NCBI_THROW(CAlgoAlignException, eBadParameter,
3297                    "CSplign::s_ComputeStats(): Invalid input");
3298     }
3299 
3300     output_stats->resize(0);
3301 
3302     ITERATE(CSeq_align_set::Tdata, ii1, sas->Get()) {
3303         CRef<CScore_set> ss (s_ComputeStats(*ii1, false, cds, flags));
3304         output_stats->push_back(ss);
3305     }
3306 
3307     return output_stats->size();
3308 }
3309 
3310 
3311 namespace {
3312     const int kFrame_not_set (-10);
3313     const int kFrame_end     (-5);
3314     const int kFrame_lost    (-20);
3315 }
3316 
3317 
s_ComputeStats(CRef<objects::CSeq_align> sa,bool embed_scoreset,TOrf cds,EStatFlags flags)3318 CRef<objects::CScore_set> CSplign::s_ComputeStats(CRef<objects::CSeq_align> sa,
3319                                                   bool embed_scoreset,
3320                                                   TOrf cds,
3321                                                   EStatFlags flags)
3322 {
3323     if(!(flags & (eSF_BasicNonCds | eSF_BasicCds))) {
3324         NCBI_THROW(CException, eUnknown,
3325                    "CSplign::s_ComputeStats(): mode not yet supported.");
3326     }
3327 
3328     const bool cds_stats ((flags & eSF_BasicCds) && (cds.first + cds.second > 0));
3329 
3330     // set individual scores
3331     CRef<CScore_set> ss (new CScore_set);
3332     CScore_set::Tdata & scores (ss->Set());
3333 
3334     if(flags & eSF_BasicNonCds) {
3335        CSeq_align::TScore score_vec;
3336        CScoreBuilderBase().AddSplignScores(*sa, score_vec);
3337        scores.assign(score_vec.begin(), score_vec.end());
3338     }
3339 
3340     if(cds_stats) {
3341 
3342         typedef CSeq_align::TSegs::TSpliced TSpliced;
3343         const TSpliced & spliced (sa->GetSegs().GetSpliced());
3344         if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
3345             NCBI_THROW(CAlgoAlignException, eBadParameter,
3346                        "CSplign::s_ComputeStats(): Unsupported product type");
3347         }
3348 
3349         const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
3350         if(cds_stats) {
3351             const bool cds_strand (cds.first < cds.second);
3352             if(qstrand ^ cds_strand) {
3353                 NCBI_THROW(CAlgoAlignException, eBadParameter,
3354                            "CSplign::s_ComputeStats(): Transcript orientation not "
3355                            "matching specified CDS orientation.");
3356             }
3357         }
3358 
3359         typedef TSpliced::TExons TExons;
3360         const TExons & exons (spliced.GetExons());
3361 
3362         size_t matches (0),
3363             aligned_query_bases (0),  // matches, mismatches and indels
3364             aln_length_exons (0),
3365             aln_length_gaps (0);
3366 
3367         const TSeqPos  qlen (spliced.GetProduct_length());
3368         const TSeqPos polya (spliced.CanGetPoly_a()?
3369                              spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
3370 
3371         typedef CSpliced_exon TExon;
3372         TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
3373         string xcript;
3374         ITERATE(TExons, ii2, exons) {
3375 
3376             const TExon & exon (**ii2);
3377             const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
3378                 qmax (exon.GetProduct_end().GetNucpos());
3379 
3380             const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
3381 
3382             if(qgap > 0) {
3383                 aln_length_gaps += qgap;
3384                 if(cds_stats) xcript.append(qgap, 'X');
3385             }
3386 
3387             typedef TExon::TParts TParts;
3388             const TParts & parts (exon.GetParts());
3389             string errmsg;
3390             ITERATE(TParts, ii3, parts) {
3391                 const CSpliced_exon_chunk & part (**ii3);
3392                 const CSpliced_exon_chunk::E_Choice choice (part.Which());
3393                 TSeqPos len (0);
3394                 switch(choice) {
3395                 case CSpliced_exon_chunk::e_Match:
3396                     len = part.GetMatch();
3397                     matches += len;
3398                     aligned_query_bases += len;
3399                     if(cds_stats) xcript.append(len, 'M');
3400                     break;
3401                 case CSpliced_exon_chunk::e_Mismatch:
3402                     len = part.GetMismatch();
3403                     aligned_query_bases += len;
3404                     if(cds_stats) xcript.append(len, 'R');
3405                     break;
3406                 case CSpliced_exon_chunk::e_Product_ins:
3407                     len = part.GetProduct_ins();
3408                     aligned_query_bases += len;
3409                     if(cds_stats) xcript.append(len, 'D');
3410                     break;
3411                 case CSpliced_exon_chunk::e_Genomic_ins:
3412                     len = part.GetGenomic_ins();
3413                     if(cds_stats) xcript.append(len, 'I');
3414                     break;
3415                 default:
3416                     errmsg = "Unexpected spliced exon chunk part: "
3417                         + part.SelectionName(choice);
3418                     NCBI_THROW(CAlgoAlignException, eBadParameter, errmsg);
3419                 }
3420                 aln_length_exons += len;
3421             }
3422 
3423             qprev = qstrand? qmax: qmin;
3424         } // TExons
3425 
3426         const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
3427         aln_length_gaps += qgap;
3428         if(cds_stats) xcript.append(qgap, 'X');
3429 
3430         if(!qstrand && qlen <= 0) {
3431             NCBI_THROW(CAlgoAlignException, eBadParameter,
3432                        "CSplign::s_ComputeStats(): Cannot compute "
3433                        "inframe stats - transcript length not set.");
3434         }
3435 
3436         int qpos (qstrand? -1: int(qlen));
3437         int qinc (qstrand? +1: -1);
3438         int frame (kFrame_not_set);
3439         size_t aln_length_cds (0);
3440         size_t matches_frame[] = {0, 0, 0, 0, 0};
3441         const int cds_start (cds.first), cds_stop (cds.second);
3442         for(string::const_iterator ie (xcript.end()), ii(xcript.begin());
3443             ii != ie && frame != kFrame_end; ++ii)
3444         {
3445 
3446             switch(*ii) {
3447 
3448             case 'M':
3449                 qpos += qinc;
3450                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3451                 if(qpos == cds_stop) frame = kFrame_end;
3452                 if(frame >= -2) {
3453                     ++aln_length_cds;
3454                     ++matches_frame[frame + 2];
3455                 }
3456                 break;
3457 
3458             case 'R':
3459                 qpos += qinc;
3460                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3461                 if(qpos == cds_stop) frame = kFrame_end;
3462                 if(frame >= -2) ++aln_length_cds;
3463                 break;
3464 
3465             case 'D':
3466                 qpos += qinc;
3467                 if(frame == kFrame_not_set && qpos == cds_start) frame = 0;
3468                 if(qpos == cds_stop) frame = kFrame_end;
3469                 if(frame >= -2) {
3470                     ++aln_length_cds;
3471                     frame = (frame + 1) % 3;
3472                 }
3473                 break;
3474 
3475             case 'I':
3476                 if(frame >= -2) {
3477                     ++aln_length_cds;
3478                     frame = (frame - 1) % 3;
3479                 }
3480                 break;
3481 
3482             case 'X':
3483                 qpos += qinc;
3484                 if( (qstrand && cds_start <= qpos && qpos < cds_stop) ||
3485                     (!qstrand && cds_start >= qpos && qpos > cds_stop) )
3486                 {
3487                     frame = kFrame_lost;
3488                     ++aln_length_cds;
3489                 }
3490                 break;
3491             }
3492         }
3493 
3494         {
3495             CRef<CScore> score_matches_inframe (new CScore());
3496             score_matches_inframe->SetId().SetId(eCS_InframeMatches);
3497             score_matches_inframe->SetValue().SetInt(matches_frame[2]);
3498             scores.push_back(score_matches_inframe);
3499         }
3500 
3501         {
3502             CRef<CScore> score_inframe_identity (new CScore());
3503             score_inframe_identity->SetId().SetId(eCS_InframeIdentity);
3504             score_inframe_identity->SetValue().
3505                 SetReal(double(matches_frame[2]) / aln_length_cds);
3506             scores.push_back(score_inframe_identity);
3507         }
3508     }
3509 
3510 
3511     if(embed_scoreset) {
3512         CSeq_align::TScore & sa_score (sa->SetScore());
3513         sa_score.resize(scores.size());
3514         copy(scores.begin(), scores.end(), sa_score.begin());
3515     }
3516 
3517     return ss;
3518 }
3519 
3520 
3521 END_NCBI_SCOPE
3522 
3523 
3524