1 #ifndef ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
2 #define ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
3
4 /* $Id: compartment_finder.hpp 617919 2020-10-08 11:57:15Z gouriano $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Yuri Kapustin, Alexander Souvorov, Boris Kiryutin
30 *
31 * File Description:
32 *
33 * cDNA-to-genomic compartmentization algortihms.
34 * The task of compartmentization is to identify alternative placements
35 * of a transcript on a genomic sequence.
36 *
37 * CCompartmentFinder Identification of genomic compartments.
38 * CCompartmentAccessor An aggregator class to CCompartmentFinder.
39 *
40 * ===========================================================================
41 */
42
43
44 #include <corelib/ncbi_limits.hpp>
45
46 #include <algo/align/nw/align_exception.hpp>
47 #include <algo/align/util/hit_filter.hpp>
48
49 #include <objects/seqalign/Seq_align_set.hpp>
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Std_seg.hpp>
52 #include <objects/seqalign/Dense_seg.hpp>
53 #include <objects/seqloc/Seq_loc.hpp>
54 #include <objects/seqloc/Na_strand.hpp>
55 #include <objects/seq/Seq_literal.hpp>
56
57 #include <objmgr/seq_map.hpp>
58 #include <objmgr/scope.hpp>
59 #include <objmgr/seq_map_ci.hpp>
60
61 #include <algorithm>
62 #include <numeric>
63 #include <vector>
64
65
66 BEGIN_NCBI_SCOPE
67
68 USING_SCOPE(objects);
69
70 template<class THit>
71 class CGapInfo {
72 public:
73 typedef typename THit::TCoord TCoord;
74
CGapInfo(const CSeq_id & id,CScope * scope)75 CGapInfo(const CSeq_id& id, CScope *scope) {
76 m_scope = scope;
77 m_flipped = false;
78 if(scope) {
79 m_seq_map = & (scope->GetBioseqHandle(id).GetSeqMap() );
80 }
81 }
82
Flip(TCoord subj_min,TCoord subj_max)83 void Flip(TCoord subj_min, TCoord subj_max) {
84 m_flipped = true;
85 m_subj_min = subj_min;
86 m_subj_max = subj_max;
87 }
88
IntersectsNonBridgeableGap(TCoord from,TCoord to)89 bool IntersectsNonBridgeableGap(TCoord from, TCoord to) {
90 if(m_flipped) {
91 TCoord new_from = m_subj_min + m_subj_max - to;
92 TCoord new_to = m_subj_min + m_subj_max - from;
93 from = new_from;
94 to = new_to;
95 }
96 bool res = false;
97 if(m_scope && m_seq_map) {
98 CSeqMap_CI it = m_seq_map->ResolvedRangeIterator(m_scope, from, to - from + 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
99 for( ; it; ++it) {
100 CConstRef<CSeq_literal> lit = it.GetRefGapLiteral();
101 if(lit && lit->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
102 res = true;
103 break;
104 }
105 }
106 }
107 return res;
108 }
109
110 private:
111 CScope *m_scope;
112 bool m_flipped;
113 TCoord m_subj_min;
114 TCoord m_subj_max;
115 CConstRef<CSeqMap> m_seq_map;
116 };
117
118
119 template<class THit>
120 class CPrecalcGapInfo {
121 public:
122 typedef typename THit::TCoord TCoord;
123
CPrecalcGapInfo(const vector<pair<TCoord,TCoord>> * gaps)124 CPrecalcGapInfo(const vector<pair<TCoord, TCoord> >* gaps) : m_gaps(gaps) {
125 m_flipped = false;
126 }
127
Flip(TCoord subj_min,TCoord subj_max)128 void Flip(TCoord subj_min, TCoord subj_max) {
129 m_flipped = true;
130 m_subj_min = subj_min;
131 m_subj_max = subj_max;
132 }
133
IntersectsNonBridgeableGap(TCoord from,TCoord to)134 bool IntersectsNonBridgeableGap(TCoord from, TCoord to) {
135 if(m_gaps == NULL) {
136 return false;
137 }
138 if(m_flipped) {
139 TCoord new_from = m_subj_min + m_subj_max - to;
140 TCoord new_to = m_subj_min + m_subj_max - from;
141 from = new_from;
142 to = new_to;
143 }
144 bool res = false;
145 for(size_t i = 0; i < m_gaps->size(); ++i) {
146 TCoord gfrom = (*m_gaps)[i].first;
147 TCoord gto = (*m_gaps)[i].second;
148 if( gfrom < to && from < gto ) {
149 res = true;
150 break;
151 }
152 }
153 return res;
154 }
155
156 private:
157 bool m_flipped;
158 TCoord m_subj_min;
159 TCoord m_subj_max;
160 const vector<pair<TCoord, TCoord> > *m_gaps;
161 };
162
163
164 template<class THit>
165 class CCompartmentFinder {
166
167 public:
168
169 typedef CRef<THit> THitRef;
170 typedef vector<THitRef> THitRefs;
171 typedef typename THit::TCoord TCoord;
172
173 /// Create the object from the complete set of local alignments (hits)
174 /// between the transcript and the genomic sequences.
175 /// The hits are assumed to share the same query and subject and
176 /// to have plus strand on both sequences.
177 ///
178 /// @param start
179 /// Start of the input set of alignments.
180 /// @param finish
181 /// End of the input set of alignments.
182
183 CCompartmentFinder(typename THitRefs::const_iterator start,
184 typename THitRefs::const_iterator finish,
185 CGapInfo<THit> *gap_info = NULL, CPrecalcGapInfo<THit> *prec_gap_info = NULL);
186
187 /// Set compartment overlap behaviour. Allow compartments to overlap
188 /// on subject.
189 ///
190 /// @param max_overlap
191 /// Maximum length of overlap on subject.
192 /// 0 - do not allow overlap.
193
SetMaxOverlap(TCoord max_overlap)194 void SetMaxOverlap(TCoord max_overlap) {
195 m_max_overlap = max_overlap;
196 }
197
198 /// Retrieve the default compartment overlap behaviour (no overlap).
199
s_GetDefaultMaxOverlap(void)200 static TCoord s_GetDefaultMaxOverlap(void) {
201 return 0;
202 }
203
204 /// Set the maximum length of an intron
205 ///
206 /// @param intr_max
207 /// Maximum length of an intron, in base pairs.
208
SetMaxIntron(TCoord intr_max)209 void SetMaxIntron(TCoord intr_max) {
210 m_intron_max = intr_max;
211 }
212
213 /// Retrieve the default maximum length of an intron
214
s_GetDefaultMaxIntron(void)215 static TCoord s_GetDefaultMaxIntron(void) {
216 // Some examples:
217 // NM_147181.3 vs NC_000004.10 (~1.1M)
218 return 1200000; // NM_001128929.1 vs NC_000003.10 (~1.2M)
219 }
220
221
222 /// Set the penalty to open a compartment
223 ///
224 /// @param penalty
225 /// Compartment opening penalty, in base pairs.
226
SetPenalty(TCoord penalty)227 void SetPenalty(TCoord penalty) {
228 m_penalty = penalty;
229 }
230
231 /// Set the minimum matching residues per compartment.
232 ///
233 /// @param min_matches
234 /// The min number of matching residues in base pairs
235
SetMinMatches(TCoord min_matches)236 void SetMinMatches(TCoord min_matches) {
237 m_MinSingletonMatches = m_MinMatches = min_matches;
238 }
239
240 /// Set the minimum matching residues per singleton compartment.
241 ///
242 /// @param min_matches
243 /// The min number of matching residues in base pairs
244
SetMinSingletonMatches(TCoord min_matches)245 void SetMinSingletonMatches(TCoord min_matches) {
246 m_MinSingletonMatches = min_matches;
247 }
248
249 /// Retrieve the default compartment penalty, in base pairs
250
s_GetDefaultPenalty(void)251 static TCoord s_GetDefaultPenalty(void) {
252 return 500;
253 }
254
255 /// Retrieve the default minimum coverage, in base pairs
256
s_GetDefaultMinCoverage(void)257 static TCoord s_GetDefaultMinCoverage(void) {
258 return 500;
259 }
260
261 /// Identify compartments.
262 ///
263 /// @param cross_filter
264 /// When activated, cross filtering will ensure that only
265 /// the alignments that provided non-ambguous mapping between
266 /// the sequences will be reported in the output.
267
268 size_t Run(bool cross_filter = false); // do the compartment search
269
270 /// Order compartments by lower subj coordinate
271
272 void OrderCompartments(void);
273
274 /// Individual compartment representation.
275 /// Compartment members are the hits comprising the compartment.
276
277 class CCompartment {
278
279 public:
280
281 /// Create an empty compartment
282
CCompartment(void)283 CCompartment(void) {
284 m_box[0] = m_box[2] = numeric_limits<TCoord>::max();
285 m_box[1] = m_box[3] = 0;
286 }
287
288 /// Retrieve compartment's members
289
GetMembers(void)290 const THitRefs& GetMembers(void) {
291 return m_members;
292 }
293
294 /// Add a new member into the compartment
295
AddMember(const THitRef & hitref)296 void AddMember(const THitRef& hitref) {
297 m_members.push_back(hitref);
298 }
299
300 /// Assign all members of the compartment
301
SetMembers(void)302 THitRefs& SetMembers(void) {
303 return m_members;
304 }
305
306 /// Make sure the compartment's box is up-to-date
307
308 void UpdateMinMax(void);
309
310 /// Retrieve the compartment's strand (true == plus)
311
312 bool GetStrand(void) const;
313
314 /// Retrieve the first member.
315 ///
316 /// @return
317 /// A reference on the first member of the compartment,
318 /// or a null reference if the compartment is empty.
319
GetFirst(void) const320 const THitRef GetFirst(void) const {
321 m_iter = 0;
322 return GetNext();
323 }
324
325
326 /// Retrieve the next member.
327 ///
328 /// @return
329 /// A reference on the next member of the compartment,
330 /// or a null reference if there are no more members.
331
GetNext(void) const332 const THitRef GetNext(void) const {
333 if(m_iter < m_members.size()) {
334 return m_members[m_iter++];
335 }
336 return THitRef(NULL);
337 }
338
339 /// Retrieve the compartment's box.
340 ///
341 /// @return
342 /// A const pointer at the compartment's box.
343
GetBox(void) const344 const TCoord* GetBox(void) const {
345 return m_box;
346 }
347
348 // helper predicate
s_PLowerSubj(const CCompartment & c1,const CCompartment & c2)349 static bool s_PLowerSubj(const CCompartment& c1, const CCompartment& c2) {
350
351 return c1.m_box[2] < c2.m_box[2];
352 }
353
354 protected:
355
356 THitRefs m_members;
357 TCoord m_box[4];
358 mutable size_t m_iter;
359 };
360
361 // iterate through compartments
362
363 CCompartment* GetFirst(void);
364 CCompartment* GetNext(void);
365
366 private:
367
368 TCoord m_max_overlap; //max overlap on subject for different compartments
369 TCoord m_intron_max; // max intron size
370 TCoord m_penalty; // penalty per compartment
371 TCoord m_MinMatches; // min approx matches to report
372 TCoord m_MinSingletonMatches; // min matches for singleton comps
373
374 THitRefs m_hitrefs; // input hits
375 vector<CCompartment> m_compartments; // final compartments
376 int m_iter; // GetFirst/Next index
377 CGapInfo<THit> * m_gap_info; // holds information about sequence gaps
378 CPrecalcGapInfo<THit> * m_prec_gap_info; // holds precalculated information about sequence gaps
379
380 struct SHitStatus {
381
382 enum EType {
383 eNone,
384 eExtension,
385 eOpening
386 };
387
388 EType m_type;
389 int m_prev;
390 double m_score;
391
SHitStatusCCompartmentFinder::SHitStatus392 SHitStatus(void): m_type(eNone), m_prev(-1), m_score(0)
393 {}
SHitStatusCCompartmentFinder::SHitStatus394 SHitStatus(EType type, int prev, double score):
395 m_type(type), m_prev(prev), m_score(score)
396 {}
397 };
398
399
400 static size_t sx_XFilter(THitRefs& hitrefs,
401 typename THitRefs::iterator ihr,
402 typename THitRefs::iterator ihr_e,
403 Uint1 w,
404 size_t min_compartment_hit_len);
405
406 // helper predicate
407
s_PNullRef(const THitRef & hr)408 static bool s_PNullRef(const THitRef& hr) {
409 return hr.IsNull();
410 }
411 };
412
413
414 // Facilitate access to compartments over a hit set
415 template<class THit>
416 class CCompartmentAccessor
417 {
418 public:
419
420 typedef CCompartmentFinder<THit> TCompartmentFinder;
421 typedef typename TCompartmentFinder::THitRefs THitRefs;
422 typedef typename TCompartmentFinder::TCoord TCoord;
423
424 /// Construct the object and assign the parameters of the algorithm.
425 ///
426 /// @param comp_penalty_bps
427 /// Penalty to open a compartment, in base pairs.
428 /// @param min_matches
429 /// The minimum number of matching residues in a compartment, in base pairs.
430 /// @param min_singleton_matches
431 /// The minimum number of matching residues in a singleton, in base pairs.
432 /// @param cross_filter
433 /// Perform cross-filtering when true.
434
435 CCompartmentAccessor(TCoord comp_penalty_bps,
436 TCoord min_matches,
437 TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
438 bool cross_filter = false);
439
440 /// Construct the object; assign the parameters of the algorithm; execute.
441 /// The input range of alignments is assumed to contain the complete set of
442 /// alignments between the same pair of sequences.
443 /// The alignments can be on one or both genomic strands.
444 ///
445 /// @param start
446 /// Start of the input range of input alignments.
447 /// @param finish
448 /// Stop of the input range of input alignments.
449 /// @param comp_penalty_bps
450 /// Penalty to open a compartment, in base pairs.
451 /// @param min_matches
452 /// The minimum number of matching residues in a compartment, in base pairs.
453 /// @param min_singleton_matches
454 /// The minimum number of matching residues in a singleton, in base pairs.
455 /// @param cross_filter
456 /// Perform cross-filtering when true.
457
458 CCompartmentAccessor(typename THitRefs::iterator start,
459 typename THitRefs::iterator finish,
460 TCoord comp_penalty_bps,
461 TCoord min_matches,
462 TCoord min_singleton_matches = numeric_limits<TCoord>::max(),
463 bool cross_filter = false, CScope *scope = NULL,
464 const vector<pair<TCoord, TCoord> > *gaps = NULL);
465
466 /// Execute: identify compartments. The alignments can be on one
467 /// or both genomic strands.
468 ///
469 /// @param start
470 /// Start of the input range of input alignments.
471 /// @param finish
472 /// Stop of the input range of input alignments.
473
474 void Run(typename THitRefs::iterator start,
475 typename THitRefs::iterator finish, CScope *scope = NULL,
476 const vector<pair<TCoord, TCoord> > *gaps = NULL);
477
478 /// Assign the maximum intron length, in base pairs
479
SetMaxIntron(TCoord mi)480 void SetMaxIntron(TCoord mi) { m_MaxIntron = mi; }
481
482 /// Retrieve the maximum intron length, in base pairs
483
GetMaxIntron(void) const484 TCoord GetMaxIntron(void) const { return m_MaxIntron; }
485
486 /// Assign the maximum length for compartments to overlap on the subject.
487
SetMaxOverlap(TCoord max_overlap)488 void SetMaxOverlap(TCoord max_overlap) { m_MaxOverlap = max_overlap; }
489
490 /// Initialize iteration over the results.
491 /// Results are sorted by strand (minus first) and subj position.
492 ///
493 /// @param compartment
494 /// The first identified compartment
495 /// @return
496 /// true if more compartments are available
497
498 bool GetFirst(THitRefs& compartment);
499
500 /// Proceed with iteration over the results.
501 ///
502 /// @param compartment
503 /// The next identified compartment
504 /// @return
505 /// true if more compartments are available
506
507 bool GetNext(THitRefs& compartment);
508
509 /// Retrieve the compartment counts.
510 ///
511 /// @return
512 /// The first number in the pair is the total count;
513 /// the second number is the number of compartments with the number of matches
514 /// abive the minimum.
515 ///
516
GetCounts(void) const517 pair<size_t,size_t> GetCounts(void) const {
518
519 // std::count() not supported on some platforms
520 size_t valid_count (0);
521 for(size_t i(0), n(m_status.size()); i != n; ++i) {
522 if(m_status[i]) ++valid_count;
523 }
524
525 pair<size_t, size_t> rv (m_pending.size(), valid_count);
526
527 return rv;
528 }
529
530 /// Retrieve a compartment by index.
531 ///
532 /// @param idx
533 /// The compartment's index
534 /// @param compartment
535 /// The reference to receive the compartment requested.
536
Get(size_t idx,THitRefs & compartment) const537 void Get(size_t idx, THitRefs& compartment) const {
538 compartment = m_pending[idx];
539 }
540
GetBox(size_t i) const541 const TCoord* GetBox(size_t i) const {
542 return &m_ranges.front() + i*4;
543 }
544
GetStrand(size_t i) const545 bool GetStrand(size_t i) const {
546 return m_strands[i];
547 }
548
GetStatus(size_t i) const549 bool GetStatus(size_t i) const {
550 return m_status[i];
551 }
552
553 /// Retrieve all valid compartments in a seq-align-set.
554 ///
555 /// @return
556 /// A seq-align-set object with this hierarchy.
557 /// 1. Compartment-level seq-align with bounds.
558 /// 2. Seq-align-set keeping individual hits of the compartment.
559 /// 3. Hit-level seq-align of the dense-seg type.
560 CRef<objects::CSeq_align_set> AsSeqAlignSet(void) const;
561
562 private:
563
564 void x_Init(TCoord comp_penalty, TCoord min_matches,
565 TCoord min_singleton_matches, bool cross_filter);
566
567 // compartmentization parameters
568 TCoord m_Penalty;
569 TCoord m_MinMatches;
570 TCoord m_MinSingletonMatches;
571 TCoord m_MaxIntron;
572 TCoord m_MaxOverlap;
573 bool m_CrossFiltering;
574
575 // ancillary members
576 vector<THitRefs> m_pending;
577 vector<TCoord> m_ranges;
578 vector<bool> m_strands;
579 vector<bool> m_status;
580 size_t m_iter;
581
582 void x_Copy2Pending(TCompartmentFinder& finder);
583 };
584
585
586
587 /// IMPLEMENTATION
588 /////////////////////////////////////////////////////////////////////////////
589
590
591 const double kPenaltyPerIntronBase = -2e-5; // a small penalty to prefer
592 // more compact models among equal
593
594 const double kPenaltyPerIntronPos = -1e-9; // a small penalty to favor uniform
595 // selection among equivalent chains
596
597 template<class THit>
UpdateMinMax()598 void CCompartmentFinder<THit>::CCompartment::UpdateMinMax()
599 {
600 typedef CHitFilter<THit> THitFilter;
601 THitFilter::s_GetSpan(m_members, m_box);
602 }
603
604
605 template<class THit>
GetStrand() const606 bool CCompartmentFinder<THit>::CCompartment::GetStrand() const {
607
608 if(m_members.size()) {
609 return m_members.front()->GetSubjStrand();
610 }
611 else {
612 NCBI_THROW( CAlgoAlignException,
613 eInternal,
614 "Strand requested on an empty compartment" );
615 }
616 }
617
618
619 template<class THit>
CCompartmentFinder(typename THitRefs::const_iterator start,typename THitRefs::const_iterator finish,CGapInfo<THit> * gap_info,CPrecalcGapInfo<THit> * prec_gap_info)620 CCompartmentFinder<THit>::CCompartmentFinder(
621 typename THitRefs::const_iterator start,
622 typename THitRefs::const_iterator finish,
623 CGapInfo<THit> *gap_info,
624 CPrecalcGapInfo<THit> *prec_gap_info ):
625
626 m_max_overlap(s_GetDefaultMaxOverlap()),
627 m_intron_max(s_GetDefaultMaxIntron()),
628 m_penalty(s_GetDefaultPenalty()),
629 m_MinMatches(1),
630 m_MinSingletonMatches(1),
631 m_iter(-1),
632 m_gap_info(gap_info),
633 m_prec_gap_info(prec_gap_info)
634 {
635 m_hitrefs.resize(finish - start);
636 copy(start, finish, m_hitrefs.begin());
637 }
638
639 // Query match accumulator
640 template<class THit>
641 class CQueryMatchAccumulator
642 {
643 public:
644
CQueryMatchAccumulator(void)645 CQueryMatchAccumulator(void): m_Finish(-1.0)
646 {}
647
operator ()(double acm,CRef<THit> ph)648 double operator() (double acm, CRef<THit> ph)
649 {
650 const typename THit::TCoord qmin (ph->GetQueryMin()),
651 qmax (ph->GetQueryMax());
652 if(qmin > m_Finish)
653 return acm + ph->GetIdentity() * ((m_Finish = qmax) - qmin + 1);
654 else {
655 if(qmax > m_Finish) {
656 const double finish0 (m_Finish);
657 return acm + ph->GetIdentity() * ((m_Finish = qmax) - finish0);
658 }
659 else {
660 return acm;
661 }
662 }
663 }
664
665 private:
666
667 double m_Finish;
668 };
669
670
671 template<class THit>
GetTotalMatches(const typename CCompartmentFinder<THit>::THitRefs & hitrefs0)672 double GetTotalMatches(
673 const typename CCompartmentFinder<THit>::THitRefs& hitrefs0)
674 {
675 typename CCompartmentFinder<THit>::THitRefs hitrefs (hitrefs0);
676
677 typedef CHitComparator<THit> THitComparator;
678 THitComparator sorter (THitComparator::eQueryMin);
679 stable_sort(hitrefs.begin(), hitrefs.end(), sorter);
680
681 const double rv (accumulate(hitrefs.begin(), hitrefs.end(), 0.0,
682 CQueryMatchAccumulator<THit>()));
683 return rv;
684 }
685
686
687 template<class THit>
Run(bool cross_filter)688 size_t CCompartmentFinder<THit>::Run(bool cross_filter)
689 {
690 const double kMinusInf (-1e12);
691
692 m_compartments.clear();
693 const int dimhits = m_hitrefs.size();
694 if(dimhits == 0) {
695 return 0;
696 }
697
698 // sort the hits to make sure that each hit is placed after:
699 // - hits from which to continue a compartment
700 // - hits from which to open a new compartment
701
702 typedef CHitComparator<THit> THitComparator;
703 stable_sort(m_hitrefs.begin(), m_hitrefs.end(),
704 THitComparator(THitComparator::eSubjMaxQueryMax));
705
706 // For every hit:
707 // - evaluate its best extension potential
708 // - evaluate its best potential to start a new compartment
709 // - select the best variant
710 // - update the hit status array
711
712 typedef vector<SHitStatus> THitData;
713 THitData hitstatus (dimhits);
714
715 const TCoord subj_min_global (m_hitrefs.front()->GetSubjMin());
716
717 int i_bestsofar (0);
718 for(int i (0); i < dimhits; ++i) {
719
720 const THitRef& h (m_hitrefs[i]);
721 const double identity (h->GetIdentity());
722 const typename THit::TCoord hbox [4] = {
723 h->GetQueryMin(), h->GetQueryMax(),
724 h->GetSubjMin(), h->GetSubjMax()
725 };
726
727 //#define CF_DBG_TRACE
728 #ifdef CF_DBG_TRACE
729 cerr << endl << *h << endl;
730 #endif
731
732 double best_ext_score (kMinusInf);
733 int i_best_ext (-1);
734
735 double best_open_score (identity*(hbox[1] - hbox[0] + 1) - m_penalty);
736 int i_best_open (-1); // each can be a start of the very first compartment
737
738 if(hbox[2] + m_max_overlap > m_hitrefs[i_bestsofar]->GetSubjMax()) {
739
740 const double score_open (identity*(hbox[1] - hbox[0] + 1)
741 + hitstatus[i_bestsofar].m_score
742 - m_penalty);
743
744 if(score_open > best_open_score) {
745 best_open_score = score_open;
746 i_best_open = i_bestsofar;
747 }
748 }
749 else {
750 // try every prior hit
751 for(int j (i - 1); j >= 0; --j) {
752
753 const double score_open (identity * (hbox[1] - hbox[0] + 1)
754 + hitstatus[j].m_score
755 - m_penalty);
756
757 if(score_open > best_open_score
758 && hbox[2] + m_max_overlap > m_hitrefs[j]->GetSubjMax())
759 {
760 best_open_score = score_open;
761 i_best_open = j;
762 }
763 }
764 }
765
766 for(int j = i; j >= 0; --j) {
767
768 THitRef phc;
769 typename THit::TCoord phcbox[4];
770 if(j != i) {
771
772 phc = m_hitrefs[j];
773 phcbox[0] = phc->GetQueryMin();
774 phcbox[1] = phc->GetQueryMax();
775 phcbox[2] = phc->GetSubjMin();
776 phcbox[3] = phc->GetSubjMax();
777
778 #ifdef CF_DBG_TRACE
779 cerr << '\t' << *phc << endl;
780 #endif
781 }
782 #ifdef CF_DBG_TRACE
783 else {
784 cerr << "\t[Dummy]" << endl;
785 }
786 #endif
787
788 // check if good for extension
789 {{
790 typename THit::TCoord q0, s0; // possible continuation
791 bool good (false);
792 int subj_space;
793 TCoord intron_start (0);
794
795 if(i != j) {
796
797 if( ( phcbox[1] < hbox[1] && phcbox[0] < hbox[0] )
798 && phcbox[2] <= hbox[2] //prohibit to extend if previous hit is inside the current in genomic coordinates
799 ) {
800
801 if(hbox[0] <= phcbox[1] &&
802 hbox[2] + phcbox[1] - hbox[0] >= phcbox[3]) {
803
804 q0 = phcbox[1] + 1;
805 s0 = hbox[2] + phcbox[1] - hbox[0];
806 }
807 else if(phcbox[3] >= hbox[2]) {
808
809 s0 = phcbox[3] + 1;
810 q0 = hbox[1] - (hbox[3] - s0);
811 }
812 else {
813
814 q0 = hbox[0];
815 s0 = hbox[2];
816 }
817 subj_space = s0 - phcbox[3] - 1;
818
819
820 // max run of spaces inside an exon - NOW DISABLED
821 // example: NM_021645.4 vs NC_000013.9: 51.4 - 51.51M
822 //const TCoord max_gap (100);
823
824 good = (subj_space <= int(m_intron_max))
825 && (s0 < hbox[3] && q0 < hbox[1]);
826
827 //prohibit to go over non-bridgable gaps in subject
828 if(good) {
829 if( phcbox[3] <= hbox[2] ) {
830 if( ( m_prec_gap_info && m_prec_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ||
831 ( m_gap_info && m_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ) {
832 good = false;
833 }
834 }
835 }
836
837 if(good) {
838 intron_start = phcbox[3];
839 }
840 }
841 }
842
843 if(good) {
844
845 const double jscore (hitstatus[j].m_score);
846 const double intron_penalty ((subj_space > 0)?
847 (kPenaltyPerIntronPos * (intron_start - subj_min_global)
848 + subj_space * kPenaltyPerIntronBase): 0.0);
849
850 const double ext_score (jscore +
851 identity * (hbox[1] - q0 + 1) + intron_penalty);
852
853 if(ext_score > best_ext_score) {
854 best_ext_score = ext_score;
855 i_best_ext = j;
856 }
857 #ifdef CF_DBG_TRACE
858 cerr << "\tGood for extension with score = " << ext_score << endl;
859 #endif
860 }
861 }}
862 }
863
864 typename SHitStatus::EType hit_type;
865 int prev_hit;
866 double best_score;
867 if(best_ext_score > best_open_score) {
868
869 hit_type = SHitStatus::eExtension;
870 prev_hit = i_best_ext;
871 best_score = best_ext_score;
872 }
873 else {
874
875 hit_type = SHitStatus::eOpening;
876 prev_hit = i_best_open;
877 best_score = best_open_score;
878 }
879
880 hitstatus[i].m_type = hit_type;
881 hitstatus[i].m_prev = prev_hit;
882 hitstatus[i].m_score = best_score;
883
884 if(best_score > hitstatus[i_bestsofar].m_score) {
885 i_bestsofar = i;
886 }
887
888 #ifdef CF_DBG_TRACE
889 cerr << "Status = " << ((hit_type == SHitStatus::eOpening)? "Open": "Extend")
890 << '\t' << best_score << endl;
891 if(prev_hit == -1) {
892 cerr << "[Dummy]" << endl;
893 }
894 else {
895 cerr << *(m_hitrefs[prev_hit]) << endl;
896 }
897 #endif
898 }
899
900 #ifdef CF_DBG_TRACE
901 cerr << "\n\n--- BACKTRACE ---\n";
902 #endif
903
904 // *** backtrace ***
905 // - trace back the chain with the highest score
906 const double score_best (hitstatus[i_bestsofar].m_score);
907 const double min_matches (m_MinSingletonMatches < m_MinMatches?
908 m_MinSingletonMatches: m_MinMatches);
909
910 vector<bool> comp_status;
911 if(score_best + m_penalty >= min_matches) {
912
913 int i (i_bestsofar);
914 bool new_compartment (true);
915 THitRefs hitrefs;
916 while(i != -1) {
917
918 if(new_compartment) {
919
920 const double mp (GetTotalMatches<THit>(hitrefs));
921 if(mp > 0) {
922 // save the current compartment
923 m_compartments.push_back(CCompartment());
924 m_compartments.back().SetMembers() = hitrefs;
925 comp_status.push_back(mp >= m_MinMatches);
926 }
927
928 hitrefs.resize(0);
929 new_compartment = false;
930 }
931 hitrefs.push_back(m_hitrefs[i]);
932
933 #ifdef CF_DBG_TRACE
934 cerr << *(m_hitrefs[i]) << endl;
935 #endif
936
937 if(hitstatus[i].m_type == SHitStatus::eOpening) {
938 new_compartment = true;
939 }
940 i = hitstatus[i].m_prev;
941 }
942
943 const double mp (GetTotalMatches<THit>(hitrefs));
944 if(mp > 0) {
945 bool status ( ( m_compartments.size() == 0 && mp >= m_MinSingletonMatches )
946 || mp >= m_MinMatches );
947 m_compartments.push_back(CCompartment());
948 m_compartments.back().SetMembers() = hitrefs;
949 comp_status.push_back(status);
950 }
951 }
952
953 if(cross_filter && m_compartments.size()) {
954
955 const TCoord kMinCompartmentHitLength (8);
956
957 // partial x-filtering using compartment hits only
958 for(size_t icn (m_compartments.size()), ic (0); ic < icn; ++ic) {
959
960 CCompartment & comp (m_compartments[ic]);
961 THitRefs& hitrefs (comp.SetMembers());
962 size_t nullified (0);
963 for(int in (hitrefs.size()), i (in - 1); i > 0; --i) {
964
965 int j1 (i);
966 while(j1 < in && hitrefs[j1].IsNull()) ++j1;
967 if(j1 == in) continue;
968
969 THitRef& h1 (hitrefs[j1]);
970 THitRef& h2 (hitrefs[i-1]);
971
972 if(h1.IsNull()) continue;
973
974 const TCoord * box1o (h1->GetBox());
975 TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
976 const TCoord * box2o (h2->GetBox());
977 TCoord box2 [4] = {box2o[0], box2o[1], box2o[2], box2o[3]};
978
979 const int qd (box1[1] - box2[0] + 1);
980 const int sd (box1[3] - box2[2] + 1);
981 if(qd > sd && qd > 0) {
982
983 if(box2[0] <= box1[0] + kMinCompartmentHitLength) {
984
985 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
986 if(i + 1 == in) {
987 TCoord new_coord (box1[0] + (box1[1] - box1[0])/2);
988 if(box1[0] + 1 <= new_coord) {
989 --new_coord;
990 }
991 else {
992 new_coord = box1[0];
993 }
994 h1->Modify(1, new_coord);
995 }
996 else
997 #endif
998 {
999 h1.Reset(0);
1000 ++nullified;
1001 }
1002 }
1003 else {
1004 h1->Modify(1, box2[0] - 1);
1005 }
1006
1007 if(box2[1] <= box1[1] + kMinCompartmentHitLength) {
1008
1009 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
1010 if(i == 1) {
1011 TCoord new_coord (box2[0] + (box2[1] - box2[0])/2);
1012 if(box2[1] >= new_coord + 1) {
1013 ++new_coord;
1014 }
1015 else {
1016 new_coord = box2[1];
1017 }
1018 h2->Modify(0, new_coord);
1019 }
1020 else
1021 #endif
1022 {
1023 h2.Reset(0);
1024 ++nullified;
1025 }
1026 }
1027 else {
1028 h2->Modify(0, box1[1] + 1);
1029 }
1030 }
1031 else if (sd > 0) {
1032
1033 if(box2[2] <= box1[2] + kMinCompartmentHitLength) {
1034
1035 if(i + 1 == in) {
1036 TCoord new_coord (box1[2] + (box1[3] - box1[2])/2);
1037 if(box1[2] + 1 <= new_coord) {
1038 --new_coord;
1039 }
1040 else {
1041 new_coord = box1[2];
1042 }
1043 h1->Modify(3, new_coord);
1044 }
1045 else {
1046 h1.Reset(0);
1047 ++nullified;
1048 }
1049 }
1050 else {
1051 h1->Modify(3, box2[2] - 1);
1052 }
1053
1054 if(box2[3] <= box1[3] + kMinCompartmentHitLength) {
1055
1056 if(i == 1) {
1057 TCoord new_coord (box2[2] + (box2[3] - box2[2])/2);
1058 if(box2[3] >= new_coord + 1) {
1059 ++new_coord;
1060 }
1061 else {
1062 new_coord = box2[3];
1063 }
1064
1065 h2->Modify(2, new_coord);
1066 }
1067 else {
1068 h2.Reset(0);
1069 ++nullified;
1070 }
1071 }
1072 else {
1073 h2->Modify(2, box1[3] + 1);
1074 }
1075 }
1076 }
1077
1078 if(nullified > 0) {
1079 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1080 CCompartmentFinder<THit>::s_PNullRef),
1081 hitrefs.end());
1082 }
1083 }
1084
1085 #define ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1086 #ifdef ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1087
1088 typename THitRefs::iterator ihr_b (m_hitrefs.begin()),
1089 ihr_e(m_hitrefs.end()), ihr (ihr_b);
1090
1091 stable_sort(ihr_b, ihr_e, THitComparator(THitComparator::eSubjMinSubjMax));
1092
1093 // complete x-filtering using the full set of hits
1094 for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1095
1096 CCompartment & comp (m_compartments[ic]);
1097 THitRefs& hitrefs (comp.SetMembers());
1098
1099 if(hitrefs.size() < 3) {
1100 continue;
1101 }
1102 else {
1103 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1104 (*ii)->SetScore(- (*ii)->GetScore());
1105 }
1106 hitrefs.front()->SetEValue(-1);
1107 hitrefs.back()->SetEValue(-1);
1108 }
1109
1110 const TCoord comp_subj_min (hitrefs.back()->GetSubjStart());
1111 const TCoord comp_subj_max (hitrefs.front()->GetSubjStop());
1112 while(ihr != ihr_e && ((*ihr)->GetSubjStart() < comp_subj_min
1113 || (*ihr)->GetScore() < 0))
1114 {
1115 ++ihr;
1116 }
1117
1118 if(ihr == ihr_e) break;
1119 if((*ihr)->GetSubjStart() > comp_subj_max) continue;
1120 typename THitRefs::iterator ihrc_b (ihr);
1121
1122 typename THitRefs::iterator ihrc_e (ihr);
1123 while(ihrc_e != ihr_e && ((*ihrc_e)->GetSubjStart() < comp_subj_max
1124 || (*ihrc_e)->GetScore() < 0))
1125 {
1126 ++ihrc_e;
1127 }
1128
1129 size_t nullified (sx_XFilter(hitrefs, ihrc_b, ihrc_e, 1,
1130 kMinCompartmentHitLength));
1131 stable_sort(ihrc_b, ihrc_e, THitComparator (THitComparator::eQueryMinQueryMax));
1132
1133 nullified += sx_XFilter(hitrefs, ihrc_b, ihrc_e, 0,
1134 kMinCompartmentHitLength);
1135
1136 ihr = ihrc_e;
1137
1138 if(nullified > 0) {
1139 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1140 CCompartmentFinder<THit>::s_PNullRef),
1141 hitrefs.end());
1142 }
1143 }
1144
1145 for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1146
1147 CCompartment & comp (m_compartments[ic]);
1148 THitRefs& hitrefs (comp.SetMembers());
1149 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1150 const double score ((*ii)->GetScore());
1151 if(score < 0) {
1152 (*ii)->SetScore(-score);
1153 }
1154 const double eval ((*ii)->GetEValue());
1155 if(eval < 0) {
1156 (*ii)->SetEValue(0);
1157 }
1158 }
1159 }
1160 #endif
1161 }
1162
1163 // mask out compartments with low identity
1164 for(size_t i (0), in (m_compartments.size()); i < in; ++i) {
1165 if(comp_status[i] == false) {
1166
1167 THitRefs & hitrefs (m_compartments[i].SetMembers());
1168 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1169 THitRef & hr (*ii);
1170 hr->SetScore(-hr->GetScore());
1171 }
1172 }
1173 }
1174
1175 return m_compartments.size();
1176 }
1177
1178
1179 template<class THit>
sx_XFilter(THitRefs & hitrefs,typename THitRefs::iterator ihr,typename THitRefs::iterator ihr_e,Uint1 w,size_t min_compartment_hit_len)1180 size_t CCompartmentFinder<THit>::sx_XFilter(
1181 THitRefs& hitrefs,
1182 typename THitRefs::iterator ihr,
1183 typename THitRefs::iterator ihr_e,
1184 Uint1 w,
1185 size_t min_compartment_hit_len)
1186 {
1187 size_t nullified (0);
1188 for(int in (hitrefs.size()), i (in - 2); i > 0 && ihr != ihr_e; --i) {
1189
1190 THitRef& h1 (hitrefs[i]);
1191 if(h1.IsNull()) continue;
1192
1193 const TCoord * box1o (h1->GetBox());
1194 TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
1195
1196 // locate the first overlap
1197 for(; ihr != ihr_e; ++ihr) {
1198
1199 THitRef hr (*ihr);
1200 if(hr.IsNull()) continue;
1201 if(hr->GetScore() > 0 && hr->GetStop(w) >= box1[2*w]) {
1202 break;
1203 }
1204 }
1205
1206 if(ihr == ihr_e) {
1207 break;
1208 }
1209
1210 // find the smallest not overlapped hit coord and its interval
1211 TCoord ls0 (box1[2*w+1] + 1), cursegmax (0);
1212 TCoord segmax_start(box1[2*w]), segmax_stop(box1[2*w+1]);
1213
1214 if(box1[2*w] < (*ihr)->GetStart(w)) {
1215
1216 segmax_start = ls0 = box1[2*w];
1217 segmax_stop = (*ihr)->GetStart(w) - 1;
1218 cursegmax = segmax_stop - segmax_start + 1;
1219 }
1220 else {
1221
1222 TCoord shrsmax ((*ihr)->GetStop(w));
1223 for(++ihr; ihr != ihr_e; ++ihr) {
1224
1225 THitRef hr (*ihr);
1226 if(hr.IsNull() || hr->GetScore() < 0) {
1227 continue;
1228 }
1229
1230 const TCoord hrs0 (hr->GetStart(w));
1231 if(hrs0 > box1[2*w+1]) {
1232 segmax_stop = box1[2*w+1];
1233 break;
1234 }
1235
1236 if(hrs0 > shrsmax) {
1237 segmax_stop = hrs0 - 1;
1238 break;
1239 }
1240
1241 const TCoord hrs1 (hr->GetStop(w));
1242 if(hrs1 > shrsmax) {
1243 shrsmax = hrs1;
1244 }
1245 }
1246
1247 if(shrsmax < box1[2*w+1]) {
1248 segmax_start = ls0 = shrsmax + 1;
1249 cursegmax = segmax_stop - segmax_start + 1;
1250 }
1251 }
1252
1253 if(ls0 > box1[2*w+1]) {
1254 h1.Reset(0);
1255 ++nullified;
1256 continue;
1257 }
1258
1259 // find the longest surviving segment
1260 for(; ihr != ihr_e; ++ihr) {
1261
1262 THitRef hr (*ihr);
1263 if(hr.IsNull() || hr->GetScore() < 0) {
1264 continue;
1265 }
1266
1267 const TCoord hrs0 (hr->GetStart(w));
1268 if(hrs0 > box1[2*w+1]) {
1269 if(ls0 <= box1[2*w+1] && box1[2*w+1] + 1 - ls0 > cursegmax) {
1270 segmax_start = ls0;
1271 segmax_stop = box1[2*w+1];
1272 cursegmax = segmax_stop - segmax_start + 1;
1273 }
1274 break;
1275 }
1276
1277 if(hrs0 > ls0) {
1278 if(hrs0 - ls0 > cursegmax) {
1279 segmax_start = ls0;
1280 segmax_stop = hrs0 - 1;
1281 cursegmax = segmax_stop - segmax_start + 1;
1282 }
1283
1284 ls0 = hr->GetStop(w) + 1;
1285 }
1286 else if(hr->GetStop(w) + 1 > ls0) {
1287 ls0 = hr->GetStop(w) + 1;
1288 }
1289 }
1290
1291 if(box1[2*w+1] > ls0 + cursegmax) {
1292 segmax_start = ls0;
1293 segmax_stop = box1[2*w+1];
1294 cursegmax = segmax_stop - segmax_start + 1;
1295 }
1296
1297 if(cursegmax < min_compartment_hit_len) {
1298 h1.Reset(0);
1299 ++nullified;
1300 continue;
1301 }
1302 else {
1303
1304 if(box1[2*w] < segmax_start) {
1305 h1->Modify(2 * w, segmax_start);
1306 }
1307
1308 if(segmax_stop < box1[2*w+1]) {
1309 h1->Modify(2 * w + 1, segmax_stop);
1310 }
1311 }
1312
1313 if(ihr == ihr_e) break;
1314 }
1315
1316 return nullified;
1317 }
1318
1319
1320 template<class THit>
1321 typename CCompartmentFinder<THit>::CCompartment*
GetFirst()1322 CCompartmentFinder<THit>::GetFirst()
1323 {
1324 if(m_compartments.size()) {
1325 m_iter = 0;
1326 return &m_compartments[m_iter++];
1327 }
1328 else {
1329 m_iter = -1;
1330 return 0;
1331 }
1332 }
1333
1334 template<class THit>
OrderCompartments(void)1335 void CCompartmentFinder<THit>::OrderCompartments(void) {
1336
1337 for(size_t i = 0, dim = m_compartments.size(); i < dim; ++i) {
1338 m_compartments[i].UpdateMinMax();
1339 }
1340
1341 stable_sort(m_compartments.begin(), m_compartments.end(),
1342 CCompartmentFinder<THit>::CCompartment::s_PLowerSubj);
1343 }
1344
1345 template<class THit>
1346 typename CCompartmentFinder<THit>::CCompartment*
GetNext()1347 CCompartmentFinder<THit>::GetNext()
1348 {
1349 const size_t dim = m_compartments.size();
1350 if(m_iter == -1 || m_iter >= int(dim)) {
1351 m_iter = -1;
1352 return 0;
1353 }
1354 else {
1355 return &m_compartments[m_iter++];
1356 }
1357 }
1358
1359
1360 template<class THit>
CCompartmentAccessor(TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter)1361 CCompartmentAccessor<THit>::CCompartmentAccessor(TCoord comp_penalty,
1362 TCoord min_matches,
1363 TCoord min_singleton_matches,
1364 bool cross_filter)
1365 {
1366 x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1367 }
1368
1369
1370 template<class THit>
CCompartmentAccessor(typename THitRefs::iterator istart,typename THitRefs::iterator ifinish,TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter,CScope * scope,const vector<pair<TCoord,TCoord>> * gaps)1371 CCompartmentAccessor<THit>::CCompartmentAccessor(
1372 typename THitRefs::iterator istart,
1373 typename THitRefs::iterator ifinish,
1374 TCoord comp_penalty,
1375 TCoord min_matches,
1376 TCoord min_singleton_matches,
1377 bool cross_filter,
1378 CScope *scope,
1379 const vector<pair<TCoord, TCoord> > *gaps)
1380 {
1381 x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1382 Run(istart, ifinish, scope, gaps);
1383 }
1384
1385
1386 template<class THit>
x_Init(TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter)1387 void CCompartmentAccessor<THit>::x_Init(TCoord comp_penalty,
1388 TCoord min_matches,
1389 TCoord min_singleton_matches,
1390 bool cross_filter)
1391 {
1392 m_Penalty = comp_penalty;
1393 m_MinMatches = min_matches;
1394 m_MinSingletonMatches = min_singleton_matches;
1395 m_CrossFiltering = cross_filter;
1396 m_MaxIntron = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
1397 m_MaxOverlap = CCompartmentFinder<THit>::s_GetDefaultMaxOverlap();
1398 }
1399
1400
1401 template<class THit>
Run(typename THitRefs::iterator istart,typename THitRefs::iterator ifinish,CScope * scope,const vector<pair<TCoord,TCoord>> * gaps)1402 void CCompartmentAccessor<THit>::Run(typename THitRefs::iterator istart,
1403 typename THitRefs::iterator ifinish,
1404 CScope *scope,
1405 const vector<pair<TCoord, TCoord> > *gaps)
1406 {
1407
1408 if(istart == ifinish) return;
1409
1410 const TCoord kMax_TCoord (numeric_limits<TCoord>::max());
1411
1412 // separate strands for CompartmentFinder
1413 typename THitRefs::iterator ib = istart, ie = ifinish, ii = ib,
1414 iplus_beg = ie;
1415 typedef CHitComparator<THit> THitComparator;
1416 THitComparator sorter (THitComparator::eSubjStrand);
1417 stable_sort(ib, ie, sorter);
1418
1419 TCoord minus_subj_min = kMax_TCoord, minus_subj_max = 0;
1420 for(ii = ib; ii != ie; ++ii) {
1421 if((*ii)->GetSubjStrand()) {
1422 iplus_beg = ii;
1423 break;
1424 }
1425 else {
1426 if((*ii)->GetSubjMin() < minus_subj_min) {
1427 minus_subj_min = (*ii)->GetSubjMin();
1428 }
1429 if((*ii)->GetSubjMax() > minus_subj_max) {
1430 minus_subj_max = (*ii)->GetSubjMax();
1431 }
1432 }
1433 }
1434
1435
1436 // minus
1437 {{
1438 CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1439 CPrecalcGapInfo<THit> prec_gapi( gaps );
1440 // flip
1441 gapi.Flip(minus_subj_min, minus_subj_max);
1442 prec_gapi.Flip(minus_subj_min, minus_subj_max);
1443 for(ii = ib; ii != iplus_beg; ++ii) {
1444
1445 const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1446 - (*ii)->GetSubjMax();
1447 const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1448 - (*ii)->GetSubjMin();
1449 (*ii)->SetSubjStart(s0);
1450 (*ii)->SetSubjStop(s1);
1451 }
1452
1453 CCompartmentFinder<THit> finder (ib, iplus_beg, &gapi, &prec_gapi);
1454 finder.SetPenalty(m_Penalty);
1455 finder.SetMinMatches(m_MinMatches);
1456 finder.SetMinSingletonMatches(m_MinSingletonMatches);
1457 finder.SetMaxIntron(m_MaxIntron);
1458 finder.SetMaxOverlap(m_MaxOverlap);
1459 finder.Run(m_CrossFiltering);
1460
1461 // un-flip
1462 for(ii = ib; ii != iplus_beg; ++ii) {
1463
1464 const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1465 - (*ii)->GetSubjMax();
1466 const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1467 - (*ii)->GetSubjMin();
1468 (*ii)->SetSubjStart(s1);
1469 (*ii)->SetSubjStop(s0);
1470 }
1471
1472 x_Copy2Pending(finder);
1473 }}
1474
1475 // plus
1476 {{
1477 CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1478 CPrecalcGapInfo<THit> prec_gapi( gaps );
1479 CCompartmentFinder<THit> finder (iplus_beg, ie, &gapi, &prec_gapi);
1480 finder.SetPenalty(m_Penalty);
1481 finder.SetMinMatches(m_MinMatches);
1482 finder.SetMinSingletonMatches(m_MinSingletonMatches);
1483 finder.SetMaxIntron(m_MaxIntron);
1484 finder.SetMaxOverlap(m_MaxOverlap);
1485 finder.Run(m_CrossFiltering);
1486 x_Copy2Pending(finder);
1487 }}
1488 }
1489
1490
1491 template<class THit>
x_Copy2Pending(CCompartmentFinder<THit> & finder)1492 void CCompartmentAccessor<THit>::x_Copy2Pending(
1493 CCompartmentFinder<THit>& finder)
1494 {
1495 finder.OrderCompartments();
1496
1497 typedef typename CCompartmentFinder<THit>::THitRef THitRef;
1498
1499 // copy to pending
1500 for(typename CCompartmentFinder<THit>::CCompartment* compartment =
1501 finder.GetFirst(); compartment; compartment = finder.GetNext()) {
1502
1503 if(compartment->GetMembers().size() > 0) {
1504
1505 m_pending.push_back(THitRefs(0));
1506 THitRefs& vh = m_pending.back();
1507
1508 for(THitRef ph (compartment->GetFirst()); ph; ph = compartment->GetNext())
1509 {
1510 vh.push_back(ph);
1511 }
1512
1513 const TCoord* box = compartment->GetBox();
1514 m_ranges.push_back(box[0]);
1515 m_ranges.push_back(box[1]);
1516 m_ranges.push_back(box[2]);
1517 m_ranges.push_back(box[3]);
1518
1519 m_strands.push_back(compartment->GetStrand());
1520 m_status.push_back(compartment->GetFirst()->GetScore() > 0);
1521 }
1522 }
1523 }
1524
1525
1526 template<class THit>
GetFirst(THitRefs & compartment)1527 bool CCompartmentAccessor<THit>::GetFirst(THitRefs& compartment) {
1528
1529 m_iter = 0;
1530 return GetNext(compartment);
1531 }
1532
1533
1534 template<class THit>
GetNext(THitRefs & compartment)1535 bool CCompartmentAccessor<THit>::GetNext(THitRefs& compartment) {
1536
1537 compartment.clear();
1538 if(m_iter < m_pending.size()) {
1539 compartment = m_pending[m_iter++];
1540 return true;
1541 }
1542 else {
1543 return false;
1544 }
1545 }
1546
1547
1548 template<class THit>
AsSeqAlignSet(void) const1549 CRef<objects::CSeq_align_set> CCompartmentAccessor<THit>::AsSeqAlignSet(void) const
1550 {
1551 USING_SCOPE(objects);
1552
1553 CRef<CSeq_align_set> rv (new CSeq_align_set);
1554
1555 // retrieve the ids
1556 CRef<CSeq_id> seqid_query (new CSeq_id), seqid_subj (new CSeq_id);
1557 if(m_pending.size()) {
1558 if(m_pending.front().size()) {
1559 const THit & h (*m_pending.front().front());
1560 seqid_query->Assign(*(h.GetQueryId()));
1561 seqid_subj->Assign(*(h.GetSubjId()));
1562 }
1563 }
1564
1565 CSeq_align_set::Tdata& sas1_data (rv->Set());
1566
1567 for(size_t i(0), idim(m_pending.size()); i < idim; ++i) {
1568
1569 if(m_status[i]) {
1570
1571 // note the range
1572 TCoord range_min (i > 0 && m_strands[i-1] == m_strands[i]
1573 ? m_ranges[4*i - 1]
1574 : 0);
1575
1576 TCoord range_max (i + 1 < idim && m_strands[i+1] == m_strands[i]
1577 ? m_ranges[4*i + 6]
1578 : numeric_limits<TCoord>::max());
1579
1580 CRef<CSeq_align> sa2 (new CSeq_align);
1581 sa2->SetType(CSeq_align::eType_disc);
1582
1583 CSeq_align::TBounds & bounds (sa2->SetBounds());
1584 const ENa_strand query_strand (eNa_strand_plus);
1585
1586 const ENa_strand subj_strand (m_strands[i]? eNa_strand_plus:
1587 eNa_strand_minus);
1588 CRef<CSeq_loc> seq_loc (new CSeq_loc (*seqid_subj, range_min,
1589 range_max, subj_strand));
1590 bounds.push_back(seq_loc);
1591
1592 // add the hits
1593 CSeq_align_set::Tdata& sas2_data (sa2->SetSegs().SetDisc().Set());
1594 ITERATE(typename THitRefs, ii, m_pending[i]) {
1595
1596 const THit& h (**ii);
1597 CRef<CSeq_align> sa3 (new CSeq_align);
1598 sa3->SetType(CSeq_align::eType_global);
1599 const string& transcript(h.GetTranscript());
1600 if (transcript.empty()) {
1601 CRef<CStd_seg> sg(new CStd_seg);
1602 sa3->SetSegs().SetStd().push_back(sg);
1603 sg->SetDim(2);
1604 CRef<CSeq_loc> query_loc(new CSeq_loc(
1605 *seqid_query, h.GetQueryMin(), h.GetQueryMax(), query_strand));
1606 CRef<CSeq_loc> subj_loc(new CSeq_loc(
1607 *seqid_subj, h.GetSubjMin(), h.GetSubjMax(), subj_strand));
1608 sg->SetLoc().push_back(query_loc);
1609 sg->SetLoc().push_back(subj_loc);
1610 } else {
1611 CDense_seg & ds (sa3->SetSegs().SetDenseg());
1612 ds.FromTranscript(h.GetQueryStart(), query_strand,
1613 h.GetSubjStart(), subj_strand,
1614 transcript);
1615 ds.SetIds().push_back(seqid_query);
1616 ds.SetIds().push_back(seqid_subj);
1617 }
1618 sas2_data.push_back(sa3);
1619 }
1620
1621 // save into the seq-align-set
1622 sas1_data.push_back(sa2);
1623 }
1624 }
1625
1626 return rv;
1627 }
1628
1629
1630
1631 END_NCBI_SCOPE
1632
1633
1634 #endif
1635