1 #ifndef ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
2 #define ALGO_ALIGN_UTIL_COMPARTMENT_FINDER__HPP
3 
4 /* $Id: compartment_finder.hpp 617919 2020-10-08 11:57:15Z gouriano $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author:  Yuri Kapustin, Alexander Souvorov, Boris Kiryutin
30 *
31 * File Description:
32 *
33 *   cDNA-to-genomic compartmentization algortihms.
34 *   The task of compartmentization is to identify alternative placements
35 *   of a transcript on a genomic sequence.
36 *
37 *   CCompartmentFinder      Identification of genomic compartments.
38 *   CCompartmentAccessor    An aggregator class to CCompartmentFinder.
39 *
40 * ===========================================================================
41 */
42 
43 
44 #include <corelib/ncbi_limits.hpp>
45 
46 #include <algo/align/nw/align_exception.hpp>
47 #include <algo/align/util/hit_filter.hpp>
48 
49 #include <objects/seqalign/Seq_align_set.hpp>
50 #include <objects/seqalign/Seq_align.hpp>
51 #include <objects/seqalign/Std_seg.hpp>
52 #include <objects/seqalign/Dense_seg.hpp>
53 #include <objects/seqloc/Seq_loc.hpp>
54 #include <objects/seqloc/Na_strand.hpp>
55 #include <objects/seq/Seq_literal.hpp>
56 
57 #include <objmgr/seq_map.hpp>
58 #include <objmgr/scope.hpp>
59 #include <objmgr/seq_map_ci.hpp>
60 
61 #include <algorithm>
62 #include <numeric>
63 #include <vector>
64 
65 
66 BEGIN_NCBI_SCOPE
67 
68 USING_SCOPE(objects);
69 
70 template<class THit>
71 class CGapInfo {
72 public:
73     typedef typename THit::TCoord TCoord;
74 
CGapInfo(const CSeq_id & id,CScope * scope)75     CGapInfo(const CSeq_id& id, CScope *scope) {
76         m_scope = scope;
77         m_flipped = false;
78         if(scope) {
79             m_seq_map = & (scope->GetBioseqHandle(id).GetSeqMap() );
80         }
81     }
82 
Flip(TCoord subj_min,TCoord subj_max)83     void Flip(TCoord subj_min, TCoord subj_max) {
84         m_flipped = true;
85         m_subj_min = subj_min;
86         m_subj_max = subj_max;
87     }
88 
IntersectsNonBridgeableGap(TCoord from,TCoord to)89     bool IntersectsNonBridgeableGap(TCoord from, TCoord to) {
90         if(m_flipped) {
91             TCoord new_from = m_subj_min + m_subj_max - to;
92             TCoord new_to = m_subj_min + m_subj_max - from;
93             from = new_from;
94             to = new_to;
95         }
96         bool res = false;
97         if(m_scope && m_seq_map) {
98             CSeqMap_CI it = m_seq_map->ResolvedRangeIterator(m_scope,  from, to - from + 1, eNa_strand_plus, size_t(-1), CSeqMap::fFindGap);
99             for( ; it; ++it) {
100                 CConstRef<CSeq_literal> lit = it.GetRefGapLiteral();
101                 if(lit && lit->GetBridgeability() == CSeq_literal::e_NotBridgeable) {
102                     res = true;
103                     break;
104                 }
105             }
106         }
107         return res;
108     }
109 
110 private:
111     CScope *m_scope;
112     bool m_flipped;
113     TCoord m_subj_min;
114     TCoord m_subj_max;
115     CConstRef<CSeqMap> m_seq_map;
116 };
117 
118 
119 template<class THit>
120 class CPrecalcGapInfo {
121 public:
122     typedef typename THit::TCoord TCoord;
123 
CPrecalcGapInfo(const vector<pair<TCoord,TCoord>> * gaps)124     CPrecalcGapInfo(const vector<pair<TCoord, TCoord> >* gaps) : m_gaps(gaps) {
125         m_flipped = false;
126     }
127 
Flip(TCoord subj_min,TCoord subj_max)128     void Flip(TCoord subj_min, TCoord subj_max) {
129         m_flipped = true;
130         m_subj_min = subj_min;
131         m_subj_max = subj_max;
132     }
133 
IntersectsNonBridgeableGap(TCoord from,TCoord to)134     bool IntersectsNonBridgeableGap(TCoord from, TCoord to) {
135         if(m_gaps == NULL) {
136             return false;
137         }
138         if(m_flipped) {
139             TCoord new_from = m_subj_min + m_subj_max - to;
140             TCoord new_to = m_subj_min + m_subj_max - from;
141             from = new_from;
142             to = new_to;
143         }
144         bool res = false;
145         for(size_t i = 0; i < m_gaps->size(); ++i) {
146             TCoord gfrom = (*m_gaps)[i].first;
147             TCoord gto = (*m_gaps)[i].second;
148             if( gfrom < to && from < gto ) {
149                 res = true;
150                 break;
151             }
152         }
153         return res;
154     }
155 
156 private:
157     bool m_flipped;
158     TCoord m_subj_min;
159     TCoord m_subj_max;
160     const vector<pair<TCoord, TCoord> > *m_gaps;
161 };
162 
163 
164 template<class THit>
165 class CCompartmentFinder {
166 
167 public:
168 
169     typedef CRef<THit>            THitRef;
170     typedef vector<THitRef>       THitRefs;
171     typedef typename THit::TCoord TCoord;
172 
173     /// Create the object from the complete set of local alignments (hits)
174     /// between the transcript and the genomic sequences.
175     /// The hits are assumed to share the same query and subject and
176     /// to have plus strand on both sequences.
177     ///
178     /// @param start
179     ///    Start of the input set of alignments.
180     /// @param finish
181     ///    End of the input set of alignments.
182 
183     CCompartmentFinder(typename THitRefs::const_iterator start,
184                        typename THitRefs::const_iterator finish,
185                        CGapInfo<THit> *gap_info = NULL, CPrecalcGapInfo<THit> *prec_gap_info = NULL);
186 
187     /// Set compartment overlap behaviour. Allow compartments to overlap
188     /// on subject.
189     ///
190     /// @param max_overlap
191     ///    Maximum length of overlap on subject.
192     ///    0 - do not allow overlap.
193 
SetMaxOverlap(TCoord max_overlap)194     void SetMaxOverlap(TCoord max_overlap) {
195         m_max_overlap = max_overlap;
196     }
197 
198     /// Retrieve the default compartment overlap behaviour (no overlap).
199 
s_GetDefaultMaxOverlap(void)200     static TCoord s_GetDefaultMaxOverlap(void) {
201         return 0;
202     }
203 
204     /// Set the maximum length of an intron
205     ///
206     /// @param intr_max
207     ///    Maximum length of an intron, in base pairs.
208 
SetMaxIntron(TCoord intr_max)209     void SetMaxIntron(TCoord intr_max) {
210         m_intron_max = intr_max;
211     }
212 
213     /// Retrieve the default maximum length of an intron
214 
s_GetDefaultMaxIntron(void)215     static TCoord s_GetDefaultMaxIntron(void) {
216                         // Some examples:
217                         //    NM_147181.3 vs NC_000004.10 (~1.1M)
218         return 1200000; //    NM_001128929.1 vs NC_000003.10 (~1.2M)
219     }
220 
221 
222     /// Set the penalty to open a compartment
223     ///
224     /// @param penalty
225     ///    Compartment opening penalty, in base pairs.
226 
SetPenalty(TCoord penalty)227     void SetPenalty(TCoord penalty) {
228         m_penalty = penalty;
229     }
230 
231     /// Set the minimum matching residues per compartment.
232     ///
233     /// @param min_matches
234     ///    The min number of matching residues in base pairs
235 
SetMinMatches(TCoord min_matches)236     void SetMinMatches(TCoord min_matches) {
237         m_MinSingletonMatches = m_MinMatches = min_matches;
238     }
239 
240     /// Set the minimum matching residues per singleton compartment.
241     ///
242     /// @param min_matches
243     ///    The min number of matching residues in base pairs
244 
SetMinSingletonMatches(TCoord min_matches)245     void SetMinSingletonMatches(TCoord min_matches) {
246         m_MinSingletonMatches = min_matches;
247     }
248 
249     /// Retrieve the default compartment penalty, in base pairs
250 
s_GetDefaultPenalty(void)251     static TCoord s_GetDefaultPenalty(void) {
252         return 500;
253     }
254 
255     /// Retrieve the default minimum coverage, in base pairs
256 
s_GetDefaultMinCoverage(void)257     static TCoord s_GetDefaultMinCoverage(void) {
258         return 500;
259     }
260 
261     /// Identify compartments.
262     ///
263     /// @param  cross_filter
264     ///    When activated, cross filtering will ensure that only
265     ///    the alignments that provided non-ambguous mapping between
266     ///    the sequences will be reported in the output.
267 
268     size_t Run(bool cross_filter = false); // do the compartment search
269 
270     ///  Order compartments by lower subj coordinate
271 
272     void OrderCompartments(void);
273 
274     /// Individual compartment representation.
275     /// Compartment members are the hits comprising the compartment.
276 
277     class CCompartment {
278 
279     public:
280 
281         /// Create an empty compartment
282 
CCompartment(void)283         CCompartment(void) {
284             m_box[0] = m_box[2] = numeric_limits<TCoord>::max();
285             m_box[1] = m_box[3] = 0;
286         }
287 
288         /// Retrieve compartment's members
289 
GetMembers(void)290         const THitRefs& GetMembers(void) {
291             return m_members;
292         }
293 
294         /// Add a new member into the compartment
295 
AddMember(const THitRef & hitref)296         void AddMember(const THitRef& hitref) {
297             m_members.push_back(hitref);
298         }
299 
300         /// Assign all members of the compartment
301 
SetMembers(void)302         THitRefs& SetMembers(void) {
303             return m_members;
304         }
305 
306         /// Make sure the compartment's box is up-to-date
307 
308         void UpdateMinMax(void);
309 
310         /// Retrieve the compartment's strand (true == plus)
311 
312         bool GetStrand(void) const;
313 
314         /// Retrieve the first member.
315         ///
316         /// @return
317         ///    A reference on the first member of the compartment,
318         ///    or a null reference if the compartment is empty.
319 
GetFirst(void) const320         const THitRef GetFirst(void) const {
321             m_iter = 0;
322             return GetNext();
323         }
324 
325 
326         /// Retrieve the next member.
327         ///
328         /// @return
329         ///    A reference on the next member of the compartment,
330         ///    or a null reference if there are no more members.
331 
GetNext(void) const332         const THitRef GetNext(void) const {
333             if(m_iter < m_members.size()) {
334                 return m_members[m_iter++];
335             }
336             return THitRef(NULL);
337         }
338 
339         /// Retrieve the compartment's box.
340         ///
341         /// @return
342         ///    A const pointer at the compartment's box.
343 
GetBox(void) const344         const TCoord* GetBox(void) const {
345             return m_box;
346         }
347 
348         // helper predicate
s_PLowerSubj(const CCompartment & c1,const CCompartment & c2)349         static bool s_PLowerSubj(const CCompartment& c1, const CCompartment& c2) {
350 
351             return c1.m_box[2] < c2.m_box[2];
352         }
353 
354     protected:
355 
356         THitRefs            m_members;
357         TCoord              m_box[4];
358         mutable size_t      m_iter;
359     };
360 
361     // iterate through compartments
362 
363     CCompartment* GetFirst(void);
364     CCompartment* GetNext(void);
365 
366 private:
367 
368     TCoord                m_max_overlap;   //max overlap on subject for different compartments
369     TCoord                m_intron_max;    // max intron size
370     TCoord                m_penalty;       // penalty per compartment
371     TCoord                m_MinMatches;    // min approx matches to report
372     TCoord                m_MinSingletonMatches; // min matches for singleton comps
373 
374     THitRefs              m_hitrefs;         // input hits
375     vector<CCompartment>  m_compartments;    // final compartments
376     int                   m_iter;            // GetFirst/Next index
377     CGapInfo<THit> *      m_gap_info;        // holds information about sequence gaps
378     CPrecalcGapInfo<THit> *      m_prec_gap_info;        // holds precalculated information about sequence gaps
379 
380     struct SHitStatus {
381 
382         enum EType {
383             eNone,
384             eExtension,
385             eOpening
386         };
387 
388         EType  m_type;
389         int    m_prev;
390         double m_score;
391 
SHitStatusCCompartmentFinder::SHitStatus392         SHitStatus(void): m_type(eNone), m_prev(-1), m_score(0)
393         {}
SHitStatusCCompartmentFinder::SHitStatus394         SHitStatus(EType type, int prev, double score):
395             m_type(type), m_prev(prev), m_score(score)
396         {}
397     };
398 
399 
400     static size_t sx_XFilter(THitRefs& hitrefs,
401                              typename THitRefs::iterator ihr,
402                              typename THitRefs::iterator ihr_e,
403                              Uint1 w,
404                              size_t min_compartment_hit_len);
405 
406     // helper predicate
407 
s_PNullRef(const THitRef & hr)408     static bool s_PNullRef(const THitRef& hr) {
409         return hr.IsNull();
410     }
411 };
412 
413 
414 // Facilitate access to compartments over a hit set
415 template<class THit>
416 class CCompartmentAccessor
417 {
418 public:
419 
420     typedef CCompartmentFinder<THit> TCompartmentFinder;
421     typedef typename TCompartmentFinder::THitRefs THitRefs;
422     typedef typename TCompartmentFinder::TCoord   TCoord;
423 
424     /// Construct the object and assign the parameters of the algorithm.
425     ///
426     /// @param  comp_penalty_bps
427     ///    Penalty to open a compartment, in base pairs.
428     /// @param  min_matches
429     ///    The minimum number of matching residues in a compartment, in base pairs.
430     /// @param  min_singleton_matches
431     ///    The minimum number of matching residues in a singleton, in base pairs.
432     /// @param  cross_filter
433     ///    Perform cross-filtering when true.
434 
435     CCompartmentAccessor(TCoord  comp_penalty_bps,
436                          TCoord  min_matches,
437                          TCoord  min_singleton_matches = numeric_limits<TCoord>::max(),
438                          bool    cross_filter = false);
439 
440     /// Construct the object; assign the parameters of the algorithm; execute.
441     /// The input range of alignments is assumed to contain the complete set of
442     /// alignments between the same pair of sequences.
443     /// The alignments can be on one or both genomic strands.
444     ///
445     /// @param  start
446     ///    Start of the input range of input alignments.
447     /// @param  finish
448     ///    Stop of the input range of input alignments.
449     /// @param  comp_penalty_bps
450     ///    Penalty to open a compartment, in base pairs.
451     /// @param  min_matches
452     ///    The minimum number of matching residues in a compartment, in base pairs.
453     /// @param  min_singleton_matches
454     ///    The minimum number of matching residues in a singleton, in base pairs.
455     /// @param  cross_filter
456     ///    Perform cross-filtering when true.
457 
458     CCompartmentAccessor(typename THitRefs::iterator start,
459                          typename THitRefs::iterator finish,
460                          TCoord  comp_penalty_bps,
461                          TCoord  min_matches,
462                          TCoord  min_singleton_matches = numeric_limits<TCoord>::max(),
463                          bool    cross_filter = false, CScope *scope = NULL,
464                          const vector<pair<TCoord, TCoord> > *gaps = NULL);
465 
466     /// Execute: identify compartments. The alignments can be on one
467     /// or both genomic strands.
468     ///
469     /// @param  start
470     ///    Start of the input range of input alignments.
471     /// @param  finish
472     ///    Stop of the input range of input alignments.
473 
474     void Run(typename THitRefs::iterator start,
475              typename THitRefs::iterator finish, CScope *scope = NULL,
476              const vector<pair<TCoord, TCoord> > *gaps = NULL);
477 
478     /// Assign the maximum intron length, in base pairs
479 
SetMaxIntron(TCoord mi)480     void   SetMaxIntron(TCoord mi) { m_MaxIntron = mi; }
481 
482     /// Retrieve the maximum intron length, in base pairs
483 
GetMaxIntron(void) const484     TCoord GetMaxIntron(void) const { return m_MaxIntron; }
485 
486     /// Assign the maximum length for compartments to overlap on the subject.
487 
SetMaxOverlap(TCoord max_overlap)488     void   SetMaxOverlap(TCoord max_overlap) { m_MaxOverlap = max_overlap; }
489 
490     /// Initialize iteration over the results.
491     /// Results are sorted by strand (minus first) and subj position.
492     ///
493     /// @param  compartment
494     ///    The first identified compartment
495     /// @return
496     ///    true if more compartments are available
497 
498     bool GetFirst(THitRefs& compartment);
499 
500     /// Proceed with iteration over the results.
501     ///
502     /// @param  compartment
503     ///    The next identified compartment
504     /// @return
505     ///    true if more compartments are available
506 
507     bool GetNext(THitRefs& compartment);
508 
509     /// Retrieve the compartment counts.
510     ///
511     /// @return
512     ///    The first number in the pair is the total count;
513     ///    the second number is the number of compartments with the number of matches
514     ///    abive the minimum.
515     ///
516 
GetCounts(void) const517     pair<size_t,size_t> GetCounts(void) const {
518 
519         // std::count() not supported on some platforms
520         size_t valid_count (0);
521         for(size_t i(0), n(m_status.size()); i != n; ++i) {
522             if(m_status[i]) ++valid_count;
523         }
524 
525         pair<size_t, size_t> rv (m_pending.size(), valid_count);
526 
527         return rv;
528     }
529 
530     /// Retrieve a compartment by index.
531     ///
532     /// @param idx
533     ///    The compartment's index
534     /// @param compartment
535     ///    The reference to receive the compartment requested.
536 
Get(size_t idx,THitRefs & compartment) const537     void Get(size_t idx, THitRefs& compartment) const {
538         compartment = m_pending[idx];
539     }
540 
GetBox(size_t i) const541     const TCoord* GetBox(size_t i) const {
542         return &m_ranges.front() + i*4;
543     }
544 
GetStrand(size_t i) const545     bool GetStrand(size_t i) const {
546         return m_strands[i];
547     }
548 
GetStatus(size_t i) const549     bool GetStatus(size_t i) const {
550         return m_status[i];
551     }
552 
553     /// Retrieve all valid compartments in a seq-align-set.
554     ///
555     /// @return
556     ///    A seq-align-set object with this hierarchy.
557     ///      1. Compartment-level seq-align with bounds.
558     ///      2. Seq-align-set keeping individual hits of the compartment.
559     ///      3. Hit-level seq-align of the dense-seg type.
560     CRef<objects::CSeq_align_set> AsSeqAlignSet(void) const;
561 
562 private:
563 
564     void x_Init(TCoord comp_penalty, TCoord  min_matches,
565                 TCoord min_singleton_matches, bool cross_filter);
566 
567     // compartmentization parameters
568     TCoord m_Penalty;
569     TCoord m_MinMatches;
570     TCoord m_MinSingletonMatches;
571     TCoord m_MaxIntron;
572     TCoord m_MaxOverlap;
573     bool   m_CrossFiltering;
574 
575     // ancillary members
576     vector<THitRefs>         m_pending;
577     vector<TCoord>           m_ranges;
578     vector<bool>             m_strands;
579     vector<bool>             m_status;
580     size_t                   m_iter;
581 
582     void x_Copy2Pending(TCompartmentFinder& finder);
583 };
584 
585 
586 
587 ///  IMPLEMENTATION
588 /////////////////////////////////////////////////////////////////////////////
589 
590 
591 const double kPenaltyPerIntronBase = -2e-5; // a small penalty to prefer
592                                             // more compact models among equal
593 
594 const double kPenaltyPerIntronPos = -1e-9;  // a small penalty to favor uniform
595                                             // selection among equivalent chains
596 
597 template<class THit>
UpdateMinMax()598 void CCompartmentFinder<THit>::CCompartment::UpdateMinMax()
599 {
600     typedef CHitFilter<THit> THitFilter;
601     THitFilter::s_GetSpan(m_members, m_box);
602 }
603 
604 
605 template<class THit>
GetStrand() const606 bool CCompartmentFinder<THit>::CCompartment::GetStrand() const {
607 
608     if(m_members.size()) {
609         return m_members.front()->GetSubjStrand();
610     }
611     else {
612         NCBI_THROW( CAlgoAlignException,
613                     eInternal,
614                     "Strand requested on an empty compartment" );
615     }
616 }
617 
618 
619 template<class THit>
CCompartmentFinder(typename THitRefs::const_iterator start,typename THitRefs::const_iterator finish,CGapInfo<THit> * gap_info,CPrecalcGapInfo<THit> * prec_gap_info)620 CCompartmentFinder<THit>::CCompartmentFinder(
621     typename THitRefs::const_iterator start,
622     typename THitRefs::const_iterator finish,
623                        CGapInfo<THit> *gap_info,
624                        CPrecalcGapInfo<THit> *prec_gap_info ):
625 
626     m_max_overlap(s_GetDefaultMaxOverlap()),
627     m_intron_max(s_GetDefaultMaxIntron()),
628     m_penalty(s_GetDefaultPenalty()),
629     m_MinMatches(1),
630     m_MinSingletonMatches(1),
631     m_iter(-1),
632     m_gap_info(gap_info),
633     m_prec_gap_info(prec_gap_info)
634 {
635     m_hitrefs.resize(finish - start);
636     copy(start, finish, m_hitrefs.begin());
637 }
638 
639 // Query match accumulator
640 template<class THit>
641 class CQueryMatchAccumulator
642 {
643 public:
644 
CQueryMatchAccumulator(void)645     CQueryMatchAccumulator(void): m_Finish(-1.0)
646     {}
647 
operator ()(double acm,CRef<THit> ph)648     double operator() (double acm, CRef<THit> ph)
649     {
650         const typename THit::TCoord qmin (ph->GetQueryMin()),
651             qmax (ph->GetQueryMax());
652         if(qmin > m_Finish)
653             return acm + ph->GetIdentity() * ((m_Finish = qmax) - qmin + 1);
654         else {
655             if(qmax > m_Finish) {
656                 const double finish0 (m_Finish);
657                 return acm + ph->GetIdentity() * ((m_Finish = qmax) - finish0);
658             }
659             else {
660                 return acm;
661             }
662         }
663     }
664 
665 private:
666 
667     double m_Finish;
668 };
669 
670 
671 template<class THit>
GetTotalMatches(const typename CCompartmentFinder<THit>::THitRefs & hitrefs0)672 double GetTotalMatches(
673     const typename CCompartmentFinder<THit>::THitRefs& hitrefs0)
674 {
675     typename CCompartmentFinder<THit>::THitRefs hitrefs (hitrefs0);
676 
677     typedef CHitComparator<THit> THitComparator;
678     THitComparator sorter (THitComparator::eQueryMin);
679     stable_sort(hitrefs.begin(), hitrefs.end(), sorter);
680 
681     const double rv (accumulate(hitrefs.begin(), hitrefs.end(), 0.0,
682                                 CQueryMatchAccumulator<THit>()));
683     return rv;
684 }
685 
686 
687 template<class THit>
Run(bool cross_filter)688 size_t CCompartmentFinder<THit>::Run(bool cross_filter)
689 {
690     const double kMinusInf (-1e12);
691 
692     m_compartments.clear();
693     const int dimhits = m_hitrefs.size();
694     if(dimhits == 0) {
695         return 0;
696     }
697 
698     // sort the hits to make sure that each hit is placed after:
699     // - hits from which to continue a compartment
700     // - hits from which to open a new compartment
701 
702     typedef CHitComparator<THit> THitComparator;
703     stable_sort(m_hitrefs.begin(), m_hitrefs.end(),
704                 THitComparator(THitComparator::eSubjMaxQueryMax));
705 
706     // For every hit:
707     // - evaluate its best extension potential
708     // - evaluate its best potential to start a new compartment
709     // - select the best variant
710     // - update the hit status array
711 
712     typedef vector<SHitStatus> THitData;
713     THitData hitstatus (dimhits);
714 
715     const TCoord subj_min_global (m_hitrefs.front()->GetSubjMin());
716 
717     int i_bestsofar (0);
718     for(int i (0); i < dimhits; ++i) {
719 
720         const THitRef& h (m_hitrefs[i]);
721         const double identity (h->GetIdentity());
722         const typename THit::TCoord hbox [4] = {
723             h->GetQueryMin(),  h->GetQueryMax(),
724             h->GetSubjMin(),   h->GetSubjMax()
725         };
726 
727 //#define CF_DBG_TRACE
728 #ifdef CF_DBG_TRACE
729         cerr << endl << *h << endl;
730 #endif
731 
732         double best_ext_score (kMinusInf);
733         int    i_best_ext (-1);
734 
735         double best_open_score (identity*(hbox[1] - hbox[0] + 1) - m_penalty);
736         int    i_best_open (-1); // each can be a start of the very first compartment
737 
738         if(hbox[2] + m_max_overlap > m_hitrefs[i_bestsofar]->GetSubjMax()) {
739 
740             const double score_open (identity*(hbox[1] - hbox[0] + 1)
741                                      + hitstatus[i_bestsofar].m_score
742                                      - m_penalty);
743 
744             if(score_open > best_open_score) {
745                 best_open_score = score_open;
746                 i_best_open = i_bestsofar;
747             }
748         }
749         else {
750             // try every prior hit
751             for(int j (i - 1); j >= 0; --j) {
752 
753                 const double score_open (identity * (hbox[1] - hbox[0] + 1)
754                                          + hitstatus[j].m_score
755                                          - m_penalty);
756 
757                 if(score_open > best_open_score
758                    && hbox[2] + m_max_overlap > m_hitrefs[j]->GetSubjMax())
759                 {
760                     best_open_score = score_open;
761                     i_best_open = j;
762                 }
763             }
764         }
765 
766         for(int j = i; j >= 0; --j) {
767 
768             THitRef phc;
769             typename THit::TCoord phcbox[4];
770             if(j != i) {
771 
772                 phc = m_hitrefs[j];
773                 phcbox[0] = phc->GetQueryMin();
774                 phcbox[1] = phc->GetQueryMax();
775                 phcbox[2] = phc->GetSubjMin();
776                 phcbox[3] = phc->GetSubjMax();
777 
778 #ifdef CF_DBG_TRACE
779                 cerr << '\t' << *phc << endl;
780 #endif
781             }
782 #ifdef CF_DBG_TRACE
783             else {
784                 cerr << "\t[Dummy]" << endl;
785             }
786 #endif
787 
788             // check if good for extension
789             {{
790                 typename THit::TCoord q0, s0; // possible continuation
791                 bool good (false);
792                 int subj_space;
793                 TCoord intron_start (0);
794 
795                 if(i != j) {
796 
797                     if( ( phcbox[1] < hbox[1] && phcbox[0] < hbox[0] )
798                         && phcbox[2] <= hbox[2] //prohibit to extend if previous hit is inside the current in genomic coordinates
799                     ) {
800 
801                         if(hbox[0] <= phcbox[1] &&
802                            hbox[2] + phcbox[1] - hbox[0] >= phcbox[3]) {
803 
804                             q0 = phcbox[1] + 1;
805                             s0 = hbox[2] + phcbox[1] - hbox[0];
806                         }
807                         else if(phcbox[3] >= hbox[2]) {
808 
809                             s0 = phcbox[3] + 1;
810                             q0 = hbox[1] - (hbox[3] - s0);
811                         }
812                         else {
813 
814                             q0 = hbox[0];
815                             s0 = hbox[2];
816                         }
817                         subj_space = s0 - phcbox[3] - 1;
818 
819 
820                         // max run of spaces inside an exon - NOW DISABLED
821                         // example: NM_021645.4 vs NC_000013.9: 51.4 - 51.51M
822                         //const TCoord max_gap (100);
823 
824                         good = (subj_space <= int(m_intron_max))
825                             && (s0 < hbox[3] && q0 < hbox[1]);
826 
827                         //prohibit to go over non-bridgable gaps in subject
828                         if(good) {
829                             if( phcbox[3] <= hbox[2] ) {
830                                 if( ( m_prec_gap_info && m_prec_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ||
831                                     ( m_gap_info && m_gap_info->IntersectsNonBridgeableGap(phcbox[3], hbox[2]) ) ) {
832                                     good = false;
833                                 }
834                             }
835                         }
836 
837                         if(good) {
838                             intron_start = phcbox[3];
839                         }
840                     }
841                 }
842 
843                 if(good) {
844 
845                     const double jscore (hitstatus[j].m_score);
846                     const double intron_penalty ((subj_space > 0)?
847                          (kPenaltyPerIntronPos * (intron_start - subj_min_global)
848                          + subj_space * kPenaltyPerIntronBase): 0.0);
849 
850                     const double ext_score (jscore +
851                         identity * (hbox[1] - q0 + 1) + intron_penalty);
852 
853                     if(ext_score > best_ext_score) {
854                         best_ext_score = ext_score;
855                         i_best_ext = j;
856                     }
857 #ifdef CF_DBG_TRACE
858                     cerr << "\tGood for extension with score = " << ext_score << endl;
859 #endif
860                 }
861             }}
862         }
863 
864         typename SHitStatus::EType hit_type;
865         int prev_hit;
866         double best_score;
867         if(best_ext_score > best_open_score) {
868 
869             hit_type = SHitStatus::eExtension;
870             prev_hit = i_best_ext;
871             best_score = best_ext_score;
872         }
873         else {
874 
875             hit_type = SHitStatus::eOpening;
876             prev_hit = i_best_open;
877             best_score = best_open_score;
878         }
879 
880         hitstatus[i].m_type  = hit_type;
881         hitstatus[i].m_prev  = prev_hit;
882         hitstatus[i].m_score = best_score;
883 
884         if(best_score > hitstatus[i_bestsofar].m_score) {
885             i_bestsofar = i;
886         }
887 
888 #ifdef CF_DBG_TRACE
889         cerr << "Status = " << ((hit_type == SHitStatus::eOpening)? "Open": "Extend")
890              << '\t' << best_score << endl;
891         if(prev_hit == -1) {
892             cerr << "[Dummy]" << endl;
893         }
894         else {
895             cerr << *(m_hitrefs[prev_hit]) << endl;
896         }
897 #endif
898     }
899 
900 #ifdef CF_DBG_TRACE
901     cerr << "\n\n--- BACKTRACE ---\n";
902 #endif
903 
904     // *** backtrace ***
905     // -  trace back the chain with the highest score
906     const double score_best    (hitstatus[i_bestsofar].m_score);
907     const double min_matches   (m_MinSingletonMatches < m_MinMatches?
908         m_MinSingletonMatches: m_MinMatches);
909 
910     vector<bool> comp_status;
911     if(score_best + m_penalty >= min_matches) {
912 
913         int i (i_bestsofar);
914         bool new_compartment (true);
915         THitRefs hitrefs;
916         while(i != -1) {
917 
918             if(new_compartment) {
919 
920                 const double mp (GetTotalMatches<THit>(hitrefs));
921                 if(mp > 0) {
922                     // save the current compartment
923                     m_compartments.push_back(CCompartment());
924                     m_compartments.back().SetMembers() = hitrefs;
925                     comp_status.push_back(mp >= m_MinMatches);
926                 }
927 
928                 hitrefs.resize(0);
929                 new_compartment = false;
930             }
931             hitrefs.push_back(m_hitrefs[i]);
932 
933 #ifdef CF_DBG_TRACE
934             cerr << *(m_hitrefs[i]) << endl;
935 #endif
936 
937             if(hitstatus[i].m_type == SHitStatus::eOpening) {
938                 new_compartment = true;
939             }
940             i = hitstatus[i].m_prev;
941         }
942 
943         const double mp (GetTotalMatches<THit>(hitrefs));
944         if(mp > 0) {
945             bool status ( ( m_compartments.size() == 0 && mp >= m_MinSingletonMatches )
946                          || mp >= m_MinMatches );
947             m_compartments.push_back(CCompartment());
948             m_compartments.back().SetMembers() = hitrefs;
949             comp_status.push_back(status);
950         }
951     }
952 
953     if(cross_filter && m_compartments.size()) {
954 
955         const TCoord kMinCompartmentHitLength (8);
956 
957         // partial x-filtering using compartment hits only
958         for(size_t icn (m_compartments.size()), ic (0); ic < icn; ++ic) {
959 
960             CCompartment & comp (m_compartments[ic]);
961             THitRefs& hitrefs (comp.SetMembers());
962             size_t nullified (0);
963             for(int in (hitrefs.size()), i (in - 1); i > 0; --i) {
964 
965                 int j1 (i);
966                 while(j1 < in && hitrefs[j1].IsNull()) ++j1;
967                 if(j1 == in) continue;
968 
969                 THitRef& h1 (hitrefs[j1]);
970                 THitRef& h2 (hitrefs[i-1]);
971 
972                 if(h1.IsNull()) continue;
973 
974                 const TCoord * box1o (h1->GetBox());
975                 TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
976                 const TCoord * box2o (h2->GetBox());
977                 TCoord box2 [4] = {box2o[0], box2o[1], box2o[2], box2o[3]};
978 
979                 const int qd (box1[1] - box2[0] + 1);
980                 const int sd (box1[3] - box2[2] + 1);
981                 if(qd > sd && qd > 0) {
982 
983                     if(box2[0] <= box1[0] + kMinCompartmentHitLength) {
984 
985 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
986                         if(i + 1 == in) {
987                             TCoord new_coord (box1[0] + (box1[1] - box1[0])/2);
988                             if(box1[0] + 1 <= new_coord) {
989                                 --new_coord;
990                             }
991                             else {
992                                 new_coord = box1[0];
993                             }
994                             h1->Modify(1, new_coord);
995                         }
996                         else
997 #endif
998                         {
999                             h1.Reset(0);
1000                             ++nullified;
1001                         }
1002                     }
1003                     else {
1004                         h1->Modify(1, box2[0] - 1);
1005                     }
1006 
1007                     if(box2[1] <= box1[1] + kMinCompartmentHitLength) {
1008 
1009 #ifdef ALGOALIGNUTIL_COMPARTMENT_FINDER_KEEP_TERMINII
1010                         if(i == 1) {
1011                             TCoord new_coord (box2[0] + (box2[1] - box2[0])/2);
1012                             if(box2[1] >= new_coord + 1) {
1013                                 ++new_coord;
1014                             }
1015                             else {
1016                                 new_coord = box2[1];
1017                             }
1018                             h2->Modify(0, new_coord);
1019                         }
1020                         else
1021 #endif
1022                         {
1023                             h2.Reset(0);
1024                             ++nullified;
1025                         }
1026                     }
1027                     else {
1028                         h2->Modify(0, box1[1] + 1);
1029                     }
1030                 }
1031                 else if (sd > 0) {
1032 
1033                     if(box2[2] <= box1[2] + kMinCompartmentHitLength) {
1034 
1035                         if(i + 1 == in) {
1036                             TCoord new_coord (box1[2] + (box1[3] - box1[2])/2);
1037                             if(box1[2] + 1 <= new_coord) {
1038                                 --new_coord;
1039                             }
1040                             else {
1041                                 new_coord = box1[2];
1042                             }
1043                             h1->Modify(3, new_coord);
1044                         }
1045                         else {
1046                             h1.Reset(0);
1047                             ++nullified;
1048                         }
1049                     }
1050                     else {
1051                         h1->Modify(3, box2[2] - 1);
1052                     }
1053 
1054                     if(box2[3] <= box1[3] + kMinCompartmentHitLength) {
1055 
1056                         if(i == 1) {
1057                             TCoord new_coord (box2[2] + (box2[3] - box2[2])/2);
1058                             if(box2[3] >= new_coord + 1) {
1059                                 ++new_coord;
1060                             }
1061                             else {
1062                                 new_coord = box2[3];
1063                             }
1064 
1065                             h2->Modify(2, new_coord);
1066                         }
1067                         else {
1068                             h2.Reset(0);
1069                             ++nullified;
1070                         }
1071                     }
1072                     else {
1073                         h2->Modify(2, box1[3] + 1);
1074                     }
1075                 }
1076             }
1077 
1078             if(nullified > 0) {
1079                 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1080                                         CCompartmentFinder<THit>::s_PNullRef),
1081                               hitrefs.end());
1082             }
1083         }
1084 
1085 #define ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1086 #ifdef  ALGO_ALIGN_COMPARTMENT_FINDER_USE_FULL_XFILTERING
1087 
1088         typename THitRefs::iterator ihr_b (m_hitrefs.begin()),
1089             ihr_e(m_hitrefs.end()), ihr (ihr_b);
1090 
1091         stable_sort(ihr_b, ihr_e, THitComparator(THitComparator::eSubjMinSubjMax));
1092 
1093         // complete x-filtering using the full set of hits
1094         for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1095 
1096             CCompartment & comp (m_compartments[ic]);
1097             THitRefs& hitrefs (comp.SetMembers());
1098 
1099             if(hitrefs.size() < 3) {
1100                 continue;
1101             }
1102             else {
1103                 NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1104                     (*ii)->SetScore(- (*ii)->GetScore());
1105                 }
1106                 hitrefs.front()->SetEValue(-1);
1107                 hitrefs.back()->SetEValue(-1);
1108             }
1109 
1110             const TCoord comp_subj_min (hitrefs.back()->GetSubjStart());
1111             const TCoord comp_subj_max (hitrefs.front()->GetSubjStop());
1112             while(ihr != ihr_e && ((*ihr)->GetSubjStart() < comp_subj_min
1113                                    || (*ihr)->GetScore() < 0))
1114             {
1115                 ++ihr;
1116             }
1117 
1118             if(ihr == ihr_e) break;
1119             if((*ihr)->GetSubjStart() > comp_subj_max) continue;
1120             typename THitRefs::iterator ihrc_b (ihr);
1121 
1122             typename THitRefs::iterator ihrc_e (ihr);
1123             while(ihrc_e != ihr_e && ((*ihrc_e)->GetSubjStart() < comp_subj_max
1124                                       || (*ihrc_e)->GetScore() < 0))
1125             {
1126                 ++ihrc_e;
1127             }
1128 
1129             size_t nullified (sx_XFilter(hitrefs, ihrc_b, ihrc_e, 1,
1130                                          kMinCompartmentHitLength));
1131             stable_sort(ihrc_b, ihrc_e, THitComparator (THitComparator::eQueryMinQueryMax));
1132 
1133             nullified += sx_XFilter(hitrefs, ihrc_b, ihrc_e, 0,
1134                                     kMinCompartmentHitLength);
1135 
1136             ihr = ihrc_e;
1137 
1138             if(nullified > 0) {
1139                 hitrefs.erase(remove_if(hitrefs.begin(), hitrefs.end(),
1140                                         CCompartmentFinder<THit>::s_PNullRef),
1141                               hitrefs.end());
1142             }
1143         }
1144 
1145         for(int icn (m_compartments.size()), ic (icn - 1); ic >= 0; --ic) {
1146 
1147             CCompartment & comp (m_compartments[ic]);
1148             THitRefs& hitrefs (comp.SetMembers());
1149             NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1150                 const double score ((*ii)->GetScore());
1151                 if(score < 0) {
1152                     (*ii)->SetScore(-score);
1153                 }
1154                 const double eval ((*ii)->GetEValue());
1155                 if(eval < 0) {
1156                     (*ii)->SetEValue(0);
1157                 }
1158             }
1159         }
1160 #endif
1161     }
1162 
1163     // mask out compartments with low identity
1164     for(size_t i (0), in (m_compartments.size()); i < in; ++i) {
1165         if(comp_status[i] == false) {
1166 
1167             THitRefs & hitrefs (m_compartments[i].SetMembers());
1168             NON_CONST_ITERATE(typename THitRefs, ii, hitrefs) {
1169                 THitRef & hr (*ii);
1170                 hr->SetScore(-hr->GetScore());
1171             }
1172         }
1173     }
1174 
1175     return m_compartments.size();
1176 }
1177 
1178 
1179 template<class THit>
sx_XFilter(THitRefs & hitrefs,typename THitRefs::iterator ihr,typename THitRefs::iterator ihr_e,Uint1 w,size_t min_compartment_hit_len)1180 size_t CCompartmentFinder<THit>::sx_XFilter(
1181     THitRefs& hitrefs,
1182     typename THitRefs::iterator ihr,
1183     typename THitRefs::iterator ihr_e,
1184     Uint1 w,
1185     size_t min_compartment_hit_len)
1186 {
1187     size_t nullified (0);
1188     for(int in (hitrefs.size()), i (in - 2); i > 0 && ihr != ihr_e; --i) {
1189 
1190         THitRef& h1 (hitrefs[i]);
1191         if(h1.IsNull()) continue;
1192 
1193         const TCoord * box1o (h1->GetBox());
1194         TCoord box1 [4] = {box1o[0], box1o[1], box1o[2], box1o[3]};
1195 
1196         // locate the first overlap
1197         for(; ihr != ihr_e; ++ihr) {
1198 
1199             THitRef hr (*ihr);
1200             if(hr.IsNull()) continue;
1201             if(hr->GetScore() > 0 && hr->GetStop(w) >= box1[2*w]) {
1202                 break;
1203             }
1204         }
1205 
1206         if(ihr == ihr_e) {
1207             break;
1208         }
1209 
1210         // find the smallest not overlapped hit coord and its interval
1211         TCoord ls0 (box1[2*w+1] + 1), cursegmax (0);
1212         TCoord segmax_start(box1[2*w]), segmax_stop(box1[2*w+1]);
1213 
1214         if(box1[2*w] < (*ihr)->GetStart(w)) {
1215 
1216             segmax_start = ls0 = box1[2*w];
1217             segmax_stop = (*ihr)->GetStart(w) - 1;
1218             cursegmax = segmax_stop - segmax_start + 1;
1219         }
1220         else {
1221 
1222             TCoord shrsmax ((*ihr)->GetStop(w));
1223             for(++ihr; ihr != ihr_e; ++ihr) {
1224 
1225                 THitRef hr (*ihr);
1226                 if(hr.IsNull() || hr->GetScore() < 0) {
1227                     continue;
1228                 }
1229 
1230                 const TCoord hrs0 (hr->GetStart(w));
1231                 if(hrs0 > box1[2*w+1]) {
1232                     segmax_stop = box1[2*w+1];
1233                     break;
1234                 }
1235 
1236                 if(hrs0 > shrsmax) {
1237                     segmax_stop = hrs0 - 1;
1238                     break;
1239                 }
1240 
1241                 const TCoord hrs1 (hr->GetStop(w));
1242                 if(hrs1 > shrsmax) {
1243                     shrsmax = hrs1;
1244                 }
1245             }
1246 
1247             if(shrsmax < box1[2*w+1]) {
1248                 segmax_start = ls0 = shrsmax + 1;
1249                 cursegmax = segmax_stop - segmax_start + 1;
1250             }
1251         }
1252 
1253         if(ls0 > box1[2*w+1]) {
1254             h1.Reset(0);
1255             ++nullified;
1256             continue;
1257         }
1258 
1259         // find the longest surviving segment
1260         for(; ihr != ihr_e; ++ihr) {
1261 
1262             THitRef hr (*ihr);
1263             if(hr.IsNull() || hr->GetScore() < 0) {
1264                 continue;
1265             }
1266 
1267             const TCoord hrs0 (hr->GetStart(w));
1268             if(hrs0 > box1[2*w+1]) {
1269                 if(ls0 <= box1[2*w+1] && box1[2*w+1] + 1 - ls0 > cursegmax) {
1270                     segmax_start = ls0;
1271                     segmax_stop = box1[2*w+1];
1272                     cursegmax = segmax_stop - segmax_start + 1;
1273                 }
1274                 break;
1275             }
1276 
1277             if(hrs0 > ls0) {
1278                 if(hrs0 - ls0 > cursegmax) {
1279                     segmax_start = ls0;
1280                     segmax_stop = hrs0 - 1;
1281                     cursegmax = segmax_stop - segmax_start + 1;
1282                 }
1283 
1284                 ls0 = hr->GetStop(w) + 1;
1285             }
1286             else if(hr->GetStop(w) + 1 > ls0) {
1287                 ls0 = hr->GetStop(w) + 1;
1288             }
1289         }
1290 
1291         if(box1[2*w+1] > ls0 + cursegmax) {
1292             segmax_start = ls0;
1293             segmax_stop  = box1[2*w+1];
1294             cursegmax    = segmax_stop - segmax_start + 1;
1295         }
1296 
1297         if(cursegmax < min_compartment_hit_len) {
1298             h1.Reset(0);
1299             ++nullified;
1300             continue;
1301         }
1302         else {
1303 
1304             if(box1[2*w] < segmax_start) {
1305                 h1->Modify(2 * w, segmax_start);
1306             }
1307 
1308             if(segmax_stop < box1[2*w+1]) {
1309                 h1->Modify(2 * w + 1, segmax_stop);
1310             }
1311         }
1312 
1313         if(ihr == ihr_e) break;
1314     }
1315 
1316     return nullified;
1317 }
1318 
1319 
1320 template<class THit>
1321 typename CCompartmentFinder<THit>::CCompartment*
GetFirst()1322 CCompartmentFinder<THit>::GetFirst()
1323 {
1324     if(m_compartments.size()) {
1325         m_iter = 0;
1326         return &m_compartments[m_iter++];
1327     }
1328     else {
1329         m_iter = -1;
1330         return 0;
1331     }
1332 }
1333 
1334 template<class THit>
OrderCompartments(void)1335 void CCompartmentFinder<THit>::OrderCompartments(void) {
1336 
1337     for(size_t i = 0, dim = m_compartments.size(); i < dim; ++i) {
1338         m_compartments[i].UpdateMinMax();
1339     }
1340 
1341     stable_sort(m_compartments.begin(), m_compartments.end(),
1342                 CCompartmentFinder<THit>::CCompartment::s_PLowerSubj);
1343 }
1344 
1345 template<class THit>
1346 typename CCompartmentFinder<THit>::CCompartment*
GetNext()1347 CCompartmentFinder<THit>::GetNext()
1348 {
1349     const size_t dim = m_compartments.size();
1350     if(m_iter == -1 || m_iter >= int(dim)) {
1351         m_iter = -1;
1352         return 0;
1353     }
1354     else {
1355         return &m_compartments[m_iter++];
1356     }
1357 }
1358 
1359 
1360 template<class THit>
CCompartmentAccessor(TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter)1361 CCompartmentAccessor<THit>::CCompartmentAccessor(TCoord  comp_penalty,
1362                                                  TCoord  min_matches,
1363                                                  TCoord  min_singleton_matches,
1364                                                  bool    cross_filter)
1365 {
1366     x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1367 }
1368 
1369 
1370 template<class THit>
CCompartmentAccessor(typename THitRefs::iterator istart,typename THitRefs::iterator ifinish,TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter,CScope * scope,const vector<pair<TCoord,TCoord>> * gaps)1371 CCompartmentAccessor<THit>::CCompartmentAccessor(
1372      typename THitRefs::iterator istart,
1373      typename THitRefs::iterator ifinish,
1374      TCoord comp_penalty,
1375      TCoord min_matches,
1376      TCoord min_singleton_matches,
1377      bool   cross_filter,
1378      CScope *scope,
1379      const vector<pair<TCoord, TCoord> > *gaps)
1380 {
1381     x_Init(comp_penalty, min_matches, min_singleton_matches, cross_filter);
1382     Run(istart, ifinish, scope, gaps);
1383 }
1384 
1385 
1386 template<class THit>
x_Init(TCoord comp_penalty,TCoord min_matches,TCoord min_singleton_matches,bool cross_filter)1387 void CCompartmentAccessor<THit>::x_Init(TCoord  comp_penalty,
1388                                         TCoord  min_matches,
1389                                         TCoord  min_singleton_matches,
1390                                         bool    cross_filter)
1391 {
1392     m_Penalty             = comp_penalty;
1393     m_MinMatches          = min_matches;
1394     m_MinSingletonMatches = min_singleton_matches;
1395     m_CrossFiltering      = cross_filter;
1396     m_MaxIntron           = CCompartmentFinder<THit>::s_GetDefaultMaxIntron();
1397     m_MaxOverlap          = CCompartmentFinder<THit>::s_GetDefaultMaxOverlap();
1398 }
1399 
1400 
1401 template<class THit>
Run(typename THitRefs::iterator istart,typename THitRefs::iterator ifinish,CScope * scope,const vector<pair<TCoord,TCoord>> * gaps)1402 void CCompartmentAccessor<THit>::Run(typename THitRefs::iterator istart,
1403                                      typename THitRefs::iterator ifinish,
1404                                      CScope *scope,
1405                                      const vector<pair<TCoord, TCoord> > *gaps)
1406 {
1407 
1408     if(istart == ifinish) return;
1409 
1410     const TCoord kMax_TCoord (numeric_limits<TCoord>::max());
1411 
1412     // separate strands for CompartmentFinder
1413     typename THitRefs::iterator ib = istart, ie = ifinish, ii = ib,
1414         iplus_beg = ie;
1415     typedef CHitComparator<THit> THitComparator;
1416     THitComparator sorter (THitComparator::eSubjStrand);
1417     stable_sort(ib, ie, sorter);
1418 
1419     TCoord minus_subj_min = kMax_TCoord, minus_subj_max = 0;
1420     for(ii = ib; ii != ie; ++ii) {
1421         if((*ii)->GetSubjStrand()) {
1422             iplus_beg = ii;
1423             break;
1424         }
1425         else {
1426             if((*ii)->GetSubjMin() < minus_subj_min) {
1427                 minus_subj_min = (*ii)->GetSubjMin();
1428             }
1429             if((*ii)->GetSubjMax() > minus_subj_max) {
1430                 minus_subj_max = (*ii)->GetSubjMax();
1431             }
1432         }
1433     }
1434 
1435 
1436     // minus
1437     {{
1438         CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1439         CPrecalcGapInfo<THit> prec_gapi( gaps );
1440         // flip
1441         gapi.Flip(minus_subj_min, minus_subj_max);
1442         prec_gapi.Flip(minus_subj_min, minus_subj_max);
1443         for(ii = ib; ii != iplus_beg; ++ii) {
1444 
1445             const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1446                 - (*ii)->GetSubjMax();
1447             const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1448                 - (*ii)->GetSubjMin();
1449             (*ii)->SetSubjStart(s0);
1450             (*ii)->SetSubjStop(s1);
1451         }
1452 
1453         CCompartmentFinder<THit> finder (ib, iplus_beg, &gapi, &prec_gapi);
1454         finder.SetPenalty(m_Penalty);
1455         finder.SetMinMatches(m_MinMatches);
1456         finder.SetMinSingletonMatches(m_MinSingletonMatches);
1457         finder.SetMaxIntron(m_MaxIntron);
1458         finder.SetMaxOverlap(m_MaxOverlap);
1459         finder.Run(m_CrossFiltering);
1460 
1461         // un-flip
1462         for(ii = ib; ii != iplus_beg; ++ii) {
1463 
1464             const typename THit::TCoord s0 = minus_subj_min + minus_subj_max
1465                 - (*ii)->GetSubjMax();
1466             const typename THit::TCoord s1 = minus_subj_min + minus_subj_max
1467                 - (*ii)->GetSubjMin();
1468             (*ii)->SetSubjStart(s1);
1469             (*ii)->SetSubjStop(s0);
1470         }
1471 
1472         x_Copy2Pending(finder);
1473     }}
1474 
1475     // plus
1476     {{
1477         CGapInfo<THit> gapi( *(*istart)->GetSubjId(), scope);
1478         CPrecalcGapInfo<THit> prec_gapi( gaps );
1479         CCompartmentFinder<THit> finder (iplus_beg, ie, &gapi, &prec_gapi);
1480         finder.SetPenalty(m_Penalty);
1481         finder.SetMinMatches(m_MinMatches);
1482         finder.SetMinSingletonMatches(m_MinSingletonMatches);
1483         finder.SetMaxIntron(m_MaxIntron);
1484         finder.SetMaxOverlap(m_MaxOverlap);
1485         finder.Run(m_CrossFiltering);
1486         x_Copy2Pending(finder);
1487     }}
1488 }
1489 
1490 
1491 template<class THit>
x_Copy2Pending(CCompartmentFinder<THit> & finder)1492 void CCompartmentAccessor<THit>::x_Copy2Pending(
1493     CCompartmentFinder<THit>& finder)
1494 {
1495     finder.OrderCompartments();
1496 
1497     typedef typename CCompartmentFinder<THit>::THitRef THitRef;
1498 
1499     // copy to pending
1500     for(typename CCompartmentFinder<THit>::CCompartment* compartment =
1501             finder.GetFirst(); compartment; compartment = finder.GetNext()) {
1502 
1503         if(compartment->GetMembers().size() > 0) {
1504 
1505             m_pending.push_back(THitRefs(0));
1506             THitRefs& vh = m_pending.back();
1507 
1508             for(THitRef ph (compartment->GetFirst()); ph; ph = compartment->GetNext())
1509             {
1510                 vh.push_back(ph);
1511             }
1512 
1513             const TCoord* box = compartment->GetBox();
1514             m_ranges.push_back(box[0]);
1515             m_ranges.push_back(box[1]);
1516             m_ranges.push_back(box[2]);
1517             m_ranges.push_back(box[3]);
1518 
1519             m_strands.push_back(compartment->GetStrand());
1520             m_status.push_back(compartment->GetFirst()->GetScore() > 0);
1521         }
1522     }
1523 }
1524 
1525 
1526 template<class THit>
GetFirst(THitRefs & compartment)1527 bool CCompartmentAccessor<THit>::GetFirst(THitRefs& compartment) {
1528 
1529     m_iter = 0;
1530     return GetNext(compartment);
1531 }
1532 
1533 
1534 template<class THit>
GetNext(THitRefs & compartment)1535 bool CCompartmentAccessor<THit>::GetNext(THitRefs& compartment) {
1536 
1537     compartment.clear();
1538     if(m_iter < m_pending.size()) {
1539         compartment = m_pending[m_iter++];
1540         return true;
1541     }
1542     else {
1543         return false;
1544     }
1545 }
1546 
1547 
1548 template<class THit>
AsSeqAlignSet(void) const1549 CRef<objects::CSeq_align_set> CCompartmentAccessor<THit>::AsSeqAlignSet(void) const
1550 {
1551     USING_SCOPE(objects);
1552 
1553     CRef<CSeq_align_set> rv (new CSeq_align_set);
1554 
1555     // retrieve the ids
1556     CRef<CSeq_id> seqid_query (new CSeq_id), seqid_subj (new CSeq_id);
1557     if(m_pending.size()) {
1558         if(m_pending.front().size()) {
1559             const THit & h (*m_pending.front().front());
1560             seqid_query->Assign(*(h.GetQueryId()));
1561             seqid_subj->Assign(*(h.GetSubjId()));
1562         }
1563     }
1564 
1565     CSeq_align_set::Tdata& sas1_data (rv->Set());
1566 
1567     for(size_t i(0), idim(m_pending.size()); i < idim; ++i) {
1568 
1569         if(m_status[i]) {
1570 
1571             // note the range
1572             TCoord range_min (i > 0 && m_strands[i-1] == m_strands[i]
1573                               ? m_ranges[4*i - 1]
1574                               : 0);
1575 
1576             TCoord range_max (i + 1 < idim && m_strands[i+1] == m_strands[i]
1577                               ? m_ranges[4*i + 6]
1578                               : numeric_limits<TCoord>::max());
1579 
1580             CRef<CSeq_align> sa2 (new CSeq_align);
1581             sa2->SetType(CSeq_align::eType_disc);
1582 
1583             CSeq_align::TBounds & bounds (sa2->SetBounds());
1584             const ENa_strand query_strand (eNa_strand_plus);
1585 
1586             const ENa_strand subj_strand (m_strands[i]? eNa_strand_plus:
1587                                           eNa_strand_minus);
1588             CRef<CSeq_loc> seq_loc (new CSeq_loc (*seqid_subj, range_min,
1589                                                   range_max, subj_strand));
1590             bounds.push_back(seq_loc);
1591 
1592             // add the hits
1593             CSeq_align_set::Tdata& sas2_data (sa2->SetSegs().SetDisc().Set());
1594             ITERATE(typename THitRefs, ii, m_pending[i]) {
1595 
1596                 const THit& h (**ii);
1597                 CRef<CSeq_align> sa3 (new CSeq_align);
1598                 sa3->SetType(CSeq_align::eType_global);
1599                 const string& transcript(h.GetTranscript());
1600                 if (transcript.empty()) {
1601                     CRef<CStd_seg> sg(new CStd_seg);
1602                     sa3->SetSegs().SetStd().push_back(sg);
1603                     sg->SetDim(2);
1604                     CRef<CSeq_loc> query_loc(new CSeq_loc(
1605                             *seqid_query, h.GetQueryMin(), h.GetQueryMax(), query_strand));
1606                     CRef<CSeq_loc> subj_loc(new CSeq_loc(
1607                             *seqid_subj, h.GetSubjMin(), h.GetSubjMax(), subj_strand));
1608                     sg->SetLoc().push_back(query_loc);
1609                     sg->SetLoc().push_back(subj_loc);
1610                 } else {
1611                     CDense_seg & ds (sa3->SetSegs().SetDenseg());
1612                     ds.FromTranscript(h.GetQueryStart(), query_strand,
1613                                       h.GetSubjStart(), subj_strand,
1614                                       transcript);
1615                     ds.SetIds().push_back(seqid_query);
1616                     ds.SetIds().push_back(seqid_subj);
1617                 }
1618                 sas2_data.push_back(sa3);
1619             }
1620 
1621             // save into the seq-align-set
1622             sas1_data.push_back(sa2);
1623         }
1624     }
1625 
1626     return rv;
1627 }
1628 
1629 
1630 
1631 END_NCBI_SCOPE
1632 
1633 
1634 #endif
1635