1 /*  $Id: seq_loc_util.cpp 569939 2018-08-31 13:32:44Z bollin $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Clifford Clausen, Aaron Ucko, Aleksey Grichenko
27 *
28 * File Description:
29 *   Seq-loc utilities requiring CScope
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbi_limits.hpp>
34 
35 #include <serial/iterator.hpp>
36 
37 #include <objects/seq/seq_id_mapper.hpp>
38 #include <objects/seq/seq_id_handle.hpp>
39 #include <objects/seq/seq_loc_reverse_complementer.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 #include <objects/seq/MolInfo.hpp>
42 
43 #include <objects/seqloc/Seq_interval.hpp>
44 #include <objects/seqloc/Packed_seqint.hpp>
45 #include <objects/seqloc/Seq_loc_mix.hpp>
46 #include <objects/seqloc/Seq_point.hpp>
47 #include <objects/seqloc/Seq_bond.hpp>
48 #include <objects/seqloc/Seq_loc_equiv.hpp>
49 
50 #include <objmgr/util/seq_loc_util.hpp>
51 #include <objmgr/util/sequence.hpp>
52 #include <objmgr/scope.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
54 #include <objmgr/objmgr_exception.hpp>
55 #include <objmgr/seq_loc_mapper.hpp>
56 #include <objmgr/impl/handle_range_map.hpp>
57 #include <objmgr/impl/synonyms.hpp>
58 #include <objmgr/error_codes.hpp>
59 
60 #include <util/range_coll.hpp>
61 
62 #include <algorithm>
63 
64 #define NCBI_USE_ERRCODE_X   ObjMgr_SeqUtil
65 
66 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)67 BEGIN_SCOPE(objects)
68 BEGIN_SCOPE(sequence)
69 
70 
71 TSeqPos GetLength(const CSeq_id& id, CScope* scope)
72 {
73     if ( !scope ) {
74         return numeric_limits<TSeqPos>::max();
75     }
76 #if 0
77     CBioseq_Handle hnd = scope->GetBioseqHandle(id);
78     if ( !hnd ) {
79         return numeric_limits<TSeqPos>::max();
80     }
81     return hnd.GetBioseqLength();
82 #else
83 	return scope->GetSequenceLength(id);
84 #endif
85 }
86 
87 
GetLength(const CSeq_loc & loc,CScope * scope)88 TSeqPos GetLength(const CSeq_loc& loc, CScope* scope)
89 {
90     switch (loc.Which()) {
91     case CSeq_loc::e_Pnt:
92         return 1;
93     case CSeq_loc::e_Int:
94         return loc.GetInt().GetLength();
95     case CSeq_loc::e_Null:
96     case CSeq_loc::e_Empty:
97         return 0;
98     case CSeq_loc::e_Whole:
99         return GetLength(loc.GetWhole(), scope);
100     case CSeq_loc::e_Packed_int:
101         return loc.GetPacked_int().GetLength();
102     case CSeq_loc::e_Mix:
103         return GetLength(loc.GetMix(), scope);
104     case CSeq_loc::e_Packed_pnt:   // just a bunch of points
105         return TSeqPos(loc.GetPacked_pnt().GetPoints().size());
106     case CSeq_loc::e_Bond:         // return number of points
107         return loc.GetBond().IsSetB() + loc.GetBond().IsSetA();
108     case CSeq_loc::e_not_set:      //can't calculate length
109     case CSeq_loc::e_Feat:
110     case CSeq_loc::e_Equiv:        // unless actually the same length...
111     default:
112         NCBI_THROW(CObjmgrUtilException, eUnknownLength,
113             "Unable to determine length");
114     }
115 }
116 
117 
118 namespace {
119     struct SCoverageCollector {
SCoverageCollector__anone342bb930111::SCoverageCollector120         SCoverageCollector(const CSeq_loc& loc, CScope* scope)
121             {
122                 Add(loc, scope);
123             }
Add__anone342bb930111::SCoverageCollector124         void Add(const CSeq_loc& loc, CScope* scope)
125             {
126                 switch (loc.Which()) {
127                 case CSeq_loc::e_Pnt:
128                     Add(loc.GetPnt());
129                     return;
130                 case CSeq_loc::e_Int:
131                     Add(loc.GetInt());
132                     return;
133                 case CSeq_loc::e_Null:
134                 case CSeq_loc::e_Empty:
135                     return;
136                 case CSeq_loc::e_Whole:
137                     AddWhole(loc.GetWhole(), scope);
138                     return;
139                 case CSeq_loc::e_Mix:
140                     Add(loc.GetMix(), scope);
141                     return;
142                 case CSeq_loc::e_Packed_int:
143                     Add(loc.GetPacked_int());
144                     return;
145                 case CSeq_loc::e_Packed_pnt:
146                     Add(loc.GetPacked_pnt());
147                     return;
148                 case CSeq_loc::e_Bond:
149                     Add(loc.GetBond().GetA());
150                     if ( loc.GetBond().IsSetB() ) {
151                         Add(loc.GetBond().GetB());
152                     }
153                     return;
154                 case CSeq_loc::e_not_set: //can't calculate coverage
155                 case CSeq_loc::e_Feat:
156                 case CSeq_loc::e_Equiv: // unless actually the same length...
157                 default:
158                     NCBI_THROW(CObjmgrUtilException, eUnknownLength,
159                                "Unable to determine coverage");
160                 }
161             }
Add__anone342bb930111::SCoverageCollector162         void Add(const CSeq_id_Handle& idh, TSeqPos from, TSeqPos to)
163             {
164                 m_Intervals[idh] += TRange(from, to);
165             }
Add__anone342bb930111::SCoverageCollector166         void Add(const CSeq_id& id, TSeqPos from, TSeqPos to)
167             {
168                 Add(CSeq_id_Handle::GetHandle(id), from, to);
169             }
AddWhole__anone342bb930111::SCoverageCollector170         void AddWhole(const CSeq_id& id, CScope* scope)
171             {
172                 Add(id, 0, GetLength(id, scope)-1);
173             }
Add__anone342bb930111::SCoverageCollector174         void Add(const CSeq_point& seq_pnt)
175             {
176                 Add(seq_pnt.GetId(), seq_pnt.GetPoint(), seq_pnt.GetPoint());
177             }
Add__anone342bb930111::SCoverageCollector178         void Add(const CSeq_interval& seq_int)
179             {
180                 Add(seq_int.GetId(), seq_int.GetFrom(), seq_int.GetTo());
181             }
Add__anone342bb930111::SCoverageCollector182         void Add(const CPacked_seqint& packed_int)
183             {
184                 ITERATE ( CPacked_seqint::Tdata, it, packed_int.Get() ) {
185                     Add(**it);
186                 }
187             }
Add__anone342bb930111::SCoverageCollector188         void Add(const CPacked_seqpnt& packed_pnt)
189             {
190                 CSeq_id_Handle idh =
191                     CSeq_id_Handle::GetHandle(packed_pnt.GetId());
192                 ITERATE(CPacked_seqpnt::TPoints, it, packed_pnt.GetPoints()) {
193                     Add(idh, *it, *it);
194                 }
195             }
Add__anone342bb930111::SCoverageCollector196         void Add(const CSeq_loc_mix& mix, CScope* scope)
197             {
198                 ITERATE ( CSeq_loc_mix::Tdata, it, mix.Get() ) {
199                     Add(**it, scope);
200                 }
201             }
202 
GetCoverage__anone342bb930111::SCoverageCollector203         TSeqPos GetCoverage(void) const
204             {
205                 TSeqPos coverage = 0;
206                 ITERATE ( TIntervals, it, m_Intervals ) {
207                     ITERATE ( TRanges, it2, it->second ) {
208                         coverage += it2->GetLength();
209                     }
210                 }
211                 return coverage;
212             }
213 
214     private:
215         typedef CRange<TSeqPos> TRange;
216         typedef CRangeCollection<TSeqPos> TRanges;
217         typedef map<CSeq_id_Handle, TRanges> TIntervals;
218         TIntervals m_Intervals;
219     };
220 }
221 
222 
GetCoverage(const CSeq_loc & loc,CScope * scope)223 TSeqPos GetCoverage(const CSeq_loc& loc, CScope* scope)
224 {
225     switch (loc.Which()) {
226     case CSeq_loc::e_Pnt:
227         return 1;
228     case CSeq_loc::e_Int:
229         return loc.GetInt().GetLength();
230     case CSeq_loc::e_Null:
231     case CSeq_loc::e_Empty:
232         return 0;
233     case CSeq_loc::e_Whole:
234         return GetLength(loc.GetWhole(), scope);
235     case CSeq_loc::e_Packed_int:
236     case CSeq_loc::e_Mix:
237     case CSeq_loc::e_Packed_pnt:
238     case CSeq_loc::e_Bond:
239         return SCoverageCollector(loc, scope).GetCoverage();
240     case CSeq_loc::e_not_set:      // can't calculate length
241     case CSeq_loc::e_Feat:
242     case CSeq_loc::e_Equiv:        // unless actually the same length...
243     default:
244         NCBI_THROW(CObjmgrUtilException, eUnknownLength,
245             "Unable to determine length");
246     }
247 }
248 
249 
GetLength(const CSeq_loc_mix & mix,CScope * scope)250 TSeqPos GetLength(const CSeq_loc_mix& mix, CScope* scope)
251 {
252     TSeqPos length = 0;
253 
254     ITERATE( CSeq_loc_mix::Tdata, i, mix.Get() ) {
255         TSeqPos ret = GetLength((**i), scope);
256         if (ret < numeric_limits<TSeqPos>::max()) {
257             length += ret;
258         }
259     }
260     return length;
261 }
262 
263 
IsValid(const CSeq_point & pt,CScope * scope)264 bool IsValid(const CSeq_point& pt, CScope* scope)
265 {
266     if (static_cast<TSeqPos>(pt.GetPoint()) >=
267          GetLength(pt.GetId(), scope) )
268     {
269         return false;
270     }
271 
272     return true;
273 }
274 
275 
IsValid(const CPacked_seqpnt & pts,CScope * scope)276 bool IsValid(const CPacked_seqpnt& pts, CScope* scope)
277 {
278     TSeqPos length = GetLength(pts.GetId(), scope);
279     ITERATE (CPacked_seqpnt::TPoints, it, pts.GetPoints()) {
280         if (*it >= length) {
281             return false;
282         }
283     }
284     return true;
285 }
286 
287 
IsValid(const CSeq_interval & interval,CScope * scope)288 bool IsValid(const CSeq_interval& interval, CScope* scope)
289 {
290     if (interval.GetFrom() > interval.GetTo() ||
291         interval.GetTo() >= GetLength(interval.GetId(), scope))
292     {
293         return false;
294     }
295 
296     return true;
297 }
298 
299 
IsSameBioseq(const CSeq_id & id1,const CSeq_id & id2,CScope * scope,CScope::EGetBioseqFlag get_flag)300 bool IsSameBioseq(const CSeq_id& id1, const CSeq_id& id2, CScope* scope,
301                   CScope::EGetBioseqFlag get_flag)
302 {
303     if ( id1.Match(id2) ) {
304         return true;
305     }
306     return IsSameBioseq(CSeq_id_Handle::GetHandle(id1),
307                         CSeq_id_Handle::GetHandle(id2),
308                         scope, get_flag);
309 }
310 
311 
IsSameBioseq(const CSeq_id_Handle & id1,const CSeq_id_Handle & id2,CScope * scope,CScope::EGetBioseqFlag get_flag)312 bool IsSameBioseq(const CSeq_id_Handle& id1, const CSeq_id_Handle& id2, CScope* scope,
313                   CScope::EGetBioseqFlag get_flag)
314 {
315     // Compare CSeq_ids directly
316     if (id1 == id2) {
317         return true;
318     }
319 
320     // Compare handles
321     return scope && scope->IsSameBioseq(id1, id2, get_flag);
322 }
323 
324 
s_GetId(const CSeq_loc & loc,CScope * scope,string * msg=NULL)325 static const CSeq_id* s_GetId(const CSeq_loc& loc, CScope* scope,
326                               string* msg = NULL)
327 {
328     const CSeq_id* sip = NULL;
329     if (msg != NULL) {
330         msg->erase();
331     }
332 
333     for (CSeq_loc_CI it(loc, CSeq_loc_CI::eEmpty_Allow); it; ++it) {
334         const CSeq_id& id = it.GetSeq_id();
335         if (id.Which() == CSeq_id::e_not_set) {
336             continue;
337         }
338         if (sip == NULL) {
339             sip = &id;
340         } else {
341             if (!IsSameBioseq(*sip, id, scope)) {
342                 if (msg != NULL) {
343                     *msg = "Location contains segments on more than one bioseq.";
344                 }
345                 sip = NULL;
346                 break;
347             }
348         }
349     }
350 
351     if (sip == NULL  &&  msg != NULL  &&  msg->empty()) {
352         *msg = "Location contains no IDs.";
353     }
354 
355     return sip;
356 }
357 
358 
GetId(const CSeq_loc & loc,CScope * scope)359 const CSeq_id& GetId(const CSeq_loc& loc, CScope* scope)
360 {
361     string msg;
362     const CSeq_id* sip = s_GetId(loc, scope, &msg);
363 
364     if (sip == NULL) {
365         NCBI_THROW(CObjmgrUtilException, eNotUnique, msg);
366     }
367 
368     return *sip;
369 }
370 
371 
GetIdHandle(const CSeq_loc & loc,CScope * scope)372 CSeq_id_Handle GetIdHandle(const CSeq_loc& loc, CScope* scope)
373 {
374     CSeq_id_Handle retval;
375 
376     try {
377         if (!loc.IsNull()) {
378             retval = CSeq_id_Handle::GetHandle(GetId(loc, scope));
379         }
380     } catch (CObjmgrUtilException&) {}
381 
382     return retval;
383 }
384 
385 
IsOneBioseq(const CSeq_loc & loc,CScope * scope)386 bool IsOneBioseq(const CSeq_loc& loc, CScope* scope)
387 {
388     return loc.GetId() != NULL || s_GetId(loc, scope) != NULL;
389 }
390 
391 
392 inline
s_GetStrand(const CSeq_loc & loc)393 static ENa_strand s_GetStrand(const CSeq_loc& loc)
394 {
395     switch (loc.Which()) {
396     case CSeq_loc::e_Bond:
397         {
398             const CSeq_bond& bond = loc.GetBond();
399             ENa_strand a_strand = bond.GetA().IsSetStrand() ?
400                 bond.GetA().GetStrand() : eNa_strand_unknown;
401             ENa_strand b_strand = eNa_strand_unknown;
402             if ( bond.IsSetB() ) {
403                 b_strand = bond.GetB().IsSetStrand() ?
404                     bond.GetB().GetStrand() : eNa_strand_unknown;
405             }
406 
407             if ( a_strand == eNa_strand_unknown  &&
408                  b_strand != eNa_strand_unknown ) {
409                 a_strand = b_strand;
410             } else if ( a_strand != eNa_strand_unknown  &&
411                         b_strand == eNa_strand_unknown ) {
412                 b_strand = a_strand;
413             }
414 
415             return (a_strand != b_strand) ? eNa_strand_other : a_strand;
416         }
417     case CSeq_loc::e_Whole:
418         return eNa_strand_both;
419     case CSeq_loc::e_Int:
420         return loc.GetInt().IsSetStrand() ? loc.GetInt().GetStrand() :
421             eNa_strand_unknown;
422     case CSeq_loc::e_Pnt:
423         return loc.GetPnt().IsSetStrand() ? loc.GetPnt().GetStrand() :
424             eNa_strand_unknown;
425     case CSeq_loc::e_Packed_pnt:
426         return loc.GetPacked_pnt().IsSetStrand() ?
427             loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
428     case CSeq_loc::e_Packed_int:
429     {
430         ENa_strand strand = eNa_strand_unknown;
431         bool strand_set = false;
432         ITERATE(CPacked_seqint::Tdata, i, loc.GetPacked_int().Get()) {
433             ENa_strand istrand = (*i)->IsSetStrand() ? (*i)->GetStrand() :
434                 eNa_strand_unknown;
435             if (strand == eNa_strand_unknown  &&  istrand == eNa_strand_plus) {
436                 strand = eNa_strand_plus;
437                 strand_set = true;
438             } else if (strand == eNa_strand_plus  &&
439                 istrand == eNa_strand_unknown) {
440                 strand_set = true;
441             } else if (!strand_set) {
442                 strand = istrand;
443                 strand_set = true;
444             } else if (istrand != strand) {
445                 return eNa_strand_other;
446             }
447         }
448         return strand;
449     }
450     case CSeq_loc::e_Mix:
451     {
452         ENa_strand strand = eNa_strand_unknown;
453         bool strand_set = false;
454         ITERATE(CSeq_loc_mix::Tdata, it, loc.GetMix().Get()) {
455             if ((*it)->IsNull()  ||  (*it)->IsEmpty()) {
456                 continue;
457             }
458             ENa_strand istrand = GetStrand(**it);
459             if (strand == eNa_strand_unknown  &&  istrand == eNa_strand_plus) {
460                 strand = eNa_strand_plus;
461                 strand_set = true;
462             } else if (strand == eNa_strand_plus  &&
463                 istrand == eNa_strand_unknown) {
464                 strand_set = true;
465             } else if (!strand_set) {
466                 strand = istrand;
467                 strand_set = true;
468             } else if (istrand != strand) {
469                 return eNa_strand_other;
470             }
471         }
472         return strand;
473     }
474     default:
475         return eNa_strand_unknown;
476     }
477 }
478 
479 
480 
GetStrand(const CSeq_loc & loc,CScope * scope)481 ENa_strand GetStrand(const CSeq_loc& loc, CScope* scope)
482 {
483     switch (loc.Which()) {
484     case CSeq_loc::e_Int:
485         if (loc.GetInt().IsSetStrand()) {
486             return loc.GetInt().GetStrand();
487         }
488         break;
489 
490     case CSeq_loc::e_Whole:
491         return eNa_strand_both;
492 
493     case CSeq_loc::e_Pnt:
494         if (loc.GetPnt().IsSetStrand()) {
495             return loc.GetPnt().GetStrand();
496         }
497         break;
498 
499     case CSeq_loc::e_Packed_pnt:
500         return loc.GetPacked_pnt().IsSetStrand() ?
501             loc.GetPacked_pnt().GetStrand() : eNa_strand_unknown;
502 
503     default:
504         if (!IsOneBioseq(loc, scope)) {
505             return eNa_strand_unknown;  // multiple bioseqs
506         } else {
507             return s_GetStrand(loc);
508         }
509     }
510 
511     /// default to unknown strand
512     return eNa_strand_unknown;
513 }
514 
515 
GetStart(const CSeq_loc & loc,CScope * scope,ESeqLocExtremes ext)516 TSeqPos GetStart(const CSeq_loc& loc, CScope* scope, ESeqLocExtremes ext)
517 {
518     // Throw CObjmgrUtilException if loc does not represent one CBioseq
519     GetId(loc, scope);
520 
521     return loc.GetStart(ext);
522 }
523 
524 
GetStop(const CSeq_loc & loc,CScope * scope,ESeqLocExtremes ext)525 TSeqPos GetStop(const CSeq_loc& loc, CScope* scope, ESeqLocExtremes ext)
526 {
527     // Throw CObjmgrUtilException if loc does not represent one CBioseq
528     GetId(loc, scope);
529 
530     if (loc.IsWhole()  &&  scope != NULL) {
531         CBioseq_Handle seq = GetBioseqFromSeqLoc(loc, *scope);
532         if (seq) {
533             return seq.GetBioseqLength() - 1;
534         }
535     }
536     return loc.GetStop(ext);
537 }
538 
539 
ChangeSeqId(CSeq_id * id,bool best,CScope * scope)540 void ChangeSeqId(CSeq_id* id, bool best, CScope* scope)
541 {
542     // Return if no scope
543     if (!scope  ||  !id) {
544         return;
545     }
546 
547     // Get CBioseq represented by *id
548     CBioseq_Handle bsh = scope->GetBioseqHandle(*id);
549     if (!bsh) {
550         return;
551     }
552     CBioseq_Handle::TBioseqCore seq =
553         bsh.GetBioseqCore();
554 
555     // Get pointer to the best/worst id of *seq
556     const CSeq_id* tmp_id;
557     if (best) {
558         tmp_id = FindBestChoice(seq->GetId(), CSeq_id::BestRank).GetPointer();
559     } else {
560         tmp_id = FindBestChoice(seq->GetId(), CSeq_id::WorstRank).GetPointer();
561     }
562 
563     // Change the contents of *id to that of *tmp_id
564     id->Reset();
565     id->Assign(*tmp_id);
566 }
567 
568 
ChangeSeqLocId(CSeq_loc * loc,bool best,CScope * scope)569 void ChangeSeqLocId(CSeq_loc* loc, bool best, CScope* scope)
570 {
571     if (!scope) {
572         return;
573     }
574 
575     for (CTypeIterator<CSeq_id> id(Begin(*loc)); id; ++id) {
576         ChangeSeqId(&(*id), best, scope);
577     }
578 }
579 
580 
BadSeqLocSortOrder(const CBioseq_Handle & bsh,const CSeq_loc & loc)581 bool BadSeqLocSortOrder
582 (const CBioseq_Handle& bsh,
583  const CSeq_loc& loc)
584 {
585     try {
586         CSeq_loc_Mapper mapper (bsh, CSeq_loc_Mapper::eSeqMap_Up);
587         CConstRef<CSeq_loc> mapped_loc = mapper.Map(loc);
588         if (!mapped_loc) {
589             return false;
590         }
591 
592         // Check that loc segments are in order
593         CSeq_loc::TRange last_range;
594         bool first = true;
595         for (CSeq_loc_CI lit(*mapped_loc); lit; ++lit) {
596             if (first) {
597                 last_range = lit.GetRange();
598                 first = false;
599                 continue;
600             }
601             if (lit.GetStrand() == eNa_strand_minus) {
602                 if (last_range.GetTo() < lit.GetRange().GetTo()) {
603                     return true;
604                 }
605             } else {
606                 if (last_range.GetFrom() > lit.GetRange().GetFrom()) {
607                     return true;
608                 }
609             }
610             last_range = lit.GetRange();
611         }
612     } catch (CException) {
613         // exception will be thrown if references far sequence and not remote fetching
614     }
615     return false;
616 }
617 
618 
BadSeqLocSortOrder(const CBioseq & seq,const CSeq_loc & loc,CScope * scope)619 bool BadSeqLocSortOrder
620 (const CBioseq&  seq,
621  const CSeq_loc& loc,
622  CScope*         scope)
623 {
624     if (scope) {
625         return BadSeqLocSortOrder (scope->GetBioseqHandle(seq), loc);
626     } else {
627         return false;
628     }
629 }
630 
631 
SeqLocCheck(const CSeq_loc & loc,CScope * scope)632 ESeqLocCheck SeqLocCheck(const CSeq_loc& loc, CScope* scope)
633 {
634     ESeqLocCheck rtn = eSeqLocCheck_ok;
635 
636     ENa_strand strand = GetStrand(loc, scope);
637     if (strand == eNa_strand_unknown  ||  strand == eNa_strand_other) {
638         rtn = eSeqLocCheck_warning;
639     }
640 
641     CTypeConstIterator<CSeq_loc> lit(ConstBegin(loc));
642     for (;lit; ++lit) {
643         switch (lit->Which()) {
644         case CSeq_loc::e_Int:
645             if (!IsValid(lit->GetInt(), scope)) {
646                 rtn = eSeqLocCheck_error;
647             }
648             break;
649         case CSeq_loc::e_Packed_int:
650         {
651             CTypeConstIterator<CSeq_interval> sit(ConstBegin(*lit));
652             for(;sit; ++sit) {
653                 if (!IsValid(*sit, scope)) {
654                     rtn = eSeqLocCheck_error;
655                     break;
656                 }
657             }
658             break;
659         }
660         case CSeq_loc::e_Pnt:
661             if (!IsValid(lit->GetPnt(), scope)) {
662                 rtn = eSeqLocCheck_error;
663             }
664             break;
665         case CSeq_loc::e_Packed_pnt:
666             if (!IsValid(lit->GetPacked_pnt(), scope)) {
667                 rtn = eSeqLocCheck_error;
668             }
669             break;
670         default:
671             break;
672         }
673     }
674     return rtn;
675 }
676 
677 
LocationOffset(const CSeq_loc & outer,const CSeq_loc & inner,EOffsetType how,CScope * scope)678 TSeqPos LocationOffset(const CSeq_loc& outer, const CSeq_loc& inner,
679                        EOffsetType how, CScope* scope)
680 {
681     SRelLoc rl(outer, inner, scope);
682     if (rl.m_Ranges.empty()) {
683         return (TSeqPos)-1;
684     }
685     bool want_reverse = false;
686     {{
687         bool outer_is_reverse = IsReverse(GetStrand(outer, scope));
688         switch (how) {
689         case eOffset_FromStart:
690             want_reverse = false;
691             break;
692         case eOffset_FromEnd:
693             want_reverse = true;
694             break;
695         case eOffset_FromLeft:
696             want_reverse = outer_is_reverse;
697             break;
698         case eOffset_FromRight:
699             want_reverse = !outer_is_reverse;
700             break;
701         }
702     }}
703     if (want_reverse) {
704         // Subtract 1 to compensate comparing 0-based coordinates to a
705         // 1-based length.
706         return GetLength(outer, scope) - rl.m_Ranges.back()->GetTo() - 1;
707     } else {
708         return rl.m_Ranges.front()->GetFrom();
709     }
710 }
711 
712 
SeqIntPartialCheck(const CSeq_interval & itv,unsigned int & retval,bool is_first,bool is_last,CScope & scope)713 void SeqIntPartialCheck(const CSeq_interval& itv,
714                         unsigned int& retval,
715                         bool is_first,
716                         bool is_last,
717                         CScope& scope)
718 {
719     if (itv.IsSetFuzz_from()) {
720         const CInt_fuzz& fuzz = itv.GetFuzz_from();
721         if (fuzz.Which() == CInt_fuzz::e_Lim) {
722             CInt_fuzz::ELim lim = fuzz.GetLim();
723             if (lim == CInt_fuzz::eLim_gt) {
724                 retval |= eSeqlocPartial_Limwrong;
725             } else if (lim == CInt_fuzz::eLim_lt  ||
726                 lim == CInt_fuzz::eLim_unk) {
727                 if (itv.IsSetStrand()  &&
728                     itv.GetStrand() == eNa_strand_minus) {
729                     if ( is_last ) {
730                         retval |= eSeqlocPartial_Stop;
731                     } else {
732                         retval |= eSeqlocPartial_Internal;
733                     }
734                     if (itv.GetFrom() != 0) {
735                         if ( is_last ) {
736                             retval |= eSeqlocPartial_Nostop;
737                         } else {
738                             retval |= eSeqlocPartial_Nointernal;
739                         }
740                     }
741                 } else {
742                     if ( is_first ) {
743                         retval |= eSeqlocPartial_Start;
744                     } else {
745                         retval |= eSeqlocPartial_Internal;
746                     }
747                     if (itv.GetFrom() != 0) {
748                         if ( is_first ) {
749                             retval |= eSeqlocPartial_Nostart;
750                         } else {
751                             retval |= eSeqlocPartial_Nointernal;
752                         }
753                     }
754                 }
755             }
756         } else if (fuzz.Which() == CInt_fuzz::e_Range) {
757             // range
758             if (itv.IsSetStrand()  &&   itv.GetStrand() == eNa_strand_minus) {
759                 if (is_last) {
760                     retval |= eSeqlocPartial_Stop;
761                 }
762             } else {
763                 if (is_first) {
764                     retval |= eSeqlocPartial_Start;
765                 }
766             }
767         }
768     }
769 
770     if (itv.IsSetFuzz_to()) {
771         const CInt_fuzz& fuzz = itv.GetFuzz_to();
772         CInt_fuzz::ELim lim = fuzz.IsLim() ?
773             fuzz.GetLim() : CInt_fuzz::eLim_unk;
774         if (lim == CInt_fuzz::eLim_lt) {
775             retval |= eSeqlocPartial_Limwrong;
776         } else if (lim == CInt_fuzz::eLim_gt  ||
777             lim == CInt_fuzz::eLim_unk) {
778             CBioseq_Handle hnd =
779                 scope.GetBioseqHandle(itv.GetId());
780             bool miss_end = false;
781             if ( hnd ) {
782                 if (itv.GetTo() != hnd.GetBioseqLength() - 1) {
783                     miss_end = true;
784                 }
785             }
786             if (itv.IsSetStrand()  &&
787                 itv.GetStrand() == eNa_strand_minus) {
788                 if ( is_first ) {
789                     retval |= eSeqlocPartial_Start;
790                 } else {
791                     retval |= eSeqlocPartial_Internal;
792                 }
793                 if (miss_end) {
794                     if ( is_first /* was last */) {
795                         retval |= eSeqlocPartial_Nostart;
796                     } else {
797                         retval |= eSeqlocPartial_Nointernal;
798                     }
799                 }
800             } else {
801                 if ( is_last ) {
802                     retval |= eSeqlocPartial_Stop;
803                 } else {
804                     retval |= eSeqlocPartial_Internal;
805                 }
806                 if ( miss_end ) {
807                     if ( is_last ) {
808                         retval |= eSeqlocPartial_Nostop;
809                     } else {
810                         retval |= eSeqlocPartial_Nointernal;
811                     }
812                 }
813             }
814         }
815     }
816 }
817 
818 
SeqLocPartialCheck(const CSeq_loc & loc,CScope * scope)819 int SeqLocPartialCheck(const CSeq_loc& loc, CScope* scope)
820 {
821     unsigned int retval = 0;
822     if (!scope) {
823         return retval;
824     }
825 
826     // Find first and last Seq-loc
827     const CSeq_loc *first = 0, *last = 0;
828     for ( CSeq_loc_CI loc_iter(loc); loc_iter; ++loc_iter ) {
829         if ( first == 0 ) {
830             first = &(loc_iter.GetEmbeddingSeq_loc());
831         }
832         last = &(loc_iter.GetEmbeddingSeq_loc());
833     }
834     if (!first) {
835         return retval;
836     }
837 
838     CSeq_loc_CI i2(loc, CSeq_loc_CI::eEmpty_Allow);
839     while ( i2 ) {
840         const CSeq_loc* slp = &(i2.GetEmbeddingSeq_loc());
841         switch (slp->Which()) {
842         case CSeq_loc::e_Null:
843             if (slp == first) {
844                 retval |= eSeqlocPartial_Start;
845             } else if (slp == last) {
846                 retval |= eSeqlocPartial_Stop;
847             } else {
848                 retval |= eSeqlocPartial_Internal;
849             }
850             break;
851         case CSeq_loc::e_Int:
852             {
853                 SeqIntPartialCheck(slp->GetInt(), retval,
854                     slp == first, slp == last, *scope);
855             }
856             break;
857         case CSeq_loc::e_Packed_int:
858             {
859                 const CPacked_seqint::Tdata& ints = slp->GetPacked_int().Get();
860                 const CSeq_interval* first_int =
861                     ints.empty() ? 0 : ints.front().GetPointer();
862                 const CSeq_interval* last_int =
863                     ints.empty() ? 0 : ints.back().GetPointer();
864                 ITERATE(CPacked_seqint::Tdata, it, ints) {
865                     SeqIntPartialCheck(**it, retval,
866                         slp == first  &&  *it == first_int,
867                         slp == last  &&  *it == last_int,
868                         *scope);
869                     ++i2;
870                 }
871                 continue;
872             }
873         case CSeq_loc::e_Pnt:
874             if (slp->GetPnt().IsSetFuzz()) {
875                 const CInt_fuzz& fuzz = slp->GetPnt().GetFuzz();
876                 if (fuzz.Which() == CInt_fuzz::e_Lim) {
877                     CInt_fuzz::ELim lim = fuzz.GetLim();
878                     if (lim == CInt_fuzz::eLim_gt  ||
879                         lim == CInt_fuzz::eLim_lt  ||
880                         lim == CInt_fuzz::eLim_unk) {
881                         if (slp == first) {
882                             retval |= eSeqlocPartial_Start;
883                         } else if (slp == last) {
884                             retval |= eSeqlocPartial_Stop;
885                         } else {
886                             retval |= eSeqlocPartial_Internal;
887                         }
888                     }
889                 }
890             }
891             break;
892         case CSeq_loc::e_Packed_pnt:
893             if (slp->GetPacked_pnt().IsSetFuzz()) {
894                 const CInt_fuzz& fuzz = slp->GetPacked_pnt().GetFuzz();
895                 if (fuzz.Which() == CInt_fuzz::e_Lim) {
896                     CInt_fuzz::ELim lim = fuzz.GetLim();
897                     if (lim == CInt_fuzz::eLim_gt  ||
898                         lim == CInt_fuzz::eLim_lt  ||
899                         lim == CInt_fuzz::eLim_unk) {
900                         if (slp == first) {
901                             retval |= eSeqlocPartial_Start;
902                         } else if (slp == last) {
903                             retval |= eSeqlocPartial_Stop;
904                         } else {
905                             retval |= eSeqlocPartial_Internal;
906                         }
907                     }
908                 }
909             }
910             break;
911         case CSeq_loc::e_Whole:
912         {
913             // Get the Bioseq referred to by Whole
914             CBioseq_Handle bsh = scope->GetBioseqHandle(slp->GetWhole());
915             if ( !bsh ) {
916                 break;
917             }
918             // Check for CMolInfo on the biodseq
919             CSeqdesc_CI di( bsh, CSeqdesc::e_Molinfo );
920             if ( !di ) {
921                 // If no CSeq_descr, nothing can be done
922                 break;
923             }
924             // First try to loop through CMolInfo
925             const CMolInfo& mi = di->GetMolinfo();
926             if (!mi.IsSetCompleteness()) {
927                 ++i2;
928                 continue;
929             }
930             switch (mi.GetCompleteness()) {
931             case CMolInfo::eCompleteness_no_left:
932                 if (slp == first) {
933                     retval |= eSeqlocPartial_Start;
934                 } else {
935                     retval |= eSeqlocPartial_Internal;
936                 }
937                 break;
938             case CMolInfo::eCompleteness_no_right:
939                 if (slp == last) {
940                     retval |= eSeqlocPartial_Stop;
941                 } else {
942                     retval |= eSeqlocPartial_Internal;
943                 }
944                 break;
945             case CMolInfo::eCompleteness_partial:
946                 retval |= eSeqlocPartial_Other;
947                 break;
948             case CMolInfo::eCompleteness_no_ends:
949                 retval |= eSeqlocPartial_Start;
950                 retval |= eSeqlocPartial_Stop;
951                 break;
952             default:
953                 break;
954             }
955             break;
956         }
957         default:
958             break;
959         }
960         ++i2;
961     }
962     return retval;
963 }
964 
965 /////////////////////////////////////////////////////////////////////
966 //
967 //  Implementation of SeqLocRevCmpl()
968 //
969 
SeqLocRevCmpl(const CSeq_loc & loc,CScope *)970 CSeq_loc* SeqLocRevCmpl(const CSeq_loc& loc, CScope*)
971 {
972     CReverseComplementHelper helper;
973     return GetReverseComplement( loc, &helper );
974 }
975 
976 /////////////////////////////////////////////////////////////////////
977 //
978 //  Implementation of Compare()
979 //
980 
981 
982 typedef map<CSeq_id_Handle, CSeq_id_Handle> TSynMap;
983 
984 // Map the id to it's 'main' synonym.
s_GetSynHandle(CSeq_id_Handle id,TSynMap & syns,CScope * scope)985 CSeq_id_Handle s_GetSynHandle(CSeq_id_Handle id, TSynMap& syns, CScope* scope)
986 {
987     TSynMap::const_iterator syn_it = syns.find(id);
988     if (syn_it != syns.end()) {
989         // known id
990         return syn_it->second;
991     }
992     // Unknown id, try to find a match
993     ITERATE(TSynMap, sit, syns) {
994         if ( IsSameBioseq(sit->first, id, scope) ) {
995             // Found a synonym
996             CSeq_id_Handle map_to = sit->second;
997             syns[id] = map_to;
998             return map_to;
999         }
1000     }
1001     syns[id] = id;
1002     return id;
1003 }
1004 
1005 
1006 typedef CRange<TSeqPos>  TRangeInfo;
1007 typedef list<TRangeInfo> TRangeInfoList;
1008 typedef map<CSeq_id_Handle, TRangeInfoList> TRangeInfoMap;
1009 
1010 // Convert the seq-loc to TRangeInfos. The id map is used to
1011 // normalize ids.
s_SeqLocToRangeInfoMap(const CSeq_loc & loc,TRangeInfoMap & infos,TSynMap & syns,CScope * scope)1012 void s_SeqLocToRangeInfoMap(const CSeq_loc& loc,
1013                             TRangeInfoMap&  infos,
1014                             TSynMap&        syns,
1015                             CScope*         scope)
1016 {
1017     CSeq_loc_CI it(loc,
1018         CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Positional);
1019     for ( ; it; ++it) {
1020         TRangeInfo info;
1021         if ( it.IsWhole() ) {
1022             info.SetOpen(0, GetLength(it.GetSeq_id(), scope));
1023         }
1024         else {
1025             info.SetOpen(it.GetRange().GetFrom(), it.GetRange().GetToOpen());
1026         }
1027         CSeq_id_Handle id = s_GetSynHandle(it.GetSeq_id_Handle(), syns, scope);
1028         infos[id].push_back(info);
1029     }
1030     NON_CONST_ITERATE(TRangeInfoMap, info, infos) {
1031         info->second.sort();
1032     }
1033 }
1034 
1035 
s_CompareOverlapping(const CSeq_loc & me,const CSeq_loc & you,TSynMap & syns,CScope * scope)1036 ECompare s_CompareOverlapping(const CSeq_loc& me,
1037                               const CSeq_loc& you,
1038                               TSynMap&        syns,
1039                               CScope*         scope)
1040 {
1041     TRangeInfoMap me_infos, you_infos;
1042     s_SeqLocToRangeInfoMap(me, me_infos, syns, scope);
1043     s_SeqLocToRangeInfoMap(you, you_infos, syns, scope);
1044 
1045     // Check if locs are equal. The ranges are sorted now, so the original
1046     // order is not important.
1047     if (me_infos.size() == you_infos.size()) {
1048         bool equal = true;
1049         TRangeInfoMap::const_iterator mid_it = me_infos.begin();
1050         TRangeInfoMap::const_iterator yid_it = you_infos.begin();
1051         for ( ; mid_it != me_infos.end(); ++mid_it, ++yid_it) {
1052             _ASSERT(yid_it != you_infos.end());
1053             if (mid_it->first != yid_it->first  ||
1054                 mid_it->second.size() != yid_it->second.size()) {
1055                 equal = false;
1056                 break;
1057             }
1058             TRangeInfoList::const_iterator mit = mid_it->second.begin();
1059             TRangeInfoList::const_iterator yit = yid_it->second.begin();
1060             for ( ; mit != mid_it->second.end(); ++mit, ++yit) {
1061                 _ASSERT(yit != yid_it->second.end());
1062                 if (*mit != *yit) {
1063                     equal = false;
1064                     break;
1065                 }
1066             }
1067             if ( !equal ) break;
1068         }
1069         if ( equal ) {
1070             return eSame;
1071         }
1072     }
1073 
1074     // Check if 'me' is contained or overlapping with 'you'.
1075     bool me_contained = true;
1076     bool overlap = false;
1077 
1078     ITERATE(TRangeInfoMap, mid_it, me_infos) {
1079         TRangeInfoMap::const_iterator yid_it = you_infos.find(mid_it->first);
1080         if (yid_it == you_infos.end()) {
1081             // The id is missing from 'you'.
1082             me_contained = false;
1083             if ( overlap ) {
1084                 break; // nothing else to check
1085             }
1086             continue; // check for overlap
1087         }
1088         ITERATE(TRangeInfoList, mit, mid_it->second) {
1089             // current range is contained in 'you'?
1090             bool mit_contained = false;
1091             ITERATE(TRangeInfoList, yit, yid_it->second) {
1092                 if (yit->GetToOpen() > mit->GetFrom()  &&
1093                     yit->GetFrom() < mit->GetToOpen()) {
1094                     overlap = true;
1095                     if (yit->GetFrom() <= mit->GetFrom()  &&
1096                         yit->GetToOpen() >= mit->GetToOpen()) {
1097                         mit_contained = true;
1098                         break;
1099                     }
1100                 }
1101             }
1102             if ( !mit_contained ) {
1103                 me_contained = false;
1104                 if ( overlap ) break; // nothing else to check
1105             }
1106         }
1107         if (!me_contained  &&  overlap) {
1108             break; // nothing else to check
1109         }
1110     }
1111 
1112     // Reverse check: if 'you' is contained in 'me'.
1113     bool you_contained = true;
1114 
1115     ITERATE(TRangeInfoMap, yid_it, you_infos) {
1116         TRangeInfoMap::const_iterator mid_it = me_infos.find(yid_it->first);
1117         if (mid_it == me_infos.end()) {
1118             // The id is missing from 'me'.
1119             you_contained = false;
1120             break; // nothing else to check
1121         }
1122         ITERATE(TRangeInfoList, yit, yid_it->second) {
1123             // current range is contained in 'me'?
1124             bool yit_contained = false;
1125             ITERATE(TRangeInfoList, mit, mid_it->second) {
1126                 if (mit->GetFrom() <= yit->GetFrom()  &&
1127                     mit->GetToOpen() >= yit->GetToOpen()) {
1128                     yit_contained = true;
1129                     break;
1130                 }
1131             }
1132             if ( !yit_contained ) {
1133                 you_contained = false;
1134                 break;
1135             }
1136         }
1137         if ( !you_contained ) {
1138             break; // nothing else to check
1139         }
1140     }
1141 
1142     // Always prefere 'eContains' over 'eContained'.
1143     if ( you_contained ) {
1144         return eContains;
1145     }
1146     if ( me_contained ) {
1147         return eContained;
1148     }
1149     return overlap ? eOverlap : eNoOverlap;
1150 }
1151 
1152 
s_CheckAbutting(const CSeq_loc & loc1,const CSeq_loc & loc2,TSynMap & syns,CScope * scope,ESeqLocExtremes ext)1153 bool s_CheckAbutting(const CSeq_loc& loc1,
1154                      const CSeq_loc& loc2,
1155                      TSynMap&        syns,
1156                      CScope*         scope,
1157                      ESeqLocExtremes ext)
1158 {
1159     // Compare biological end of loc1 and start of loc2.
1160     // If not abutting, use Compare.
1161     CSeq_loc_CI it1_last(loc1,
1162         CSeq_loc_CI::eEmpty_Allow,
1163         ext == eExtreme_Positional ?
1164         CSeq_loc_CI::eOrder_Positional : CSeq_loc_CI::eOrder_Biological);
1165     it1_last.SetPos(it1_last.GetSize() - 1);
1166     CSeq_loc_CI it2_first(loc2,
1167         CSeq_loc_CI::eEmpty_Allow,
1168         ext == eExtreme_Positional ?
1169         CSeq_loc_CI::eOrder_Positional : CSeq_loc_CI::eOrder_Biological);
1170 
1171     CSeq_id_Handle id1_last = s_GetSynHandle(it1_last.GetSeq_id_Handle(), syns, scope);
1172     CSeq_id_Handle id2_first = s_GetSynHandle(it2_first.GetSeq_id_Handle(), syns, scope);
1173 
1174     // Empty and whole locations can not abut.
1175     if (it1_last.IsEmpty()  ||  it2_first.IsEmpty()  ||
1176         it1_last.IsWhole()  ||  it2_first.IsWhole()  ||
1177         id1_last != id2_first) {
1178         return false;
1179     }
1180 
1181     if (ext == eExtreme_Positional) {
1182         return it1_last.GetRange().GetToOpen() == it2_first.GetRange().GetFrom();
1183     }
1184 
1185     // Plus strand
1186     if (!IsReverse(it1_last.GetStrand())  &&
1187         !IsReverse(it2_first.GetStrand())  &&
1188         it1_last.GetRange().GetToOpen() == it2_first.GetRange().GetFrom()) {
1189         return true;
1190     }
1191     // Minus strand
1192     if (IsReverse(it1_last.GetStrand())  &&
1193         IsReverse(it2_first.GetStrand()) &&
1194         it1_last.GetRange().GetFrom() == it2_first.GetRange().GetToOpen()) {
1195         return true;
1196     }
1197 
1198     return false;
1199 }
1200 
1201 
Compare(const CSeq_loc & me,const CSeq_loc & you,CScope * scope)1202 ECompare Compare(const CSeq_loc& me,
1203                  const CSeq_loc& you,
1204                  CScope*         scope)
1205 {
1206     TSynMap syns;
1207     return s_CompareOverlapping(me, you, syns, scope);
1208 }
1209 
1210 
Compare(const CSeq_loc & loc1,const CSeq_loc & loc2,CScope * scope,TCompareFlags flags)1211 ECompare Compare(const CSeq_loc& loc1,
1212                  const CSeq_loc& loc2,
1213                  CScope*         scope,
1214                  TCompareFlags   flags)
1215 {
1216     TSynMap syns;
1217 
1218     bool abut = false;
1219     ECompare ret = eNoOverlap;
1220 
1221     if (flags & fCompareAbutting) {
1222         abut = s_CheckAbutting(loc1, loc2, syns, scope,
1223             (flags & fComparePositional)
1224             ? eExtreme_Positional : eExtreme_Biological);
1225     }
1226     if (flags & fCompareOverlapping) {
1227         ret = s_CompareOverlapping(loc1, loc2, syns, scope);
1228     }
1229     if ( abut ) {
1230         if (ret == eNoOverlap) {
1231             ret = eAbutting;
1232         }
1233         else {
1234             ret = eAbutAndOverlap;
1235         }
1236     }
1237     return ret;
1238 }
1239 
1240 
1241 /////////////////////////////////////////////////////////////////////
1242 //
1243 //  Implementation of TestForOverlap()
1244 //
1245 
s_Test_Strands(ENa_strand strand1,ENa_strand strand2)1246 bool s_Test_Strands(ENa_strand strand1, ENa_strand strand2)
1247 {
1248     // Check strands. Overlapping rules for strand:
1249     //   - equal strands overlap
1250     //   - "both" overlaps with any other
1251     //   - "unknown" overlaps with any other except "minus"
1252     //   - "other" indicates mixed strands and needs a closer look
1253     if (strand1 == eNa_strand_other  ||  strand2 == eNa_strand_other) {
1254         return false;
1255     }
1256     return strand1 == strand2
1257         || strand1 == eNa_strand_both
1258         || strand2 == eNa_strand_both
1259         || (strand1 == eNa_strand_unknown  && strand2 != eNa_strand_minus)
1260         || (strand2 == eNa_strand_unknown  && strand1 != eNa_strand_minus);
1261 }
1262 
1263 
1264 inline
AbsInt8(Int8 x)1265 Int8 AbsInt8(Int8 x)
1266 {
1267     return x < 0 ? -x : x;
1268 }
1269 
1270 
1271 typedef pair<TRangeInfo, TRangeInfo> TRangeInfoByStrand;
1272 typedef map<CSeq_id_Handle, TRangeInfoByStrand> TTotalRangeInfoMap;
1273 
1274 // Convert the seq-loc to TTotalRangeInfoMap, which stores total
1275 // ranges by id and strand. The id map is used to normalize ids.
s_SeqLocToTotalRangeInfoMap(const CSeq_loc & loc,TTotalRangeInfoMap & infos,TSynMap & syns,CScope * scope)1276 void s_SeqLocToTotalRangeInfoMap(const CSeq_loc&     loc,
1277                                  TTotalRangeInfoMap& infos,
1278                                  TSynMap&            syns,
1279                                  CScope*             scope)
1280 {
1281     CSeq_loc_CI it(loc,
1282         CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Positional);
1283     for ( ; it; ++it) {
1284         TRangeInfo rg;
1285         if ( it.IsWhole() ) {
1286             rg.SetOpen(0, GetLength(it.GetSeq_id(), scope));
1287         }
1288         else {
1289             rg.SetOpen(it.GetRange().GetFrom(), it.GetRange().GetToOpen());
1290         }
1291         CSeq_id_Handle id = s_GetSynHandle(it.GetSeq_id_Handle(), syns, scope);
1292         if ( IsReverse(it.GetStrand()) ) {
1293             infos[id].second.CombineWith(rg);
1294         }
1295         else {
1296             infos[id].first.CombineWith(rg);
1297         }
1298     }
1299 }
1300 
1301 
TestForOverlap(const CSeq_loc & loc1,const CSeq_loc & loc2,EOverlapType type,TSeqPos circular_len,CScope * scope)1302 int TestForOverlap(const CSeq_loc& loc1,
1303                    const CSeq_loc& loc2,
1304                    EOverlapType type,
1305                    TSeqPos circular_len,
1306                    CScope* scope)
1307 {
1308     Int8 ret = TestForOverlap64(loc1, loc2, type, circular_len, scope);
1309     return ret <= kMax_Int ? int(ret) : kMax_Int;
1310 }
1311 
1312 
1313 typedef pair<TRangeInfoList, TRangeInfoList> TRangeInfoListByStrand;
1314 typedef map<CSeq_id_Handle, TRangeInfoListByStrand> TRangeInfoMapByStrand;
1315 
1316 // Convert the seq-loc to TRangeInfos for each strand. The id map is used to
1317 // normalize ids.
s_SeqLocToRangeInfoMapByStrand(const CSeq_loc & loc,TRangeInfoMapByStrand & infos,TSynMap & syns,CScope * scope)1318 void s_SeqLocToRangeInfoMapByStrand(const CSeq_loc&         loc,
1319                                     TRangeInfoMapByStrand&  infos,
1320                                     TSynMap&                syns,
1321                                     CScope*                 scope)
1322 {
1323     CSeq_loc_CI it(loc,
1324         CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Positional);
1325     for ( ; it; ++it) {
1326         TRangeInfo info;
1327         if ( it.IsWhole() ) {
1328             info.SetOpen(0, GetLength(it.GetSeq_id(), scope));
1329         }
1330         else {
1331             info.SetOpen(it.GetRange().GetFrom(), it.GetRange().GetToOpen());
1332         }
1333         CSeq_id_Handle id = s_GetSynHandle(it.GetSeq_id_Handle(), syns, scope);
1334         if (it.IsSetStrand()  &&  IsReverse(it.GetStrand())) {
1335             infos[id].second.push_back(info);
1336         }
1337         else {
1338             infos[id].first.push_back(info);
1339         }
1340     }
1341     NON_CONST_ITERATE(TRangeInfoMapByStrand, info, infos) {
1342         info->second.first.sort();
1343         info->second.second.sort();
1344     }
1345 }
1346 
1347 
1348 struct STopologyInfo
1349 {
1350     bool    circular;
1351     TSeqPos length;
1352 };
1353 
1354 typedef map<CSeq_id_Handle, STopologyInfo> TTopologyMap;
1355 
s_GetTopology(CSeq_id_Handle idh,TTopologyMap & topologies,TOverlapFlags flags,CScope * scope)1356 STopologyInfo s_GetTopology(CSeq_id_Handle idh,
1357                             TTopologyMap&  topologies,
1358                             TOverlapFlags flags,
1359                             CScope*       scope)
1360 {
1361     TTopologyMap::const_iterator found = topologies.find(idh);
1362     if (found != topologies.end()) {
1363         return found->second;
1364     }
1365     STopologyInfo info;
1366     info.circular = false;
1367     info.length = kInvalidSeqPos;
1368     if ( scope ) {
1369         CBioseq_Handle bh = scope->GetBioseqHandle(idh);
1370         if ( bh ) {
1371             if ((flags & fOverlap_IgnoreTopology) == 0) {
1372                 info.circular = (bh.IsSetInst_Topology()  &&
1373                     bh.GetInst_Topology() == CSeq_inst::eTopology_circular);
1374             }
1375             info.length = bh.GetBioseqLength();
1376         }
1377     }
1378     topologies[idh] = info;
1379     return info;
1380 }
1381 
1382 
1383 // Convert the seq-loc to TRangeInfos using extremes for each bioseq,
1384 // strand and ordered set of ranges. The id map is used to normalize ids.
s_SeqLocToTotalRangesInfoMapByStrand(const CSeq_loc & loc,TRangeInfoMapByStrand & infos,TSynMap & syns,TTopologyMap & topologies,TOverlapFlags flags,CScope * scope)1385 void s_SeqLocToTotalRangesInfoMapByStrand(const CSeq_loc&         loc,
1386                                           TRangeInfoMapByStrand&  infos,
1387                                           TSynMap&                syns,
1388                                           TTopologyMap&           topologies,
1389                                           TOverlapFlags           flags,
1390                                           CScope*                 scope)
1391 {
1392     CSeq_loc_CI it(loc,
1393         CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Biological);
1394     if ( !it ) return;
1395 
1396     CSeq_id_Handle last_id = s_GetSynHandle(it.GetSeq_id_Handle(), syns, scope);
1397     TRangeInfo last_rg;
1398     bool last_reverse = it.IsSetStrand()  &&  IsReverse(it.GetStrand());
1399     // In case of circular bioseq allow to cross zero only once.
1400     bool crossed_zero = false;
1401 
1402     TRangeInfo total_range;
1403     for ( ; it; ++it) {
1404         CSeq_id_Handle id = s_GetSynHandle(it.GetSeq_id_Handle(), syns, scope);
1405         TRangeInfo rg = it.GetRange();
1406         STopologyInfo topo =
1407             s_GetTopology(id, topologies, flags, scope);
1408         bool reverse = it.IsSetStrand()  &&  IsReverse(it.GetStrand());
1409         bool break_range = reverse != last_reverse  ||  id != last_id;
1410 
1411         bool bad_order = false;
1412         // Don't try to check the order if id or strand have just changed.
1413         // Also don't check it for the first range.
1414         if ( !break_range  &&  !last_rg.Empty() ) {
1415             if ( reverse ) {
1416                 if (rg.GetFrom() > last_rg.GetFrom()) {
1417                     bad_order = true;
1418                     if ( topo.circular ) {
1419                         if ( !crossed_zero ) {
1420                             total_range.SetFrom(0);
1421                         }
1422                         crossed_zero = true;
1423                     }
1424                 }
1425             }
1426             else {
1427                 if (rg.GetFrom() < last_rg.GetFrom()) {
1428                     bad_order = true;
1429                     if ( topo.circular ) {
1430                         if ( !crossed_zero ) {
1431                             total_range.SetToOpen(topo.length != kInvalidSeqPos ?
1432                                 topo.length : TRangeInfo::GetWholeToOpen());
1433                             crossed_zero = true;
1434                         }
1435                     }
1436                 }
1437             }
1438         }
1439         if (break_range  ||  bad_order) {
1440             // Push the next total range, start the new one
1441             if ( last_reverse ) {
1442                 infos[last_id].second.push_back(total_range);
1443             }
1444             else {
1445                 infos[last_id].first.push_back(total_range);
1446             }
1447             total_range = TRangeInfo::GetEmpty();
1448             if ( crossed_zero ) {
1449                 if ( reverse ) {
1450                     rg.SetToOpen(topo.length != kInvalidSeqPos ?
1451                         topo.length : TRangeInfo::GetWholeToOpen());
1452                 }
1453                 else {
1454                     rg.SetFrom(0);
1455                 }
1456             }
1457             crossed_zero = false;
1458         }
1459         last_rg = rg;
1460         total_range.CombineWith(rg);
1461         last_id = id;
1462         last_reverse = reverse;
1463     }
1464     if ( !total_range.Empty() ) {
1465         if ( last_reverse ) {
1466             infos[last_id].second.push_back(total_range);
1467         }
1468         else {
1469             infos[last_id].first.push_back(total_range);
1470         }
1471     }
1472     NON_CONST_ITERATE(TRangeInfoMapByStrand, info, infos) {
1473         info->second.first.sort();
1474         info->second.second.sort();
1475     }
1476 }
1477 
1478 
s_GetUncoveredLength(const TRangeInfoList & ranges1,const TRangeInfoList & ranges2)1479 Int8 s_GetUncoveredLength(const TRangeInfoList& ranges1,
1480                           const TRangeInfoList& ranges2)
1481 {
1482     Int8 diff = 0;
1483     ITERATE(TRangeInfoList, rg_it1, ranges1) {
1484         TRangeInfo rg = *rg_it1;
1485         ITERATE(TRangeInfoList, rg_it2, ranges2) {
1486             if (rg_it2->GetFrom() > rg.GetTo()) break;
1487             if ( !rg.IntersectingWith(*rg_it2) ) continue;
1488             if (rg_it2->GetFrom() > rg.GetFrom()) {
1489                 diff += static_cast<Int8>(rg_it2->GetFrom() - rg.GetFrom());
1490             }
1491             if (rg_it2->GetTo() < rg.GetTo()) {
1492                 rg.SetFrom(rg_it2->GetToOpen());
1493             }
1494             else {
1495                 rg = TRangeInfo::GetEmpty();
1496                 break;
1497             }
1498         }
1499         if (rg.IsWhole()) return numeric_limits<Int8>::max();
1500         diff += rg.GetLength();
1501     }
1502     return diff;
1503 }
1504 
1505 
1506 // Calculate sum of all subranges from ranges1 not covered by ranges2.
s_GetUncoveredLength(const TRangeInfoMapByStrand & ranges1,const TRangeInfoMapByStrand & ranges2)1507 Int8 s_GetUncoveredLength(const TRangeInfoMapByStrand& ranges1,
1508                           const TRangeInfoMapByStrand& ranges2)
1509 {
1510     Int8 diff = 0;
1511     ITERATE(TRangeInfoMapByStrand, id_it1, ranges1) {
1512         TRangeInfoMapByStrand::const_iterator id_it2 = ranges2.find(id_it1->first);
1513         if (id_it2 != ranges2.end()) {
1514             Int8 diff_plus = s_GetUncoveredLength(id_it1->second.first, id_it2->second.first);
1515             Int8 diff_minus = s_GetUncoveredLength(id_it1->second.second, id_it2->second.second);
1516             if (diff_plus == numeric_limits<Int8>::max()) return diff_plus;
1517             if (diff_minus == numeric_limits<Int8>::max()) return diff_minus;
1518             diff += diff_plus + diff_minus;
1519         }
1520         else {
1521             ITERATE(TRangeInfoList, rg_it, id_it1->second.first) {
1522                 if (rg_it->IsWhole()) return numeric_limits<Int8>::max();
1523                 diff += rg_it->GetLength();
1524             }
1525             ITERATE(TRangeInfoList, rg_it, id_it1->second.second) {
1526                 if (rg_it->IsWhole()) return numeric_limits<Int8>::max();
1527                 diff += rg_it->GetLength();
1528             }
1529         }
1530     }
1531     return diff;
1532 }
1533 
1534 
s_Test_Subset(const CSeq_loc & loc1,const CSeq_loc & loc2,CScope * scope)1535 bool s_Test_Subset(const CSeq_loc& loc1,
1536                    const CSeq_loc& loc2,
1537                    CScope*         scope)
1538 {
1539     TSynMap syns;
1540     TRangeInfoMapByStrand rm1, rm2;
1541     s_SeqLocToRangeInfoMapByStrand(loc1, rm1, syns, scope);
1542     s_SeqLocToRangeInfoMapByStrand(loc2, rm2, syns, scope);
1543     ITERATE(TRangeInfoMapByStrand, id_it2, rm2) {
1544         // For each id from loc2 find an entry in loc1.
1545         TRangeInfoMapByStrand::iterator id_it1 = rm1.find(id_it2->first);
1546         if (id_it1 == rm1.end()) {
1547             return false; // unmatched id in loc2
1548         }
1549         const TRangeInfoList& rglist1_plus = id_it1->second.first;
1550         const TRangeInfoList& rglist1_minus = id_it1->second.second;
1551         const TRangeInfoList& rglist2_plus = id_it2->second.first;
1552         const TRangeInfoList& rglist2_minus = id_it2->second.second;
1553         ITERATE(TRangeInfoList, it2, rglist2_plus) {
1554             bool contained = false;
1555             ITERATE(TRangeInfoList, it1, rglist1_plus) {
1556                 // Already missed the rage on loc2?
1557                 if (!contained  &&  it1->GetFrom() > it2->GetFrom()) {
1558                     return false;
1559                 }
1560                 // found a contaning range?
1561                 if (it1->IsWhole()  ||
1562                     (it1->GetFrom() <= it2->GetFrom()  &&
1563                     it1->GetTo() >= it2->GetTo())) {
1564                     contained = true;
1565                     break;
1566                 }
1567             }
1568             if ( !contained ) return false;
1569         }
1570         ITERATE(TRangeInfoList, it2, rglist2_minus) {
1571             bool contained = false;
1572             ITERATE(TRangeInfoList, it1, rglist1_minus) {
1573                 // Already missed the rage on loc2?
1574                 if (!contained  &&  it1->GetFrom() > it2->GetFrom()) {
1575                     return false;
1576                 }
1577                 // found a contaning range?
1578                 if (it1->IsWhole()  ||
1579                     (it1->GetFrom() <= it2->GetFrom()  &&
1580                     it1->GetTo() >= it2->GetTo())) {
1581                     contained = true;
1582                     break;
1583                 }
1584             }
1585             if ( !contained ) return false;
1586         }
1587     }
1588     return true;
1589 }
1590 
1591 
s_Test_CheckIntervals(CSeq_loc_CI it1,CSeq_loc_CI it2,bool minus_strand,CScope * scope,bool single_id)1592 bool s_Test_CheckIntervals(CSeq_loc_CI it1,
1593                            CSeq_loc_CI it2,
1594                            bool minus_strand,
1595                            CScope* scope,
1596                            bool single_id)
1597 {
1598     // Check intervals one by one
1599     while ( it1  &&  it2 ) {
1600         bool same_it_id = single_id;
1601         if ( !same_it_id ) {
1602             if ( !IsSameBioseq(it1.GetSeq_id(), it2.GetSeq_id(), scope) ) {
1603                 return false;
1604             }
1605         }
1606         if ( !s_Test_Strands(it1.GetStrand(), it2.GetStrand()) ) {
1607             return false;
1608         }
1609         if ( minus_strand ) {
1610             if (it1.GetRange().GetFrom()  !=  it2.GetRange().GetFrom() ) {
1611                 // The last interval from loc2 may be shorter than the
1612                 // current interval from loc1
1613                 if (it1.GetRange().GetFrom() > it2.GetRange().GetFrom()  ||
1614                     ++it2) {
1615                     return false;
1616                 }
1617                 break;
1618             }
1619         }
1620         else {
1621             if (it1.GetRange().GetTo()  !=  it2.GetRange().GetTo() ) {
1622                 // The last interval from loc2 may be shorter than the
1623                 // current interval from loc1
1624                 if (it1.GetRange().GetTo() < it2.GetRange().GetTo()  ||
1625                     ++it2) {
1626                     return false;
1627                 }
1628                 break;
1629             }
1630         }
1631         // Go to the next interval start
1632         if ( !(++it2) ) {
1633             break;
1634         }
1635         if ( !(++it1) ) {
1636             return false; // loc1 has not enough intervals
1637         }
1638         if ( minus_strand ) {
1639             if (it1.GetRange().GetTo() != it2.GetRange().GetTo()) {
1640                 return false;
1641             }
1642         }
1643         else {
1644             if (it1.GetRange().GetFrom() != it2.GetRange().GetFrom()) {
1645                 return false;
1646             }
1647         }
1648     }
1649     return true;
1650 }
1651 
1652 
1653 // Test for overlap using extremes rather than ranges (simple, contained).
1654 // Used for multi-id, multi-strand and out-of-order locations.
s_Test_Extremes(const CSeq_loc & loc1,const CSeq_loc & loc2,EOverlapType type,TSynMap & syns,TTopologyMap & topologies,TOverlapFlags flags,CScope * scope)1655 Int8 s_Test_Extremes(const CSeq_loc& loc1,
1656                      const CSeq_loc& loc2,
1657                      EOverlapType    type,
1658                      TSynMap&        syns,
1659                      TTopologyMap&   topologies,
1660                      TOverlapFlags   flags,
1661                      CScope*         scope)
1662 {
1663     // Here we accept only two overlap types.
1664     _ASSERT(type == eOverlap_Simple  ||  type == eOverlap_Contained);
1665 
1666     TRangeInfoMapByStrand ranges1, ranges2;
1667 
1668     s_SeqLocToTotalRangesInfoMapByStrand(loc1, ranges1, syns, topologies, flags, scope);
1669     s_SeqLocToTotalRangesInfoMapByStrand(loc2, ranges2, syns, topologies, flags, scope);
1670 
1671     bool overlap = false;
1672     ITERATE(TRangeInfoMapByStrand, id_it2, ranges2) {
1673         TRangeInfoMapByStrand::const_iterator id_it1 = ranges1.find(id_it2->first);
1674         if (id_it1 == ranges1.end()) {
1675             if (type == eOverlap_Contained) {
1676                 // loc2 has parts not contained in loc1
1677                 return -1;
1678             }
1679             else { // eOverlap_Simple
1680                 continue; // next id_it2
1681             }
1682         }
1683         // Found the same id in loc1 - check ranges.
1684         // Plus strand
1685         ITERATE(TRangeInfoList, rg_it2, id_it2->second.first) {
1686             bool contained = false;
1687             ITERATE(TRangeInfoList, rg_it1, id_it1->second.first) {
1688                 if ( !rg_it2->IntersectingWith(*rg_it1) ) {
1689                     // Ranges are sorted, we can stop as soon as
1690                     // we go beyond the right end of rg2.
1691                     if (rg_it2->GetTo() < rg_it1->GetFrom()) break;
1692                     continue;
1693                 }
1694                 overlap = true;
1695                 if (type == eOverlap_Contained) {
1696                     if (rg_it2->GetFrom() >= rg_it1->GetFrom()  &&
1697                         rg_it2->GetTo() <= rg_it1->GetTo()) {
1698                         contained = true;
1699                         break;
1700                     }
1701                 }
1702                 else { // eOverlap_Simple
1703                     break; // found overlap, go to next range from loc2
1704                 }
1705             }
1706             if (type == eOverlap_Contained) {
1707                 if ( !contained ) return -1;
1708             }
1709             else if ( overlap ) break;
1710         }
1711         // Munis strand
1712         ITERATE(TRangeInfoList, rg_it2, id_it2->second.second) {
1713             bool contained = false;
1714             ITERATE(TRangeInfoList, rg_it1, id_it1->second.second) {
1715                 if ( !rg_it2->IntersectingWith(*rg_it1) ) {
1716                     // Ranges are sorted, we can stop as soon as
1717                     // we go beyond the right end of rg2.
1718                     if (rg_it2->GetTo() < rg_it1->GetFrom()) break;
1719                     continue;
1720                 }
1721                 overlap = true;
1722                 if (type == eOverlap_Contained) {
1723                     if (rg_it2->GetFrom() >= rg_it1->GetFrom()  &&
1724                         rg_it2->GetTo() <= rg_it1->GetTo()) {
1725                         contained = true;
1726                         break;
1727                     }
1728                 }
1729                 else { // eOverlap_Simple
1730                     break; // found overlap, go to next range from loc2
1731                 }
1732             }
1733             if (type == eOverlap_Contained) {
1734                 if ( !contained ) return -1;
1735             }
1736             else if ( overlap ) break;
1737         }
1738     }
1739 
1740     // Now it's time to calculate quality of the overlap. Take into
1741     // account that a single range from one location may contain/overlap
1742     // multiple ranges from another location.
1743     if (type == eOverlap_Contained) {
1744         // There should be no subranges in ranges2 not covered by ranges1.
1745         return s_GetUncoveredLength(ranges1, ranges2);
1746     }
1747     if (type == eOverlap_Simple  &&  overlap) {
1748         Int8 diff1 = s_GetUncoveredLength(ranges1, ranges2);
1749         Int8 diff2 = s_GetUncoveredLength(ranges2, ranges1);
1750         if (diff1 == numeric_limits<Int8>::max()) return diff1;
1751         if (diff2 == numeric_limits<Int8>::max()) return diff2;
1752         return diff1 + diff2;
1753     }
1754     return -1;
1755 }
1756 
1757 
s_Test_Interval(const CSeq_loc & loc1,const CSeq_loc & loc2,TSynMap & syns,TTopologyMap & topologies,TOverlapFlags flags,CScope * scope)1758 Int8 s_Test_Interval(const CSeq_loc& loc1,
1759                      const CSeq_loc& loc2,
1760                      TSynMap&        syns,
1761                      TTopologyMap&   topologies,
1762                      TOverlapFlags   flags,
1763                      CScope*         scope)
1764 {
1765     TRangeInfoMapByStrand ranges1, ranges2;
1766 
1767     s_SeqLocToRangeInfoMapByStrand(loc1, ranges1, syns, scope);
1768     s_SeqLocToRangeInfoMapByStrand(loc2, ranges2, syns, scope);
1769 
1770     bool overlap = false;
1771     ITERATE(TRangeInfoMapByStrand, id_it1, ranges1) {
1772         TRangeInfoMapByStrand::const_iterator id_it2 = ranges2.find(id_it1->first);
1773         if (id_it2 == ranges2.end()) continue;
1774         // Plus strand ranges
1775         ITERATE(TRangeInfoList, rg_it1, id_it1->second.first) {
1776             ITERATE(TRangeInfoList, rg_it2, id_it2->second.first) {
1777                 if ( rg_it1->IntersectingWith(*rg_it2) ) {
1778                     overlap = true;
1779                     break;
1780                 }
1781             }
1782             if ( overlap ) break;
1783         }
1784         if ( overlap ) break;
1785         // Minus strand ranges
1786         ITERATE(TRangeInfoList, rg_it1, id_it1->second.second) {
1787             ITERATE(TRangeInfoList, rg_it2, id_it2->second.second) {
1788                 if ( rg_it1->IntersectingWith(*rg_it2) ) {
1789                     overlap = true;
1790                     break;
1791                 }
1792             }
1793             if ( overlap ) break;
1794         }
1795         if ( overlap ) break;
1796     }
1797 
1798     if ( !overlap ) return -1;
1799 
1800     ranges1.clear();
1801     ranges2.clear();
1802     s_SeqLocToTotalRangesInfoMapByStrand(loc1, ranges1,
1803         syns, topologies, flags, scope);
1804     s_SeqLocToTotalRangesInfoMapByStrand(loc2, ranges2,
1805         syns, topologies, flags, scope);
1806 
1807     Int8 diff1 = s_GetUncoveredLength(ranges1, ranges2);
1808     Int8 diff2 = s_GetUncoveredLength(ranges2, ranges1);
1809     if (diff1 == numeric_limits<Int8>::max()) return diff1;
1810     if (diff2 == numeric_limits<Int8>::max()) return diff2;
1811     return diff1 + diff2;
1812 }
1813 
1814 
s_TestForOverlapEx(const CSeq_loc & loc1,const CSeq_loc & loc2,EOverlapType type,TOverlapFlags flags,TSeqPos circular_len,CScope * scope)1815 Int8 s_TestForOverlapEx(const CSeq_loc& loc1,
1816                         const CSeq_loc& loc2,
1817                         EOverlapType    type,
1818                         TOverlapFlags   flags,
1819                         TSeqPos         circular_len,
1820                         CScope*         scope)
1821 {
1822     if (circular_len == 0) {
1823         circular_len = kInvalidSeqPos;
1824     }
1825     // Do not allow conflicting values.
1826     if (circular_len != kInvalidSeqPos  &&  (flags & fOverlap_IgnoreTopology)) {
1827         NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
1828             "Circular length can not be combined with no-topology flag.");
1829     }
1830 
1831     const CSeq_loc* ploc1 = &loc1;
1832     const CSeq_loc* ploc2 = &loc2;
1833     const CSeq_id *id1 = NULL;
1834     const CSeq_id *id2 = NULL;
1835     id1 = loc1.GetId();
1836     id2 = loc2.GetId();
1837     TSynMap syns;
1838     TTopologyMap topologies;
1839     bool single_seq = true;
1840 
1841     // Get seq-ids. They should be cached by GetTotalRange() above and should
1842     // not be null.
1843     if (id1  &&  id2) {
1844         if ( !IsSameBioseq(*id1, *id2, scope) ) return -1;
1845         // Use known id and topology if there's just one sequence.
1846         CSeq_id_Handle idh1 = CSeq_id_Handle::GetHandle(*id1);
1847         CSeq_id_Handle idh2 = CSeq_id_Handle::GetHandle(*id2);
1848         syns[idh1] = idh1;
1849         if (idh2 != idh1) {
1850             syns[idh2] = idh1;
1851         }
1852         if (circular_len != kInvalidSeqPos) {
1853             STopologyInfo topo;
1854             topo.circular = true;
1855             topo.length = circular_len;
1856             topologies[idh1] = topo;
1857         }
1858     }
1859     else {
1860         if (flags & fOverlap_NoMultiSeq) {
1861             NCBI_THROW(CObjmgrUtilException, eBadLocation,
1862                 "Multi-bioseq locations are disabled by the flags.");
1863         }
1864         // Multi-id locations - no circular_len allowed
1865         if ( circular_len != kInvalidSeqPos &&
1866              type != eOverlap_Subset ) {
1867             NCBI_THROW(CObjmgrUtilException, eBadLocation,
1868                 "Circular bioseq length can not be specified "
1869                 "for multi-bioseq locations.");
1870         }
1871         single_seq = false;
1872     }
1873 
1874     // Shortcut - if strands do not intersect, don't even look at the ranges.
1875     ENa_strand strand1 = GetStrand(loc1);
1876     ENa_strand strand2 = GetStrand(loc2);
1877     if ( !s_Test_Strands(strand1, strand2) ) {
1878         // For multi-seq strand is unknown rather than other -
1879         // can not use this test.
1880         if (single_seq  &&
1881             strand1 != eNa_strand_other && strand2 != eNa_strand_other ) {
1882             // singular but incompatible strands
1883             return -1;
1884         }
1885         // There is a possibility of multiple strands that needs to be
1886         // checked too (if allowed by the flags).
1887         if (flags & fOverlap_NoMultiStrand) {
1888             NCBI_THROW(CObjmgrUtilException, eBadLocation,
1889                 "Multi-strand locations are disabled by the flags.");
1890         }
1891     }
1892 
1893     switch (type) {
1894     case eOverlap_Simple:
1895         return s_Test_Extremes(loc1, loc2, eOverlap_Simple,
1896             syns, topologies, flags, scope);
1897     case eOverlap_Contains:
1898         swap(ploc1, ploc2);
1899         // Go on to the next case
1900     case eOverlap_Contained:
1901         return s_Test_Extremes(*ploc1, *ploc2, eOverlap_Contained,
1902             syns, topologies, flags, scope);
1903     case eOverlap_SubsetRev:
1904         swap(ploc1, ploc2);
1905         // continue to eOverlap_Subset case
1906     case eOverlap_Subset:
1907         if ( !s_Test_Subset(*ploc1, *ploc2, scope) ) return -1;
1908         return Int8(GetCoverage(*ploc1, scope)) -
1909             Int8(GetCoverage(*ploc2, scope));
1910     case eOverlap_CheckIntRev:
1911         swap(ploc1, ploc2);
1912         // Go on to the next case
1913     case eOverlap_CheckIntervals:
1914         {
1915             // Check intervals' boundaries.
1916             CSeq_loc_CI it1(*ploc1);
1917             CSeq_loc_CI it2(*ploc2);
1918             if (!it1  ||  !it2) {
1919                 return -1;
1920             }
1921             TSeqPos loc2start = it2.GetRange().GetFrom();
1922             TSeqPos loc2end = it2.GetRange().GetTo();
1923             bool loc2rev = it2.GetStrand() == eNa_strand_minus;
1924             bool single_id = (id1  &&  id2);
1925             for ( ; it1; ++it1) {
1926                 // If there are multiple ids per seq-loc, check each pair.
1927                 if ( !single_id ) {
1928                     if ( !IsSameBioseq(it1.GetSeq_id(), it2.GetSeq_id(),
1929                         scope) ) continue;
1930                 }
1931                 // Find the first range in loc1 containing the first range
1932                 // of loc2. s_Test_CheckIntervals will do the rest.
1933                 if (it1.GetRange().GetFrom() <= loc2start  &&
1934                     it1.GetRange().GetTo() >= loc2end  &&
1935                     s_Test_CheckIntervals(it1, it2, loc2rev, scope, single_id)) {
1936                     // GetLength adds up all strands/seqs, but for this overlap
1937                     // type it's ok.
1938                     return Int8(GetLength(*ploc1, scope)) -
1939                         Int8(GetLength(*ploc2, scope));
1940                 }
1941             }
1942             return -1;
1943         }
1944     case eOverlap_Interval:
1945         return s_Test_Interval(loc1, loc2, syns, topologies, flags, scope);
1946     }
1947     return -1;
1948 }
1949 
1950 
TestForOverlap64(const CSeq_loc & loc1,const CSeq_loc & loc2,EOverlapType type,TSeqPos circular_len,CScope * scope)1951 Int8 TestForOverlap64(const CSeq_loc& loc1,
1952                       const CSeq_loc& loc2,
1953                       EOverlapType    type,
1954                       TSeqPos         circular_len,
1955                       CScope*         scope)
1956 {
1957     return s_TestForOverlapEx(loc1, loc2, type, fOverlap_Default, circular_len, scope);
1958 }
1959 
1960 
TestForOverlapEx(const CSeq_loc & loc1,const CSeq_loc & loc2,EOverlapType type,CScope * scope,TOverlapFlags flags)1961 Int8 TestForOverlapEx(const CSeq_loc& loc1,
1962                       const CSeq_loc& loc2,
1963                       EOverlapType    type,
1964                       CScope*         scope,
1965                       TOverlapFlags   flags)
1966 {
1967     return s_TestForOverlapEx(loc1, loc2, type, flags, kInvalidSeqPos, scope);
1968 }
1969 
1970 
1971 /////////////////////////////////////////////////////////////////////
1972 //
1973 //  Seq-loc operations
1974 //
1975 
1976 class CDefaultSynonymMapper : public ISynonymMapper
1977 {
1978 public:
1979     CDefaultSynonymMapper(CScope* scope);
1980     virtual ~CDefaultSynonymMapper(void);
1981 
1982     virtual CSeq_id_Handle GetBestSynonym(const CSeq_id& id);
1983 
1984 private:
1985     typedef map<CSeq_id_Handle, CSeq_id_Handle> TSynonymMap;
1986 
1987     CRef<CSeq_id_Mapper> m_IdMapper;
1988     TSynonymMap          m_SynMap;
1989     CScope*              m_Scope;
1990 };
1991 
1992 
CDefaultSynonymMapper(CScope * scope)1993 CDefaultSynonymMapper::CDefaultSynonymMapper(CScope* scope)
1994     : m_IdMapper(CSeq_id_Mapper::GetInstance()),
1995       m_Scope(scope)
1996 {
1997     return;
1998 }
1999 
2000 
~CDefaultSynonymMapper(void)2001 CDefaultSynonymMapper::~CDefaultSynonymMapper(void)
2002 {
2003     return;
2004 }
2005 
2006 
GetBestSynonym(const CSeq_id & id)2007 CSeq_id_Handle CDefaultSynonymMapper::GetBestSynonym(const CSeq_id& id)
2008 {
2009     CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(id);
2010     if ( !m_Scope  ||  id.Which() == CSeq_id::e_not_set ) {
2011         return idh;
2012     }
2013     TSynonymMap::iterator id_syn = m_SynMap.find(idh);
2014     if (id_syn != m_SynMap.end()) {
2015         return id_syn->second;
2016     }
2017     CSeq_id_Handle best;
2018     int best_rank = CSeq_id::kMaxScore;
2019     CConstRef<CSynonymsSet> syn_set = m_Scope->GetSynonyms(idh);
2020 #ifdef _DEBUG
2021     TGi gi = ZERO_GI;
2022 #endif
2023     ITERATE(CSynonymsSet, syn_it, *syn_set) {
2024         CSeq_id_Handle synh = syn_set->GetSeq_id_Handle(syn_it);
2025 #ifdef _DEBUG
2026         if (synh.IsGi()) {
2027             gi = synh.GetGi();
2028         }
2029 #endif
2030         int rank = synh.GetSeqId()->BestRankScore();
2031         if (rank < best_rank) {
2032             best = synh;
2033             best_rank = rank;
2034         }
2035     }
2036     if ( !best ) {
2037         // Synonyms set was empty
2038         m_SynMap[idh] = idh;
2039         return idh;
2040     }
2041     ITERATE(CSynonymsSet, syn_it, *syn_set) {
2042         m_SynMap[syn_set->GetSeq_id_Handle(syn_it)] = best;
2043     }
2044 #ifdef _DEBUG
2045     const CTextseq_id* txt_id = best.GetSeqId()->GetTextseq_Id();
2046     if (txt_id && !txt_id->IsSetVersion() && gi != ZERO_GI) {
2047         ERR_POST("Using version-less accession " << txt_id->GetAccession()
2048             << " instead of GI " << gi);
2049     }
2050 #endif
2051     return best;
2052 }
2053 
2054 
2055 class CDefaultLengthGetter : public ILengthGetter
2056 {
2057 public:
2058     CDefaultLengthGetter(CScope* scope);
2059     virtual ~CDefaultLengthGetter(void);
2060 
2061     virtual TSeqPos GetLength(const CSeq_id& id);
2062 
2063 protected:
2064     CScope*              m_Scope;
2065 };
2066 
2067 
CDefaultLengthGetter(CScope * scope)2068 CDefaultLengthGetter::CDefaultLengthGetter(CScope* scope)
2069     : m_Scope(scope)
2070 {
2071     return;
2072 }
2073 
2074 
~CDefaultLengthGetter(void)2075 CDefaultLengthGetter::~CDefaultLengthGetter(void)
2076 {
2077     return;
2078 }
2079 
2080 
GetLength(const CSeq_id & id)2081 TSeqPos CDefaultLengthGetter::GetLength(const CSeq_id& id)
2082 {
2083     if (id.Which() == CSeq_id::e_not_set) {
2084         return 0;
2085     }
2086     CBioseq_Handle bh;
2087     if ( m_Scope ) {
2088         bh = m_Scope->GetBioseqHandle(id);
2089     }
2090     if ( !bh ) {
2091         NCBI_THROW(CException, eUnknown,
2092             "Can not get length of whole location");
2093     }
2094     return bh.GetBioseqLength();
2095 }
2096 
2097 
Seq_loc_Merge(const CSeq_loc & loc,CSeq_loc::TOpFlags flags,CScope * scope)2098 CRef<CSeq_loc> Seq_loc_Merge(const CSeq_loc&    loc,
2099                              CSeq_loc::TOpFlags flags,
2100                              CScope*            scope)
2101 {
2102     CDefaultSynonymMapper syn_mapper(scope);
2103     return loc.Merge(flags, &syn_mapper);
2104 }
2105 
2106 
Seq_loc_Add(const CSeq_loc & loc1,const CSeq_loc & loc2,CSeq_loc::TOpFlags flags,CScope * scope)2107 CRef<CSeq_loc> Seq_loc_Add(const CSeq_loc&    loc1,
2108                            const CSeq_loc&    loc2,
2109                            CSeq_loc::TOpFlags flags,
2110                            CScope*            scope)
2111 {
2112     CDefaultSynonymMapper syn_mapper(scope);
2113     return loc1.Add(loc2, flags, &syn_mapper);
2114 }
2115 
2116 
Seq_loc_Subtract(const CSeq_loc & loc1,const CSeq_loc & loc2,CSeq_loc::TOpFlags flags,CScope * scope)2117 CRef<CSeq_loc> Seq_loc_Subtract(const CSeq_loc&    loc1,
2118                                 const CSeq_loc&    loc2,
2119                                 CSeq_loc::TOpFlags flags,
2120                                 CScope*            scope)
2121 {
2122     CDefaultSynonymMapper syn_mapper(scope);
2123     CDefaultLengthGetter len_getter(scope);
2124     return loc1.Subtract(loc2, flags, &syn_mapper, &len_getter);
2125 }
2126 
2127 
2128 END_SCOPE(sequence)
2129 END_SCOPE(objects)
2130 END_NCBI_SCOPE
2131