1 /*  $Id: sequence.cpp 629259 2021-04-13 13:28:40Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Clifford Clausen
27 *
28 * File Description:
29 *   Sequence utilities requiring CScope
30 */
31 
32 #include <ncbi_pch.hpp>
33 #include <serial/iterator.hpp>
34 #include <util/static_map.hpp>
35 
36 #include <objmgr/object_manager.hpp>
37 #include <objmgr/scope.hpp>
38 #include <objmgr/seq_vector.hpp>
39 #include <objmgr/seq_vector_ci.hpp>
40 #include <objmgr/seqdesc_ci.hpp>
41 #include <objmgr/feat_ci.hpp>
42 #include <objmgr/bioseq_ci.hpp>
43 #include <objmgr/seq_entry_handle.hpp>
44 #include <objmgr/impl/handle_range_map.hpp>
45 #include <objmgr/impl/synonyms.hpp>
46 #include <objmgr/util/seq_loc_util.hpp>
47 #include <objmgr/util/create_defline.hpp>
48 
49 #include <objects/general/Int_fuzz.hpp>
50 #include <objects/general/Dbtag.hpp>
51 #include <objects/general/Object_id.hpp>
52 #include <objects/general/User_object.hpp>
53 #include <objects/general/User_field.hpp>
54 #include <objects/general/Date.hpp>
55 #include <objects/general/general_macros.hpp>
56 
57 #include <objects/misc/sequence_util_macros.hpp>
58 
59 #include <objects/seq/Bioseq.hpp>
60 #include <objects/seq/Delta_ext.hpp>
61 #include <objects/seq/Delta_seq.hpp>
62 #include <objects/seq/Linkage_evidence.hpp>
63 #include <objects/seq/MolInfo.hpp>
64 #include <objects/seq/Seg_ext.hpp>
65 #include <objects/seq/Seq_ext.hpp>
66 #include <objects/seq/Seq_gap.hpp>
67 #include <objects/seq/Seq_inst.hpp>
68 #include <objects/seq/Seq_literal.hpp>
69 #include <objects/seq/seqport_util.hpp>
70 #include <objects/seq/Seq_hist.hpp>
71 #include <objects/seq/Seq_hist_rec.hpp>
72 #include <objects/seq/seq_macros.hpp>
73 
74 #include <objects/seqloc/Packed_seqpnt.hpp>
75 #include <objects/seqloc/Seq_bond.hpp>
76 #include <objects/seqloc/Seq_id.hpp>
77 #include <objects/seqloc/Seq_interval.hpp>
78 #include <objects/seqloc/Seq_loc.hpp>
79 #include <objects/seqloc/Seq_loc_equiv.hpp>
80 #include <objects/seqloc/Seq_loc_mix.hpp>
81 #include <objects/seqloc/Seq_point.hpp>
82 
83 #include <objects/seqset/Seq_entry.hpp>
84 
85 #include <objects/seqfeat/seqfeat__.hpp>
86 
87 #include <objmgr/seq_loc_mapper.hpp>
88 #include <objmgr/seq_entry_ci.hpp>
89 #include <objmgr/util/sequence.hpp>
90 #include <objmgr/error_codes.hpp>
91 #include <util/strsearch.hpp>
92 
93 #include <list>
94 #include <algorithm>
95 
96 
97 #define NCBI_USE_ERRCODE_X   ObjMgr_SeqUtil
98 
99 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)100 BEGIN_SCOPE(objects)
101 BEGIN_SCOPE(sequence)
102 
103 
104 const CBioSource* GetBioSource(const CBioseq& bioseq)
105 {
106     ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
107     {
108         if ((**it).IsSource())
109             return &(**it).GetSource();
110     }
111 
112     return NULL;
113 }
114 
GetBioSource(const CBioseq_Handle & handle)115 const CBioSource* GetBioSource(const CBioseq_Handle& handle)
116 {
117     {{
118         CSeqdesc_CI desc(handle, CSeqdesc::e_Source);
119         if (desc) {
120             return &desc->GetSource();
121         }
122     }}
123     {{
124         CSeqdesc_CI desc(handle.GetTopLevelEntry(), CSeqdesc::e_Source);
125         if (desc) {
126             return &desc->GetSource();
127         }
128     }}
129 
130     return NULL;
131 }
132 
133 
GetOrg_refOrNull(const CBioseq_Handle & handle)134 const COrg_ref* GetOrg_refOrNull(const CBioseq_Handle& handle)
135 {
136     vector<CSeqdesc::E_Choice> types;
137     types.push_back(CSeqdesc::e_Source);
138     types.push_back(CSeqdesc::e_Org);
139     CSeqdesc_CI desc_it(handle, types);
140     if ( desc_it ) {
141         const CSeqdesc& desc = *desc_it;
142         if ( desc.IsSource() ) {
143             return &desc.GetSource().GetOrg();
144         }
145         if ( desc.IsOrg() ) {
146             return &desc.GetOrg();
147         }
148     }
149     return 0;
150 }
151 
152 
GetOrg_ref(const CBioseq_Handle & handle)153 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle)
154 {
155     const COrg_ref* org_ref = GetOrg_refOrNull(handle);
156     if ( org_ref ) {
157         return *org_ref;
158     }
159     NCBI_THROW(CException, eUnknown, "No organism set");
160 }
161 
162 
GetTaxId(const CBioseq_Handle & handle)163 TTaxId GetTaxId(const CBioseq_Handle& handle)
164 {
165     const COrg_ref* org_ref = GetOrg_refOrNull(handle);
166     if ( org_ref ) {
167         return org_ref->GetTaxId();
168     }
169     return ZERO_TAX_ID;
170 }
171 
172 
GetMolInfo(const CBioseq & bioseq)173 const CMolInfo* GetMolInfo(const CBioseq& bioseq)
174 {
175     ITERATE(CBioseq::TDescr::Tdata, it, bioseq.GetDescr().Get())
176     {
177         if ((**it).IsMolinfo())
178             return &(**it).GetMolinfo();
179     }
180     return NULL;
181 }
182 
183 
GetMolInfo(const CBioseq_Handle & handle)184 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle)
185 {
186     CSeqdesc_CI desc_iter(handle, CSeqdesc::e_Molinfo);
187     for ( ;  desc_iter;  ++desc_iter) {
188         return &desc_iter->GetMolinfo();
189     }
190 
191     return NULL;
192 }
193 
194 
195 
GetBioseqFromSeqLoc(const CSeq_loc & loc,CScope & scope,CScope::EGetBioseqFlag flag)196 CBioseq_Handle GetBioseqFromSeqLoc
197 (const CSeq_loc& loc,
198  CScope& scope,
199  CScope::EGetBioseqFlag flag)
200 {
201     CBioseq_Handle retval;
202 
203     try {
204         if (IsOneBioseq(loc, &scope)) {
205             return scope.GetBioseqHandle(GetId(loc, &scope), flag);
206         }
207 
208         // assuming location is annotated on parts of a segmented bioseq
209         for (CSeq_loc_CI it(loc); it; ++it) {
210             CBioseq_Handle part = scope.GetBioseqHandle(it.GetSeq_id(), flag);
211             if (part) {
212                 retval = GetParentForPart(part);
213             }
214             break;  // check only the first part
215         }
216 
217         // if multiple intervals and not parts, look for the first loaded bioseq
218         if (!retval) {
219             for (CSeq_loc_CI it(loc); it; ++it) {
220                 retval =
221                     scope.GetBioseqHandle(it.GetSeq_id_Handle(), CScope::eGetBioseq_Loaded);
222                 if (retval) {
223                     break;
224                 }
225             }
226         }
227 
228         if (!retval  &&  flag == CScope::eGetBioseq_All) {
229             for (CSeq_loc_CI it(loc); it; ++it) {
230                 retval =
231                     scope.GetBioseqHandle(it.GetSeq_id_Handle(), flag);
232                 if (retval) {
233                     break;
234                 }
235             }
236         }
237     } catch (exception&) {
238         retval.Reset();
239     }
240 
241     return retval;
242 }
243 
244 
GetProteinName(const CBioseq_Handle & seq)245 string GetProteinName(const CBioseq_Handle& seq)
246 {
247     if ( !seq ) {
248         NCBI_THROW(CObjMgrException, eInvalidHandle,
249                    "GetProteinName: "
250                    "null handle");
251     }
252     if ( !seq.IsProtein() ) {
253         NCBI_THROW_FMT(CObjmgrUtilException, eBadSequenceType,
254                        "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
255                        "the sequence is not a protein");
256     }
257     TSeqPos seq_length = seq.GetBioseqLength();
258     TSeqPos best_length = 0;
259     vector<CMappedFeat> best_feats;
260     for ( CFeat_CI it(seq, CSeqFeatData::e_Prot); it; ++it ) {
261         COpenRange<TSeqPos> range = it->GetRange();
262         if ( range.GetToOpen() > seq_length ) {
263             range.SetToOpen(seq_length);
264         }
265         TSeqPos length = range.GetLength();
266         if ( length > best_length ) {
267             best_length = length;
268             best_feats.clear();
269         }
270         if ( length == best_length ) {
271             best_feats.push_back(*it);
272         }
273     }
274     if ( best_feats.empty() ) {
275         NCBI_THROW_FMT(CObjMgrException, eFindFailed,
276                        "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
277                        "the sequence does't have prot feature");
278     }
279     if ( best_feats.size() > 1 ) {
280         NCBI_THROW_FMT(CObjMgrException, eFindConflict,
281                        "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
282                        "the sequence have ambiguous prot feature");
283     }
284     string ret;
285     best_feats[0].GetData().GetProt().GetLabel(&ret);
286     if ( ret.empty() ) {
287         NCBI_THROW_FMT(CObjmgrUtilException, eBadFeature,
288                        "GetProteinName("<<GetId(seq, eGetId_Best)<<"): "
289                        "the prot feature doesn't return name");
290     }
291     return ret;
292 }
293 
294 
GetErrCodeString(void) const295 const char* CSeqIdFromHandleException::GetErrCodeString(void) const
296 {
297     switch (GetErrCode()) {
298     case eNoSynonyms:           return "eNoSynonyms";
299     case eRequestedIdNotFound:  return "eRequestedIdNotFound";
300     default:                    return CException::GetErrCodeString();
301     }
302 }
303 
304 
Score_SeqIdHandle(const CSeq_id_Handle & idh)305 int Score_SeqIdHandle(const CSeq_id_Handle& idh)
306 {
307     CConstRef<CSeq_id> id = idh.GetSeqId();
308     CRef<CSeq_id> id_non_const
309         (const_cast<CSeq_id*>(id.GetPointer()));
310     return CSeq_id::Score(id_non_const);
311 }
312 
313 
BestRank_SeqIdHandle(const CSeq_id_Handle & idh)314 int BestRank_SeqIdHandle(const CSeq_id_Handle& idh)
315 {
316     CConstRef<CSeq_id> id = idh.GetSeqId();
317     CRef<CSeq_id> id_non_const
318         (const_cast<CSeq_id*>(id.GetPointer()));
319     return CSeq_id::BestRank(id_non_const);
320 }
321 
322 
WorstRank_SeqIdHandle(const CSeq_id_Handle & idh)323 int WorstRank_SeqIdHandle(const CSeq_id_Handle& idh)
324 {
325     CConstRef<CSeq_id> id = idh.GetSeqId();
326     CRef<CSeq_id> id_non_const
327         (const_cast<CSeq_id*>(id.GetPointer()));
328     return CSeq_id::WorstRank(id_non_const);
329 }
330 
331 
FastaAARank_SeqIdHandle(const CSeq_id_Handle & idh)332 int FastaAARank_SeqIdHandle(const CSeq_id_Handle& idh)
333 {
334     CConstRef<CSeq_id> id = idh.GetSeqId();
335     CRef<CSeq_id> id_non_const
336         (const_cast<CSeq_id*>(id.GetPointer()));
337     return CSeq_id::FastaAARank(id_non_const);
338 }
339 
340 
FastaNARank_SeqIdHandle(const CSeq_id_Handle & idh)341 int FastaNARank_SeqIdHandle(const CSeq_id_Handle& idh)
342 {
343     CConstRef<CSeq_id> id = idh.GetSeqId();
344     CRef<CSeq_id> id_non_const
345         (const_cast<CSeq_id*>(id.GetPointer()));
346     return CSeq_id::FastaNARank(id_non_const);
347 }
348 
349 
350 
x_GetId(const CScope::TIds & ids,EGetIdType type)351 CSeq_id_Handle x_GetId(const CScope::TIds& ids, EGetIdType type)
352 {
353     if ( ids.empty() ) {
354         return CSeq_id_Handle();
355     }
356 
357     switch ( (type & eGetId_TypeMask) ) {
358     case eGetId_ForceGi:
359         if ( !CSeq_id::AvoidGi() ) {
360             ITERATE (CScope::TIds, iter, ids) {
361                 if (iter->IsGi()) {
362                     return *iter;
363                 }
364             }
365         }
366         if ((type & eGetId_ThrowOnError) != 0) {
367             NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
368                     "sequence::GetId(): gi seq-id not found in the list");
369         }
370         break;
371 
372     case eGetId_ForceAcc:
373         {{
374             CSeq_id_Handle best = x_GetId(ids, eGetId_Best);
375             if (best  &&
376                 best.GetSeqId()->GetTextseq_Id() != NULL  &&
377                 best.GetSeqId()->GetTextseq_Id()->IsSetAccession()) {
378                 return best;
379             }
380         }}
381         if ((type & eGetId_ThrowOnError) != 0) {
382             NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
383                     "sequence::GetId(): text seq-id not found in the list");
384         }
385         break;
386 
387     case eGetId_Best:
388         {{
389             return FindBestChoice(ids, Score_SeqIdHandle);
390         }}
391 
392     case eGetId_Seq_id_Score:
393         {{
394             return FindBestChoice(ids, Score_SeqIdHandle);
395         }}
396 
397     case eGetId_Seq_id_BestRank:
398         {{
399             return FindBestChoice(ids, BestRank_SeqIdHandle);
400         }}
401 
402     case eGetId_Seq_id_WorstRank:
403         {{
404             return FindBestChoice(ids, WorstRank_SeqIdHandle);
405         }}
406 
407     case eGetId_Seq_id_FastaAARank:
408         {{
409             return FindBestChoice(ids, FastaAARank_SeqIdHandle);
410         }}
411 
412     case eGetId_Seq_id_FastaNARank:
413         {{
414             return FindBestChoice(ids, FastaNARank_SeqIdHandle);
415         }}
416 
417     default:
418         break;
419     }
420     return CSeq_id_Handle();
421 }
422 
423 
GetId(const CBioseq & seq,EGetIdType type)424 CSeq_id_Handle GetId(const CBioseq& seq, EGetIdType type)
425 {
426     return GetId(seq.GetId(), type);
427 }
428 
429 
GetId(const CBioseq::TId & ids_in,EGetIdType type)430 CSeq_id_Handle GetId(const CBioseq::TId& ids_in, EGetIdType type)
431 {
432     CScope::TIds ids;
433     ITERATE (CBioseq::TId, it, ids_in) {
434         ids.push_back(CSeq_id_Handle::GetHandle(**it));
435     }
436 
437     return x_GetId(ids, type);
438 }
439 
440 
GetId(const CSeq_id & id,CScope & scope,EGetIdType type)441 CSeq_id_Handle GetId(const CSeq_id& id, CScope& scope, EGetIdType type)
442 {
443     return GetId(CSeq_id_Handle::GetHandle(id), scope, type);
444 }
445 
446 
GetId(const CSeq_id_Handle & idh,CScope & scope,EGetIdType type)447 CSeq_id_Handle GetId(const CSeq_id_Handle& idh, CScope& scope,
448                      EGetIdType type)
449 {
450     CSeq_id_Handle ret;
451     if (!idh) return ret;
452     try {
453         if ( (type & eGetId_TypeMask) == eGetId_ForceGi ) {
454             if ( idh.IsGi()  &&  (type & eGetId_VerifyId) == 0 ) {
455                 return idh;
456             }
457             TGi gi = scope.GetGi(idh);
458             if (gi != ZERO_GI) {
459                 ret = CSeq_id_Handle::GetGiHandle(gi);
460             }
461         }
462         else if ( (type & eGetId_TypeMask) == eGetId_Canonical) {
463             /// Short-cuts for commonly used IDs that are
464             /// known unambiguously to be canonical:
465             /// - ID/GenBank: GI
466             /// - Trace: gnl|ti|<tid> in the C++ Toolkit;
467             ///          note that in the C Toolkit, the
468             ///          canonical ID appears to be gnl|TRACE|<tid>.
469             /// - Short Read Archive: gnl|SRA|...
470             if (!CSeq_id::PreferAccessionOverGi()  &&
471                 idh.IsGi()) return idh;
472             if (idh.Which() == CSeq_id::e_General) {
473                 CConstRef<CSeq_id> id = idh.GetSeqId();
474                 _ASSERT(id  &&  id->IsGeneral());
475                 const CSeq_id::TGeneral::TDb& db = id->GetGeneral().GetDb();
476                 if (db == "ti"  ||  db == "SRA") return idh;
477             }
478 
479             /// Fallback to retrieve IDs.
480             ret = x_GetId(scope.GetIds(idh), type);
481             if ( !ret ) {
482                 /// failed to retrieve IDs
483                 /// assume input is the best that we can do
484                 ret = idh;
485             }
486         }
487         else if ( (type & eGetId_TypeMask) == eGetId_ForceAcc ) {
488             ret = scope.GetAccVer(idh);
489         }
490         else {
491             ret = x_GetId(scope.GetIds(idh), type);
492         }
493     }
494     catch (exception& e) {
495         ERR_POST("sequence::GetId(): exception: "<<e.what());
496         if ( (type & eGetId_ThrowOnError) != 0 ) {
497             throw;
498         }
499         ret.Reset();
500         return ret;
501     }
502     if ( !ret  &&  (type & eGetId_ThrowOnError) != 0 ) {
503         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
504                 "sequence::GetId(): seq-id not found in the scope");
505     }
506     return ret;
507 }
508 
509 
GetId(const CBioseq_Handle & handle,EGetIdType type)510 CSeq_id_Handle GetId(const CBioseq_Handle& handle,
511                      EGetIdType type)
512 {
513     _ASSERT(handle);
514 
515     const CScope::TIds& ids = handle.GetId();
516     CSeq_id_Handle idh = x_GetId(ids, type);
517 
518     if ( !idh  &&  (type & eGetId_ThrowOnError) != 0 ) {
519         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
520                    "Unable to get Seq-id from handle");
521     }
522 
523     return idh;
524 }
525 
526 
GetGiForAccession(const string & acc,CScope & scope,EGetIdType flags)527 TGi GetGiForAccession(const string& acc, CScope& scope, EGetIdType flags)
528 {
529     if ( CSeq_id::AvoidGi() ) return ZERO_GI;
530 
531     // Clear throw-on-error flag
532     EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
533     try {
534         CSeq_id acc_id(acc);
535         // Get gi only if acc a real accession.
536         if ( acc_id.GetTextseq_Id() ) {
537             CSeq_id_Handle idh = GetId(acc_id, scope, get_id_flags);
538             if ( idh.IsGi() ) {
539                 return idh.GetGi();
540             }
541         }
542     }
543     catch (exception& e) {
544         if ( (flags & eGetId_ThrowOnError) != 0 ) {
545             throw e;
546         }
547         return ZERO_GI;
548     }
549     if ( (flags & eGetId_ThrowOnError) != 0 ) {
550         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
551             "sequence::GetGiForAccession(): invalid seq-id type");
552     }
553     return ZERO_GI;
554 }
555 
556 
GetGiForId(const objects::CSeq_id & id,CScope & scope,EGetIdType flags)557 TGi GetGiForId(const objects::CSeq_id& id, CScope& scope, EGetIdType flags)
558 {
559     if ( CSeq_id::AvoidGi() ) return ZERO_GI;
560 
561     // Clear throw-on-error flag
562     EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceGi;
563     CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
564     if ( idh.IsGi() ) {
565         return idh.GetGi();
566     }
567     if ( (flags & eGetId_ThrowOnError) != 0 ) {
568         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
569             "sequence::GetGiForId(): seq-id not found in the scope");
570     }
571     return ZERO_GI;
572 }
573 
574 
GetAccessionForGi(TGi gi,CScope & scope,EAccessionVersion use_version,EGetIdType flags)575 string GetAccessionForGi(TGi           gi,
576                          CScope&       scope,
577                          EAccessionVersion use_version,
578                          EGetIdType flags)
579 {
580     // Clear throw-on-error flag
581     EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
582     bool with_version = (use_version == eWithAccessionVersion);
583 
584     CSeq_id gi_id(CSeq_id::e_Gi, gi);
585     CSeq_id_Handle idh = GetId(gi_id, scope, get_id_flags);
586     if ( idh ) {
587         return idh.GetSeqId()->GetSeqIdString(with_version);
588     }
589     if ( (flags & eGetId_ThrowOnError) != 0 ) {
590         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
591             "sequence::GetAccessionForGi(): seq-id not found in the scope");
592     }
593     return kEmptyStr;
594 }
595 
596 
GetAccessionForId(const objects::CSeq_id & id,CScope & scope,EAccessionVersion use_version,EGetIdType flags)597 string GetAccessionForId(const objects::CSeq_id& id,
598                          CScope&       scope,
599                          EAccessionVersion use_version,
600                          EGetIdType flags)
601 {
602     // Clear throw-on-error flag
603     EGetIdType get_id_flags = (flags & eGetId_VerifyId) | eGetId_ForceAcc;
604     bool with_version = (use_version == eWithAccessionVersion);
605 
606     CSeq_id_Handle idh = GetId(id, scope, get_id_flags);
607     if ( idh ) {
608         return idh.GetSeqId()->GetSeqIdString(with_version);
609     }
610     if ( (flags & eGetId_ThrowOnError) != 0 ) {
611         NCBI_THROW(CSeqIdFromHandleException, eRequestedIdNotFound,
612             "sequence::GetAccessionForId(): seq-id not found in the scope");
613     }
614     return kEmptyStr;
615 }
616 
617 
x_FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope,const CTime * tlim)618 CSeq_id_Handle x_FindLatestSequence(const CSeq_id_Handle& idh,
619                                    CScope&               scope,
620                                    const CTime*          tlim)
621 {
622     CBioseq_Handle h = scope.GetBioseqHandle(idh);
623     set<CSeq_id_Handle> visited;
624     CSeq_id_Handle next = idh;
625     while (h  &&  h.IsSetInst() && h.GetInst().IsSetHist()
626         && h.GetInst().GetHist().IsSetReplaced_by()) {
627         const CSeq_hist_rec& rec = h.GetInst().GetHist().GetReplaced_by();
628 
629         // Check if the next bioseq is newer than the limit.
630         if (tlim  &&  rec.IsSetDate()  &&
631             rec.GetDate().AsCTime().DiffTimeSpan(*tlim).GetSign() == ePositive) {
632             break;
633         }
634         // Make sure the list of ids is not empty
635         if ( rec.GetIds().empty() ) {
636             return CSeq_id_Handle();
637         }
638         visited.insert(next);
639         // If there are several replaced-by entries, use the first one
640         next = CSeq_id_Handle::GetHandle(
641             *h.GetInst().GetHist().GetReplaced_by().GetIds().front());
642         if (visited.find(next) != visited.end()) {
643             // Infinite recursion detected
644             return CSeq_id_Handle();
645         }
646         h = scope.GetBioseqHandle(next);
647     }
648     return h ? next : CSeq_id_Handle();
649 }
650 
651 
FindLatestSequence(const CSeq_id & id,CScope & scope)652 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id, CScope& scope)
653 {
654     return x_FindLatestSequence(CSeq_id_Handle::GetHandle(id),
655         scope, NULL).GetSeqId();
656 }
657 
FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope)658 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh, CScope& scope)
659 {
660     return x_FindLatestSequence(idh, scope, NULL);
661 }
662 
FindLatestSequence(const CSeq_id & id,CScope & scope,const CTime & tlim)663 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id,
664                                       CScope&        scope,
665                                       const CTime&   tlim)
666 {
667     return x_FindLatestSequence(CSeq_id_Handle::GetHandle(id),
668         scope, &tlim).GetSeqId();
669 }
670 
FindLatestSequence(const CSeq_id_Handle & idh,CScope & scope,const CTime & tlim)671 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh,
672                                   CScope&               scope,
673                                   const CTime&          tlim)
674 {
675     return x_FindLatestSequence(idh, scope, &tlim);
676 }
677 
678 
SourceToProduct(const CSeq_feat & feat,const CSeq_loc & source_loc,TS2PFlags flags,CScope * scope,int * frame)679 CRef<CSeq_loc> SourceToProduct(const CSeq_feat& feat,
680                                const CSeq_loc& source_loc, TS2PFlags flags,
681                                CScope* scope, int* frame)
682 {
683     SRelLoc::TFlags rl_flags = 0;
684     if (flags & fS2P_NoMerge) {
685         rl_flags |= SRelLoc::fNoMerge;
686     }
687     SRelLoc rl(feat.GetLocation(), source_loc, scope, rl_flags);
688     _ASSERT(!rl.m_Ranges.empty());
689     rl.m_ParentLoc.Reset(&feat.GetProduct());
690     if (feat.GetData().IsCdregion()) {
691         // 3:1 ratio
692         const CCdregion& cds         = feat.GetData().GetCdregion();
693         int              base_frame  = cds.GetFrame();
694         if (base_frame > 0) {
695             --base_frame;
696         }
697         if (frame) {
698             *frame = (3 + rl.m_Ranges.front()->GetFrom() - base_frame) % 3 + 1;
699         }
700         TSeqPos prot_length;
701         try {
702             prot_length = GetLength(feat.GetProduct(), scope);
703         } catch (CObjmgrUtilException) {
704             prot_length = numeric_limits<TSeqPos>::max();
705         }
706         NON_CONST_ITERATE (SRelLoc::TRanges, it, rl.m_Ranges) {
707             if (IsReverse((*it)->GetStrand())) {
708                 ERR_POST_X(6, Warning
709                            << "SourceToProduct:"
710                            " parent and child have opposite orientations");
711             }
712             TSeqPos fr = (*it)->GetFrom();
713             TSeqPos to = (*it)->GetTo();
714             (*it)->SetFrom(((*it)->GetFrom() - base_frame) / 3);
715             (*it)->SetTo  (((*it)->GetTo()   - base_frame) / 3);
716             if ((flags & fS2P_AllowTer)  &&  to == prot_length * 3  && fr < to ) {
717                 --(*it)->SetTo();
718             }
719         }
720     } else {
721         if (frame) {
722             *frame = 0; // not applicable; explicitly zero
723         }
724     }
725 
726     return rl.Resolve(scope, rl_flags);
727 }
728 
729 
ProductToSource(const CSeq_feat & feat,const CSeq_loc & prod_loc,TP2SFlags flags,CScope * scope)730 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc,
731                                TP2SFlags flags, CScope* scope)
732 {
733     SRelLoc rl(feat.GetProduct(), prod_loc, scope);
734     _ASSERT(!rl.m_Ranges.empty());
735     rl.m_ParentLoc.Reset(&feat.GetLocation());
736     if (feat.GetData().IsCdregion()) {
737         // 3:1 ratio
738         const CCdregion& cds        = feat.GetData().GetCdregion();
739         int              base_frame = cds.GetFrame();
740         if (base_frame > 0) {
741             --base_frame;
742         }
743         TSeqPos nuc_length, prot_length;
744         try {
745             nuc_length = GetLength(feat.GetLocation(), scope);
746         } catch (CObjmgrUtilException) {
747             nuc_length = numeric_limits<TSeqPos>::max();
748         }
749         try {
750             prot_length = GetLength(feat.GetProduct(), scope);
751         } catch (CObjmgrUtilException) {
752             prot_length = numeric_limits<TSeqPos>::max();
753         }
754         NON_CONST_ITERATE(SRelLoc::TRanges, it, rl.m_Ranges) {
755             _ASSERT( !IsReverse((*it)->GetStrand()) );
756             TSeqPos from, to;
757             if ((flags & fP2S_Extend)  &&  (*it)->GetFrom() == 0) {
758                 from = 0;
759             } else {
760                 from = (*it)->GetFrom() * 3 + base_frame;
761             }
762             if ((flags & fP2S_Extend)  &&  (*it)->GetTo() == prot_length - 1) {
763                 to = nuc_length - 1;
764             } else {
765                 to = (*it)->GetTo() * 3 + base_frame + 2;
766             }
767             (*it)->SetFrom(from);
768             (*it)->SetTo  (to);
769         }
770     }
771 
772     return rl.Resolve(scope);
773 }
774 
775 
776 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore;
777 typedef vector<TFeatScore> TFeatScores;
778 
779 template <class T, class U>
780 struct SPairLessByFirst
781 {
operator ()SPairLessByFirst782     bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
783     {
784         return p1.first < p2.first;
785     }
786 };
787 
788 template <class T, class U>
789 struct SPairLessBySecond
790 {
operator ()SPairLessBySecond791     bool operator()(const pair<T,U>& p1, const pair<T,U>& p2) const
792     {
793         return p1.second < p2.second;
794     }
795 };
796 
797 class COverlapPairLess
798 {
799 public:
COverlapPairLess(CScope * scope_arg)800     COverlapPairLess( CScope *scope_arg ) : scope(scope_arg) { }
801 
operator ()(const pair<Int8,CConstRef<CSeq_feat>> & gene1,const pair<Int8,CConstRef<CSeq_feat>> & gene2)802     bool operator()( const pair<Int8,CConstRef<CSeq_feat> >& gene1,
803         const pair<Int8, CConstRef<CSeq_feat> >& gene2 )
804     {
805         // First, compare by overlap amount
806         if( gene1.first != gene2.first ) {
807             return gene1.first < gene2.first;
808         }
809 
810         const CSeq_loc &loc1 = gene1.second->GetLocation();
811         const CSeq_loc &loc2 = gene2.second->GetLocation();
812 
813          // If genes are at identical positions, we fall back on the label
814          if( sequence::Compare(loc1, loc2, scope, sequence::fCompareOverlapping ) ==
815              sequence::eSame) {
816             if( gene1.second->IsSetData() && gene1.second->GetData().IsGene() &&
817                 gene2.second->IsSetData() && gene2.second->GetData().IsGene() )
818             {
819                 string gene1_label;
820                 string gene2_label;
821 
822                 gene1.second->GetData().GetGene().GetLabel( &gene1_label );
823                 gene2.second->GetData().GetGene().GetLabel( &gene2_label );
824                 return gene1_label < gene2_label;
825             }
826          }
827 
828         return false;
829     }
830 private:
831     CScope *scope;
832 };
833 
GetOverlappingFeatures(const CSeq_loc & loc,CSeqFeatData::E_Choice feat_type,CSeqFeatData::ESubtype feat_subtype,EOverlapType overlap_type,TFeatScores & feats,CScope & scope,const TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)834 void GetOverlappingFeatures(const CSeq_loc& loc,
835                             CSeqFeatData::E_Choice feat_type,
836                             CSeqFeatData::ESubtype feat_subtype,
837                             EOverlapType overlap_type,
838                             TFeatScores& feats,
839                             CScope& scope,
840                             const TBestFeatOpts opts,
841                             CGetOverlappingFeaturesPlugin *plugin )
842 {
843     bool revert_locations = false;
844     SAnnotSelector::EOverlapType annot_overlap_type;
845     switch (overlap_type) {
846     case eOverlap_Simple:
847     case eOverlap_Contained:
848     case eOverlap_Contains:
849         // Require total range overlap
850         annot_overlap_type = SAnnotSelector::eOverlap_TotalRange;
851         break;
852     case eOverlap_Subset:
853     case eOverlap_SubsetRev:
854     case eOverlap_CheckIntervals:
855     case eOverlap_Interval:
856     case eOverlap_CheckIntRev:
857         revert_locations = true;
858         // there's no break here - proceed to "default"
859     default:
860         // Require intervals overlap
861         annot_overlap_type = SAnnotSelector::eOverlap_Intervals;
862         break;
863     }
864 
865     CConstRef<CSeq_feat> feat_ref;
866     TOverlapFlags overlap_flags = fOverlap_Default;
867 
868     CBioseq_Handle bioseq_handle;
869     CRange<TSeqPos> range;
870     ENa_strand strand = eNa_strand_unknown;
871     if ( loc.IsWhole() ) {
872         bioseq_handle = scope.GetBioseqHandle(loc.GetWhole());
873         range = range.GetWhole();
874     }
875     else if ( loc.IsInt() || loc.IsPnt() || loc.IsPacked_int() || loc.IsMix() || loc.IsPacked_pnt() ) {
876         const CSeq_id* id = loc.GetId();
877         if( NULL != id ) {
878             bioseq_handle = scope.GetBioseqHandle(*id);
879             range.SetFrom(loc.GetStart(eExtreme_Positional));
880             range.SetTo(loc.GetStop(eExtreme_Positional));
881             if ( loc.IsSetStrand() ) {
882                 strand = loc.GetStrand();
883             }
884         }
885     }
886     else {
887         range = range.GetEmpty();
888     }
889 
890     // Check if the sequence is circular
891     TSeqPos circular_length = kInvalidSeqPos;
892     CConstRef<CSeq_id> circular_id;
893     if ( bioseq_handle ) {
894         if ( bioseq_handle.IsSetInst_Topology() &&
895              bioseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
896             circular_length = bioseq_handle.GetBioseqLength();
897             circular_id = bioseq_handle.GetSeqId();
898         }
899     }
900     else {
901         try {
902             const CSeq_id* loc_id = nullptr;
903             try {
904                 loc.CheckId(loc_id);
905             }
906             catch (exception&) {
907                 loc_id = 0;
908             }
909             if ( loc_id ) {
910                 circular_id.Reset(loc_id);
911                 CBioseq_Handle bseq_handle = scope.GetBioseqHandle(*circular_id);
912                 if ( bseq_handle && bseq_handle.IsSetInst_Topology() &&
913                      bseq_handle.GetInst_Topology() == CSeq_inst::eTopology_circular ) {
914                     circular_length = bseq_handle.GetBioseqLength();
915                 }
916             }
917         }
918         catch (exception& _DEBUG_ARG(e)) {
919             _TRACE("test for circularity failed: " << e.what()) ;
920         }
921     }
922 
923     CRef<CSeq_loc> circular_loc;
924     if (circular_id  &&  range.GetFrom() > range.GetTo()) {
925         // Circular bioseq, the location crosses zero. Can't use a single
926         // total range.
927         circular_loc.Reset(new CSeq_loc);
928         CRef<CSeq_interval> sub_loc(new CSeq_interval);
929         sub_loc->SetId().Assign(*circular_id);
930         sub_loc->SetFrom(0);
931         sub_loc->SetTo(range.GetTo());
932         if ( loc.IsSetStrand() ) {
933             sub_loc->SetStrand(loc.GetStrand());
934         }
935         // First interval - no matter front or back
936         circular_loc->SetPacked_int().Set().push_back(sub_loc);
937         sub_loc.Reset(new CSeq_interval);
938         sub_loc->SetId().Assign(*circular_id);
939         sub_loc->SetFrom(range.GetFrom());
940         sub_loc->SetTo(circular_length == kInvalidSeqPos
941             ? kInvalidSeqPos : circular_length - 1);
942         if ( loc.IsSetStrand() ) {
943             sub_loc->SetStrand(loc.GetStrand());
944         }
945         if ( IsReverse(strand) ) {
946             circular_loc->SetPacked_int().Set().push_front(sub_loc);
947         }
948         else {
949             circular_loc->SetPacked_int().Set().push_back(sub_loc);
950         }
951     }
952     try {
953         SAnnotSelector sel;
954         sel.SetFeatType(feat_type)
955             .SetFeatSubtype(feat_subtype)
956             .SetOverlapType(annot_overlap_type)
957             .SetResolveTSE();
958         if( opts & fBestFeat_IgnoreStrand ) {
959             sel.SetIgnoreStrand();
960             if( ! circular_id  &&  range.GetFrom() > range.GetTo() ) {
961                 // switch from and to
962                 range = CRange<TSeqPos>( range.GetTo(), range.GetFrom() );
963             }
964         }
965         if( plugin ) {
966             plugin->processSAnnotSelector( sel );
967         }
968 
969         auto_ptr<CFeat_CI> feat_it_ptr;
970         if( plugin ) {
971             plugin->setUpFeatureIterator( bioseq_handle, feat_it_ptr,
972                 circular_length, range, loc, sel, scope, strand);
973         } else {
974             if ( circular_loc ) {
975                 if ( !bioseq_handle ) {
976                     sel.SetSearchUnresolved();
977                 }
978                 feat_it_ptr.reset( new CFeat_CI(scope, *circular_loc, sel) );
979             }
980             else if ( bioseq_handle ) {
981                 feat_it_ptr.reset( new CFeat_CI(bioseq_handle, range, strand, sel) );
982             }
983             else {
984                 sel.SetSearchUnresolved();
985                 feat_it_ptr.reset( new CFeat_CI(scope, loc, sel) );
986             }
987         }
988         // convenience variable so we don't have to keep dereferencing the auto_ptr
989         CFeat_CI &feat_it = *feat_it_ptr;
990 
991         CRef<CSeq_loc> cleaned_loc( new CSeq_loc );
992         cleaned_loc->Assign( loc );
993         if( opts & fBestFeat_IgnoreStrand ) {
994             cleaned_loc->SetStrand(eNa_strand_plus);
995             overlap_flags |= fOverlap_IgnoreTopology;
996         }
997         if( plugin ) {
998             plugin->processLoc( bioseq_handle, cleaned_loc, circular_length );
999         }
1000 
1001         for ( ;  feat_it;  ++feat_it) {
1002             CRef<CSeq_loc> cleaned_loc_this_iteration = cleaned_loc;
1003             CRef<CSeq_loc> candidate_feat_loc( new CSeq_loc );
1004             candidate_feat_loc->Assign( feat_it->GetOriginalFeature().GetLocation() );
1005             if( opts & fBestFeat_IgnoreStrand ) {
1006                 candidate_feat_loc->SetStrand(eNa_strand_plus);
1007             }
1008             EOverlapType overlap_type_this_iteration = overlap_type;
1009             bool revert_locations_this_iteration = revert_locations;
1010 
1011             if( plugin ) {
1012                 bool shouldContinueToNextIteration = false;
1013                 plugin->processMainLoop(
1014                     shouldContinueToNextIteration,
1015                     cleaned_loc_this_iteration,
1016                     candidate_feat_loc,
1017                     overlap_type_this_iteration,
1018                     revert_locations_this_iteration,
1019                     bioseq_handle,
1020                     *feat_it,
1021                     circular_length,
1022                     annot_overlap_type);
1023                 if( shouldContinueToNextIteration ) {
1024                     continue;
1025                 }
1026             }
1027 
1028             try {
1029                 // treat subset as a special case
1030                 Int8 cur_diff = -1;
1031                 if ( !revert_locations_this_iteration ) {
1032                     if (overlap_flags == fOverlap_Default) {
1033                         cur_diff = TestForOverlap64(*candidate_feat_loc,
1034                             *cleaned_loc_this_iteration,
1035                             overlap_type_this_iteration,
1036                             circular_length,
1037                             &scope);
1038                     }
1039                     else {
1040                         cur_diff = TestForOverlapEx(*candidate_feat_loc,
1041                             *cleaned_loc_this_iteration,
1042                             overlap_type_this_iteration,
1043                             &scope,
1044                             overlap_flags);
1045                     }
1046                 }
1047                 else {
1048                     if (overlap_flags == fOverlap_Default) {
1049                         cur_diff = TestForOverlap64(*cleaned_loc_this_iteration,
1050                             *candidate_feat_loc,
1051                             overlap_type_this_iteration,
1052                             circular_length,
1053                             &scope);
1054                     }
1055                     else {
1056                         cur_diff = TestForOverlapEx(*cleaned_loc_this_iteration,
1057                             *candidate_feat_loc,
1058                             overlap_type_this_iteration,
1059                             &scope,
1060                             overlap_flags);
1061                     }
1062                 }
1063 
1064                 if( plugin ) {
1065                     plugin->postProcessDiffAmount( cur_diff, cleaned_loc_this_iteration,
1066                         candidate_feat_loc, scope, sel, circular_length );
1067                 }
1068                 if (cur_diff < 0) {
1069                     continue;
1070                 }
1071 
1072                 // quick fix for CFeat_CI returning wrong additional features
1073                 if (overlap_type == eOverlap_Contained) {
1074                     ECompare cmp = Compare(feat_it->GetLocation(), loc, &scope, fCompareOverlapping);
1075                     if (cmp != eContains && cmp != eSame) {
1076                         continue;
1077                     }
1078                 }
1079                 TFeatScore sc(cur_diff, ConstRef(&feat_it->GetMappedFeature()));
1080                 feats.push_back(sc);
1081             }
1082             catch (CObjmgrUtilException&) {
1083                 // On TestForOverlap64 error proceed to the next feature.
1084                 continue;
1085             }
1086         }
1087     }
1088     catch (exception&) {
1089         _TRACE("GetOverlappingFeatures(): error: feature iterator failed");
1090     }
1091 
1092     std::stable_sort(feats.begin(), feats.end(),
1093         COverlapPairLess( &scope ) );
1094 }
1095 
1096 
GetBestOverlappingFeat(const CSeq_loc & loc,CSeqFeatData::E_Choice feat_type,EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1097 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
1098                                             CSeqFeatData::E_Choice feat_type,
1099                                             EOverlapType overlap_type,
1100                                             CScope& scope,
1101                                             TBestFeatOpts opts,
1102                                             CGetOverlappingFeaturesPlugin *plugin )
1103 {
1104     TFeatScores scores;
1105     GetOverlappingFeatures(loc,
1106                            feat_type, CSeqFeatData::eSubtype_any,
1107                            overlap_type, scores, scope, opts, plugin );
1108     if (scores.size()) {
1109         if (opts & fBestFeat_FavorLonger) {
1110             return scores.back().second;
1111         } else {
1112             return scores.front().second;
1113         }
1114     }
1115     return CConstRef<CSeq_feat>();
1116 }
1117 
1118 
GetBestOverlappingFeat(const CSeq_loc & loc,CSeqFeatData::ESubtype feat_type,EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1119 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
1120                                             CSeqFeatData::ESubtype feat_type,
1121                                             EOverlapType overlap_type,
1122                                             CScope& scope,
1123                                             TBestFeatOpts opts,
1124                                             CGetOverlappingFeaturesPlugin *plugin )
1125 {
1126     TFeatScores scores;
1127     GetOverlappingFeatures(loc,
1128         CSeqFeatData::GetTypeFromSubtype(feat_type), feat_type,
1129         overlap_type, scores, scope, opts, plugin );
1130 
1131     if (scores.size()) {
1132         if (opts & fBestFeat_FavorLonger) {
1133             return scores.back().second;
1134         } else {
1135             return scores.front().second;
1136         }
1137     }
1138     return CConstRef<CSeq_feat>();
1139 }
1140 
1141 
1142 /// GetmRNAforCDS
1143 /// A function to find a CSeq_feat representing the
1144 /// appropriate mRNA for a given CDS.
1145 /// @param cds        The feature for which the mRNA to be found
1146 /// @param scope      The scope
1147 ///
1148 /// @return           CConstRef<CSeq_feat> for new mRNA (will be NULL if none is found)
1149 
GetmRNAforCDS(const CSeq_feat & cds,CScope & scope)1150 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope)
1151 {
1152     CConstRef<CSeq_feat> mrna;
1153 
1154     bool has_xref = false;
1155     if (cds.IsSetXref()) {
1156         /* using FeatID from feature cross-references:
1157         * if CDS refers to an mRNA by feature ID, use that feature
1158         */
1159         CBioseq_Handle bsh;
1160         try {
1161             bsh = scope.GetBioseqHandle(cds.GetLocation());
1162         } catch (CException& ) {
1163             // multi-accession location, can't do this check
1164             return CConstRef<CSeq_feat>(NULL);
1165         }
1166         if (!bsh)
1167         {
1168             return CConstRef<CSeq_feat>(NULL);
1169         }
1170 
1171         CTSE_Handle tse = bsh.GetTSE_Handle();
1172         ITERATE(CSeq_feat::TXref, it, cds.GetXref()) {
1173             if ((*it)->IsSetId() && (*it)->GetId().IsLocal()) {
1174                 CSeq_feat_Handle mrna_h = tse.GetFeatureWithId(CSeqFeatData::eSubtype_mRNA, (*it)->GetId().GetLocal());
1175                 if (mrna_h) {
1176                     mrna = mrna_h.GetSeq_feat();
1177                 }
1178                 has_xref = true;
1179             }
1180         }
1181     }
1182     if (!has_xref) {
1183         /* using original location to find mRNA:
1184         * mRNA must include the CDS location and the internal interval boundaries need to be identical
1185         */
1186         mrna = GetBestOverlappingFeat(cds.GetLocation(), CSeqFeatData::eSubtype_mRNA, sequence::eOverlap_CheckIntRev, scope);
1187     }
1188     return mrna;
1189 }
1190 
1191 
1192 static
x_GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::E_Choice type,CSeqFeatData::ESubtype subtype,CScope & scope,bool search_both_strands=true)1193 CConstRef<CSeq_feat> x_GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1194                                             CSeqFeatData::E_Choice type,
1195                                             CSeqFeatData::ESubtype subtype,
1196                                             CScope& scope,
1197                                             bool search_both_strands = true)
1198 {
1199     TFeatScores scores;
1200     CConstRef<CSeq_feat> overlap;
1201     GetOverlappingFeatures(snp_feat.GetLocation(),
1202                            type, subtype,
1203                            eOverlap_Contained, scores,
1204                            scope);
1205     if (scores.size()) {
1206         overlap = scores.front().second;
1207     }
1208 
1209     if (search_both_strands  &&  !overlap) {
1210         CRef<CSeq_loc> loc(new CSeq_loc);
1211         loc->Assign(snp_feat.GetLocation());
1212 
1213         ENa_strand strand = GetStrand(*loc, &scope);
1214         if (strand == eNa_strand_plus  ||  strand == eNa_strand_minus) {
1215             loc->FlipStrand();
1216         } else if (strand == eNa_strand_unknown) {
1217             loc->SetStrand(eNa_strand_minus);
1218         }
1219 
1220         scores.clear();
1221         GetOverlappingFeatures(*loc,
1222                                type, subtype,
1223                                eOverlap_Contained, scores,
1224                                scope);
1225         if (scores.size()) {
1226             overlap = scores.front().second;
1227         }
1228     }
1229 
1230     return overlap;
1231 }
1232 
1233 
GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::E_Choice type,CScope & scope,bool search_both_strands)1234 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1235                                           CSeqFeatData::E_Choice type,
1236                                           CScope& scope,
1237                                           bool search_both_strands)
1238 {
1239     return x_GetBestOverlapForSNP(snp_feat, type, CSeqFeatData::eSubtype_any,
1240                                   scope, search_both_strands);
1241 }
1242 
1243 
GetBestOverlapForSNP(const CSeq_feat & snp_feat,CSeqFeatData::ESubtype subtype,CScope & scope,bool search_both_strands)1244 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
1245                                           CSeqFeatData::ESubtype subtype,
1246                                           CScope& scope,
1247                                           bool search_both_strands)
1248 {
1249     return x_GetBestOverlapForSNP(snp_feat,
1250         CSeqFeatData::GetTypeFromSubtype(subtype), subtype, scope,
1251         search_both_strands);
1252 }
1253 
1254 
GetOverlappingGene(const CSeq_loc & loc,CScope & scope,ETransSplicing eTransSplicing)1255 CConstRef<CSeq_feat> GetOverlappingGene(
1256     const CSeq_loc& loc, CScope& scope,
1257     ETransSplicing eTransSplicing )
1258 {
1259     switch ( eTransSplicing ) {
1260     case eTransSplicing_Auto:
1261         {
1262             ENa_strand strand = loc.GetStrand();
1263             if (strand == eNa_strand_both  ||  strand == eNa_strand_other) {
1264                 // Mixed strand indicates trans-splicing must be on.
1265                 return GetOverlappingGene(loc, scope, eTransSplicing_Yes);
1266             }
1267             // Try with trans-splicing on first. If it finds nothing, try
1268             // to turn it off.
1269             CConstRef<CSeq_feat> ret = GetOverlappingGene(loc, scope, eTransSplicing_Yes);
1270             return ret ? ret : GetOverlappingGene(loc, scope, eTransSplicing_No);
1271         }
1272     case eTransSplicing_Yes:
1273         {
1274             // If trans-splicing is on, the result must be a multi-range gene.
1275             CConstRef<CSeq_feat> ret = GetBestOverlappingFeat(loc,
1276                 CSeqFeatData::eSubtype_gene,
1277                 eOverlap_Contained, scope, fBestFeat_IgnoreStrand);
1278             if ( ret ) {
1279                 CSeq_loc_CI it(ret->GetLocation());
1280                 ++it;
1281                 if ( !it ) ret.Reset();
1282             }
1283             return ret;
1284         }
1285     case eTransSplicing_No:
1286         {
1287             // Multi-range genes assume trans-splicing=on and should not be included
1288             // when it's off.
1289             CConstRef<CSeq_feat> ret = GetBestOverlappingFeat(loc,
1290                 CSeqFeatData::eSubtype_gene,
1291                 eOverlap_Contained, scope, 0);
1292             if ( ret ) {
1293                 CSeq_loc_CI it(ret->GetLocation());
1294                 ++it;
1295                 if ( it ) ret.Reset();
1296             }
1297             return ret;
1298         }
1299     }
1300     return null;
1301 }
1302 
1303 
IsTransSpliced(const CSeq_feat & feat)1304 bool IsTransSpliced(const CSeq_feat& feat)
1305 {
1306     // note - even if the exception says "trans-splicing", it isn't really trans-splicing if
1307     // it's a single interval
1308     if (feat.IsSetExcept_text() && NStr::Find(feat.GetExcept_text(), "trans-splicing") != string::npos
1309         && !feat.GetLocation().IsInt()) {
1310         return true;
1311     } else {
1312         return false;
1313     }
1314 }
1315 
1316 
IsPseudo(const CSeq_feat & feat,CScope & scope)1317 bool IsPseudo(const CSeq_feat& feat, CScope& scope)
1318 {
1319     if (feat.IsSetPseudo() && feat.GetPseudo()) {
1320         return true;
1321     }
1322     if (feat.IsSetQual()) {
1323         ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
1324             if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "pseudogene")) {
1325                 return true;
1326             }
1327         }
1328     }
1329     if (feat.GetData().IsGene()) {
1330         if (feat.GetData().GetGene().IsSetPseudo() && feat.GetData().GetGene().GetPseudo()) {
1331             return true;
1332         }
1333     } else {
1334         if (feat.IsSetXref()) {
1335             ITERATE(CSeq_feat::TXref, it, feat.GetXref()) {
1336                 if ((*it)->IsSetData() && (*it)->GetData().IsGene() &&
1337                     (*it)->GetData().GetGene().IsSetPseudo() &&
1338                     (*it)->GetData().GetGene().GetPseudo()) {
1339                     return true;
1340                 }
1341             }
1342         }
1343         CConstRef<CSeq_feat> gene = GetGeneForFeature(feat, scope);
1344         if (gene && IsPseudo(*gene, scope)) {
1345             return true;
1346         }
1347     }
1348     return false;
1349 }
1350 
GetLocalGeneByLocus(const string & locus,bool use_tag,CBioseq_Handle bsh)1351 CConstRef<CSeq_feat> GetLocalGeneByLocus(const string& locus, bool use_tag, CBioseq_Handle bsh)
1352 {
1353     CTSE_Handle tse = bsh.GetTSE_Handle();
1354     const CBioseq& b = *(bsh.GetCompleteBioseq());
1355 
1356     CTSE_Handle::TSeq_feat_Handles potentials = tse.GetGenesWithLocus(locus, use_tag);
1357     //if (potentials.size() == 1) { // it may return wrong gene!
1358     //    return potentials.front().GetSeq_feat();
1359     //}
1360     ITERATE(CTSE_Handle::TSeq_feat_Handles, p, potentials) {
1361         try {
1362             CSeq_id_Handle id_h = p->GetLocationId();
1363             if (id_h) {
1364                 CConstRef<CSeq_id> p_id = id_h.GetSeqId();
1365             if (p_id) {
1366                 ITERATE(CBioseq::TId, id, b.GetId()) {
1367                     CSeq_id::E_SIC cmp = p_id->Compare(**id);
1368                     if (cmp == CSeq_id::e_YES) {
1369                         return p->GetSeq_feat();
1370                     } else if (cmp == CSeq_id::e_NO) {
1371                         break;
1372                     }
1373                 }
1374             }
1375             }
1376         } catch (CException&) {
1377             CSeq_loc_CI li(p->GetLocation());
1378             while (li) {
1379                 try {
1380                     const CSeq_id& this_id = li.GetSeq_id();
1381                     ITERATE(CBioseq::TId, id, b.GetId()) {
1382                         CSeq_id::E_SIC cmp = this_id.Compare(**id);
1383                         if (cmp == CSeq_id::e_YES) {
1384                             return p->GetSeq_feat();
1385                         } else if (cmp == CSeq_id::e_NO) {
1386                             break;
1387                         }
1388                     }
1389                 } catch (CException& ) {
1390                     // no Seq-id for this sublocation, keep trying
1391                 }
1392                 ++li;
1393             }
1394         }
1395     }
1396     return CConstRef<CSeq_feat>(NULL);
1397 }
1398 
1399 
GetLocalGeneByXref(const CGene_ref & gene,CBioseq_Handle bsh)1400 CConstRef<CSeq_feat> GetLocalGeneByXref(const CGene_ref& gene, CBioseq_Handle bsh)
1401 {
1402     if (gene.IsSetLocus_tag() && !(gene.GetLocus_tag().empty())) {
1403         CConstRef<CSeq_feat> f = GetLocalGeneByLocus(gene.GetLocus_tag(), true, bsh);
1404         if (f) {
1405             return f;
1406         }
1407     }
1408     if (gene.IsSetLocus() && !(gene.GetLocus().empty())) {
1409         CConstRef<CSeq_feat> f = GetLocalGeneByLocus(gene.GetLocus(), false, bsh);
1410         if (f) {
1411             return f;
1412         }
1413     }
1414     return CConstRef<CSeq_feat>(NULL);
1415 }
1416 
1417 
GetGeneForFeature(const CSeq_feat & feat,CScope & scope)1418 CConstRef<CSeq_feat> GetGeneForFeature(const CSeq_feat& feat, CScope& scope)
1419 {
1420     if (feat.IsSetXref()) {
1421         CBioseq_Handle bsh = GetBioseqFromSeqLoc(feat.GetLocation(), scope);
1422         if (!bsh) {
1423             return CConstRef<CSeq_feat>();
1424         }
1425         CTSE_Handle tse = bsh.GetTSE_Handle();
1426         ITERATE(CSeq_feat::TXref, xit, feat.GetXref()) {
1427             if ((*xit)->IsSetData() && (*xit)->GetData().IsGene() && (*xit)->GetData().GetGene().IsSuppressed()) {
1428                 return (CConstRef <CSeq_feat>());
1429             }
1430             if ((*xit)->IsSetId() && (*xit)->GetId().IsLocal() &&
1431                 (!(*xit)->IsSetData() || (*xit)->GetData().IsGene())) {
1432                 const CTSE_Handle::TFeatureId& feat_id = (*xit)->GetId().GetLocal();
1433                 CTSE_Handle::TSeq_feat_Handles far_feats = tse.GetFeaturesWithId(CSeqFeatData::eSubtype_gene, feat_id);
1434                 if (far_feats.size() > 0) {
1435                     return far_feats.front().GetSeq_feat();
1436                 }
1437                 // if xref claims to point to gene feature but gene feature does not exist,
1438                 // return NULL
1439                 if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
1440                     return CConstRef<CSeq_feat>();
1441                 }
1442             } else if ((*xit)->IsSetData() && (*xit)->GetData().IsGene()) {
1443                 const CGene_ref& gene = (*xit)->GetData().GetGene();
1444                 return GetLocalGeneByXref(gene, bsh);
1445             }
1446         }
1447     }
1448 
1449     CConstRef <CSeq_feat> gf = GetOverlappingGene(feat.GetLocation(), scope, IsTransSpliced(feat) ? sequence::eTransSplicing_Yes : sequence::eTransSplicing_Auto);
1450     if (gf) {
1451         ECompare cmp = Compare(gf->GetLocation(), feat.GetLocation(), &scope, fCompareOverlapping);
1452         if (cmp == eContains || cmp == eSame) {
1453             return gf;
1454         }
1455     }
1456 
1457     return CConstRef <CSeq_feat>();
1458 }
1459 
1460 
GetOverlappingmRNA(const CSeq_loc & loc,CScope & scope)1461 CConstRef<CSeq_feat> GetOverlappingmRNA(const CSeq_loc& loc, CScope& scope)
1462 {
1463     return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_mRNA,
1464                                   eOverlap_Contained, scope);
1465 }
1466 
1467 
GetOverlappingCDS(const CSeq_loc & loc,CScope & scope)1468 CConstRef<CSeq_feat> GetOverlappingCDS(const CSeq_loc& loc, CScope& scope)
1469 {
1470     return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_cdregion,
1471                                   eOverlap_Contained, scope);
1472 }
1473 
1474 
GetOverlappingPub(const CSeq_loc & loc,CScope & scope)1475 CConstRef<CSeq_feat> GetOverlappingPub(const CSeq_loc& loc, CScope& scope)
1476 {
1477     return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_pub,
1478                                   eOverlap_Contained, scope);
1479 }
1480 
1481 
GetOverlappingSource(const CSeq_loc & loc,CScope & scope)1482 CConstRef<CSeq_feat> GetOverlappingSource(const CSeq_loc& loc, CScope& scope)
1483 {
1484     return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_biosrc,
1485                                   eOverlap_Contained, scope);
1486 }
1487 
1488 
GetOverlappingOperon(const CSeq_loc & loc,CScope & scope)1489 CConstRef<CSeq_feat> GetOverlappingOperon(const CSeq_loc& loc, CScope& scope)
1490 {
1491     return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_operon,
1492                                   eOverlap_Contained, scope);
1493 }
1494 
1495 
1496 const char* kRibosomalSlippageText = "ribosomal slippage";
1497 
GetBestMrnaForCds(const CSeq_feat & cds_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1498 CConstRef<CSeq_feat> GetBestMrnaForCds(const CSeq_feat& cds_feat,
1499                                        CScope& scope,
1500                                        TBestFeatOpts opts,
1501                                        CGetOverlappingFeaturesPlugin *plugin )
1502 {
1503     _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
1504     CConstRef<CSeq_feat> mrna_feat;
1505 
1506     // search for a best overlapping mRNA
1507     // we start with a scan through the product accessions because we need
1508     // to insure that the chosen transcript does indeed match what we want
1509     TFeatScores feats;
1510     EOverlapType overlap_type = eOverlap_CheckIntRev;
1511     if (cds_feat.IsSetExcept()  &&  cds_feat.GetExcept()  &&
1512         cds_feat.IsSetExcept_text()  &&
1513         cds_feat.GetExcept_text() == kRibosomalSlippageText) {
1514         overlap_type = eOverlap_SubsetRev;
1515     }
1516     GetOverlappingFeatures(cds_feat.GetLocation(),
1517                            CSeqFeatData::e_Rna,
1518                            CSeqFeatData::eSubtype_mRNA,
1519                            overlap_type,
1520                            feats, scope, opts, plugin );
1521     /// easy out: 0 or 1 possible features
1522     if (feats.size() < 2) {
1523         if (feats.size() == 1) {
1524             mrna_feat = feats.front().second;
1525         }
1526         return mrna_feat;
1527     }
1528 
1529     if (cds_feat.IsSetProduct()) {
1530         try {
1531             // this may throw, if the product spans multiple sequences
1532             // this would be extremely unlikely, but we catch anyway
1533             const CSeq_id& product_id =
1534                 sequence::GetId(cds_feat.GetProduct(), &scope);
1535 
1536             ITERATE (TFeatScores, feat_iter, feats) {
1537                 const CSeq_feat& feat = *feat_iter->second;
1538                 if ( !feat.IsSetExt() ) {
1539                     continue;
1540                 }
1541 
1542                 /// scan the user object in the ext field
1543                 /// we look for a user object of type MrnaProteinLink
1544                 /// this should contain a seq-d string that we can match
1545                 CTypeConstIterator<CUser_object> obj_iter(feat);
1546                 for ( ;  obj_iter;  ++obj_iter) {
1547                     if (obj_iter->IsSetType()  &&
1548                         obj_iter->GetType().IsStr()  &&
1549                         obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1550                         string prot_id_str = obj_iter->GetField("protein seqID")
1551                             .GetData().GetStr();
1552                         CSeq_id prot_id(prot_id_str);
1553                         vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1554                         ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1555                         ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1556                             if (product_id.Match(*id_iter->GetSeqId())) {
1557                                 mrna_feat.Reset(&feat);
1558                                 return mrna_feat;
1559                             }
1560                         }
1561                     }
1562                 }
1563             }
1564         }
1565         catch (exception&) {
1566         }
1567     }
1568 
1569     if (cds_feat.IsSetProduct()  &&  !(opts & fBestFeat_NoExpensive) ) {
1570         try {
1571             // this may throw, if the product spans multiple sequences
1572             // this would be extremely unlikely, but we catch anyway
1573             const CSeq_id& product_id =
1574                 sequence::GetId(cds_feat.GetProduct(), &scope);
1575 
1576             TFeatScores matching_feats;
1577             ITERATE (TFeatScores, feat_iter, feats) {
1578 
1579                 // we grab the mRNA product, if available, and scan it for
1580                 // a CDS feature.  the CDS feature should point to the same
1581                 // product as our current feature.
1582                 const CSeq_feat& mrna = *feat_iter->second;
1583                 if ( !mrna.IsSetProduct() ) {
1584                     continue;
1585                 }
1586 
1587                 CBioseq_Handle handle =
1588                     scope.GetBioseqHandle(mrna.GetProduct());
1589                 if ( !handle ) {
1590                     continue;
1591                 }
1592 
1593                 SAnnotSelector cds_sel;
1594                 cds_sel.SetOverlapIntervals()
1595                     .ExcludeNamedAnnots("SNP")
1596                     .SetResolveTSE()
1597                     .SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
1598                 CFeat_CI other_iter(scope, mrna.GetProduct(), cds_sel);
1599                 for ( ;  other_iter  &&  !mrna_feat;  ++other_iter) {
1600                     const CSeq_feat& cds = other_iter->GetOriginalFeature();
1601                     if ( !cds.IsSetProduct() ) {
1602                         continue;
1603                     }
1604 
1605                     CBioseq_Handle prot_handle =
1606                         scope.GetBioseqHandle(cds.GetProduct());
1607                     if ( !prot_handle ) {
1608                         continue;
1609                     }
1610 
1611                     if (prot_handle.IsSynonym(product_id)) {
1612                         // got it!
1613                         matching_feats.push_back(*feat_iter);
1614                         break;
1615                     }
1616                 }
1617             }
1618             if ( !matching_feats.empty() ) {
1619                 // keep only matching features
1620                 feats.swap(matching_feats);
1621                 if ( feats.size() == 1 ) {
1622                     mrna_feat = feats.front().second;
1623                     return mrna_feat;
1624                 }
1625             }
1626         }
1627         catch (exception&) {
1628         }
1629     }
1630 
1631     // check for transcript_id; this is a fast check
1632     string transcript_id = cds_feat.GetNamedQual("transcript_id");
1633     if ( !transcript_id.empty() ) {
1634         ITERATE (vector<TFeatScore>, feat_iter, feats) {
1635             const CSeq_feat& feat = *feat_iter->second;
1636             string other_transcript_id =
1637                 feat.GetNamedQual("transcript_id");
1638             if (transcript_id == other_transcript_id) {
1639                 mrna_feat.Reset(&feat);
1640                 return mrna_feat;
1641             }
1642         }
1643     }
1644 
1645     //
1646     // try to find the best by overlaps alone
1647     //
1648 
1649     if ( !mrna_feat  &&  !(opts & fBestFeat_StrictMatch) ) {
1650         if (opts & fBestFeat_FavorLonger) {
1651             mrna_feat = feats.back().second;
1652         } else {
1653             mrna_feat = feats.front().second;
1654         }
1655     }
1656 
1657     return mrna_feat;
1658 }
1659 
1660 
1661 // Plugin for GetOverlappingFeatures - uses eOverlap_CheckIntervals
1662 // or eOverlap_Subset depending on the "ribosomal slippage" flag
1663 // in the current feature.
1664 
1665 class CCdsForMrnaPlugin : public CGetOverlappingFeaturesPlugin
1666 {
1667 public:
CCdsForMrnaPlugin(CGetOverlappingFeaturesPlugin * prev_plugin)1668     CCdsForMrnaPlugin(CGetOverlappingFeaturesPlugin* prev_plugin)
1669         : m_PrevPlugin(prev_plugin) {}
~CCdsForMrnaPlugin()1670     virtual ~CCdsForMrnaPlugin() {}
1671 
processSAnnotSelector(SAnnotSelector & sel)1672     virtual void processSAnnotSelector(
1673         SAnnotSelector &sel)
1674     {
1675         if ( m_PrevPlugin ) {
1676             m_PrevPlugin->processSAnnotSelector(sel);
1677         }
1678     }
1679 
setUpFeatureIterator(CBioseq_Handle & bioseq_handle,auto_ptr<CFeat_CI> & feat_ci,TSeqPos circular_length,CRange<TSeqPos> & range,const CSeq_loc & loc,SAnnotSelector & sel,CScope & scope,ENa_strand & strand)1680     virtual void setUpFeatureIterator(
1681         CBioseq_Handle &bioseq_handle,
1682         auto_ptr<CFeat_CI> &feat_ci,
1683         TSeqPos circular_length ,
1684         CRange<TSeqPos> &range,
1685         const CSeq_loc& loc,
1686         SAnnotSelector &sel,
1687         CScope &scope,
1688         ENa_strand &strand)
1689     {
1690         if ( m_PrevPlugin ) {
1691             m_PrevPlugin->setUpFeatureIterator(bioseq_handle,
1692                 feat_ci, circular_length, range, loc, sel, scope, strand);
1693             return;
1694         }
1695         if ( bioseq_handle ) {
1696             feat_ci.reset(new CFeat_CI(bioseq_handle, range, strand, sel));
1697         } else {
1698             feat_ci.reset(new CFeat_CI(scope, loc, sel));
1699         }
1700     }
1701 
processLoc(CBioseq_Handle & bioseq_handle,CRef<CSeq_loc> & loc,TSeqPos circular_length)1702     virtual void processLoc(
1703         CBioseq_Handle &bioseq_handle,
1704         CRef<CSeq_loc> &loc,
1705         TSeqPos circular_length)
1706     {
1707         if ( m_PrevPlugin ) {
1708             m_PrevPlugin->processLoc(bioseq_handle, loc, circular_length);
1709         }
1710     }
1711 
processMainLoop(bool & shouldContinueToNextIteration,CRef<CSeq_loc> & cleaned_loc_this_iteration,CRef<CSeq_loc> & candidate_feat_loc,EOverlapType & overlap_type_this_iteration,bool & revert_locations_this_iteration,CBioseq_Handle & bioseq_handle,const CMappedFeat & feat,TSeqPos circular_length,SAnnotSelector::EOverlapType annot_overlap_type)1712     virtual void processMainLoop(
1713         bool &shouldContinueToNextIteration,
1714         CRef<CSeq_loc> &cleaned_loc_this_iteration,
1715         CRef<CSeq_loc> &candidate_feat_loc,
1716         EOverlapType &overlap_type_this_iteration,
1717         bool &revert_locations_this_iteration,
1718         CBioseq_Handle &bioseq_handle,
1719         const CMappedFeat &feat,
1720         TSeqPos circular_length,
1721         SAnnotSelector::EOverlapType annot_overlap_type)
1722     {
1723         const CSeq_feat& cds = feat.GetOriginalFeature();
1724         _ASSERT(cds.GetData().GetSubtype() ==
1725             CSeqFeatData::eSubtype_cdregion);
1726         // If the feature has "ribosomal slippage" flag set, use
1727         // eOverlap_Subset. Otherwise use more strict eOverlap_CheckIntervals.
1728         if (cds.IsSetExcept()  &&  cds.GetExcept()  &&
1729             cds.IsSetExcept_text()  &&
1730             cds.GetExcept_text() == kRibosomalSlippageText) {
1731             overlap_type_this_iteration = eOverlap_Subset;
1732         }
1733         if ( m_PrevPlugin ) {
1734             m_PrevPlugin->processMainLoop(shouldContinueToNextIteration,
1735                 cleaned_loc_this_iteration, candidate_feat_loc,
1736                 overlap_type_this_iteration,
1737                 revert_locations_this_iteration,
1738                 bioseq_handle, feat, circular_length, annot_overlap_type);
1739         }
1740     }
1741 
postProcessDiffAmount(Int8 & cur_diff,CRef<CSeq_loc> & cleaned_loc,CRef<CSeq_loc> & candidate_feat_loc,CScope & scope,SAnnotSelector & sel,TSeqPos circular_length)1742     virtual void postProcessDiffAmount(
1743         Int8 &cur_diff,
1744         CRef<CSeq_loc> &cleaned_loc,
1745         CRef<CSeq_loc> &candidate_feat_loc,
1746         CScope &scope,
1747         SAnnotSelector &sel,
1748         TSeqPos circular_length )
1749     {
1750         if ( m_PrevPlugin ) {
1751             m_PrevPlugin->postProcessDiffAmount(cur_diff,
1752                 cleaned_loc, candidate_feat_loc,
1753                 scope, sel, circular_length);
1754         }
1755     }
1756 
1757 private:
1758     CGetOverlappingFeaturesPlugin* m_PrevPlugin;
1759 };
1760 
1761 
1762 CConstRef<CSeq_feat>
GetBestCdsForMrna(const CSeq_feat & mrna_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1763 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
1764                   CScope& scope,
1765                   TBestFeatOpts opts,
1766                   CGetOverlappingFeaturesPlugin *plugin )
1767 {
1768     _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
1769     CConstRef<CSeq_feat> cds_feat;
1770 
1771     auto_ptr<CGetOverlappingFeaturesPlugin> cds_plugin(
1772         new CCdsForMrnaPlugin(plugin));
1773     // search for a best overlapping CDS
1774     // we start with a scan through the product accessions because we need
1775     // to insure that the chosen transcript does indeed match what we want
1776     TFeatScores feats;
1777     GetOverlappingFeatures(mrna_feat.GetLocation(),
1778                            CSeqFeatData::e_Cdregion,
1779                            CSeqFeatData::eSubtype_cdregion,
1780                            eOverlap_CheckIntervals,
1781                            feats, scope, opts, cds_plugin.get());
1782 
1783     /// easy out: 0 or 1 possible features
1784     if (feats.size() < 2) {
1785         if (feats.size() == 1) {
1786             cds_feat = feats.front().second;
1787         }
1788         return cds_feat;
1789     }
1790 
1791     if (mrna_feat.IsSetExt()) {
1792         /// scan the user object in the ext field
1793         /// we look for a user object of type MrnaProteinLink
1794         /// this should contain a seq-d string that we can match
1795         string prot_id_str;
1796         CTypeConstIterator<CUser_object> obj_iter(mrna_feat);
1797         for ( ;  obj_iter;  ++obj_iter) {
1798             if (obj_iter->IsSetType()  &&
1799                 obj_iter->GetType().IsStr()  &&
1800                 obj_iter->GetType().GetStr() == "MrnaProteinLink") {
1801                 prot_id_str = obj_iter->GetField("protein seqID").GetData().GetStr();
1802                 break;
1803             }
1804         }
1805         if ( !prot_id_str.empty() ) {
1806             CSeq_id prot_id(prot_id_str);
1807             vector<CSeq_id_Handle> ids = scope.GetIds(prot_id);
1808             ids.push_back(CSeq_id_Handle::GetHandle(prot_id));
1809 
1810             try {
1811                 /// look for a CDS feature that matches this expected ID
1812                 ITERATE (TFeatScores, feat_iter, feats) {
1813                     const CSeq_feat& feat = *feat_iter->second;
1814                     if ( !feat.IsSetProduct() ) {
1815                         continue;
1816                     }
1817                     const CSeq_id& id =
1818                         sequence::GetId(feat.GetLocation(), &scope);
1819                     ITERATE (vector<CSeq_id_Handle>, id_iter, ids) {
1820                         if (id.Match(*id_iter->GetSeqId())) {
1821                             cds_feat.Reset(&feat);
1822                             return cds_feat;
1823                         }
1824                     }
1825                 }
1826             }
1827             catch (exception&) {
1828             }
1829         }
1830     }
1831 
1832     // scan through the product accessions because we need to insure that the
1833     // chosen transcript does indeed match what we want
1834     if (mrna_feat.IsSetProduct()  &&  !(opts & fBestFeat_NoExpensive) ) {
1835         do {
1836             try {
1837                 // this may throw, if the product spans multiple sequences
1838                 // this would be extremely unlikely, but we catch anyway
1839                 const CSeq_id& mrna_product  =
1840                     sequence::GetId(mrna_feat.GetProduct(), &scope);
1841                 CBioseq_Handle mrna_handle =
1842                     scope.GetBioseqHandle(mrna_product);
1843 
1844                 // find the ID of the protein accession we're looking for
1845                 CConstRef<CSeq_id> protein_id;
1846                 {{
1847                     SAnnotSelector sel;
1848                     sel.SetOverlapIntervals()
1849                         .ExcludeNamedAnnots("SNP")
1850                         .SetResolveTSE()
1851                         .SetFeatSubtype(CSeqFeatData::eSubtype_cdregion);
1852 
1853                      CFeat_CI iter(mrna_handle, sel);
1854                      for ( ;  iter;  ++iter) {
1855                          if (iter->IsSetProduct()) {
1856                              protein_id.Reset
1857                                  (&sequence::GetId(iter->GetProduct(),
1858                                  &scope));
1859                              break;
1860                          }
1861                      }
1862                  }}
1863 
1864                 if ( !protein_id ) {
1865                     break;
1866                 }
1867 
1868                 TFeatScores::const_iterator feat_iter = feats.begin();
1869                 TFeatScores::const_iterator feat_end  = feats.end();
1870                 for ( ;  feat_iter != feat_end  &&  !cds_feat;  ++feat_iter) {
1871                     /// look for all contained CDS features; for each, check
1872                     /// to see if the protein product is the expected protein
1873                     /// product
1874                     const CSeq_feat& cds = *feat_iter->second;
1875                     if ( !cds.IsSetProduct() ) {
1876                         continue;
1877                     }
1878 
1879                     CBioseq_Handle prot_handle =
1880                         scope.GetBioseqHandle(cds.GetProduct());
1881                     if ( !prot_handle ) {
1882                         continue;
1883                     }
1884 
1885                     if (prot_handle.IsSynonym(*protein_id)) {
1886                         // got it!
1887                         cds_feat.Reset(&cds);
1888                         return cds_feat;
1889                     }
1890                 }
1891             }
1892             catch ( exception& ) {
1893             }
1894         }
1895         while (false);
1896     }
1897 
1898     // check for transcript_id
1899     // this is generally only available in GTF/GFF-imported features
1900     string transcript_id = mrna_feat.GetNamedQual("transcript_id");
1901     if ( !transcript_id.empty() ) {
1902         ITERATE (TFeatScores, feat_iter, feats) {
1903             const CSeq_feat& feat = *feat_iter->second;
1904             string other_transcript_id =
1905                 feat.GetNamedQual("transcript_id");
1906             if (transcript_id == other_transcript_id) {
1907                 cds_feat.Reset(&feat);
1908                 return cds_feat;
1909             }
1910         }
1911     }
1912 
1913     //
1914     // try to find the best by overlaps alone
1915     //
1916 
1917     if ( !cds_feat  &&  !(opts & fBestFeat_StrictMatch) ) {
1918         if (opts & fBestFeat_FavorLonger) {
1919             cds_feat = feats.back().second;
1920         } else {
1921             cds_feat = feats.front().second;
1922         }
1923     }
1924 
1925     return cds_feat;
1926 }
1927 
1928 
GetBestGeneForMrna(const CSeq_feat & mrna_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)1929 CConstRef<CSeq_feat> GetBestGeneForMrna(const CSeq_feat& mrna_feat,
1930                                           CScope& scope,
1931                                           TBestFeatOpts opts,
1932                                           CGetOverlappingFeaturesPlugin *plugin )
1933 {
1934     _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
1935     CConstRef<CSeq_feat> gene_feat;
1936 
1937     // search for a best overlapping gene
1938     TFeatScores feats;
1939     GetOverlappingFeatures(mrna_feat.GetLocation(),
1940                            CSeqFeatData::e_Gene,
1941                            CSeqFeatData::eSubtype_any,
1942                            eOverlap_Contained,
1943                            feats, scope, opts, plugin );
1944     /// easy out: 0 or 1 possible features
1945     if (feats.size() < 2) {
1946         if (feats.size() == 1) {
1947             gene_feat = feats.front().second;
1948         }
1949         return gene_feat;
1950     }
1951 
1952     ///
1953     /// compare gene xrefs to see if ew can find a match
1954     ///
1955     const CGene_ref* ref = mrna_feat.GetGeneXref();
1956     if (ref) {
1957         if (ref->IsSuppressed()) {
1958             /// 'suppress' case
1959             return gene_feat;
1960         }
1961 
1962         string ref_str;
1963         ref->GetLabel(&ref_str);
1964 
1965         ITERATE (TFeatScores, feat_it, feats) {
1966             const CSeq_feat& feat      = *feat_it->second;
1967             const CGene_ref& other_ref = feat.GetData().GetGene();
1968             string other_ref_str;
1969             other_ref.GetLabel(&other_ref_str);
1970             if (ref_str == other_ref_str) {
1971                 gene_feat = &feat;
1972                 return gene_feat;
1973             }
1974         }
1975     }
1976 
1977     ///
1978     /// compare by dbxrefs
1979     ///
1980     if (mrna_feat.IsSetDbxref()) {
1981         int gene_id = 0;
1982         ITERATE (CSeq_feat::TDbxref, dbxref, mrna_feat.GetDbxref()) {
1983             if ((*dbxref)->GetDb() == "GeneID"  ||
1984                 (*dbxref)->GetDb() == "LocusID") {
1985                 gene_id = (*dbxref)->GetTag().GetId();
1986                 break;
1987             }
1988         }
1989 
1990         if (gene_id != 0) {
1991             ITERATE (TFeatScores, feat_it, feats) {
1992                 const CSeq_feat& feat = *feat_it->second;
1993                 ITERATE (CSeq_feat::TDbxref, dbxref, feat.GetDbxref()) {
1994                     const string& db = (*dbxref)->GetDb();
1995                     if ((db == "GeneID"  ||  db == "LocusID")  &&
1996                         (*dbxref)->GetTag().GetId() == gene_id) {
1997                         gene_feat = &feat;
1998                         return gene_feat;
1999                     }
2000                 }
2001             }
2002         }
2003     }
2004 
2005     if ( !gene_feat  &&  !(opts & fBestFeat_StrictMatch) ) {
2006         if (opts & fBestFeat_FavorLonger) {
2007             gene_feat = feats.back().second;
2008         } else {
2009             gene_feat = feats.front().second;
2010         }
2011     }
2012 
2013     return gene_feat;
2014 }
2015 
2016 
GetBestGeneForCds(const CSeq_feat & cds_feat,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2017 CConstRef<CSeq_feat> GetBestGeneForCds(const CSeq_feat& cds_feat,
2018                                          CScope& scope,
2019                                          TBestFeatOpts opts,
2020                                          CGetOverlappingFeaturesPlugin *plugin )
2021 {
2022     _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2023 
2024     CConstRef<CSeq_feat> feat_ref;
2025 
2026     // search for a best overlapping gene
2027     TFeatScores feats;
2028     GetOverlappingFeatures(cds_feat.GetLocation(),
2029                            CSeqFeatData::e_Gene,
2030                            CSeqFeatData::eSubtype_any,
2031                            eOverlap_Contained,
2032                            feats, scope, opts, plugin );
2033     /// easy out: 0 or 1 possible features
2034     if (feats.size() < 2) {
2035         if (feats.size() == 1) {
2036             feat_ref = feats.front().second;
2037         }
2038         return feat_ref;
2039     }
2040 
2041     // next: see if we can match based on gene xref
2042     const CGene_ref* ref = cds_feat.GetGeneXref();
2043     if (ref) {
2044         if (ref->IsSuppressed()) {
2045             /// 'suppress' case
2046             return feat_ref;
2047         }
2048 
2049         ITERATE (TFeatScores, feat_it, feats) {
2050             const CSeq_feat& feat = *feat_it->second;
2051 
2052             string ref_str;
2053             ref->GetLabel(&ref_str);
2054 
2055             const CGene_ref& other_ref = feat.GetData().GetGene();
2056             string other_ref_str;
2057             other_ref.GetLabel(&other_ref_str);
2058             if (ref_str == other_ref_str) {
2059                 feat_ref = &feat;
2060                 return feat_ref;
2061             }
2062         }
2063     }
2064 
2065     /// last check: expensive: need to proxy through mRNA match
2066     if ( !feat_ref  &&  !(opts & fBestFeat_NoExpensive) ) {
2067         feat_ref = GetBestMrnaForCds(cds_feat, scope,
2068                                      opts | fBestFeat_StrictMatch);
2069         if (feat_ref) {
2070             feat_ref = GetBestGeneForMrna(*feat_ref, scope, opts);
2071             if (feat_ref) {
2072                 return feat_ref;
2073             }
2074         }
2075     }
2076 
2077     if ( !feat_ref  &&  !(opts & fBestFeat_StrictMatch) ) {
2078         feat_ref = feats.front().second;
2079     }
2080     return feat_ref;
2081 }
2082 
2083 
GetMrnasForGene(const CSeq_feat & gene_feat,CScope & scope,list<CConstRef<CSeq_feat>> & mrna_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2084 void GetMrnasForGene(const CSeq_feat& gene_feat, CScope& scope,
2085                      list< CConstRef<CSeq_feat> >& mrna_feats,
2086                      TBestFeatOpts opts,
2087                      CGetOverlappingFeaturesPlugin *plugin )
2088 {
2089     _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2090     SAnnotSelector sel;
2091     sel.SetResolveTSE()
2092         .SetAdaptiveDepth()
2093         .IncludeFeatSubtype(CSeqFeatData::eSubtype_mRNA);
2094     CFeat_CI feat_it(scope, gene_feat.GetLocation(), sel);
2095     if (feat_it.GetSize() == 0) {
2096         return;
2097     }
2098 
2099     ///
2100     /// pass 1: compare by gene xref
2101     ///
2102     {{
2103          const CGene_ref& ref = gene_feat.GetData().GetGene();
2104          string ref_str;
2105          ref.GetLabel(&ref_str);
2106          size_t count = 0;
2107          for ( ;  feat_it;  ++feat_it) {
2108 
2109              const CGene_ref* other_ref =
2110                  feat_it->GetOriginalFeature().GetGeneXref();
2111              if ( !other_ref  ||  other_ref->IsSuppressed() ) {
2112                  continue;
2113              }
2114 
2115              string other_ref_str;
2116              other_ref->GetLabel(&other_ref_str);
2117              if (other_ref_str != ref_str) {
2118                  continue;
2119              }
2120 
2121              ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2122                                                feat_it->GetLocation(),
2123                                                &scope,
2124                                                sequence::fCompareOverlapping);
2125              if (comp != eSame  &&  comp != eContains) {
2126                  continue;
2127              }
2128 
2129              CConstRef<CSeq_feat> feat_ref(&feat_it->GetOriginalFeature());
2130              mrna_feats.push_back(feat_ref);
2131              ++count;
2132          }
2133 
2134          if (count) {
2135              return;
2136          }
2137      }}
2138 
2139     ///
2140     /// pass 2: compare by gene id
2141     ///
2142     {{
2143         int gene_id = 0;
2144         if (gene_feat.IsSetDbxref()) {
2145             ITERATE (CSeq_feat::TDbxref, dbxref, gene_feat.GetDbxref()) {
2146                 if ((*dbxref)->GetDb() == "GeneID"  ||
2147                     (*dbxref)->GetDb() == "LocusID") {
2148                     gene_id = (*dbxref)->GetTag().GetId();
2149                     break;
2150                 }
2151             }
2152         }
2153 
2154         if (gene_id) {
2155             size_t count = 0;
2156             feat_it.Rewind();
2157             for ( ;  feat_it;  ++feat_it) {
2158                 /// check the suppress case
2159                 /// regardless of the gene-id binding, we always ignore these
2160                 const CGene_ref* other_ref =
2161                     feat_it->GetOriginalFeature().GetGeneXref();
2162                 if ( other_ref  &&  other_ref->IsSuppressed() ) {
2163                     continue;
2164                 }
2165 
2166                 CConstRef<CSeq_feat> ref(&feat_it->GetOriginalFeature());
2167 
2168                 ECompare comp = sequence::Compare(gene_feat.GetLocation(),
2169                                                 feat_it->GetLocation(),
2170                                                 &scope,
2171                                                 sequence::fCompareOverlapping);
2172                 if (comp != eSame  &&  comp != eContains) {
2173                     continue;
2174                 }
2175 
2176                 if (feat_it->IsSetDbxref()) {
2177                     ITERATE (CSeq_feat::TDbxref, dbxref, feat_it->GetDbxref()) {
2178                         if (((*dbxref)->GetDb() == "GeneID"  ||
2179                             (*dbxref)->GetDb() == "LocusID")  &&
2180                             (*dbxref)->GetTag().GetId() == gene_id) {
2181                             mrna_feats.push_back(ref);
2182                             ++count;
2183                             break;
2184                         }
2185                     }
2186                 }
2187             }
2188 
2189             if (count) {
2190                 return;
2191             }
2192         }
2193     }}
2194 
2195     // gene doesn't have a gene_id or a gene ref
2196     CConstRef<CSeq_feat> feat =
2197         sequence::GetBestOverlappingFeat(gene_feat.GetLocation(),
2198                                          CSeqFeatData::eSubtype_mRNA,
2199                                          sequence::eOverlap_Contains,
2200                                          scope, opts, plugin );
2201     if (feat) {
2202         mrna_feats.push_back(feat);
2203     }
2204 }
2205 
2206 
GetCdssForGene(const CSeq_feat & gene_feat,CScope & scope,list<CConstRef<CSeq_feat>> & cds_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2207 void GetCdssForGene(const CSeq_feat& gene_feat, CScope& scope,
2208                     list< CConstRef<CSeq_feat> >& cds_feats,
2209                     TBestFeatOpts opts,
2210                     CGetOverlappingFeaturesPlugin *plugin )
2211 {
2212     _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2213     list< CConstRef<CSeq_feat> > mrna_feats;
2214     GetMrnasForGene(gene_feat, scope, mrna_feats, opts);
2215     if (mrna_feats.size()) {
2216         ITERATE (list< CConstRef<CSeq_feat> >, iter, mrna_feats) {
2217             CConstRef<CSeq_feat> cds = GetBestCdsForMrna(**iter, scope, opts);
2218             if (cds) {
2219                 cds_feats.push_back(cds);
2220             }
2221         }
2222     } else {
2223         CConstRef<CSeq_feat> feat =
2224             sequence::GetBestOverlappingFeat(gene_feat.GetLocation(),
2225                                              CSeqFeatData::eSubtype_cdregion,
2226                                              sequence::eOverlap_Subset,
2227                                              scope, opts, plugin );
2228         if (feat) {
2229             cds_feats.push_back(feat);
2230         }
2231     }
2232 }
2233 
2234 
2235 CConstRef<CSeq_feat>
GetBestOverlappingFeat(const CSeq_feat & feat,CSeqFeatData::E_Choice feat_type,sequence::EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2236 GetBestOverlappingFeat(const CSeq_feat& feat,
2237                        CSeqFeatData::E_Choice feat_type,
2238                        sequence::EOverlapType overlap_type,
2239                        CScope& scope,
2240                        TBestFeatOpts opts,
2241                        CGetOverlappingFeaturesPlugin *plugin )
2242 {
2243     CConstRef<CSeq_feat> feat_ref;
2244     switch (feat_type) {
2245     case CSeqFeatData::e_Gene:
2246         return GetBestOverlappingFeat(feat,
2247                                       CSeqFeatData::eSubtype_gene,
2248                                       overlap_type, scope, opts, plugin );
2249 
2250     case CSeqFeatData::e_Rna:
2251         feat_ref = GetBestOverlappingFeat(feat,
2252                                           CSeqFeatData::eSubtype_mRNA,
2253                                           overlap_type, scope, opts, plugin );
2254         break;
2255 
2256     case CSeqFeatData::e_Cdregion:
2257         return GetBestOverlappingFeat(feat,
2258                                       CSeqFeatData::eSubtype_cdregion,
2259                                       overlap_type, scope, opts, plugin );
2260 
2261     default:
2262         break;
2263     }
2264 
2265     if ( !feat_ref ) {
2266         feat_ref = sequence::GetBestOverlappingFeat
2267             (feat.GetLocation(), feat_type, overlap_type, scope, opts, plugin );
2268     }
2269 
2270     return feat_ref;
2271 }
2272 
2273 
2274 CConstRef<CSeq_feat>
GetBestOverlappingFeat(const CSeq_feat & feat,CSeqFeatData::ESubtype subtype,sequence::EOverlapType overlap_type,CScope & scope,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2275 GetBestOverlappingFeat(const CSeq_feat& feat,
2276                        CSeqFeatData::ESubtype subtype,
2277                        sequence::EOverlapType overlap_type,
2278                        CScope& scope,
2279                        TBestFeatOpts opts,
2280                        CGetOverlappingFeaturesPlugin *plugin )
2281 {
2282     CConstRef<CSeq_feat> feat_ref;
2283     switch (feat.GetData().GetSubtype()) {
2284     case CSeqFeatData::eSubtype_mRNA:
2285         switch (subtype) {
2286         case CSeqFeatData::eSubtype_gene:
2287             return GetBestGeneForMrna(feat, scope, opts);
2288 
2289         case CSeqFeatData::eSubtype_cdregion:
2290             return GetBestCdsForMrna(feat, scope, opts);
2291 
2292         default:
2293             break;
2294         }
2295         break;
2296 
2297     case CSeqFeatData::eSubtype_cdregion:
2298         switch (subtype) {
2299         case CSeqFeatData::eSubtype_mRNA:
2300             return GetBestMrnaForCds(feat, scope, opts);
2301 
2302         case CSeqFeatData::eSubtype_gene:
2303             return GetBestGeneForCds(feat, scope, opts);
2304 
2305         default:
2306             break;
2307         }
2308         break;
2309 
2310     case CSeqFeatData::eSubtype_variation:
2311         return GetBestOverlapForSNP(feat, subtype, scope, true);
2312 
2313     default:
2314         break;
2315     }
2316 
2317     if ( !feat_ref ) {
2318         feat_ref = GetBestOverlappingFeat
2319             (feat.GetLocation(), subtype, overlap_type, scope, opts, plugin );
2320     }
2321 
2322     return feat_ref;
2323 }
2324 
2325 
2326 namespace {
2327 
x_GetFeatById(CSeqFeatData::ESubtype subtype,const CSeq_feat & feat,const CTSE_Handle & tse)2328 CConstRef<CSeq_feat> x_GetFeatById(CSeqFeatData::ESubtype subtype,
2329                                    const CSeq_feat& feat,
2330                                    const CTSE_Handle& tse)
2331 {
2332     if ( feat.IsSetXref() ) {
2333         ITERATE ( CSeq_feat::TXref, it, feat.GetXref() ) {
2334             const CSeqFeatXref& xref = **it;
2335             if ( xref.IsSetId() ) {
2336                 const CFeat_id& id = xref.GetId();
2337                 if ( id.IsLocal() ) {
2338                     const CObject_id& obj_id = id.GetLocal();
2339                     if ( obj_id.IsId() ) {
2340                         int local_id = obj_id.GetId();
2341                         CSeq_feat_Handle feat_handle =
2342                             tse.GetFeatureWithId(subtype, local_id);
2343                         if ( feat_handle ) {
2344                             return feat_handle.GetSeq_feat();
2345                         }
2346                     }
2347                 }
2348             }
2349         }
2350     }
2351     return null;
2352 }
2353 
2354 }
2355 
2356 
2357 CConstRef<CSeq_feat>
GetBestGeneForMrna(const CSeq_feat & mrna_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2358 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
2359                    const CTSE_Handle& tse,
2360                    TBestFeatOpts opts,
2361                    CGetOverlappingFeaturesPlugin *plugin)
2362 {
2363     _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
2364     CConstRef<CSeq_feat> ret =
2365         x_GetFeatById(CSeqFeatData::eSubtype_gene, mrna_feat, tse);
2366     if ( !ret ) {
2367         ret = GetBestGeneForMrna(mrna_feat, tse.GetScope(), opts);
2368     }
2369     return ret;
2370 }
2371 
2372 CConstRef<CSeq_feat>
GetBestGeneForCds(const CSeq_feat & cds_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2373 GetBestGeneForCds(const CSeq_feat& cds_feat,
2374                   const CTSE_Handle& tse,
2375                   TBestFeatOpts opts,
2376                   CGetOverlappingFeaturesPlugin *plugin)
2377 {
2378     _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2379     CConstRef<CSeq_feat> ret =
2380         x_GetFeatById(CSeqFeatData::eSubtype_gene, cds_feat, tse);
2381     if ( !ret ) {
2382         ret = GetBestGeneForCds(cds_feat, tse.GetScope(), opts);
2383     }
2384     return ret;
2385 }
2386 
2387 CConstRef<CSeq_feat>
GetBestMrnaForCds(const CSeq_feat & cds_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2388 GetBestMrnaForCds(const CSeq_feat& cds_feat,
2389                   const CTSE_Handle& tse,
2390                   TBestFeatOpts opts,
2391                   CGetOverlappingFeaturesPlugin *plugin)
2392 {
2393     _ASSERT(cds_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_cdregion);
2394     CConstRef<CSeq_feat> ret =
2395         x_GetFeatById(CSeqFeatData::eSubtype_mRNA, cds_feat, tse);
2396     if ( !ret ) {
2397         ret = GetBestMrnaForCds(cds_feat, tse.GetScope(), opts);
2398     }
2399     return ret;
2400 }
2401 
2402 CConstRef<CSeq_feat>
GetBestCdsForMrna(const CSeq_feat & mrna_feat,const CTSE_Handle & tse,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2403 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
2404                   const CTSE_Handle& tse,
2405                   TBestFeatOpts opts,
2406                   CGetOverlappingFeaturesPlugin *plugin)
2407 {
2408     _ASSERT(mrna_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_mRNA);
2409     CConstRef<CSeq_feat> ret =
2410         x_GetFeatById(CSeqFeatData::eSubtype_cdregion, mrna_feat, tse);
2411     if ( !ret ) {
2412         ret = GetBestCdsForMrna(mrna_feat, tse.GetScope(), opts);
2413     }
2414     return ret;
2415 }
2416 
GetMrnasForGene(const CSeq_feat & gene_feat,const CTSE_Handle & tse,list<CConstRef<CSeq_feat>> & mrna_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2417 void GetMrnasForGene(const CSeq_feat& gene_feat,
2418                      const CTSE_Handle& tse,
2419                      list< CConstRef<CSeq_feat> >& mrna_feats,
2420                      TBestFeatOpts opts,
2421                      CGetOverlappingFeaturesPlugin *plugin)
2422 {
2423     _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2424     GetMrnasForGene(gene_feat, tse.GetScope(), mrna_feats, opts);
2425 }
2426 
GetCdssForGene(const CSeq_feat & gene_feat,const CTSE_Handle & tse,list<CConstRef<CSeq_feat>> & cds_feats,TBestFeatOpts opts,CGetOverlappingFeaturesPlugin * plugin)2427 void GetCdssForGene(const CSeq_feat& gene_feat,
2428                     const CTSE_Handle& tse,
2429                     list< CConstRef<CSeq_feat> >& cds_feats,
2430                     TBestFeatOpts opts,
2431                     CGetOverlappingFeaturesPlugin *plugin)
2432 {
2433     _ASSERT(gene_feat.GetData().GetSubtype() == CSeqFeatData::eSubtype_gene);
2434     GetCdssForGene(gene_feat, tse.GetScope(), cds_feats, opts);
2435 }
2436 
2437 // Get the encoding CDS feature of a given protein sequence.
GetCDSForProduct(const CBioseq & product,CScope * scope)2438 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope)
2439 {
2440     if ( scope == 0 ) {
2441         return 0;
2442     }
2443 
2444     return GetCDSForProduct(scope->GetBioseqHandle(product));
2445 }
2446 
GetCDSForProduct(const CBioseq_Handle & bsh)2447 const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& bsh)
2448 {
2449     CMappedFeat f = GetMappedCDSForProduct(bsh);
2450     if ( f ) {
2451         return &f.GetOriginalFeature();
2452     }
2453 
2454     return 0;
2455 }
2456 
GetMappedCDSForProduct(const CBioseq_Handle & bsh)2457 CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle& bsh)
2458 {
2459     if ( bsh ) {
2460         // try first in-TSE CDS
2461         CFeat_CI fi(bsh,
2462                     SAnnotSelector(CSeqFeatData::e_Cdregion)
2463                     .SetByProduct().SetLimitTSE(bsh.GetTSE_Handle()));
2464         if ( !fi ) {
2465             // then any other CDS
2466             fi = CFeat_CI(bsh,
2467                           SAnnotSelector(CSeqFeatData::e_Cdregion)
2468                           .SetByProduct().ExcludeTSE(bsh.GetTSE_Handle()));
2469         }
2470         if ( fi ) {
2471             // return the first one (should be the one packaged on the
2472             // nuc-prot set).
2473             return *fi;
2474         }
2475     }
2476 
2477     return CMappedFeat();
2478 }
2479 
2480 
2481 // Get the mature peptide feature of a protein
GetPROTForProduct(const CBioseq & product,CScope * scope)2482 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope)
2483 {
2484     if ( scope == 0 ) {
2485         return 0;
2486     }
2487 
2488     return GetPROTForProduct(scope->GetBioseqHandle(product));
2489 }
2490 
GetPROTForProduct(const CBioseq_Handle & bsh)2491 const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& bsh)
2492 {
2493     if ( bsh ) {
2494         CFeat_CI fi(bsh, SAnnotSelector(CSeqFeatData::e_Prot).SetByProduct());
2495         if ( fi ) {
2496             return &(fi->GetOriginalFeature());
2497         }
2498     }
2499 
2500     return 0;
2501 }
2502 
2503 
2504 
2505 // Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
GetmRNAForProduct(const CBioseq & product,CScope * scope)2506 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope)
2507 {
2508     if ( scope == 0 ) {
2509         return 0;
2510     }
2511 
2512     return GetmRNAForProduct(scope->GetBioseqHandle(product));
2513 }
2514 
GetmRNAForProduct(const CBioseq_Handle & bsh)2515 const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& bsh)
2516 {
2517     if ( bsh ) {
2518         SAnnotSelector as(CSeqFeatData::eSubtype_mRNA);
2519         as.SetByProduct();
2520 
2521         CFeat_CI fi(bsh, as);
2522         if ( fi ) {
2523             return &(fi->GetOriginalFeature());
2524         }
2525     }
2526 
2527     return 0;
2528 }
2529 
2530 
GetMappedmRNAForProduct(const CBioseq_Handle & bsh)2531 CMappedFeat GetMappedmRNAForProduct(const CBioseq_Handle& bsh)
2532 {
2533     if ( bsh ) {
2534         CFeat_CI fi(bsh,
2535                     SAnnotSelector(CSeqFeatData::eSubtype_mRNA)
2536                     .SetByProduct());
2537         if ( fi ) {
2538             // return the first one (should be the one packaged on the
2539             // nuc-prot set).
2540             return *fi;
2541         }
2542     }
2543 
2544     return CMappedFeat();
2545 }
2546 
2547 
2548 // Get the encoding sequence of a protein
GetNucleotideParent(const CBioseq & product,CScope * scope)2549 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope)
2550 {
2551     if ( scope == 0 ) {
2552         return 0;
2553     }
2554     CBioseq_Handle bsh = GetNucleotideParent(scope->GetBioseqHandle(product));
2555     return bsh ? bsh.GetCompleteBioseq() : reinterpret_cast<const CBioseq*>(0);
2556 }
2557 
GetNucleotideParent(const CBioseq_Handle & bsh)2558 CBioseq_Handle GetNucleotideParent(const CBioseq_Handle& bsh)
2559 {
2560     // If protein use CDS to get to the encoding Nucleotide.
2561     // if nucleotide (cDNA) use mRNA feature.
2562     const CSeq_feat* sfp = bsh.GetInst().IsAa() ?
2563         GetCDSForProduct(bsh) : GetmRNAForProduct(bsh);
2564 
2565     CBioseq_Handle ret;
2566     if ( sfp ) {
2567         try {
2568             ret = bsh.GetScope().GetBioseqHandle(sfp->GetLocation());
2569         } catch(...) {
2570             // may fail due to trans-splicing, e.g., on small-genome set
2571         }
2572     }
2573     return ret;
2574 }
2575 
2576 
GetParentForPart(const CBioseq_Handle & part)2577 CBioseq_Handle GetParentForPart(const CBioseq_Handle& part)
2578 {
2579     CBioseq_Handle seg;
2580 
2581     if (part) {
2582         CSeq_entry_Handle segset =
2583             part.GetExactComplexityLevel(CBioseq_set::eClass_segset);
2584         if (segset) {
2585             for (CSeq_entry_CI it(segset); it; ++it) {
2586                 if (it->IsSeq()) {
2587                     seg = it->GetSeq();
2588                     break;
2589                 }
2590             }
2591         }
2592     }
2593 
2594     return seg;
2595 }
2596 
2597 
END_SCOPE(sequence)2598 END_SCOPE(sequence)
2599 
2600 
2601 
2602 CFastaOstream::CFastaOstream(CNcbiOstream& out)
2603     : m_Out(out),
2604       m_Flags(fInstantiateGaps | fAssembleParts | fEnableGI),
2605       m_GapMode(eGM_letters)
2606 {
2607     m_Gen.reset(new sequence::CDeflineGenerator);
2608     SetWidth(70);
2609 }
2610 
~CFastaOstream()2611 CFastaOstream::~CFastaOstream()
2612 {
2613     m_Out << flush;
2614 }
2615 
Write(const CSeq_entry_Handle & handle,const CSeq_loc * location)2616 void CFastaOstream::Write(const CSeq_entry_Handle& handle,
2617                           const CSeq_loc* location)
2618 {
2619     for (CBioseq_CI it(handle);  it;  ++it) {
2620         if ( !SkipBioseq(*it) ) {
2621             if (location) {
2622                 CSeq_loc loc2;
2623                 loc2.SetWhole().Assign(*it->GetSeqId());
2624                 int d = sequence::TestForOverlap
2625                     (*location, loc2, sequence::eOverlap_Interval,
2626                      kInvalidSeqPos, &handle.GetScope());
2627                 if (d < 0) {
2628                     continue;
2629                 }
2630             }
2631             Write(*it, location);
2632         }
2633     }
2634 }
2635 
2636 
Write(const CBioseq_Handle & handle,const CSeq_loc * location,const string & custom_title)2637 void CFastaOstream::Write(const CBioseq_Handle& handle,
2638                           const CSeq_loc* location,
2639                           const string& custom_title)
2640 {
2641     WriteTitle(handle, location, custom_title);
2642     WriteSequence(handle, location);
2643 }
2644 
2645 
s_FastaGetOriginalID(const CBioseq & seq)2646 static string s_FastaGetOriginalID (const CBioseq& seq)
2647 
2648 {
2649     FOR_EACH_SEQDESC_ON_BIOSEQ (it, seq) {
2650         const CSeqdesc& desc = **it;
2651         if (! desc.IsUser()) continue;
2652         if (! desc.GetUser().IsSetType()) continue;
2653         const CUser_object& usr = desc.GetUser();
2654         const CObject_id& oi = usr.GetType();
2655         if (! oi.IsStr()) continue;
2656         const string& type = oi.GetStr();
2657         if (! NStr::EqualNocase(type, "OrginalID") && ! NStr::EqualNocase(type, "OriginalID")) continue;
2658         FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
2659             const CUser_field& fld = **uitr;
2660             if (FIELD_IS_SET_AND_IS(fld, Label, Str)) {
2661                 const string &label_str = GET_FIELD(fld.GetLabel(), Str);
2662                 if (! NStr::EqualNocase(label_str, "LocalId")) continue;
2663                 if (fld.IsSetData() && fld.GetData().IsStr()) {
2664                     return fld.GetData().GetStr();
2665                 }
2666             }
2667         }
2668     }
2669 
2670     return "";
2671 }
2672 
s_ShouldUseOriginalID(const CBioseq & seq)2673 static bool s_ShouldUseOriginalID (const CBioseq& seq)
2674 {
2675     FOR_EACH_SEQID_ON_BIOSEQ (id_itr, seq) {
2676         const CSeq_id& sid = **id_itr;
2677         switch (sid.Which()) {
2678             case CSeq_id::e_Local:
2679                 break;
2680             case CSeq_id::e_General:
2681                 {
2682                     const CDbtag& dbtag = sid.GetGeneral();
2683                     if (dbtag.IsSetDb()) {
2684                         const string& db = dbtag.GetDb();
2685                         if (! NStr::EqualNocase(db, "TMSMART") &&
2686                             ! NStr::EqualNocase(db, "BankIt") &&
2687                             ! NStr::EqualNocase(db, "NCBIFILE")) {
2688                             return false;
2689                         }
2690                     }
2691                 }
2692                 break;
2693             default:
2694                 return false;
2695         }
2696     }
2697 
2698     return true;
2699 }
2700 
x_GetBestId(CConstRef<CSeq_id> & gi_id,CConstRef<CSeq_id> & best_id,bool & hide_prefix,const CBioseq & bioseq)2701 void CFastaOstream::x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq)
2702 {
2703     bool is_na = bioseq.GetInst().GetMol() != CSeq_inst::eMol_aa;
2704     best_id = FindBestChoice(bioseq.GetId(), is_na ? CSeq_id::FastaNARank : CSeq_id::FastaAARank);
2705 
2706     ITERATE(CBioseq::TId, id, bioseq.GetId()) {
2707         if ((*id)->IsGi()) {
2708             gi_id = *id;
2709             break;
2710         }
2711     }
2712 
2713     // see SQD-4144, only Accession.Version should be shown, without prefixes and suffixes
2714     if (best_id.NotEmpty() &&
2715         (m_Flags & fEnableGI) == 0 &&
2716         (m_Flags & fHideGenBankPrefix) != 0)
2717     {
2718         switch (best_id->Which())
2719         {
2720         case CSeq_id::e_Genbank:
2721         case CSeq_id::e_Embl:
2722         case CSeq_id::e_Other:
2723         case CSeq_id::e_Ddbj:
2724         case CSeq_id::e_Tpg:
2725         case CSeq_id::e_Tpe:
2726         case CSeq_id::e_Tpd:
2727             hide_prefix = true;
2728             break;
2729         default:
2730             break;
2731         }
2732     }
2733 }
2734 
x_WriteAsFasta(const CBioseq & bioseq)2735 void CFastaOstream::x_WriteAsFasta(const CBioseq& bioseq)
2736 {
2737     CConstRef<CSeq_id> best_id;
2738     CConstRef<CSeq_id> gi_id;
2739     bool hide_prefix = false;
2740 
2741     // override this method and provide application specific 'best id' policy
2742     x_GetBestId(gi_id, best_id, hide_prefix, bioseq);
2743 
2744     if (best_id.NotEmpty())
2745     {
2746         // RW-139, no GI in FASTA output
2747         if (gi_id.NotEmpty() && (m_Flags & fEnableGI) && !best_id->IsGi())
2748         {
2749             // FastA format
2750             // Here we have something like:
2751             //      gi|###|SOME_ACCESSION|title
2752 
2753             gi_id->WriteAsFasta(m_Out);
2754             m_Out << '|';
2755         }
2756 
2757         const CTextseq_id* text_id = 0;
2758         if (hide_prefix)
2759         {
2760             text_id = best_id->GetTextseq_Id();
2761         }
2762 
2763         if (text_id != 0)
2764         {
2765             if (text_id->IsSetAccession())
2766             {
2767                 m_Out << text_id->GetAccession();
2768                 if (text_id->IsSetVersion())
2769                 {
2770                     m_Out << "." << text_id->GetVersion();
2771                 }
2772             }
2773         }
2774         else
2775         {
2776             best_id->WriteAsFasta(m_Out);
2777         }
2778     }
2779 }
2780 
x_WriteSeqIds(const CBioseq & bioseq,const CSeq_loc * location)2781 void CFastaOstream::x_WriteSeqIds(const CBioseq& bioseq,
2782                                   const CSeq_loc* location)
2783 {
2784     bool have_range = (location != NULL  &&  !location->IsWhole()
2785                        &&  !(m_Flags & fSuppressRange) );
2786 
2787     if ( !have_range  &&  (m_Flags & fNoDupCheck) == 0) {
2788         ITERATE (CBioseq::TId, id, bioseq.GetId()) {
2789             CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(**id);
2790             pair<TSeq_id_HandleSet::iterator, bool> p
2791                 = m_PreviousWholeIds.insert(idh);
2792             if ( !p.second ) {
2793                 NCBI_THROW(CObjmgrUtilException, eBadLocation,
2794                            "Duplicate Seq-id " + (*id)->AsFastaString()
2795                            + " in FASTA output");
2796             }
2797         }
2798     }
2799 
2800     m_Out << '>';
2801     if (!(m_Flags & fIgnoreOriginalID) &&
2802             s_ShouldUseOriginalID(bioseq)) {
2803         string origID = s_FastaGetOriginalID(bioseq);
2804         if (! NStr::IsBlank(origID)) {
2805             m_Out << "lcl|" << origID;
2806         } else {
2807             x_WriteAsFasta(bioseq);
2808         }
2809     } else {
2810         x_WriteAsFasta(bioseq);
2811     }
2812 
2813     if (have_range) {
2814         char delim = ':';
2815         for (CSeq_loc_CI it(*location);  it;  ++it) {
2816             CSeq_loc::TRange range = it.GetRange();
2817             TSeqPos from = range.GetFrom() + 1, to = range.GetTo() + 1;
2818             _ASSERT(from <= to);
2819             m_Out << delim;
2820             if (it.IsSetStrand()  &&  IsReverse(it.GetStrand())) {
2821                 m_Out << 'c' << to << '-' << from;
2822             } else {
2823                 m_Out << from << '-' << to;
2824             }
2825             delim = ',';
2826         }
2827     }
2828 }
2829 
2830 inline
2831 sequence::CDeflineGenerator::TUserFlags
x_GetTitleFlags(void) const2832 CFastaOstream::x_GetTitleFlags(void) const
2833 {
2834     sequence::TGetTitleFlags title_flags = 0;
2835     title_flags |= sequence::CDeflineGenerator::fFastaFormat;
2836 
2837     if ((m_Flags & fNoExpensiveOps) != 0) {
2838         title_flags |= sequence::CDeflineGenerator::fNoExpensiveOps;
2839     }
2840     if ((m_Flags & fShowModifiers) != 0) {
2841         title_flags |= sequence::CDeflineGenerator::fShowModifiers;
2842     }
2843     /*
2844     if ((m_Flags & fUseAutoDef) != 0) {
2845         title_flags |= sequence::CDeflineGenerator::fUseAutoDef;
2846     }
2847     */
2848     if ((m_Flags & fDoNotUseAutoDef) == 0) {
2849         title_flags |= sequence::CDeflineGenerator::fUseAutoDef;
2850     }
2851     return title_flags;
2852 }
2853 
x_WriteSeqTitle(const CBioseq_Handle & bioseq_handle,const string & custom_title)2854 void CFastaOstream::x_WriteSeqTitle(const CBioseq_Handle & bioseq_handle,
2855                                     const string& custom_title)
2856 {
2857     string safe_title = (!custom_title.empty()) ? custom_title
2858         : m_Gen->GenerateDefline(bioseq_handle, x_GetTitleFlags());
2859 
2860     if ( !safe_title.empty() ) {
2861         if ( !(m_Flags & fKeepGTSigns) ) {
2862             NStr::ReplaceInPlace(safe_title, ">", "_");
2863         }
2864         if (safe_title[0] != ' ') {
2865             m_Out << ' ';
2866         }
2867 
2868         if ((m_Flags & fHTMLEncode) != 0) {
2869             safe_title = NStr::HtmlEncode(safe_title);
2870         }
2871         m_Out << safe_title;
2872     }
2873     m_Out << '\n';
2874 }
2875 
WriteTitle(const CBioseq & bioseq,const CSeq_loc * location,bool no_scope,const string & custom_title)2876 void CFastaOstream::WriteTitle(const CBioseq& bioseq,
2877                                const CSeq_loc* location,
2878                                bool no_scope, // not used
2879                                const string& custom_title)
2880 {
2881     x_WriteSeqIds(bioseq, location);
2882     CScope scope(*CObjectManager::GetInstance());
2883     CBioseq_Handle bioseq_handle = scope.AddBioseq(bioseq);
2884     x_WriteSeqTitle(bioseq_handle, custom_title);
2885 }
2886 
WriteTitle(const CBioseq_Handle & bioseq_handle,const CSeq_loc * location,const string & custom_title)2887 void CFastaOstream::WriteTitle(const CBioseq_Handle& bioseq_handle,
2888                                const CSeq_loc* location,
2889                                const string& custom_title)
2890 {
2891     const CBioseq& bioseq = *bioseq_handle.GetBioseqCore();
2892     x_WriteSeqIds(bioseq, location);
2893     x_WriteSeqTitle(bioseq_handle, custom_title);
2894 }
2895 
2896 
x_MapMask(CSeq_loc_Mapper & mapper,const CSeq_loc & mask,const CSeq_id * base_seq_id,CScope * scope)2897 CConstRef<CSeq_loc> CFastaOstream::x_MapMask(CSeq_loc_Mapper& mapper,
2898                                              const CSeq_loc& mask,
2899                                              const CSeq_id* base_seq_id,
2900                                              CScope* scope)
2901 {
2902     CConstRef<CSeq_loc> mapped_mask(&mask);
2903 
2904     // Mapping down requires the higher-level ID as a reference, even
2905     // when given a scope, and as such should precede mapping up to
2906     // keep sequence::GetId from bombing out.
2907     if ((m_Flags & fMapMasksDown) != 0  &&  scope) {
2908         try {
2909             CSeq_loc_Mapper mapper_down
2910                 (scope->GetBioseqHandle(sequence::GetId(*mapped_mask, scope)),
2911                  CSeq_loc_Mapper::eSeqMap_Down);
2912             mapped_mask = mapped_mask->Add(*mapper_down.Map(*mapped_mask),
2913                                            CSeq_loc::fSortAndMerge_All, 0);
2914         } catch (CObjmgrUtilException&) {
2915         }
2916     }
2917     if ((m_Flags & fMapMasksUp) != 0  &&  scope  &&  base_seq_id) {
2918         CSeq_loc_Mapper mapper_up(scope->GetBioseqHandle(*base_seq_id),
2919                                   CSeq_loc_Mapper::eSeqMap_Up);
2920         mapped_mask = mapped_mask->Add(*mapper_up.Map(*mapped_mask),
2921                                        CSeq_loc::fSortAndMerge_All, 0);
2922     }
2923     mapped_mask = mapper.Map(*mapped_mask);
2924     return mapped_mask;
2925 }
2926 
2927 
x_GetMaskingStates(TMSMap & masking_state,const CSeq_id * base_seq_id,const CSeq_loc * location,CScope * scope)2928 void CFastaOstream::x_GetMaskingStates(TMSMap& masking_state,
2929                                        const CSeq_id* base_seq_id,
2930                                        const CSeq_loc* location,
2931                                        CScope* scope)
2932 {
2933     CRef<CSeq_loc_Mapper> mapper;
2934     CBioseq_Handle        bsh;
2935 
2936     if (m_SoftMask.NotEmpty()  ||  m_HardMask.NotEmpty()) {
2937         _ASSERT(base_seq_id);
2938         if (location) {
2939             CSeq_loc loc2;
2940             try {
2941                 TSeqPos length = sequence::GetLength(*location, scope);
2942                 loc2.SetInt().SetId().Assign(*base_seq_id);
2943                 loc2.SetInt().SetFrom(0);
2944                 loc2.SetInt().SetTo(length - 1);
2945             } catch (exception&) {
2946                 loc2.SetWhole().Assign(*base_seq_id);
2947             }
2948             mapper.Reset(new CSeq_loc_Mapper(*location, loc2, scope));
2949         } else {
2950             // still useful for filtering out locations on other sequences
2951             CSeq_loc whole;
2952             whole.SetWhole().Assign(*base_seq_id);
2953             mapper.Reset(new CSeq_loc_Mapper(whole, whole, scope));
2954         }
2955         mapper->SetMergeAll();
2956         mapper->TruncateNonmappingRanges();
2957 
2958         if (scope  &&  (m_Flags & (fMapMasksUp | fMapMasksDown))) {
2959             bsh = scope->GetBioseqHandle(*base_seq_id);
2960         }
2961 
2962         const CSeq_loc&     mask      = m_SoftMask ? *m_SoftMask : *m_HardMask;
2963         int                 type      = m_SoftMask ? eSoftMask : eHardMask;
2964         CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, mask, base_seq_id,
2965                                                     scope);
2966 
2967         masking_state[0] = 0;
2968         for (CSeq_loc_CI it(*mapped_mask);  it;  ++it) {
2969             CSeq_loc_CI::TRange loc_range = it.GetRange();
2970             masking_state[loc_range.GetFrom()]   = type;
2971             masking_state[loc_range.GetToOpen()] = 0;
2972         }
2973     }
2974 
2975     if (m_SoftMask.NotEmpty()  &&  m_HardMask.NotEmpty()) {
2976         CConstRef<CSeq_loc> mapped_mask = x_MapMask(*mapper, *m_HardMask,
2977                                                     base_seq_id, scope);
2978         for (CSeq_loc_CI it(*mapped_mask);  it;  ++it) {
2979             CSeq_loc_CI::TRange loc_range = it.GetRange();
2980             TSeqPos             from      = loc_range.GetFrom();
2981             TSeqPos             to        = loc_range.GetToOpen();
2982             TMSMap::iterator    ms_it     = masking_state.lower_bound(from);
2983             int                 prev_state;
2984 
2985             if (ms_it == masking_state.end()) {
2986                 masking_state[loc_range.GetFrom()]   = eHardMask;
2987                 masking_state[loc_range.GetToOpen()] = 0;
2988                 continue;
2989             } else if (ms_it->first == from) {
2990                 prev_state = ms_it->second;
2991                 ms_it->second |= eHardMask;
2992             } else {
2993                 // NB: lower_bound's name is misleading, as it actually
2994                 // returns the least element whose key >= from.
2995                 _ASSERT(ms_it != masking_state.begin());
2996                 TMSMap::iterator prev_it = ms_it;
2997                 --prev_it;
2998                 prev_state = prev_it->second;
2999                 TMSMap::value_type value(from, prev_state | eHardMask);
3000 
3001                 // Add the new element (using ms_it as a position hint),
3002                 // and repoint ms_it at it so that the below loop will
3003                 // start at the correct position.
3004                 ms_it = masking_state.insert(ms_it, value);
3005             }
3006             while (++ms_it != masking_state.end()  &&  ms_it->first < to) {
3007                 prev_state = ms_it->second;
3008                 ms_it->second |= eHardMask;
3009             }
3010             if (ms_it == masking_state.end()  ||  ms_it->first != to) {
3011                 masking_state.insert(ms_it, TMSMap::value_type(to, prev_state));
3012             }
3013         }
3014     }
3015 }
3016 
3017 
x_WriteSequence(const CSeqVector & vec,const TMSMap & masking_state)3018 void CFastaOstream::x_WriteSequence(const CSeqVector& vec,
3019                                     const TMSMap& masking_state)
3020 {
3021     TSeqPos                 rem_line      = m_Width;
3022     CSeqVector_CI           it(vec);
3023     TMSMap::const_iterator  ms_it         = masking_state.begin();
3024     TSeqPos                 rem_state
3025         = (ms_it == masking_state.end() ? numeric_limits<TSeqPos>::max()
3026            : ms_it->first);
3027     int                     current_state = 0;
3028     CTempString             uc_hard_mask_str
3029         (vec.IsProtein() ? m_UC_Xs.get() : m_UC_Ns.get(), m_Width);
3030     CTempString             lc_hard_mask_str
3031         (vec.IsProtein() ? m_LC_Xs.get() : m_LC_Ns.get(), m_Width);
3032     EGapMode                native_gap_mode
3033         = ((vec.GetGapChar() == '-') ? eGM_dashes : eGM_letters);
3034     CTempString             alt_gap_str;
3035 
3036     if (native_gap_mode == eGM_dashes) {
3037         alt_gap_str = uc_hard_mask_str;
3038     } else {
3039         alt_gap_str.assign(m_Dashes.get(), m_Width);
3040     }
3041 
3042     if ((m_Flags & fReverseStrand) != 0) {
3043         it.SetStrand(Reverse(it.GetStrand()));
3044     }
3045 
3046     while ( it ) {
3047         if (rem_state == 0) {
3048             _ASSERT(ms_it->first == it.GetPos());
3049             current_state = ms_it->second;
3050             if (++ms_it == masking_state.end()) {
3051                 rem_state = numeric_limits<TSeqPos>::max();
3052             } else {
3053                 rem_state = ms_it->first - it.GetPos();
3054             }
3055         }
3056         if( (m_Flags & fShowGapsOfSizeZero) != 0 &&
3057             it.HasZeroGapBefore() )
3058         {
3059             m_Out << "-\n";
3060             rem_line = m_Width;
3061         }
3062         if ((m_GapMode != native_gap_mode || (m_Flags & fInstantiateGaps) == 0)
3063             &&  it.GetGapSizeForward())
3064         {
3065             TSeqPos gap_size = it.GetGapSizeForward();
3066             if (m_GapMode == eGM_one_dash
3067                 ||  (m_Flags & fInstantiateGaps) == 0) {
3068                 m_Out << "-\n";
3069                 rem_line = m_Width;
3070             } else if (m_GapMode == eGM_count) {
3071                 if (rem_line < m_Width) {
3072                     m_Out << '\n';
3073                 }
3074                 _ASSERT(it.GetCurrentSeqMap_CI().GetType() == CSeqMap::eSeqGap);
3075                 if (it.GetCurrentSeqMap_CI().IsUnknownLength()) {
3076                     // conventional designation, regardless of nominal length
3077                     if( gap_size > 0 && (m_Flags & fKeepUnknGapNomLen) != 0 )
3078                     {
3079                         m_Out << ">?unk" << gap_size;
3080                     } else {
3081                         m_Out << ">?unk100";
3082                     }
3083                 } else {
3084                     m_Out << ">?" << gap_size;
3085                 }
3086                 // print gap mods, if requested
3087                 if( (m_Flags & fShowGapModifiers) != 0 )
3088                 {
3089                     CConstRef<CSeq_literal> pGapLiteral =
3090                         it.GetCurrentSeqMap_CI().GetRefGapLiteral();
3091                     if( pGapLiteral &&
3092                         FIELD_IS_SET_AND_IS(*pGapLiteral, Seq_data, Gap) )
3093                     {
3094                         const CSeq_gap & seq_gap =
3095                             pGapLiteral->GetSeq_data().GetGap();
3096                         SGapModText gap_mod_text;
3097                         GetGapModText(seq_gap, gap_mod_text);
3098 
3099                         CNcbiOstrstream gap_mod_strm;
3100                         gap_mod_text.WriteAllModsAsFasta(gap_mod_strm);
3101                         const string sGapModText =
3102                             CNcbiOstrstreamToString(gap_mod_strm);
3103                         if( ! sGapModText.empty() ) {
3104                             m_Out << ' ' << sGapModText;
3105                         }
3106                     }
3107                 }
3108                 m_Out << '\n';
3109                 rem_line = m_Width;
3110             } else {
3111                 TSeqPos rem_gap = gap_size;
3112                 while (rem_gap >= rem_line) {
3113                     x_WriteBuffer(alt_gap_str.data(), rem_line);
3114                     m_Out << '\n';
3115                     rem_gap -= rem_line;
3116                     rem_line = m_Width;
3117                 }
3118                 if (rem_gap > 0) {
3119                     x_WriteBuffer(alt_gap_str.data(), rem_gap);
3120                     rem_line -= rem_gap;
3121                 }
3122             }
3123             it.SkipGap();
3124             if (rem_state >= gap_size) {
3125                 rem_state -= gap_size;
3126             } else {
3127                 while (++ms_it != masking_state.end()
3128                        &&  ms_it->first < it.GetPos()) {
3129                     current_state = ms_it->second;
3130                 }
3131                 if (ms_it == masking_state.end()) {
3132                     rem_state = numeric_limits<TSeqPos>::max();
3133                 } else {
3134                     rem_state = ms_it->first - it.GetPos();
3135                 }
3136             }
3137         } else {
3138             TSeqPos     count   = min(TSeqPos(it.GetBufferSize()), rem_state);
3139             TSeqPos     new_pos = it.GetPos() + count;
3140             const char* ptr     = it.GetBufferPtr();
3141             string      lc_buffer;
3142 
3143             rem_state -= count;
3144             if (current_state & eHardMask) {
3145                 ptr = (current_state & eSoftMask) ? lc_hard_mask_str.data()
3146                     : uc_hard_mask_str.data();
3147             } else if (current_state & eSoftMask) {
3148                 // ToLower() always operates in place. :-/
3149                 lc_buffer.assign(ptr, count);
3150                 NStr::ToLower(lc_buffer);
3151                 ptr = lc_buffer.data();
3152             }
3153             while ( count >= rem_line ) {
3154                 x_WriteBuffer(ptr, rem_line);
3155                 if ( !(current_state & eHardMask) ) {
3156                     ptr += rem_line;
3157                 }
3158                 count -= rem_line;
3159                 m_Out << '\n';
3160                 rem_line = m_Width;
3161             }
3162             if ( count > 0 ) {
3163                 x_WriteBuffer(ptr, count);
3164                 rem_line -= count;
3165             }
3166             it.SetPos(new_pos);
3167         }
3168     }
3169     if ( rem_line < m_Width ) {
3170         m_Out << '\n';
3171     }
3172     // m_Out << NcbiFlush;
3173 }
3174 
3175 
WriteSequence(const CBioseq_Handle & handle,const CSeq_loc * location,const CSeq_loc::EOpFlags merge_flags)3176 void CFastaOstream::WriteSequence(const CBioseq_Handle& handle,
3177                                   const CSeq_loc* location,
3178                                   const CSeq_loc::EOpFlags merge_flags)
3179 
3180 {
3181     vector<CTSE_Handle> used_tses;
3182     if ( !(m_Flags & fAssembleParts)  &&  !handle.IsSetInst_Seq_data() ) {
3183         SSeqMapSelector sel(CSeqMap::fFindInnerRef, (size_t)-1);
3184         sel.SetLinkUsedTSE(handle.GetTSE_Handle());
3185         sel.SetLinkUsedTSE(used_tses);
3186         if ( !handle.GetSeqMap().CanResolveRange(&handle.GetScope(), sel) ) {
3187             return;
3188         }
3189     }
3190 
3191     CScope&    scope = handle.GetScope();
3192     CSeqVector v;
3193     if (location) {
3194         if (sequence::SeqLocCheck(*location, &scope)
3195             == sequence::eSeqLocCheck_error) {
3196             string label;
3197             location->GetLabel(&label);
3198             NCBI_THROW(CObjmgrUtilException, eBadLocation,
3199                        "CFastaOstream: location out of range: " + label);
3200         }
3201         CRef<CSeq_loc> merged
3202             = sequence::Seq_loc_Merge(*location, merge_flags, &scope);
3203         v = CSeqVector(*merged, scope, CBioseq_Handle::eCoding_Iupac);
3204     } else {
3205         v = handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
3206     }
3207     if (v.IsProtein()) { // allow extensions
3208         v.SetCoding(CSeq_data::e_Ncbieaa);
3209     }
3210 
3211     TMSMap masking_state;
3212     if (m_SoftMask.NotEmpty()  ||  m_HardMask.NotEmpty()) {
3213         x_GetMaskingStates(masking_state, handle.GetSeqId(), location, &scope);
3214     }
3215     x_WriteSequence(v, masking_state);
3216 }
3217 
3218 
Write(const CSeq_entry & entry,const CSeq_loc * location,bool no_scope)3219 void CFastaOstream::Write(const CSeq_entry& entry, const CSeq_loc* location,
3220                           bool no_scope)
3221 {
3222     if (location || !no_scope) {
3223         CScope scope(*CObjectManager::GetInstance());
3224         Write(scope.AddTopLevelSeqEntry(entry), location);
3225     } else {
3226         switch (entry.Which()) {
3227         case CSeq_entry::e_Seq:
3228             Write(entry.GetSeq(), location, no_scope);
3229             break;
3230         case CSeq_entry::e_Set:
3231             ITERATE (CBioseq_set::TSeq_set, it, entry.GetSet().GetSeq_set()) {
3232                 Write(**it, location, no_scope);
3233             }
3234             break;
3235         default:
3236             // throw
3237             break;
3238         }
3239     }
3240 }
3241 
3242 
Write(const CBioseq & seq,const CSeq_loc * location,bool no_scope,const string & custom_title)3243 void CFastaOstream::Write(const CBioseq& seq, const CSeq_loc* location,
3244                           bool no_scope, const string& custom_title )
3245 {
3246     CScope scope(*CObjectManager::GetInstance());
3247     CBioseq_Handle bioseq_handle = scope.AddBioseq(seq);
3248     if (location || !no_scope) {
3249         Write(bioseq_handle, location, custom_title);
3250     } else {
3251         /// write our title
3252         x_WriteSeqIds(seq, NULL);
3253         x_WriteSeqTitle(bioseq_handle, custom_title);
3254 
3255         /// write the sequence
3256         TMSMap masking_state;
3257         x_GetMaskingStates(masking_state, NULL, NULL, NULL);
3258 
3259         /// check to see if all of our segments are resolvable
3260         bool is_raw = true;
3261         switch (seq.GetInst().GetRepr()) {
3262         case CSeq_inst::eRepr_raw:
3263             break;
3264         case CSeq_inst::eRepr_delta:
3265             ITERATE (CSeq_inst::TExt::TDelta::Tdata, iter,
3266                      seq.GetInst().GetExt().GetDelta().Get()) {
3267                 if ((*iter)->Which() == CDelta_seq::e_Loc) {
3268                     is_raw = false;
3269                     break;
3270                 }
3271             }
3272             break;
3273         default:
3274             is_raw = false;
3275             break;
3276         }
3277 
3278         if (is_raw) {
3279             CSeqVector vec(seq, NULL, CBioseq_Handle::eCoding_Iupac);
3280             if (vec.IsProtein()) { // allow extensions
3281                 vec.SetCoding(CSeq_data::e_Ncbieaa);
3282             }
3283             x_WriteSequence(vec, masking_state);
3284         } else {
3285             /// we require far-pointer resolution
3286             CScope scope(*CObjectManager::GetInstance());
3287             CBioseq_Handle bsh = scope.AddBioseq(seq);
3288             CSeqVector vec(bsh, CBioseq_Handle::eCoding_Iupac);
3289             if (vec.IsProtein()) {
3290                 vec.SetCoding(CSeq_data::e_Ncbieaa);
3291             }
3292             x_WriteSequence(vec, masking_state);
3293         }
3294     }
3295 }
3296 
3297 
GetMask(EMaskType type) const3298 CConstRef<CSeq_loc> CFastaOstream::GetMask(EMaskType type) const
3299 {
3300     return (type == eSoftMask) ? m_SoftMask : m_HardMask;
3301 }
3302 
3303 
SetMask(EMaskType type,CConstRef<CSeq_loc> location)3304 void CFastaOstream::SetMask(EMaskType type, CConstRef<CSeq_loc> location)
3305 {
3306     ((type == eSoftMask) ? m_SoftMask : m_HardMask) = location;
3307 }
3308 
3309 
SetWidth(TSeqPos width)3310 void CFastaOstream::SetWidth(TSeqPos width)
3311 {
3312     m_Width = width;
3313     m_Dashes.reset(new char[width]);  memset(m_Dashes.get(), '-', width);
3314     m_LC_Ns .reset(new char[width]);  memset(m_LC_Ns .get(), 'n', width);
3315     m_LC_Xs .reset(new char[width]);  memset(m_LC_Xs .get(), 'x', width);
3316     m_UC_Ns .reset(new char[width]);  memset(m_UC_Ns .get(), 'N', width);
3317     m_UC_Xs .reset(new char[width]);  memset(m_UC_Xs .get(), 'X', width);
3318 }
3319 
3320 void
WriteAllModsAsFasta(CNcbiOstream & out) const3321 CFastaOstream::SGapModText::WriteAllModsAsFasta(
3322     CNcbiOstream & out ) const
3323 {
3324     string sPrefix;
3325     if( ! gap_type.empty() ) {
3326         out << sPrefix << "[gap-type=" << gap_type << ']';
3327         sPrefix = " ";
3328     }
3329     if( ! gap_linkage_evidences.empty() ) {
3330         out << sPrefix << "[linkage-evidence=" << NStr::Join(gap_linkage_evidences, ";") << ']';
3331         sPrefix = " ";
3332     }
3333 }
3334 
3335 // static
3336 void
GetGapModText(const CSeq_gap & seq_gap,SGapModText & out_gap_info)3337 CFastaOstream::GetGapModText(
3338     const CSeq_gap & seq_gap,
3339     SGapModText & out_gap_info )
3340 {
3341     // convenience references
3342     string & gap_type = out_gap_info.gap_type;
3343     vector<string> & gap_linkage_evidences =
3344         out_gap_info.gap_linkage_evidences;
3345 
3346     // make sure initialized
3347     gap_type.clear();
3348     gap_linkage_evidences.clear();
3349 
3350     // true if we need to have a /linkage-evidence tag.
3351     // Also, if this is false, we should *not* have any
3352     // linkage-evidence tag
3353     bool need_evidence = false;
3354 
3355     // determine if we're linked, and also determine if
3356     // we need linkage-evidence
3357     bool is_linkage =
3358         seq_gap.CanGetLinkage() &&
3359         seq_gap.GetLinkage() == CSeq_gap::eLinkage_linked;
3360 
3361     if ( seq_gap.IsSetLinkage_evidence() ) {
3362         is_linkage = true; /* do not rely solely on Seq-gap.linkage, which is not always set correctly */
3363     }
3364 
3365     // For /gap_type qual
3366     if( seq_gap.CanGetType() ) {
3367         switch( seq_gap.GetType() ) {
3368         case CSeq_gap::eType_unknown:
3369             // don't show /gap_type - policy changed at SQD-1801
3370             gap_type = "unknown";
3371             need_evidence = is_linkage;
3372             break;
3373         case CSeq_gap::eType_fragment:
3374             gap_type = "within scaffold";
3375             need_evidence = true;
3376             break;
3377         case CSeq_gap::eType_clone:
3378             gap_type = ( is_linkage ? "within scaffold" : "between scaffolds" );
3379             need_evidence = is_linkage;
3380             break;
3381         case CSeq_gap::eType_short_arm:
3382             gap_type = "short arm";
3383             break;
3384         case CSeq_gap::eType_heterochromatin:
3385             gap_type = "heterochromatin";
3386             break;
3387         case CSeq_gap::eType_centromere:
3388             gap_type = "centromere";
3389             break;
3390         case CSeq_gap::eType_telomere:
3391             gap_type = "telomere";
3392             break;
3393         case CSeq_gap::eType_repeat:
3394             gap_type = ( is_linkage ?
3395                 "repeat within scaffold" :
3396             "repeat between scaffolds" );
3397             need_evidence = is_linkage;
3398             break;
3399         case CSeq_gap::eType_contig:
3400             gap_type = "between scaffolds";
3401             break;
3402         case CSeq_gap::eType_scaffold:
3403             gap_type = "within scaffold";
3404             need_evidence = is_linkage;
3405             break;
3406         case CSeq_gap::eType_contamination:
3407             gap_type = "contamination";
3408             need_evidence = is_linkage;
3409             break;
3410         case CSeq_gap::eType_other:
3411             gap_type = "other";
3412             break;
3413         default:
3414             gap_type = "(ERROR: UNRECOGNIZED_GAP_TYPE:" +
3415                 NStr::IntToString(seq_gap.GetType()) + ")";
3416             break;
3417         }
3418     }
3419 
3420     // For linkage evidence
3421     if( seq_gap.CanGetLinkage_evidence() ) {
3422         ITERATE( CSeq_gap::TLinkage_evidence,
3423             evidence_iter,
3424             seq_gap.GetLinkage_evidence() )
3425         {
3426             const CLinkage_evidence & evidence = **evidence_iter;
3427             if( evidence.CanGetType() ) {
3428                 switch( evidence.GetType() ) {
3429                 case CLinkage_evidence::eType_paired_ends:
3430                     gap_linkage_evidences.push_back("paired-ends");
3431                     break;
3432                 case CLinkage_evidence::eType_align_genus:
3433                     gap_linkage_evidences.push_back("align genus");
3434                     break;
3435                 case CLinkage_evidence::eType_align_xgenus:
3436                     gap_linkage_evidences.push_back("align xgenus");
3437                     break;
3438                 case CLinkage_evidence::eType_align_trnscpt:
3439                     gap_linkage_evidences.push_back("align trnscpt");
3440                     break;
3441                 case CLinkage_evidence::eType_within_clone:
3442                     gap_linkage_evidences.push_back("within clone");
3443                     break;
3444                 case CLinkage_evidence::eType_clone_contig:
3445                     gap_linkage_evidences.push_back("clone contig");
3446                     break;
3447                 case CLinkage_evidence::eType_map:
3448                     gap_linkage_evidences.push_back("map");
3449                     break;
3450                 case CLinkage_evidence::eType_strobe:
3451                     gap_linkage_evidences.push_back("strobe");
3452                     break;
3453                 case CLinkage_evidence::eType_unspecified:
3454                     gap_linkage_evidences.push_back("unspecified");
3455                     break;
3456                 case CLinkage_evidence::eType_pcr:
3457                     gap_linkage_evidences.push_back("pcr");
3458                     break;
3459                 case CLinkage_evidence::eType_proximity_ligation:
3460                     gap_linkage_evidences.push_back("proximity ligation");
3461                     break;
3462                 case CLinkage_evidence::eType_other:
3463                     gap_linkage_evidences.push_back("other");
3464                     break;
3465                 default:
3466                     gap_linkage_evidences.push_back("(UNRECOGNIZED LINKAGE EVIDENCE:" +
3467                         NStr::IntToString( evidence.GetType() ) + ")");
3468                     break;
3469                 }
3470             }
3471         }
3472     }
3473 
3474     if( need_evidence && gap_linkage_evidences.empty() ) {
3475         gap_linkage_evidences.push_back("unspecified");
3476     } else if( ! need_evidence && ! gap_linkage_evidences.empty() ) {
3477         // This case shouldn't happen if the validator is checking
3478         // records first.
3479         gap_linkage_evidences.clear();
3480     }
3481 }
3482 
3483 /////////////////////////////////////////////////////////////////////////////
3484 //
3485 // sequence translation
3486 //
3487 
3488 
3489 template <class Container>
x_Translate(const Container & seq,string & prot,int frame,const CGenetic_code * code,bool is_5prime_complete,bool is_3prime_complete,bool include_stop,bool remove_trailing_X,bool * alt_start)3490 void x_Translate(const Container& seq,
3491                  string& prot,
3492                  int frame,
3493                  const CGenetic_code* code,
3494                  bool is_5prime_complete,
3495                  bool is_3prime_complete,
3496                  bool include_stop,
3497                  bool remove_trailing_X,
3498                  bool* alt_start)
3499 {
3500     // reserve our space
3501     const size_t usable_size = seq.size() > frame ? seq.size() - frame : 0;
3502     const size_t mod = usable_size % 3;
3503     prot.erase();
3504     prot.reserve((usable_size + 2) / 3);
3505 
3506     // get appropriate translation table
3507     const CTrans_table & tbl =
3508         (code ? CGen_code_table::GetTransTable(*code) :
3509                 CGen_code_table::GetTransTable(1));
3510 
3511     char aa = '\0';
3512     int state = 0;
3513     int start_state = 0;
3514     try {
3515         // main loop through bases
3516         typename Container::const_iterator start = seq.begin();
3517         {{
3518                 for (int i = 0; i < frame; ++i) {
3519                     ++start;
3520                 }
3521         }}
3522 
3523         size_t i;
3524         size_t k;
3525         size_t length = usable_size / 3;
3526         bool check_start = (is_5prime_complete && frame == 0);
3527         bool first_time = true;
3528 
3529         for (i = 0; i < length; ++i) {
3530 
3531             // loop through one codon at a time
3532             for (k = 0; k < 3; ++k, ++start) {
3533                 state = tbl.NextCodonState(state, *start);
3534             }
3535 
3536             if (first_time) {
3537                 start_state = state;
3538             }
3539 
3540             // save translated amino acid
3541             if (first_time  &&  check_start) {
3542                 aa = tbl.GetStartResidue(state);
3543                 prot.append(1, aa);
3544             } else {
3545                 aa = tbl.GetCodonResidue(state);
3546                 prot.append(1, aa);
3547             }
3548 
3549             first_time = false;
3550         }
3551 
3552         if (mod) {
3553             for (k = 0; k < mod; ++k, ++start) {
3554                 state = tbl.NextCodonState(state, *start);
3555             }
3556 
3557             for (; k < 3; ++k) {
3558                 state = tbl.NextCodonState(state, 'N');
3559             }
3560 
3561             if (first_time) {
3562                 start_state = state;
3563             }
3564 
3565             // save translated amino acid
3566             char c = tbl.GetCodonResidue(state);
3567             if (first_time  &&  check_start) {
3568                 aa = tbl.GetStartResidue(state);
3569                 prot.append(1, aa);
3570             } else if (c != 'X') {
3571                 // if padding was needed, trim ambiguous last residue
3572                 aa = tbl.GetCodonResidue(state);
3573                 prot.append(1, aa);
3574             }
3575         }
3576     } catch (CSeqVectorException& /*ex*/) {
3577         // ran out of sequence
3578     }
3579 
3580     if ( aa != '*' && include_stop && (! mod) && prot.size() > 0 && is_3prime_complete ) {
3581         // check for stop codon that normally encodes an amino acid
3582         aa = tbl.GetStopResidue(state);
3583         if (aa == '*') {
3584             prot[prot.size()-1] = aa;
3585         }
3586     }
3587 
3588     // check for alternative start codon
3589     if (alt_start && is_5prime_complete) {
3590         if ( tbl.IsAltStart(start_state) ) {
3591             *alt_start = true;
3592         } else {
3593             *alt_start = false;
3594         }
3595     }
3596 
3597     if ( !include_stop ) {
3598         SIZE_TYPE sz = prot.find_first_of("*");
3599         if (sz != string::npos) {
3600             prot.resize(sz);
3601         }
3602     }
3603 
3604     if (remove_trailing_X) {
3605         SIZE_TYPE sz;
3606         for (sz = prot.size();  sz > 0  &&  prot[sz - 1] == 'X';  --sz) {
3607         }
3608         prot.resize(sz);
3609     }
3610 
3611     /**
3612     cerr << "source: ";
3613     ITERATE (typename Container, it, seq) {
3614         cerr << *it;
3615     }
3616     cerr << endl;
3617     cerr << "xlate: ";
3618     ITERATE (string, it, prot) {
3619         cerr << *it;
3620     }
3621     cerr << endl;
3622     **/
3623 }
3624 
3625 
AddAAToDeltaSeq(CRef<CBioseq> prot,char residue)3626 static void AddAAToDeltaSeq (CRef<CBioseq> prot, char residue)
3627 {
3628     if (prot->SetInst().SetExt().SetDelta().Set().empty()
3629         || prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetSeq_data().IsGap()) {
3630         // either first seg or transitioning from gap, need new seg
3631         CRef<CDelta_seq> seg(new CDelta_seq());
3632         seg->SetLiteral().SetLength(0);
3633         prot->SetInst().SetExt().SetDelta().Set().push_back(seg);
3634     }
3635 
3636     CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3637 
3638     if (residue == '*' || residue == '-') {
3639         // found a residue that is not part of the IUPACAA alphabet, must convert to NCBIEAA
3640         if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsIupacaa()) {
3641             // convert to ncbieaa
3642             string current = last->GetLiteral().GetSeq_data().GetIupacaa().Get();
3643             last->SetLiteral().SetSeq_data().SetNcbieaa().Set(current);
3644         }
3645         // add *
3646         last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3647     } else if (last->IsLiteral() && last->GetLiteral().IsSetSeq_data() && last->GetLiteral().GetSeq_data().IsNcbieaa()) {
3648         // already using NCBIEAA, must continue to do so
3649         last->SetLiteral().SetSeq_data().SetNcbieaa().Set().append(1, residue);
3650     } else {
3651         // so far, have not found residues that are not part of IUPACAA, can continue to use IUPACAA
3652         last->SetLiteral().SetSeq_data().SetIupacaa().Set().append(1, residue);
3653     }
3654 
3655     TSeqPos len = last->GetLiteral().GetLength();
3656     last->SetLiteral().SetLength(len + 1);
3657 }
3658 
3659 
AddGapToDeltaSeq(CRef<CBioseq> prot,bool unknown_length,TSeqPos add_len)3660 static void AddGapToDeltaSeq (CRef<CBioseq>prot, bool unknown_length, TSeqPos add_len)
3661 {
3662     if (prot->SetInst().SetExt().SetDelta().Set().empty()) {
3663         // create new segment for gap
3664         CRef<CDelta_seq> new_seg(new CDelta_seq());
3665         new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3666         new_seg->SetLiteral().SetLength(add_len);
3667         if (unknown_length) {
3668             new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3669         }
3670         prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3671     } else {
3672         CRef<CDelta_seq> last = prot->SetInst().SetExt().SetDelta().Set().back();
3673         if (last->SetLiteral().GetSeq_data().IsGap()
3674             && ((unknown_length && last->SetLiteral().IsSetFuzz())
3675                  || (!unknown_length && !last->SetLiteral().IsSetFuzz()))) {
3676             // ok, already creating gap segment with correct fuzz
3677             TSeqPos len = prot->GetInst().GetExt().GetDelta().Get().back()->GetLiteral().GetLength();
3678             prot->SetInst().SetExt().SetDelta().Set().back()->SetLiteral().SetLength(len + add_len);
3679         } else {
3680             // create new segment for gap
3681             CRef<CDelta_seq> new_seg(new CDelta_seq());
3682             new_seg->SetLiteral().SetSeq_data().SetGap().SetType(CSeq_gap::eType_unknown);
3683             new_seg->SetLiteral().SetLength(add_len);
3684             if (unknown_length) {
3685                 new_seg->SetLiteral().SetFuzz().SetLim(CInt_fuzz::eLim_unk);
3686             }
3687             prot->SetInst().SetExt().SetDelta().Set().push_back(new_seg);
3688         }
3689     }
3690 }
3691 
3692 
TranslateToProtein(const CSeq_feat & cds,CScope & scope)3693 CRef<CBioseq> CSeqTranslator::TranslateToProtein(const CSeq_feat& cds,
3694     CScope& scope)
3695 {
3696     const CGenetic_code* code = NULL;
3697     int frame = 0;
3698     if (cds.GetData().IsCdregion()) {
3699         const CCdregion& cdr = cds.GetData().GetCdregion();
3700         if (cdr.IsSetFrame()) {
3701             switch (cdr.GetFrame()) {
3702             case CCdregion::eFrame_two:
3703                 frame = 1;
3704                 break;
3705             case CCdregion::eFrame_three:
3706                 frame = 2;
3707                 break;
3708             default:
3709                 break;
3710             }
3711         }
3712         if (cdr.IsSetCode()) {
3713             code = &cdr.GetCode();
3714         }
3715     }
3716     bool is_5prime_complete = !cds.GetLocation().IsPartialStart(eExtreme_Biological);
3717 
3718     CSeqVector seq(cds.GetLocation(), scope, CBioseq_Handle::eCoding_Iupac);
3719     CConstRef<CSeqMap> map;
3720     map.Reset(&seq.GetSeqMap());
3721 
3722     CRef<CBioseq> prot(new CBioseq());
3723 
3724     prot->SetInst().SetRepr(CSeq_inst::eRepr_delta);
3725     prot->SetInst().SetMol(CSeq_inst::eMol_aa);
3726     prot->SetInst().SetLength(0);
3727 
3728     // reserve our space
3729     const TSeqPos usable_size = TSeqPos(seq.size()) - frame;
3730     const TSeqPos mod = usable_size % 3;
3731 
3732     // get appropriate translation table
3733     const CTrans_table & tbl =
3734         (code ? CGen_code_table::GetTransTable(*code) :
3735         CGen_code_table::GetTransTable(1));
3736 
3737     try {
3738         // main loop through bases
3739         CSeqVector::const_iterator start = seq.begin();
3740         for (int i = 0; i < frame; ++i) {
3741             ++start;
3742         }
3743 
3744         TSeqPos i;
3745         TSeqPos k;
3746         int state = 0;
3747         TSeqPos length = usable_size / 3;
3748         bool check_start = (is_5prime_complete && frame == 0);
3749         bool first_time = true;
3750 
3751         for (i = 0; i < length; ++i) {
3752             bool is_gap = true;
3753             bool unknown_length = false;
3754             TSeqPos pos = (i * 3) + frame;
3755 
3756             if (start.HasZeroGapBefore()) {
3757                 AddGapToDeltaSeq(prot, true, 0);
3758             }
3759 
3760             // loop through one codon at a time
3761             for (k = 0; k < 3; ++k, ++start) {
3762                 state = tbl.NextCodonState(state, *start);
3763                 if (seq.IsInGap(pos + k)) {
3764                     if (is_gap && !unknown_length) {
3765                         CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3766                         if (map_iter.GetType() == CSeqMap::eSeqGap
3767                             && map_iter.IsUnknownLength()) {
3768                             unknown_length = true;
3769                         }
3770                     }
3771                 } else {
3772                     is_gap = false;
3773                 }
3774             }
3775 
3776             if (is_gap) {
3777                 AddGapToDeltaSeq(prot, unknown_length, 1);
3778             } else {
3779                 // save translated amino acid
3780                 if (first_time  &&  check_start) {
3781                     AddAAToDeltaSeq(prot, tbl.GetStartResidue(state));
3782                 } else {
3783                     AddAAToDeltaSeq(prot, tbl.GetCodonResidue(state));
3784                 }
3785 
3786             }
3787 
3788             first_time = false;
3789         }
3790 
3791         if (mod) {
3792             bool is_gap = true;
3793             bool unknown_length = false;
3794             TSeqPos pos = (length * 3) + frame;
3795             for (k = 0; k < mod; ++k, ++start) {
3796                 state = tbl.NextCodonState(state, *start);
3797                 if (seq.IsInGap(pos + k)) {
3798                     if (is_gap && !unknown_length) {
3799                         CSeqMap_CI map_iter(map, &scope, SSeqMapSelector(), pos + k);
3800                         if (map_iter.GetType() == CSeqMap::eSeqGap) {
3801                             if (map_iter.IsUnknownLength()) {
3802                                 unknown_length = true;
3803                             }
3804                         }
3805                     }
3806                 } else {
3807                     is_gap = false;
3808                 }
3809             }
3810 
3811             if (is_gap) {
3812                 AddGapToDeltaSeq(prot, unknown_length, 1);
3813             } else {
3814                 for (; k < 3; ++k) {
3815                     state = tbl.NextCodonState(state, 'N');
3816                 }
3817 
3818                 // save translated amino acid
3819                 char c = tbl.GetCodonResidue(state);
3820                 if (c != 'X') {
3821                     if (first_time  &&  check_start) {
3822                         AddAAToDeltaSeq(prot, tbl.GetStartResidue(state));
3823                     } else {
3824                         AddAAToDeltaSeq(prot, tbl.GetCodonResidue(state));
3825                     }
3826                 }
3827             }
3828         }
3829     } catch (CSeqVectorException& /*ex*/) {
3830         // ran out of sequence
3831     }
3832 
3833     TSeqPos prot_len = 0;
3834     ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
3835         prot_len += (*seg_it)->GetLiteral().GetLength();
3836     }
3837 
3838     // code break substitution
3839     if (cds.GetData().IsCdregion() &&
3840         cds.GetData().GetCdregion().IsSetCode_break()) {
3841         const CCdregion& cdr = cds.GetData().GetCdregion();
3842         ITERATE(CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
3843             const CRef <CCode_break> brk = *code_break;
3844             const CSeq_loc& cbk_loc = brk->GetLoc();
3845             TSeqPos seq_pos =
3846                 sequence::LocationOffset(cds.GetLocation(), cbk_loc,
3847                 sequence::eOffset_FromStart,
3848                 &scope);
3849             seq_pos -= frame;
3850             string::size_type j = seq_pos / 3;
3851             if (j < prot_len) {
3852                 const CCode_break::C_Aa& c_aa = brk->GetAa();
3853                 if (c_aa.IsNcbieaa()) {
3854                     CDelta_ext::Tdata::iterator seg_it = prot->SetInst().SetExt().SetDelta().Set().begin();
3855                     string::size_type offset = 0;
3856                     while (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
3857                         && offset + (*seg_it)->GetLiteral().GetLength() < j) {
3858                         offset += (*seg_it)->GetLiteral().GetLength();
3859                         ++seg_it;
3860                     }
3861                     if (seg_it != prot->SetInst().SetExt().SetDelta().Set().end()
3862                         && !(*seg_it)->GetLiteral().GetSeq_data().IsGap()) {
3863                         if ((*seg_it)->GetLiteral().GetSeq_data().IsIupacaa()) {
3864                             (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set()[j - offset] = c_aa.GetNcbieaa();
3865                         } else {
3866                             (*seg_it)->SetLiteral().SetSeq_data().SetNcbieaa().Set()[j - offset] = c_aa.GetNcbieaa();
3867                         }
3868                     }
3869                 }
3870             } else if (j == prot_len) {
3871                 // add terminal exception
3872                 const CCode_break::C_Aa& c_aa = brk->GetAa();
3873                 if (c_aa.IsNcbieaa() && c_aa.GetNcbieaa() == 42) {
3874                     AddAAToDeltaSeq(prot, c_aa.GetNcbieaa());
3875                 }
3876             }
3877         }
3878     }
3879 
3880     // remove stop codon from end
3881     CRef<CDelta_seq> end;
3882     if (!prot->SetInst().SetExt().SetDelta().Set().empty())
3883     {
3884         end = prot->SetInst().SetExt().SetDelta().Set().back();
3885     }
3886 
3887     if (end && end->IsLiteral() && end->GetLiteral().IsSetSeq_data()) {
3888         if (end->GetLiteral().GetSeq_data().IsIupacaa()) {
3889             string& last_seg = end->SetLiteral().SetSeq_data().SetIupacaa().Set();
3890             if (NStr::EndsWith(last_seg, "*")) {
3891                 last_seg = last_seg.substr(0, last_seg.length() - 1);
3892                 end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
3893             }
3894         } else if (end->GetLiteral().GetSeq_data().IsNcbieaa()) {
3895             string& last_seg = end->SetLiteral().SetSeq_data().SetNcbieaa().Set();
3896             if (NStr::EndsWith(last_seg, "*")) {
3897                 last_seg = last_seg.substr(0, last_seg.length() - 1);
3898                 end->SetLiteral().SetLength(TSeqPos(last_seg.length()));
3899             }
3900         }
3901     }
3902 
3903     // recalculate protein length, check need for ncbieaa - may have been altered by removal of stop codon/transl_except
3904     prot_len = 0;
3905     NON_CONST_ITERATE(CDelta_ext::Tdata, seg_it, prot->SetInst().SetExt().SetDelta().Set()) {
3906         prot_len += (*seg_it)->GetLiteral().GetLength();
3907         if ((*seg_it)->GetLiteral().IsSetSeq_data()
3908             && (*seg_it)->GetLiteral().GetSeq_data().IsNcbieaa()) {
3909             string current = (*seg_it)->GetLiteral().GetSeq_data().GetNcbieaa();
3910             if (NStr::Find(current, "*") == string::npos && NStr::Find(current, "-") == string::npos) {
3911                 (*seg_it)->SetLiteral().SetSeq_data().SetIupacaa().Set(current);
3912             }
3913         }
3914     }
3915     prot->SetInst().SetLength(prot_len);
3916 
3917     if (prot->GetInst().GetLength() == 0) {
3918         prot.Reset(NULL);
3919     } else if (prot->SetInst().SetExt().SetDelta().Set().size() == 1
3920         && prot->SetInst().SetExt().SetDelta().Set().front()->IsLiteral()
3921         && prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().IsSetSeq_data()) {
3922         // only one segment, should be raw rather than delta
3923         if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsIupacaa()) {
3924             string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetIupacaa().Get();
3925             prot->SetInst().ResetExt();
3926             prot->SetInst().SetSeq_data().SetIupacaa().Set(data);
3927             prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
3928         } else if (prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().IsNcbieaa()) {
3929             string data = prot->SetInst().SetExt().SetDelta().Set().front()->GetLiteral().GetSeq_data().GetNcbieaa().Get();
3930             prot->SetInst().ResetExt();
3931             prot->SetInst().SetSeq_data().SetNcbieaa().Set(data);
3932             prot->SetInst().SetRepr(CSeq_inst::eRepr_raw);
3933         }
3934     }
3935 
3936     return prot;
3937 }
3938 
3939 
ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein)3940 bool CSeqTranslator::ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein)
3941 {
3942     if (!protein || !protein->IsAa() || !protein->IsSetInst()) {
3943         return false;
3944     }
3945     return protein->SetInst().ConvertDeltaToRaw();
3946 }
3947 
3948 
Translate(const string & seq,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start,bool is_5prime_complete,bool is_3prime_complete)3949 void CSeqTranslator::Translate(const string& seq, string& prot,
3950                                const CGenetic_code* code,
3951                                bool include_stop,
3952                                bool remove_trailing_X,
3953                                bool* alt_start,
3954                                bool is_5prime_complete,
3955                                bool is_3prime_complete)
3956 {
3957     x_Translate(seq, prot, 0, code,
3958                 is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
3959 }
3960 
3961 
Translate(const string & seq,string & prot,TTranslationFlags flags,const CGenetic_code * code,bool * alt_start)3962 void CSeqTranslator::Translate(const string& seq,
3963                                string& prot,
3964                                TTranslationFlags flags,
3965                                const CGenetic_code* code,
3966                                bool* alt_start)
3967 {
3968     x_Translate(seq, prot, 0, code,
3969                 !(flags & fIs5PrimePartial),
3970                 !(flags & fIs3PrimePartial),
3971                 !(flags & fNoStop),
3972                 flags & fRemoveTrailingX,
3973                 alt_start);
3974 }
3975 
3976 
Translate(const CSeqVector & seq,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start,bool is_5prime_complete,bool is_3prime_complete)3977 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
3978                                const CGenetic_code* code,
3979                                bool include_stop,
3980                                bool remove_trailing_X,
3981                                bool* alt_start,
3982                                bool is_5prime_complete,
3983                                bool is_3prime_complete)
3984 {
3985     x_Translate(seq, prot, 0, code,
3986                 is_5prime_complete, is_3prime_complete, include_stop, remove_trailing_X, alt_start);
3987 }
3988 
3989 
Translate(const CSeqVector & seq,string & prot,TTranslationFlags flags,const CGenetic_code * code,bool * alt_start)3990 void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
3991                                TTranslationFlags flags,
3992                                const CGenetic_code* code,
3993                                bool* alt_start)
3994 {
3995     x_Translate(seq, prot, 0, code,
3996                 !(flags & fIs5PrimePartial),
3997                 !(flags & fIs3PrimePartial),
3998                 !(flags & fNoStop),
3999                 flags & fRemoveTrailingX,
4000                 alt_start);
4001 }
4002 
4003 
Translate(const CSeq_loc & loc,const CBioseq_Handle & handle,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start)4004 void CSeqTranslator::Translate(const CSeq_loc& loc,
4005                                const CBioseq_Handle& handle,
4006                                string& prot,
4007                                const CGenetic_code* code,
4008                                bool include_stop,
4009                                bool remove_trailing_X,
4010                                bool* alt_start)
4011 {
4012     CSeqVector seq(loc, handle.GetScope(), CBioseq_Handle::eCoding_Iupac);
4013     x_Translate(seq, prot, 0, code,
4014                 !loc.IsPartialStart(eExtreme_Biological),
4015                 !loc.IsPartialStop(eExtreme_Biological),
4016                 include_stop, remove_trailing_X, alt_start);
4017 }
4018 
4019 
4020 
Translate(const CSeq_loc & loc,CScope & scope,string & prot,const CGenetic_code * code,bool include_stop,bool remove_trailing_X,bool * alt_start)4021 void CSeqTranslator::Translate(const CSeq_loc& loc,
4022                                CScope& scope,
4023                                string& prot,
4024                                const CGenetic_code* code,
4025                                bool include_stop,
4026                                bool remove_trailing_X,
4027                                bool* alt_start)
4028 {
4029     CSeqVector seq(loc, scope, CBioseq_Handle::eCoding_Iupac);
4030     x_Translate(seq, prot, 0, code,
4031                 !loc.IsPartialStart(eExtreme_Biological),
4032                 !loc.IsPartialStop(eExtreme_Biological),
4033                 include_stop, remove_trailing_X, alt_start);
4034 }
4035 
4036 
Translate(const CSeq_feat & feat,CScope & scope,string & prot,bool include_stop,bool remove_trailing_X,bool * alt_start)4037 void CSeqTranslator::Translate(const CSeq_feat& feat,
4038                                CScope& scope,
4039                                string& prot,
4040                                bool include_stop,
4041                                bool remove_trailing_X,
4042                                bool* alt_start)
4043 {
4044     const CGenetic_code* code = NULL;
4045     int frame = 0;
4046     if (feat.GetData().IsCdregion()) {
4047         const CCdregion& cdr = feat.GetData().GetCdregion();
4048         if (cdr.IsSetFrame ()) {
4049             switch (cdr.GetFrame ()) {
4050             case CCdregion::eFrame_two :
4051                 frame = 1;
4052                 break;
4053             case CCdregion::eFrame_three :
4054                 frame = 2;
4055                 break;
4056             default :
4057                 break;
4058             }
4059         }
4060         if (cdr.IsSetCode()) {
4061             code = &cdr.GetCode();
4062         }
4063     }
4064 
4065     bool code_break_include_stop = include_stop;
4066     if (feat.GetData().IsCdregion()  &&
4067         feat.GetData().GetCdregion().IsSetCode_break()) {
4068         code_break_include_stop = true;
4069     }
4070 
4071     CSeqVector seq(feat.GetLocation(), scope, CBioseq_Handle::eCoding_Iupac);
4072     x_Translate(seq, prot, frame, code,
4073                 !feat.GetLocation().IsPartialStart(eExtreme_Biological),
4074                 !feat.GetLocation().IsPartialStop(eExtreme_Biological),
4075                 code_break_include_stop, remove_trailing_X, alt_start);
4076 
4077 
4078     // code break substitution
4079     if (feat.GetData().IsCdregion()  &&
4080         feat.GetData().GetCdregion().IsSetCode_break()) {
4081         const CCdregion& cdr = feat.GetData().GetCdregion();
4082         string::size_type protlen = prot.size();
4083         ITERATE (CCdregion::TCode_break, code_break, cdr.GetCode_break()) {
4084             const CRef <CCode_break> brk = *code_break;
4085             const CSeq_loc& cbk_loc = brk->GetLoc();
4086             TSeqPos seq_pos =
4087                 sequence::LocationOffset(feat.GetLocation(), cbk_loc,
4088                                          sequence::eOffset_FromStart,
4089                                          &scope);
4090             seq_pos -= frame;
4091             string::size_type i = seq_pos / 3;
4092             if (i < protlen) {
4093                 const CCode_break::C_Aa& c_aa = brk->GetAa ();
4094                 if (c_aa.IsNcbieaa ()) {
4095                     prot [i] = c_aa.GetNcbieaa ();
4096                 }
4097             } else if (i == protlen) {
4098                 // add terminal exception
4099                 const CCode_break::C_Aa& c_aa = brk->GetAa ();
4100                 if (c_aa.IsNcbieaa () && c_aa.GetNcbieaa () == 42) {
4101                     prot += c_aa.GetNcbieaa ();
4102                 }
4103             }
4104         }
4105 
4106         if ( !include_stop ) {
4107             SIZE_TYPE sz = prot.find_first_of("*");
4108             if (sz != string::npos) {
4109                 prot.resize(sz);
4110             }
4111         }
4112     }
4113 }
4114 
4115 
4116 typedef struct {
4117     bool has_final_stop;
4118     bool has_internal_stop;
4119     bool has_start_m;
4120     size_t len;
4121     size_t frame_offset;
4122 } SFrameInfo;
4123 
4124 typedef map<CCdregion::EFrame, SFrameInfo> TFrameInfoMap;
4125 
FindBestFrame(const CSeq_feat & cds,CScope & scope,bool & ambiguous)4126 CCdregion::EFrame CSeqTranslator::FindBestFrame(const CSeq_feat& cds, CScope& scope, bool& ambiguous)
4127 {
4128     ambiguous = false;
4129     if (!cds.IsSetLocation() || !cds.IsSetData() || !cds.GetData().IsCdregion()) {
4130         return CCdregion::eFrame_not_set;
4131     }
4132     const CCdregion& cdr = cds.GetData().GetCdregion();
4133 
4134     CCdregion::EFrame orig_frame = cdr.IsSetFrame() ? cdr.GetFrame() : CCdregion::eFrame_one;
4135     if (orig_frame == CCdregion::eFrame_not_set) {
4136         orig_frame = CCdregion::eFrame_one;
4137     }
4138 
4139     CRef<CSeq_feat> tmp_cds(new CSeq_feat());
4140     tmp_cds->Assign(cds);
4141     TFrameInfoMap frame_map;
4142     frame_map[CCdregion::eFrame_one] = { false, false, false, NPOS, 0 };
4143     frame_map[CCdregion::eFrame_two] = { false, false, false, NPOS, 1 };
4144     frame_map[CCdregion::eFrame_three] = { false, false, false, NPOS, 2 };
4145 
4146     bool is_3complete = !tmp_cds->GetLocation().IsPartialStop(eExtreme_Biological);
4147     bool is_5complete = !tmp_cds->GetLocation().IsPartialStart(eExtreme_Biological);
4148 
4149     size_t leftover = sequence::GetLength(tmp_cds->GetLocation(), &scope) % 3;
4150 
4151     for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4152         tmp_cds->SetData().SetCdregion().SetFrame(it->first);
4153         string prot;
4154         CSeqTranslator::Translate(*tmp_cds, scope, prot, true, false, NULL);
4155         size_t pos = NStr::Find(prot, "*");
4156         it->second.len = prot.length();
4157 
4158         if ((pos == prot.length() - 1) && (leftover == it->second.frame_offset)) {
4159             it->second.has_final_stop = true;
4160         } else if (pos != NPOS) {
4161             it->second.has_internal_stop = true;
4162         }
4163 
4164         if (NStr::StartsWith(prot, "M") && it->second.frame_offset == 0) {
4165             it->second.has_start_m = true;
4166         }
4167     }
4168 
4169     // if the original frame has no internal stop codons and has a final
4170     // stop codon, keep the original frame
4171     if (frame_map[orig_frame].has_final_stop) {
4172         return orig_frame;
4173     }
4174 
4175     if (is_3complete && !is_5complete) {
4176         // find a frame that has a stop codon
4177         for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4178             if (it->second.has_final_stop) {
4179                 return it->first;
4180             }
4181         }
4182     }
4183 
4184     if (is_5complete && !is_3complete) {
4185         // find a frame that has a start codon (could only be first frame)
4186         if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4187             return CCdregion::eFrame_one;
4188         }
4189     }
4190 
4191     if (is_5complete) {
4192         // find a frame that has a start codon (could only be first frame)
4193         if (frame_map[CCdregion::eFrame_one].has_start_m && !frame_map[CCdregion::eFrame_one].has_internal_stop) {
4194             return CCdregion::eFrame_one;
4195         }
4196     }
4197 
4198     if (is_3complete) {
4199         // find a frame that has a stop codon
4200         for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4201             if (it->second.has_final_stop) {
4202                 return it->first;
4203             }
4204         }
4205     }
4206 
4207     // otherwise, just looking for no internal stop codon
4208     if (!frame_map[orig_frame].has_internal_stop) {
4209         return orig_frame;
4210     }
4211 
4212     CCdregion::EFrame best_frame = CCdregion::eFrame_not_set;
4213     for (auto it = frame_map.begin(); it != frame_map.end(); it++) {
4214         if (!it->second.has_internal_stop) {
4215             if (best_frame == CCdregion::eFrame_not_set) {
4216                 best_frame = it->first;
4217             } else {
4218                 ambiguous = true;
4219             }
4220         }
4221     }
4222     if (best_frame != CCdregion::eFrame_not_set) {
4223         return best_frame;
4224     } else {
4225         return orig_frame;
4226     }
4227 }
4228 
4229 
FindBestFrame(const CSeq_feat & cds,CScope & scope)4230 CCdregion::EFrame CSeqTranslator::FindBestFrame(const CSeq_feat& cds, CScope& scope)
4231 {
4232     bool ambiguous = false;
4233 
4234     return FindBestFrame(cds, scope, ambiguous);
4235 }
4236 
4237 
TranslateCdregion(string & prot,const CBioseq_Handle & bsh,const CSeq_loc & loc,const CCdregion & cdr,bool include_stop,bool remove_trailing_X,bool * alt_start,ETranslationLengthProblemOptions)4238 void CCdregion_translate::TranslateCdregion (string& prot,
4239                                              const CBioseq_Handle& bsh,
4240                                              const CSeq_loc& loc,
4241                                              const CCdregion& cdr,
4242                                              bool include_stop,
4243                                              bool remove_trailing_X,
4244                                              bool* alt_start,
4245                                              ETranslationLengthProblemOptions /*options*/)
4246 {
4247     CSeq_feat feat;
4248     feat.SetLocation(const_cast<CSeq_loc&>(loc));
4249     feat.SetData().SetCdregion(const_cast<CCdregion&>(cdr));
4250     CSeqTranslator::Translate(feat, bsh.GetScope(), prot,
4251                               include_stop, remove_trailing_X, alt_start);
4252 }
4253 
4254 
TranslateCdregion(string & prot,const CSeq_feat & cds,CScope & scope,bool include_stop,bool remove_trailing_X,bool * alt_start,ETranslationLengthProblemOptions)4255 void CCdregion_translate::TranslateCdregion (
4256         string& prot,
4257         const CSeq_feat& cds,
4258         CScope& scope,
4259         bool include_stop,
4260         bool remove_trailing_X,
4261         bool* alt_start,
4262         ETranslationLengthProblemOptions /*options*/)
4263 {
4264     _ASSERT(cds.GetData().IsCdregion());
4265     prot.erase();
4266     CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
4267     if ( !bsh ) {
4268         return;
4269     }
4270     CSeqTranslator::Translate(cds, bsh.GetScope(), prot,
4271                               include_stop, remove_trailing_X, alt_start);
4272 }
4273 
4274 
SRelLoc(const CSeq_loc & parent,const CSeq_loc & child,CScope * scope,SRelLoc::TFlags flags)4275 SRelLoc::SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope,
4276                  SRelLoc::TFlags flags)
4277     : m_ParentLoc(&parent)
4278 {
4279     typedef CSeq_loc::TRange TRange0;
4280     for (CSeq_loc_CI cit(child);  cit;  ++cit) {
4281         const CSeq_id& cseqid  = cit.GetSeq_id();
4282         TRange0        crange  = cit.GetRange();
4283         if (crange.IsWholeTo()  &&  scope) {
4284             // determine actual end
4285             crange.SetToOpen(sequence::GetLength(cit.GetSeq_id(), scope));
4286         }
4287         ENa_strand     cstrand = cit.GetStrand();
4288         TSeqPos        pos     = 0;
4289         for (CSeq_loc_CI pit(parent);  pit;  ++pit) {
4290             ENa_strand pstrand = pit.GetStrand();
4291             TRange0    prange  = pit.GetRange();
4292             if (prange.IsWholeTo()  &&  scope) {
4293                 // determine actual end
4294                 prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4295             }
4296             if ( !sequence::IsSameBioseq(cseqid, pit.GetSeq_id(), scope) ) {
4297                 pos += prange.GetLength();
4298                 continue;
4299             }
4300             CRef<TRange>         intersection(new TRange);
4301             TSeqPos              abs_from, abs_to;
4302             CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
4303             if (crange.GetFrom() >= prange.GetFrom()) {
4304                 abs_from  = crange.GetFrom();
4305                 fuzz_from = cit.GetFuzzFrom();
4306                 if (abs_from == prange.GetFrom()) {
4307                     // subtract out parent fuzz, if any
4308                     const CInt_fuzz* pfuzz = pit.GetFuzzFrom();
4309                     if (pfuzz) {
4310                         if (fuzz_from) {
4311                             CRef<CInt_fuzz> f(new CInt_fuzz);
4312                             f->Assign(*fuzz_from);
4313                             f->Subtract(*pfuzz, abs_from, abs_from);
4314                             if (f->IsP_m()  &&  !f->GetP_m() ) {
4315                                 fuzz_from.Reset(); // cancelled
4316                             } else {
4317                                 fuzz_from = f;
4318                             }
4319                         } else {
4320                             fuzz_from = pfuzz->Negative(abs_from);
4321                         }
4322                     }
4323                 }
4324             } else {
4325                 abs_from  = prange.GetFrom();
4326                 // fuzz_from = pit.GetFuzzFrom();
4327                 CRef<CInt_fuzz> f(new CInt_fuzz);
4328                 f->SetLim(CInt_fuzz::eLim_lt);
4329                 fuzz_from = f;
4330             }
4331             if (crange.GetTo() <= prange.GetTo()) {
4332                 abs_to  = crange.GetTo();
4333                 fuzz_to = cit.GetFuzzTo();
4334                 if (abs_to == prange.GetTo()) {
4335                     // subtract out parent fuzz, if any
4336                     const CInt_fuzz* pfuzz = pit.GetFuzzTo();
4337                     if (pfuzz) {
4338                         if (fuzz_to) {
4339                             CRef<CInt_fuzz> f(new CInt_fuzz);
4340                             f->Assign(*fuzz_to);
4341                             f->Subtract(*pfuzz, abs_to, abs_to);
4342                             if (f->IsP_m()  &&  !f->GetP_m() ) {
4343                                 fuzz_to.Reset(); // cancelled
4344                             } else {
4345                                 fuzz_to = f;
4346                             }
4347                         } else {
4348                             fuzz_to = pfuzz->Negative(abs_to);
4349                         }
4350                     }
4351                 }
4352             } else {
4353                 abs_to  = prange.GetTo();
4354                 // fuzz_to = pit.GetFuzzTo();
4355                 CRef<CInt_fuzz> f(new CInt_fuzz);
4356                 f->SetLim(CInt_fuzz::eLim_gt);
4357                 fuzz_to = f;
4358             }
4359             if (abs_from <= abs_to) {
4360                 if (IsReverse(pstrand)) {
4361                     TSeqPos sigma = pos + prange.GetTo();
4362                     intersection->SetFrom(sigma - abs_to);
4363                     intersection->SetTo  (sigma - abs_from);
4364                     if (fuzz_from) {
4365                         intersection->SetFuzz_to().AssignTranslated
4366                             (*fuzz_from, intersection->GetTo(), abs_from);
4367                         intersection->SetFuzz_to().Negate
4368                             (intersection->GetTo());
4369                     }
4370                     if (fuzz_to) {
4371                         intersection->SetFuzz_from().AssignTranslated
4372                             (*fuzz_to, intersection->GetFrom(), abs_to);
4373                         intersection->SetFuzz_from().Negate
4374                             (intersection->GetFrom());
4375                     }
4376                     if (cstrand == eNa_strand_unknown) {
4377                         intersection->SetStrand(pstrand);
4378                     } else {
4379                         intersection->SetStrand(Reverse(cstrand));
4380                     }
4381                 } else {
4382                     TSignedSeqPos delta = pos - prange.GetFrom();
4383                     intersection->SetFrom(abs_from + delta);
4384                     intersection->SetTo  (abs_to   + delta);
4385                     if (fuzz_from) {
4386                         intersection->SetFuzz_from().AssignTranslated
4387                             (*fuzz_from, intersection->GetFrom(), abs_from);
4388                     }
4389                     if (fuzz_to) {
4390                         intersection->SetFuzz_to().AssignTranslated
4391                             (*fuzz_to, intersection->GetTo(), abs_to);
4392                     }
4393                     if (cstrand == eNa_strand_unknown) {
4394                         intersection->SetStrand(pstrand);
4395                     } else {
4396                         intersection->SetStrand(cstrand);
4397                     }
4398                 }
4399                 // add to m_Ranges, combining with the previous
4400                 // interval if possible
4401                 if ( !(flags & fNoMerge)  &&  !m_Ranges.empty()
4402                     &&  SameOrientation(intersection->GetStrand(),
4403                                         m_Ranges.back()->GetStrand()) ) {
4404                     if (m_Ranges.back()->GetTo() == intersection->GetFrom() - 1
4405                         &&  !IsReverse(intersection->GetStrand()) ) {
4406                         m_Ranges.back()->SetTo(intersection->GetTo());
4407                         if (intersection->IsSetFuzz_to()) {
4408                             m_Ranges.back()->SetFuzz_to
4409                                 (intersection->SetFuzz_to());
4410                         } else {
4411                             m_Ranges.back()->ResetFuzz_to();
4412                         }
4413                     } else if (m_Ranges.back()->GetFrom()
4414                                == intersection->GetTo() + 1
4415                                &&  IsReverse(intersection->GetStrand())) {
4416                         m_Ranges.back()->SetFrom(intersection->GetFrom());
4417                         if (intersection->IsSetFuzz_from()) {
4418                             m_Ranges.back()->SetFuzz_from
4419                                 (intersection->SetFuzz_from());
4420                         } else {
4421                             m_Ranges.back()->ResetFuzz_from();
4422                         }
4423                     } else {
4424                         m_Ranges.push_back(intersection);
4425                     }
4426                 } else {
4427                     m_Ranges.push_back(intersection);
4428                 }
4429             }
4430             pos += prange.GetLength();
4431         }
4432     }
4433 }
4434 
4435 
4436 // Bother trying to merge?
Resolve(const CSeq_loc & new_parent,CScope * scope,SRelLoc::TFlags) const4437 CRef<CSeq_loc> SRelLoc::Resolve(const CSeq_loc& new_parent, CScope* scope,
4438                                 SRelLoc::TFlags /* flags */)
4439     const
4440 {
4441     typedef CSeq_loc::TRange TRange0;
4442     CRef<CSeq_loc> result(new CSeq_loc);
4443     CSeq_loc_mix&  mix = result->SetMix();
4444     ITERATE (TRanges, it, m_Ranges) {
4445         _ASSERT((*it)->GetFrom() <= (*it)->GetTo());
4446         TSeqPos pos = 0, start = (*it)->GetFrom();
4447         bool    keep_going = true;
4448         for (CSeq_loc_CI pit(new_parent);  pit;  ++pit) {
4449             TRange0 prange = pit.GetRange();
4450             if (prange.IsWholeTo()  &&  scope) {
4451                 // determine actual end
4452                 prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
4453             }
4454             TSeqPos length = prange.GetLength();
4455             if (start >= pos  &&  start < pos + length) {
4456                 TSeqPos              from, to;
4457                 CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
4458                 ENa_strand           strand;
4459                 if (IsReverse(pit.GetStrand())) {
4460                     TSeqPos sigma = pos + prange.GetTo();
4461                     from = sigma - (*it)->GetTo();
4462                     to   = sigma - start;
4463                     if (from < prange.GetFrom()  ||  from > sigma) {
4464                         from = prange.GetFrom();
4465                         keep_going = true;
4466                     } else {
4467                         keep_going = false;
4468                     }
4469                     if ( !(*it)->IsSetStrand()
4470                         ||  (*it)->GetStrand() == eNa_strand_unknown) {
4471                         strand = pit.GetStrand();
4472                     } else {
4473                         strand = Reverse((*it)->GetStrand());
4474                     }
4475                     if (from == prange.GetFrom()) {
4476                         fuzz_from = pit.GetFuzzFrom();
4477                     }
4478                     if ( !keep_going  &&  (*it)->IsSetFuzz_to() ) {
4479                         CRef<CInt_fuzz> f(new CInt_fuzz);
4480                         if (fuzz_from) {
4481                             f->Assign(*fuzz_from);
4482                         } else {
4483                             f->SetP_m(0);
4484                         }
4485                         f->Subtract((*it)->GetFuzz_to(), from, (*it)->GetTo(),
4486                                     CInt_fuzz::eAmplify);
4487                         if (f->IsP_m()  &&  !f->GetP_m() ) {
4488                             fuzz_from.Reset(); // cancelled
4489                         } else {
4490                             fuzz_from = f;
4491                         }
4492                     }
4493                     if (to == prange.GetTo()) {
4494                         fuzz_to = pit.GetFuzzTo();
4495                     }
4496                     if (start == (*it)->GetFrom()
4497                         &&  (*it)->IsSetFuzz_from()) {
4498                         CRef<CInt_fuzz> f(new CInt_fuzz);
4499                         if (fuzz_to) {
4500                             f->Assign(*fuzz_to);
4501                         } else {
4502                             f->SetP_m(0);
4503                         }
4504                         f->Subtract((*it)->GetFuzz_from(), to,
4505                                     (*it)->GetFrom(), CInt_fuzz::eAmplify);
4506                         if (f->IsP_m()  &&  !f->GetP_m() ) {
4507                             fuzz_to.Reset(); // cancelled
4508                         } else {
4509                             fuzz_to = f;
4510                         }
4511                     }
4512                 } else {
4513                     TSignedSeqPos delta = prange.GetFrom() - pos;
4514                     from = start          + delta;
4515                     to   = (*it)->GetTo() + delta;
4516                     if (to > prange.GetTo()) {
4517                         to = prange.GetTo();
4518                         keep_going = true;
4519                     } else {
4520                         keep_going = false;
4521                     }
4522                     if ( !(*it)->IsSetStrand()
4523                         ||  (*it)->GetStrand() == eNa_strand_unknown) {
4524                         strand = pit.GetStrand();
4525                     } else {
4526                         strand = (*it)->GetStrand();
4527                     }
4528                     if (from == prange.GetFrom()) {
4529                         fuzz_from = pit.GetFuzzFrom();
4530                     }
4531                     if (start == (*it)->GetFrom()
4532                         &&  (*it)->IsSetFuzz_from()) {
4533                         CRef<CInt_fuzz> f(new CInt_fuzz);
4534                         if (fuzz_from) {
4535                             f->Assign(*fuzz_from);
4536                             f->Add((*it)->GetFuzz_from(), from,
4537                                    (*it)->GetFrom());
4538                         } else {
4539                             f->AssignTranslated((*it)->GetFuzz_from(), from,
4540                                                 (*it)->GetFrom());
4541                         }
4542                         if (f->IsP_m()  &&  !f->GetP_m() ) {
4543                             fuzz_from.Reset(); // cancelled
4544                         } else {
4545                             fuzz_from = f;
4546                         }
4547                     }
4548                     if (to == prange.GetTo()) {
4549                         fuzz_to = pit.GetFuzzTo();
4550                     }
4551                     if ( !keep_going  &&  (*it)->IsSetFuzz_to() ) {
4552                         CRef<CInt_fuzz> f(new CInt_fuzz);
4553                         if (fuzz_to) {
4554                             f->Assign(*fuzz_to);
4555                             f->Add((*it)->GetFuzz_to(), to, (*it)->GetTo());
4556                         } else {
4557                             f->AssignTranslated((*it)->GetFuzz_to(), to,
4558                                                 (*it)->GetTo());
4559                         }
4560                         if (f->IsP_m()  &&  !f->GetP_m() ) {
4561                             fuzz_to.Reset(); // cancelled
4562                         } else {
4563                             fuzz_to = f;
4564                         }
4565                     }
4566                 }
4567                 if (from == to
4568                     &&  (fuzz_from == fuzz_to
4569                          ||  (fuzz_from.GetPointer()  &&  fuzz_to.GetPointer()
4570                               &&  fuzz_from->Equals(*fuzz_to)))) {
4571                     // just a point
4572                     CRef<CSeq_loc> loc(new CSeq_loc);
4573                     CSeq_point& point = loc->SetPnt();
4574                     point.SetPoint(from);
4575                     if (strand != eNa_strand_unknown) {
4576                         point.SetStrand(strand);
4577                     }
4578                     if (fuzz_from) {
4579                         point.SetFuzz().Assign(*fuzz_from);
4580                     }
4581                     point.SetId().Assign(pit.GetSeq_id());
4582                     mix.Set().push_back(loc);
4583                 } else {
4584                     CRef<CSeq_loc> loc(new CSeq_loc);
4585                     CSeq_interval& ival = loc->SetInt();
4586                     ival.SetFrom(from);
4587                     ival.SetTo(to);
4588                     if (strand != eNa_strand_unknown) {
4589                         ival.SetStrand(strand);
4590                     }
4591                     if (fuzz_from) {
4592                         ival.SetFuzz_from().Assign(*fuzz_from);
4593                     }
4594                     if (fuzz_to) {
4595                         ival.SetFuzz_to().Assign(*fuzz_to);
4596                     }
4597                     ival.SetId().Assign(pit.GetSeq_id());
4598                     mix.Set().push_back(loc);
4599                 }
4600                 if (keep_going) {
4601                     start = pos + length;
4602                 } else {
4603                     break;
4604                 }
4605             }
4606             pos += length;
4607         }
4608         if (keep_going) {
4609             TSeqPos total_length;
4610             string  label;
4611             new_parent.GetLabel(&label);
4612             try {
4613                 total_length = sequence::GetLength(new_parent, scope);
4614                 ERR_POST_X(8, Warning << "SRelLoc::Resolve: Relative position "
4615                            << start << " exceeds length (" << total_length
4616                            << ") of parent location " << label);
4617             } catch (CObjmgrUtilException) {
4618                 ERR_POST_X(9, Warning << "SRelLoc::Resolve: Relative position "
4619                            << start
4620                            << " exceeds length (?\?\?) of parent location "
4621                            << label);
4622             }
4623         }
4624     }
4625     // clean up output
4626     switch (mix.Get().size()) {
4627     case 0:
4628         result->SetNull();
4629         break;
4630     case 1:
4631     {{
4632         CRef<CSeq_loc> first = mix.Set().front();
4633         result = first;
4634         break;
4635     }}
4636     default:
4637         break;
4638     }
4639     return result;
4640 }
4641 
4642 
4643 //============================================================================//
4644 //                                 SeqSearch                                  //
4645 //============================================================================//
4646 
4647 // Public:
4648 // =======
4649 
4650 // Constructors and Destructors:
CSeqSearch(IClient * client,TSearchFlags flags)4651 CSeqSearch::CSeqSearch(IClient *client, TSearchFlags flags) :
4652     m_Client(client), m_Flags(flags), m_LongestPattern(0), m_Fsa(true)
4653 {
4654 }
4655 
4656 
~CSeqSearch(void)4657 CSeqSearch::~CSeqSearch(void)
4658 {
4659 }
4660 
4661 
4662 typedef SStaticPair<Char, Char> TCharPair;
4663 static const TCharPair sc_comp_tbl[32] = {
4664     // uppercase
4665     { 'A', 'T' },
4666     { 'B', 'V' },
4667     { 'C', 'G' },
4668     { 'D', 'H' },
4669     { 'G', 'C' },
4670     { 'H', 'D' },
4671     { 'K', 'M' },
4672     { 'M', 'K' },
4673     { 'N', 'N' },
4674     { 'R', 'Y' },
4675     { 'S', 'S' },
4676     { 'T', 'A' },
4677     { 'U', 'A' },
4678     { 'V', 'B' },
4679     { 'W', 'W' },
4680     { 'Y', 'R' },
4681     // lowercase
4682     { 'a', 'T' },
4683     { 'b', 'V' },
4684     { 'c', 'G' },
4685     { 'd', 'H' },
4686     { 'g', 'C' },
4687     { 'h', 'D' },
4688     { 'k', 'M' },
4689     { 'm', 'K' },
4690     { 'n', 'N' },
4691     { 'r', 'Y' },
4692     { 's', 'S' },
4693     { 't', 'A' },
4694     { 'u', 'A' },
4695     { 'v', 'B' },
4696     { 'w', 'W' },
4697     { 'y', 'R' },
4698 };
4699 typedef CStaticPairArrayMap<Char, Char> TComplement;
4700 DEFINE_STATIC_ARRAY_MAP(TComplement, sc_Complement, sc_comp_tbl);
4701 
4702 
4703 inline
s_GetComplement(char c)4704 static char s_GetComplement(char c)
4705 {
4706     TComplement::const_iterator comp_it = sc_Complement.find(c);
4707     return (comp_it != sc_Complement.end()) ? comp_it->second : '\0';
4708 }
4709 
4710 
s_GetReverseComplement(const string & sequence)4711 static string s_GetReverseComplement(const string& sequence)
4712 {
4713     string revcomp;
4714     revcomp.reserve(sequence.length());
4715     string::const_reverse_iterator rend = sequence.rend();
4716 
4717     for (string::const_reverse_iterator rit = sequence.rbegin(); rit != rend; ++rit) {
4718         revcomp += s_GetComplement(*rit);
4719     }
4720 
4721     return revcomp;
4722 }
4723 
4724 
AddNucleotidePattern(const string & name,const string & sequence,Int2 cut_site,TSearchFlags flags)4725 void CSeqSearch::AddNucleotidePattern
4726 (const string& name,
4727  const string& sequence,
4728  Int2          cut_site,
4729  TSearchFlags  flags)
4730 {
4731     if (NStr::IsBlank(name)  ||  NStr::IsBlank(sequence)) {
4732         NCBI_THROW(CUtilException, eNoInput, "Empty input value");
4733     }
4734 
4735     // cleanup pattern
4736     string pattern = sequence;
4737     NStr::TruncateSpaces(pattern);
4738     NStr::ToUpper(pattern);
4739 
4740     string revcomp = s_GetReverseComplement(pattern);
4741     bool symmetric = (pattern == revcomp);
4742     ENa_strand strand = symmetric ? eNa_strand_both : eNa_strand_plus;
4743 
4744     // record expansion of entered pattern
4745     x_AddNucleotidePattern(name, pattern, cut_site, strand, flags);
4746 
4747     // record expansion of reverse complement of asymmetric pattern
4748     if (!symmetric  &&  (!x_IsJustTopStrand(flags))) {
4749         TSeqPos revcomp_cut_site = TSeqPos(pattern.length()) - cut_site;
4750         x_AddNucleotidePattern(name, revcomp, revcomp_cut_site,
4751             eNa_strand_minus, flags);
4752     }
4753 }
4754 
4755 
4756 // Program passes each character in turn to finite state machine.
Search(int current_state,char ch,int position,int length)4757 int CSeqSearch::Search
4758 (int  current_state,
4759  char ch,
4760  int  position,
4761  int  length)
4762 {
4763     if (m_Client == NULL) {
4764         return 0;
4765     }
4766 
4767     // on first character, populate state transition table
4768     if (!m_Fsa.IsPrimed()) {
4769         m_Fsa.Prime();
4770     }
4771 
4772     int next_state = m_Fsa.GetNextState(current_state, ch);
4773 
4774     // report matches (if any)
4775     if (m_Fsa.IsMatchFound(next_state)) {
4776         ITERATE(vector<TPatternInfo>, it, m_Fsa.GetMatches(next_state)) {
4777             int start = position - int(it->GetSequence().length()) + 1;
4778 
4779             // prevent multiple reports of patterns for circular sequences.
4780             if (start < length) {
4781                 bool keep_going = m_Client->OnPatternFound(*it, start);
4782                 if (!keep_going) {
4783                     break;
4784                 }
4785             }
4786         }
4787     }
4788 
4789     return next_state;
4790 }
4791 
4792 
4793 // Search entire bioseq.
Search(const CBioseq_Handle & bsh)4794 void CSeqSearch::Search(const CBioseq_Handle& bsh)
4795 {
4796     if (!bsh  ||  m_Client == NULL) {
4797         return;
4798     }
4799 
4800     CSeqVector seq_vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
4801     TSeqPos seq_len = seq_vec.size();
4802     TSeqPos search_len = seq_len;
4803 
4804     // handle circular bioseqs
4805     CSeq_inst::ETopology topology = bsh.GetInst_Topology();
4806     if (topology == CSeq_inst::eTopology_circular) {
4807         search_len += TSeqPos(m_LongestPattern - 1);
4808     }
4809 
4810     int state = m_Fsa.GetInitialState();
4811 
4812     for (TSeqPos i = 0; i < search_len; ++i) {
4813         state = Search(state, seq_vec[i % seq_len], i, seq_len);
4814     }
4815 }
4816 
4817 
4818 // Private:
4819 // ========
4820 
4821 /// translation finite state machine base codes - ncbi4na
4822 enum EBaseCode {
4823     eBase_A = 1,  ///< A
4824     eBase_C,      ///< C
4825     eBase_M,      ///< AC
4826     eBase_G,      ///< G
4827     eBase_R,      ///< AG
4828     eBase_S,      ///< CG
4829     eBase_V,      ///< ACG
4830     eBase_T,      ///< T
4831     eBase_W,      ///< AT
4832     eBase_Y,      ///< CT
4833     eBase_H,      ///< ACT
4834     eBase_K,      ///< GT
4835     eBase_D,      ///< AGT
4836     eBase_B,      ///< CGT
4837     eBase_N       ///< ACGT
4838 };
4839 
4840 /// conversion table from Ncbi4na / Iupacna to EBaseCode
4841 static const EBaseCode sc_CharToEnum[256] = {
4842     // Ncbi4na
4843     eBase_N, eBase_A, eBase_C, eBase_M,
4844     eBase_G, eBase_R, eBase_S, eBase_V,
4845     eBase_T, eBase_W, eBase_Y, eBase_H,
4846     eBase_K, eBase_D, eBase_B, eBase_N,
4847 
4848     eBase_N, eBase_N, eBase_N, eBase_N,
4849     eBase_N, eBase_N, eBase_N, eBase_N,
4850     eBase_N, eBase_N, eBase_N, eBase_N,
4851     eBase_N, eBase_N, eBase_N, eBase_N,
4852     eBase_N, eBase_N, eBase_N, eBase_N,
4853     eBase_N, eBase_N, eBase_N, eBase_N,
4854     eBase_N, eBase_N, eBase_N, eBase_N,
4855     eBase_N, eBase_N, eBase_N, eBase_N,
4856     eBase_N, eBase_N, eBase_N, eBase_N,
4857     eBase_N, eBase_N, eBase_N, eBase_N,
4858     eBase_N, eBase_N, eBase_N, eBase_N,
4859     eBase_N, eBase_N, eBase_N, eBase_N,
4860     // Iupacna (uppercase)
4861     eBase_N, eBase_A, eBase_B, eBase_C,
4862     eBase_D, eBase_N, eBase_N, eBase_G,
4863     eBase_H, eBase_N, eBase_N, eBase_K,
4864     eBase_N, eBase_M, eBase_N, eBase_N,
4865     eBase_N, eBase_N, eBase_R, eBase_S,
4866     eBase_T, eBase_T, eBase_V, eBase_W,
4867     eBase_N, eBase_Y, eBase_N, eBase_N,
4868     eBase_N, eBase_N, eBase_N, eBase_N,
4869     // Iupacna (lowercase)
4870     eBase_N, eBase_A, eBase_B, eBase_C,
4871     eBase_D, eBase_N, eBase_N, eBase_G,
4872     eBase_H, eBase_N, eBase_N, eBase_K,
4873     eBase_N, eBase_M, eBase_N, eBase_N,
4874     eBase_N, eBase_N, eBase_R, eBase_S,
4875     eBase_T, eBase_T, eBase_V, eBase_W,
4876     eBase_N, eBase_Y, eBase_N, eBase_N,
4877 
4878     eBase_N, eBase_N, eBase_N, eBase_N,
4879     eBase_N, eBase_N, eBase_N, eBase_N,
4880     eBase_N, eBase_N, eBase_N, eBase_N,
4881     eBase_N, eBase_N, eBase_N, eBase_N,
4882     eBase_N, eBase_N, eBase_N, eBase_N,
4883     eBase_N, eBase_N, eBase_N, eBase_N,
4884     eBase_N, eBase_N, eBase_N, eBase_N,
4885     eBase_N, eBase_N, eBase_N, eBase_N,
4886     eBase_N, eBase_N, eBase_N, eBase_N,
4887     eBase_N, eBase_N, eBase_N, eBase_N,
4888     eBase_N, eBase_N, eBase_N, eBase_N,
4889     eBase_N, eBase_N, eBase_N, eBase_N,
4890     eBase_N, eBase_N, eBase_N, eBase_N,
4891     eBase_N, eBase_N, eBase_N, eBase_N,
4892     eBase_N, eBase_N, eBase_N, eBase_N,
4893     eBase_N, eBase_N, eBase_N, eBase_N,
4894     eBase_N, eBase_N, eBase_N, eBase_N,
4895     eBase_N, eBase_N, eBase_N, eBase_N,
4896     eBase_N, eBase_N, eBase_N, eBase_N,
4897     eBase_N, eBase_N, eBase_N, eBase_N,
4898     eBase_N, eBase_N, eBase_N, eBase_N,
4899     eBase_N, eBase_N, eBase_N, eBase_N,
4900     eBase_N, eBase_N, eBase_N, eBase_N,
4901     eBase_N, eBase_N, eBase_N, eBase_N,
4902     eBase_N, eBase_N, eBase_N, eBase_N,
4903     eBase_N, eBase_N, eBase_N, eBase_N,
4904     eBase_N, eBase_N, eBase_N, eBase_N,
4905     eBase_N, eBase_N, eBase_N, eBase_N,
4906     eBase_N, eBase_N, eBase_N, eBase_N,
4907     eBase_N, eBase_N, eBase_N, eBase_N,
4908     eBase_N, eBase_N, eBase_N, eBase_N,
4909     eBase_N, eBase_N, eBase_N, eBase_N,
4910     eBase_N, eBase_N, eBase_N, eBase_N
4911 };
4912 
4913 static const char sc_EnumToChar[16] = {
4914     '\0', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'
4915 };
4916 
4917 
x_AddNucleotidePattern(const string & name,string & pattern,Int2 cut_site,ENa_strand strand,TSearchFlags flags)4918 void CSeqSearch::x_AddNucleotidePattern
4919 (const string& name,
4920  string& pattern,
4921  Int2 cut_site,
4922  ENa_strand strand,
4923  TSearchFlags flags)
4924 {
4925     if (pattern.length() > m_LongestPattern) {
4926         m_LongestPattern = pattern.length();
4927     }
4928 
4929     TPatternInfo pat_info(name, kEmptyStr, cut_site);
4930     pat_info.m_Strand = strand;
4931 
4932     if (!x_IsExpandPattern(flags)) {
4933         pat_info.m_Sequence = pattern;
4934         x_AddPattern(pat_info, pattern, flags);
4935     } else {
4936         string buffer;
4937         buffer.reserve(pattern.length());
4938 
4939         x_ExpandPattern(pattern, buffer, 0, pat_info, flags);
4940     }
4941 }
4942 
4943 
x_ExpandPattern(string & sequence,string & buf,size_t pos,TPatternInfo & pat_info,TSearchFlags flags)4944 void CSeqSearch::x_ExpandPattern
4945 (string& sequence,
4946  string& buf,
4947  size_t pos,
4948  TPatternInfo& pat_info,
4949  TSearchFlags flags)
4950 {
4951     static const EBaseCode expansion[] = { eBase_A, eBase_C, eBase_G, eBase_T };
4952 
4953     if (pos < sequence.length()) {
4954         Uint4 code = static_cast<Uint4>(sc_CharToEnum[static_cast<Uint1>(sequence[pos])]);
4955 
4956         for (int i = 0; i < 4; ++i) {
4957             if ((code & expansion[i]) != 0) {
4958                 buf += sc_EnumToChar[expansion[i]];
4959                 x_ExpandPattern(sequence, buf, pos + 1, pat_info, flags);
4960                 buf.erase(pos);
4961             }
4962         }
4963     } else {
4964         // when position reaches pattern length, store one expanded string.
4965         x_AddPattern(pat_info, buf, flags);
4966     }
4967 }
4968 
4969 
x_AddPattern(TPatternInfo & pat_info,string & sequence,TSearchFlags flags)4970 void CSeqSearch::x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags)
4971 {
4972     x_StorePattern(pat_info, sequence);
4973 
4974     if (x_IsAllowMismatch(flags)) {
4975         // put 'N' at every position if a single mismatch is allowed.
4976         char ch = 'N';
4977         NON_CONST_ITERATE (string, it, sequence) {
4978             swap(*it, ch);
4979 
4980             x_StorePattern(pat_info, sequence);
4981 
4982             // restore proper character, go on to put N in next position.
4983             swap(*it, ch);
4984         }
4985     }
4986 }
4987 
4988 
x_StorePattern(TPatternInfo & pat_info,string & sequence)4989 void CSeqSearch::x_StorePattern(TPatternInfo& pat_info, string& sequence)
4990 {
4991     pat_info.m_Sequence = sequence;
4992     m_Fsa.AddWord(sequence, pat_info);
4993 }
4994 
4995 
ReverseComplement(CSeq_inst & inst,CScope * scope)4996 void ReverseComplement(CSeq_inst& inst, CScope* scope)
4997 {
4998     switch (inst.GetRepr()) {
4999         case CSeq_inst::eRepr_raw:
5000             CSeqportUtil::ReverseComplement(&(inst.SetSeq_data()), 0, inst.GetLength());
5001             break;
5002         case CSeq_inst::eRepr_delta:
5003             if (!inst.IsSetExt() || !inst.GetExt().IsDelta()) {
5004                 NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5005                    "Sequence of this type cannot be reverse-complemented.");
5006             }
5007             // reverse order of segments
5008             inst.SetExt().SetDelta().Set().reverse();
5009             // reverse-complement individual segments
5010             NON_CONST_ITERATE(CSeq_inst::TExt::TDelta::Tdata, it, inst.SetExt().SetDelta().Set()) {
5011                 switch ((*it)->Which()) {
5012                     case CDelta_seq::e_Literal:
5013                         if ((*it)->GetLiteral().IsSetSeq_data()) {
5014                             CSeq_literal& lit = (*it)->SetLiteral();
5015                             if (!lit.GetSeq_data().IsGap()) {
5016                                 CSeqportUtil::ReverseComplement(&(lit.SetSeq_data()), 0, lit.GetLength());
5017                             }
5018                         }
5019                         break;
5020                     case CDelta_seq::e_Loc:
5021                         {{
5022                             CRef<CSeq_loc> flip(sequence::SeqLocRevCmpl((*it)->SetLoc(), scope));
5023                             (*it)->SetLoc(*flip);
5024                         }}
5025                         break;
5026                     default:
5027                         // do nothing
5028                         break;
5029                 }
5030             }
5031             break;
5032         default:
5033             NCBI_THROW(CObjmgrUtilException, eBadSequenceType,
5034                 "Sequence of this type cannot be reverse-complemented.");
5035             break;
5036     }
5037 }
5038 
5039 
5040 END_SCOPE(objects)
5041 END_NCBI_SCOPE
5042 
5043