1 /*  $Id: variation_util2.cpp 415910 2013-10-22 16:06:39Z astashya
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Description:
27  *
28  */
29 
30 #include <ncbi_pch.hpp>
31 #include <objects/seqloc/Seq_point.hpp>
32 #include <objects/general/Object_id.hpp>
33 #include <objects/general/Dbtag.hpp>
34 
35 #include <objects/seqalign/Seq_align.hpp>
36 #include <objects/seqalign/Spliced_seg.hpp>
37 #include <objects/seqalign/Spliced_exon.hpp>
38 #include <objects/seqalign/Product_pos.hpp>
39 #include <objects/seqalign/Prot_pos.hpp>
40 #include <objmgr/seq_loc_mapper.hpp>
41 #include <objmgr/feat_ci.hpp>
42 #include <objmgr/align_ci.hpp>
43 
44 
45 #include <objects/seq/seqport_util.hpp>
46 #include <objects/seqblock/GB_block.hpp>
47 
48 #include <objects/seqfeat/Seq_feat.hpp>
49 #include <objects/seqfeat/Genetic_code.hpp>
50 #include <objects/seqfeat/Variation_ref.hpp>
51 #include <objects/seqfeat/Phenotype.hpp>
52 #include <objects/seqfeat/Variation_inst.hpp>
53 #include <objects/seqfeat/VariantProperties.hpp>
54 #include <objects/seqfeat/Delta_item.hpp>
55 #include <objects/seqfeat/Ext_loc.hpp>
56 #include <objects/seqfeat/BioSource.hpp>
57 #include <objects/seqfeat/SubSource.hpp>
58 #include <objects/seqfeat/SeqFeatXref.hpp>
59 
60 #include <objects/pub/Pub_set.hpp>
61 #include <objects/pub/Pub.hpp>
62 #include <objects/biblio/Cit_art.hpp>
63 #include <objects/biblio/Cit_book.hpp>
64 #include <objects/biblio/Cit_jour.hpp>
65 
66 
67 #include <objects/variation/VariationMethod.hpp>
68 #include <objects/variation/VariationException.hpp>
69 
70 #include <objects/seq/Seq_literal.hpp>
71 #include <objects/seq/Seq_data.hpp>
72 #include <objects/seq/Annot_descr.hpp>
73 #include <objects/seq/Annotdesc.hpp>
74 #include <objects/seq/Seq_descr.hpp>
75 #include <objects/seq/Seq_inst.hpp>
76 #include <objects/seq/Seqdesc.hpp>
77 
78 #include <objects/general/User_object.hpp>
79 #include <objects/general/Object_id.hpp>
80 
81 #include <objects/seqloc/Seq_loc_equiv.hpp>
82 
83 #include <serial/iterator.hpp>
84 #include <objmgr/util/sequence.hpp>
85 #include <objmgr/bioseq_handle.hpp>
86 #include <objmgr/seq_vector.hpp>
87 #include <objmgr/util/seq_loc_util.hpp>
88 #include <misc/hgvs/variation_util2.hpp>
89 
90 #include <objects/seqloc/Na_strand.hpp>
91 
92 BEGIN_NCBI_SCOPE
93 
94 namespace variation
95 {
96 
CreateException(const string & message,CVariationException::ECode code=static_cast<CVariationException::ECode> (0))97     static CRef<CVariationException> CreateException(
98         const string& message,
99         CVariationException::ECode code =
100         static_cast<CVariationException::ECode>(0))
101     {
102         CRef<CVariationException> e(new CVariationException);
103         e->SetMessage(message);
104         if(code) {
105             e->SetCode(code);
106         }
107         return e;
108     }
109 
110 
111     template<typename T>
ChangeIdsInPlace(T & container,sequence::EGetIdType id_type,CScope & scope)112     static void ChangeIdsInPlace(
113         T& container,
114         sequence::EGetIdType id_type,
115         CScope& scope)
116     {
117         for(CTypeIterator<CSeq_id> it = Begin(container); it; ++it) {
118             CSeq_id& id = *it;
119             if((id_type == sequence::eGetId_ForceAcc && id.GetTextseq_Id())
120                || (id_type == sequence::eGetId_ForceGi && id.IsGi())) {
121                 continue;
122             }
123 
124             const CSeq_id_Handle idh = sequence::GetId(id, scope, id_type);
125 
126             if(idh) {
127                 id.Assign(*idh.GetSeqId());
128             } else if(id.IsGeneral() || id.IsLocal()) {
129                 ; // e.g. ENST
130             } else {
131                 NCBI_USER_THROW("Can't convert seq-id for " + it->AsFastaString());
132             }
133         }
134     }
135 
s_GetLength(const CVariantPlacement & p,CScope * scope)136     TSeqPos CVariationUtil::s_GetLength(
137         const CVariantPlacement& p,
138         CScope* scope)
139     {
140         int start_offset = p.IsSetStart_offset() ? p.GetStart_offset() : 0;
141 
142         int stop_offset = p.IsSetStop_offset() ? p.GetStop_offset()
143             : p.GetLoc().IsPnt() ? start_offset
144             : 0;
145         return ncbi::sequence::GetLength(p.GetLoc(), scope)
146             + stop_offset
147             - start_offset;
148     }
149 
150     // verify that exon_anchor_pos is represented in biostarts or biostops
151     // depending on the sign of offset_pos
ValidExonTerminal(const set<TSeqPos> & exon_biostarts,const set<TSeqPos> & exon_biostops,TSeqPos exon_anchor_pos,int offset_pos)152     static bool ValidExonTerminal(
153         const set<TSeqPos>& exon_biostarts,
154         const set<TSeqPos>& exon_biostops,
155         TSeqPos exon_anchor_pos,
156         int offset_pos)
157     {
158         // VAR-1379, VAR-1500
159         // Note: offset=0 may appear in cases like NM_000535.5:c.1145-?_2174+?del
160         // We don't know here whether we're dealing with lt or gt, so will optimistically
161         // allow if we find the anchor in either starts or stops.
162         bool found_in_starts = exon_biostarts.find(exon_anchor_pos)
163             != exon_biostarts.end();
164 
165         bool found_in_stops = exon_biostops.find(exon_anchor_pos)
166             != exon_biostops.end();
167 
168         return (offset_pos < 0 && found_in_starts)
169             || (offset_pos  > 0 && found_in_stops)
170             || (offset_pos == 0 && (found_in_starts || found_in_stops));
171     }
172 
173 
GetFuzzSign(const CInt_fuzz & fuzz,int loc_sign)174     static int GetFuzzSign(const CInt_fuzz& fuzz, int loc_sign)
175     {
176         return !fuzz.IsLim() ? 0
177             : fuzz.GetLim() == CInt_fuzz::eLim_lt ? -1
178             : fuzz.GetLim() == CInt_fuzz::eLim_tl ? -1 * loc_sign
179             : fuzz.GetLim() == CInt_fuzz::eLim_gt ? 1
180             : fuzz.GetLim() == CInt_fuzz::eLim_tr ? 1 * loc_sign
181             : 0;
182     }
183 
184 
ValidExonTerminals(const set<TSeqPos> & exon_biostarts,const set<TSeqPos> & exon_biostops,const CVariantPlacement & p)185     static bool ValidExonTerminals(
186         const set<TSeqPos>& exon_biostarts,
187         const set<TSeqPos>& exon_biostops,
188         const CVariantPlacement& p)
189     {
190         int sign = sequence::GetStrand(p.GetLoc(), NULL) == eNa_strand_minus ? -1 : 1;
191 
192         const int start_offset_sign = //VAR-1500
193             p.IsSetStart_offset()
194             && p.GetStart_offset() > 0 ? 1
195 
196             : p.IsSetStart_offset()
197             && p.GetStart_offset() < 0 ? -1
198 
199             : p.IsSetStart_offset_fuzz() ? GetFuzzSign(p.GetStart_offset_fuzz(), sign)
200 
201             : 0;
202 
203         const int stop_offset_sign =
204             p.IsSetStop_offset()
205             && p.GetStop_offset() > 0 ? 1
206 
207             : p.IsSetStop_offset()
208             && p.GetStop_offset() < 0 ? -1
209 
210             : p.IsSetStop_offset_fuzz() ? GetFuzzSign(p.GetStop_offset_fuzz(), sign)
211 
212             : 0;
213 
214         // if offset's sign is consistent with location's strand, we expect to find
215         // the anchor in exon-stops, otherwise in exon-starts. //VAR-1379
216         const bool start_ok =
217             (!p.IsSetStart_offset() && !p.IsSetStart_offset_fuzz())
218             || ValidExonTerminal(
219             exon_biostarts,
220             exon_biostops,
221             sequence::GetStart(p.GetLoc(), NULL, eExtreme_Biological),
222             start_offset_sign * sign);
223 
224         const bool stop_ok =
225             (!p.IsSetStop_offset() && !p.IsSetStop_offset_fuzz())
226             || ValidExonTerminal(
227             exon_biostarts,
228             exon_biostops,
229             sequence::GetStop(p.GetLoc(), NULL, eExtreme_Biological),
230             stop_offset_sign * sign);
231 
232         return start_ok && stop_ok;
233     }
234 
235 
CheckExonBoundary(const CVariantPlacement & p)236     CVariationUtil::ETestStatus CVariationUtil::CheckExonBoundary(const CVariantPlacement& p)
237     {
238         const CSeq_id* id = p.GetLoc().GetId();
239         if(!id
240            || !(p.IsSetStart_offset() || p.IsSetStop_offset())
241            || !(p.IsSetMol() && (p.GetMol() == CVariantPlacement::eMol_cdna
242            || p.GetMol() == CVariantPlacement::eMol_rna))) {
243             return eNotApplicable;
244         }
245 
246         SAnnotSelector sel;
247         sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon);
248         CBioseq_Handle bsh = m_scope->GetBioseqHandle(*id);
249 
250         set<TSeqPos> exon_biostarts, exon_biostops;
251         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
252             const CMappedFeat& mf = *ci;
253             exon_biostarts.insert(sequence::GetStart(mf.GetLocation(), NULL));
254             exon_biostops.insert(sequence::GetStop(mf.GetLocation(), NULL));
255         }
256 
257         return exon_biostarts.empty() && exon_biostops.empty() ? eNotApplicable
258             : ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass
259             : eFail;
260     }
261 
CheckExonBoundary(const CVariantPlacement & p,const CSeq_align & aln)262     CVariationUtil::ETestStatus CVariationUtil::CheckExonBoundary(
263         const CVariantPlacement& p,
264         const CSeq_align& aln)
265     {
266         const CSeq_id* id = p.GetLoc().GetId();
267         if(!aln.GetSegs().IsSpliced()
268            || !id
269            || !sequence::IsSameBioseq(aln.GetSeq_id(0), *id, m_scope)
270            || !(p.IsSetStart_offset() || p.IsSetStop_offset())
271            || (p.IsSetMol() && p.GetMol() == CVariantPlacement::eMol_genomic)
272            ) {
273             return eNotApplicable;
274         }
275 
276         set<TSeqPos> exon_biostarts, exon_biostops;
277         ITERATE(CSpliced_seg::TExons, it, aln.GetSegs().GetSpliced().GetExons())
278         {
279             const CSpliced_exon& exon = **it;
280             exon_biostarts.insert(exon.GetProduct_start().IsNucpos() ?
281                                   exon.GetProduct_start().GetNucpos() :
282                                   exon.GetProduct_start().GetProtpos().GetAmin());
283 
284             exon_biostops.insert(exon.GetProduct_end().IsNucpos() ?
285                                  exon.GetProduct_end().GetNucpos() :
286                                  exon.GetProduct_end().GetProtpos().GetAmin());
287         }
288 
289         return ValidExonTerminals(exon_biostarts, exon_biostops, p) ? ePass : eFail;
290     }
291 
SwapLtGtFuzz(CInt_fuzz & fuzz)292     static void SwapLtGtFuzz(CInt_fuzz& fuzz)
293     {
294         if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_gt) {
295             fuzz.SetLim(CInt_fuzz::eLim_lt);
296         } else if(fuzz.IsLim() && fuzz.GetLim() == CInt_fuzz::eLim_lt) {
297             fuzz.SetLim(CInt_fuzz::eLim_gt);
298         }
299     }
300 
301 
302     //loc is presumed Int-loc
ApplyOffsetFuzz(CSeq_loc & loc,const CInt_fuzz & offset_fuzz,bool is_start)303     static void ApplyOffsetFuzz(
304         CSeq_loc& loc,
305         const CInt_fuzz& offset_fuzz,
306         bool is_start)
307     {
308         bool minus_strand = loc.GetStrand() == eNa_strand_minus;
309         CInt_fuzz& fuzz = (minus_strand == is_start) ? loc.SetInt().SetFuzz_to()
310             : loc.SetInt().SetFuzz_from();
311         if(!fuzz.Which()) { //will not apply this fuzz if mapping itself was fuzzy
312             fuzz.Assign(offset_fuzz);
313             if(minus_strand) {
314                 SwapLtGtFuzz(fuzz);
315             }
316             if(fuzz.IsRange()) {
317                 //fuzz range coordinates are invalid after remapping
318                 fuzz.SetLim(CInt_fuzz::eLim_unk);
319             }
320         }
321     }
322 
323 
s_ResolveIntronicOffsets(CVariantPlacement & p)324     void CVariationUtil::s_ResolveIntronicOffsets(CVariantPlacement& p)
325     {
326         if(!p.IsSetStart_offset()
327            && !p.IsSetStop_offset()) {
328             return;
329         }
330 
331         if(p.GetLoc().IsPnt()
332            && !p.IsSetStop_offset()
333            && p.IsSetStart_offset()) {
334             //In case of point, only start-offset may be specified. In this case
335             //temporarily set stop-offset to be the same (it will be reset below),
336             //such that start and stop of point-as-interval are adjusted consistently.
337             p.SetStop_offset(p.GetStart_offset());
338         }
339 
340         //Convert to interval.
341         //Need to do for a point so that we can deal with start and stop independently.
342         //Another scenario is when start/stop in CDNA coords lie in different exons, so
343         //the mapping is across one or more intron - we want to collapse the genomic loc
344         //(do we want to preselve the gaps caused by indels rather than introns?)
345         CRef<CSeq_loc> loc =
346             sequence::Seq_loc_Merge(
347             p.GetLoc(),
348             CSeq_loc::fMerge_SingleRange,
349             NULL);
350 
351         if(!loc->IsInt()
352            && (p.IsSetStart_offset() || p.IsSetStop_offset())) {
353             NcbiCerr << MSerial_AsnText << p;
354             NCBI_THROW(CException, eUnknown, "Complex location");
355         }
356 
357         bool minus_strand = loc->GetStrand() == eNa_strand_minus;
358 
359         if(p.IsSetStart_offset()) {
360             TSeqPos& bio_start_ref = minus_strand ? loc->SetInt().SetTo()
361                 : loc->SetInt().SetFrom();
362 
363             bio_start_ref += p.GetStart_offset() * (minus_strand ? -1 : 1);
364             p.ResetStart_offset();
365         }
366 
367         if(p.IsSetStop_offset()) {
368             TSeqPos& bio_stop_ref =
369                 loc->GetStrand() == eNa_strand_minus ? loc->SetInt().SetFrom()
370                 : loc->SetInt().SetTo();
371 
372             bio_stop_ref += p.GetStop_offset() * (minus_strand ? -1 : 1);
373             p.ResetStop_offset();
374         }
375 
376         if(p.IsSetStart_offset_fuzz()) {
377             ApplyOffsetFuzz(*loc, p.GetStart_offset_fuzz(), true);
378             p.ResetStart_offset_fuzz();
379         }
380 
381         if(p.IsSetStop_offset_fuzz()) {
382             ApplyOffsetFuzz(*loc, p.GetStop_offset_fuzz(), false);
383             p.ResetStop_offset_fuzz();
384         }
385 
386         if(loc->GetTotalRange().GetLength() == 1) {
387             loc = sequence::Seq_loc_Merge(*loc, CSeq_loc::fSortAndMerge_All, NULL);
388         }
389 
390         p.SetLoc().Assign(*loc);
391     }
392 
393 
s_AddIntronicOffsets(CVariantPlacement & p,const CSpliced_seg & ss,CScope * scope)394     void CVariationUtil::s_AddIntronicOffsets(
395         CVariantPlacement& p,
396         const CSpliced_seg& ss,
397         CScope* scope)
398     {
399 #if 0
400         if(!p.GetLoc().IsPnt() && !p.GetLoc().IsInt()) {
401             NCBI_THROW(CException, eUnknown, "Expected simple loc (int or pnt)");
402         }
403 #endif
404 
405         if(p.IsSetStart_offset()
406            || p.IsSetStop_offset()
407            || p.IsSetStart_offset_fuzz()
408            || p.IsSetStop_offset_fuzz()) {
409             NCBI_THROW(CException, eUnknown, "Expected offset-free placement");
410         }
411 
412         if(!p.GetLoc().GetId()
413            || !sequence::IsSameBioseq(
414            *p.GetLoc().GetId(),
415            ss.GetGenomic_id(),
416            scope)) {
417             NCBI_THROW(CException, eUnknown,
418                        "Expected genomic_id in the variation to be the same as in spliced-seg");
419         }
420 
421         long start = p.GetLoc().GetStart(eExtreme_Positional);
422         long stop = p.GetLoc().GetStop(eExtreme_Positional);
423 
424         long closest_start = ss.GetExons().front()->GetGenomic_start();
425         // closest-exon-boundary for bio-start of variation location
426 
427         long closest_stop = ss.GetExons().front()->GetGenomic_start();
428         // closest-exon-boundary for bio-stop of variation location
429 
430         ITERATE(CSpliced_seg::TExons, it, ss.GetExons())
431         {
432             const CSpliced_exon& se = **it;
433 
434             if(se.GetGenomic_end() >= start && se.GetGenomic_start() <= start) {
435                 closest_start = start; //start is within exon - use itself.
436             } else {
437                 if(abs((long)se.GetGenomic_end() - start) < abs(closest_start - start)) {
438                     closest_start = (long)se.GetGenomic_end();
439                 }
440                 if(abs((long)se.GetGenomic_start() - start) < abs(closest_start - start)) {
441                     closest_start = (long)se.GetGenomic_start();
442                 }
443             }
444 
445             if(se.GetGenomic_end() >= stop && se.GetGenomic_start() <= stop) {
446                 closest_stop = stop; //end is within exon - use itself.
447             } else {
448                 if(abs((long)se.GetGenomic_end() - stop) < abs(closest_stop - stop)) {
449                     closest_stop = (long)se.GetGenomic_end();
450                 }
451                 if(abs((long)se.GetGenomic_start() - stop) < abs(closest_stop - stop)) {
452                     closest_stop = (long)se.GetGenomic_start();
453                 }
454             }
455         }
456 
457         //convert to int, so we can set start and stop
458         CRef<CSeq_loc> int_loc = sequence::Seq_loc_Merge(p.GetLoc(), CSeq_loc::fMerge_SingleRange, NULL);
459 
460         //adjust location
461         if(start != closest_start || stop != closest_stop) {
462             int_loc->SetInt().SetFrom(closest_start);
463             int_loc->SetInt().SetTo(closest_stop);
464         }
465         p.SetLoc(*int_loc);
466 
467         //Add offsets. Note that start-offset and stop-offset are always biological.
468         //(e.g. start-offset applies to bio-start; value > 0 means downstream, and value <0 means upstream)
469 
470         //Note: qry/sbj strandedness in the alignment is irrelevent - we're simply adjusting the
471         //location in the genomic coordinate system, conserving the strand
472         if(start != closest_start) {
473             long offset = (start - closest_start);
474             if(p.GetLoc().GetStrand() == eNa_strand_minus) {
475                 p.SetStop_offset(offset * -1);
476             } else {
477                 p.SetStart_offset(offset);
478             }
479         }
480 
481         if(stop != closest_stop) {
482             long offset = (stop - closest_stop);
483             if(p.GetLoc().GetStrand() == eNa_strand_minus) {
484                 p.SetStart_offset(offset * -1);
485             } else {
486                 p.SetStop_offset(offset);
487             }
488         }
489     }
490 
491 
Remap(const CVariantPlacement & p,const CSeq_align & aln,bool check_placements)492     CRef<CVariantPlacement> CVariationUtil::Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements)
493     {
494         if(!p.GetLoc().GetId()) {
495             NCBI_THROW(CException, eUnknown, "Can't get seq-id");
496         }
497 
498         //note: the operations here are scope-free (i.e. seq-id types in aln must be consistent with p,
499         //or else will throw; using a scope will impose major slowdown, which is problematic if we're
500         //dealing with tens or hundreds of thousands of remappings.
501 
502         CRef<CVariantPlacement> p2(new CVariantPlacement);
503         p2->Assign(p);
504 
505         if(aln.GetSegs().IsSpliced()
506            && sequence::IsSameBioseq(aln.GetSegs().GetSpliced().GetGenomic_id(),
507            *p2->GetLoc().GetId(), NULL)) {
508             s_AddIntronicOffsets(*p2, aln.GetSegs().GetSpliced(), NULL);
509         }
510 
511         CSeq_align::TDim target_row = -1;
512         for(int i = 0; i < 2; i++) {
513             if(sequence::IsSameBioseq(*p2->GetLoc().GetId(), aln.GetSeq_id(i), NULL)) {
514                 target_row = 1 - i;
515             }
516         }
517 
518         if(target_row == -1) {
519             NCBI_THROW(CException,
520                        eUnknown,
521                        "The alignment has no row for seq-id "
522                        + p2->GetLoc().GetId()->AsFastaString());
523         }
524 
525         CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(aln, target_row, NULL));
526         mapper->SetMergeAll();
527 
528         CRef<CVariantPlacement> p3 = x_Remap(*p2, *mapper);
529 
530         if(p3->GetLoc().GetId()
531            && aln.GetSegs().IsSpliced()
532            && sequence::IsSameBioseq(aln.GetSegs().GetSpliced().GetGenomic_id(),
533            *p3->GetLoc().GetId(), NULL)) {
534             if(CheckExonBoundary(p, aln) == eFail) {
535 
536                 bool source_loc_is_projected =
537                     p.IsSetPlacement_method()
538                     && p.GetPlacement_method() == CVariantPlacement::ePlacement_method_projected;
539                 // checkig exon-boundary only for non-projected cases, since an exon-boundary
540                 // may become non-exon-boundary (e.g. transcript extended or exon boundary shifted).
541                 // VAR-1309
542 
543                 CVariationException::ECode code =
544                     source_loc_is_projected ? CVariationException::eCode_hgvs_exon_boundary_induced
545                     : CVariationException::eCode_hgvs_exon_boundary;
546 
547                 const string msg =
548                     "HGVS exon-boundary position not found in alignment of "
549                     + aln.GetSeq_id(0).AsFastaString()
550                     + "-vs-"
551                     + aln.GetSeq_id(1).AsFastaString();
552 
553                 p3->SetExceptions().push_back(CreateException(msg, code));
554             }
555 
556             s_ResolveIntronicOffsets(*p3);
557         }
558 
559         //note: AttachSeq happens only after the intronic offsets are resolved.
560         AttachSeq(*p3);
561 
562         // Note: as per SNP-5641 - will check for mismatches
563         // even in the presence of other more severe exceptions
564         // NcbiCerr << "Checking for mismatches-in-mapping"
565         //          << MSerial_AsnText << p << MSerial_AsnText << *p3 << "\n\n";
566         if(p.IsSetSeq() && p3->IsSetSeq()
567            && p.GetSeq().GetLength() == p3->GetSeq().GetLength()
568            && p.GetSeq().IsSetSeq_data() && p3->GetSeq().IsSetSeq_data()
569            && p.GetSeq().GetSeq_data().Which() == p3->GetSeq().GetSeq_data().Which()
570            && !p.GetSeq().GetSeq_data().Equals(p3->GetSeq().GetSeq_data())) {
571             p3->SetExceptions().push_back(CreateException(
572                 "Mismatches in mapping",
573                 CVariationException::eCode_mismatches_in_mapping));
574         }
575 
576         //VAR-1307
577     {{
578             static const long thr = 5000;
579             long start_offset = !p2->IsSetStart_offset() ? 0 : p2->GetStart_offset();
580             long stop_offset = !p2->IsSetStop_offset() ? 0 : p2->GetStop_offset();
581             bool far_start = start_offset + thr < 0
582                 && (p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStart(1)
583                 || p2->GetLoc().GetStart(eExtreme_Biological) == aln.GetSeqStop(1));
584 
585             bool far_stop = stop_offset > thr
586                 && (p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStart(1)
587                 || p2->GetLoc().GetStop(eExtreme_Biological) == aln.GetSeqStop(1));
588 
589             if(far_start || far_stop) {
590                 p3->SetExceptions().push_back(CreateException(
591                     "Source location overhangs the alignment by at least 5kb ",
592                     CVariationException::eCode_source_location_overhang));
593             }
594         }}
595 
596     if(check_placements) {
597         CheckPlacement(*p3);
598     }
599     return p3;
600     }
601 
602 
CreateSplicedSeqAlignFromFeat(const CSeq_feat & rna_feat)603     static CRef<CSeq_align> CreateSplicedSeqAlignFromFeat(const CSeq_feat& rna_feat)
604     {
605         CRef<CSeq_align> aln(new CSeq_align);
606         aln->SetType(CSeq_align::eType_other);
607         CSpliced_seg& ss = aln->SetSegs().SetSpliced();
608         ss.SetProduct_id().Assign(sequence::GetId(rna_feat.GetProduct(), NULL));
609         ss.SetGenomic_id().Assign(sequence::GetId(rna_feat.GetLocation(), NULL));
610         ss.SetExons();
611         ss.SetProduct_type(CSpliced_seg::eProduct_type_transcript);
612         ss.SetProduct_strand(eNa_strand_plus);
613         ss.SetGenomic_strand(sequence::GetStrand(rna_feat.GetLocation()));
614 
615         TSignedSeqPos product_pos(0);
616         for(CSeq_loc_CI ci(rna_feat.GetLocation(), CSeq_loc_CI::eEmpty_Skip, CSeq_loc_CI::eOrder_Biological); ci; ++ci) {
617             CRef<CSpliced_exon> se(new CSpliced_exon);
618             se->SetGenomic_start(ci.GetRange().GetFrom());
619             se->SetGenomic_end(ci.GetRange().GetTo());
620             se->SetProduct_start().SetNucpos(product_pos);
621             se->SetProduct_end().SetNucpos(product_pos + ci.GetRange().GetLength() - 1);
622             product_pos += ci.GetRange().GetLength();
623             ss.SetExons().push_back(se);
624         }
625 
626         return aln;
627     }
628 
629 
630     //todo: 1. what if we don't have the requested version of the product annotated?
631     //      2. do we need to support NGs, etc (will need to search all alignments on sequnece, as we can't find NG by product-id)
632     //      3. Do we want to return VariantPlacement instead, such that we can communicate auxiliary info?
RemapToAnnotatedTarget(const CVariation & v,const CSeq_id & target)633     CRef<CVariantPlacement> CVariationUtil::RemapToAnnotatedTarget(
634         const CVariation& v,
635         const CSeq_id& target)
636     {
637         CRef<CVariantPlacement> result(new CVariantPlacement());
638         result->SetLoc().SetNull();
639         result->SetMol(CVariantPlacement::eMol_unknown);
640         CBioseq_Handle bsh = m_scope->GetBioseqHandle(target);
641         CRef<CSeq_loc_Mapper> upmapper;
642 
643         if(v.IsSetPlacements()) {
644             //First see if we already have location on target in one of the placements
645             ITERATE(CVariation::TPlacements, it, v.GetPlacements())
646             {
647                 const CVariantPlacement& p = **it;
648                 if(p.GetLoc().GetId() && sequence::IsSameBioseq(target, *p.GetLoc().GetId(), m_scope)) {
649                     result->Assign(p);
650                     break;
651                 }
652             }
653 
654             if(result->GetLoc().IsNull()) {
655                 //did not find an existing placement for the target; will try to remap
656                 ITERATE(CVariation::TPlacements, it, v.GetPlacements())
657                 {
658                     const CVariantPlacement& placement = **it;
659 
660                     if(!placement.GetLoc().GetId()) {
661                         continue;
662                     }
663                     CSeq_id_Handle p_idh = sequence::GetId(*placement.GetLoc().GetId(),
664                                                            *m_scope,
665                                                            sequence::eGetId_ForceAcc);
666 
667                     bool is_scaffold = p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_contig
668                         || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_contig_ncbo
669                         || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_wgs_nuc
670                         || p_idh.IdentifyAccession() == CSeq_id::eAcc_refseq_wgs_intermed;
671 
672                     if(placement.GetMol() == CVariantPlacement::eMol_protein) {
673 
674                         CRef<CVariation> precursor = InferNAfromAA(v);
675                         result = RemapToAnnotatedTarget(*precursor, target);
676 
677                     } else if(placement.GetMol() == CVariantPlacement::eMol_genomic
678                               && is_scaffold) {
679                         // location is possibly a component of the target
680                         if(!upmapper) {
681                             upmapper.Reset(new CSeq_loc_Mapper(1, bsh, CSeq_loc_Mapper::eSeqMap_Up));
682                         }
683                         result = Remap(placement, *upmapper);
684                         ChangeIdsInPlace(*result, sequence::eGetId_ForceAcc, *m_scope);
685 
686                     } else {
687 
688                         // the annotation is gi-based, so we'll make sure the target
689                         // is gi-based so we won't need to do conversion for all products while searching.
690                         CSeq_id_Handle product_idh = sequence::GetId(*placement.GetLoc().GetId(),
691                                                                      *m_scope,
692                                                                      sequence::eGetId_ForceGi);
693                         if(!product_idh) {
694                             continue;
695                         }
696 
697                         // note that here we are looking for annotated (packaged) alignment at the
698                         // feature's location first (we could have searched the whole sequence instead,
699                         // but it is quite slow for NCs) If we have the feature but no alignment,
700                         // it is assumed that the alignment is perfect, and a synthetic seq-align
701                         // is created based on RNA feature.
702                         CRef<CSeq_align> aln;
703                         {{
704                                 CRef<CSeq_loc> target_loc = bsh.GetRangeSeq_loc(0, 0); //this returns "whole"
705                                 //will try to constrain target_loc from whole to product location only
706 
707                                 SAnnotSelector sel;
708                                 sel.SetResolveAll();
709                                 sel.SetAdaptiveDepth(true);
710                                 sel.IncludeFeatType(CSeqFeatData::e_Rna);
711 
712                                 for(CFeat_CI ci(bsh, sel); ci; ++ci) {
713                                     const CMappedFeat& mf = *ci;
714                                     if(!mf.IsSetProduct()) {
715                                         continue;
716                                     }
717                                     const CSeq_id& product_id = sequence::GetId(mf.GetProduct(), NULL);
718                                     if(product_id.Equals(*product_idh.GetSeqId())) {
719                                         target_loc->Assign(mf.GetLocation());
720                                         aln = CreateSplicedSeqAlignFromFeat(mf.GetMappedFeature());
721                                         break;
722                                     }
723                                 }
724 
725                                 //try to find explicit seq_align at the target-loc (if not found, will use the synthetic one)
726                                 for(CAlign_CI ci2(*m_scope, *target_loc, sel); ci2; ++ci2) {
727                                     const CSeq_align& current_aln = *ci2;
728                                     if(current_aln.GetSeq_id(0).Equals(*product_idh.GetSeqId())) {
729                                         aln = SerialClone<CSeq_align>(current_aln);
730                                     }
731                                 }
732                             }}
733 
734                         if(aln) {
735                             CRef<CVariantPlacement> p2(SerialClone<CVariantPlacement>(placement));
736                             ChangeIdsInPlace(*p2, sequence::eGetId_ForceAcc, *m_scope);
737                             ChangeIdsInPlace(*aln, sequence::eGetId_ForceAcc, *m_scope);
738                             result = Remap(*p2, *aln);
739                         }
740 
741                     }
742                     if(!result->GetLoc().IsNull()) {
743                         break;
744                     }
745                 }
746             }
747         } else if(v.GetData().IsSet()) { //no placements at this level - try in sub-variations
748             ITERATE(CVariation::TData::TSet::TVariations, it, v.GetData().GetSet().GetVariations())
749             {
750                 const CVariation& v2 = **it;
751                 CRef<CVariantPlacement> mapped_placement = RemapToAnnotatedTarget(v2, target);
752                 if(!result->GetLoc().IsNull()) { //already have somthing - append
753                     result->SetLoc().Add(mapped_placement->GetLoc());
754                 } else {
755                     result->Assign(*mapped_placement);
756                 }
757             }
758         }
759 
760         CRef<CSeq_loc> loc = sequence::Seq_loc_Merge(result->GetLoc(), CSeq_loc::fSortAndMerge_All, NULL);
761         result->SetLoc().Assign(*loc);
762         return result;
763     }
764 
765 
766 
767     template<typename T>
ContainsIupacNaAmbiguities(const T & obj)768     static bool ContainsIupacNaAmbiguities(const T& obj)
769     {
770         for(CTypeConstIterator<CSeq_data> it(Begin(obj)); it; ++it) {
771             const CSeq_data& sd = *it;
772             if(!sd.IsIupacna()) {
773                 continue;
774             }
775             ITERATE(string, it2, sd.GetIupacna().Get())
776             {
777                 char c = *it2;
778                 if(c != 'A' &&  c != 'C' && c != 'G' && c != 'T') {
779                     return true;
780                 }
781             }
782         }
783         return false;
784     }
785 
CheckAmbiguitiesInLiterals(CVariation & v)786     bool CVariationUtil::CheckAmbiguitiesInLiterals(CVariation& v)
787     {
788         bool had_ambiguities = false;
789 
790         if(v.IsSetPlacements() && v.GetPlacements().size() > 0) {
791             if(ContainsIupacNaAmbiguities(v.GetData())) {
792                 had_ambiguities = true;
793                 v.SetPlacements().front()->SetExceptions().push_back(CreateException("Ambiguous residues in variation", CVariationException::eCode_ambiguous_sequence));
794             }
795         }
796 
797         if(v.GetData().IsSet()) {
798             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, v.SetData().SetSet().SetVariations())
799             {
800                 CVariation& v2 = **it;
801                 had_ambiguities = had_ambiguities || CheckAmbiguitiesInLiterals(v2);
802             }
803         }
804         return had_ambiguities;
805     }
806 
CheckPlacement(CVariantPlacement & p)807     bool CVariationUtil::CheckPlacement(CVariantPlacement& p)
808     {
809         bool invalid_location = false;
810         bool out_of_order = false;
811 
812         if(sequence::SeqLocCheck(p.GetLoc(), m_scope) == sequence::eSeqLocCheck_error) {
813             invalid_location = true;
814         }
815 
816         if(p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) {
817             if(sequence::GetStart(p.GetLoc(), NULL) > sequence::GetStop(p.GetLoc(), NULL)) {
818                 out_of_order = true;
819             }
820 
821 #if 0
822             //Note: not comparing offsets, as it may be legitimately out-of-order. JIRA:SNP-5238
823             if(p.IsSetStart_offset() && p.IsSetStop_offset() && sequence::GetLength(p.GetLoc(), NULL) == 1) {
824                 if(p.GetStart_offset() > p.GetStop_offset()) {
825                     //note: not checking for strand, as offsets are always in the order of transcription
826                     out_of_order = true;
827                     invalid_location = true;
828                 }
829             }
830 #endif
831         }
832 
833         if(invalid_location) {
834             p.SetExceptions().push_back(CreateException(
835                 out_of_order ? "Invalid location - start and stop are out of order"
836                 : "Invalid location",
837                 CVariationException::eCode_hgvs_parsing));
838         }
839 
840         if(p.GetLoc().GetId()) {
841 
842             CBioseq_Handle bsh =
843                 m_scope->GetBioseqHandle(
844                 *p.GetLoc().GetId());
845 
846             if(bsh && bsh.GetState() != 0
847                && bsh.GetState() != CBioseq_Handle::fState_dead) { //dead is OK, beacuse supporting old versions
848                 p.SetExceptions().push_back(
849                     CreateException(
850                     "Bioseq is suppressed or withdrawn",
851                     CVariationException::eCode_bioseq_state));
852             }
853         }
854 
855         return invalid_location;
856     }
857 
858 
Remap(const CVariantPlacement & p,CSeq_loc_Mapper & mapper)859     CRef<CVariantPlacement> CVariationUtil::Remap(
860         const CVariantPlacement& p,
861         CSeq_loc_Mapper& mapper)
862     {
863         CRef<CVariantPlacement> p2 = x_Remap(p, mapper);
864 
865         if(((p.IsSetStart_offset()
866             || p.IsSetStop_offset())
867             && p2->GetMol() == CVariantPlacement::eMol_genomic)
868             || (p.GetMol() == CVariantPlacement::eMol_genomic
869             && p2->GetMol() != CVariantPlacement::eMol_genomic
870             && p2->GetMol() != CVariantPlacement::eMol_unknown) /*in case remapped to nothing*/) {
871             NcbiCerr << "Mapped: " << MSerial_AsnText << *p2;
872             //When mapping an offset-placement to a genomic placement, may need to resolve offsets.
873             //or, when mapping from genomic to a product coordinates, may need to add offsets. In above cases
874             //we need to use the seq-align-based mapping.
875             NCBI_USER_THROW(
876                 "Cannot use Mapper-based method to remap intronic cases;"
877                 "must remap via spliced-seg alignment instead.");
878         }
879 
880         AttachSeq(*p2);
881 
882         if(p.IsSetSeq() && p2->IsSetSeq()
883            && p.GetSeq().GetLength() == p2->GetSeq().GetLength()
884            && p.GetSeq().IsSetSeq_data() && p2->GetSeq().IsSetSeq_data()
885            && p.GetSeq().GetSeq_data().Which() == p2->GetSeq().GetSeq_data().Which()
886            && !p.GetSeq().GetSeq_data().Equals(p2->GetSeq().GetSeq_data())) {
887             p2->SetExceptions().push_back(
888                 CreateException(
889                 "Mismatches in mapping",
890                 CVariationException::eCode_mismatches_in_mapping));
891         }
892 
893         CheckPlacement(*p2);
894         return p2;
895     }
896 
897 
x_Remap(const CVariantPlacement & p,CSeq_loc_Mapper & mapper)898     CRef<CVariantPlacement> CVariationUtil::x_Remap(
899         const CVariantPlacement& p,
900         CSeq_loc_Mapper& mapper)
901     {
902         CRef<CVariantPlacement> p2(new CVariantPlacement);
903 
904 
905         if(p.IsSetStart_offset()) {
906             p2->SetStart_offset() = p.GetStart_offset();
907         }
908         if(p.IsSetStop_offset()) {
909             p2->SetStop_offset() = p.GetStop_offset();
910         }
911         if(p.IsSetStart_offset_fuzz()) {
912             p2->SetStart_offset_fuzz().Assign(p.GetStart_offset_fuzz());
913         }
914         if(p.IsSetStop_offset_fuzz()) {
915             p2->SetStop_offset_fuzz().Assign(p.GetStop_offset_fuzz());
916         }
917 
918         CRef<CSeq_loc> mapped_loc = mapper.Map(p.GetLoc());
919 
920         {{
921                 // If we have offsets, then the distinction
922                 // between point and one-point interval is important, e.g.
923                 // NM_000155.3:c.-116-3_-116 - the location is an interval,
924                 // but the anchor point is the same; we need to
925                 // keep it as interval, as if we represent it as a point,
926                 // then the corresponding HGVS is also a point: NM_000155.3:c.-116-3
927                 const bool equal_offsets = (
928                     !p2->IsSetStart_offset()
929                     && !p2->IsSetStop_offset())
930 
931                     || (p2->IsSetStart_offset()
932                     && p2->IsSetStop_offset()
933 
934                     && p2->GetStart_offset()
935                     == p2->GetStop_offset());
936 
937                 const bool merge_single_range =
938                     p.GetLoc().IsInt()
939                     && mapped_loc->IsPnt()
940                     && !equal_offsets;
941 
942                 if(mapped_loc->IsInt()
943                    && mapped_loc->GetInt().IsSetFuzz_from()
944                    && mapped_loc->GetInt().IsSetFuzz_to()
945                    && sequence::GetLength(*mapped_loc, NULL) == 1) {
946                     // workaround for CXX-4376. single-nt locations will
947                     // not be collapsed to a point if both From and To fuzz is present
948                     mapped_loc->SetInt().ResetFuzz_to();
949                 }
950 
951                 mapped_loc = sequence::Seq_loc_Merge(
952                     *mapped_loc,
953                     merge_single_range ? CSeq_loc::fMerge_SingleRange
954                     : CSeq_loc::fSortAndMerge_All,
955                     NULL);
956             }}
957 
958 #if 1
959         //SNP-5148
960         if(mapped_loc->IsNull() && p.GetLoc().GetId() && !p.GetLoc().IsEmpty()) {
961             //If mapped to nothing, expand the loc and try again. The purpose is to know the seq-id of the
962             //neighborhood that we couldn't map to, so we can produce and Empty seq-loc that contains
963             //a seq-id, reporting that we tried to map to this sequence, but there's no mapping
964             CRef<CSeq_loc> loc1 = sequence::Seq_loc_Merge(p.GetLoc(), CSeq_loc::fMerge_SingleRange, NULL);
965             loc1->SetInt().SetFrom() = loc1->GetInt().GetFrom() < 500 ? 0 : loc1->SetInt().GetFrom() - 500;
966             loc1->SetInt().SetTo() += 500;
967             CRef<CSeq_loc> tmp_mapped_loc = mapper.Map(*loc1);
968 
969             if(tmp_mapped_loc->GetId()) {
970                 mapped_loc->SetEmpty().Assign(*tmp_mapped_loc->GetId());
971             }
972 
973         }
974         if(p2->GetLoc().IsEmpty()) {
975             p2->SetExceptions().push_back(CreateException("No mapping", CVariationException::eCode_no_mapping));
976         }
977 #endif
978 
979 
980         p2->SetLoc(*mapped_loc);
981         p2->SetPlacement_method(CVariantPlacement::ePlacement_method_projected);
982 
983         {{
984                 TSeqPos orig_len = p.GetLoc().Which() ? sequence::GetLength(p.GetLoc(), NULL) : 0;
985                 TSeqPos mapped_len = mapped_loc->Which() ? sequence::GetLength(*mapped_loc, NULL) : 0;
986                 bool orig_is_compound = p.GetLoc().IsMix() || p.GetLoc().IsPacked_int();
987                 bool mapped_is_compound = mapped_loc->IsMix() || mapped_loc->IsPacked_int();
988                 CRef<CVariationException> exception;
989                 if(mapped_len == 0) {
990                     exception.Reset(new CVariationException);
991                     exception->SetCode(CVariationException::eCode_no_mapping);
992                 } else if(mapped_len < orig_len) {
993                     exception.Reset(new CVariationException);
994                     exception->SetCode(CVariationException::eCode_partial_mapping);
995                 } else if(!orig_is_compound && mapped_is_compound) {
996                     exception.Reset(new CVariationException);
997                     exception->SetCode(CVariationException::eCode_split_mapping);
998                 }
999                 if(exception) {
1000                     exception->SetMessage("");
1001                     p2->SetExceptions().push_back(exception);
1002                 }
1003             }}
1004 
1005         if(p2->GetLoc().GetId()) {
1006             p2->SetMol(GetMolType(sequence::GetId(p2->GetLoc(), NULL)));
1007         } else {
1008             p2->SetMol(CVariantPlacement::eMol_unknown);
1009         }
1010 
1011         //VAR-1307
1012         if(p2->GetLoc().GetId()
1013            && (p2->GetMol() == CVariantPlacement::eMol_genomic
1014            || p2->GetMol() == CVariantPlacement::eMol_mitochondrion
1015            || p2->GetMol() == CVariantPlacement::eMol_unknown)
1016 
1017            && (p.GetMol() == CVariantPlacement::eMol_genomic
1018            || p.GetMol() == CVariantPlacement::eMol_mitochondrion
1019            || p.GetMol() == CVariantPlacement::eMol_unknown)
1020 
1021            && s_GetLength(p, m_scope)
1022                 > s_GetLength(*p2, m_scope) + 5000) {
1023             p2->SetExceptions().push_back(CreateException(
1024                 "Source location overhangs the alignment by at least 5kb",
1025                 CVariationException::eCode_source_location_overhang));
1026         }
1027 
1028 
1029         ChangeIdsInPlace(*p2, sequence::eGetId_ForceAcc, *m_scope);
1030         return p2;
1031     }
1032 
AttachSeq(CVariantPlacement & p,TSeqPos max_len)1033     bool CVariationUtil::AttachSeq(CVariantPlacement& p, TSeqPos max_len)
1034     {
1035         p.ResetSeq();
1036         bool ret = false;
1037         TSeqPos length = s_GetLength(p, m_scope);
1038 
1039         p.SetSeq().SetLength(length);
1040 
1041         if((p.IsSetStart_offset() && p.GetStart_offset() != 0)
1042            || (p.IsSetStop_offset() && p.GetStop_offset() != 0)) {
1043             p.SetExceptions().push_back(CreateException(
1044                 "Can't get sequence for an offset-based location",
1045                 CVariationException::eCode_seqfetch_intronic));
1046             ret = false;
1047         } else if(length > max_len) {
1048             p.SetExceptions().push_back(CreateException(
1049                 "Sequence is longer than the cutoff threshold",
1050                 CVariationException::eCode_seqfetch_too_long));
1051             ret = false;
1052         } else {
1053             try {
1054                 CRef<CSeq_literal> literal = x_GetLiteralAtLoc(p.GetLoc());
1055                 p.SetSeq(*literal);
1056 
1057                 if(ContainsIupacNaAmbiguities(*literal)) {
1058                     p.SetExceptions().push_back(CreateException(
1059                         "Ambiguous residues in reference",
1060                         CVariationException::eCode_ambiguous_sequence));
1061                 }
1062                 ret = true;
1063             }
1064             catch(CException&) {
1065                 //location can be invalid - SNP-5510
1066                 p.SetExceptions().push_back(CreateException(
1067                     "Cannot fetch sequence at location",
1068                     CVariationException::eCode_seqfetch_invalid));
1069                 ret = false;
1070             }
1071         }
1072         return ret;
1073     }
1074 
AttachSeq(CVariation & v,TSeqPos max_len)1075     bool CVariationUtil::AttachSeq(CVariation& v, TSeqPos max_len)
1076     {
1077         v.Index();
1078         bool had_exceptions = false;
1079         if(v.IsSetPlacements()) {
1080 
1081             CConstRef<CSeq_literal> asserted_literal;
1082             CConstRef<CSeq_literal> variant_literal;
1083             // will only fetch for snp/mnp cases,
1084             // as ref_same_as_variant test is only applicable for these
1085 
1086             // Find the asserted and variant literal for this variation
1087             // (don't look into consequence-variations that may also contain
1088             // protein-specific asserted and variant sequence)
1089             for(CTypeConstIterator<CVariation> it(Begin(v)); it; ++it) {
1090                 const CVariation& v2 = *it;
1091                 if(!v2.GetData().IsInstance() || (v2.GetConsequenceParent() && &v != &v2)) {
1092                     continue;
1093                 }
1094 
1095                 const CVariation_inst& inst = v2.GetData().GetInstance();
1096                 if(inst.GetDelta().size() == 1
1097                    && inst.GetDelta().front()->IsSetSeq()
1098                    && inst.GetDelta().front()->GetSeq().IsLiteral()) {
1099                     CConstRef<CSeq_literal> literal(&inst.GetDelta().front()->GetSeq().GetLiteral());
1100                     if(!asserted_literal
1101                        && inst.IsSetObservation()
1102                        && inst.GetObservation() == CVariation_inst::eObservation_asserted) {
1103                         asserted_literal = literal;
1104                     } else if(!variant_literal //note: finding the very first one; will that work always?
1105                               && (!inst.IsSetObservation() || inst.GetObservation() == CVariation_inst::eObservation_variant)
1106                               && (!inst.GetDelta().front()->IsSetAction() || inst.GetDelta().front()->GetAction() == CDelta_item::eAction_morph)
1107                               && (!inst.GetDelta().front()->IsSetMultiplier() && !inst.GetDelta().front()->IsSetMultiplier_fuzz())
1108                               ) {
1109                         variant_literal = literal;
1110                     }
1111                 }
1112             }
1113 
1114 #if 0
1115             if(variant_literal) {
1116                 LOG_POST("Found variant-literal");
1117                 NcbiCout << MSerial_AsnText << *variant_literal;
1118             } else {
1119                 LOG_POST("Did not find variant-literal");
1120             }
1121 #endif
1122 
1123             NON_CONST_ITERATE(CVariation::TPlacements, it, v.SetPlacements())
1124             {
1125                 CVariantPlacement& p = **it;
1126                 bool attached = AttachSeq(p, max_len);
1127 
1128                 if(attached
1129                    && asserted_literal
1130                    && (p.GetSeq().GetLength() != asserted_literal->GetLength()
1131                    || (p.GetSeq().IsSetSeq_data() && asserted_literal->IsSetSeq_data()
1132                    && p.GetSeq().GetSeq_data().Which() == asserted_literal->GetSeq_data().Which()
1133                    && !p.GetSeq().GetSeq_data().Equals(asserted_literal->GetSeq_data())))) {
1134                     p.SetExceptions().push_back(CreateException(
1135                         "Asserted sequence is inconsistent with reference",
1136                         CVariationException::eCode_inconsistent_asserted_allele));
1137                     had_exceptions = true;
1138                 }
1139 
1140                 if(attached
1141                    && variant_literal
1142                    && variant_literal->Equals(p.GetSeq())) {
1143                     p.SetExceptions().push_back(CreateException(
1144                         "Reference sequence is the same as variant",
1145                         CVariationException::eCode_ref_same_as_variant));
1146                     had_exceptions = true;
1147                 }
1148             }
1149         }
1150 
1151         if(v.GetData().IsSet()) {
1152             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it,
1153                               v.SetData().SetSet().SetVariations())
1154             {
1155                 CVariation& v2 = **it;
1156                 had_exceptions = had_exceptions || AttachSeq(v2, max_len);
1157             }
1158         }
1159         return !had_exceptions;
1160     }
1161 
1162 
GetMolType(const CSeq_id & id)1163     CVariantPlacement::TMol CVariationUtil::GetMolType(const CSeq_id& id)
1164     {
1165         //Most of the time can figure out from seq-id. Must be careful not to
1166         //automatically treat NC as "genomic", as we need to differentiate from
1167         //mitochondrion.
1168         CSeq_id::EAccessionInfo acinf = id.IdentifyAccession();
1169         CVariantPlacement::TMol moltype = CVariantPlacement::eMol_unknown;
1170 
1171         if(acinf == CSeq_id::eAcc_refseq_prot || acinf == CSeq_id::eAcc_refseq_prot_predicted) {
1172             moltype = CVariantPlacement::eMol_protein;
1173         } else if(acinf == CSeq_id::eAcc_refseq_mrna || acinf == CSeq_id::eAcc_refseq_mrna_predicted) {
1174             moltype = CVariantPlacement::eMol_cdna;
1175         } else if(acinf == CSeq_id::eAcc_refseq_ncrna || acinf == CSeq_id::eAcc_refseq_ncrna_predicted) {
1176             moltype = CVariantPlacement::eMol_rna;
1177         } else if(acinf == CSeq_id::eAcc_refseq_genomic
1178                   || acinf == CSeq_id::eAcc_refseq_contig
1179                   || acinf == CSeq_id::eAcc_refseq_wgs_intermed) {
1180             moltype = CVariantPlacement::eMol_genomic;
1181         } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome
1182                   && NStr::StartsWith(id.GetTextseq_Id()->GetAccession(), "AC_")) {
1183             moltype = CVariantPlacement::eMol_genomic;
1184         } else if(id.IdentifyAccession() == CSeq_id::eAcc_refseq_chromosome
1185                   && ((id.GetTextseq_Id()->GetAccession() >= "NC_000001" && id.GetTextseq_Id()->GetAccession() <= "NC_000024")
1186                   || (id.GetTextseq_Id()->GetAccession() >= "NC_000067" && id.GetTextseq_Id()->GetAccession() <= "NC_000087"))) {
1187             //hardcode most "popular" chromosomes that are not mitochondrions.
1188             moltype = CVariantPlacement::eMol_genomic;
1189         } else {
1190             CBioseq_Handle bsh = m_scope->GetBioseqHandle(id);
1191             if(!bsh) {
1192                 moltype = CVariantPlacement::eMol_unknown;
1193             } else {
1194                 //check if mitochondrion
1195                 if(bsh.GetTopLevelEntry().IsSetDescr()) {
1196                     ITERATE(CBioseq_Handle::TDescr::Tdata, it, bsh.GetTopLevelEntry().GetDescr().Get())
1197                     {
1198                         const CSeqdesc& desc = **it;
1199                         if(!desc.IsSource()) {
1200                             continue;
1201                         }
1202                         if(desc.GetSource().IsSetGenome() && desc.GetSource().GetGenome() == CBioSource::eGenome_mitochondrion) {
1203                             moltype = CVariantPlacement::eMol_mitochondrion;
1204                         }
1205                     }
1206                 }
1207 
1208                 //check molinfo
1209                 if(moltype == CVariantPlacement::eMol_unknown) {
1210                     const CMolInfo* m = sequence::GetMolInfo(bsh);
1211                     if(!m) {
1212                         moltype = CVariantPlacement::eMol_unknown;
1213                     } else if(m->GetBiomol() == CMolInfo::eBiomol_genomic
1214                               || m->GetBiomol() == CMolInfo::eBiomol_other_genetic) {
1215                         moltype = CVariantPlacement::eMol_genomic;
1216                     } else if(m->GetBiomol() == CMolInfo::eBiomol_mRNA
1217                               || m->GetBiomol() == CMolInfo::eBiomol_genomic_mRNA) {
1218                         moltype = CVariantPlacement::eMol_cdna;
1219                     } else if(m->GetBiomol() == CMolInfo::eBiomol_ncRNA
1220                               || m->GetBiomol() == CMolInfo::eBiomol_rRNA
1221                               || m->GetBiomol() == CMolInfo::eBiomol_scRNA
1222                               || m->GetBiomol() == CMolInfo::eBiomol_snRNA
1223                               || m->GetBiomol() == CMolInfo::eBiomol_snoRNA
1224                               || m->GetBiomol() == CMolInfo::eBiomol_pre_RNA
1225                               || m->GetBiomol() == CMolInfo::eBiomol_tmRNA
1226                               || m->GetBiomol() == CMolInfo::eBiomol_cRNA
1227                               || m->GetBiomol() == CMolInfo::eBiomol_tRNA
1228                               || m->GetBiomol() == CMolInfo::eBiomol_transcribed_RNA) {
1229                         moltype = CVariantPlacement::eMol_rna;
1230                     } else if(m->GetBiomol() == CMolInfo::eBiomol_peptide) {
1231                         moltype = CVariantPlacement::eMol_protein;
1232                     } else {
1233                         moltype = CVariantPlacement::eMol_unknown;
1234                     }
1235                 }
1236             }
1237         }
1238         return moltype;
1239     }
1240 
1241 
CreateFlankLocs(const CSeq_loc & loc,TSeqPos len)1242     CVariationUtil::SFlankLocs CVariationUtil::CreateFlankLocs(const CSeq_loc& loc, TSeqPos len)
1243     {
1244         SFlankLocs flanks;
1245         flanks.upstream.Reset(new CSeq_loc(CSeq_loc::e_Null));
1246         flanks.downstream.Reset(new CSeq_loc(CSeq_loc::e_Null));
1247         if(!loc.GetId()) {
1248             return flanks;
1249         }
1250 
1251         TSignedSeqPos start = sequence::GetStart(loc, m_scope, eExtreme_Positional);
1252         TSignedSeqPos stop = sequence::GetStop(loc, m_scope, eExtreme_Positional);
1253 
1254         const CSeq_id& seq_id = sequence::GetId(loc, NULL);
1255 
1256         CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
1257 
1258         if(!bsh) {
1259             NCBI_USER_THROW_FMT("Unable to retrieve bioseq from scope : id " <<
1260                                 seq_id.GetSeqIdString(true /*with version*/));
1261         }
1262 
1263         TSignedSeqPos max_pos = bsh.GetInst_Length() - 1;
1264 
1265         flanks.upstream->SetInt().SetId().Assign(sequence::GetId(loc, NULL));
1266         flanks.upstream->SetInt().SetStrand(sequence::GetStrand(loc, NULL));
1267         flanks.upstream->SetInt().SetTo(min(max_pos, stop + (TSignedSeqPos)len));
1268         flanks.upstream->SetInt().SetFrom(max((TSignedSeqPos)0, start - (TSignedSeqPos)len));
1269 
1270         flanks.downstream->Assign(*flanks.upstream);
1271 
1272         CSeq_loc& second = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.upstream : *flanks.downstream;
1273         CSeq_loc& first = sequence::GetStrand(loc, NULL) == eNa_strand_minus ? *flanks.downstream : *flanks.upstream;
1274 
1275         if(start == 0) {
1276             first.SetNull();
1277         } else {
1278             first.SetInt().SetTo(start - 1);
1279         }
1280 
1281         if(stop == max_pos) {
1282             second.SetNull();
1283         } else {
1284             second.SetInt().SetFrom(stop + 1);
1285         }
1286 
1287         return flanks;
1288     }
1289 
1290 
1291 
1292     ///////////////////////////////////////////////////////////////////////////////
1293     //
1294     // Methods and functions pertaining to converting protein variation in precursor coords
1295     //
1296     ///////////////////////////////////////////////////////////////////////////////
s_UntranslateProt(const string & prot_str,vector<string> & codons)1297     void CVariationUtil::s_UntranslateProt(const string& prot_str, vector<string>& codons)
1298     {
1299         if(prot_str.size() != 1) {
1300             NCBI_THROW(CException, eUnknown, "Expected prot_str of length 1");
1301         }
1302 
1303         static const char* alphabet = "ACGT";
1304         string codon = "AAA";
1305         for(size_t i0 = 0; i0 < 4; i0++) {
1306             codon[0] = alphabet[i0];
1307             for(size_t i1 = 0; i1 < 4; i1++) {
1308                 codon[1] = alphabet[i1];
1309                 for(size_t i2 = 0; i2 < 4; i2++) {
1310                     codon[2] = alphabet[i2];
1311                     string prot("");
1312                     CSeqTranslator::Translate(codon, prot, CSeqTranslator::fIs5PrimePartial);
1313                     NStr::ReplaceInPlace(prot, "*", "X"); //Conversion to IUPAC produces "X", but Translate produces "*"
1314 
1315                     //LOG_POST(">>>" << codon << " " << prot << " " << prot_str);
1316                     if(prot == prot_str) {
1317                         codons.push_back(codon);
1318                     }
1319                 }
1320             }
1321         }
1322     }
1323 
s_CountMatches(const string & a,const string & b)1324     size_t CVariationUtil::s_CountMatches(const string& a, const string& b)
1325     {
1326         size_t count(0);
1327         for(size_t i = 0; i < min(a.size(), b.size()); i++) {
1328             if(a[i] == b[i]) {
1329                 count++;
1330             }
1331         }
1332         return count;
1333     }
1334 
s_CalcPrecursorVariationCodon(const string & codon_from,const string & prot_to,vector<string> & codons_to)1335     void CVariationUtil::s_CalcPrecursorVariationCodon(
1336         const string& codon_from, //codon on cDNA
1337         const string& prot_to,    //missense/nonsense AA
1338         vector<string>& codons_to)  //calculated variation-codons
1339     {
1340         vector<string> candidates1;
1341         size_t max_matches(0);
1342         s_UntranslateProt(prot_to, candidates1);
1343         codons_to.clear();
1344         bool have_silent = false;
1345 
1346         ITERATE(vector<string>, it1, candidates1)
1347         {
1348             size_t matches = s_CountMatches(codon_from, *it1);
1349 
1350             //        LOG_POST("CalcPrecursorVariationCodon:" << codon_from << " " << prot_to << " " << *it1 << " " << matches);
1351             if(matches == 3) {
1352                 //all three bases in a codon matched - we must be processing a silent mutation.
1353                 //in this case we want to consider candidate codons other than itself.
1354                 have_silent = true;
1355                 continue;
1356             }
1357 
1358             if(matches >= max_matches) {
1359                 if(matches > max_matches) {
1360                     codons_to.clear();
1361                 }
1362                 codons_to.push_back(*it1);
1363                 max_matches = matches;
1364             }
1365         }
1366 
1367         // didn't find polymorphic candidate - must be dealing with no-change at nucleotide level
1368         if(codons_to.empty() && have_silent) {
1369             codons_to.push_back(codon_from);
1370         }
1371     }
1372 
s_CollapseAmbiguities(const vector<string> & seqs)1373     string CVariationUtil::s_CollapseAmbiguities(const vector<string>& seqs)
1374     {
1375         string collapsed_seq;
1376 
1377         vector<int> bits; //4-bit bitmask denoting whether a nucleotide occurs at this pos at any seq in seqs
1378 
1379         typedef const vector<string> TConstStrs;
1380         ITERATE(TConstStrs, it, seqs)
1381         {
1382             const string& seq = *it;
1383             if(seq.size() > bits.size()) {
1384                 bits.resize(seq.size());
1385             }
1386 
1387             for(size_t i = 0; i < seq.size(); i++) {
1388                 char nt = seq[i];
1389                 int m = (nt == 'T' ? 1
1390                          : nt == 'G' ? 2
1391                          : nt == 'C' ? 4
1392                          : nt == 'A' ? 8 : 0);
1393                 if(!m) {
1394                     NCBI_THROW(CException, eUnknown, "Expected [ACGT] alphabet");
1395                 }
1396 
1397                 bits[i] |= m;
1398             }
1399         }
1400 
1401         static const char* iupac_nuc_ambiguity_codes = "NTGKCYSBAWRDMHVN";
1402         collapsed_seq.resize(bits.size());
1403         for(size_t i = 0; i < collapsed_seq.size(); i++) {
1404             collapsed_seq[i] = iupac_nuc_ambiguity_codes[bits[i]];
1405         }
1406         return collapsed_seq;
1407     }
1408 
1409 
x_InferNAfromAA(CVariation & v,TAA2NAFlags flags)1410     void CVariationUtil::x_InferNAfromAA(CVariation& v, TAA2NAFlags flags)
1411     {
1412         if(v.GetData().IsSet()) {
1413             CVariation::TData::TSet::TVariations variations = v.SetData().SetSet().SetVariations();
1414             v.SetData().SetSet().SetVariations().clear();
1415 
1416             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, variations)
1417             {
1418                 CRef<CVariation> v2 = *it;
1419 
1420                 if(v2->GetData().IsInstance()
1421                    && v2->GetData().GetInstance().IsSetObservation()
1422                    && !(v2->GetData().GetInstance().GetObservation() & CVariation_inst::eObservation_variant)) {
1423                     //only variant observations are propagated to the nuc level
1424                     continue;
1425                 }
1426                 x_InferNAfromAA(*v2, flags);
1427                 v.SetData().SetSet().SetVariations().push_back(v2);
1428             }
1429             v.ResetPlacements(); //nucleotide placement will be computed for each instance and attached at its level
1430 
1431             if(v.GetData().GetSet().GetVariations().size() == 1) {
1432                 v.Assign(*v.GetData().GetSet().GetVariations().front());
1433             }
1434             return;
1435         }
1436 
1437         //Remap the placement to nucleotide
1438 
1439         const CVariation::TPlacements* placements = s_GetPlacements(v);
1440         if(!placements || placements->size() == 0) {
1441             NCBI_THROW(CException, eUnknown, "Expected a placement");
1442         }
1443 
1444         const CVariantPlacement& placement = *placements->front();
1445 
1446         if(placement.IsSetStart_offset() || placement.IsSetStop_offset()) {
1447             NCBI_THROW(CException, eUnknown, "Expected offset-free placement");
1448         }
1449 
1450         if(placement.IsSetMol() && placement.GetMol() != CVariantPlacement::eMol_protein) {
1451             NCBI_THROW(CException, eUnknown, "Expected a protein-type placement");
1452         }
1453 
1454         SAnnotSelector sel(CSeqFeatData::e_Cdregion, true);
1455         //note: sel by product; location is prot; found feature is mrna having this prot as product
1456         CRef<CSeq_loc_Mapper> prot2precursor_mapper;
1457         for(CFeat_CI ci(*m_scope, placement.GetLoc(), sel); ci; ++ci) {
1458             try {
1459                 prot2precursor_mapper.Reset(new CSeq_loc_Mapper(ci->GetMappedFeature(), CSeq_loc_Mapper::eProductToLocation, m_scope));
1460             }
1461             catch(CException&) {
1462                 ;// may legitimately throw if feature is not good for mapping
1463             }
1464             break;
1465         }
1466 
1467         if(!prot2precursor_mapper) {
1468             NCBI_THROW(CException, eUnknown, "Can't create prot2precursor mapper. Is this a prot?");
1469         }
1470 
1471         CRef<CSeq_loc> nuc_loc = prot2precursor_mapper->Map(placement.GetLoc());
1472         ChangeIdsInPlace(*nuc_loc, sequence::eGetId_ForceAcc, *m_scope);
1473 
1474         CConstRef<CDelta_item> delta =
1475             !v.GetData().IsInstance() ? CConstRef<CDelta_item>(NULL)
1476             : v.GetData().GetInstance().GetDelta().size() != 1 ? CConstRef<CDelta_item>(NULL)
1477             : v.GetData().GetInstance().GetDelta().front();
1478 
1479 
1480         if(!v.GetData().IsInstance()) {
1481             // variation has "special" payload - nothing to do other than remap location
1482             CRef<CVariantPlacement> p(new CVariantPlacement);
1483             p->SetLoc(*nuc_loc);
1484             p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown);
1485             CRef<CVariation> v2(new CVariation);
1486             v2->SetData().Assign(v.GetData());
1487             v2->SetPlacements().push_back(p);
1488             v.Assign(*v2);
1489             return;
1490         }
1491 
1492         //See if the variation is simple-enough to process; if too complex, return "unknown-variation"
1493         if(!nuc_loc->IsInt() //complex nucleotide location
1494            || sequence::GetLength(*nuc_loc, NULL) != 3 //does not remap to single codon
1495            || !delta //delta is not a single-element item
1496            || (delta->IsSetAction() && delta->GetAction() != CDelta_item::eAction_morph) //not a replace-at-location delta-type
1497            || !delta->IsSetSeq() //don't know what's the variant allele
1498            || !delta->GetSeq().IsLiteral()
1499            || delta->GetSeq().GetLiteral().GetLength() != 1)  //not single-AA missense
1500         {
1501             CRef<CVariantPlacement> p(new CVariantPlacement);
1502             p->SetLoc(*nuc_loc);
1503             p->SetMol(nuc_loc->GetId() ? GetMolType(*nuc_loc->GetId()) : CVariantPlacement::eMol_unknown);
1504             CRef<CVariation> v2(new CVariation);
1505             v2->SetData().SetUnknown();
1506             v2->SetPlacements().push_back(p);
1507             v.Assign(*v2);
1508             return;
1509         }
1510 
1511 
1512         CSeqVector seqv(*nuc_loc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1513         string original_allele_codon; //nucleotide allele on the sequence
1514         seqv.GetSeqData(seqv.begin(), seqv.end(), original_allele_codon);
1515 
1516         string variant_codon;
1517         {{
1518                 CSeq_data variant_prot_seq;
1519                 CSeqportUtil::Convert(
1520                     delta->GetSeq().GetLiteral().GetSeq_data(),
1521                     &variant_prot_seq,
1522                     CSeq_data::e_Iupacaa);
1523 
1524                 vector<string> variant_codons;
1525                 if(v.GetData().GetInstance().IsSetObservation()
1526                    && v.GetData().GetInstance().GetObservation()
1527                    == CVariation_inst::eObservation_asserted) {
1528                     s_UntranslateProt(
1529                         variant_prot_seq.GetIupacaa(),
1530                         variant_codons);
1531                 } else {
1532                     s_CalcPrecursorVariationCodon(
1533                         original_allele_codon,
1534                         variant_prot_seq.GetIupacaa(),
1535                         variant_codons);
1536                 }
1537                 variant_codon = s_CollapseAmbiguities(variant_codons);
1538             }}
1539 
1540         if(flags & fAA2NA_truncate_common_prefix_and_suffix
1541            && !v.IsSetFrameshift()
1542            && variant_codon != original_allele_codon) {
1543             while(variant_codon.length() > 0
1544                   && original_allele_codon.length() > 0
1545                   && variant_codon.at(0) == original_allele_codon.at(0)) {
1546                 variant_codon = variant_codon.substr(1);
1547                 original_allele_codon = original_allele_codon.substr(1);
1548                 if(nuc_loc->GetStrand() == eNa_strand_minus) {
1549                     nuc_loc->SetInt().SetTo()--;
1550                 } else {
1551                     nuc_loc->SetInt().SetFrom()++;
1552                 }
1553             }
1554 
1555             while(variant_codon.length() > 0
1556                   && original_allele_codon.length() > 0
1557                   && variant_codon.at(variant_codon.length() - 1)
1558                   == original_allele_codon.at(original_allele_codon.length() - 1)) {
1559                 variant_codon.resize(variant_codon.length() - 1);
1560                 original_allele_codon.resize(original_allele_codon.length() - 1);
1561                 //Note: normally given a protein, the parent will be a mRNA and the CDS location
1562                 //will have plus strand; however, the parent could be MT, so we can't assume plus strand
1563                 if(nuc_loc->GetStrand() == eNa_strand_minus) {
1564                     nuc_loc->SetInt().SetFrom()++;
1565                 } else {
1566                     nuc_loc->SetInt().SetTo()--;
1567                 }
1568             }
1569         }
1570 
1571         CRef<CDelta_item> delta2(new CDelta_item);
1572         delta2->SetSeq().SetLiteral().SetLength(variant_codon.length());
1573         delta2->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set(variant_codon);
1574 
1575         CRef<CVariation> v2(new CVariation);
1576 
1577         //set placement
1578         CRef<CVariantPlacement> p2(new CVariantPlacement);
1579         //merge loc to convert int of length 1 to a pnt as necessary
1580         p2->SetLoc(*sequence::Seq_loc_Merge(*nuc_loc, CSeq_loc::fSortAndMerge_All, NULL));
1581         p2->SetMol(GetMolType(sequence::GetId(*nuc_loc, NULL)));
1582         AttachSeq(*p2);
1583         v2->SetPlacements().push_back(p2);
1584 
1585 
1586         if(v.IsSetFrameshift()) {
1587             v2->SetData().SetUnknown(); //don't know what is happening at nucleotide level
1588         } else {
1589             CVariation_inst& inst2 = v2->SetData().SetInstance();
1590             inst2.SetType(variant_codon.length() == 1 ? CVariation_inst::eType_snv : CVariation_inst::eType_mnp);
1591             inst2.SetDelta().push_back(delta2);
1592 
1593             if(v.GetData().GetInstance().IsSetObservation()) {
1594                 inst2.SetObservation(v.GetData().GetInstance().GetObservation());
1595             }
1596         }
1597 
1598         v.Assign(*v2);
1599     }
1600 
InferNAfromAA(const CVariation & v,TAA2NAFlags flags)1601     CRef<CVariation> CVariationUtil::InferNAfromAA(const CVariation& v, TAA2NAFlags flags)
1602     {
1603         CRef<CVariation> v2(new CVariation);
1604         v2->Assign(v);
1605         v2->Index();
1606         x_InferNAfromAA(*v2, flags);
1607         s_FactorOutPlacements(*v2);
1608         v2->Index();
1609 
1610         CheckAmbiguitiesInLiterals(*v2);
1611 
1612         //Note: The result describes whole codons, even for point mutations, i.e. common suffx/prefix are not truncated.
1613         return v2;
1614     }
1615 
1616 
GetLiteralAtLoc(const CSeq_loc & loc)1617     CRef<CSeq_literal> CVariationUtil::GetLiteralAtLoc(const CSeq_loc& loc)
1618     {
1619         return x_GetLiteralAtLoc(loc);
1620     }
1621 
1622 
IsRightPartial(CBioseq_Handle bsh)1623     static bool IsRightPartial(CBioseq_Handle bsh)
1624     {
1625         ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get())
1626         {
1627             const CSeqdesc& desc = **it;
1628             if(!desc.IsMolinfo()
1629                || !desc.GetMolinfo().IsSetCompleteness()) {
1630                 continue;
1631             }
1632 
1633             return desc.GetMolinfo().GetCompleteness() == CMolInfo::eCompleteness_no_right
1634                 || desc.GetMolinfo().GetCompleteness() == CMolInfo::eCompleteness_no_ends;
1635         }
1636 
1637         return false;
1638     }
1639 
1640 
x_GetLiteralAtLoc(const CSeq_loc & loc)1641     CRef<CSeq_literal> CVariationUtil::x_GetLiteralAtLoc(const CSeq_loc& loc)
1642     {
1643         CRef<CSeq_literal> literal(new CSeq_literal);
1644         if(sequence::GetLength(loc, NULL) == 0) {
1645             literal->SetLength(0);
1646         } else {
1647             CRef<CSeq_literal> cached_literal = m_cdregion_index.GetCachedLiteralAtLoc(loc);
1648             if(cached_literal) {
1649                 literal = cached_literal;
1650             } else {
1651                 string seq;
1652                 try {
1653                     CSeqVector v(loc, *m_scope, CBioseq_Handle::eCoding_Iupac);
1654 
1655                     if(v.IsProtein()) {
1656                         // if loc extends by 1 past the end protein - we'll need to
1657                         // truncate the loc to the boundaries of prot, and then add "*" or "X" manually,
1658                         // as otherwise fetching seq will throw.
1659                         CBioseq_Handle bsh = m_scope->GetBioseqHandle(sequence::GetId(loc, NULL));
1660                         if(bsh.GetInst_Length() == loc.GetStop(eExtreme_Positional)) {
1661 
1662                             CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL);
1663                             range_loc->SetInt().SetTo(bsh.GetInst_Length() - 1);
1664 
1665                             CRef<CSeq_literal>  literal = x_GetLiteralAtLoc(
1666                                 *range_loc->Intersect(loc, 0, NULL));
1667 
1668                             literal->SetLength() += 1;
1669                             literal->SetSeq_data().SetNcbieaa().Set().push_back(
1670                                 IsRightPartial(bsh) ? 'X' : '*');
1671 
1672                             return literal;
1673                         }
1674                     }
1675 
1676                     v.GetSeqData(v.begin(), v.end(), seq);
1677                     literal->SetLength(seq.size());
1678                     if(v.IsProtein()) {
1679                         literal->SetSeq_data().SetNcbieaa().Set(seq);
1680                     } else if(v.IsNucleotide()) {
1681                         literal->SetSeq_data().SetIupacna().Set(seq);
1682                     }
1683                 }
1684                 catch(CException& e) {
1685                     string loc_label;
1686                     loc.GetLabel(&loc_label);
1687                     NCBI_RETHROW_SAME(e, "Can't get literal for " + loc_label);
1688                 }
1689             }
1690         }
1691         return literal;
1692     }
1693 
1694 
s_CatLiterals(const CSeq_literal & a,const CSeq_literal & b)1695     CRef<CSeq_literal> CVariationUtil::s_CatLiterals(const CSeq_literal& a, const CSeq_literal& b)
1696     {
1697         CRef<CSeq_literal> c(new CSeq_literal);
1698 
1699         if(b.GetLength() == 0) {
1700             c->Assign(a);
1701         } else if(a.GetLength() == 0) {
1702             c->Assign(b);
1703         } else {
1704             c->SetLength(a.GetLength() + b.GetLength());
1705 
1706             if(a.IsSetFuzz() || b.IsSetFuzz()) {
1707                 c->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1708             }
1709 
1710             if(a.IsSetSeq_data() && b.IsSetSeq_data()) {
1711                 CSeqportUtil::Append(&(c->SetSeq_data()),
1712                                      a.GetSeq_data(), 0, a.GetLength(),
1713                                      b.GetSeq_data(), 0, b.GetLength());
1714             }
1715         }
1716         return c;
1717     }
1718 
s_SpliceLiterals(const CSeq_literal & payload,const CSeq_literal & ref,TSeqPos pos)1719     CRef<CSeq_literal> CVariationUtil::s_SpliceLiterals(
1720         const CSeq_literal& payload,
1721         const CSeq_literal& ref, TSeqPos pos)
1722     {
1723         CRef<CSeq_literal> result;
1724         if(pos == 0) {
1725             result = s_CatLiterals(payload, ref);
1726         } else if(pos == ref.GetLength()) {
1727             result = s_CatLiterals(ref, payload);
1728         } else {
1729             CRef<CSeq_literal> prefix(new CSeq_literal);
1730             prefix->Assign(ref);
1731             prefix->SetLength(pos);
1732             CSeqportUtil::Keep(&prefix->SetSeq_data(), 0, prefix->GetLength());
1733 
1734             CRef<CSeq_literal> suffix(new CSeq_literal);
1735             suffix->Assign(ref);
1736             suffix->SetLength(ref.GetLength() - pos);
1737             CSeqportUtil::Keep(&suffix->SetSeq_data(), pos, suffix->GetLength());
1738 
1739             CRef<CSeq_literal> prefix_and_payload = s_CatLiterals(*prefix, payload);
1740             result = s_CatLiterals(*prefix_and_payload, *suffix);
1741         }
1742         return result;
1743     }
1744 
1745 
1746 
1747 
x_FindOrCreateLiteral(const CVariation & v)1748     CConstRef<CSeq_literal>  CVariationUtil::x_FindOrCreateLiteral(const CVariation& v)
1749     {
1750         CConstRef<CSeq_literal> literal = s_FindFirstLiteral(v);
1751         if(literal) {
1752             return literal;
1753         }
1754 
1755         //instantiate a literal from a placement (make sure there are no offsets)
1756         const CVariation::TPlacements* placements = s_GetPlacements(v);
1757 
1758         if(!placements) {
1759             return literal;
1760         }
1761 
1762         ITERATE(CVariation::TPlacements, it, *placements)
1763         {
1764             const CVariantPlacement& p = **it;
1765             if(p.IsSetStart_offset() || p.IsSetStop_offset()) {
1766                 continue;
1767             }
1768 
1769             literal = x_GetLiteralAtLoc(p.GetLoc());
1770             break;
1771         }
1772 
1773         return literal;
1774     }
1775 
1776 
1777     /*!
1778      * Convert any simple nucleotide variation to delins form, if possible; throw if not.
1779      * Precondition: location must be set.
1780      */
ChangeToDelins(CVariation & v)1781     void CVariationUtil::ChangeToDelins(CVariation& v)
1782     {
1783         v.Index();
1784 
1785         if(v.GetData().IsSet()) {
1786             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it,
1787                               v.SetData().SetSet().SetVariations())
1788             {
1789                 ChangeToDelins(**it);
1790             }
1791         } else if(v.GetData().IsInstance()
1792                   && v.GetData().GetInstance().GetType() == CVariation_inst::eType_inv
1793                   && v.GetData().GetInstance().GetDelta().empty()) {
1794             CVariation_inst& inst = v.SetData().SetInstance();
1795             inst.SetType(CVariation_inst::eType_delins);
1796 
1797             CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v);
1798             if(!this_literal) {
1799                 NcbiCerr << MSerial_AsnText << v;
1800                 NCBI_THROW(CException, eUnknown,
1801                            "Could not find literal for 'this' location in placements");
1802             }
1803 
1804             CRef<CDelta_item> di(new CDelta_item);
1805             di->SetSeq().SetLiteral().Assign(*this_literal);
1806             this->FlipStrand(*di);
1807             inst.SetDelta().push_back(di);
1808 
1809         } else if(v.GetData().IsInstance()) {
1810             CVariation_inst& inst = v.SetData().SetInstance();
1811             inst.SetType(CVariation_inst::eType_delins);
1812 
1813             if(inst.GetDelta().size() == 0) {
1814                 CRef<CDelta_item> di(new CDelta_item);
1815                 di->SetSeq().SetLiteral().SetLength(0);
1816                 di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
1817                 inst.SetDelta().push_back(di);
1818             } else if(inst.GetDelta().size() > 1) {
1819                 NCBI_THROW(CException, eUnknown, "Deltas of length >1 are not supported");
1820             } else {
1821                 CDelta_item& di = *inst.SetDelta().front();
1822 
1823                 //convert 'del' to 'replace-with-empty-literal'
1824                 if(di.GetAction() == CDelta_item::eAction_del_at) {
1825                     di.ResetAction();
1826                     di.SetSeq().SetLiteral().SetLength(0);
1827                     di.SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set("");
1828                 }
1829 
1830                 //convert 'loc' or 'this'-based deltas to literals
1831                 if(di.GetSeq().IsLoc()) {
1832                     CRef<CSeq_literal> literal = x_GetLiteralAtLoc(di.GetSeq().GetLoc());
1833                     di.SetSeq().SetLiteral(*literal);
1834                 } else if(di.GetSeq().IsThis()) {
1835                     CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v);
1836                     if(!this_literal) {
1837                         NcbiCerr << MSerial_AsnText << v;
1838                         NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements");
1839                     } else {
1840                         di.SetSeq().SetLiteral().Assign(*this_literal);
1841                     }
1842                 }
1843 
1844                 //expand multipliers.
1845                 if(di.IsSetMultiplier()) {
1846                     if(di.GetMultiplier() < 0) {
1847                         NCBI_THROW(CException, eUnknown, "Encountered negative multiplier");
1848                     } else {
1849                         CSeq_literal& literal = di.SetSeq().SetLiteral();
1850 
1851                         if(!literal.IsSetSeq_data() || !literal.GetSeq_data().IsIupacna()) {
1852                             NCBI_THROW(CException, eUnknown, "Expected IUPACNA-type seq-literal");
1853                         }
1854                         string str_kernel = literal.GetSeq_data().GetIupacna().Get();
1855                         literal.SetSeq_data().SetIupacna().Set("");
1856                         for(int i = 0; i < di.GetMultiplier(); i++) {
1857                             literal.SetSeq_data().SetIupacna().Set() += str_kernel;
1858                         }
1859                         literal.SetLength(literal.GetSeq_data().GetIupacna().Get().size());
1860                         if(literal.IsSetFuzz()) {
1861                             literal.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
1862                         }
1863 
1864                         di.ResetMultiplier();
1865                         if(di.IsSetMultiplier_fuzz()) {
1866                             di.SetMultiplier_fuzz().SetLim(CInt_fuzz::eLim_unk);
1867                         }
1868                     }
1869                 }
1870 
1871                 //Convert ins-X-before-loc to 'replace seq@loc with X + seq@loc'
1872                 if(!di.IsSetAction() || di.GetAction() == CDelta_item::eAction_morph) {
1873                     ;  //already done
1874                 } else if(di.GetAction() == CDelta_item::eAction_ins_before) {
1875                     di.ResetAction();
1876                     CConstRef<CSeq_literal> this_literal = x_FindOrCreateLiteral(v);
1877 
1878 
1879                     if(!this_literal) {
1880                         NCBI_THROW(CException, eUnknown, "Could not find literal for 'this' location in placements");
1881                     } else if(this_literal->GetLength() == 0) {
1882                         NCBI_THROW(CException, eUnknown, "Encountered empty literal");
1883                     } else {
1884                         //Note: need to be careful here: if dinucleotide or multinucleotide, "before" really means "before last pos"
1885                         //      (as dinucleotide placement is used for insertions in order to be strand-flippable)
1886 
1887                         CRef<CSeq_literal> literal = s_SpliceLiterals(di.GetSeq().GetLiteral(),
1888                                                                       *this_literal,
1889                                                                       this_literal->GetLength() - 1);
1890                         di.SetSeq().SetLiteral().Assign(*literal);
1891                     }
1892                 }
1893             }
1894         }
1895     }
1896 
1897 
1898 
AttachProteinConsequences(CVariation & v,const CSeq_id * target_id,bool ignore_genomic)1899     void CVariationUtil::AttachProteinConsequences(
1900         CVariation& v,
1901         const CSeq_id* target_id,
1902         bool ignore_genomic)
1903     {
1904         v.Index();
1905 
1906         if(v.GetData().IsSet()) {
1907 
1908             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it,
1909                               v.SetData().SetSet().SetVariations())
1910             {
1911                 AttachProteinConsequences(**it, target_id, ignore_genomic);
1912             }
1913             return;
1914         }
1915 
1916         if(!v.GetData().IsInstance()) {
1917             return;
1918         }
1919 
1920 
1921         const CVariation::TPlacements* placements = s_GetPlacements(v);
1922 
1923         if(!placements || placements->size() == 0) {
1924             return;
1925         }
1926 
1927 
1928         if(v.GetData().GetInstance().IsSetObservation()
1929            && !(v.GetData().GetInstance().GetObservation()
1930            & CVariation_inst::eObservation_variant)) {
1931             //only compute placements for variant instances; (as for asserted/reference there's no change)
1932             return;
1933         }
1934 
1935         ITERATE(CVariation::TPlacements, it, *placements)
1936         {
1937             const CVariantPlacement& placement = **it;
1938 
1939             if(!placement.GetLoc().GetId()
1940                || (target_id && !sequence::IsSameBioseq(*target_id,
1941                sequence::GetId(placement.GetLoc(), NULL),
1942                m_scope))) {
1943                 continue;
1944             }
1945 
1946             if(placement.IsSetStart_offset() && (placement.IsSetStop_offset() || placement.GetLoc().IsPnt())) {
1947                 continue; //intronic.
1948             }
1949 
1950             if(placement.GetMol() != CVariantPlacement::eMol_cdna
1951                && placement.GetMol() != CVariantPlacement::eMol_mitochondrion
1952                && (placement.GetMol() != CVariantPlacement::eMol_genomic || ignore_genomic)) {
1953                 continue;
1954             }
1955 
1956 
1957             CCdregionIndex::TCdregions cdregions;
1958             m_cdregion_index.Get(placement.GetLoc(), cdregions);
1959 
1960             ITERATE(CCdregionIndex::TCdregions, it, cdregions)
1961             {
1962                 CRef<CVariation> prot_variation = TranslateNAtoAA(v.GetData().GetInstance(), placement, *it->cdregion_feat);
1963                 CVariation::TConsequence::value_type consequence(new CVariation::TConsequence::value_type::TObjectType);
1964                 consequence->SetVariation(*prot_variation);
1965                 v.SetConsequence().push_back(consequence);
1966             }
1967         }
1968     }
1969 
1970     //translate IUPACNA literal; do not translate last partial codon.
Translate(const string & nuc_str,bool is_mito)1971     static string Translate(const string& nuc_str, bool is_mito)
1972     {
1973         string prot_str;
1974 
1975         CGenetic_code code;
1976         code.SetId(is_mito ? 2 : 1);
1977 
1978         CSeqTranslator::Translate(
1979             nuc_str,
1980             prot_str,
1981             CSeqTranslator::fIs5PrimePartial,
1982             &code);
1983 
1984         if(prot_str.size() * 3 < nuc_str.size()) {
1985             prot_str.push_back('X');
1986         }
1987 
1988         //truncate everything past the first stop
1989         size_t stop_pos = prot_str.find('*');
1990         if(stop_pos != NPOS) {
1991             prot_str.resize(stop_pos + 1);
1992         }
1993 
1994         return prot_str;
1995     }
1996 
CalcInstTypeForAA(const string & prot_ref_str,const string & prot_delta_str)1997     CVariation_inst::EType CalcInstTypeForAA(
1998         const string& prot_ref_str,
1999         const string& prot_delta_str)
2000     {
2001         CVariation_inst::EType inst_type =
2002             prot_delta_str.size() == 0 && prot_ref_str.size() > 0 ? CVariation_inst::eType_del
2003             : prot_delta_str.size() > 0 && prot_ref_str.size() == 0 ? CVariation_inst::eType_ins
2004             : prot_delta_str.size() != prot_ref_str.size() ? CVariation_inst::eType_prot_other
2005             : prot_ref_str == prot_delta_str ? CVariation_inst::eType_prot_silent
2006             : prot_ref_str.size() > 1 ? CVariation_inst::eType_prot_other
2007             : NStr::Find(prot_delta_str, "*") != NPOS ? CVariation_inst::eType_prot_nonsense
2008             : CVariation_inst::eType_prot_missense;
2009         return inst_type;
2010     }
2011 
CalcEffectForProt(const string & prot_ref_str,const string & prot_delta_str)2012     static CVariantProperties::TEffect CalcEffectForProt(
2013         const string& prot_ref_str,
2014         const string& prot_delta_str)
2015     {
2016         CVariantProperties::TEffect effect = 0;
2017         for(size_t i = 0; i < prot_ref_str.size() && i < prot_delta_str.size(); i++) {
2018             if(prot_ref_str[i] == prot_delta_str[i]) {
2019                 effect |= CVariantProperties::eEffect_synonymous;
2020             } else if(prot_ref_str[i] == '*') {
2021                 effect |= CVariantProperties::eEffect_stop_loss;
2022             } else if(prot_delta_str[i] == '*') {
2023                 effect |= CVariantProperties::eEffect_stop_gain;
2024             } else {
2025                 effect |= CVariantProperties::eEffect_missense;
2026             }
2027         }
2028         return effect;
2029     }
2030 
CalcSOTermsForProt(TSignedSeqPos nuc_delta_len,const string & prot_ref_str,const string & prot_variant_str)2031     static CVariationUtil::TSOTerms CalcSOTermsForProt(
2032         TSignedSeqPos nuc_delta_len,
2033         const string& prot_ref_str,
2034         const string& prot_variant_str)
2035     {
2036         CVariationUtil::TSOTerms terms;
2037 
2038         bool stop_gain = false;
2039         bool stop_loss = false;
2040         for(size_t i = 0; i < max(prot_ref_str.size(), prot_variant_str.size()); i++) {
2041             char r = i >= prot_ref_str.size() ? '-' : prot_ref_str[i];
2042             char v = i >= prot_variant_str.size() ? '-' : prot_variant_str[i];
2043 
2044             if(r == '*' && v != '*') {
2045                 stop_loss = true;
2046             }
2047 
2048             if(r != '*' && v == '*') {
2049                 stop_gain = true;
2050             }
2051         }
2052         if(stop_gain) {
2053             terms.push_back(CVariationUtil::eSO_stop_gained);
2054         }
2055         if(stop_loss) {
2056             terms.push_back(CVariationUtil::eSO_stop_lost);
2057         }
2058 
2059         if(nuc_delta_len == 0) {
2060             if(!stop_gain && !stop_loss) {
2061                 terms.push_back(prot_ref_str == prot_variant_str ? CVariationUtil::eSO_synonymous_variant
2062                                 : CVariationUtil::eSO_missense_variant);
2063             }
2064         } else if(nuc_delta_len % 3 == 0) {
2065             terms.push_back(CVariationUtil::eSO_inframe_indel);
2066         } else {
2067             terms.push_back(CVariationUtil::eSO_frameshift_variant);
2068         }
2069 
2070         return terms;
2071     }
2072 
2073 
FlipStrand(CVariantPlacement & vp) const2074     void CVariationUtil::FlipStrand(CVariantPlacement& vp) const
2075     {
2076         vp.SetLoc().FlipStrand();
2077         if(vp.IsSetSeq() && vp.GetSeq().IsSetSeq_data()) {
2078             CSeqportUtil::ReverseComplement(
2079                 vp.SetSeq().SetSeq_data(),
2080                 &vp.SetSeq().SetSeq_data(),
2081                 0,
2082                 vp.GetSeq().GetLength());
2083         }
2084 
2085         CRef<CVariantPlacement> tmp(new CVariantPlacement);
2086         tmp->Assign(vp);
2087 
2088         if(tmp->IsSetStart_offset()) {
2089             vp.SetStop_offset(tmp->GetStart_offset() * -1);
2090         } else {
2091             vp.ResetStop_offset();
2092         }
2093 
2094         if(tmp->IsSetStop_offset()) {
2095             vp.SetStart_offset(tmp->GetStop_offset() * -1);
2096         } else {
2097             vp.ResetStart_offset();
2098         }
2099 
2100         if(tmp->IsSetStart_offset_fuzz()) {
2101             vp.SetStop_offset_fuzz().Assign(tmp->GetStart_offset_fuzz());
2102         } else {
2103             vp.ResetStop_offset_fuzz();
2104         }
2105 
2106         if(tmp->IsSetStop_offset_fuzz()) {
2107             vp.SetStart_offset_fuzz().Assign(tmp->GetStop_offset_fuzz());
2108         } else {
2109             vp.ResetStart_offset_fuzz();
2110         }
2111     }
2112 
2113 
s_IsInstStrandFlippable(const CVariation & v,const CVariation_inst & inst)2114     bool CVariationUtil::s_IsInstStrandFlippable(
2115         const CVariation& v,
2116         const CVariation_inst& inst)
2117     {
2118         bool ret = true;
2119         ITERATE(CVariation_inst::TDelta, it, v.GetData().GetInstance().GetDelta())
2120         {
2121             const CDelta_item& di = **it;
2122             if(di.IsSetAction() && di.GetAction() == CDelta_item::eAction_ins_before) {
2123                 const CVariation::TPlacements* p = s_GetPlacements(v);
2124                 ret = false; //will set back to true if found acceptable placement
2125                 if(p) {
2126                     ITERATE(CVariation::TPlacements, it, *p)
2127                     {
2128                         if(s_GetLength(**it, NULL) >= 2) {
2129                             ret = true;
2130                         }
2131                     }
2132                 }
2133             }
2134         }
2135         return ret;
2136     }
2137 
FlipStrand(CVariation & v) const2138     void CVariationUtil::FlipStrand(CVariation& v) const
2139     {
2140         v.Index(); //required so that can get factored placements from sub-variations.
2141         if(v.IsSetPlacements()) {
2142             NON_CONST_ITERATE(CVariation::TPlacements, it, v.SetPlacements())
2143             {
2144                 FlipStrand(**it);
2145             }
2146         }
2147 
2148         if(v.GetData().IsSet()) {
2149             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations,
2150                               it,
2151                               v.SetData().SetSet().SetVariations())
2152             {
2153                 FlipStrand(**it);
2154             }
2155         } else if(v.GetData().IsInstance()) {
2156 
2157             if(!s_IsInstStrandFlippable(v, v.GetData().GetInstance())) {
2158                 NCBI_THROW(CException, eUnknown, "Variation is not strand-flippable");
2159             } else {
2160                 FlipStrand(v.SetData().SetInstance());
2161             }
2162         }
2163     }
2164 
2165 
FlipStrand(CDelta_item & di) const2166     void CVariationUtil::FlipStrand(CDelta_item& di) const
2167     {
2168         if(!di.IsSetSeq()) {
2169             return;
2170         }
2171         if(di.GetSeq().IsLoc()) {
2172             di.SetSeq().SetLoc().FlipStrand();
2173         } else if(di.GetSeq().IsLiteral() && di.GetSeq().GetLiteral().IsSetSeq_data()) {
2174             CSeqportUtil::ReverseComplement(
2175                 di.SetSeq().GetLiteral().GetSeq_data(),
2176                 &di.SetSeq().SetLiteral().SetSeq_data(),
2177                 0, di.GetSeq().GetLiteral().GetLength());
2178         }
2179     }
2180 
2181 
FlipStrand(CVariation_inst & inst) const2182     void CVariationUtil::FlipStrand(CVariation_inst& inst) const
2183     {
2184         if(!inst.IsSetDelta()) {
2185             return;
2186         }
2187 
2188         NON_CONST_ITERATE(CVariation_inst::TDelta, it, inst.SetDelta())
2189         {
2190             FlipStrand(**it);
2191         }
2192         reverse(inst.SetDelta().begin(), inst.SetDelta().end());
2193     }
2194 
2195 
x_AdjustDelinsToInterval(CVariation & v,const CSeq_loc & loc)2196     void CVariationUtil::x_AdjustDelinsToInterval(CVariation& v, const CSeq_loc& loc)
2197     {
2198         //note: loc may be a split loc when part of a codon is in another exon
2199 
2200         if(!v.IsSetPlacements() || v.GetPlacements().size() != 1) {
2201             NCBI_THROW(CException, eUnknown, "Expected single placement");
2202         }
2203 
2204         CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL);
2205 
2206         const CSeq_loc& orig_loc = v.GetPlacements().front()->GetLoc();
2207         CRef<CSeq_loc> sub_loc = orig_loc.Subtract(loc, 0, NULL, NULL);
2208         if(sub_loc->Which() && sequence::GetLength(*sub_loc, NULL) > 0) {
2209             //NcbiCerr << MSerial_AsnText << v;
2210             //NcbiCerr << MSerial_AsnText << loc;
2211             NCBI_THROW(CException, eUnknown, "Location must be a superset of the variation's loc");
2212         }
2213 
2214         sub_loc->Assign(*range_loc);
2215         sub_loc->SetInt().SetTo(sequence::GetStop(orig_loc, NULL, eExtreme_Positional));
2216         CRef<CSeq_loc> suffix_loc = sequence::Seq_loc_Subtract(loc, *sub_loc, CSeq_loc::fSortAndMerge_All, NULL);
2217         if(!suffix_loc->Which()) {
2218             suffix_loc->SetNull(); //bug in subtract
2219         }
2220 
2221         sub_loc->Assign(*range_loc);
2222         sub_loc->SetInt().SetFrom(sequence::GetStart(orig_loc, NULL, eExtreme_Positional));
2223         CRef<CSeq_loc> prefix_loc = sequence::Seq_loc_Subtract(loc, *sub_loc, CSeq_loc::fSortAndMerge_All, NULL);
2224         if(!prefix_loc->Which()) {
2225             prefix_loc->SetNull();
2226         }
2227 
2228         if(sequence::GetStrand(loc, NULL) == eNa_strand_minus) {
2229             swap(prefix_loc, suffix_loc);
2230         }
2231 
2232         //NcbiCerr << "prefix loc" << MSerial_AsnText << *prefix_loc;
2233         //NcbiCerr << "suffix loc" << MSerial_AsnText << *suffix_loc;
2234 
2235 
2236         CRef<CSeq_literal> prefix_literal = x_GetLiteralAtLoc(*prefix_loc);
2237         CRef<CSeq_literal> suffix_literal = x_GetLiteralAtLoc(*suffix_loc);
2238 
2239         for(CTypeIterator<CVariation_inst> it(Begin(v)); it; ++it) {
2240             CVariation_inst& inst = *it;
2241             if(inst.GetDelta().size() != 1) {
2242                 NCBI_THROW(CException, eUnknown, "Expected single-element delta");
2243             }
2244 
2245             CDelta_item& delta = *inst.SetDelta().front();
2246 
2247             if(!delta.IsSetSeq() || !delta.GetSeq().IsLiteral()) {
2248                 NCBI_THROW(CException, eUnknown, "Expected literal");
2249             }
2250 
2251             CRef<CSeq_literal> tmp_literal1 = s_CatLiterals(*prefix_literal, delta.SetSeq().SetLiteral());
2252             CRef<CSeq_literal> tmp_literal2 = s_CatLiterals(*tmp_literal1, *suffix_literal);
2253             delta.SetSeq().SetLiteral(*tmp_literal2);
2254         }
2255 
2256         v.SetPlacements().front()->SetLoc().Assign(loc);
2257     }
2258 
Contains(const CSeq_loc & a,const CSeq_loc & b,CScope * scope)2259     static bool Contains(const CSeq_loc& a, const CSeq_loc& b, CScope* scope)
2260     {
2261         CRef<CSeq_loc> a1(new CSeq_loc);
2262         a1->Assign(a);
2263         a1->ResetStrand();
2264 
2265         CRef<CSeq_loc> b1(new CSeq_loc);
2266         b1->Assign(b);
2267         b1->ResetStrand();
2268 
2269         return sequence::Compare(*a1, *b1, scope, sequence::fCompareOverlapping) == sequence::eContains;
2270     }
2271 
x_CreateUnknownVariation(const CSeq_id & id,CVariantPlacement::TMol mol)2272     CRef<CVariation> CVariationUtil::x_CreateUnknownVariation(
2273         const CSeq_id& id,
2274         CVariantPlacement::TMol mol)
2275     {
2276         CRef<CVariantPlacement> p(new CVariantPlacement);
2277         p->SetLoc().SetWhole().Assign(id);
2278         p->SetMol(mol);
2279         CRef<CVariation> v(new CVariation);
2280         v->SetData().SetUnknown();
2281         v->SetPlacements().push_back(p);
2282         ChangeIdsInPlace(*v, sequence::eGetId_ForceAcc, *m_scope);
2283         return v;
2284     }
2285 
CreateUnknownProtConsequenceVariation(const CVariantPlacement & nuc_p,const CSeq_feat & cds_feat,bool is_frameshifting,CScope & scope)2286     static CRef<CVariation> CreateUnknownProtConsequenceVariation(
2287         const CVariantPlacement& nuc_p,
2288         const CSeq_feat& cds_feat,
2289         bool is_frameshifting,
2290         CScope& scope)
2291     {
2292         CRef<CSeq_loc> prot_loc;
2293         CRef<CSeq_loc> codons_loc;
2294         try {
2295             CRef<CSeq_loc_Mapper> nuc2prot_mapper;
2296             CRef<CSeq_loc_Mapper> prot2nuc_mapper;
2297             nuc2prot_mapper.Reset(new CSeq_loc_Mapper(
2298                 cds_feat,
2299                 CSeq_loc_Mapper::eLocationToProduct,
2300                 &scope));
2301             prot2nuc_mapper.Reset(new CSeq_loc_Mapper(
2302                 cds_feat,
2303                 CSeq_loc_Mapper::eProductToLocation,
2304                 &scope));
2305 
2306             prot_loc = nuc2prot_mapper->Map(nuc_p.GetLoc());
2307             codons_loc = prot2nuc_mapper->Map(*prot_loc);
2308 
2309 
2310 
2311             if(codons_loc->IsNull()) {
2312                 // normally shouldn't happen, but may happen with
2313                 // dubious annotation, e.g. BC149603.1:c.1A>G.
2314                 // Mapping to protein coordinates returns NULL,
2315                 // probably because protein and cds lengths are inconsistent.
2316                 NCBI_THROW(CException, eUnknown, "");
2317             }
2318         }
2319         catch(CException&) {
2320             // may legitimately throw if feature is not good for mapping (e.g. partial).
2321             prot_loc.Reset(new CSeq_loc());
2322             prot_loc->SetWhole().Assign(sequence::GetId(cds_feat.GetProduct(), NULL));
2323             codons_loc.Reset(SerialClone(nuc_p.GetLoc()));
2324         }
2325 
2326         CRef<CVariation> v(new CVariation);
2327         v->SetData().SetUnknown();
2328 
2329         CRef<CVariantPlacement> prot_p(new CVariantPlacement);
2330         prot_p->SetLoc(*prot_loc);
2331         prot_p->SetMol(CVariantPlacement::eMol_protein);
2332         prot_p->SetExceptions().push_back(CreateException(
2333             "Cannot infer consequence; projecting location only",
2334             CVariationException::eCode_inconsistent_consequence));
2335         v->SetPlacements().push_back(prot_p);
2336 
2337         CRef<CVariantPlacement> codons_p(new CVariantPlacement);
2338         codons_p->SetLoc(*codons_loc);
2339         codons_p->SetMol(nuc_p.GetMol());
2340         v->SetPlacements().push_back(codons_p);
2341 
2342         ChangeIdsInPlace(*v, sequence::eGetId_ForceAcc, scope);
2343 
2344         CVariationMethod& m = v->SetMethod();
2345         m.SetMethod().push_back(CVariationMethod::eMethod_E_computational);
2346 
2347         if(is_frameshifting) {
2348             v->SetSo_terms().push_back(CVariationUtil::eSO_frameshift_variant);
2349         }
2350 
2351         return v;
2352     }
2353 
2354 
GetCommonPrefixLen(const string & a,const string & b)2355     static size_t GetCommonPrefixLen(const string& a, const string& b)
2356     {
2357         size_t i(0);
2358         while(i < a.size() && i < b.size() && a[i] == b[i]) {
2359             i++;
2360         }
2361         return i;
2362     }
2363 
GetCommonSuffixLen(const string & a,const string & b)2364     static size_t GetCommonSuffixLen(const string& a, const string& b)
2365     {
2366         size_t i(0);
2367         while(i < a.size() && i < b.size() && a[a.size() - 1 - i] == b[b.size() - 1 - i]) {
2368             i++;
2369         }
2370         return i;
2371     }
2372 
2373 
2374     // Return true iff can't use the CDS for mapping between prot and mRNA;
HasProblematicExceptions(const CSeq_feat & cds_feat)2375     static bool HasProblematicExceptions(const CSeq_feat& cds_feat)
2376     {
2377         return !cds_feat.IsSetExcept() ? false
2378             : !cds_feat.GetExcept() ? false
2379             : !cds_feat.IsSetExcept_text() ? true
2380             : (int)cds_feat.GetExcept_text().size() > ( //not all of except-text is explained by benign exceptions
2381             49 * (NStr::Find(cds_feat.GetExcept_text(), "translation initiation by" /* tRNA-Leu at CUG codon"*/) != NPOS)
2382             + 27 * (NStr::Find(cds_feat.GetExcept_text(), "mismatches in translation") != NPOS)
2383             + 20 * (NStr::Find(cds_feat.GetExcept_text(), "ribosomal slippage") != NPOS));
2384     }
2385 
2386 
TranslateNAtoAA(const CVariation_inst & nuc_inst,const CVariantPlacement & nuc_p,const CSeq_feat & cds_feat)2387     CRef<CVariation> CVariationUtil::TranslateNAtoAA(
2388         const CVariation_inst& nuc_inst,
2389         const CVariantPlacement& nuc_p,
2390         const CSeq_feat& cds_feat)
2391     {
2392         const bool verbose = false;
2393 
2394         if(verbose) NcbiCerr << "TranslateNAtoAA Inputs:\n" << MSerial_AsnText
2395             << nuc_inst << "\n\n" << MSerial_AsnText << nuc_p
2396             << "\n\n" << MSerial_AsnText << cds_feat << endl;
2397 
2398         if(!sequence::IsSameBioseq(sequence::GetId(nuc_p.GetLoc(), NULL),
2399             sequence::GetId(cds_feat.GetLocation(), NULL),
2400             m_scope)) {
2401             NCBI_THROW(CException, eUnknown, "Placement and CDS are on different seqs");
2402         }
2403 
2404         //create an inst-variation from the provided inst with the single specified placement
2405         CRef<CVariation> v(new CVariation);
2406         v->SetData().SetInstance().Assign(nuc_inst);
2407         v->ResetPlacements();
2408 
2409         CRef<CVariantPlacement> p(new CVariantPlacement);
2410         p->Assign(nuc_p);
2411         p->ResetSeq(); //Will recalculate after adjusting to codon boundary
2412         v->SetPlacements().push_back(p);
2413 
2414         //normalize to delins form so we can deal with it uniformly
2415         try {
2416             ChangeToDelins(*v);
2417         }
2418         catch(...) {
2419             //e.g. NM_000384.2:c.905-1_905dupGG - can't convert t odelins
2420 
2421             return CreateUnknownProtConsequenceVariation(
2422                 nuc_p,
2423                 cds_feat,
2424                 false,
2425                 *m_scope);
2426         }
2427 
2428         const CDelta_item& nuc_delta = *v->GetData().GetInstance().GetDelta().front();
2429 
2430         // note: using type long instead of TSignedSeqPos is a bug on 64-bit systems:
2431         // the result of the subtraction that's expected to be negative would be a
2432         // BIG_NUMBER that is cast to type long without the wrap-around into negatives.
2433         TSignedSeqPos nuc_delta_len = nuc_delta.GetSeq().GetLiteral().GetLength()
2434             - sequence::GetLength(p->GetLoc(), NULL);
2435 
2436 
2437         if(!SameOrientation(sequence::GetStrand(cds_feat.GetLocation(), NULL),
2438             sequence::GetStrand(p->GetLoc(), NULL))) {
2439             //need the variation and cds to be in the same orientation, such that
2440             //sequence represents the actual codons and AAs when mapped to protein
2441             FlipStrand(*v);
2442         }
2443 
2444         if(verbose) NcbiCerr << "Normalized variation: " << MSerial_AsnText << *v;
2445 
2446 
2447         //Map to protein and back to get the affected codons.
2448         //Note: need the scope because the cds_feat is probably gi-based,
2449         //while our locs are probably accver-based
2450         CRef<CSeq_loc_Mapper> nuc2prot_mapper, prot2nuc_mapper;
2451         CRef<CSeq_loc> prot_loc, codons_loc;
2452         try {
2453             nuc2prot_mapper.Reset(new CSeq_loc_Mapper(
2454                 cds_feat,
2455                 CSeq_loc_Mapper::eLocationToProduct,
2456                 m_scope));
2457             prot2nuc_mapper.Reset(new CSeq_loc_Mapper(cds_feat,
2458                 CSeq_loc_Mapper::eProductToLocation,
2459                 m_scope));
2460 
2461             prot_loc = nuc2prot_mapper->Map(p->GetLoc());
2462             codons_loc = prot2nuc_mapper->Map(*prot_loc);
2463         }
2464         catch(CException&) {
2465             // may legitimately throw if feature
2466             // is not good for mapping (e.g. partial).
2467         }
2468 
2469         const bool is_within_cds = Contains(cds_feat.GetLocation(), nuc_p.GetLoc(), m_scope);
2470 
2471         bool is_ok = !nuc_p.IsSetStart_offset()
2472             && !nuc_p.IsSetStop_offset()
2473             && is_within_cds
2474             && !codons_loc.IsNull()
2475             && !codons_loc->IsNull()
2476             && nuc_delta.GetSeq().IsLiteral()
2477             && nuc_delta.GetSeq().GetLiteral().IsSetSeq_data();
2478 
2479         int frameshift_phase = nuc_delta_len % 3;
2480         if(frameshift_phase < 0) {
2481             frameshift_phase += 3;
2482         }
2483 
2484         if(!is_ok) {
2485             return CreateUnknownProtConsequenceVariation(
2486                 nuc_p,
2487                 cds_feat,
2488                 frameshift_phase != 0,
2489                 *m_scope);
2490         }
2491 
2492 
2493         string downstream_cds_suffix_seq_str; //cds sequence downstream of affected codons
2494         {{
2495                 //extend cds-loc downstream by a little bit to allow inferring changes in the stop-codon
2496                 //(e.g. base in the stop-codon is deleted, and the sequence downstream of cds is now involved)
2497                 CRef<CSeq_loc> ext_cds_loc;
2498                 {{
2499                         SFlankLocs flocs = CreateFlankLocs(cds_feat.GetLocation(), 6);
2500                         ext_cds_loc = sequence::Seq_loc_Add(
2501                             cds_feat.GetLocation(),
2502                             *flocs.downstream,
2503                             CSeq_loc::fSortAndMerge_All, NULL);
2504                     }}
2505 
2506                 CRef<CSeq_loc> downstream_cds_loc;
2507                 {{
2508                         SFlankLocs flocs = CreateFlankLocs(*codons_loc, 1000);
2509                         downstream_cds_loc = ext_cds_loc->Intersect(
2510                             *flocs.downstream,
2511                             CSeq_loc::fSortAndMerge_All, NULL);
2512                     }}
2513 
2514                 CRef<CSeq_literal> literal = x_GetLiteralAtLoc(*downstream_cds_loc);
2515                 if(verbose) NcbiCerr << "Downstream-cds: " << MSerial_AsnText << *literal;
2516 
2517                 if(literal->GetLength() > 0) {
2518                     downstream_cds_suffix_seq_str = literal->GetSeq_data().GetIupacna().Get();
2519                 }
2520             }}
2521 
2522         TSignedSeqPos frame = abs(
2523             (TSignedSeqPos)p->GetLoc().GetStart(eExtreme_Biological)
2524             - (TSignedSeqPos)codons_loc->GetStart(eExtreme_Biological));
2525 
2526         codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL));
2527         // restore the original id, as mapping forward and back may have changed the type
2528 
2529         ChangeIdsInPlace(*prot_loc, sequence::eGetId_ForceAcc, *m_scope);
2530         // not strictly required, but generally prefer accvers in the output for readability
2531         // as we're dealing with various types of mols
2532 
2533         x_AdjustDelinsToInterval(*v, *codons_loc);
2534         AttachSeq(*v, 100000); //need enough sequnece to get create reference and variant translations downstream
2535 
2536         if(!v->GetPlacements().front()->GetSeq().IsSetSeq_data()) {
2537             return CreateUnknownProtConsequenceVariation(
2538                 nuc_p,
2539                 cds_feat,
2540                 frameshift_phase != 0,
2541                 *m_scope);
2542         }
2543 
2544         if(verbose) NcbiCerr << "Adjusted-for-codons " << MSerial_AsnText << *v;
2545 
2546         string nuc_ref_prefix = v->GetPlacements().front()->GetSeq().GetSeq_data().GetIupacna().Get();
2547 
2548         const CSeq_literal& nuc_var_literal = v->GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral();
2549         string nuc_var_prefix = nuc_var_literal.GetLength() == 0 ? "" : nuc_var_literal.GetSeq_data().GetIupacna().Get();
2550 
2551         string nuc_ref_str = nuc_ref_prefix + downstream_cds_suffix_seq_str;
2552         string nuc_var_str = nuc_var_prefix + downstream_cds_suffix_seq_str;
2553 
2554         string prot_ref_str = Translate(nuc_ref_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion);
2555         string prot_var_str = Translate(nuc_var_str, p->GetMol() == CVariantPlacement::eMol_mitochondrion);
2556         int num_ref_codons = (nuc_ref_prefix.size() + 2) / 3;
2557         int num_var_codons = (nuc_var_prefix.size() + 2) / 3;
2558 
2559 
2560         if(verbose) NcbiCerr << "nuc_ref_str: " << nuc_ref_str << "\n"
2561             << "nuc_var_str: " << nuc_var_str << "\n";
2562 
2563 
2564         if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << "\n"
2565             << "prot_var_str: " << prot_var_str << "\n";
2566 
2567 
2568         int common_prot_prefix_len(0); //will calculate length of unchanged leading AAs in translations (starting with codons of interest)
2569 
2570 
2571         if(prot_ref_str == prot_var_str) {
2572             //No-change variation. Will truncate to original counts of affected codons, such that the
2573             //no-change variation describes the original codons that did not affect the translation.
2574             prot_ref_str.resize(min(static_cast<int>(prot_ref_str.size()), num_ref_codons));
2575             prot_var_str.resize(prot_ref_str.size());
2576 
2577             if(prot_ref_str.size() > 0 && *prot_ref_str.rbegin() == '*') {
2578                 //If translated all the way to stop with no-changes, this is no longer a frameshift, even though the indel is not mod%3
2579                 frameshift_phase = 0;
2580             }
2581 
2582         } else {
2583 
2584             //Justify the variation to first affected AA.
2585             //
2586             //Truncate common prefix of translations; truncate the 5'-end of the prot-loc and recalculate codons-loc accordingly.
2587             //Rationale: nucleotide level might not affect translation at the affected codons' location, but change the translation
2588             //downstream. That's why we have to skip unchanged translation to find the first affected AA.
2589             //
2590             //This does not apply to synonymous changes where there's no change at all rather than a downstream change.
2591 
2592             common_prot_prefix_len = GetCommonPrefixLen(prot_ref_str, prot_var_str);
2593             if(common_prot_prefix_len > 0
2594                && common_prot_prefix_len == static_cast<int>(prot_ref_str.size())) {
2595                 common_prot_prefix_len -= 1; // when truncating common prefx, don't want to truncate all the way
2596                 // will leave last position as necessary
2597             }
2598 
2599             prot_ref_str = prot_ref_str.substr(common_prot_prefix_len);
2600             prot_var_str = prot_var_str.substr(common_prot_prefix_len);
2601 
2602             if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n"
2603                 << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n";
2604 
2605             if(frameshift_phase == 0) {
2606 
2607                 size_t suffix_len = GetCommonSuffixLen(prot_ref_str, prot_var_str);
2608                 size_t min_len = min(prot_ref_str.size(), prot_var_str.size());
2609                 size_t ref_stop_pos = prot_ref_str.find('*');
2610                 size_t var_stop_pos = prot_var_str.find('*');
2611                 size_t min_stop_pos = min(ref_stop_pos, var_stop_pos);
2612 
2613                 // VAR-699, VAR-872
2614                 bool truncate_at_stop = min_stop_pos < min_len
2615                     && ref_stop_pos != var_stop_pos
2616                     && nuc_delta_len == 0;
2617 
2618                 if(truncate_at_stop) {
2619                     prot_ref_str.resize(min_stop_pos + 1);
2620                     prot_var_str.resize(min_stop_pos + 1);
2621                 } else {
2622                     prot_ref_str.resize(prot_ref_str.size() - suffix_len);
2623                     prot_var_str.resize(prot_var_str.size() - suffix_len);
2624                 }
2625             } else {
2626                 //Keep the frst AA in case of frameshifts
2627                 //NM_000492.3:c.3528delC -> NP_000483.3:p.Lys1177Serfs  instead of NP_000483.3:p.Lys1177delfs
2628                 prot_ref_str.resize(min(static_cast<size_t>(1), prot_ref_str.size()));
2629                 prot_var_str.resize(min(static_cast<size_t>(1), prot_var_str.size()));
2630             }
2631 
2632             //adjust the protein location.
2633             prot_loc = sequence::Seq_loc_Merge(*prot_loc, CSeq_loc::fMerge_SingleRange, NULL); //to convert point to interval
2634             if(prot_ref_str.size() == 0) {
2635                 //After truncating common prefix and suffix the reference is reduced to nothing - we have pure insertion.
2636                 //We need 2-AA location describing the point of insertion
2637                 prot_loc->SetInt().SetFrom() += common_prot_prefix_len - 1;
2638                 prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + 1);
2639             } else {
2640                 //the location describes the sequence that's being changed
2641                 prot_loc->SetInt().SetFrom() += common_prot_prefix_len;
2642                 prot_loc->SetInt().SetTo(prot_loc->SetInt().SetFrom() + prot_ref_str.size() - 1);
2643             }
2644             prot_loc = sequence::Seq_loc_Merge(*prot_loc, 0, NULL); //to convert single-pos int to point, as appropriate
2645 
2646             codons_loc = prot2nuc_mapper->Map(*prot_loc);
2647 
2648             if(codons_loc->IsNull()) {
2649                 // may have gone past the end of the CDS, e.g. NG_011646.1:g.1508delT
2650                 return CreateUnknownProtConsequenceVariation(
2651                     nuc_p,
2652                     cds_feat,
2653                     frameshift_phase != 0,
2654                     *m_scope);
2655             }
2656 
2657 
2658             codons_loc->SetId(sequence::GetId(p->GetLoc(), NULL));
2659 
2660             if(verbose) NcbiCerr << "prot_ref_str: " << prot_ref_str << ":" << prot_ref_str.size() << "\n"
2661                 << "prot_var_str: " << prot_var_str << ":" << prot_var_str.size() << "\n";
2662         }
2663 
2664 
2665         if(verbose)  NcbiCerr << "reference codons: " << num_ref_codons
2666             << "; variant codons: " << num_var_codons
2667             << "; common prefix: " << common_prot_prefix_len << "\n";
2668 
2669 
2670         //if(verbose) NcbiCerr << "prot_ref  : " << prot_ref_str << "\n" << "prot_delta: " << prot_var_str << "\ntruncated prefix: " << common_prot_prefix_len << "\n Adjusted codons loc: " << MSerial_AsnText << *codons_loc;
2671 
2672         //Constructing protein-variation
2673 
2674         CRef<CVariation> prot_v(new CVariation);
2675 
2676         //will have two placements: one describing the AA, and the other describing codons
2677         CRef<CVariantPlacement> prot_p(new CVariantPlacement);
2678 
2679         if(HasProblematicExceptions(cds_feat)) {
2680             //mapped protein position is bogus, as cds is either partial or there are indels.
2681             prot_p->SetLoc().SetEmpty().Assign(sequence::GetId(*prot_loc, NULL));
2682             prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL)));
2683             prot_p->SetSeq().SetLength(prot_ref_str.size());
2684             prot_p->SetSeq().SetSeq_data().SetNcbieaa().Set(prot_ref_str);
2685         } else {
2686             prot_p->SetLoc(*prot_loc);
2687             prot_p->SetMol(GetMolType(sequence::GetId(prot_p->GetLoc(), NULL)));
2688             AttachSeq(*prot_p);
2689         }
2690         prot_v->SetPlacements().push_back(prot_p);
2691 
2692 
2693         CRef<CVariantPlacement> codons_p(new CVariantPlacement);
2694         codons_p->SetLoc(*codons_loc);
2695         codons_p->SetMol(GetMolType(sequence::GetId(codons_p->GetLoc(), NULL)));
2696         codons_p->SetFrame(frame);
2697         AttachSeq(*codons_p);
2698         prot_v->SetPlacements().push_back(codons_p);
2699 
2700 
2701         //calculate properties
2702         {{
2703                 if(frameshift_phase == 0 && prot_ref_str.size() == prot_var_str.size()) {
2704                     //VAR-267 - calculate missense/synonymous/stop-gain/loss for non-frameshifting and non-length-changing cases only
2705                     CVariantProperties::TEffect prop = CalcEffectForProt(prot_ref_str, prot_var_str);
2706                     if(prop != 0) {
2707                         prot_v->SetVariant_prop().SetEffect(prop);
2708                     }
2709                 }
2710 
2711                 TSOTerms so_terms = CalcSOTermsForProt(nuc_delta_len,
2712                                                        prot_ref_str,
2713                                                        prot_var_str);
2714                 copy(so_terms.begin(), so_terms.end(), back_inserter(prot_v->SetSo_terms()));
2715             }}
2716 
2717 
2718         prot_v->SetData().SetInstance().SetType(CalcInstTypeForAA(prot_ref_str, prot_var_str));
2719 
2720         //set inst data
2721         CRef<CDelta_item> di(new CDelta_item);
2722         prot_v->SetData().SetInstance().SetDelta().push_back(di);
2723 
2724         if(prot_var_str.size() > 0) {
2725 
2726             // Use NA alphabet instead of AA, as the dbSNP requested original codons.
2727             // The downstream process can always translate this to AAs.
2728             //
2729             // Note, however, that we cannot always use the original variant codons sequence, because
2730             // the location may have been adjusted downstream (see comments at TruncateCommonPrefixAndSuffix).
2731             // So we'll have to get this from nuc_var_str.
2732             if(false && common_prot_prefix_len == 0) {
2733                 di->SetSeq().Assign(v->GetData().GetInstance().GetDelta().front()->GetSeq());
2734             } else {
2735 
2736                 if(verbose) NcbiCerr
2737                     << "inst-type: " << prot_v->GetData().GetInstance().GetType()
2738                     << "; nuc_var_len: " << nuc_var_str.size()
2739                     << "; nuc_var_str: " << nuc_var_str
2740                     << "; prefix_len: " << common_prot_prefix_len * 3
2741                     << "; var_codons:" << prot_var_str.size() * 3 << "\n";
2742 
2743                 // Note: need to take min here, because common_prot_prefix_len * 3 may exceed
2744                 // nuc_var_str.size() due to translation of a partial codon.
2745                 string adjusted_codons_str = nuc_var_str.substr(
2746                     min<int>(nuc_var_str.size(), common_prot_prefix_len * 3),
2747                     prot_var_str.size() * 3);
2748 
2749                 if(adjusted_codons_str.size() > 0) {
2750                     di->SetSeq().SetLiteral().SetLength(adjusted_codons_str.size());
2751                     di->SetSeq().SetLiteral().SetSeq_data().SetIupacna().Set() = adjusted_codons_str;
2752                 } else {
2753                     di->SetSeq().SetThis();
2754                     di->SetAction(CDelta_item::eAction_del_at);
2755                 }
2756             }
2757 
2758             if(prot_ref_str.size() == 0) {
2759                 di->SetAction(CDelta_item::eAction_ins_before);
2760             }
2761         } else {
2762             di->SetSeq().SetThis();
2763             di->SetAction(CDelta_item::eAction_del_at);
2764         }
2765 
2766         //set frameshift
2767         if(frameshift_phase != 0) {
2768             //note: SetEffect() contains magic-value in debug mode by default, not 0.
2769             //prot_v->SetVariant_prop().SetEffect() |= CVariantProperties::eEffect_frameshift;
2770             prot_v->SetVariant_prop().SetEffect(
2771                 CVariantProperties::eEffect_frameshift
2772                 | (prot_v->IsSetVariant_prop()
2773                 && prot_v->GetVariant_prop().IsSetEffect()
2774                 ? prot_v->GetVariant_prop().GetEffect() : 0));
2775 
2776             prot_v->SetFrameshift().SetPhase(frameshift_phase);
2777         }
2778 
2779         if(verbose) NcbiCerr << "protein variation:" << MSerial_AsnText << *prot_v;
2780         if(verbose) NcbiCerr << "Done with protein variation\n";
2781 
2782         return prot_v;
2783     }
2784 
2785 
2786 
2787     ////////////////////////////////////////////////////////////////////////////////
2788     //
2789     // SO-terms calculations
2790     //
2791     ////////////////////////////////////////////////////////////////////////////////
2792 
2793 
AsSOTerms(const CVariantProperties & p,TSOTerms & terms)2794     void CVariationUtil::AsSOTerms(const CVariantProperties& p, TSOTerms& terms)
2795     {
2796         if(p.GetGene_location() & CVariantProperties::eGene_location_intergenic) {
2797             terms.push_back(eSO_intergenic_variant);
2798         }
2799         if(p.GetGene_location() & CVariantProperties::eGene_location_near_gene_5) {
2800             terms.push_back(eSO_2KB_upstream_variant);
2801         }
2802         if(p.GetGene_location() & CVariantProperties::eGene_location_near_gene_3) {
2803             terms.push_back(eSO_500B_downstream_variant);
2804         }
2805         if(p.GetGene_location() & CVariantProperties::eGene_location_donor) {
2806             terms.push_back(eSO_splice_donor_variant);
2807         }
2808         if(p.GetGene_location() & CVariantProperties::eGene_location_acceptor) {
2809             terms.push_back(eSO_splice_acceptor_variant);
2810         }
2811         if(p.GetGene_location() & CVariantProperties::eGene_location_intron) {
2812             terms.push_back(eSO_intron_variant);
2813         }
2814         if(p.GetGene_location() & CVariantProperties::eGene_location_utr_5) {
2815             terms.push_back(eSO_5_prime_UTR_variant);
2816         }
2817         if(p.GetGene_location() & CVariantProperties::eGene_location_utr_3) {
2818             terms.push_back(eSO_3_prime_UTR_variant);
2819         }
2820         if(p.GetGene_location() & CVariantProperties::eGene_location_conserved_noncoding) {
2821             terms.push_back(eSO_nc_transcript_variant);
2822         }
2823         if(p.GetGene_location() & CVariantProperties::eGene_location_in_start_codon) {
2824             terms.push_back(eSO_initiator_codon_variant);
2825         }
2826         if(p.GetGene_location() & CVariantProperties::eGene_location_in_stop_codon) {
2827             terms.push_back(eSO_terminator_codon_variant);
2828         }
2829 
2830         if(p.GetEffect() & CVariantProperties::eEffect_frameshift) {
2831             terms.push_back(eSO_frameshift_variant);
2832         }
2833         if(p.GetEffect() & CVariantProperties::eEffect_missense) {
2834             terms.push_back(eSO_missense_variant);
2835         }
2836 
2837         if(p.GetEffect() & CVariantProperties::eEffect_nonsense || p.GetEffect() & CVariantProperties::eEffect_stop_gain) {
2838             terms.push_back(eSO_stop_gained);
2839         }
2840         if(p.GetEffect() & CVariantProperties::eEffect_nonsense || p.GetEffect() & CVariantProperties::eEffect_stop_loss) {
2841             terms.push_back(eSO_stop_lost);
2842         }
2843         if(p.GetEffect() & CVariantProperties::eEffect_synonymous) {
2844             terms.push_back(eSO_synonymous_variant);
2845         }
2846     }
2847 
AsString(ESOTerm term)2848     string CVariationUtil::AsString(ESOTerm term)
2849     {
2850         return
2851             term == eSO_intergenic_variant ? "intergenic_variant"
2852             : term == eSO_2KB_upstream_variant ? "2KB_upstream_variant"
2853             : term == eSO_500B_downstream_variant ? "500B_downstream_variant"
2854             : term == eSO_splice_donor_variant ? "splice_donor_variant"
2855             : term == eSO_splice_acceptor_variant ? "splice_acceptor_varian"
2856             : term == eSO_intron_variant ? "intron_variant"
2857             : term == eSO_5_prime_UTR_variant ? "5_prime_UTR_variant"
2858             : term == eSO_3_prime_UTR_variant ? "3_prime_UTR_variant"
2859             : term == eSO_coding_sequence_variant ? "coding_sequence_variant"
2860             : term == eSO_nc_transcript_variant ? "nc_transcript_variant"
2861             : term == eSO_initiator_codon_variant ? "initiator_codon_variant"
2862             : term == eSO_terminator_codon_variant ? "terminator_codon_variant"
2863 
2864             : term == eSO_synonymous_variant ? "synonymous_variant"
2865             : term == eSO_missense_variant ? "missense_variant"
2866             : term == eSO_frameshift_variant ? "frameshift_variant"
2867             : term == eSO_inframe_indel ? "inframe_indel"
2868             : term == eSO_stop_gained ? "stop_gained"
2869             : term == eSO_stop_lost ? "stop_lost"
2870             : "other_variant";
2871     };
2872 
2873 
2874 
s_GetPlacements(const CVariation & v)2875     const CVariation::TPlacements* CVariationUtil::s_GetPlacements(const CVariation& v)
2876     {
2877         if(v.IsSetPlacements()) {
2878             return &v.GetPlacements();
2879         } else if(v.GetParent()) {
2880             return s_GetPlacements(*v.GetParent());
2881         } else {
2882             return NULL;
2883         }
2884     }
2885 
s_FindFirstLiteral(const CVariation & v)2886     const CConstRef<CSeq_literal> CVariationUtil::s_FindFirstLiteral(const CVariation& v)
2887     {
2888         const CVariation::TPlacements* placements = s_GetPlacements(v);
2889         ITERATE(CVariation::TPlacements, it, *placements)
2890         {
2891             const CVariantPlacement& p = **it;
2892             if(p.IsSetSeq()) {
2893                 return CConstRef<CSeq_literal>(&p.GetSeq());
2894             }
2895         }
2896         return CConstRef<CSeq_literal>(NULL);
2897     }
2898 
s_FindAssertedLiteral(const CVariation & v)2899     const CConstRef<CSeq_literal> CVariationUtil::s_FindAssertedLiteral(const CVariation& v)
2900     {
2901         const CVariation* parent = v.GetParent();
2902         if(parent == NULL) {
2903             return CConstRef<CSeq_literal>(NULL);
2904         } else {
2905             if(parent->GetData().IsSet()) {
2906                 ITERATE(CVariation::TData::TSet::TVariations, it, parent->GetData().GetSet().GetVariations())
2907                 {
2908                     const CVariation& sibling = **it;
2909                     if(sibling.GetData().IsInstance()
2910                        && sibling.GetData().GetInstance().IsSetObservation()
2911                        && sibling.GetData().GetInstance().GetObservation() == CVariation_inst::eObservation_asserted
2912                        && sibling.GetData().GetInstance().GetDelta().size() == 1
2913                        && sibling.GetData().GetInstance().GetDelta().front()->IsSetSeq()
2914                        && sibling.GetData().GetInstance().GetDelta().front()->GetSeq().IsLiteral()) {
2915                         return CConstRef<CSeq_literal>(&sibling.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral());
2916                     }
2917                 }
2918             }
2919 
2920             return s_FindAssertedLiteral(*parent);
2921         }
2922     }
2923 
2924 
Equals(const CVariation::TPlacements & p1,const CVariation::TPlacements & p2)2925     static bool Equals(
2926         const CVariation::TPlacements& p1,
2927         const CVariation::TPlacements& p2)
2928     {
2929         if(p1.size() != p2.size()) {
2930             return false;
2931         }
2932         CVariation::TPlacements::const_iterator it1 = p1.begin();
2933         CVariation::TPlacements::const_iterator it2 = p2.begin();
2934 
2935         for(; it1 != p1.end() && it2 != p2.end(); ++it1, ++it2) {
2936             const CVariantPlacement& p1 = **it1;
2937             const CVariantPlacement& p2 = **it2;
2938             if(!p1.Equals(p2)) {
2939                 return false;
2940             }
2941         }
2942         return true;
2943     }
2944 
s_FactorOutPlacements(CVariation & v)2945     void CVariationUtil::s_FactorOutPlacements(CVariation& v)
2946     {
2947         if(!v.GetData().IsSet()) {
2948             return;
2949         }
2950 
2951         NON_CONST_ITERATE(
2952             CVariation::TData::TSet::TVariations, it,
2953             v.SetData().SetSet().SetVariations())
2954         {
2955             s_FactorOutPlacements(**it);
2956         }
2957 
2958         CVariation::TPlacements* p1 = NULL;
2959 
2960         bool ok = true;
2961         NON_CONST_ITERATE(
2962             CVariation::TData::TSet::TVariations, it,
2963             v.SetData().SetSet().SetVariations())
2964         {
2965             CVariation& v2 = **it;
2966             if(!v2.IsSetPlacements()) {
2967                 ok = false;
2968                 break;
2969             } else if(!p1) {
2970                 p1 = &v2.SetPlacements();
2971                 continue;
2972             } else {
2973                 if(!Equals(*p1, v2.GetPlacements())) {
2974                     ok = false;
2975                     break;
2976                 }
2977             }
2978         }
2979 
2980         if(ok && p1) {
2981             //transfer p1 placements to this level
2982             NON_CONST_ITERATE(CVariation::TPlacements, it, *p1)
2983             {
2984                 v.SetPlacements().push_back(*it);
2985             }
2986 
2987             //reset placements at the children level
2988             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it,
2989                               v.SetData().SetSet().SetVariations())
2990             {
2991                 CVariation& v2 = **it;
2992                 v2.ResetPlacements();
2993             }
2994         }
2995     }
2996 
2997 
s_FindConsequenceForPlacement(const CVariation & v,const CVariantPlacement & p)2998     CConstRef<CVariation> CVariationUtil::s_FindConsequenceForPlacement(
2999         const CVariation& v,
3000         const CVariantPlacement& p)
3001     {
3002         CConstRef<CVariation> cons_v(NULL);
3003         if(v.IsSetConsequence() && p.GetLoc().GetId()) {
3004             ITERATE(CVariation::TConsequence, it, v.GetConsequence())
3005             {
3006                 const CVariation::TConsequence::value_type::TObjectType& cons = **it;
3007                 if(cons.IsVariation()
3008                    && cons.GetVariation().IsSetPlacements()) {
3009                     ITERATE(CVariation::TPlacements, it2, cons.GetVariation().GetPlacements())
3010                     {
3011                         const CVariantPlacement& vp = **it2;
3012                         if(vp.GetLoc().GetId()
3013                            && p.GetLoc().GetId()
3014                            && vp.GetLoc().GetId()->Equals(*p.GetLoc().GetId())) {
3015                             cons_v.Reset(&cons.GetVariation());
3016                             break;
3017                         }
3018                     }
3019                 }
3020             }
3021         }
3022 
3023         if(!cons_v && v.GetData().IsSet()) {
3024             ITERATE(CVariation::TData::TSet::TVariations, it,
3025                     v.GetData().GetSet().GetVariations())
3026             {
3027                 CConstRef<CVariation> cons_v1 = s_FindConsequenceForPlacement(**it, p);
3028                 if(cons_v1) {
3029                     cons_v = cons_v1;
3030                     break;
3031                 }
3032             }
3033         }
3034 
3035         return cons_v;
3036     }
3037 
3038 
s_AttachGeneIDdbxref(CVariantPlacement & p,int gene_id)3039     void CVariationUtil::s_AttachGeneIDdbxref(CVariantPlacement& p, int gene_id)
3040     {
3041         ITERATE(CVariantPlacement::TDbxrefs, it, p.SetDbxrefs())
3042         {
3043             const CDbtag& dbtag = **it;
3044             if(dbtag.GetDb() == "GeneID"
3045                && dbtag.GetTag().IsId()
3046                && dbtag.GetTag().GetId() == gene_id) {
3047                 return;
3048             }
3049         }
3050 
3051         CRef<CDbtag> dbtag(new CDbtag());
3052         dbtag->SetDb("GeneID");
3053         dbtag->SetTag().SetId(gene_id);
3054         p.SetDbxrefs().push_back(dbtag);
3055     }
3056 
SetPlacementProperties(CVariantPlacement & placement)3057     void CVariationUtil::SetPlacementProperties(CVariantPlacement& placement)
3058     {
3059         if(!placement.IsSetGene_location()) {
3060             // need to zero-out the bitmask,
3061             // otherwise in debug mode it will be preset to a magic value,
3062             // and then modifying it with "|=" will produce garbage.
3063             placement.SetGene_location(0);
3064         }
3065 
3066 
3067         // for offset-style intronic locations (not genomic and have offset),
3068         // can infer where we are based on offset
3069         if(!placement.IsSetMol()
3070            || placement.GetMol() != CVariantPlacement::eMol_genomic) {
3071             CBioseq_Handle bsh = m_scope->GetBioseqHandle(
3072                 sequence::GetId(placement.GetLoc(), NULL));
3073 
3074             if(placement.IsSetStart_offset()
3075                && placement.GetStart_offset() != 0) {
3076                 x_SetVariantPropertiesForIntronic(
3077                     placement,
3078                     placement.GetStart_offset(),
3079                     placement.GetLoc(),
3080                     bsh);
3081             }
3082 
3083             if(placement.IsSetStop_offset()
3084                && placement.GetStop_offset() != 0) {
3085                 x_SetVariantPropertiesForIntronic(
3086                     placement,
3087                     placement.GetStop_offset(),
3088                     placement.GetLoc(),
3089                     bsh);
3090             }
3091         }
3092 
3093         CVariantPropertiesIndex::TGeneIDAndPropVector v;
3094         m_variant_properties_index.GetLocationProperties(placement.GetLoc(), v);
3095 
3096         // note: this assumes that the offsets are HGVS-spec compliant:
3097         // anchor locations are at the exon terminals, and the
3098         // offset values point into the intron.
3099         bool is_completely_intronic = false;
3100         {{
3101                 bool is_start_offset = placement.IsSetStart_offset()
3102                     && placement.GetStart_offset() != 0;
3103 
3104                 bool is_stop_offset = placement.IsSetStop_offset()
3105                     && placement.GetStop_offset() != 0;
3106 
3107                 // Single anchor point, and have any offset.
3108                 bool is_case1 = sequence::GetLength(placement.GetLoc(), NULL) == 1
3109                     && (is_start_offset || is_stop_offset);
3110 
3111                 // Other possibility is when start and stop are addressed from different exons:
3112                 // The location length must be 2 (end of one exon and start of the other),
3113                 // and the offsets point inwards.
3114                 bool is_case2 = sequence::GetLength(placement.GetLoc(), NULL) == 2
3115                     && is_start_offset && placement.GetStart_offset() > 0
3116                     && is_stop_offset && placement.GetStop_offset() < 0;
3117 
3118                 is_completely_intronic = is_case1 || is_case2;
3119             }}
3120 
3121         // collapse all gene-specific location properties into prop.
3122         // Will do it in three rounds so that more important properties
3123         // will have their gene-ids on top of the list: VAR-1297
3124         //    First:  ignore near-gene-or-intron.
3125         //    Second: ignore near-gene.
3126         //    Third:  don't ignore any property.
3127         for(size_t i = 0; i < 3; i++) {
3128             static const CVariantProperties::TGene_location flags[3] = {
3129                 CVariantProperties::eGene_location_near_gene_3
3130                 | CVariantProperties::eGene_location_near_gene_5
3131                 | CVariantProperties::eGene_location_intron
3132                 ,
3133                 CVariantProperties::eGene_location_near_gene_3
3134                 | CVariantProperties::eGene_location_near_gene_5
3135                 ,
3136                 0 };
3137 
3138             ITERATE(CVariantPropertiesIndex::TGeneIDAndPropVector, it, v)
3139             {
3140                 int gene_id = it->first;
3141                 CVariantProperties::TGene_location loc_prop = it->second;
3142 
3143                 if(loc_prop & flags[i]) {
3144                     continue;
3145                 }
3146 
3147                 if(gene_id != 0) {
3148                     s_AttachGeneIDdbxref(placement, gene_id);
3149                 }
3150 
3151                 if(!is_completely_intronic) {
3152                     //we don't want to compute properties for intronic anchor points. VAR-149
3153                     placement.SetGene_location() |= loc_prop;
3154                 }
3155             }
3156         }
3157 
3158     }
3159 
3160 
FindLocationProperties(const CSeq_align & transcript_aln,const CSeq_loc & query_loc,TSOTerms & terms)3161     void CVariationUtil::FindLocationProperties(
3162         const CSeq_align& transcript_aln,
3163         const CSeq_loc& query_loc,
3164         TSOTerms& terms)
3165     {
3166         //note: initializing mapper with scope because the annotation from scope is gi-based,
3167         //while the parameters are normaly accver-based
3168         CRef<CSeq_loc_Mapper> mapper(new CSeq_loc_Mapper(transcript_aln, 1, m_scope));
3169 
3170         CConstRef<CSeq_loc> genomic_query_loc;
3171         if(query_loc.GetId() && query_loc.GetId()->Equals(transcript_aln.GetSeq_id(0))) {
3172             genomic_query_loc = mapper->Map(query_loc);
3173         } else {
3174             genomic_query_loc.Reset(&query_loc);
3175         }
3176 
3177         CRef<CSeq_loc> rna_loc = transcript_aln.CreateRowSeq_loc(1);
3178 
3179         CRef<CSeq_loc> cds_loc;
3180         {{
3181                 CBioseq_Handle bsh = m_scope->GetBioseqHandle(transcript_aln.GetSeq_id(0));
3182                 for(CFeat_CI ci(bsh, SAnnotSelector(CSeqFeatData::e_Cdregion)); ci; ++ci) {
3183                     const CMappedFeat& mf = *ci;
3184                     cds_loc = mapper->Map(mf.GetLocation());
3185 
3186                     //remove indels from the mapped cds loc
3187                     cds_loc = sequence::Seq_loc_Merge(*cds_loc, CSeq_loc::fMerge_SingleRange, NULL);
3188                     cds_loc = rna_loc->Intersect(*cds_loc, 0, NULL);
3189                     break;
3190                 }
3191             }}
3192 
3193         s_FindLocationProperties(rna_loc, cds_loc, *genomic_query_loc, terms);
3194     }
3195 
3196 
s_FindLocationProperties(CConstRef<CSeq_loc> rna_loc,CConstRef<CSeq_loc> cds_loc,const CSeq_loc & query_loc,CVariationUtil::TSOTerms & terms)3197     void CVariationUtil::s_FindLocationProperties(
3198         CConstRef<CSeq_loc> rna_loc,
3199         CConstRef<CSeq_loc> cds_loc,
3200         const CSeq_loc& query_loc,
3201         CVariationUtil::TSOTerms& terms)
3202     {
3203         struct SPropsMap
3204         {
3205             typedef CRangeMap<CVariationUtil::ESOTerm, TSeqPos> TRangeMap;
3206             typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap;
3207             TIdRangeMap loc_map;
3208 
3209             void Add(CVariationUtil::ESOTerm term, const CSeq_loc& loc)
3210             {
3211                 for(CSeq_loc_CI ci(loc); ci; ++ci) {
3212                     loc_map[ci.GetSeq_id_Handle()][ci.GetRange()] = term;
3213                 }
3214             }
3215         } props_map;
3216 
3217         typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair;
3218 
3219         if(!rna_loc && !cds_loc) {
3220             return;
3221         }
3222 
3223         const CSeq_loc& main_loc = rna_loc ? *rna_loc : *cds_loc;
3224 
3225         {{
3226                 TLocsPair p = CVariantPropertiesIndex::s_GetNeighborhoodLocs(
3227                     main_loc,
3228                     main_loc.GetStop(eExtreme_Positional) + 100000);
3229 
3230                 props_map.Add(eSO_2KB_upstream_variant, *p.first);
3231                 props_map.Add(eSO_500B_downstream_variant, *p.second);
3232             }}
3233 
3234     {{
3235             TLocsPair p = CVariantPropertiesIndex::s_GetIntronsAndSpliceSiteLocs(main_loc);
3236             props_map.Add(eSO_intron_variant, *p.first);
3237             size_t i(0);
3238             for(CSeq_loc_CI ci(*p.second,
3239                 CSeq_loc_CI::eEmpty_Skip,
3240                 CSeq_loc_CI::eOrder_Biological); ci; ++ci) {
3241                 props_map.Add((i % 2 ? eSO_splice_acceptor_variant
3242                     : eSO_splice_donor_variant),
3243                     *ci.GetRangeAsSeq_loc());
3244                 i++;
3245             }
3246         }}
3247 
3248     if(!cds_loc) {
3249         props_map.Add(eSO_nc_transcript_variant, *rna_loc);
3250     } else {
3251         props_map.Add(eSO_coding_sequence_variant, *cds_loc);
3252 
3253         {{
3254                 TLocsPair p = CVariantPropertiesIndex::s_GetStartAndStopCodonsLocs(*cds_loc);
3255                 props_map.Add(eSO_initiator_codon_variant, *p.first);
3256                 props_map.Add(eSO_terminator_codon_variant, *p.second);
3257             }}
3258 
3259         if(rna_loc) {
3260             TLocsPair p = CVariantPropertiesIndex::s_GetUTRLocs(*cds_loc, *rna_loc);
3261             props_map.Add(eSO_5_prime_UTR_variant, *p.first);
3262             props_map.Add(eSO_3_prime_UTR_variant, *p.second);
3263         }
3264     }
3265 
3266     set<CVariationUtil::ESOTerm> terms_set;
3267     for(CSeq_loc_CI ci(query_loc, CSeq_loc_CI::eEmpty_Skip); ci; ++ci) {
3268         const SPropsMap::TRangeMap& rm = props_map.loc_map[ci.GetSeq_id_Handle()];
3269         for(SPropsMap::TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
3270             terms_set.insert(it2->second);
3271         }
3272     }
3273     copy(terms_set.begin(), terms_set.end(), back_inserter(terms));
3274     }
3275 
3276 
3277 
3278 
3279 
3280     //transcript length less polyA
GetEffectiveTranscriptLength(const CBioseq_Handle & bsh)3281     TSeqPos CVariationUtil::GetEffectiveTranscriptLength(const CBioseq_Handle& bsh)
3282     {
3283         CVariantPlacement::TMol mol = GetMolType(*bsh.GetSeqId());
3284         if(mol != CVariantPlacement::eMol_rna && mol != CVariantPlacement::eMol_cdna) {
3285             return bsh.GetInst_Length();
3286         }
3287 
3288         SAnnotSelector sel;
3289         sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon);
3290         sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_polyA_site);
3291 
3292         TSeqPos last_exon_pos(0);
3293         TSeqPos last_polyA_pos(0);
3294         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
3295             const CMappedFeat& mf = *ci;
3296             if(mf.GetData().GetSubtype() == CSeqFeatData::eSubtype_exon) {
3297                 last_exon_pos = max(last_exon_pos, sequence::GetStop(mf.GetLocation(), NULL));
3298             } else {
3299                 last_polyA_pos = max(last_polyA_pos, sequence::GetStop(mf.GetLocation(), NULL));
3300             }
3301         }
3302 
3303         return last_exon_pos ? last_exon_pos + 1
3304             : last_polyA_pos ? last_polyA_pos + 1
3305             : bsh.GetInst_Length();
3306     }
3307 
3308 
x_SetVariantPropertiesForIntronic(CVariantPlacement & p,int offset,const CSeq_loc & loc,CBioseq_Handle & bsh)3309     void CVariationUtil::x_SetVariantPropertiesForIntronic(
3310         CVariantPlacement& p,
3311         int offset,
3312         const CSeq_loc& loc,
3313         CBioseq_Handle& bsh)
3314     {
3315         if(!p.IsSetGene_location()) {
3316             //need to zero-out the bitmask, otherwise in debug mode it will be preset to a magic value,
3317             //and then modifying it with "|=" will produce garbage.
3318             p.SetGene_location(0);
3319         }
3320 
3321         if(sequence::GetStrand(loc, NULL) == eNa_strand_minus) {
3322             offset *= -1;
3323         }
3324 
3325         if(loc.GetStop(eExtreme_Positional) + 1 >= GetEffectiveTranscriptLength(bsh) && offset > 0) {
3326             //at the 3'-end; check if near-gene or intergenic
3327             if(offset <= 500) {
3328                 p.SetGene_location() |= CVariantProperties::eGene_location_near_gene_3;
3329             } else {
3330                 p.SetGene_location() |= CVariantProperties::eGene_location_intergenic;
3331             }
3332         } else if(loc.GetStart(eExtreme_Positional) == 0 && offset < 0) {
3333             //at the 5'-end; check if near-gene or intergenic
3334             if(offset >= -2000) {
3335                 p.SetGene_location() |= CVariantProperties::eGene_location_near_gene_5;
3336             } else {
3337                 p.SetGene_location() |= CVariantProperties::eGene_location_intergenic;
3338             }
3339         } else {
3340             //intronic or splice
3341             if(offset < 0 && offset >= -2) {
3342                 p.SetGene_location() |= CVariantProperties::eGene_location_acceptor;
3343             } else if(offset > 0 && offset <= 2) {
3344                 p.SetGene_location() |= CVariantProperties::eGene_location_donor;
3345             } else {
3346                 p.SetGene_location() |= CVariantProperties::eGene_location_intron;
3347             }
3348         }
3349 
3350         if(p.GetGene_location() == 0) {
3351             p.ResetGene_location();
3352         }
3353     }
3354 
3355 
SetVariantProperties(CVariation & v)3356     void CVariationUtil::SetVariantProperties(CVariation& v)
3357     {
3358         v.Index();
3359 
3360         for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) {
3361             CVariation& v2 = *it;
3362             if(!v2.IsSetPlacements()) {
3363                 continue;
3364             }
3365 
3366             //will collapse placement-specific gene-location properties onto variation properties
3367             if(!v2.SetVariant_prop().IsSetGene_location()) {
3368                 v2.SetVariant_prop().SetGene_location(0); //clear out the magic-value
3369             }
3370 
3371             NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements())
3372             {
3373                 CVariantPlacement& p = **it2;
3374                 SetPlacementProperties(p);
3375 
3376                 if(v2.GetConsequenceParent()) {
3377                     //If this variation is a consequence of a parent variation, we are only interested
3378                     //in the product-specific properties (as the consequence variation will have nucleotide-level placement
3379                     //but we're not interested in recomputing multiple-genes-specific properties here.
3380                     break;
3381                 }
3382 
3383                 v2.SetVariant_prop().SetGene_location() |= p.GetGene_location();
3384             }
3385         }
3386     }
3387 
3388 
GetLocationProperties(const CSeq_loc & loc,CVariantPropertiesIndex::TGeneIDAndPropVector & v)3389     void CVariationUtil::CVariantPropertiesIndex::GetLocationProperties(
3390         const CSeq_loc& loc,
3391         CVariantPropertiesIndex::TGeneIDAndPropVector& v)
3392     {
3393         typedef map<int, CVariantProperties::TGene_location> TMap;
3394         TMap m; //will collapse properties per gene-id (TGene_location is a bitmask)
3395 
3396         CRef<CSeq_loc> loc2(new CSeq_loc);
3397         loc2->Assign(loc);
3398         ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope);
3399 
3400         for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
3401             CSeq_id_Handle idh = ci.GetSeq_id_Handle();
3402 
3403             if(m_loc2prop.find(idh) == m_loc2prop.end()) {
3404                 //first time seeing this seq-id - index it
3405                 try {
3406                     x_Index(idh);
3407                 }
3408                 catch(CException& e) {
3409                     NCBI_RETHROW_SAME(e, "Can't Index " + idh.AsString());
3410                 }
3411             }
3412 
3413             TIdRangeMap::const_iterator it = m_loc2prop.find(idh);
3414             if(it == m_loc2prop.end()) {
3415                 return;
3416             }
3417             const TRangeMap& rm = it->second;
3418 
3419             for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
3420                 ITERATE(TGeneIDAndPropVector, it3, it2->second)
3421                 {
3422                     TGeneIDAndProp gene_id_and_prop = *it3;
3423                     int gene_id = gene_id_and_prop.first;
3424                     CVariantProperties::TGene_location properties = gene_id_and_prop.second;
3425 
3426                     if(m.find(gene_id) == m.end()) {
3427                         m[gene_id] = 0; //need to zero-out the bitmask in debug mode
3428                     }
3429                     m[gene_id] |= properties;
3430                 }
3431             }
3432         }
3433 
3434         ITERATE(TMap, it, m)
3435         {
3436             v.push_back(TGeneIDAndProp(it->first, it->second));
3437         }
3438     }
3439 
x_Add(const CSeq_loc & loc,int gene_id,CVariantProperties::TGene_location prop)3440     void CVariationUtil::CVariantPropertiesIndex::x_Add(
3441         const CSeq_loc& loc,
3442         int gene_id,
3443         CVariantProperties::TGene_location prop)
3444     {
3445         for(CSeq_loc_CI ci(loc); ci; ++ci) {
3446             try {
3447                 m_loc2prop[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(TGeneIDAndProp(gene_id, prop));
3448             }
3449             catch(CException& e) {
3450                 NcbiCerr << MSerial_AsnText << loc << gene_id << " " << prop;
3451                 NCBI_RETHROW_SAME(e, "Can't index location");
3452             }
3453         }
3454     }
3455 
3456     ///Note: this is strand-agnostic
3457     class SFastLocSubtract
3458     {
3459     public:
SFastLocSubtract(const CSeq_loc & loc)3460         SFastLocSubtract(const CSeq_loc& loc)
3461         {
3462             for(CSeq_loc_CI ci(loc); ci; ++ci) {
3463                 m_rangemap[ci.GetSeq_id_Handle()][ci.GetRange()] = ci.GetRangeAsSeq_loc();
3464             }
3465         }
3466 
operator ()(CSeq_loc & container_loc) const3467         void operator()(CSeq_loc& container_loc) const
3468         {
3469             for(CTypeIterator<CSeq_loc> it(Begin(container_loc)); it; ++it) {
3470                 CSeq_loc& loc = *it;
3471                 if(!loc.IsInt() && !loc.IsPnt()) {
3472                     continue;
3473                 }
3474                 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*loc.GetId());
3475                 TIdRangeMap::const_iterator it2 = m_rangemap.find(idh);
3476                 if(it2 == m_rangemap.end()) {
3477                     continue;
3478                 }
3479                 const TRangeMap& rm = it2->second;
3480 
3481                 for(TRangeMap::const_iterator it3 = rm.begin(loc.GetTotalRange()); it3.Valid(); ++it3) {
3482                     CConstRef<CSeq_loc> loc2 = it3->second;
3483                     CRef<CSeq_loc> loc3 = sequence::Seq_loc_Subtract(loc, *loc2, 0, NULL);
3484                     loc.Assign(*loc3);
3485                 }
3486             }
3487         }
3488     private:
3489         typedef CRangeMap<CConstRef<CSeq_loc>, TSeqPos> TRangeMap;
3490         typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap;
3491         TIdRangeMap m_rangemap;
3492     };
3493 
3494 
IsRefSeqGene(const CBioseq_Handle & bsh)3495     static bool IsRefSeqGene(const CBioseq_Handle& bsh)
3496     {
3497         if(!bsh.CanGetDescr()) {
3498             return false;
3499         }
3500         ITERATE(CSeq_descr::Tdata, it, bsh.GetDescr().Get())
3501         {
3502             const CSeqdesc& desc = **it;
3503             if(desc.IsGenbank()) {
3504                 const CGB_block::TKeywords& k = desc.GetGenbank().GetKeywords();
3505                 if(std::find(k.begin(), k.end(), "RefSeqGene") != k.end()) {
3506                     return true;
3507                 }
3508             }
3509         }
3510         return false;
3511     }
3512 
3513     // VAR-239
3514     // If RefSeqGene NG, Return GeneIDs for loci having alignments to transcripts.
3515     // Otherwise, return empty set
GetFocusLocusIDs(const CBioseq_Handle & bsh)3516     static set<int> GetFocusLocusIDs(const CBioseq_Handle& bsh)
3517     {
3518         set<int> gene_ids;
3519         if(!IsRefSeqGene(bsh)) {
3520             return gene_ids;
3521         }
3522 
3523         set<CSeq_id_Handle> transcript_seq_ids;
3524         for(CAlign_CI ci(bsh); ci; ++ci) {
3525             const CSeq_align& aln = *ci;
3526             if(aln.GetSegs().IsSpliced()) {
3527                 transcript_seq_ids.insert(CSeq_id_Handle::GetHandle(aln.GetSeq_id(0)));
3528             }
3529         }
3530 
3531         SAnnotSelector sel;
3532         sel.IncludeFeatType(CSeqFeatData::e_Gene);
3533         sel.IncludeFeatType(CSeqFeatData::e_Rna);
3534         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
3535         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
3536             const CMappedFeat& mf = *ci;
3537             if(!mf.IsSetProduct() || !mf.IsSetDbxref()) {
3538                 continue;
3539             }
3540 
3541             CSeq_id_Handle product_id = CSeq_id_Handle::GetHandle(
3542                 sequence::GetId(mf.GetProduct(), NULL));
3543 
3544             if(transcript_seq_ids.find(product_id) == transcript_seq_ids.end()) {
3545                 continue;
3546             }
3547 
3548             //note: using temporary reference "dbxrefs" is necessary because
3549             //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode
3550             const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref();
3551             ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3552             {
3553                 const CDbtag& dbtag = **it;
3554                 if(dbtag.GetDb() == "GeneID"
3555                    || dbtag.GetDb() == "LocusID") {
3556                     gene_ids.insert(dbtag.GetTag().GetId());
3557                 }
3558             }
3559         }
3560         return gene_ids;
3561     }
3562 
3563 
3564     // There's no GeneID on a protein bioseq.
3565     // Instead, need to find the corresponding cdregion (which may or may not have GeneID dbxref),
3566     // and get GeneID based on parent gene feature.
s_GetGeneIdForProduct(CBioseq_Handle orig_bsh)3567     int CVariationUtil::CVariantPropertiesIndex::s_GetGeneIdForProduct(
3568         CBioseq_Handle orig_bsh)
3569     {
3570         CBioseq_Handle bsh = orig_bsh.GetInst_Mol() == CSeq_inst::eMol_aa ?
3571             sequence::GetNucleotideParent(orig_bsh) : orig_bsh;
3572 
3573         SAnnotSelector sel;
3574         sel.SetResolveTSE();
3575         sel.SetAdaptiveDepth();
3576         sel.IncludeFeatType(CSeqFeatData::e_Gene);
3577         sel.IncludeFeatType(CSeqFeatData::e_Rna);
3578         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
3579         CFeat_CI ci(bsh, sel);
3580         feature::CFeatTree ft(ci);
3581 
3582         for(ci.Rewind(); ci; ++ci) {
3583             const CMappedFeat& mf = *ci;
3584             if(mf.IsSetProduct()
3585                && mf.GetProduct().GetId()
3586                && sequence::IsSameBioseq(
3587                *orig_bsh.GetSeqId(),
3588                *mf.GetProduct().GetId(),
3589                &orig_bsh.GetScope())) {
3590                 return s_GetGeneID(mf, ft);
3591             }
3592         }
3593         return 0;
3594     }
3595 
3596 
x_Index(const CSeq_id_Handle & idh)3597     void CVariationUtil::CVariantPropertiesIndex::x_Index(const CSeq_id_Handle& idh)
3598     {
3599         SAnnotSelector sel;
3600         sel.SetResolveAll(); //needs to work on NCs
3601         sel.SetAdaptiveDepth();
3602         sel.IncludeFeatType(CSeqFeatData::e_Gene);
3603         sel.IncludeFeatType(CSeqFeatData::e_Rna);
3604         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
3605 
3606         CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh);
3607         CFeat_CI ci(bsh, sel);
3608         feature::CFeatTree ft(ci);
3609 
3610         m_loc2prop[idh].size(); //in case there's no annotation, simply add the entry to the map
3611         //se we know it has been indexed.
3612 
3613         set<int> focus_loci = GetFocusLocusIDs(bsh);
3614 
3615         //Note: A location can have in-neighborhood == true only if it is NOT in
3616         //range of another locus. However, if we're dealing with in-focus/out-of-focus
3617         //genes on NG, then for the purpose of calculating in-neighborhood flags we need to
3618         //put the non-focus genes out-of-scope. On the other hand, when calculating is-intergenic
3619         //flag, we have to assume that the non-focus genes are in scope (VAR-239).
3620         //That's why we need to collect focus_gene_ranges and non_focus_gene_ranges separately.
3621         //(See the code below that sets CVariantProperties::eGene_location_intergenic)
3622 
3623         CRef<CSeq_loc> focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix));
3624         CRef<CSeq_loc> non_focus_gene_ranges(new CSeq_loc(CSeq_loc::e_Mix));
3625         for(ci.Rewind(); ci; ++ci) {
3626             const CMappedFeat& mf = *ci;
3627             if(!mf.GetData().IsGene()) {
3628                 continue;
3629             }
3630 
3631             const int gene_id = s_GetGeneID(mf, ft);
3632             const bool is_focus_locus = focus_loci.empty()
3633                 || focus_loci.count(gene_id);
3634 
3635             (is_focus_locus ? focus_gene_ranges
3636              : non_focus_gene_ranges)
3637              ->SetMix().Set().push_back(
3638              sequence::Seq_loc_Merge(
3639              mf.GetLocation(),
3640              CSeq_loc::fMerge_SingleRange,
3641              NULL));
3642         }
3643         focus_gene_ranges->ResetStrand();
3644         non_focus_gene_ranges->ResetStrand();
3645         SFastLocSubtract subtract_gene_ranges_from(*focus_gene_ranges);
3646 
3647         CRef<CSeq_loc> all_gene_neighborhoods(new CSeq_loc(CSeq_loc::e_Mix));
3648 
3649         bool found_some_gene_ids = false;
3650 
3651         for(ci.Rewind(); ci; ++ci) {
3652             const CMappedFeat& mf = *ci;
3653             CMappedFeat parent_mf = ft.GetParent(mf);
3654 
3655             if(!mf.GetLocation().GetId()) {
3656                 continue;
3657             }
3658 
3659             const int gene_id = s_GetGeneID(mf, ft);
3660             if(!focus_loci.empty()
3661                && focus_loci.find(gene_id) == focus_loci.end()) {
3662                 continue; //VAR-239
3663             }
3664 
3665             if(!parent_mf && gene_id) {
3666                 // Some locations currently may not be covered by
3667                 // any variant-properties values (e.g. in-cds)
3668                 // and yet we need to index gene_ids at those locations.
3669                 // Since variant-properties is a bitmask,
3670                 // we can simply use the value 0 for that.
3671                 // Only need to do that for parent locs.
3672                 x_Add(mf.GetLocation(), gene_id, 0);
3673                 found_some_gene_ids = true;
3674             }
3675 
3676             // compute neighbborhood locations.
3677             // Normally for gene features, or rna features lacking a parent gene feature.
3678             // Applicable for dna context only.
3679             if((mf.GetData().IsGene() || (!parent_mf && mf.GetData().IsRna()))
3680                && bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) {
3681                 TLocsPair p = s_GetNeighborhoodLocs(mf.GetLocation(), bsh.GetInst_Length() - 1);
3682 
3683                 // exclude any gene overlap from neighborhood
3684                 // (i.e. we don't care if location is in
3685                 // neighborhood of gene-A if it is within gene-B.
3686 
3687                 // First, reset strands (as we did with gene_ranges),
3688                 // as we need to do this in strand-agnostic manner.
3689                 // Note: a reasonable thing to do would be to set
3690                 // all-gene-ranges strand to "both", with an expectation
3691                 // that subtracting such a loc from either plus or minus
3692                 // loc would work, but unfortunately it doesn't.
3693                 // Resetting strand, however, is fine, because our
3694                 // indexing is strand-agnostic.
3695                 p.first->ResetStrand();
3696                 p.second->ResetStrand();
3697 
3698 #if 0
3699                 // Note: disabling subtraction of gene ranges because want to
3700                 // report neargene-ness regardless of other loci SNP-5000
3701 #if
3702                 //all_gene_ranges is a big complex loc, and subtracting with Seq_loc_Subtract
3703                 //for every neighborhood is slow (takes almost 10 seconds for NC_000001),
3704                 //so we'll use fast map-based subtractor instead
3705                 p.first = sequence::Seq_loc_Subtract(
3706                     *p.first,
3707                     *all_gene_ranges,
3708                     CSeq_loc::fSortAndMerge_All,
3709                     NULL);
3710 
3711                 p.second = sequence::Seq_loc_Subtract(
3712                     *p.second,
3713                     *all_gene_ranges,
3714                     CSeq_loc::fSortAndMerge_All,
3715                     NULL);
3716 #else
3717                 subtract_gene_ranges_from(*p.first);
3718                 subtract_gene_ranges_from(*p.second);
3719 #endif
3720 #endif
3721                 x_Add(*p.first, gene_id, CVariantProperties::eGene_location_near_gene_5);
3722                 x_Add(*p.second, gene_id, CVariantProperties::eGene_location_near_gene_3);
3723 
3724                 all_gene_neighborhoods->SetMix().Set().push_back(p.first);
3725                 all_gene_neighborhoods->SetMix().Set().push_back(p.second);
3726             }
3727 
3728             //VAR-703: need to process both RNAs and CDSes to include IGs.
3729             if(mf.GetData().IsRna() || mf.GetData().IsCdregion()) {
3730                 if(mf.GetData().IsRna()
3731                    && mf.GetData().GetSubtype() != CSeqFeatData::eSubtype_mRNA) {
3732                     x_Add(mf.GetLocation(),
3733                           gene_id,
3734                           CVariantProperties::eGene_location_conserved_noncoding);
3735                 }
3736 
3737                 TLocsPair p = s_GetIntronsAndSpliceSiteLocs(mf.GetLocation());
3738                 x_Add(*p.first,
3739                       gene_id,
3740                       CVariantProperties::eGene_location_intron);
3741 
3742                 size_t i(0);
3743                 for(CSeq_loc_CI ci(*p.second,
3744                     CSeq_loc_CI::eEmpty_Skip,
3745                     CSeq_loc_CI::eOrder_Biological);
3746                 ci; ++ci) {
3747                     x_Add(*ci.GetRangeAsSeq_loc(),
3748                           gene_id,
3749                           i % 2 ? CVariantProperties::eGene_location_acceptor
3750                           : CVariantProperties::eGene_location_donor);
3751                     ++i;
3752                 }
3753             }
3754 
3755             if(mf.GetData().IsCdregion()) {
3756                 TLocsPair p = s_GetStartAndStopCodonsLocs(mf.GetLocation());
3757 
3758                 x_Add(*p.first,
3759                       gene_id,
3760                       CVariantProperties::eGene_location_in_start_codon);
3761 
3762                 x_Add(*p.second,
3763                       gene_id,
3764                       CVariantProperties::eGene_location_in_stop_codon);
3765 
3766                 CRef<CSeq_loc> rna_loc(new CSeq_loc(CSeq_loc::e_Null));
3767                 {{
3768                         if(parent_mf) {
3769                             rna_loc->Assign(parent_mf.GetLocation());
3770                         } else if(bsh.GetBioseqMolType() == CSeq_inst::eMol_rna) {
3771                             //refseqs have gene feature annotated on rnas spanning the whole sequence,
3772                             //but in general there may be free-floating CDS on an rna, in which case
3773                             //parent loc is the whole seq.
3774                             rna_loc = bsh.GetRangeSeq_loc(
3775                                 0,
3776                                 bsh.GetInst_Length() - 1,
3777                                 mf.GetLocation().GetStrand());
3778                         }
3779                     }}
3780 
3781                 p = s_GetUTRLocs(mf.GetLocation(), *rna_loc);
3782 
3783                 x_Add(*p.first,
3784                       gene_id,
3785                       CVariantProperties::eGene_location_utr_5);
3786 
3787                 x_Add(*p.second,
3788                       gene_id,
3789                       CVariantProperties::eGene_location_utr_3);
3790 
3791             } else if(mf.GetData().IsGene()) {
3792                 if(ft.GetChildren(mf).size() == 0) {
3793                     x_Add(mf.GetLocation(),
3794                           gene_id,
3795                           bsh.GetBioseqMolType() == CSeq_inst::eMol_rna ?
3796                           CVariantProperties::eGene_location_conserved_noncoding
3797                           : CVariantProperties::eGene_location_in_gene);
3798                 }
3799             }
3800         }
3801 
3802         if(bsh.GetBioseqMolType() == CSeq_inst::eMol_dna) {
3803 
3804             CRef<CSeq_loc> range_loc =
3805                 bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
3806 
3807             CRef<CSeq_loc> genes_and_neighborhoods_loc =
3808                 sequence::Seq_loc_Add(*all_gene_neighborhoods,
3809                 *focus_gene_ranges,
3810                 0, NULL);
3811 
3812             genes_and_neighborhoods_loc =
3813                 sequence::Seq_loc_Add(
3814                 *genes_and_neighborhoods_loc,
3815                 *non_focus_gene_ranges,
3816                 0, NULL);
3817 
3818             genes_and_neighborhoods_loc->ResetStrand();
3819 
3820             CRef<CSeq_loc> intergenic_loc =
3821                 sequence::Seq_loc_Subtract(
3822                 *range_loc,
3823                 *genes_and_neighborhoods_loc,
3824                 CSeq_loc::fSortAndMerge_All,
3825                 NULL);
3826 
3827             x_Add(*intergenic_loc,
3828                   0,
3829                   CVariantProperties::eGene_location_intergenic);
3830         }
3831 
3832         if(bsh.GetBioseqMolType() == CSeq_inst::eMol_aa
3833            && !found_some_gene_ids) {
3834             // JIRA:SNP-5390
3835             // in it's normal configuration the iterator won't
3836             // find a feature with GeneID on a protein, and so
3837             // it would not be reported in the protein placement.
3838             // If a CDS is annotated directly on a DNA molecule,
3839             // there would be no product-specific GeneID at all (
3840             CRef<CSeq_loc> whole_range_loc =
3841                 bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
3842 
3843             int gene_id = s_GetGeneIdForProduct(bsh);
3844             if(gene_id) {
3845                 x_Add(*whole_range_loc, gene_id, 0);
3846             }
3847         }
3848     }
3849 
s_GetGeneID(const CMappedFeat & mf,feature::CFeatTree & ft)3850     int CVariationUtil::CVariantPropertiesIndex::s_GetGeneID(
3851         const CMappedFeat& mf,
3852         feature::CFeatTree& ft)
3853     {
3854         int gene_id = 0;
3855         if(mf.IsSetDbxref()) {
3856             //note: using temporary reference "dbxrefs" is necessary because
3857             //"ITERATE(CSeq_feat::TDbxref, it, mf.GetDbxref())", assert-fails in stl-safe-iter mode
3858             const CSeq_feat::TDbxref& dbxrefs = mf.GetDbxref();
3859             ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3860             {
3861                 const CDbtag& dbtag = **it;
3862                 if(dbtag.GetDb() == "GeneID"
3863                    || dbtag.GetDb() == "LocusID") {
3864                     gene_id = dbtag.GetTag().GetId();
3865                 }
3866             }
3867         }
3868 
3869         // try legacy representation, e.g. NM_000155.1:
3870         if(!gene_id
3871            && mf.GetData().IsGene()
3872            && mf.GetData().GetGene().IsSetDb()) {
3873             const CGene_ref::TDb& dbxrefs = mf.GetData().GetGene().GetDb();
3874             ITERATE(CSeq_feat::TDbxref, it, dbxrefs)
3875             {
3876                 const CDbtag& dbtag = **it;
3877                 if(dbtag.GetDb() == "GeneID"
3878                    || dbtag.GetDb() == "LocusID") {
3879                     gene_id = dbtag.GetTag().GetId();
3880                 }
3881             }
3882         }
3883 
3884         if(gene_id == 0) {
3885             CMappedFeat parent = ft.GetParent(mf);
3886             return parent ? s_GetGeneID(parent, ft) : gene_id;
3887         } else {
3888             return gene_id;
3889         }
3890     }
3891 
3892 
3893     CVariationUtil::CVariantPropertiesIndex::TLocsPair
s_GetStartAndStopCodonsLocs(const CSeq_loc & cds_loc)3894         CVariationUtil::CVariantPropertiesIndex::s_GetStartAndStopCodonsLocs(const CSeq_loc& cds_loc)
3895     {
3896         TLocsPair p;
3897         p.first = sequence::Seq_loc_Merge(cds_loc, CSeq_loc::fMerge_SingleRange, NULL);
3898         p.second.Reset(new CSeq_loc);
3899         p.second->Assign(*p.first);
3900         p.first->SetInt().SetTo(p.first->GetInt().GetFrom() + 2);
3901         p.second->SetInt().SetFrom(p.second->GetInt().GetTo() - 2);
3902 
3903         if(IsReverse(cds_loc.GetStrand())) {
3904             swap(p.first, p.second);
3905         }
3906 
3907         if(cds_loc.IsPartialStart(eExtreme_Biological) || cds_loc.IsTruncatedStart(eExtreme_Biological)) {
3908             p.first->SetNull();
3909         }
3910         if(cds_loc.IsPartialStop(eExtreme_Biological) || cds_loc.IsTruncatedStop(eExtreme_Biological)) {
3911             p.second->SetNull();
3912         }
3913 
3914         return p;
3915     }
3916 
3917     CVariationUtil::CVariantPropertiesIndex::TLocsPair
s_GetUTRLocs(const CSeq_loc & cds_loc,const CSeq_loc & parent_loc)3918         CVariationUtil::CVariantPropertiesIndex::s_GetUTRLocs(const CSeq_loc& cds_loc, const CSeq_loc& parent_loc)
3919     {
3920         TLocsPair p;
3921         CRef<CSeq_loc> sub_loc1 = sequence::Seq_loc_Merge(cds_loc, CSeq_loc::fMerge_SingleRange, NULL);
3922         CRef<CSeq_loc> sub_loc2(new CSeq_loc);
3923         sub_loc2->Assign(*sub_loc1);
3924 
3925         sub_loc1->SetInt().SetTo(parent_loc.GetTotalRange().GetTo());
3926         sub_loc2->SetInt().SetFrom(parent_loc.GetTotalRange().GetFrom());
3927 
3928         p.first = sequence::Seq_loc_Subtract(parent_loc, *sub_loc1, CSeq_loc::fSortAndMerge_All, NULL);
3929         p.second = sequence::Seq_loc_Subtract(parent_loc, *sub_loc2, CSeq_loc::fSortAndMerge_All, NULL);
3930 
3931         if(IsReverse(cds_loc.GetStrand())) {
3932             swap(p.first, p.second);
3933         }
3934         return p;
3935     }
3936 
3937     CVariationUtil::CVariantPropertiesIndex::TLocsPair
s_GetNeighborhoodLocs(const CSeq_loc & gene_loc,TSeqPos max_pos)3938         CVariationUtil::CVariantPropertiesIndex::s_GetNeighborhoodLocs(const CSeq_loc& gene_loc, TSeqPos max_pos)
3939     {
3940         TSeqPos flank1_len(2000), flank2_len(500);
3941         if(IsReverse(gene_loc.GetStrand())) {
3942             swap(flank1_len, flank2_len);
3943         }
3944 
3945         TLocsPair p;
3946         p.first = sequence::Seq_loc_Merge(gene_loc, CSeq_loc::fMerge_SingleRange, NULL);
3947         p.second.Reset(new CSeq_loc);
3948         p.second->Assign(*p.first);
3949 
3950         if(p.first->GetTotalRange().GetFrom() == 0) {
3951             p.first->SetNull();
3952         } else {
3953             p.first->SetInt().SetTo(p.first->GetTotalRange().GetFrom() - 1);
3954             p.first->SetInt().SetFrom(p.first->GetTotalRange().GetFrom() < flank1_len ? 0 : p.first->GetTotalRange().GetFrom() - flank1_len);
3955         }
3956 
3957         if(p.second->GetTotalRange().GetTo() == max_pos) {
3958             p.second->SetNull();
3959         } else {
3960             p.second->SetInt().SetFrom(p.second->GetTotalRange().GetTo() + 1);
3961             p.second->SetInt().SetTo(p.second->GetTotalRange().GetTo() > max_pos ? max_pos : p.second->GetTotalRange().GetTo() + flank2_len);
3962         }
3963 
3964         if(IsReverse(gene_loc.GetStrand())) {
3965             swap(p.first, p.second);
3966         }
3967         return p;
3968     }
3969 
3970 
3971     CVariationUtil::CVariantPropertiesIndex::TLocsPair
s_GetIntronsAndSpliceSiteLocs(const CSeq_loc & rna_loc)3972         CVariationUtil::CVariantPropertiesIndex::s_GetIntronsAndSpliceSiteLocs(const CSeq_loc& rna_loc)
3973     {
3974         CRef<CSeq_loc> range_loc = sequence::Seq_loc_Merge(rna_loc, CSeq_loc::fMerge_SingleRange, NULL);
3975 
3976         CRef<CSeq_loc> introns_loc_with_splice_sites = sequence::Seq_loc_Subtract(*range_loc, rna_loc, 0, NULL);
3977         CRef<CSeq_loc> introns_loc_without_splice_sites(new CSeq_loc);
3978         introns_loc_without_splice_sites->Assign(*introns_loc_with_splice_sites);
3979 
3980         //truncate each terminal by two bases from each end to exclude splice sites.
3981         for(CTypeIterator<CSeq_interval> it(Begin(*introns_loc_without_splice_sites)); it; ++it) {
3982             CSeq_interval& seqint = *it;
3983             if(seqint.GetLength() >= 5) {
3984                 seqint.SetFrom() += 2;
3985                 seqint.SetTo() -= 2;
3986             }
3987         }
3988 
3989         TLocsPair p;
3990         p.first = introns_loc_without_splice_sites;
3991         p.second = sequence::Seq_loc_Subtract(*introns_loc_with_splice_sites,
3992                                               *introns_loc_without_splice_sites,
3993                                               CSeq_loc::fSortAndMerge_All,
3994                                               NULL);
3995         return p;
3996     }
3997 
3998 
3999 
4000 
4001 
4002 
x_Index(const CSeq_id_Handle & idh)4003     void CVariationUtil::CCdregionIndex::x_Index(const CSeq_id_Handle& idh)
4004     {
4005         SAnnotSelector sel;
4006         sel.SetResolveAll(); //needs to work on NCs
4007         sel.SetAdaptiveDepth();
4008         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
4009         sel.IncludeFeatType(CSeqFeatData::e_Rna);
4010         CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh);
4011 
4012         m_data[idh].size(); //this will create m_data[idh] entry so that next time we don't try to reindex it if there are no cdregions
4013         m_seq_data_map[idh].mapper.Reset();
4014 
4015         CRef<CSeq_loc> all_rna_loc(new CSeq_loc(CSeq_loc::e_Null));
4016 
4017         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
4018             const CMappedFeat& mf = *ci;
4019             if(!mf.IsSetProduct() || !mf.GetProduct().GetId()) {
4020                 continue; //could be segment
4021             }
4022 
4023             if(mf.GetData().IsRna()) {
4024                 all_rna_loc->Add(mf.GetLocation());
4025             } else {
4026                 all_rna_loc->Add(mf.GetLocation()); //if on mRNA seq, there's no annotated mRNA feature - will just index cds
4027 
4028                 SCdregion s;
4029                 s.cdregion_feat.Reset(mf.GetSeq_feat());
4030                 for(CSeq_loc_CI ci(mf.GetLocation()); ci; ++ci) {
4031                     m_data[ci.GetSeq_id_Handle()][ci.GetRange()].push_back(s);
4032                 }
4033 
4034 #if 0
4035                 //could possibly cache product sequence as well
4036                 if(m_options && CVariationUtil::fOpt_cache_exon_sequence) {
4037                     x_CacheSeqData(mf.GetProduct(), CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId()));
4038                 }
4039 #endif
4040             }
4041         }
4042 
4043         all_rna_loc = sequence::Seq_loc_Merge(*all_rna_loc, CSeq_loc::fSortAndMerge_All, NULL);
4044 
4045         if(m_options && CVariationUtil::fOpt_cache_exon_sequence) {
4046             x_CacheSeqData(*all_rna_loc, idh);
4047         }
4048     }
4049 
x_CacheSeqData(const CSeq_loc & loc,const CSeq_id_Handle & idh)4050     void CVariationUtil::CCdregionIndex::x_CacheSeqData(const CSeq_loc& loc, const CSeq_id_Handle& idh)
4051     {
4052         CSeq_id_Handle idh2 = sequence::GetId(idh, *m_scope, sequence::eGetId_Canonical);
4053         SSeqData& d = m_seq_data_map[idh2];
4054 
4055         if(loc.IsNull() || loc.IsEmpty()) {
4056             return;
4057         }
4058 
4059         CRef<CSeq_loc> loc2(new CSeq_loc);
4060         loc2->Assign(loc);
4061 
4062         CSeqVector seqv;
4063         if(loc.IsWhole()) {
4064             CBioseq_Handle bsh = m_scope->GetBioseqHandle(idh);
4065             loc2 = bsh.GetRangeSeq_loc(0, bsh.GetInst_Length() - 1, eNa_strand_plus);
4066             seqv = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
4067         } else {
4068             seqv = CSeqVector(*loc2, *m_scope, CBioseq_Handle::eCoding_Iupac);
4069         }
4070 
4071         seqv.GetSeqData(seqv.begin(), seqv.end(), d.seq_data);
4072 
4073         if(d.seq_data.size() != sequence::GetLength(*loc2, m_scope) /*could be whole, so need scope*/) {
4074             //expecting the length of returned sequence to be exactly the same as location;
4075             //otherwise can't associate sub-location with position in returned sequence
4076             d.seq_data = "";
4077             d.mapper.Reset();
4078             return;
4079         }
4080 
4081         CRef<CSeq_loc> target_loc(new CSeq_loc);
4082         target_loc->SetInt().SetId().SetLocal().SetStr("all_cds");
4083         target_loc->SetInt().SetFrom(0);
4084         target_loc->SetInt().SetTo(d.seq_data.size() - 1);
4085         target_loc->SetInt().SetStrand(eNa_strand_plus);
4086 
4087         d.mapper.Reset(new CSeq_loc_Mapper(*loc2, *target_loc, NULL));
4088     }
4089 
GetCachedLiteralAtLoc(const CSeq_loc & loc)4090     CRef<CSeq_literal> CVariationUtil::CCdregionIndex::GetCachedLiteralAtLoc(const CSeq_loc& loc)
4091     {
4092         CRef<CSeq_literal> literal(new CSeq_literal);
4093         literal->SetSeq_data().SetIupacna().Set("");
4094 
4095 
4096         CRef<CSeq_loc> loc2(new CSeq_loc);
4097         loc2->Assign(loc);
4098         ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope);
4099 
4100         for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
4101             if(m_seq_data_map.find(ci.GetSeq_id_Handle()) == m_seq_data_map.end()) {
4102                 return CRef<CSeq_literal>(NULL);
4103             }
4104             const SSeqData& d = m_seq_data_map.find(ci.GetSeq_id_Handle())->second;
4105             if(!d.mapper) {
4106                 return CRef<CSeq_literal>(NULL);
4107             }
4108 
4109             CRef<CSeq_loc> mapped_loc = d.mapper->Map(*ci.GetRangeAsSeq_loc());
4110 
4111             string s = "";
4112             mapped_loc->GetLabel(&s);
4113             if((!mapped_loc->IsInt() && !mapped_loc->IsPnt()) //split mapping
4114                || mapped_loc->GetTotalRange().GetLength() != ci.GetRange().GetLength()
4115                || mapped_loc->GetStrand() != eNa_strand_plus) //todo: can accomodate minus by reverse-complementing the chunk
4116             {
4117                 return CRef<CSeq_literal>(NULL);
4118             }
4119 
4120             string seq_chunk = d.seq_data.substr(mapped_loc->GetStart(eExtreme_Positional), mapped_loc->GetTotalRange().GetLength());
4121             literal->SetSeq_data().SetIupacna().Set() += seq_chunk;
4122         }
4123 
4124         literal->SetLength(literal->GetSeq_data().GetIupacna().Get().size());
4125         return literal;
4126     }
4127 
Get(const CSeq_loc & loc,TCdregions & cdregions)4128     void CVariationUtil::CCdregionIndex::Get(const CSeq_loc& loc, TCdregions& cdregions)
4129     {
4130         CRef<CSeq_loc> loc2(new CSeq_loc);
4131         loc2->Assign(loc);
4132         ChangeIdsInPlace(*loc2, sequence::eGetId_Canonical, *m_scope);
4133 
4134         for(CSeq_loc_CI ci(*loc2); ci; ++ci) {
4135             CSeq_id_Handle idh = ci.GetSeq_id_Handle();
4136 
4137             if(m_data.find(idh) == m_data.end()) {
4138                 //first time seeing this seq-id - index it
4139                 x_Index(idh);
4140             }
4141 
4142             TIdRangeMap::const_iterator it = m_data.find(idh);
4143             if(it == m_data.end()) {
4144                 return;
4145             }
4146             const TRangeMap& rm = it->second;
4147 
4148             set<SCdregion> results;
4149             for(TRangeMap::const_iterator it2 = rm.begin(ci.GetRange()); it2.Valid(); ++it2) {
4150                 ITERATE(TCdregions, it3, it2->second)
4151                 {
4152                     results.insert(*it3);
4153                 }
4154             }
4155             ITERATE(set<SCdregion>, it, results)
4156             {
4157                 cdregions.push_back(*it);
4158             }
4159         }
4160     }
4161 
4162 
4163 
4164     //
4165     //
4166     //  Methods to suuport Variation / Variation_ref conversions.
4167     //
4168     //
4169 
4170     //Make a copy of the child and inherit relevant (SNP-5716) fields from parent that are not set in the child
InheritParentAttributes(const CVariation & child,const CVariation & parent)4171     CRef<CVariation> InheritParentAttributes(const CVariation& child, const CVariation& parent)
4172     {
4173         CRef<CVariation> v(SerialClone(child));
4174 
4175         if(!v->IsSetId() && parent.IsSetId()) {
4176             v->SetId().Assign(parent.GetId());
4177         }
4178 
4179         if(!v->IsSetParent_id() && parent.IsSetParent_id()) {
4180             v->SetParent_id().Assign(parent.GetParent_id());
4181         }
4182 
4183         if(!v->IsSetSample_id() && parent.IsSetSample_id()) {
4184             ITERATE(CVariation::TSample_id, it, parent.GetSample_id())
4185             {
4186                 v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(**it)));
4187             }
4188         }
4189 
4190         if(!v->IsSetOther_ids() && parent.IsSetOther_ids()) {
4191             ITERATE(CVariation::TOther_ids, it, parent.GetOther_ids())
4192             {
4193                 v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4194             }
4195         }
4196 
4197         return v;
4198     }
4199 
AsVariation_feats(const CVariation & v,CSeq_annot::TData::TFtable & out_feats)4200     void CVariationUtil::AsVariation_feats(const CVariation& v, CSeq_annot::TData::TFtable& out_feats)
4201     {
4202         CSeq_annot::TData::TFtable feats;
4203         if(v.IsSetPlacements()) {
4204             ITERATE(CVariation::TPlacements, it, v.GetPlacements())
4205             {
4206                 const CVariantPlacement& p = **it;
4207                 CRef<CVariation_ref> vr = x_AsVariation_ref(v, p);
4208                 CRef<CSeq_feat> feat(new CSeq_feat);
4209                 feat->SetLocation().Assign(p.GetLoc());
4210                 feat->SetData().SetVariation(*vr);
4211                 feats.push_back(feat);
4212             }
4213         } else if(v.GetData().IsSet()) {
4214             ITERATE(CVariation::TData::TSet::TVariations, it, v.GetData().GetSet().GetVariations())
4215             {
4216                 AsVariation_feats(*InheritParentAttributes(**it, v), feats);
4217             }
4218         }
4219 
4220         NON_CONST_ITERATE(CSeq_annot::TData::TFtable, it, feats)
4221         {
4222             CSeq_feat& feat = **it;
4223             if(v.IsSetPub() && !feat.IsSetCit()) {
4224                 feat.SetCit().Assign(v.GetPub());
4225             }
4226             if(v.IsSetExt()) {
4227                 ITERATE(CVariation::TExt, it2, v.GetExt())
4228                 {
4229                     CRef<CUser_object> uo(new CUser_object);
4230                     uo->Assign(**it2);
4231                     feat.SetExts().push_back(uo);
4232                 }
4233             }
4234         }
4235 
4236         //note: we can't attach user-objects to out_feats directly because it will result in duplicates
4237         //when out_feats recurses into next subvariation. Hence we are working with our own ftable of
4238         //feats, and attach the results in the end.
4239         out_feats.insert(out_feats.end(), feats.begin(), feats.end());
4240     }
4241 
x_AsVariation_ref(const CVariation & v,const CVariantPlacement & p)4242     CRef<CVariation_ref> CVariationUtil::x_AsVariation_ref(const CVariation& v, const CVariantPlacement& p)
4243     {
4244         CRef<CVariation_ref> vr(new CVariation_ref);
4245 
4246         if(v.IsSetId()) {
4247             vr->SetId().Assign(v.GetId());
4248         }
4249 
4250         if(v.IsSetParent_id()) {
4251             vr->SetParent_id().Assign(v.GetParent_id());
4252         }
4253 
4254         if(v.IsSetSample_id() && v.GetSample_id().size() > 0) {
4255             vr->SetSample_id().Assign(*v.GetSample_id().front());
4256         }
4257 
4258         if(v.IsSetOther_ids()) {
4259             ITERATE(CVariation::TOther_ids, it, v.GetOther_ids())
4260             {
4261                 vr->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4262             }
4263         }
4264 
4265         if(v.IsSetName()) {
4266             vr->SetName(v.GetName());
4267         }
4268 
4269         if(v.IsSetSynonyms()) {
4270             vr->SetSynonyms() = v.GetSynonyms();
4271         }
4272 
4273         if(v.IsSetDescription()) {
4274             vr->SetName(v.GetDescription());
4275         }
4276 
4277         if(v.IsSetPhenotype()) {
4278             ITERATE(CVariation::TPhenotype, it, v.GetPhenotype())
4279             {
4280                 CRef<CPhenotype> p(new CPhenotype);
4281                 p->Assign(**it);
4282                 vr->SetPhenotype().push_back(p);
4283             }
4284         }
4285 
4286         if(v.IsSetMethod()) {
4287             vr->SetMethod() = v.GetMethod().GetMethod();
4288         }
4289 
4290         if(v.IsSetVariant_prop()) {
4291             vr->SetVariant_prop().Assign(v.GetVariant_prop());
4292         }
4293 
4294         //Note: Variation.frameshift <-> Variation-ref.consequence[].frameshift
4295         if(v.IsSetFrameshift()) {
4296             CVariation_ref::TConsequence::value_type fr_cons(
4297                 new CVariation_ref::TConsequence::value_type::TObjectType);
4298             vr->SetConsequence().push_back(fr_cons);
4299             fr_cons->SetFrameshift();
4300             if(v.GetFrameshift().IsSetPhase()) {
4301                 fr_cons->SetFrameshift().SetPhase(v.GetFrameshift().GetPhase());
4302             }
4303             if(v.GetFrameshift().IsSetX_length()) {
4304                 fr_cons->SetFrameshift().SetX_length(v.GetFrameshift().GetX_length());
4305             }
4306         }
4307 
4308         if(v.GetData().IsComplex()) {
4309             vr->SetData().SetComplex();
4310         } else if(v.GetData().IsInstance()) {
4311             vr->SetData().SetInstance().Assign(v.GetData().GetInstance());
4312             s_AddInstOffsetsFromPlacementOffsets(vr->SetData().SetInstance(), p);
4313         } else if(v.GetData().IsNote()) {
4314             vr->SetData().SetNote() = v.GetData().GetNote();
4315         } else if(v.GetData().IsUniparental_disomy()) {
4316             vr->SetData().SetUniparental_disomy();
4317         } else if(v.GetData().IsUnknown()) {
4318             vr->SetData().SetUnknown();
4319         } else if(v.GetData().IsSet()) {
4320             const CVariation::TData::TSet& v_set = v.GetData().GetSet();
4321             CVariation_ref::TData::TSet& vr_set = vr->SetData().SetSet();
4322             vr_set.SetType(v_set.GetType());
4323             if(v_set.IsSetName()) {
4324                 vr_set.SetName(v_set.GetName());
4325             }
4326             ITERATE(CVariation::TData::TSet::TVariations, it, v_set.GetVariations())
4327             {
4328                 vr_set.SetVariations().push_back(x_AsVariation_ref(**it, p));
4329             }
4330         } else {
4331             NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice");
4332         }
4333 
4334         if(v.IsSetConsequence()) {
4335             vr->SetConsequence();
4336             ITERATE(CVariation::TConsequence, it, v.GetConsequence())
4337             {
4338                 const CVariation::TConsequence::value_type::TObjectType& v_cons = **it;
4339                 CVariation_ref::TConsequence::value_type vr_cons(
4340                     new CVariation_ref::TConsequence::value_type::TObjectType);
4341                 vr->SetConsequence().push_back(vr_cons);
4342                 vr_cons->SetUnknown();
4343 
4344                 if(v_cons.IsSplicing()) {
4345                     vr_cons->SetSplicing();
4346                 } else if(v_cons.IsNote()) {
4347                     vr_cons->SetNote(v_cons.GetNote());
4348                 } else if(v_cons.IsVariation()) {
4349                     CRef<CVariation_ref> cons_variation = x_AsVariation_ref(v_cons.GetVariation(), p);
4350                     vr_cons->SetVariation(*cons_variation);
4351                 } else if(v_cons.IsLoss_of_heterozygosity()) {
4352                     vr_cons->SetLoss_of_heterozygosity();
4353                     if(v_cons.GetLoss_of_heterozygosity().IsSetReference()) {
4354                         vr_cons->SetLoss_of_heterozygosity().SetReference(
4355                             v_cons.GetLoss_of_heterozygosity().GetReference());
4356                     }
4357                     if(v_cons.GetLoss_of_heterozygosity().IsSetTest()) {
4358                         vr_cons->SetLoss_of_heterozygosity().SetTest(
4359                             v_cons.GetLoss_of_heterozygosity().GetTest());
4360                     }
4361                 }
4362             }
4363         }
4364 
4365         if(v.IsSetSomatic_origin()) {
4366             vr->SetSomatic_origin();
4367             ITERATE(CVariation::TSomatic_origin, it, v.GetSomatic_origin())
4368             {
4369                 const CVariation::TSomatic_origin::value_type::TObjectType& v_so = **it;
4370 
4371                 CVariation_ref::TSomatic_origin::value_type vr_so(
4372                     new CVariation_ref::TSomatic_origin::value_type::TObjectType);
4373 
4374                 if(v_so.IsSetSource()) {
4375                     vr_so->SetSource().Assign(v_so.GetSource());
4376                 }
4377 
4378                 if(v_so.IsSetCondition()) {
4379                     vr_so->SetCondition();
4380                     if(v_so.GetCondition().IsSetDescription()) {
4381                         vr_so->SetCondition().SetDescription(
4382                             v_so.GetCondition().GetDescription());
4383                     }
4384                     if(v_so.GetCondition().IsSetObject_id()) {
4385                         vr_so->SetCondition().SetObject_id();
4386                         ITERATE(CVariation::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id,
4387                                 it,
4388                                 v_so.GetCondition().GetObject_id())
4389                         {
4390                             CRef<CDbtag> dbtag(new CDbtag);
4391                             dbtag->Assign(**it);
4392                             vr_so->SetCondition().SetObject_id().push_back(dbtag);
4393                         }
4394                     }
4395                 }
4396 
4397                 vr->SetSomatic_origin().push_back(vr_so);
4398             }
4399         }
4400 
4401 
4402         return vr;
4403     }
4404 
CreateDeltaForOffset(int offset,const CInt_fuzz * fuzz)4405     static CRef<CDelta_item> CreateDeltaForOffset(int offset, const CInt_fuzz* fuzz)
4406     {
4407         CRef<CDelta_item> delta(new CDelta_item);
4408         delta->SetAction(CDelta_item::eAction_offset);
4409         delta->SetSeq().SetLiteral().SetLength(abs(offset));
4410         if(offset < 0) {
4411             delta->SetMultiplier(-1);
4412         }
4413         if(fuzz) {
4414             delta->SetSeq().SetLiteral().SetFuzz().Assign(*fuzz);
4415         }
4416         return delta;
4417     }
4418 
s_AddInstOffsetsFromPlacementOffsets(CVariation_inst & vi,const CVariantPlacement & p)4419     void CVariationUtil::s_AddInstOffsetsFromPlacementOffsets(
4420         CVariation_inst& vi,
4421         const CVariantPlacement& p)
4422     {
4423         if(p.IsSetStart_offset()) {
4424             vi.SetDelta().push_front(CreateDeltaForOffset(
4425                 p.GetStart_offset(),
4426                 p.IsSetStart_offset_fuzz() ? &p.GetStart_offset_fuzz() : NULL));
4427         }
4428         if(p.IsSetStop_offset()) {
4429             vi.SetDelta().push_back(CreateDeltaForOffset(
4430                 p.GetStop_offset(),
4431                 p.IsSetStop_offset_fuzz() ? &p.GetStop_offset_fuzz() : NULL));
4432         }
4433     }
4434 
4435 
AsVariation(const CSeq_feat & variation_feat)4436     CRef<CVariation> CVariationUtil::AsVariation(const CSeq_feat& variation_feat)
4437     {
4438         if(!variation_feat.GetData().IsVariation()) {
4439             NCBI_THROW(CException, eUnknown, "Expected variation-ref feature");
4440         }
4441 
4442         CRef<CVariation> v = x_AsVariation(variation_feat.GetData().GetVariation());
4443 
4444         CRef<CVariantPlacement> p(new CVariantPlacement);
4445         p->SetLoc().Assign(variation_feat.GetLocation());
4446         if(p->GetLoc().GetId()) {
4447             p->SetMol(GetMolType(sequence::GetId(p->GetLoc(), NULL)));
4448         } else {
4449             p->SetMol(CVariantPlacement::eMol_unknown);
4450         }
4451         v->SetPlacements().push_back(p);
4452 
4453 
4454         if(variation_feat.IsSetCit()) {
4455             v->SetPub().Assign(variation_feat.GetCit());
4456         }
4457         if(variation_feat.IsSetExt()) {
4458             v->SetExt();
4459             CRef<CUser_object> uo(new CUser_object);
4460             uo->Assign(variation_feat.GetExt());
4461             v->SetExt().push_back(uo);
4462         }
4463         if(variation_feat.IsSetExts()) {
4464             v->SetExt();
4465             ITERATE(CSeq_feat::TExts, it, variation_feat.GetExts())
4466             {
4467                 CRef<CUser_object> uo(new CUser_object);
4468                 uo->Assign(**it);
4469                 v->SetExt().push_back(uo);
4470             }
4471         }
4472 
4473         s_ConvertInstOffsetsToPlacementOffsets(*v, *p);
4474         return v;
4475     }
4476 
x_AsVariation(const CVariation_ref & vr)4477     CRef<CVariation> CVariationUtil::x_AsVariation(const CVariation_ref& vr)
4478     {
4479         CRef<CVariation> v(new CVariation);
4480         if(vr.IsSetId()) {
4481             v->SetId().Assign(vr.GetId());
4482         }
4483 
4484         if(vr.IsSetParent_id()) {
4485             v->SetParent_id().Assign(vr.GetParent_id());
4486         }
4487 
4488         if(vr.IsSetSample_id()) {
4489             v->SetSample_id().push_back(CRef<CObject_id>(SerialClone(vr.GetSample_id())));
4490         }
4491 
4492         if(vr.IsSetOther_ids()) {
4493             ITERATE(CVariation_ref::TOther_ids, it, vr.GetOther_ids())
4494             {
4495                 v->SetOther_ids().push_back(CRef<CDbtag>(SerialClone(**it)));
4496             }
4497         }
4498 
4499         if(vr.IsSetName()) {
4500             v->SetName(vr.GetName());
4501         }
4502 
4503         if(vr.IsSetSynonyms()) {
4504             v->SetSynonyms() = vr.GetSynonyms();
4505         }
4506 
4507         if(vr.IsSetDescription()) {
4508             v->SetName(vr.GetDescription());
4509         }
4510 
4511         if(vr.IsSetPhenotype()) {
4512             ITERATE(CVariation_ref::TPhenotype, it, vr.GetPhenotype())
4513             {
4514                 CRef<CPhenotype> p(new CPhenotype);
4515                 p->Assign(**it);
4516                 v->SetPhenotype().push_back(p);
4517             }
4518         }
4519 
4520         if(vr.IsSetMethod()) {
4521             v->SetMethod().SetMethod() = vr.GetMethod();
4522         }
4523 
4524         if(vr.IsSetVariant_prop()) {
4525             v->SetVariant_prop().Assign(vr.GetVariant_prop());
4526         }
4527 
4528         if(vr.GetData().IsComplex()) {
4529             v->SetData().SetComplex();
4530         } else if(vr.GetData().IsInstance()) {
4531             v->SetData().SetInstance().Assign(vr.GetData().GetInstance());
4532         } else if(vr.GetData().IsNote()) {
4533             v->SetData().SetNote() = vr.GetData().GetNote();
4534         } else if(vr.GetData().IsUniparental_disomy()) {
4535             v->SetData().SetUniparental_disomy();
4536         } else if(vr.GetData().IsUnknown()) {
4537             v->SetData().SetUnknown();
4538         } else if(vr.GetData().IsSet()) {
4539             const CVariation_ref::TData::TSet& vr_set = vr.GetData().GetSet();
4540             CVariation::TData::TSet& v_set = v->SetData().SetSet();
4541             v_set.SetType(vr_set.GetType());
4542             if(vr_set.IsSetName()) {
4543                 v_set.SetName(vr_set.GetName());
4544             }
4545             ITERATE(CVariation_ref::TData::TSet::TVariations, it, vr_set.GetVariations())
4546             {
4547                 v_set.SetVariations().push_back(x_AsVariation(**it));
4548             }
4549         } else {
4550             NCBI_THROW(CException, eUnknown, "Unhandled Variation_ref::TData::E_CChoice");
4551         }
4552 
4553         if(vr.IsSetConsequence()) {
4554             v->SetConsequence();
4555             ITERATE(CVariation_ref::TConsequence, it, vr.GetConsequence())
4556             {
4557                 const CVariation_ref::TConsequence::value_type::TObjectType& vr_cons = **it;
4558 
4559                 if(vr_cons.IsFrameshift()) {
4560                     // frameshift is represented in consequence in Variation-ref, and
4561                     // directly as an attribute in Variation.
4562                     CVariation& cons_variation = *v;
4563                     cons_variation.SetFrameshift();
4564                     if(vr_cons.GetFrameshift().IsSetPhase()) {
4565                         cons_variation.SetFrameshift().SetPhase(vr_cons.GetFrameshift().GetPhase());
4566                     }
4567                     if(vr_cons.GetFrameshift().IsSetX_length()) {
4568                         cons_variation.SetFrameshift().SetX_length(vr_cons.GetFrameshift().GetX_length());
4569                     }
4570                     continue;
4571                 }
4572 
4573                 CVariation::TConsequence::value_type v_cons(new CVariation::TConsequence::value_type::TObjectType);
4574 
4575                 if(vr_cons.IsUnknown()) {
4576                     v_cons->SetUnknown();
4577                 } else if(vr_cons.IsSplicing()) {
4578                     v_cons->SetSplicing();
4579                 } else if(vr_cons.IsNote()) {
4580                     v_cons->SetNote(vr_cons.GetNote());
4581                 } else if(vr_cons.IsVariation()) {
4582                     CRef<CVariation> cons_variation = x_AsVariation(vr_cons.GetVariation());
4583                     v_cons->SetVariation(*cons_variation);
4584                 } else if(vr_cons.IsLoss_of_heterozygosity()) {
4585                     v_cons->SetLoss_of_heterozygosity();
4586                     if(vr_cons.GetLoss_of_heterozygosity().IsSetReference()) {
4587                         v_cons->SetLoss_of_heterozygosity().SetReference(vr_cons.GetLoss_of_heterozygosity().GetReference());
4588                     }
4589                     if(vr_cons.GetLoss_of_heterozygosity().IsSetTest()) {
4590                         v_cons->SetLoss_of_heterozygosity().SetTest(vr_cons.GetLoss_of_heterozygosity().GetTest());
4591                     }
4592                 }
4593 
4594                 v->SetConsequence().push_back(v_cons);
4595             }
4596             if(v->GetConsequence().empty()) {
4597                 v->ResetConsequence();
4598             }
4599         }
4600 
4601         if(vr.IsSetSomatic_origin()) {
4602             v->SetSomatic_origin();
4603             ITERATE(CVariation_ref::TSomatic_origin, it, vr.GetSomatic_origin())
4604             {
4605                 const CVariation_ref::TSomatic_origin::value_type::TObjectType& vr_so = **it;
4606                 CVariation::TSomatic_origin::value_type v_so(new CVariation::TSomatic_origin::value_type::TObjectType);
4607 
4608                 if(vr_so.IsSetSource()) {
4609                     v_so->SetSource().Assign(vr_so.GetSource());
4610                 }
4611 
4612                 if(vr_so.IsSetCondition()) {
4613                     v_so->SetCondition();
4614                     if(vr_so.GetCondition().IsSetDescription()) {
4615                         v_so->SetCondition().SetDescription(vr_so.GetCondition().GetDescription());
4616                     }
4617                     if(vr_so.GetCondition().IsSetObject_id()) {
4618                         v_so->SetCondition().SetObject_id();
4619                         ITERATE(CVariation_ref::TSomatic_origin::value_type::TObjectType::TCondition::TObject_id,
4620                                 it,
4621                                 vr_so.GetCondition().GetObject_id())
4622                         {
4623                             CRef<CDbtag> dbtag(new CDbtag);
4624                             dbtag->Assign(**it);
4625                             v_so->SetCondition().SetObject_id().push_back(dbtag);
4626                         }
4627                     }
4628                 }
4629 
4630                 v->SetSomatic_origin().push_back(v_so);
4631             }
4632         }
4633 
4634         return v;
4635     }
4636 
4637 
GetSignedOffset(const CDelta_item & delta)4638     static int GetSignedOffset(const CDelta_item& delta)
4639     {
4640         return delta.GetSeq().GetLiteral().GetLength()
4641             * (delta.IsSetMultiplier() ? delta.GetMultiplier() : 1);
4642     }
4643 
GetFuzz(const CDelta_item & delta)4644     static const CInt_fuzz* GetFuzz(const CDelta_item& delta)
4645     {
4646         return delta.GetSeq().GetLiteral().IsSetFuzz() ?
4647             &delta.GetSeq().GetLiteral().GetFuzz() : NULL;
4648     }
4649 
s_ConvertInstOffsetsToPlacementOffsets(CVariation & v,CVariantPlacement & p)4650     void CVariationUtil::s_ConvertInstOffsetsToPlacementOffsets(CVariation& v, CVariantPlacement& p)
4651     {
4652         if(v.GetData().IsSet()) {
4653             NON_CONST_ITERATE(CVariation::TData::TSet::TVariations,
4654                               it,
4655                               v.SetData().SetSet().SetVariations())
4656             {
4657                 s_ConvertInstOffsetsToPlacementOffsets(**it, p);
4658             }
4659         } else if(v.GetData().IsInstance() && v.GetData().GetInstance().GetDelta().size() > 0) {
4660             CConstRef<CDelta_item> delta_first = v.GetData().GetInstance().GetDelta().front();
4661             CConstRef<CDelta_item> delta_last = v.GetData().GetInstance().GetDelta().back();
4662 
4663             if(delta_first->IsSetAction()
4664                && delta_first->GetAction() == CDelta_item::eAction_offset) {
4665                 p.SetStart_offset(GetSignedOffset(*delta_first));
4666                 const CInt_fuzz* fuzz = GetFuzz(*delta_first);
4667                 if(fuzz) {
4668                     p.SetStart_offset_fuzz().Assign(*fuzz);
4669                 }
4670 
4671                 v.SetData().SetInstance().SetDelta().pop_front();
4672             }
4673 
4674             if(delta_last != delta_first
4675                && delta_last->IsSetAction()
4676                && delta_last->GetAction() == CDelta_item::eAction_offset) {
4677                 p.SetStop_offset(GetSignedOffset(*delta_last));
4678                 const CInt_fuzz* fuzz = GetFuzz(*delta_last);
4679                 if(fuzz) {
4680                     p.SetStop_offset_fuzz().Assign(*fuzz);
4681                 }
4682 
4683                 v.SetData().SetInstance().SetDelta().pop_back();
4684             }
4685         }
4686     }
4687 
4688 
4689 
4690 #if 0
4691     CVariationUtil::ETestStatus CVariationUtil::CheckAssertedAllele(
4692         const CSeq_feat& variation_feat,
4693         string* asserted_out,
4694         string* actual_out)
4695     {
4696         return eNotApplicable;
4697         if(!variation_feat.GetData().IsVariation()) {
4698             return eNotApplicable;
4699         }
4700 
4701         CVariation_ref vr;
4702         vr.Assign(variation_feat.GetData().GetVariation());
4703         if(!vr.IsSetLocation()) {
4704             vr.SetLocation().Assign(variation_feat.GetLocation());
4705         }
4706         s_PropagateLocsInPlace(vr);
4707 
4708 
4709         bool have_asserted_seq = false;
4710         bool is_ok = true;
4711         for(CTypeIterator<CVariation_ref> it1(Begin(vr)); it1; ++it1) {
4712             const CVariation_ref& vr1 = *it1;
4713             if(vr1.GetData().IsInstance()
4714                && vr1.GetData().GetInstance().IsSetObservation()
4715                && vr1.GetData().GetInstance().GetObservation() == CVariation_inst::eObservation_asserted)
4716             {
4717                 string asserted_seq;
4718                 const CSeq_literal& literal = vr1.GetData().GetInstance().GetDelta().front()->GetSeq().GetLiteral();
4719                 if(literal.GetSeq_data().IsIupacna()) {
4720                     asserted_seq = literal.GetSeq_data().GetIupacna();
4721                     have_asserted_seq = true;
4722                 } else if(literal.GetSeq_data().IsNcbieaa()) {
4723                     asserted_seq = literal.GetSeq_data().GetNcbieaa();
4724                     have_asserted_seq = true;
4725                 }
4726 
4727                 //an asserted sequnece may be of the form "A..BC", where ".." is to be interpreted as a
4728                 //gap of arbitrary length - we need to match prefix and suffix separately
4729                 string prefix, suffix;
4730                 string str_tmp = NStr::Replace(asserted_seq, "..", "\t"); //SplitInTwo's delimiter must be single-character
4731                 NStr::SplitInTwo(str_tmp, "\t", prefix, suffix);
4732 
4733                 CSeqVector v(vr1.GetLocation(), *m_scope, CBioseq_Handle::eCoding_Iupac);
4734                 string actual_seq;
4735                 v.GetSeqData(v.begin(), v.end(), actual_seq);
4736 
4737                 if(   prefix.size() > 0 && !NStr::StartsWith(actual_seq, prefix)
4738                    || suffix.size() > 0 && !NStr::EndsWith(actual_seq, suffix))
4739                 {
4740                     is_ok = false;
4741                     if(asserted_out) {
4742                         *asserted_out = asserted_seq;
4743                     }
4744                     if(actual_out) {
4745                         *actual_out = actual_seq;
4746                     }
4747                     break;
4748                 }
4749             }
4750         }
4751 
4752         return !have_asserted_seq ? eNotApplicable : is_ok ? ePass : eFail;
4753     }
4754 #endif
4755 
4756 
4757 };
4758 
4759 END_NCBI_SCOPE
4760 
4761