1 /*  $Id: hgvs_parser2.cpp 612015 2020-07-14 17:21:57Z villamar $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * File Description:
27  *   Sample library
28  *
29  */
30 
31 #include <ncbi_pch.hpp>
32 
33 
34 #include <serial/iterator.hpp>
35 
36 #include <objects/seqalign/Seq_align.hpp>
37 #include <objects/seqalign/Spliced_seg.hpp>
38 #include <objects/seqalign/Spliced_exon.hpp>
39 #include <objects/seqalign/Product_pos.hpp>
40 #include <objects/seqalign/Prot_pos.hpp>
41 
42 #include <objects/variation/Variation.hpp>
43 #include <objects/variation/VariantPlacement.hpp>
44 #include <objects/variation/VariationMethod.hpp>
45 #include <objects/variation/VariationException.hpp>
46 
47 #include <objects/seqfeat/Seq_feat.hpp>
48 #include <objects/seqfeat/Variation_inst.hpp>
49 #include <objects/seqfeat/Delta_item.hpp>
50 #include <objects/seqfeat/Ext_loc.hpp>
51 #include <objects/seqfeat/BioSource.hpp>
52 #include <objects/seqfeat/SeqFeatXref.hpp>
53 
54 #include <objects/seq/seqport_util.hpp>
55 #include <objects/seq/Seq_literal.hpp>
56 #include <objects/seq/Seq_data.hpp>
57 #include <objects/seq/Numbering.hpp>
58 #include <objects/seq/Num_ref.hpp>
59 #include <objects/seq/Annot_descr.hpp>
60 #include <objects/seq/Annotdesc.hpp>
61 #include <objects/seq/Seq_descr.hpp>
62 
63 #include <objects/general/Object_id.hpp>
64 #include <objects/general/User_object.hpp>
65 #include <objects/general/Dbtag.hpp>
66 
67 #include <objects/seqloc/Seq_point.hpp>
68 #include <objects/seqloc/Seq_loc_equiv.hpp>
69 
70 #include <objmgr/util/sequence.hpp>
71 #include <objmgr/seq_vector.hpp>
72 #include <objmgr/align_ci.hpp>
73 #include <objmgr/feat_ci.hpp>
74 #include <objmgr/seq_loc_mapper.hpp>
75 
76 #include <misc/hgvs/hgvs_parser2.hpp>
77 #include <misc/hgvs/variation_util2.hpp>
78 
79 
80 BEGIN_NCBI_SCOPE
81 
82 namespace variation {
83 
84 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)
85 
86 #define HGVS_ASSERT_RULE(i, rule_id) \
87     if((i->value.id()) != (SGrammar::rule_id))             \
88     {HGVS_THROW(eGrammatic, "Unexpected rule " + CHgvsParser::SGrammar::s_GetRuleName(i->value.id()) ); }
89 
90 
91 CSafeStatic<CHgvsParser::SGrammar> CHgvsParser::s_grammar;
92 
93 const char* CHgvsParser::SGrammar::s_rule_names[CHgvsParser::SGrammar::eNodeIds_SIZE] =
94 {
95     "NONE",
96     "root",
97     "list_delimiter",
98     "list1a",
99     "list2a",
100     "list3a",
101     "list1b",
102     "list2b",
103     "list3b",
104     "expr1",
105     "expr2",
106     "expr3",
107     "translocation",
108     "header",
109     "seq_id",
110     "mut_list",
111     "mut_ref",
112     "mol",
113     "int_fuzz",
114     "abs_pos",
115     "general_pos",
116     "fuzzy_pos",
117     "pos_spec",
118     "location",
119     "nuc_range",
120     "prot_range",
121     "mut_inst",
122     "raw_seq",
123     "raw_seq_or_len",
124     "aminoacid1",
125     "aminoacid2",
126     "aminoacid3",
127     "nuc_subst",
128     "deletion",
129     "insertion",
130     "delins",
131     "duplication",
132     "nuc_inv",
133     "ssr",
134     "conversion",
135     "seq_loc",
136     "seq_ref",
137     "prot_pos",
138     "prot_fs",
139     "prot_missense",
140     "prot_ext",
141     "no_change"
142 };
143 
144 
SetFirstPlacement(CVariation & v)145 CVariantPlacement& SetFirstPlacement(CVariation& v)
146 {
147     if(v.SetPlacements().size() == 0) {
148         CRef<CVariantPlacement> p(new CVariantPlacement);
149         v.SetPlacements().push_back(p);
150     }
151     return *v.SetPlacements().front();
152 }
153 
SetComputational(CVariation & variation)154 void SetComputational(CVariation& variation)
155 {
156     CVariationMethod& m = variation.SetMethod();
157     m.SetMethod();
158 
159     if(find(m.GetMethod().begin(),
160             m.GetMethod().end(),
161             CVariationMethod::eMethod_E_computational) == m.GetMethod().end())
162     {
163         m.SetMethod().push_back(CVariationMethod::eMethod_E_computational);
164     }
165 }
166 
167 
SeqsMatch(const string & query,const char * text)168 bool SeqsMatch(const string& query, const char* text)
169 {
170     static const char* iupac_bases = ".TGKCYSBAWRDMHVN"; //position of the iupac literal = 4-bit mask for A|C|G|T
171     for(size_t i = 0; i < query.size(); i++) {
172         size_t a = CTempString(iupac_bases).find(query[i]);
173         size_t b = CTempString(iupac_bases).find(text[i]);
174         if(!(a & b)) {
175             return false;
176         }
177     }
178     return true;
179 }
180 
181 
FindSSRLoc(const CSeq_loc & loc,const string & seq,CScope & scope)182 CRef<CSeq_loc> FindSSRLoc(const CSeq_loc& loc, const string& seq, CScope& scope)
183 {
184     //Extend the loc 10kb up and down; Find all occurences of seq in the resulting
185     //interval, create locs for individual repeat units; then merge them, and keep the interval that
186     //overlaps the original.
187 
188     if (loc.GetStart(eExtreme_Positional) > loc.GetStop(eExtreme_Positional)) {
189         HGVS_THROW(eSemantic, "Invalid range: start position is bigger than stop position");
190     }
191     const TSeqPos ext_interval = 10000;
192 
193     CRef<CSeq_loc> loc1 = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL);
194     CBioseq_Handle bsh = scope.GetBioseqHandle(sequence::GetId(loc, NULL));
195     TSeqPos seq_len = bsh.GetInst_Length();
196     loc1->SetInt().SetFrom() -= min(ext_interval, loc1->GetInt().GetFrom());
197     loc1->SetInt().SetTo() += min(ext_interval, seq_len - 1 - loc1->GetInt().GetTo());
198 
199     CSeqVector v(*loc1, scope, CBioseq_Handle::eCoding_Iupac);
200     string str1;
201     v.GetSeqData(v.begin(), v.end(), str1);
202 
203     CRef<CSeq_loc> container(new CSeq_loc(CSeq_loc::e_Mix));
204 
205     if (str1.size() >= seq.size()) {
206         for (size_t i = 0u; i < str1.size() - seq.size(); ++i) {
207             if (SeqsMatch(seq, &str1[i])) {
208                 CRef<CSeq_loc> repeat_unit_loc(new CSeq_loc);
209                 repeat_unit_loc->Assign(*loc1);
210 
211                 if (sequence::GetStrand(loc, NULL) == eNa_strand_minus) {
212                     repeat_unit_loc->SetInt().SetTo() -= i;
213                     repeat_unit_loc->SetInt().SetFrom(repeat_unit_loc->GetInt().GetTo() - (seq.size() - 1));
214                 } else {
215                     repeat_unit_loc->SetInt().SetFrom() += i;
216                     repeat_unit_loc->SetInt().SetTo(repeat_unit_loc->GetInt().GetFrom() + (seq.size() - 1));
217                 }
218                 container->SetMix().Set().push_back(repeat_unit_loc);
219             }
220         }
221     }
222 
223     CRef<CSeq_loc> merged_repeats = sequence::Seq_loc_Merge(*container, CSeq_loc::fSortAndMerge_All, NULL);
224     merged_repeats->ChangeToMix();
225     CRef<CSeq_loc> result(new CSeq_loc(CSeq_loc::e_Null));
226     result->Assign(loc);
227 
228     for(CSeq_loc_CI ci(*merged_repeats); ci; ++ci) {
229         const CSeq_loc& loc2 = ci.GetEmbeddingSeq_loc();
230         if(sequence::Compare(loc, loc2, NULL, sequence::fCompareOverlapping) != sequence::eNoOverlap) {
231             result->Add(loc2);
232         }
233     }
234 
235     return sequence::Seq_loc_Merge(*result, CSeq_loc::fSortAndMerge_All, NULL);
236 }
237 
238 
239 
240 
s_SetStartOffset(CVariantPlacement & p,const CHgvsParser::SFuzzyInt & fint)241 void CHgvsParser::s_SetStartOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
242 {
243     p.ResetStart_offset();
244     p.ResetStart_offset_fuzz();
245     if(fint.value || fint.fuzz) {
246         p.SetStart_offset(fint.value);
247     }
248 
249     if(fint.fuzz) {
250         p.SetStart_offset_fuzz().Assign(*fint.fuzz);
251     }
252 
253 #if 0
254     if(!fint.value
255        && fint.fuzz
256        && fint.fuzz->IsLim()
257        && (   fint.fuzz->GetLim() == CInt_fuzz::eLim_lt
258            || fint.fuzz->GetLim() == CInt_fuzz::eLim_tl))
259     {
260         // VAR-832
261         // interpret c.x-? as c.x-(?_1), not as c.x+(?_0)
262         p.SetStart_offset(-1);
263     }
264 #endif
265 
266 }
267 
s_SetStopOffset(CVariantPlacement & p,const CHgvsParser::SFuzzyInt & fint)268 void CHgvsParser::s_SetStopOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
269 {
270     p.ResetStop_offset();
271     p.ResetStop_offset_fuzz();
272     if(fint.value || fint.fuzz) {
273         p.SetStop_offset(fint.value);
274     }
275 
276     if(fint.fuzz) {
277         p.SetStop_offset_fuzz().Assign(*fint.fuzz);
278     }
279 
280 #if 0
281     if(!fint.value
282        && fint.fuzz
283        && fint.fuzz->IsLim()
284        && (   fint.fuzz->GetLim() == CInt_fuzz::eLim_gt
285            || fint.fuzz->GetLim() == CInt_fuzz::eLim_tr))
286     {
287         // VAR-832
288         // interpret c.x+? as c.x+(1_?), not as c.x+(0_?)
289         p.SetStop_offset(1);
290     }
291 #endif
292 }
293 
294 
295 
296 //if a variation has an asserted sequence, stored in placement.seq, repackage it as a set having
297 //the original variation and a synthetic one representing the asserted sequence. The placement.seq
298 //is cleared, as it is a placeholder for the actual reference sequence.
RepackageAssertedSequence(CVariation & vr)299 void RepackageAssertedSequence(CVariation& vr)
300 {
301     if(vr.IsSetPlacements() && SetFirstPlacement(vr).IsSetSeq()) {
302         CRef<CVariation> container(new CVariation);
303         container->SetPlacements() = vr.SetPlacements();
304 
305         CRef<CVariation> orig(new CVariation);
306         orig->Assign(vr);
307         orig->ResetPlacements(); //location will be set on the package, as it is the same for both members
308 
309         container->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_package);
310         container->SetData().SetSet().SetVariations().push_back(orig);
311 
312         CRef<CVariation> asserted_vr(new CVariation);
313         asserted_vr->SetData().SetInstance().SetObservation(CVariation_inst::eObservation_asserted);
314         asserted_vr->SetData().SetInstance().SetType(CVariation_inst::eType_identity);
315 
316         CRef<CDelta_item> delta(new CDelta_item);
317         delta->SetSeq().SetLiteral().Assign(SetFirstPlacement(vr).GetSeq());
318         asserted_vr->SetData().SetInstance().SetDelta().push_back(delta);
319 
320         SetFirstPlacement(*container).ResetSeq();
321         container->SetData().SetSet().SetVariations().push_back(asserted_vr);
322 
323         vr.Assign(*container);
324 
325     } else if(vr.GetData().IsSet()) {
326         NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, vr.SetData().SetSet().SetVariations()) {
327             RepackageAssertedSequence(**it);
328         }
329     }
330 }
331 
332 
333 //HGVS distinguished between c. n. and r. molecules, while in
334 //VariantPlacement we have moltype cdna (which maps onto c.) and rna (which maps onto n.)
335 //
336 //If we have an HGVS expression like NM_123456.7:r. the VariantPlacement will have moltype rna
337 //that would map-back onto NM_123456.7:n., which is what we don't want, as "n.' reserved
338 //for non-coding RNAs, so we'll convert rna moltype to cdna based on accession here.
339 //Note that this is a post-processing step, as in the midst of parsing we want to treat
340 //NM_123456.7:r. as non-coding rna, since these have absolute coordinates rather than cds-relative.
AdjustMoltype(CVariation & vr,CScope & scope)341 void AdjustMoltype(CVariation& vr, CScope& scope)
342 {
343     CVariationUtil util(scope);
344 
345     for(CTypeIterator<CVariantPlacement> it(Begin(vr)); it; ++it) {
346         CVariantPlacement& p = *it;
347 
348         if(p.IsSetMol()
349            && p.GetMol() == CVariantPlacement::eMol_rna
350            && p.GetLoc().GetId())
351         {
352             p.SetMol(util.GetMolType(*p.GetLoc().GetId()));
353         }
354     }
355 }
356 
357 
CContext(const CContext & other)358 CHgvsParser::CContext::CContext(const CContext& other)
359   : m_hgvs(other.m_hgvs)
360 {
361     this->m_bsh = other.m_bsh;
362     this->m_cds = other.m_cds;
363     this->m_scope = other.m_scope;
364     this->m_seq_id_resolvers = other.m_seq_id_resolvers;
365     this->m_placement.Reset();
366     if(other.m_placement) {
367         /*
368          * Note: need to make a copy of the placement, such that if preceding subvariation
369          * is location-specific and the other one isn't, first's placement is not
370          * given to the other one, e.g. "NM_004004.2:c.[35_36dup;40del]+[=]" - if we
371          * made a shallow copy, then the location context for "[=]" would be erroneously
372          * inherited from the last sibling: "NM_004004.2:c.40=" instead of whole "NM_004004.2:c.="
373          */
374          this->m_placement.Reset(new CVariantPlacement);
375          this->m_placement->Assign(*other.m_placement);
376     }
377 }
378 
GetCDS() const379 const CSeq_feat& CHgvsParser::CContext::GetCDS() const
380 {
381     if(m_cds.IsNull()) {
382         HGVS_THROW(eContext, "No CDS feature in context");
383     }
384     return *m_cds;
385 }
386 
GetId() const387 const CSeq_id& CHgvsParser::CContext::GetId() const
388 {
389     return sequence::GetId(GetPlacement().GetLoc(), NULL);
390 }
391 
392 
SetId(const CSeq_id & id,CVariantPlacement::TMol mol)393 void CHgvsParser::CContext::SetId(const CSeq_id& id, CVariantPlacement::TMol mol)
394 {
395     Clear();
396 
397     SetPlacement().SetMol(mol);
398     SetPlacement().SetLoc().SetWhole().Assign(id);
399 
400     m_bsh = m_scope->GetBioseqHandle(id);
401 
402     if(!m_bsh) {
403         HGVS_THROW(eContext, "Cannnot get bioseq for seq-id " + id.AsFastaString());
404     }
405 
406     if(mol == CVariantPlacement::eMol_cdna) {
407         SAnnotSelector sel;
408         sel.SetResolveTSE();
409         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
410         for(CFeat_CI ci(m_bsh, sel); ci; ++ci) {
411             const CMappedFeat& mf = *ci;
412             if(mf.GetData().IsCdregion()) {
413                 if(m_cds.IsNull()) {
414                     m_cds.Reset(new CSeq_feat());
415                     m_cds->Assign(mf.GetMappedFeature());
416                 } else {
417                     HGVS_THROW(eContext, "Multiple CDS features on the sequence");
418                 }
419             }
420         }
421         if(m_cds.IsNull()) {
422             HGVS_THROW(eContext, "Could not find CDS feat");
423         }
424     }
425 }
426 
427 
s_GetRuleName(parser_id id)428 const string CHgvsParser::SGrammar::s_GetRuleName(parser_id id)
429 {
430     if(id.to_long() >= CHgvsParser::SGrammar::eNodeIds_SIZE) {
431         HGVS_THROW(eLogic, "Rule name not hardcoded");
432     } else {
433         return s_rule_names[id.to_long()];
434     }
435 }
436 
437 
x_int_fuzz(TIterator const & i,const CContext & context)438 CHgvsParser::SFuzzyInt CHgvsParser::x_int_fuzz(TIterator const& i, const CContext& context)
439 {
440     HGVS_ASSERT_RULE(i, eID_int_fuzz);
441     TIterator it = i->children.begin();
442 
443     CHgvsParser::SFuzzyInt fint;
444     fint.fuzz.Reset(new CInt_fuzz);
445 
446     if(i->children.size() == 1) { //e.g. '5' or '?'
447         string s(it->value.begin(), it->value.end());
448         if(s == "?") {
449             fint.SetPureFuzz();
450         } else {
451             fint.value = NStr::StringToInt(s);
452             fint.fuzz.Reset();
453         }
454     } else if(i->children.size() == 3) { //e.g. '(5)' or '(?)'
455         ++it;
456         string s(it->value.begin(), it->value.end());
457         if(s == "?") {
458             fint.SetPureFuzz();
459         } else {
460             fint.value = NStr::StringToInt(s);
461             fint.fuzz->SetLim(CInt_fuzz::eLim_unk);
462         }
463     } else if(i->children.size() == 5) { //e.g. '(5_7)' or '(?_10)'
464         ++it;
465         string s1(it->value.begin(), it->value.end());
466         ++it;
467         ++it;
468         string s2(it->value.begin(), it->value.end());
469 
470         if(s1 == "?" && s2 == "?") {
471             fint.SetPureFuzz();
472         } else if(s1 != "?" && s2 != "?") {
473             fint.value = NStr::StringToInt(s1);
474             fint.fuzz->SetRange().SetMin(NStr::StringToInt(s1));
475             fint.fuzz->SetRange().SetMax(NStr::StringToInt(s2));
476         } else if(s2 == "?") {
477             fint.value = NStr::StringToInt(s1);
478             fint.fuzz->SetLim(CInt_fuzz::eLim_gt);
479         } else if(s1 == "?") {
480             fint.value = NStr::StringToInt(s2);
481             fint.fuzz->SetLim(CInt_fuzz::eLim_lt);
482         } else {
483             HGVS_THROW(eLogic, "Unreachable code");
484         }
485     }
486 
487     return fint;
488 }
489 
490 
491 
492 /* In HGVS:
493  * the nucleotide 3' of the translation stop codon is *1, the next *2, etc.
494  * # there is no nucleotide 0
495  * # nucleotide 1 is the A of the ATG-translation initiation codon
496  * # the nucleotide 5' of the ATG-translation initiation codon is -1, the previous -2, etc.
497  *
498  * I.e. need to adjust if dealing with positive coordinates, except for *-relative ones.
499  */
500 template<typename T>
AdjustHgvsCoord(T val,TSeqPos offset,bool adjust)501 T AdjustHgvsCoord(T val, TSeqPos offset, bool adjust)
502 {
503     val += offset;
504     if(adjust && val > (T)offset) {
505         // note: val may be unsigned, so check for (val+offset > offset)
506         // instead of (val > 0)
507         val -= 1;
508     }
509     return val;
510 }
511 
x_abs_pos(TIterator const & i,const CContext & context)512 CRef<CSeq_point> CHgvsParser::x_abs_pos(TIterator const& i, const CContext& context)
513 {
514     HGVS_ASSERT_RULE(i, eID_abs_pos);
515     TIterator it = i->children.begin();
516 
517     TSeqPos offset(0);
518     bool adjust = true; //see AdjustHgvsCoord comments above
519     if(i->children.size() == 2) {
520         adjust = false;
521         string s(it->value.begin(), it->value.end());
522         if(s != "*") {
523             HGVS_THROW(eGrammatic, "Expected literal '*'");
524         }
525         if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_cdna) {
526             HGVS_THROW(eContext, "Expected 'c.' context for stop-codon-relative coordinate");
527         }
528 
529         offset = context.GetCDS().GetLocation().GetStop(eExtreme_Biological);
530         ++it;
531     } else {
532         if (context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna) {
533             //Note: in RNA coordinates (r.) the coordinates are absolute, like in genomic sequences,
534             //  "The RNA sequence type uses only GenBank mRNA records. The value 1 is assigned to the first
535             //  base in the record and from there all bases are counted normally."
536             //so the cds-start offset applies only to "c." coordinates
537             offset = context.GetCDS().GetLocation().GetStart(eExtreme_Biological);
538         }
539     }
540 
541     CRef<CSeq_point> pnt(new CSeq_point);
542     {{
543         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
544 
545         pnt->SetId().Assign(context.GetId());
546         pnt->SetStrand(eNa_strand_plus);
547         if(int_fuzz.IsPureFuzz()) {
548             pnt->SetPoint(kInvalidSeqPos);
549         } else {
550             pnt->SetPoint(AdjustHgvsCoord(int_fuzz.value, offset, adjust));
551         }
552 
553         if(!int_fuzz.fuzz.IsNull()) {
554             pnt->SetFuzz(*int_fuzz.fuzz);
555             if(pnt->GetFuzz().IsRange()) {
556                 CInt_fuzz_Base::TRange& r = pnt->SetFuzz().SetRange();
557                 r.SetMin(AdjustHgvsCoord(r.GetMin(), offset, adjust));
558                 r.SetMax(AdjustHgvsCoord(r.GetMax(), offset, adjust));
559             }
560         }
561     }}
562 
563     return pnt;
564 }
565 
IsPureFuzzPoint(const CSeq_point & p)566 bool IsPureFuzzPoint(const CSeq_point& p)
567 {
568     return p.GetPoint() == kInvalidSeqPos
569         && p.IsSetFuzz()
570         && p.GetFuzz().IsLim()
571         && p.GetFuzz().GetLim() == CInt_fuzz::eLim_other;
572 }
573 
574 
575 /*
576  * general_pos is either simple abs-pos that is passed down to x_abs_pos,
577  * or an intronic location that is specified by a mapping point in the
578  * local coordinates and the -upstream / +downstream offset after remapping.
579  *
580  * The mapping point can either be an abs-pos in local coordinates, or
581  * specified as offset in intron-specific coordinate system where IVS# specifies
582  * the intron number
583  */
x_general_pos(TIterator const & i,const CContext & context)584 CHgvsParser::SOffsetPoint CHgvsParser::x_general_pos(TIterator const& i, const CContext& context)
585 {
586     HGVS_ASSERT_RULE(i, eID_general_pos);
587 
588     SOffsetPoint ofpnt;
589 
590     if(i->children.size() == 1) {
591         //local coordinates
592         ofpnt.pnt = x_abs_pos(i->children.begin(), context);
593     } else {
594         //(str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
595 
596         TIterator it = i->children.end() - 1;
597         ofpnt.offset = x_int_fuzz(it, context);
598         --it;
599 
600         //adjust for sign; convert +? or -? offsets to
601         //0> or 0<
602         string s_sign(it->value.begin(), it->value.end());
603         int sign1 = s_sign == "-" ? -1 : 1;
604         ofpnt.offset.value *= sign1;
605         if(ofpnt.offset.fuzz && ofpnt.offset.fuzz->IsRange()) {
606             ofpnt.offset.fuzz->SetRange().SetMin() *= sign1;
607             ofpnt.offset.fuzz->SetRange().SetMax() *= sign1;
608         } else if(ofpnt.offset.IsPureFuzz()) {
609             ofpnt.offset.fuzz->SetLim(sign1 < 0 ? CInt_fuzz::eLim_lt : CInt_fuzz::eLim_gt);
610         }
611 
612         --it;
613         if(it->value.id() == SGrammar::eID_abs_pos) {
614             //base-loc is an abs-pos
615             ofpnt.pnt = x_abs_pos(i->children.begin(), context);
616         } else {
617             //base-loc is IVS-relative.
618             ofpnt.pnt.Reset(new CSeq_point);
619             ofpnt.pnt->SetId().Assign(context.GetId());
620             ofpnt.pnt->SetStrand(eNa_strand_plus);
621 
622             TIterator it = i->children.begin();
623             string s_ivs(it->value.begin(), it->value.end());
624             ++it;
625             string s_ivs_num(it->value.begin(), it->value.end());
626             int ivs_num = NStr::StringToInt(s_ivs_num);
627 
628             //If IVS3+50, the mapping point is the last base of third exon
629             //if IVS3-50, the mapping point is the first base of the fourth exon
630             size_t target_exon_num = sign1 < 0 ? ivs_num + 1 : ivs_num;
631 
632             SAnnotSelector sel;
633             sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon);
634             CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(context.GetId());
635             size_t exon_num = 1;
636             //Note: IVS is cDNA-centric, so we'll have to use ordinals of the exons instead of /number qual
637             for(CFeat_CI ci(bsh, sel); ci; ++ci) {
638                 const CMappedFeat& mf = *ci;
639                 if(exon_num == target_exon_num) {
640                     ofpnt.pnt->SetPoint(sign1 > 0 ? mf.GetLocation().GetStop(eExtreme_Biological)
641                                                   : mf.GetLocation().GetStart(eExtreme_Biological));
642                     break;
643                 }
644                 exon_num++;
645             }
646         }
647     }
648 
649     //We could be dealing with improper HGVS expression (that we need to support anyway)
650     //where the coordinate extends beyond the range of sequence
651     //e.g. NM_000518:c.-78A>G, where codon-start is at 51. In this case the resulting
652     //coordinate will be negative; we'll convert it to offset-format (27 bases upstream of pos 0)
653 
654     if(!IsPureFuzzPoint(*ofpnt.pnt)
655        && (   context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna
656            || context.GetPlacement().GetMol() == CVariantPlacement::eMol_rna))
657     {
658         if(static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint()) < 0) {
659             ofpnt.offset.value += static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint());
660             ofpnt.pnt->SetPoint(0);
661         } else {
662             CVariationUtil vu(context.GetScope());
663             TSeqPos transcribed_len = vu.GetEffectiveTranscriptLength(context.GetBioseqHandle());
664             // Note: positions past the last exon are interpreted as specifying near-gene/intronic
665             // target rather than polyA. JIRA:SNP-7341
666 
667             if(ofpnt.pnt->GetPoint() >= transcribed_len) {
668                 TSeqPos anchor_pos = transcribed_len - 1;
669                 TSeqPos overrun = ofpnt.pnt->GetPoint() - anchor_pos;
670                 ofpnt.offset.value += overrun;
671                 ofpnt.pnt->SetPoint(anchor_pos);
672             }
673         }
674     }
675 
676     return ofpnt;
677 }
678 
679 
x_fuzzy_pos(TIterator const & i,const CContext & context)680 CHgvsParser::SOffsetPoint CHgvsParser::x_fuzzy_pos(TIterator const& i, const CContext& context)
681 {
682     HGVS_ASSERT_RULE(i, eID_fuzzy_pos);
683 
684     SOffsetPoint pnt1 = x_general_pos(i->children.begin(), context);
685     SOffsetPoint pnt2 = x_general_pos(i->children.begin() + 1, context);
686 
687     //Verify that on the same seq-id.
688     if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
689         HGVS_THROW(eSemantic, "Points in a fuzzy pos are on different sequences");
690     }
691     if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
692         HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
693     }
694 
695     if(IsPureFuzzPoint(*pnt1.pnt) || IsPureFuzzPoint(*pnt2.pnt)) {
696         if(IsPureFuzzPoint(*pnt2.pnt)) {
697             pnt1.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tr);
698             return pnt1;
699         } else {
700             pnt2.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tl);
701             return pnt2;
702         }
703     }
704 
705     if((pnt1.offset.value != 0 || pnt2.offset.value != 0) && !pnt1.pnt->Equals(*pnt2.pnt)) {
706         HGVS_THROW(eSemantic, "Base-points in an intronic fuzzy position must be equal");
707     }
708 
709     SOffsetPoint pnt = pnt1;
710     if(pnt1.offset.value != pnt2.offset.value) {
711         pnt.offset.fuzz.Reset(new CInt_fuzz);
712         pnt.offset.fuzz->SetRange().SetMin(pnt1.offset.value);
713         pnt.offset.fuzz->SetRange().SetMax(pnt2.offset.value);
714     }
715 
716     return pnt;
717 
718 #if 0
719     todo: reconcile
720     //If Both are Empty - the result is empty, otherwise reconciliate
721     if(pnt1.pnt->GetPoint() == kInvalidSeqPos && pnt2.pnt->GetPoint() == kInvalidSeqPos) {
722         pnt.pnt = pnt1.pnt;
723         pnt.offset = pnt1.offset;
724     } else {
725         pnt.pnt.Reset(new CSeq_point);
726         pnt.pnt.Assign(*pnt1.pnt);
727 
728         TSeqPos min_pos = min(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
729         TSeqPos max_pos = max(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
730 
731         if(!pnt1->IsSetFuzz() && !pnt2->IsSetFuzz()) {
732             //Both are non-fuzzy - create the min-max fuzz.
733             //(10+50_10+60)
734             pnt->SetFuzz().SetRange().SetMin(min_pos);
735             pnt->SetFuzz().SetRange().SetMax(max_pos);
736 
737         } else if(pnt1->IsSetFuzz() && pnt2->IsSetFuzz()) {
738             //Both are fuzzy - reconcile the fuzz.
739 
740             if(pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
741             && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
742             {
743                 //fuzz points inwards - create min-max fuzz
744                 //(10+?_11-?)
745                 pnt->SetFuzz().SetRange().SetMin(min_pos);
746                 pnt->SetFuzz().SetRange().SetMax(max_pos);
747 
748             } else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
749                     && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
750             {
751                 //fuzz points outwards - set fuzz to unk
752                 //(10-?_10+?)
753                 //(?_10+?)
754                 //(10-?_?)
755                 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
756 
757             }  else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
758                      && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
759             {
760                 //fuzz is to the left - use 5'-most
761                 //(?_10-?)
762                 //(10-?_11-?)
763                 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? max_pos : min_pos);
764 
765             }  else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
766                      && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
767             {
768                 //fuzz is to the right - use 3'-most
769                 //(10+?_?)
770                 //(10+?_11+?)
771                 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? min_pos : max_pos);
772 
773             } else {
774                 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
775             }
776         } else {
777             // One of the two is non-fuzzy:
778             // use it to specify position, and the fuzz of the other to specify the fuzz
779             // e.g.  (10+5_10+?)  -> loc1=100005; loc2=100000tr  -> 100005tr
780 
781             pnt->Assign(pnt1->IsSetFuzz() ? *pnt2 : *pnt1);
782             pnt->SetFuzz().Assign(pnt1->IsSetFuzz() ? pnt1->GetFuzz()
783                                                     : pnt2->GetFuzz());
784 
785         }
786     }
787 #endif
788 
789 
790 
791 }
792 
GetUniquePrimaryTranscriptId(CBioseq_Handle & bsh)793 CSeq_id_Handle GetUniquePrimaryTranscriptId(CBioseq_Handle& bsh)
794 {
795     set<CSeq_id_Handle> annotated_transcript_feats;
796     set<CSeq_id_Handle> annotated_transcript_aligns;
797 
798     SAnnotSelector sel;
799     sel.SetResolveTSE();
800     sel.IncludeFeatType(CSeqFeatData::e_Rna);
801     for(CFeat_CI ci(bsh, sel); ci; ++ci) {
802         const CMappedFeat& mf = *ci;
803         if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
804             annotated_transcript_feats.insert(
805                     CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId()));
806         }
807     }
808 
809     for(CAlign_CI ci(bsh, sel); ci; ++ci) {
810         const CSeq_align& aln = *ci;
811         if(aln.GetSegs().IsSpliced()) {
812             annotated_transcript_aligns.insert(
813                     CSeq_id_Handle::GetHandle(aln.GetSeq_id(0)));
814         }
815     }
816 
817     vector<CSeq_id_Handle> v;
818     set_intersection(annotated_transcript_feats.begin(),
819                      annotated_transcript_feats.end(),
820                      annotated_transcript_aligns.begin(),
821                      annotated_transcript_aligns.end(),
822                      back_inserter(v));
823 
824     return v.size() != 1 ? CSeq_id_Handle()
825         : sequence::GetId(v.front(),
826                           bsh.GetScope(),
827                           sequence::eGetId_ForceAcc);
828 }
829 
IsLRG(CBioseq_Handle & bsh)830 bool IsLRG(CBioseq_Handle& bsh)
831 {
832     ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
833         const CSeq_id& id = *it->GetSeqId();
834         if(   id.IsGeneral()
835            && id.GetGeneral().GetDb() == "LRG")
836         {
837             return true;
838         }
839     }
840     return false;
841 }
842 
843 
x_header(TIterator const & i,const CContext & context)844 CHgvsParser::CContext CHgvsParser::x_header(TIterator const& i, const CContext& context)
845 {
846     HGVS_ASSERT_RULE(i, eID_header);
847 
848     CContext ctx(context);
849 
850     TIterator it = i->children.rbegin()->children.begin();
851     string mol(it->value.begin(), it->value.end());
852     CVariantPlacement::TMol mol_type =
853                        mol == "c" ? CVariantPlacement::eMol_cdna
854                      : mol == "g" ? CVariantPlacement::eMol_genomic
855                      : mol == "r" ? CVariantPlacement::eMol_rna
856                      : mol == "n" ? CVariantPlacement::eMol_rna
857                      : mol == "p" ? CVariantPlacement::eMol_protein
858                      : mol == "m" ? CVariantPlacement::eMol_mitochondrion
859                      : mol == "mt" ? CVariantPlacement::eMol_mitochondrion
860                      : CVariantPlacement::eMol_unknown;
861 
862     it  = (i->children.rbegin() + 1)->children.begin();
863     string id_str(it->value.begin(), it->value.end());
864 
865     CSeq_id_Handle idh = context.ResolevSeqId(id_str);
866     CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(idh);
867     if(!bsh) {
868         HGVS_THROW(eSemantic, "Could not resolve seq-id-str='" + id_str + "'; idh=" + idh.AsString());
869     }
870 
871 
872     if(bsh.IsNucleotide()
873        && mol_type == CVariantPlacement::eMol_protein
874        && NStr::Find(id_str, "CCDS") == 0)
875     {
876         //If we have something like CCDS2.1:p., the CCDS2.1 will resolve
877         //to an NM, but we need to resolve it to corresponding NP.
878         //
879         //We could do this for all seq-ids, as long as there's unique CDS,
880         //but as per SNP-4536 the seq-id and moltype correspondence must be enforced,
881         //so we do it for CCDS only
882 
883         SAnnotSelector sel;
884         sel.SetResolveTSE();
885         sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
886         bool already_found = false;
887         for(CFeat_CI ci(bsh, sel); ci; ++ci) {
888             const CMappedFeat& mf = *ci;
889             if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
890                 if(already_found) {
891                     HGVS_THROW(eSemantic, "Can't resolve to prot - multiple CDSes on " + idh.AsString());
892                 } else {
893                     idh = sequence::GetId(
894                             *mf.GetProduct().GetId(),
895                             context.GetScope(),
896                             sequence::eGetId_ForceAcc);
897                     already_found = true;
898                 }
899             }
900         }
901         if(!already_found) {
902             HGVS_THROW(eSemantic, "Can't resolve to prot - can't find CDS on " + idh.AsString());
903         }
904     } else if(   (   mol_type == CVariantPlacement::eMol_cdna
905                   || mol_type == CVariantPlacement::eMol_rna)
906               && idh.IdentifyAccession() == CSeq_id::eAcc_refseq_genomic)  //e.g. NG_009822.1:c.1437+1G>A
907     {
908         //VAR-861
909         if(!IsLRG(bsh)) {
910             HGVS_THROW(eSemantic, "Specifying c. expression in NG coordinates is only supported for LRG subset where NM/NG associations are stable");
911         }
912 
913         CSeq_id_Handle idh2 = GetUniquePrimaryTranscriptId(bsh);
914         if(!idh2) {
915             HGVS_THROW(eSemantic, "Can't resolve to a unique transcript on NG: " + idh.AsString());
916         } else {
917             idh = idh2;
918         }
919     }
920 
921     ctx.SetId(*idh.GetSeqId(), mol_type);
922 
923     if(i->children.size() == 3) {
924         it  = (i->children.rbegin() + 2)->children.begin();
925         string tag_str(it->value.begin(), it->value.end());
926         //record tag in context, if it is necessary in the future
927     }
928 
929     return ctx;
930 }
931 
932 
x_pos_spec(TIterator const & i,const CContext & context)933 CHgvsParser::SOffsetPoint CHgvsParser::x_pos_spec(TIterator const& i, const CContext& context)
934 {
935     HGVS_ASSERT_RULE(i, eID_pos_spec);
936 
937     SOffsetPoint pnt;
938     TIterator it = i->children.begin();
939     if(it->value.id() == SGrammar::eID_general_pos) {
940         pnt = x_general_pos(it, context);
941     } else if(it->value.id() == SGrammar::eID_fuzzy_pos) {
942         pnt = x_fuzzy_pos(it, context);
943     } else {
944         bool flip_strand = false;
945         if(i->children.size() == 3) {
946             //first child is 'o' - opposite
947             flip_strand = true;
948             ++it;
949         }
950 
951         CContext local_ctx = x_header(it, context);
952         ++it;
953         pnt = x_pos_spec(it, local_ctx);
954 
955         if(flip_strand) {
956             pnt.pnt->FlipStrand();
957         }
958     }
959 
960     return pnt;
961 }
962 
963 
x_prot_pos(TIterator const & i,const CContext & context)964 CHgvsParser::SOffsetPoint CHgvsParser::x_prot_pos(TIterator const& i, const CContext& context)
965 {
966     HGVS_ASSERT_RULE(i, eID_prot_pos);
967     TIterator it = i->children.begin();
968 
969     CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
970 
971     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
972         HGVS_THROW(eSemantic, "Expected protein context");
973     }
974 
975     if(prot_literal->GetLength() != 1) {
976         HGVS_THROW(eSemantic, "Expected single aa literal in prot-pos");
977     }
978 
979     ++it;
980     SOffsetPoint pnt = x_pos_spec(it, context);
981 
982     pnt.asserted_sequence = prot_literal->GetSeq_data().GetNcbieaa();
983 
984     return pnt;
985 }
986 
987 
x_range(TIterator const & i,const CContext & context)988 CRef<CVariantPlacement> CHgvsParser::x_range(TIterator const& i, const CContext& context)
989 {
990     SOffsetPoint pnt1, pnt2;
991 
992     CRef<CVariantPlacement> p(new CVariantPlacement);
993     p->Assign(context.GetPlacement());
994 
995     if(i->value.id() == SGrammar::eID_prot_range) {
996         pnt1 = x_prot_pos(i->children.begin(), context);
997         pnt2 = x_prot_pos(i->children.begin() + 1, context);
998     } else if(i->value.id() == SGrammar::eID_nuc_range) {
999         pnt1 = x_pos_spec(i->children.begin(), context);
1000         pnt2 = x_pos_spec(i->children.begin() + 1, context);
1001     } else {
1002         HGVS_ASSERT_RULE(i, eID_NONE);
1003     }
1004 
1005     if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
1006         HGVS_THROW(eSemantic, "Range-loc start/stop are on different seq-ids.");
1007     }
1008     if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
1009         HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
1010     }
1011 
1012     p->SetLoc().SetInt().SetId(pnt1.pnt->SetId());
1013     p->SetLoc().SetInt().SetFrom(pnt1.pnt->GetPoint());
1014     p->SetLoc().SetInt().SetTo(pnt2.pnt->GetPoint());
1015     p->SetLoc().SetInt().SetStrand(pnt1.pnt->GetStrand());
1016     if(pnt1.pnt->IsSetFuzz()) {
1017         p->SetLoc().SetInt().SetFuzz_from(pnt1.pnt->SetFuzz());
1018     }
1019 
1020     if(pnt2.pnt->IsSetFuzz()) {
1021         p->SetLoc().SetInt().SetFuzz_to(pnt2.pnt->SetFuzz());
1022     }
1023 
1024     s_SetStartOffset(*p, pnt1.offset);
1025     s_SetStopOffset(*p, pnt2.offset);
1026 
1027     if(pnt1.asserted_sequence != "" || pnt2.asserted_sequence != "") {
1028         //for proteins, the asserted sequence is specified as part of location, rather than variation
1029         p->SetSeq().SetLength(CVariationUtil::s_GetLength(*p, NULL));
1030         string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
1031                 ? p->SetSeq().SetSeq_data().SetNcbieaa().Set()
1032                 : p->SetSeq().SetSeq_data().SetIupacna().Set();
1033         seq_str = pnt1.asserted_sequence + ".." + pnt2.asserted_sequence;
1034     }
1035 
1036     return p;
1037 }
1038 
x_location(TIterator const & i,const CContext & context)1039 CRef<CVariantPlacement> CHgvsParser::x_location(TIterator const& i, const CContext& context)
1040 {
1041     HGVS_ASSERT_RULE(i, eID_location);
1042 
1043     CRef<CVariantPlacement> placement(new CVariantPlacement);
1044     placement->Assign(context.GetPlacement());
1045 
1046     TIterator it = i->children.begin();
1047     CRef<CSeq_loc> loc(new CSeq_loc);
1048     if(it->value.id() == SGrammar::eID_prot_pos || it->value.id() == SGrammar::eID_pos_spec) {
1049         SOffsetPoint pnt = it->value.id() == SGrammar::eID_prot_pos
1050                 ? x_prot_pos(it, context)
1051                 : x_pos_spec(it, context);
1052         placement->SetLoc().SetPnt(*pnt.pnt);
1053         s_SetStartOffset(*placement, pnt.offset);
1054         if(pnt.asserted_sequence != "") {
1055             placement->SetSeq().SetLength(CVariationUtil::s_GetLength(*placement, NULL));
1056             string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
1057                     ? placement->SetSeq().SetSeq_data().SetNcbieaa().Set()
1058                     : placement->SetSeq().SetSeq_data().SetIupacna().Set();
1059             seq_str = pnt.asserted_sequence;
1060         }
1061 
1062         //todo point with pos=0 and fuzz=unk -> unknown pos ->set loc to empty
1063 
1064     } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
1065         placement = x_range(it, context);
1066     } else {
1067         HGVS_ASSERT_RULE(it, eID_NONE);
1068     }
1069 
1070     if(placement->GetLoc().IsPnt() && placement->GetLoc().GetPnt().GetPoint() == kInvalidSeqPos) {
1071         placement->SetLoc().SetEmpty().Assign(context.GetId());
1072     }
1073 
1074     CVariationUtil util(context.GetScope());
1075     if(CVariationUtil::eFail == util.CheckExonBoundary(*placement)) {
1076         CRef<CVariationException> exception(new CVariationException);
1077         exception->SetCode(CVariationException::eCode_hgvs_exon_boundary);
1078         exception->SetMessage("HGVS exon-boundary position not represented in the transcript annotation");
1079         placement->SetExceptions().push_back(exception);
1080     }
1081     util.CheckPlacement(*placement);
1082 
1083     return placement;
1084 }
1085 
1086 
x_seq_loc(TIterator const & i,const CContext & context)1087 CRef<CSeq_loc> CHgvsParser::x_seq_loc(TIterator const& i, const CContext& context)
1088 {
1089     HGVS_ASSERT_RULE(i, eID_seq_loc);
1090     TIterator it = i->children.begin();
1091 
1092     bool flip_strand = false;
1093     if(i->children.size() == 3) {
1094         //first child is 'o' - opposite
1095         flip_strand = true;
1096         ++it;
1097     }
1098 
1099     CContext local_context = x_header(it, context);
1100     ++it;
1101     CRef<CVariantPlacement> p = x_location(it, local_context);
1102 
1103     if(flip_strand) {
1104         p->SetLoc().FlipStrand();
1105     }
1106 
1107     if(p->IsSetStop_offset() || p->IsSetStart_offset()) {
1108         HGVS_THROW(eSemantic, "Intronic seq-locs are not supported in this context");
1109     }
1110 
1111     CRef<CSeq_loc> loc(new CSeq_loc);
1112     loc->Assign(p->GetLoc());
1113     return loc;
1114 }
1115 
x_raw_seq_or_len(TIterator const & i,const CContext & context)1116 CRef<CSeq_literal> CHgvsParser::x_raw_seq_or_len(TIterator const& i, const CContext& context)
1117 {
1118     HGVS_ASSERT_RULE(i, eID_raw_seq_or_len);
1119 
1120     CRef<CSeq_literal> literal;
1121     TIterator it = i->children.begin();
1122 
1123     if(it == i->children.end()) {
1124         HGVS_THROW(eLogic, "Unexpected parse-tree state when parsing " + context.GetHgvs());
1125     }
1126 
1127     if(it->value.id() == SGrammar::eID_raw_seq) {
1128         literal = x_raw_seq(it, context);
1129     } else if(it->value.id() == SGrammar::eID_int_fuzz) {
1130         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
1131         literal.Reset(new CSeq_literal);
1132         literal->SetLength(int_fuzz.value);
1133         if(int_fuzz.fuzz.IsNull()) {
1134             ;//no-fuzz;
1135         } else if(int_fuzz.IsPureFuzz()) {
1136             //unknown length (no value) - will represent as length=0 with gt fuzz
1137             literal->SetFuzz().SetLim(CInt_fuzz::eLim_gt);
1138         } else {
1139             literal->SetFuzz(*int_fuzz.fuzz);
1140         }
1141     } else {
1142         HGVS_ASSERT_RULE(it, eID_NONE);
1143     }
1144     return literal;
1145 }
1146 
x_seq_ref(TIterator const & i,const CContext & context)1147 CHgvsParser::TDelta CHgvsParser::x_seq_ref(TIterator const& i, const CContext& context)
1148 {
1149     HGVS_ASSERT_RULE(i, eID_seq_ref);
1150     CHgvsParser::TDelta delta(new TDelta::TObjectType);
1151     TIterator it = i->children.begin();
1152 
1153     if(it->value.id() == SGrammar::eID_seq_loc) {
1154         CRef<CSeq_loc> loc = x_seq_loc(it, context);
1155         delta->SetSeq().SetLoc(*loc);
1156     } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
1157         CRef<CVariantPlacement> p = x_range(it, context);
1158         if(p->IsSetStart_offset() || p->IsSetStop_offset()) {
1159             HGVS_THROW(eSemantic, "Intronic loc is not supported in this context");
1160         }
1161         delta->SetSeq().SetLoc().Assign(p->GetLoc());
1162     } else if(it->value.id() == SGrammar::eID_raw_seq_or_len) {
1163         CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
1164         delta->SetSeq().SetLiteral(*literal);
1165     } else {
1166         HGVS_ASSERT_RULE(it, eID_NONE);
1167     }
1168 
1169     return delta;
1170 }
1171 
1172 
s_hgvsaa2ncbieaa(const string & hgvsaa,string & out)1173 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, string& out)
1174 {
1175     //try to interpret sequence that was matched by either of aminoacid1, aminoacid, or aminoacid3
1176     string tmp_out(""); //so that the caller may pass same variable for in and out
1177     bool ret = s_hgvsaa2ncbieaa(hgvsaa, true, tmp_out);
1178     if(!ret) {
1179         ret = s_hgvsaa2ncbieaa(hgvsaa, false, tmp_out);
1180     }
1181     if(!ret) {
1182         ret = s_hgvs_iupacaa2ncbieaa(hgvsaa, tmp_out);
1183     }
1184 
1185     out = tmp_out;
1186     return ret;
1187 }
1188 
s_hgvs_iupacaa2ncbieaa(const string & hgvsaa,string & out)1189 bool CHgvsParser::s_hgvs_iupacaa2ncbieaa(const string& hgvsaa, string& out)
1190 {
1191     out = hgvsaa;
1192 
1193     //"X" used to mean "Ter" in HGVS; now it means "unknown aminoacid"
1194     //Still, we'll interpret it as Ter, simply beacuse it is more likely
1195     //that the submitter is using legacy representation.
1196     NStr::ReplaceInPlace(out, "X", "*");
1197     NStr::ReplaceInPlace(out, "?", "X");
1198     return true;
1199 }
1200 
s_hgvsaa2ncbieaa(const string & hgvsaa,bool uplow,string & out)1201 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, bool uplow, string& out)
1202 {
1203     string in = hgvsaa;
1204     out = "";
1205     while(in != "") {
1206         bool found = false;
1207         for(size_t i_ncbistdaa = 0; i_ncbistdaa < 28; i_ncbistdaa++) {
1208             string iupac3 = CSeqportUtil::GetIupacaa3(i_ncbistdaa);
1209             if(NStr::StartsWith(in, uplow ? iupac3 : NStr::ToUpper(iupac3))) {
1210                 size_t i_ncbieaa = CSeqportUtil::GetMapToIndex(CSeq_data::e_Ncbistdaa,
1211                                                                CSeq_data::e_Ncbieaa,
1212                                                                i_ncbistdaa);
1213                 out += CSeqportUtil::GetCode(CSeq_data::e_Ncbieaa, i_ncbieaa);
1214                 found = true;
1215                 break;
1216             }
1217         }
1218         if(found) {
1219             in = in.substr(3);
1220         } else if(NStr::StartsWith(in, "*")) { out.push_back('*'); in = in.substr(1);
1221         } else if(NStr::StartsWith(in, "X")) { out.push_back('*'); in = in.substr(1);
1222         } else if(NStr::StartsWith(in, "?")) { out.push_back('X'); in = in.substr(1);
1223         //} else if(NStr::StartsWith(in, "STOP", NStr::eNocase)) { out.push_back('X'); in = in.substr(4); //VAR-283
1224         } else {
1225             out = hgvsaa;
1226             return false;
1227         }
1228     }
1229     return true;
1230 }
1231 
1232 
1233 
x_raw_seq(TIterator const & i,const CContext & context)1234 CRef<CSeq_literal> CHgvsParser::x_raw_seq(TIterator const& i, const CContext& context)
1235 {
1236     HGVS_ASSERT_RULE(i, eID_raw_seq);
1237     TIterator it = i->children.begin();
1238 
1239     string seq_str(it->value.begin(), it->value.end());
1240 
1241     CRef<CSeq_literal>literal(new CSeq_literal);
1242     if(context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein) {
1243         s_hgvsaa2ncbieaa(seq_str, seq_str);
1244         literal->SetSeq_data().SetNcbieaa().Set(seq_str);
1245     } else {
1246         seq_str = NStr::ToUpper(seq_str);
1247         NStr::ReplaceInPlace(seq_str, "U", "T");
1248         literal->SetSeq_data().SetIupacna().Set(seq_str);
1249     }
1250 
1251     literal->SetLength(seq_str.size());
1252 
1253     vector<TSeqPos> bad;
1254     CSeqportUtil::Validate(literal->GetSeq_data(), &bad);
1255 
1256     if(bad.size() > 0) {
1257         HGVS_THROW(eSemantic, "Invalid sequence at pos " +  NStr::IntToString(bad[0]) + " in " + seq_str);
1258     }
1259 
1260     return literal;
1261 }
1262 
1263 
GetDeltaLength(const CDelta_item & delta,int loc_len)1264 int GetDeltaLength(const CDelta_item& delta, int loc_len)
1265 {
1266     int len = !delta.IsSetSeq()          ? 0
1267             : delta.GetSeq().IsLiteral() ? delta.GetSeq().GetLiteral().GetLength()
1268             : delta.GetSeq().IsLoc()     ? sequence::GetLength(delta.GetSeq().GetLoc(), NULL)
1269             : delta.GetSeq().IsThis()    ? loc_len
1270             : 0;
1271    if(delta.IsSetMultiplier()) {
1272         len *= delta.GetMultiplier();
1273    }
1274    return len;
1275 }
1276 
GetDelInsSubtype(int del_len,int ins_len)1277 CVariation_inst::EType GetDelInsSubtype(int del_len, int ins_len)
1278 {
1279     return del_len > 0 && ins_len == 0        ? CVariation_inst::eType_del
1280          : del_len == 0 && ins_len > 0        ? CVariation_inst::eType_ins
1281          : del_len == ins_len && del_len != 1 ? CVariation_inst::eType_mnp
1282          : del_len == ins_len && del_len == 1 ? CVariation_inst::eType_snv
1283          :                                      CVariation_inst::eType_delins;
1284 }
1285 
x_delins(TIterator const & i,const CContext & context)1286 CRef<CVariation> CHgvsParser::x_delins(TIterator const& i, const CContext& context)
1287 {
1288     HGVS_ASSERT_RULE(i, eID_delins);
1289     TIterator it = i->children.begin();
1290     CRef<CVariation> del_vr = x_deletion(it, context);
1291     ++it;
1292     CRef<CVariation> ins_vr = x_insertion(it, context, false);
1293         //note: don't verify location, as it must be len=2 for pure-insertion only
1294 
1295     //The resulting delins variation has deletion's placement (with asserted seq, if any),
1296     //and insertion's inst, except action type is "replace" (default) rather than "ins-before",
1297     //so we reset action
1298 
1299     int placement_len = CVariationUtil::s_GetLength(SetFirstPlacement(*del_vr), NULL);
1300     int del_len = GetDeltaLength(*del_vr->GetData().GetInstance().GetDelta().front(), placement_len);
1301     int ins_len = GetDeltaLength(*ins_vr->GetData().GetInstance().GetDelta().front(), placement_len);
1302     del_vr->SetData().SetInstance().SetType(GetDelInsSubtype(del_len, ins_len));
1303 
1304     del_vr->SetData().SetInstance().SetDelta() = ins_vr->SetData().SetInstance().SetDelta();
1305     del_vr->SetData().SetInstance().SetDelta().front()->ResetAction();
1306 
1307     if(ins_len == 1 && del_len == 1) {
1308         CRef<CVariationException> ex(new CVariationException);
1309         ex->SetCode(CVariationException::eCode_hgvs_parsing);
1310         ex->SetMessage("delins used for single-nt substitution");
1311         SetFirstPlacement(*del_vr).SetExceptions().push_back(ex);
1312     }
1313 
1314     return del_vr;
1315 }
1316 
x_deletion(TIterator const & i,const CContext & context)1317 CRef<CVariation> CHgvsParser::x_deletion(TIterator const& i, const CContext& context)
1318 {
1319     HGVS_ASSERT_RULE(i, eID_deletion);
1320     TIterator it = i->children.begin();
1321     CRef<CVariation> vr(new CVariation);
1322     CVariation_inst& var_inst = vr->SetData().SetInstance();
1323 
1324     var_inst.SetType(CVariation_inst::eType_del);
1325     CVariantPlacement& p = SetFirstPlacement(*vr);
1326     p.Assign(context.GetPlacement());
1327 
1328     CRef<CDelta_item> di(new CDelta_item);
1329     di->SetAction(CDelta_item::eAction_del_at);
1330     di->SetSeq().SetThis();
1331     var_inst.SetDelta().push_back(di);
1332 
1333     ++it;
1334 
1335     if(it != i->children.end() && it->value.id() == SGrammar::eID_raw_seq_or_len) {
1336         CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
1337         ++it;
1338         SetFirstPlacement(*vr).SetSeq(*literal);
1339 
1340         if(literal->GetLength() != CVariationUtil::s_GetLength(p, NULL)) {
1341             CRef<CVariationException> ex(new CVariationException);
1342             ex->SetCode(CVariationException::eCode_hgvs_parsing);
1343             ex->SetMessage("Sequence length is inconsistent with location length");
1344             p.SetExceptions().push_back(ex);
1345         }
1346     }
1347 
1348     var_inst.SetDelta();
1349     return vr;
1350 }
1351 
1352 
x_insertion(TIterator const & i,const CContext & context,bool check_loc)1353 CRef<CVariation> CHgvsParser::x_insertion(TIterator const& i, const CContext& context, bool check_loc)
1354 {
1355     HGVS_ASSERT_RULE(i, eID_insertion);
1356     TIterator it = i->children.begin();
1357     ++it; //skip ins
1358     CRef<CVariation> vr(new CVariation);
1359     CVariation_inst& var_inst = vr->SetData().SetInstance();
1360 
1361     var_inst.SetType(CVariation_inst::eType_ins);
1362 
1363     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1364 
1365     if(check_loc && CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != 2) {
1366         HGVS_THROW(eSemantic, "Location must be a dinucleotide");
1367     }
1368 
1369     TDelta delta_ins = x_seq_ref(it, context);
1370 
1371     //todo:
1372     //alternative representation: if delta is literal, might use action=morph and prefix/suffix the insertion with the flanking nucleotides.
1373     delta_ins->SetAction(CDelta_item::eAction_ins_before);
1374 
1375     var_inst.SetDelta().push_back(delta_ins);
1376 
1377     return vr;
1378 }
1379 
1380 
x_duplication(TIterator const & i,const CContext & context)1381 CRef<CVariation> CHgvsParser::x_duplication(TIterator const& i, const CContext& context)
1382 {
1383     HGVS_ASSERT_RULE(i, eID_duplication);
1384     TIterator it = i->children.begin();
1385     CRef<CVariation> vr(new CVariation);
1386     CVariation_inst& var_inst = vr->SetData().SetInstance();
1387     var_inst.SetType(CVariation_inst::eType_ins); //replace seq @ location with this*2
1388 
1389     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1390 
1391     TDelta delta(new TDelta::TObjectType);
1392     delta->SetSeq().SetThis(); //delta->SetSeq().SetLoc(vr->SetLocation());
1393     delta->SetMultiplier(2);
1394     var_inst.SetDelta().push_back(delta);
1395 
1396     ++it; //skip dup
1397 
1398     //the next node is either expected length or expected sequence
1399     if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
1400         TDelta dup_seq = x_seq_ref(it, context);
1401         if(dup_seq->GetSeq().IsLiteral()) {
1402             SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
1403 
1404             if(CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != dup_seq->GetSeq().GetLiteral().GetLength()) {
1405                 HGVS_THROW(eSemantic, "Location length and asserted sequence length differ");
1406             }
1407         }
1408     }
1409 
1410     return vr;
1411 }
1412 
1413 
x_no_change(TIterator const & i,const CContext & context)1414 CRef<CVariation> CHgvsParser::x_no_change(TIterator const& i, const CContext& context)
1415 {
1416     HGVS_ASSERT_RULE(i, eID_no_change);
1417     TIterator it = i->children.begin();
1418     CRef<CVariation> vr(new CVariation);
1419     CVariation_inst& var_inst = vr->SetData().SetInstance();
1420 
1421     CVariantPlacement& p = SetFirstPlacement(*vr);
1422     p.Assign(context.GetPlacement());
1423 
1424     //VAR-574: The no-change variation is interpreted as X>X
1425 
1426     if(it->value.id() == SGrammar::eID_raw_seq) {
1427         CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
1428         p.SetSeq(*seq_from);
1429         ++it;
1430     } else if(p.GetLoc().IsWhole()) {
1431         // will fall-back 'identity' inst-type instead of whole-seq mnp
1432     } else {
1433         CVariationUtil util(context.GetScope());
1434         util.AttachSeq(p);
1435         p.ResetExceptions();
1436             // if could not attach seq (e.g. too-large or intronic context)
1437             // will fall-back on 'identity' inst-type below, so ignoring
1438             // the exceptions.
1439     }
1440 
1441     var_inst.SetType(
1442             p.GetMol() == CVariantPlacement::eMol_protein ? CVariation_inst::eType_prot_silent
1443           : !p.IsSetSeq()                                 ? CVariation_inst::eType_identity
1444           : p.GetSeq().GetLength() == 1                   ? CVariation_inst::eType_snv
1445           :                                                 CVariation_inst::eType_mnp);
1446 
1447     TDelta delta(new TDelta::TObjectType);
1448     if(p.IsSetSeq()) {
1449         delta->SetSeq().SetLiteral().Assign(p.GetSeq());
1450     } else {
1451         delta->SetSeq().SetThis();
1452     }
1453     var_inst.SetDelta().push_back(delta);
1454 
1455     return vr;
1456 }
1457 
1458 
x_nuc_subst(TIterator const & i,const CContext & context)1459 CRef<CVariation> CHgvsParser::x_nuc_subst(TIterator const& i, const CContext& context)
1460 {
1461     HGVS_ASSERT_RULE(i, eID_nuc_subst);
1462     TIterator it = i->children.begin();
1463     CRef<CVariation> vr(new CVariation);
1464     CVariation_inst& var_inst = vr->SetData().SetInstance();
1465 
1466     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1467 
1468     if(it->value.id() == SGrammar::eID_raw_seq) {
1469         CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
1470         SetFirstPlacement(*vr).SetSeq(*seq_from);
1471         ++it;
1472     }
1473 
1474     ++it;//skip ">"
1475 
1476     CRef<CSeq_literal> seq_to = x_raw_seq(it, context);
1477     TDelta delta(new TDelta::TObjectType);
1478     delta->SetSeq().SetLiteral(*seq_to);
1479     var_inst.SetDelta().push_back(delta);
1480 
1481     var_inst.SetType(
1482             GetDelInsSubtype(
1483                 CVariationUtil::s_GetLength(SetFirstPlacement(*vr), NULL),
1484                 seq_to->GetLength()));
1485 
1486     return vr;
1487 }
1488 
1489 
x_nuc_inv(TIterator const & i,const CContext & context)1490 CRef<CVariation> CHgvsParser::x_nuc_inv(TIterator const& i, const CContext& context)
1491 {
1492     HGVS_ASSERT_RULE(i, eID_nuc_inv);
1493 
1494     TIterator it = i->children.begin();
1495     CRef<CVariation> vr(new CVariation);
1496     CVariation_inst& var_inst = vr->SetData().SetInstance();
1497     var_inst.SetType(CVariation_inst::eType_inv);
1498 
1499     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1500 
1501 #if 0
1502     TDelta delta(new TDelta::TObjectType);
1503     delta->SetSeq().SetLoc().Assign(*loc);
1504     delta->SetSeq().SetLoc().FlipStrand();
1505     var_inst.SetDelta().push_back(delta);
1506 #else
1507     //don't put anything in the delta, as the inversion sequence is placement-specific, not variation-specific
1508     var_inst.SetDelta();
1509 #endif
1510 
1511     ++it;
1512 
1513      //capture asserted seq
1514      if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
1515          TDelta dup_seq = x_seq_ref(it, context);
1516          if(dup_seq->GetSeq().IsLiteral()) {
1517              SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
1518          }
1519      }
1520 
1521     return vr;
1522 }
1523 
1524 
x_ssr(TIterator const & i,const CContext & context)1525 CRef<CVariation> CHgvsParser::x_ssr(TIterator const& i, const CContext& context)
1526 {
1527     HGVS_ASSERT_RULE(i, eID_ssr);
1528     TIterator it = i->children.begin();
1529     CRef<CVariation> vr(new CVariation);
1530     vr->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
1531 
1532 
1533     CRef<CSeq_literal> literal;
1534     if(it->value.id() == SGrammar::eID_raw_seq) {
1535         literal = x_raw_seq(it, context);
1536         ++it;
1537     }
1538 
1539     CVariantPlacement& p = SetFirstPlacement(*vr);
1540     p.Assign(context.GetPlacement());
1541 
1542     // The location may either specify a repeat unit,
1543     // or point to the first base of a repeat unit.
1544     // We normalize it so it is alwas the repeat unit.
1545 #if 1
1546     if(   !literal.IsNull()
1547        && literal->IsSetSeq_data()
1548        && literal->GetSeq_data().IsIupacna()
1549        && !p.IsSetStart_offset()
1550        && !p.IsSetStop_offset())
1551     {
1552         CRef<CSeq_loc> ssr_loc = FindSSRLoc(
1553                 p.GetLoc(),
1554                 literal->GetSeq_data().GetIupacna(),
1555                 context.GetScope());
1556         p.SetLoc().Assign(*ssr_loc);
1557     } else if(p.IsSetStart_offset() && !p.IsSetStop_offset()) {
1558         p.SetStop_offset(p.GetStart_offset()
1559                        + (literal.IsNull() ? 0 : literal->GetLength() - 1));
1560     }
1561 #else
1562     if(SetFirstPlacement(*vr).GetLoc().IsPnt() && !literal.IsNull()) {
1563         ExtendDownstream(SetFirstPlacement(*vr), literal->GetLength() - 1);
1564     }
1565 #endif
1566 
1567 
1568     if(it->value.id() == SGrammar::eID_ssr) { // list('['>>int_p>>']', '+') with '[',']','+' nodes discarded;
1569         //Note: see ssr grammar in the header for reasons why we have to match all alleles here
1570         //rather than match them separately as mut_insts
1571 
1572         vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_genotype);
1573         for(; it != i->children.end(); ++it) {
1574             string s1(it->value.begin(), it->value.end());
1575             CRef<CVariation> vr2(new CVariation);
1576             vr2->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
1577 
1578             TDelta delta(new TDelta::TObjectType);
1579             if(!literal.IsNull()) {
1580                 delta->SetSeq().SetLiteral().Assign(*literal);
1581             } else {
1582                 delta->SetSeq().SetThis();
1583             }
1584             delta->SetMultiplier(NStr::StringToInt(s1));
1585 
1586             vr2->SetData().SetInstance().SetDelta().push_back(delta);
1587             vr->SetData().SetSet().SetVariations().push_back(vr2);
1588         }
1589         vr = x_unwrap_iff_singleton(*vr);
1590     } else {
1591         TDelta delta(new TDelta::TObjectType);
1592         if(!literal.IsNull()) {
1593             delta->SetSeq().SetLiteral().Assign(*literal);
1594         } else {
1595             delta->SetSeq().SetThis();
1596         }
1597 
1598         SFuzzyInt int_fuzz = x_int_fuzz(it, context);
1599         delta->SetMultiplier(int_fuzz.value);
1600         if(int_fuzz.fuzz.IsNull()) {
1601             ;
1602         } else {
1603             delta->SetMultiplier_fuzz(*int_fuzz.fuzz);
1604         }
1605         vr->SetData().SetInstance().SetDelta().push_back(delta);
1606     }
1607 
1608     return vr;
1609 }
1610 
1611 
x_translocation(TIterator const & i,const CContext & context)1612 CRef<CVariation> CHgvsParser::x_translocation(TIterator const& i, const CContext& context)
1613 {
1614     HGVS_ASSERT_RULE(i, eID_translocation);
1615     TIterator it = i->children.end() - 1; //note: seq-loc follows iscn expression, i.e. last child
1616     CRef<CVariation> vr(new CVariation);
1617     CVariation_inst& var_inst = vr->SetData().SetInstance();
1618     var_inst.SetType(CVariation_inst::eType_translocation);
1619 
1620     CRef<CSeq_loc> loc = x_seq_loc(it, context);
1621     SetFirstPlacement(*vr).SetLoc().Assign(*loc);
1622     CVariationUtil util(context.GetScope());
1623     SetFirstPlacement(*vr).SetMol(util.GetMolType(sequence::GetId(*loc, NULL)));
1624 
1625     it = i->children.begin();
1626     string iscn_expr(it->value.begin(), it->value.end());
1627     vr->SetSynonyms().push_back("ISCN:" + iscn_expr);
1628     var_inst.SetDelta(); //no delta contents
1629 
1630     return vr;
1631 }
1632 
1633 
x_conversion(TIterator const & i,const CContext & context)1634 CRef<CVariation> CHgvsParser::x_conversion(TIterator const& i, const CContext& context)
1635 {
1636     HGVS_ASSERT_RULE(i, eID_conversion);
1637     TIterator it = i->children.begin();
1638     CRef<CVariation> vr(new CVariation);
1639     CVariation_inst& var_inst = vr->SetData().SetInstance();
1640     var_inst.SetType(CVariation_inst::eType_transposon);
1641 
1642     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1643 
1644     ++it;
1645     CRef<CSeq_loc> loc_other = x_seq_loc(it, context);
1646 
1647     TDelta delta(new TDelta::TObjectType);
1648     delta->SetSeq().SetLoc().Assign(*loc_other);
1649     var_inst.SetDelta().push_back(delta);
1650 
1651     return vr;
1652 }
1653 
1654 
x_prot_fs(TIterator const & i,const CContext & context)1655 CRef<CVariation> CHgvsParser::x_prot_fs(TIterator const& i, const CContext& context)
1656 {
1657     HGVS_ASSERT_RULE(i, eID_prot_fs);
1658     TIterator it = i->children.begin();
1659     CRef<CVariation> vr(new CVariation);
1660 
1661     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1662         HGVS_THROW(eContext, "Frameshift can only be specified in protein context");
1663     }
1664 
1665     vr->SetData().SetNote("Frameshift");
1666     vr->SetFrameshift();
1667 
1668     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1669 
1670     ++it; //skip 'fs'
1671     if(it != i->children.end()) {
1672         //fsX# description: the remaining tokens are 'X' and integer
1673         ++it; //skip 'X'
1674         if(it != i->children.end()) {
1675             string s(it->value.begin(), it->value.end());
1676             int x_length = NStr::StringToInt(s);
1677             vr->SetFrameshift().SetX_length(x_length);
1678         }
1679     }
1680 
1681     return vr;
1682 }
1683 
1684 
x_prot_ext(TIterator const & i,const CContext & context)1685 CRef<CVariation> CHgvsParser::x_prot_ext(TIterator const& i, const CContext& context)
1686 {
1687     HGVS_ASSERT_RULE(i, eID_prot_ext);
1688     TIterator it = i->children.begin();
1689 
1690     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1691         HGVS_THROW(eContext, "Expected protein context");
1692     }
1693 
1694     CRef<CVariation> vr(new CVariation);
1695     CVariation_inst& var_inst = vr->SetData().SetInstance();
1696     var_inst.SetType(CVariation_inst::eType_prot_other);
1697     string ext_type_str(it->value.begin(), it->value.end());
1698     ++it;
1699     string ext_len_str(it->value.begin(), it->value.end());
1700     int ext_len = NStr::StringToInt(ext_len_str);
1701 
1702     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1703     SetFirstPlacement(*vr).SetLoc().SetPnt().SetId().Assign(context.GetId());
1704     SetFirstPlacement(*vr).SetLoc().SetPnt().SetStrand(eNa_strand_plus);
1705 
1706     TDelta delta(new TDelta::TObjectType);
1707     delta->SetSeq().SetLiteral().SetLength(abs(ext_len) + 1);
1708         //extension of Met or X by N bases = replacing first or last AA with (N+1) AAs
1709 
1710     if(ext_type_str == "extMet") {
1711         if(ext_len > 0) {
1712             HGVS_THROW(eSemantic, "extMet must be followed by a negative integer");
1713         }
1714         SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(0);
1715         //extension precedes first AA
1716         var_inst.SetDelta().push_back(delta);
1717     } else if(ext_type_str == "extX"
1718            || ext_type_str == "ext*"
1719            || ext_type_str == "extTer")
1720     {
1721         if(ext_len < 0) {
1722             HGVS_THROW(eSemantic, "exX must be followed by a non-negative integer");
1723         }
1724 
1725         SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(context.GetBioseqHandle().GetInst_Length() - 1);
1726         //extension follows last AA
1727         var_inst.SetDelta().push_back(delta);
1728     } else {
1729         HGVS_THROW(eGrammatic, "Unexpected ext_type: " + ext_type_str);
1730     }
1731 
1732     return vr;
1733 }
1734 
1735 
x_prot_missense(TIterator const & i,const CContext & context)1736 CRef<CVariation> CHgvsParser::x_prot_missense(TIterator const& i, const CContext& context)
1737 {
1738     HGVS_ASSERT_RULE(i, eID_prot_missense);
1739     TIterator it = i->children.begin();
1740 
1741     CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
1742 
1743     if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1744         HGVS_THROW(eContext, "Expected protein context");
1745     }
1746 
1747     CRef<CVariation> vr(new CVariation);
1748     CVariation_inst& var_inst = vr->SetData().SetInstance();
1749     var_inst.SetType(prot_literal->GetLength() == 1 ?
1750             CVariation_inst::eType_prot_missense
1751           : CVariation_inst::eType_prot_other);
1752 
1753     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1754 
1755     TDelta delta(new TDelta::TObjectType);
1756     delta->SetSeq().SetLiteral(*prot_literal);
1757     var_inst.SetDelta().push_back(delta);
1758 
1759     return vr;
1760 }
1761 
1762 
x_string_content(TIterator const & i,const CContext & context)1763 CRef<CVariation>  CHgvsParser::x_string_content(TIterator const& i, const CContext& context)
1764 {
1765     CRef<CVariation> vr(new CVariation);
1766     string s(i->value.begin(), i->value.end());
1767     s = s.substr(1); //truncate the leading pipe
1768     SetFirstPlacement(*vr).Assign(context.GetPlacement());
1769     vr->SetData().SetNote(s);
1770     return vr;
1771 }
1772 
1773 
x_mut_inst(TIterator const & i,const CContext & context)1774 CRef<CVariation> CHgvsParser::x_mut_inst(TIterator const& i, const CContext& context)
1775 {
1776     HGVS_ASSERT_RULE(i, eID_mut_inst);
1777 
1778     TIterator it = i->children.begin();
1779 
1780     CRef<CVariation> vr(new CVariation);
1781     if(it->value.id() == SGrammar::eID_mut_inst) {
1782         string s(it->value.begin(), it->value.end());
1783         if(s == "?") {
1784             vr->SetData().SetUnknown();
1785             SetFirstPlacement(*vr).Assign(context.GetPlacement());
1786         } else {
1787             vr = x_string_content(it, context);
1788         }
1789     } else {
1790         vr =
1791             it->value.id() == SGrammar::eID_no_change     ? x_no_change(it, context)
1792           : it->value.id() == SGrammar::eID_delins        ? x_delins(it, context)
1793           : it->value.id() == SGrammar::eID_deletion      ? x_deletion(it, context)
1794           : it->value.id() == SGrammar::eID_insertion     ? x_insertion(it, context, true)
1795           : it->value.id() == SGrammar::eID_duplication   ? x_duplication(it, context)
1796           : it->value.id() == SGrammar::eID_nuc_subst     ? x_nuc_subst(it, context)
1797           : it->value.id() == SGrammar::eID_nuc_inv       ? x_nuc_inv(it, context)
1798           : it->value.id() == SGrammar::eID_ssr           ? x_ssr(it, context)
1799           : it->value.id() == SGrammar::eID_conversion    ? x_conversion(it, context)
1800           : it->value.id() == SGrammar::eID_prot_ext      ? x_prot_ext(it, context)
1801           : it->value.id() == SGrammar::eID_prot_fs       ? x_prot_fs(it, context)
1802           : it->value.id() == SGrammar::eID_prot_missense ? x_prot_missense(it, context)
1803           : it->value.id() == SGrammar::eID_translocation ? x_translocation(it, context)
1804           : CRef<CVariation>(NULL);
1805 
1806         if(vr.IsNull()) {
1807             HGVS_ASSERT_RULE(it, eID_NONE);
1808         }
1809     }
1810 
1811     return vr;
1812 }
1813 
x_expr1(TIterator const & i,const CContext & context)1814 CRef<CVariation> CHgvsParser::x_expr1(TIterator const& i, const CContext& context)
1815 {
1816     HGVS_ASSERT_RULE(i, eID_expr1);
1817     TIterator it = i->children.begin();
1818     CRef<CVariation> vr;
1819 
1820     string s(it->value.begin(), it->value.end());
1821     if(it->value.id() == i->value.id() && s == "(") {
1822         ++it;
1823         vr = x_expr1(it, context);
1824         SetComputational(*vr);
1825     } else if(it->value.id() == SGrammar::eID_list1a) {
1826         vr = x_list(it, context);
1827     } else if(it->value.id() == SGrammar::eID_header) {
1828         CContext local_ctx = x_header(it, context);
1829         ++it;
1830         vr = x_expr2(it, local_ctx);
1831     } else if(it->value.id() == SGrammar::eID_translocation) {
1832         vr = x_translocation(it, context);
1833     } else {
1834         HGVS_ASSERT_RULE(it, eID_NONE);
1835     }
1836 
1837     return vr;
1838 }
1839 
x_expr2(TIterator const & i,const CContext & context)1840 CRef<CVariation> CHgvsParser::x_expr2(TIterator const& i, const CContext& context)
1841 {
1842     HGVS_ASSERT_RULE(i, eID_expr2);
1843     TIterator it = i->children.begin();
1844     CRef<CVariation> vr;
1845 
1846     string s(it->value.begin(), it->value.end());
1847     if(it->value.id() == i->value.id() && s == "(") {
1848         ++it;
1849         vr = x_expr2(it, context);
1850         SetComputational(*vr);
1851     } else if(it->value.id() == SGrammar::eID_list2a) {
1852         vr = x_list(it, context);
1853     } else if(it->value.id() == SGrammar::eID_location) {
1854         CContext local_context(context);
1855         CRef<CVariantPlacement> placement = x_location(it, local_context);
1856         local_context.SetPlacement().Assign(*placement);
1857         ++it;
1858         vr = x_expr3(it, local_context);
1859     } else if(it->value.id() == SGrammar::eID_prot_ext) {
1860         vr = x_prot_ext(it, context);
1861     } else if(it->value.id() == SGrammar::eID_no_change) {
1862         vr = x_no_change(it, context);
1863     } else if(it->value.id() == i->value.id()) {
1864         vr.Reset(new CVariation);
1865         SetFirstPlacement(*vr).Assign(context.GetPlacement());
1866 
1867         if(s == "?") {
1868             vr->SetData().SetUnknown();
1869             //SetFirstPlacement(*vr).SetLoc().SetEmpty().Assign(context.GetId());
1870         } else if(s == "0?" || s == "0") {
1871             //loss of product: represent as deletion of the whole product sequence.
1872             SetFirstPlacement(*vr).SetLoc().SetWhole().Assign(context.GetId());
1873             CVariation_inst& var_inst = vr->SetData().SetInstance();
1874             var_inst.SetType(CVariation_inst::eType_del);
1875             CRef<CDelta_item> di(new CDelta_item);
1876             di->SetAction(CDelta_item::eAction_del_at);
1877             di->SetSeq().SetThis();
1878             var_inst.SetDelta().push_back(di);
1879 
1880             if(s == "0?") {
1881                 SetComputational(*vr);
1882             }
1883         } else {
1884             HGVS_THROW(eGrammatic, "Unexpected expr terminal: " + s);
1885         }
1886     } else {
1887         HGVS_ASSERT_RULE(it, eID_NONE);
1888     }
1889 
1890     return vr;
1891 }
1892 
1893 
x_expr3(TIterator const & i,const CContext & context)1894 CRef<CVariation> CHgvsParser::x_expr3(TIterator const& i, const CContext& context)
1895 {
1896     HGVS_ASSERT_RULE(i, eID_expr3);
1897     TIterator it = i->children.begin();
1898     CRef<CVariation> vr;
1899 
1900     string s(it->value.begin(), it->value.end());
1901     if(it->value.id() == i->value.id() && s == "(") {
1902         ++it;
1903         vr = x_expr3(it, context);
1904         SetComputational(*vr);
1905     } else if(it->value.id() == SGrammar::eID_list3a) {
1906         vr = x_list(it, context);
1907     } else if(it->value.id() == SGrammar::eID_mut_inst) {
1908         vr.Reset(new CVariation);
1909         vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_compound);
1910         for(; it != i->children.end(); ++it) {
1911             CRef<CVariation> inst_ref = x_mut_inst(it, context);
1912 
1913             if(inst_ref->GetData().IsNote()
1914               && inst_ref->GetData().GetNote() == "Frameshift")
1915             {
1916                if(vr->SetData().SetSet().SetVariations().size() > 0) {
1917                     //if inst_ref is a frameshift subexpression, we need to attach it as attribute of the
1918                     //previous variation in a compound inst-list, since frameshift is not a subtype of
1919                     //Variation.data, and thus not represented as a separate subvariation.
1920                     //e.g. p.Ile20Serfs - frameshift an attribute of Ile20Ser
1921                     vr->SetData().SetSet().SetVariations().back()->SetFrameshift().Assign(inst_ref->GetFrameshift());
1922                 } else {
1923                     //in updated HGVS spec fs can be a standalone, e.g. p.Ile20fs.
1924                     //In this case we'll represent it as p.Ile20Xaafs
1925                     CVariation_inst& inst = inst_ref->SetData().SetInstance();
1926                     inst.SetType(CVariation_inst::eType_prot_other);
1927 
1928                     CRef<CDelta_item> delta(new CDelta_item);
1929                     delta->SetSeq().SetLiteral().SetLength(1);
1930                     delta->SetSeq().SetLiteral().SetSeq_data().SetNcbieaa().Set("X");
1931                     inst.SetDelta().push_back(delta);
1932                     vr->SetData().SetSet().SetVariations().push_back(inst_ref);
1933                 }
1934             } else {
1935                 vr->SetData().SetSet().SetVariations().push_back(inst_ref);
1936             }
1937         }
1938         vr = x_unwrap_iff_singleton(*vr);
1939     } else {
1940         HGVS_ASSERT_RULE(it, eID_NONE);
1941     }
1942 
1943     return vr;
1944 }
1945 
1946 
x_list_delimiter(TIterator const & i,const CContext & context)1947 CVariation::TData::TSet::EData_set_type CHgvsParser::x_list_delimiter(TIterator const& i, const CContext& context)
1948 {
1949     HGVS_ASSERT_RULE(i, eID_list_delimiter);
1950     TIterator it = i->children.begin();
1951     string s(it->value.begin(), it->value.end());
1952 
1953     return s == "//" ? CVariation::TData::TSet::eData_set_type_chimeric
1954          : s == "/"  ? CVariation::TData::TSet::eData_set_type_mosaic
1955          : s == "+"  ? CVariation::TData::TSet::eData_set_type_genotype
1956          : s == ","  ? CVariation::TData::TSet::eData_set_type_products
1957          : s == ";"  ? CVariation::TData::TSet::eData_set_type_haplotype //note that within context of list#a, ";" delimits genotype
1958          : s == "(+)" || s == "(;)" ? CVariation::TData::TSet::eData_set_type_individual
1959          : CVariation::TData::TSet::eData_set_type_unknown;
1960 }
1961 
1962 
x_list(TIterator const & i,const CContext & context)1963 CRef<CVariation> CHgvsParser::x_list(TIterator const& i, const CContext& context)
1964 {
1965     if(!SGrammar::s_is_list(i->value.id())) {
1966         HGVS_ASSERT_RULE(i, eID_NONE);
1967     }
1968 
1969     CRef<CVariation> vr(new CVariation);
1970     TVariationSet& varset = vr->SetData().SetSet();
1971     varset.SetType(CVariation::TData::TSet::eData_set_type_unknown);
1972 
1973 
1974     for(TIterator it = i->children.begin(); it != i->children.end(); ++it) {
1975         //will process two elements from the children list: delimiter and following expression.
1976         //The first one does not have the delimiter. The delimiter determines the set-type.
1977         if(it != i->children.begin()) {
1978             if(SGrammar::s_is_list_a(i->value.id())) {
1979                 /*
1980                  * list#a is delimited by either ";"(HGVS-2.0) or "+"(HGVS_1.0);
1981                  * Both represent alleles within genotype.
1982                  * Note: the delimiter rule in the context is chset_p<>("+;"), i.e.
1983                  * a terminal, not a rule like list_delimiter; so calling
1984                  * x_list_delimiter(...) parser here would throw
1985                  */
1986                 varset.SetType(CVariation::TData::TSet::eData_set_type_genotype);
1987             } else {
1988                 CVariation::TData::TSet::EData_set_type set_type = x_list_delimiter(it, context);
1989                 if(varset.GetType() == CVariation::TData::TSet::eData_set_type_unknown) {
1990                     varset.SetType(set_type);
1991                 } else if(set_type != varset.GetType()) {
1992                     HGVS_THROW(eSemantic, "Non-unique delimiters within a list");
1993                 }
1994             }
1995             ++it;
1996         }
1997 
1998         CRef<CVariation> vr;
1999         if(it->value.id() == SGrammar::eID_expr1) {
2000             vr = x_expr1(it, context);
2001         } else if(it->value.id() == SGrammar::eID_expr2) {
2002             vr = x_expr2(it, context);
2003         } else if(it->value.id() == SGrammar::eID_expr3) {
2004             vr = x_expr3(it, context);
2005         } else if(SGrammar::s_is_list(it->value.id())) {
2006             vr = x_list(it, context);
2007         } else {
2008             HGVS_ASSERT_RULE(it, eID_NONE);
2009         }
2010 
2011         varset.SetVariations().push_back(vr);
2012     }
2013 
2014     vr = x_unwrap_iff_singleton(*vr);
2015     return vr;
2016 }
2017 
2018 
x_root(TIterator const & i,const CContext & context)2019 CRef<CVariation> CHgvsParser::x_root(TIterator const& i, const CContext& context)
2020 {
2021     HGVS_ASSERT_RULE(i, eID_root);
2022 
2023     CRef<CVariation> vr = x_list(i, context);
2024 
2025     RepackageAssertedSequence(*vr);
2026     AdjustMoltype(*vr, context.GetScope());
2027     CVariationUtil::s_FactorOutPlacements(*vr);
2028 
2029     vr->Index();
2030     return vr;
2031 }
2032 
x_unwrap_iff_singleton(CVariation & v)2033 CRef<CVariation>  CHgvsParser::x_unwrap_iff_singleton(CVariation& v)
2034 {
2035     if(v.GetData().IsSet() && v.GetData().GetSet().GetVariations().size() == 1) {
2036         CRef<CVariation> first = v.SetData().SetSet().SetVariations().front();
2037         if(!first->IsSetPlacements() && v.IsSetPlacements()) {
2038             first->SetPlacements() = v.SetPlacements();
2039         }
2040         return first;
2041     } else {
2042         return CRef<CVariation>(&v);
2043     }
2044 }
2045 
2046 
sx_AppendMoltypeExceptions(CVariation & v,CScope & scope)2047 void CHgvsParser::sx_AppendMoltypeExceptions(CVariation& v, CScope& scope)
2048 {
2049     CVariationUtil util(scope);
2050     for(CTypeIterator<CVariantPlacement> it(Begin(v)); it; ++it) {
2051         CVariantPlacement& p = *it;
2052         CVariantPlacement::TMol mol = util.GetMolType(sequence::GetId(p.GetLoc(), NULL));
2053         if(p.GetMol() != mol) {
2054             CRef<CVariantPlacement> p2(new CVariantPlacement);
2055             p2->Assign(p);
2056             p2->SetMol(mol);
2057 
2058             string asserted_header = CHgvsParser::s_SeqIdToHgvsStr(p, &scope);
2059             string expected_header = CHgvsParser::s_SeqIdToHgvsStr(*p2, &scope);
2060 
2061             CRef<CVariationException> ex(new CVariationException);
2062             ex->SetCode(CVariationException::eCode_inconsistent_asserted_moltype);
2063             ex->SetMessage("Inconsistent mol-type. asserted:'" + asserted_header + "'; expected:'" + expected_header + "'");
2064             p.SetExceptions().push_back(ex);
2065         }
2066     }
2067 }
2068 
AsVariation(const string & hgvs,TOpFlags flags)2069 CRef<CVariation> CHgvsParser::AsVariation(const string& hgvs, TOpFlags flags)
2070 {
2071     string hgvs2 = NStr::TruncateSpaces(hgvs);
2072     tree_parse_info<> info = pt_parse(hgvs2.c_str(), *s_grammar, +space_p);
2073 
2074     if(!info.full) {
2075 #if 0
2076         CNcbiOstrstream ostr;
2077         tree_to_xml(ostr, info.trees, hgvs2.c_str() , CHgvsParser::SGrammar::s_GetRuleNames());
2078         string tree_str = CNcbiOstrstreamToString(ostr);
2079 #endif
2080         HGVS_THROW(eGrammatic, "Syntax error at pos " + NStr::SizetToString(info.length + 1) + " in " + hgvs2 + "");
2081     }
2082 
2083     CContext context(m_scope, m_seq_id_resolvers, hgvs);
2084     CRef<CVariation> vr = x_root(info.trees.begin(), context);
2085     vr->SetName(hgvs2);
2086     sx_AppendMoltypeExceptions(*vr, context.GetScope());
2087 
2088     CVariationUtil util(context.GetScope());
2089     util.CheckAmbiguitiesInLiterals(*vr);
2090 
2091     return vr;
2092 }
2093 
2094 
2095 
AttachHgvs(CVariation & v)2096 void CHgvsParser::AttachHgvs(CVariation& v)
2097 {
2098     v.Index();
2099 
2100     //compute and attach placement-specific HGVS expressions
2101     for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) {
2102         CVariation& v2 = *it;
2103         if(!v2.IsSetPlacements()) {
2104             continue;
2105         }
2106         NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements()) {
2107             CVariantPlacement& p2 = **it2;
2108 
2109             if(!p2.GetLoc().GetId()) {
2110                 continue;
2111             }
2112 
2113             if(p2.GetMol() != CVariantPlacement::eMol_protein && v2.GetConsequenceParent()) {
2114                 //if this variation is in consequnece, only compute HGVS for protein variations
2115                 //(as otherwise it will throw - can't have HGVS expression for protein with nuc placement)
2116                 continue;
2117             }
2118 
2119             //compute hgvs-expression specific to the placement and the variation to which it is attached
2120             try {
2121                 string hgvs_expression = AsHgvsExpression(v2, CConstRef<CSeq_id>(p2.GetLoc().GetId()));
2122                 p2.SetHgvs_name(hgvs_expression);
2123             } catch (CException& e ) {
2124                 CNcbiOstrstream ostr;
2125                 ostr << MSerial_AsnText << p2;
2126                 string s = CNcbiOstrstreamToString(ostr);
2127                 NCBI_REPORT_EXCEPTION("Can't compute HGVS expression for " + s, e);
2128             }
2129         }
2130     }
2131 
2132     //If the root variation does not have placements (e.g. a container for placement-specific subvariations)
2133     //then compute the hgvs expression for the root placement and attach it to variation itself as a synonym.
2134     if(!v.IsSetPlacements()) {
2135         string root_output_hgvs = AsHgvsExpression(v);
2136         v.SetSynonyms().push_back(root_output_hgvs);
2137     }
2138 }
2139 
2140 
2141 
2142 };
2143 
2144 END_NCBI_SCOPE
2145 
2146