1 /* $Id: hgvs_parser2.cpp 612015 2020-07-14 17:21:57Z villamar $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 * Sample library
28 *
29 */
30
31 #include <ncbi_pch.hpp>
32
33
34 #include <serial/iterator.hpp>
35
36 #include <objects/seqalign/Seq_align.hpp>
37 #include <objects/seqalign/Spliced_seg.hpp>
38 #include <objects/seqalign/Spliced_exon.hpp>
39 #include <objects/seqalign/Product_pos.hpp>
40 #include <objects/seqalign/Prot_pos.hpp>
41
42 #include <objects/variation/Variation.hpp>
43 #include <objects/variation/VariantPlacement.hpp>
44 #include <objects/variation/VariationMethod.hpp>
45 #include <objects/variation/VariationException.hpp>
46
47 #include <objects/seqfeat/Seq_feat.hpp>
48 #include <objects/seqfeat/Variation_inst.hpp>
49 #include <objects/seqfeat/Delta_item.hpp>
50 #include <objects/seqfeat/Ext_loc.hpp>
51 #include <objects/seqfeat/BioSource.hpp>
52 #include <objects/seqfeat/SeqFeatXref.hpp>
53
54 #include <objects/seq/seqport_util.hpp>
55 #include <objects/seq/Seq_literal.hpp>
56 #include <objects/seq/Seq_data.hpp>
57 #include <objects/seq/Numbering.hpp>
58 #include <objects/seq/Num_ref.hpp>
59 #include <objects/seq/Annot_descr.hpp>
60 #include <objects/seq/Annotdesc.hpp>
61 #include <objects/seq/Seq_descr.hpp>
62
63 #include <objects/general/Object_id.hpp>
64 #include <objects/general/User_object.hpp>
65 #include <objects/general/Dbtag.hpp>
66
67 #include <objects/seqloc/Seq_point.hpp>
68 #include <objects/seqloc/Seq_loc_equiv.hpp>
69
70 #include <objmgr/util/sequence.hpp>
71 #include <objmgr/seq_vector.hpp>
72 #include <objmgr/align_ci.hpp>
73 #include <objmgr/feat_ci.hpp>
74 #include <objmgr/seq_loc_mapper.hpp>
75
76 #include <misc/hgvs/hgvs_parser2.hpp>
77 #include <misc/hgvs/variation_util2.hpp>
78
79
80 BEGIN_NCBI_SCOPE
81
82 namespace variation {
83
84 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)
85
86 #define HGVS_ASSERT_RULE(i, rule_id) \
87 if((i->value.id()) != (SGrammar::rule_id)) \
88 {HGVS_THROW(eGrammatic, "Unexpected rule " + CHgvsParser::SGrammar::s_GetRuleName(i->value.id()) ); }
89
90
91 CSafeStatic<CHgvsParser::SGrammar> CHgvsParser::s_grammar;
92
93 const char* CHgvsParser::SGrammar::s_rule_names[CHgvsParser::SGrammar::eNodeIds_SIZE] =
94 {
95 "NONE",
96 "root",
97 "list_delimiter",
98 "list1a",
99 "list2a",
100 "list3a",
101 "list1b",
102 "list2b",
103 "list3b",
104 "expr1",
105 "expr2",
106 "expr3",
107 "translocation",
108 "header",
109 "seq_id",
110 "mut_list",
111 "mut_ref",
112 "mol",
113 "int_fuzz",
114 "abs_pos",
115 "general_pos",
116 "fuzzy_pos",
117 "pos_spec",
118 "location",
119 "nuc_range",
120 "prot_range",
121 "mut_inst",
122 "raw_seq",
123 "raw_seq_or_len",
124 "aminoacid1",
125 "aminoacid2",
126 "aminoacid3",
127 "nuc_subst",
128 "deletion",
129 "insertion",
130 "delins",
131 "duplication",
132 "nuc_inv",
133 "ssr",
134 "conversion",
135 "seq_loc",
136 "seq_ref",
137 "prot_pos",
138 "prot_fs",
139 "prot_missense",
140 "prot_ext",
141 "no_change"
142 };
143
144
SetFirstPlacement(CVariation & v)145 CVariantPlacement& SetFirstPlacement(CVariation& v)
146 {
147 if(v.SetPlacements().size() == 0) {
148 CRef<CVariantPlacement> p(new CVariantPlacement);
149 v.SetPlacements().push_back(p);
150 }
151 return *v.SetPlacements().front();
152 }
153
SetComputational(CVariation & variation)154 void SetComputational(CVariation& variation)
155 {
156 CVariationMethod& m = variation.SetMethod();
157 m.SetMethod();
158
159 if(find(m.GetMethod().begin(),
160 m.GetMethod().end(),
161 CVariationMethod::eMethod_E_computational) == m.GetMethod().end())
162 {
163 m.SetMethod().push_back(CVariationMethod::eMethod_E_computational);
164 }
165 }
166
167
SeqsMatch(const string & query,const char * text)168 bool SeqsMatch(const string& query, const char* text)
169 {
170 static const char* iupac_bases = ".TGKCYSBAWRDMHVN"; //position of the iupac literal = 4-bit mask for A|C|G|T
171 for(size_t i = 0; i < query.size(); i++) {
172 size_t a = CTempString(iupac_bases).find(query[i]);
173 size_t b = CTempString(iupac_bases).find(text[i]);
174 if(!(a & b)) {
175 return false;
176 }
177 }
178 return true;
179 }
180
181
FindSSRLoc(const CSeq_loc & loc,const string & seq,CScope & scope)182 CRef<CSeq_loc> FindSSRLoc(const CSeq_loc& loc, const string& seq, CScope& scope)
183 {
184 //Extend the loc 10kb up and down; Find all occurences of seq in the resulting
185 //interval, create locs for individual repeat units; then merge them, and keep the interval that
186 //overlaps the original.
187
188 if (loc.GetStart(eExtreme_Positional) > loc.GetStop(eExtreme_Positional)) {
189 HGVS_THROW(eSemantic, "Invalid range: start position is bigger than stop position");
190 }
191 const TSeqPos ext_interval = 10000;
192
193 CRef<CSeq_loc> loc1 = sequence::Seq_loc_Merge(loc, CSeq_loc::fMerge_SingleRange, NULL);
194 CBioseq_Handle bsh = scope.GetBioseqHandle(sequence::GetId(loc, NULL));
195 TSeqPos seq_len = bsh.GetInst_Length();
196 loc1->SetInt().SetFrom() -= min(ext_interval, loc1->GetInt().GetFrom());
197 loc1->SetInt().SetTo() += min(ext_interval, seq_len - 1 - loc1->GetInt().GetTo());
198
199 CSeqVector v(*loc1, scope, CBioseq_Handle::eCoding_Iupac);
200 string str1;
201 v.GetSeqData(v.begin(), v.end(), str1);
202
203 CRef<CSeq_loc> container(new CSeq_loc(CSeq_loc::e_Mix));
204
205 if (str1.size() >= seq.size()) {
206 for (size_t i = 0u; i < str1.size() - seq.size(); ++i) {
207 if (SeqsMatch(seq, &str1[i])) {
208 CRef<CSeq_loc> repeat_unit_loc(new CSeq_loc);
209 repeat_unit_loc->Assign(*loc1);
210
211 if (sequence::GetStrand(loc, NULL) == eNa_strand_minus) {
212 repeat_unit_loc->SetInt().SetTo() -= i;
213 repeat_unit_loc->SetInt().SetFrom(repeat_unit_loc->GetInt().GetTo() - (seq.size() - 1));
214 } else {
215 repeat_unit_loc->SetInt().SetFrom() += i;
216 repeat_unit_loc->SetInt().SetTo(repeat_unit_loc->GetInt().GetFrom() + (seq.size() - 1));
217 }
218 container->SetMix().Set().push_back(repeat_unit_loc);
219 }
220 }
221 }
222
223 CRef<CSeq_loc> merged_repeats = sequence::Seq_loc_Merge(*container, CSeq_loc::fSortAndMerge_All, NULL);
224 merged_repeats->ChangeToMix();
225 CRef<CSeq_loc> result(new CSeq_loc(CSeq_loc::e_Null));
226 result->Assign(loc);
227
228 for(CSeq_loc_CI ci(*merged_repeats); ci; ++ci) {
229 const CSeq_loc& loc2 = ci.GetEmbeddingSeq_loc();
230 if(sequence::Compare(loc, loc2, NULL, sequence::fCompareOverlapping) != sequence::eNoOverlap) {
231 result->Add(loc2);
232 }
233 }
234
235 return sequence::Seq_loc_Merge(*result, CSeq_loc::fSortAndMerge_All, NULL);
236 }
237
238
239
240
s_SetStartOffset(CVariantPlacement & p,const CHgvsParser::SFuzzyInt & fint)241 void CHgvsParser::s_SetStartOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
242 {
243 p.ResetStart_offset();
244 p.ResetStart_offset_fuzz();
245 if(fint.value || fint.fuzz) {
246 p.SetStart_offset(fint.value);
247 }
248
249 if(fint.fuzz) {
250 p.SetStart_offset_fuzz().Assign(*fint.fuzz);
251 }
252
253 #if 0
254 if(!fint.value
255 && fint.fuzz
256 && fint.fuzz->IsLim()
257 && ( fint.fuzz->GetLim() == CInt_fuzz::eLim_lt
258 || fint.fuzz->GetLim() == CInt_fuzz::eLim_tl))
259 {
260 // VAR-832
261 // interpret c.x-? as c.x-(?_1), not as c.x+(?_0)
262 p.SetStart_offset(-1);
263 }
264 #endif
265
266 }
267
s_SetStopOffset(CVariantPlacement & p,const CHgvsParser::SFuzzyInt & fint)268 void CHgvsParser::s_SetStopOffset(CVariantPlacement& p, const CHgvsParser::SFuzzyInt& fint)
269 {
270 p.ResetStop_offset();
271 p.ResetStop_offset_fuzz();
272 if(fint.value || fint.fuzz) {
273 p.SetStop_offset(fint.value);
274 }
275
276 if(fint.fuzz) {
277 p.SetStop_offset_fuzz().Assign(*fint.fuzz);
278 }
279
280 #if 0
281 if(!fint.value
282 && fint.fuzz
283 && fint.fuzz->IsLim()
284 && ( fint.fuzz->GetLim() == CInt_fuzz::eLim_gt
285 || fint.fuzz->GetLim() == CInt_fuzz::eLim_tr))
286 {
287 // VAR-832
288 // interpret c.x+? as c.x+(1_?), not as c.x+(0_?)
289 p.SetStop_offset(1);
290 }
291 #endif
292 }
293
294
295
296 //if a variation has an asserted sequence, stored in placement.seq, repackage it as a set having
297 //the original variation and a synthetic one representing the asserted sequence. The placement.seq
298 //is cleared, as it is a placeholder for the actual reference sequence.
RepackageAssertedSequence(CVariation & vr)299 void RepackageAssertedSequence(CVariation& vr)
300 {
301 if(vr.IsSetPlacements() && SetFirstPlacement(vr).IsSetSeq()) {
302 CRef<CVariation> container(new CVariation);
303 container->SetPlacements() = vr.SetPlacements();
304
305 CRef<CVariation> orig(new CVariation);
306 orig->Assign(vr);
307 orig->ResetPlacements(); //location will be set on the package, as it is the same for both members
308
309 container->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_package);
310 container->SetData().SetSet().SetVariations().push_back(orig);
311
312 CRef<CVariation> asserted_vr(new CVariation);
313 asserted_vr->SetData().SetInstance().SetObservation(CVariation_inst::eObservation_asserted);
314 asserted_vr->SetData().SetInstance().SetType(CVariation_inst::eType_identity);
315
316 CRef<CDelta_item> delta(new CDelta_item);
317 delta->SetSeq().SetLiteral().Assign(SetFirstPlacement(vr).GetSeq());
318 asserted_vr->SetData().SetInstance().SetDelta().push_back(delta);
319
320 SetFirstPlacement(*container).ResetSeq();
321 container->SetData().SetSet().SetVariations().push_back(asserted_vr);
322
323 vr.Assign(*container);
324
325 } else if(vr.GetData().IsSet()) {
326 NON_CONST_ITERATE(CVariation::TData::TSet::TVariations, it, vr.SetData().SetSet().SetVariations()) {
327 RepackageAssertedSequence(**it);
328 }
329 }
330 }
331
332
333 //HGVS distinguished between c. n. and r. molecules, while in
334 //VariantPlacement we have moltype cdna (which maps onto c.) and rna (which maps onto n.)
335 //
336 //If we have an HGVS expression like NM_123456.7:r. the VariantPlacement will have moltype rna
337 //that would map-back onto NM_123456.7:n., which is what we don't want, as "n.' reserved
338 //for non-coding RNAs, so we'll convert rna moltype to cdna based on accession here.
339 //Note that this is a post-processing step, as in the midst of parsing we want to treat
340 //NM_123456.7:r. as non-coding rna, since these have absolute coordinates rather than cds-relative.
AdjustMoltype(CVariation & vr,CScope & scope)341 void AdjustMoltype(CVariation& vr, CScope& scope)
342 {
343 CVariationUtil util(scope);
344
345 for(CTypeIterator<CVariantPlacement> it(Begin(vr)); it; ++it) {
346 CVariantPlacement& p = *it;
347
348 if(p.IsSetMol()
349 && p.GetMol() == CVariantPlacement::eMol_rna
350 && p.GetLoc().GetId())
351 {
352 p.SetMol(util.GetMolType(*p.GetLoc().GetId()));
353 }
354 }
355 }
356
357
CContext(const CContext & other)358 CHgvsParser::CContext::CContext(const CContext& other)
359 : m_hgvs(other.m_hgvs)
360 {
361 this->m_bsh = other.m_bsh;
362 this->m_cds = other.m_cds;
363 this->m_scope = other.m_scope;
364 this->m_seq_id_resolvers = other.m_seq_id_resolvers;
365 this->m_placement.Reset();
366 if(other.m_placement) {
367 /*
368 * Note: need to make a copy of the placement, such that if preceding subvariation
369 * is location-specific and the other one isn't, first's placement is not
370 * given to the other one, e.g. "NM_004004.2:c.[35_36dup;40del]+[=]" - if we
371 * made a shallow copy, then the location context for "[=]" would be erroneously
372 * inherited from the last sibling: "NM_004004.2:c.40=" instead of whole "NM_004004.2:c.="
373 */
374 this->m_placement.Reset(new CVariantPlacement);
375 this->m_placement->Assign(*other.m_placement);
376 }
377 }
378
GetCDS() const379 const CSeq_feat& CHgvsParser::CContext::GetCDS() const
380 {
381 if(m_cds.IsNull()) {
382 HGVS_THROW(eContext, "No CDS feature in context");
383 }
384 return *m_cds;
385 }
386
GetId() const387 const CSeq_id& CHgvsParser::CContext::GetId() const
388 {
389 return sequence::GetId(GetPlacement().GetLoc(), NULL);
390 }
391
392
SetId(const CSeq_id & id,CVariantPlacement::TMol mol)393 void CHgvsParser::CContext::SetId(const CSeq_id& id, CVariantPlacement::TMol mol)
394 {
395 Clear();
396
397 SetPlacement().SetMol(mol);
398 SetPlacement().SetLoc().SetWhole().Assign(id);
399
400 m_bsh = m_scope->GetBioseqHandle(id);
401
402 if(!m_bsh) {
403 HGVS_THROW(eContext, "Cannnot get bioseq for seq-id " + id.AsFastaString());
404 }
405
406 if(mol == CVariantPlacement::eMol_cdna) {
407 SAnnotSelector sel;
408 sel.SetResolveTSE();
409 sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
410 for(CFeat_CI ci(m_bsh, sel); ci; ++ci) {
411 const CMappedFeat& mf = *ci;
412 if(mf.GetData().IsCdregion()) {
413 if(m_cds.IsNull()) {
414 m_cds.Reset(new CSeq_feat());
415 m_cds->Assign(mf.GetMappedFeature());
416 } else {
417 HGVS_THROW(eContext, "Multiple CDS features on the sequence");
418 }
419 }
420 }
421 if(m_cds.IsNull()) {
422 HGVS_THROW(eContext, "Could not find CDS feat");
423 }
424 }
425 }
426
427
s_GetRuleName(parser_id id)428 const string CHgvsParser::SGrammar::s_GetRuleName(parser_id id)
429 {
430 if(id.to_long() >= CHgvsParser::SGrammar::eNodeIds_SIZE) {
431 HGVS_THROW(eLogic, "Rule name not hardcoded");
432 } else {
433 return s_rule_names[id.to_long()];
434 }
435 }
436
437
x_int_fuzz(TIterator const & i,const CContext & context)438 CHgvsParser::SFuzzyInt CHgvsParser::x_int_fuzz(TIterator const& i, const CContext& context)
439 {
440 HGVS_ASSERT_RULE(i, eID_int_fuzz);
441 TIterator it = i->children.begin();
442
443 CHgvsParser::SFuzzyInt fint;
444 fint.fuzz.Reset(new CInt_fuzz);
445
446 if(i->children.size() == 1) { //e.g. '5' or '?'
447 string s(it->value.begin(), it->value.end());
448 if(s == "?") {
449 fint.SetPureFuzz();
450 } else {
451 fint.value = NStr::StringToInt(s);
452 fint.fuzz.Reset();
453 }
454 } else if(i->children.size() == 3) { //e.g. '(5)' or '(?)'
455 ++it;
456 string s(it->value.begin(), it->value.end());
457 if(s == "?") {
458 fint.SetPureFuzz();
459 } else {
460 fint.value = NStr::StringToInt(s);
461 fint.fuzz->SetLim(CInt_fuzz::eLim_unk);
462 }
463 } else if(i->children.size() == 5) { //e.g. '(5_7)' or '(?_10)'
464 ++it;
465 string s1(it->value.begin(), it->value.end());
466 ++it;
467 ++it;
468 string s2(it->value.begin(), it->value.end());
469
470 if(s1 == "?" && s2 == "?") {
471 fint.SetPureFuzz();
472 } else if(s1 != "?" && s2 != "?") {
473 fint.value = NStr::StringToInt(s1);
474 fint.fuzz->SetRange().SetMin(NStr::StringToInt(s1));
475 fint.fuzz->SetRange().SetMax(NStr::StringToInt(s2));
476 } else if(s2 == "?") {
477 fint.value = NStr::StringToInt(s1);
478 fint.fuzz->SetLim(CInt_fuzz::eLim_gt);
479 } else if(s1 == "?") {
480 fint.value = NStr::StringToInt(s2);
481 fint.fuzz->SetLim(CInt_fuzz::eLim_lt);
482 } else {
483 HGVS_THROW(eLogic, "Unreachable code");
484 }
485 }
486
487 return fint;
488 }
489
490
491
492 /* In HGVS:
493 * the nucleotide 3' of the translation stop codon is *1, the next *2, etc.
494 * # there is no nucleotide 0
495 * # nucleotide 1 is the A of the ATG-translation initiation codon
496 * # the nucleotide 5' of the ATG-translation initiation codon is -1, the previous -2, etc.
497 *
498 * I.e. need to adjust if dealing with positive coordinates, except for *-relative ones.
499 */
500 template<typename T>
AdjustHgvsCoord(T val,TSeqPos offset,bool adjust)501 T AdjustHgvsCoord(T val, TSeqPos offset, bool adjust)
502 {
503 val += offset;
504 if(adjust && val > (T)offset) {
505 // note: val may be unsigned, so check for (val+offset > offset)
506 // instead of (val > 0)
507 val -= 1;
508 }
509 return val;
510 }
511
x_abs_pos(TIterator const & i,const CContext & context)512 CRef<CSeq_point> CHgvsParser::x_abs_pos(TIterator const& i, const CContext& context)
513 {
514 HGVS_ASSERT_RULE(i, eID_abs_pos);
515 TIterator it = i->children.begin();
516
517 TSeqPos offset(0);
518 bool adjust = true; //see AdjustHgvsCoord comments above
519 if(i->children.size() == 2) {
520 adjust = false;
521 string s(it->value.begin(), it->value.end());
522 if(s != "*") {
523 HGVS_THROW(eGrammatic, "Expected literal '*'");
524 }
525 if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_cdna) {
526 HGVS_THROW(eContext, "Expected 'c.' context for stop-codon-relative coordinate");
527 }
528
529 offset = context.GetCDS().GetLocation().GetStop(eExtreme_Biological);
530 ++it;
531 } else {
532 if (context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna) {
533 //Note: in RNA coordinates (r.) the coordinates are absolute, like in genomic sequences,
534 // "The RNA sequence type uses only GenBank mRNA records. The value 1 is assigned to the first
535 // base in the record and from there all bases are counted normally."
536 //so the cds-start offset applies only to "c." coordinates
537 offset = context.GetCDS().GetLocation().GetStart(eExtreme_Biological);
538 }
539 }
540
541 CRef<CSeq_point> pnt(new CSeq_point);
542 {{
543 SFuzzyInt int_fuzz = x_int_fuzz(it, context);
544
545 pnt->SetId().Assign(context.GetId());
546 pnt->SetStrand(eNa_strand_plus);
547 if(int_fuzz.IsPureFuzz()) {
548 pnt->SetPoint(kInvalidSeqPos);
549 } else {
550 pnt->SetPoint(AdjustHgvsCoord(int_fuzz.value, offset, adjust));
551 }
552
553 if(!int_fuzz.fuzz.IsNull()) {
554 pnt->SetFuzz(*int_fuzz.fuzz);
555 if(pnt->GetFuzz().IsRange()) {
556 CInt_fuzz_Base::TRange& r = pnt->SetFuzz().SetRange();
557 r.SetMin(AdjustHgvsCoord(r.GetMin(), offset, adjust));
558 r.SetMax(AdjustHgvsCoord(r.GetMax(), offset, adjust));
559 }
560 }
561 }}
562
563 return pnt;
564 }
565
IsPureFuzzPoint(const CSeq_point & p)566 bool IsPureFuzzPoint(const CSeq_point& p)
567 {
568 return p.GetPoint() == kInvalidSeqPos
569 && p.IsSetFuzz()
570 && p.GetFuzz().IsLim()
571 && p.GetFuzz().GetLim() == CInt_fuzz::eLim_other;
572 }
573
574
575 /*
576 * general_pos is either simple abs-pos that is passed down to x_abs_pos,
577 * or an intronic location that is specified by a mapping point in the
578 * local coordinates and the -upstream / +downstream offset after remapping.
579 *
580 * The mapping point can either be an abs-pos in local coordinates, or
581 * specified as offset in intron-specific coordinate system where IVS# specifies
582 * the intron number
583 */
x_general_pos(TIterator const & i,const CContext & context)584 CHgvsParser::SOffsetPoint CHgvsParser::x_general_pos(TIterator const& i, const CContext& context)
585 {
586 HGVS_ASSERT_RULE(i, eID_general_pos);
587
588 SOffsetPoint ofpnt;
589
590 if(i->children.size() == 1) {
591 //local coordinates
592 ofpnt.pnt = x_abs_pos(i->children.begin(), context);
593 } else {
594 //(str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
595
596 TIterator it = i->children.end() - 1;
597 ofpnt.offset = x_int_fuzz(it, context);
598 --it;
599
600 //adjust for sign; convert +? or -? offsets to
601 //0> or 0<
602 string s_sign(it->value.begin(), it->value.end());
603 int sign1 = s_sign == "-" ? -1 : 1;
604 ofpnt.offset.value *= sign1;
605 if(ofpnt.offset.fuzz && ofpnt.offset.fuzz->IsRange()) {
606 ofpnt.offset.fuzz->SetRange().SetMin() *= sign1;
607 ofpnt.offset.fuzz->SetRange().SetMax() *= sign1;
608 } else if(ofpnt.offset.IsPureFuzz()) {
609 ofpnt.offset.fuzz->SetLim(sign1 < 0 ? CInt_fuzz::eLim_lt : CInt_fuzz::eLim_gt);
610 }
611
612 --it;
613 if(it->value.id() == SGrammar::eID_abs_pos) {
614 //base-loc is an abs-pos
615 ofpnt.pnt = x_abs_pos(i->children.begin(), context);
616 } else {
617 //base-loc is IVS-relative.
618 ofpnt.pnt.Reset(new CSeq_point);
619 ofpnt.pnt->SetId().Assign(context.GetId());
620 ofpnt.pnt->SetStrand(eNa_strand_plus);
621
622 TIterator it = i->children.begin();
623 string s_ivs(it->value.begin(), it->value.end());
624 ++it;
625 string s_ivs_num(it->value.begin(), it->value.end());
626 int ivs_num = NStr::StringToInt(s_ivs_num);
627
628 //If IVS3+50, the mapping point is the last base of third exon
629 //if IVS3-50, the mapping point is the first base of the fourth exon
630 size_t target_exon_num = sign1 < 0 ? ivs_num + 1 : ivs_num;
631
632 SAnnotSelector sel;
633 sel.IncludeFeatSubtype(CSeqFeatData::eSubtype_exon);
634 CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(context.GetId());
635 size_t exon_num = 1;
636 //Note: IVS is cDNA-centric, so we'll have to use ordinals of the exons instead of /number qual
637 for(CFeat_CI ci(bsh, sel); ci; ++ci) {
638 const CMappedFeat& mf = *ci;
639 if(exon_num == target_exon_num) {
640 ofpnt.pnt->SetPoint(sign1 > 0 ? mf.GetLocation().GetStop(eExtreme_Biological)
641 : mf.GetLocation().GetStart(eExtreme_Biological));
642 break;
643 }
644 exon_num++;
645 }
646 }
647 }
648
649 //We could be dealing with improper HGVS expression (that we need to support anyway)
650 //where the coordinate extends beyond the range of sequence
651 //e.g. NM_000518:c.-78A>G, where codon-start is at 51. In this case the resulting
652 //coordinate will be negative; we'll convert it to offset-format (27 bases upstream of pos 0)
653
654 if(!IsPureFuzzPoint(*ofpnt.pnt)
655 && ( context.GetPlacement().GetMol() == CVariantPlacement::eMol_cdna
656 || context.GetPlacement().GetMol() == CVariantPlacement::eMol_rna))
657 {
658 if(static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint()) < 0) {
659 ofpnt.offset.value += static_cast<TSignedSeqPos>(ofpnt.pnt->GetPoint());
660 ofpnt.pnt->SetPoint(0);
661 } else {
662 CVariationUtil vu(context.GetScope());
663 TSeqPos transcribed_len = vu.GetEffectiveTranscriptLength(context.GetBioseqHandle());
664 // Note: positions past the last exon are interpreted as specifying near-gene/intronic
665 // target rather than polyA. JIRA:SNP-7341
666
667 if(ofpnt.pnt->GetPoint() >= transcribed_len) {
668 TSeqPos anchor_pos = transcribed_len - 1;
669 TSeqPos overrun = ofpnt.pnt->GetPoint() - anchor_pos;
670 ofpnt.offset.value += overrun;
671 ofpnt.pnt->SetPoint(anchor_pos);
672 }
673 }
674 }
675
676 return ofpnt;
677 }
678
679
x_fuzzy_pos(TIterator const & i,const CContext & context)680 CHgvsParser::SOffsetPoint CHgvsParser::x_fuzzy_pos(TIterator const& i, const CContext& context)
681 {
682 HGVS_ASSERT_RULE(i, eID_fuzzy_pos);
683
684 SOffsetPoint pnt1 = x_general_pos(i->children.begin(), context);
685 SOffsetPoint pnt2 = x_general_pos(i->children.begin() + 1, context);
686
687 //Verify that on the same seq-id.
688 if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
689 HGVS_THROW(eSemantic, "Points in a fuzzy pos are on different sequences");
690 }
691 if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
692 HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
693 }
694
695 if(IsPureFuzzPoint(*pnt1.pnt) || IsPureFuzzPoint(*pnt2.pnt)) {
696 if(IsPureFuzzPoint(*pnt2.pnt)) {
697 pnt1.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tr);
698 return pnt1;
699 } else {
700 pnt2.pnt->SetFuzz().SetLim(CInt_fuzz::eLim_tl);
701 return pnt2;
702 }
703 }
704
705 if((pnt1.offset.value != 0 || pnt2.offset.value != 0) && !pnt1.pnt->Equals(*pnt2.pnt)) {
706 HGVS_THROW(eSemantic, "Base-points in an intronic fuzzy position must be equal");
707 }
708
709 SOffsetPoint pnt = pnt1;
710 if(pnt1.offset.value != pnt2.offset.value) {
711 pnt.offset.fuzz.Reset(new CInt_fuzz);
712 pnt.offset.fuzz->SetRange().SetMin(pnt1.offset.value);
713 pnt.offset.fuzz->SetRange().SetMax(pnt2.offset.value);
714 }
715
716 return pnt;
717
718 #if 0
719 todo: reconcile
720 //If Both are Empty - the result is empty, otherwise reconciliate
721 if(pnt1.pnt->GetPoint() == kInvalidSeqPos && pnt2.pnt->GetPoint() == kInvalidSeqPos) {
722 pnt.pnt = pnt1.pnt;
723 pnt.offset = pnt1.offset;
724 } else {
725 pnt.pnt.Reset(new CSeq_point);
726 pnt.pnt.Assign(*pnt1.pnt);
727
728 TSeqPos min_pos = min(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
729 TSeqPos max_pos = max(pnt1.pnt->GetPoint(), pnt2.pnt->GetPoint());
730
731 if(!pnt1->IsSetFuzz() && !pnt2->IsSetFuzz()) {
732 //Both are non-fuzzy - create the min-max fuzz.
733 //(10+50_10+60)
734 pnt->SetFuzz().SetRange().SetMin(min_pos);
735 pnt->SetFuzz().SetRange().SetMax(max_pos);
736
737 } else if(pnt1->IsSetFuzz() && pnt2->IsSetFuzz()) {
738 //Both are fuzzy - reconcile the fuzz.
739
740 if(pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
741 && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
742 {
743 //fuzz points inwards - create min-max fuzz
744 //(10+?_11-?)
745 pnt->SetFuzz().SetRange().SetMin(min_pos);
746 pnt->SetFuzz().SetRange().SetMax(max_pos);
747
748 } else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
749 && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
750 {
751 //fuzz points outwards - set fuzz to unk
752 //(10-?_10+?)
753 //(?_10+?)
754 //(10-?_?)
755 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
756
757 } else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tl
758 && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tl)
759 {
760 //fuzz is to the left - use 5'-most
761 //(?_10-?)
762 //(10-?_11-?)
763 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? max_pos : min_pos);
764
765 } else if (pnt1->GetFuzz().GetLim() == CInt_fuzz::eLim_tr
766 && pnt2->GetFuzz().GetLim() == CInt_fuzz::eLim_tr)
767 {
768 //fuzz is to the right - use 3'-most
769 //(10+?_?)
770 //(10+?_11+?)
771 pnt->SetPoint(pnt->GetStrand() == eNa_strand_minus ? min_pos : max_pos);
772
773 } else {
774 pnt->SetFuzz().SetLim(CInt_fuzz::eLim_unk);
775 }
776 } else {
777 // One of the two is non-fuzzy:
778 // use it to specify position, and the fuzz of the other to specify the fuzz
779 // e.g. (10+5_10+?) -> loc1=100005; loc2=100000tr -> 100005tr
780
781 pnt->Assign(pnt1->IsSetFuzz() ? *pnt2 : *pnt1);
782 pnt->SetFuzz().Assign(pnt1->IsSetFuzz() ? pnt1->GetFuzz()
783 : pnt2->GetFuzz());
784
785 }
786 }
787 #endif
788
789
790
791 }
792
GetUniquePrimaryTranscriptId(CBioseq_Handle & bsh)793 CSeq_id_Handle GetUniquePrimaryTranscriptId(CBioseq_Handle& bsh)
794 {
795 set<CSeq_id_Handle> annotated_transcript_feats;
796 set<CSeq_id_Handle> annotated_transcript_aligns;
797
798 SAnnotSelector sel;
799 sel.SetResolveTSE();
800 sel.IncludeFeatType(CSeqFeatData::e_Rna);
801 for(CFeat_CI ci(bsh, sel); ci; ++ci) {
802 const CMappedFeat& mf = *ci;
803 if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
804 annotated_transcript_feats.insert(
805 CSeq_id_Handle::GetHandle(*mf.GetProduct().GetId()));
806 }
807 }
808
809 for(CAlign_CI ci(bsh, sel); ci; ++ci) {
810 const CSeq_align& aln = *ci;
811 if(aln.GetSegs().IsSpliced()) {
812 annotated_transcript_aligns.insert(
813 CSeq_id_Handle::GetHandle(aln.GetSeq_id(0)));
814 }
815 }
816
817 vector<CSeq_id_Handle> v;
818 set_intersection(annotated_transcript_feats.begin(),
819 annotated_transcript_feats.end(),
820 annotated_transcript_aligns.begin(),
821 annotated_transcript_aligns.end(),
822 back_inserter(v));
823
824 return v.size() != 1 ? CSeq_id_Handle()
825 : sequence::GetId(v.front(),
826 bsh.GetScope(),
827 sequence::eGetId_ForceAcc);
828 }
829
IsLRG(CBioseq_Handle & bsh)830 bool IsLRG(CBioseq_Handle& bsh)
831 {
832 ITERATE(CBioseq_Handle::TId, it, bsh.GetId()) {
833 const CSeq_id& id = *it->GetSeqId();
834 if( id.IsGeneral()
835 && id.GetGeneral().GetDb() == "LRG")
836 {
837 return true;
838 }
839 }
840 return false;
841 }
842
843
x_header(TIterator const & i,const CContext & context)844 CHgvsParser::CContext CHgvsParser::x_header(TIterator const& i, const CContext& context)
845 {
846 HGVS_ASSERT_RULE(i, eID_header);
847
848 CContext ctx(context);
849
850 TIterator it = i->children.rbegin()->children.begin();
851 string mol(it->value.begin(), it->value.end());
852 CVariantPlacement::TMol mol_type =
853 mol == "c" ? CVariantPlacement::eMol_cdna
854 : mol == "g" ? CVariantPlacement::eMol_genomic
855 : mol == "r" ? CVariantPlacement::eMol_rna
856 : mol == "n" ? CVariantPlacement::eMol_rna
857 : mol == "p" ? CVariantPlacement::eMol_protein
858 : mol == "m" ? CVariantPlacement::eMol_mitochondrion
859 : mol == "mt" ? CVariantPlacement::eMol_mitochondrion
860 : CVariantPlacement::eMol_unknown;
861
862 it = (i->children.rbegin() + 1)->children.begin();
863 string id_str(it->value.begin(), it->value.end());
864
865 CSeq_id_Handle idh = context.ResolevSeqId(id_str);
866 CBioseq_Handle bsh = context.GetScope().GetBioseqHandle(idh);
867 if(!bsh) {
868 HGVS_THROW(eSemantic, "Could not resolve seq-id-str='" + id_str + "'; idh=" + idh.AsString());
869 }
870
871
872 if(bsh.IsNucleotide()
873 && mol_type == CVariantPlacement::eMol_protein
874 && NStr::Find(id_str, "CCDS") == 0)
875 {
876 //If we have something like CCDS2.1:p., the CCDS2.1 will resolve
877 //to an NM, but we need to resolve it to corresponding NP.
878 //
879 //We could do this for all seq-ids, as long as there's unique CDS,
880 //but as per SNP-4536 the seq-id and moltype correspondence must be enforced,
881 //so we do it for CCDS only
882
883 SAnnotSelector sel;
884 sel.SetResolveTSE();
885 sel.IncludeFeatType(CSeqFeatData::e_Cdregion);
886 bool already_found = false;
887 for(CFeat_CI ci(bsh, sel); ci; ++ci) {
888 const CMappedFeat& mf = *ci;
889 if(mf.IsSetProduct() && mf.GetProduct().GetId()) {
890 if(already_found) {
891 HGVS_THROW(eSemantic, "Can't resolve to prot - multiple CDSes on " + idh.AsString());
892 } else {
893 idh = sequence::GetId(
894 *mf.GetProduct().GetId(),
895 context.GetScope(),
896 sequence::eGetId_ForceAcc);
897 already_found = true;
898 }
899 }
900 }
901 if(!already_found) {
902 HGVS_THROW(eSemantic, "Can't resolve to prot - can't find CDS on " + idh.AsString());
903 }
904 } else if( ( mol_type == CVariantPlacement::eMol_cdna
905 || mol_type == CVariantPlacement::eMol_rna)
906 && idh.IdentifyAccession() == CSeq_id::eAcc_refseq_genomic) //e.g. NG_009822.1:c.1437+1G>A
907 {
908 //VAR-861
909 if(!IsLRG(bsh)) {
910 HGVS_THROW(eSemantic, "Specifying c. expression in NG coordinates is only supported for LRG subset where NM/NG associations are stable");
911 }
912
913 CSeq_id_Handle idh2 = GetUniquePrimaryTranscriptId(bsh);
914 if(!idh2) {
915 HGVS_THROW(eSemantic, "Can't resolve to a unique transcript on NG: " + idh.AsString());
916 } else {
917 idh = idh2;
918 }
919 }
920
921 ctx.SetId(*idh.GetSeqId(), mol_type);
922
923 if(i->children.size() == 3) {
924 it = (i->children.rbegin() + 2)->children.begin();
925 string tag_str(it->value.begin(), it->value.end());
926 //record tag in context, if it is necessary in the future
927 }
928
929 return ctx;
930 }
931
932
x_pos_spec(TIterator const & i,const CContext & context)933 CHgvsParser::SOffsetPoint CHgvsParser::x_pos_spec(TIterator const& i, const CContext& context)
934 {
935 HGVS_ASSERT_RULE(i, eID_pos_spec);
936
937 SOffsetPoint pnt;
938 TIterator it = i->children.begin();
939 if(it->value.id() == SGrammar::eID_general_pos) {
940 pnt = x_general_pos(it, context);
941 } else if(it->value.id() == SGrammar::eID_fuzzy_pos) {
942 pnt = x_fuzzy_pos(it, context);
943 } else {
944 bool flip_strand = false;
945 if(i->children.size() == 3) {
946 //first child is 'o' - opposite
947 flip_strand = true;
948 ++it;
949 }
950
951 CContext local_ctx = x_header(it, context);
952 ++it;
953 pnt = x_pos_spec(it, local_ctx);
954
955 if(flip_strand) {
956 pnt.pnt->FlipStrand();
957 }
958 }
959
960 return pnt;
961 }
962
963
x_prot_pos(TIterator const & i,const CContext & context)964 CHgvsParser::SOffsetPoint CHgvsParser::x_prot_pos(TIterator const& i, const CContext& context)
965 {
966 HGVS_ASSERT_RULE(i, eID_prot_pos);
967 TIterator it = i->children.begin();
968
969 CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
970
971 if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
972 HGVS_THROW(eSemantic, "Expected protein context");
973 }
974
975 if(prot_literal->GetLength() != 1) {
976 HGVS_THROW(eSemantic, "Expected single aa literal in prot-pos");
977 }
978
979 ++it;
980 SOffsetPoint pnt = x_pos_spec(it, context);
981
982 pnt.asserted_sequence = prot_literal->GetSeq_data().GetNcbieaa();
983
984 return pnt;
985 }
986
987
x_range(TIterator const & i,const CContext & context)988 CRef<CVariantPlacement> CHgvsParser::x_range(TIterator const& i, const CContext& context)
989 {
990 SOffsetPoint pnt1, pnt2;
991
992 CRef<CVariantPlacement> p(new CVariantPlacement);
993 p->Assign(context.GetPlacement());
994
995 if(i->value.id() == SGrammar::eID_prot_range) {
996 pnt1 = x_prot_pos(i->children.begin(), context);
997 pnt2 = x_prot_pos(i->children.begin() + 1, context);
998 } else if(i->value.id() == SGrammar::eID_nuc_range) {
999 pnt1 = x_pos_spec(i->children.begin(), context);
1000 pnt2 = x_pos_spec(i->children.begin() + 1, context);
1001 } else {
1002 HGVS_ASSERT_RULE(i, eID_NONE);
1003 }
1004
1005 if(!pnt1.pnt->GetId().Equals(pnt2.pnt->GetId())) {
1006 HGVS_THROW(eSemantic, "Range-loc start/stop are on different seq-ids.");
1007 }
1008 if(pnt1.pnt->GetStrand() != pnt2.pnt->GetStrand()) {
1009 HGVS_THROW(eSemantic, "Range-loc start/stop are on different strands.");
1010 }
1011
1012 p->SetLoc().SetInt().SetId(pnt1.pnt->SetId());
1013 p->SetLoc().SetInt().SetFrom(pnt1.pnt->GetPoint());
1014 p->SetLoc().SetInt().SetTo(pnt2.pnt->GetPoint());
1015 p->SetLoc().SetInt().SetStrand(pnt1.pnt->GetStrand());
1016 if(pnt1.pnt->IsSetFuzz()) {
1017 p->SetLoc().SetInt().SetFuzz_from(pnt1.pnt->SetFuzz());
1018 }
1019
1020 if(pnt2.pnt->IsSetFuzz()) {
1021 p->SetLoc().SetInt().SetFuzz_to(pnt2.pnt->SetFuzz());
1022 }
1023
1024 s_SetStartOffset(*p, pnt1.offset);
1025 s_SetStopOffset(*p, pnt2.offset);
1026
1027 if(pnt1.asserted_sequence != "" || pnt2.asserted_sequence != "") {
1028 //for proteins, the asserted sequence is specified as part of location, rather than variation
1029 p->SetSeq().SetLength(CVariationUtil::s_GetLength(*p, NULL));
1030 string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
1031 ? p->SetSeq().SetSeq_data().SetNcbieaa().Set()
1032 : p->SetSeq().SetSeq_data().SetIupacna().Set();
1033 seq_str = pnt1.asserted_sequence + ".." + pnt2.asserted_sequence;
1034 }
1035
1036 return p;
1037 }
1038
x_location(TIterator const & i,const CContext & context)1039 CRef<CVariantPlacement> CHgvsParser::x_location(TIterator const& i, const CContext& context)
1040 {
1041 HGVS_ASSERT_RULE(i, eID_location);
1042
1043 CRef<CVariantPlacement> placement(new CVariantPlacement);
1044 placement->Assign(context.GetPlacement());
1045
1046 TIterator it = i->children.begin();
1047 CRef<CSeq_loc> loc(new CSeq_loc);
1048 if(it->value.id() == SGrammar::eID_prot_pos || it->value.id() == SGrammar::eID_pos_spec) {
1049 SOffsetPoint pnt = it->value.id() == SGrammar::eID_prot_pos
1050 ? x_prot_pos(it, context)
1051 : x_pos_spec(it, context);
1052 placement->SetLoc().SetPnt(*pnt.pnt);
1053 s_SetStartOffset(*placement, pnt.offset);
1054 if(pnt.asserted_sequence != "") {
1055 placement->SetSeq().SetLength(CVariationUtil::s_GetLength(*placement, NULL));
1056 string& seq_str = (context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein)
1057 ? placement->SetSeq().SetSeq_data().SetNcbieaa().Set()
1058 : placement->SetSeq().SetSeq_data().SetIupacna().Set();
1059 seq_str = pnt.asserted_sequence;
1060 }
1061
1062 //todo point with pos=0 and fuzz=unk -> unknown pos ->set loc to empty
1063
1064 } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
1065 placement = x_range(it, context);
1066 } else {
1067 HGVS_ASSERT_RULE(it, eID_NONE);
1068 }
1069
1070 if(placement->GetLoc().IsPnt() && placement->GetLoc().GetPnt().GetPoint() == kInvalidSeqPos) {
1071 placement->SetLoc().SetEmpty().Assign(context.GetId());
1072 }
1073
1074 CVariationUtil util(context.GetScope());
1075 if(CVariationUtil::eFail == util.CheckExonBoundary(*placement)) {
1076 CRef<CVariationException> exception(new CVariationException);
1077 exception->SetCode(CVariationException::eCode_hgvs_exon_boundary);
1078 exception->SetMessage("HGVS exon-boundary position not represented in the transcript annotation");
1079 placement->SetExceptions().push_back(exception);
1080 }
1081 util.CheckPlacement(*placement);
1082
1083 return placement;
1084 }
1085
1086
x_seq_loc(TIterator const & i,const CContext & context)1087 CRef<CSeq_loc> CHgvsParser::x_seq_loc(TIterator const& i, const CContext& context)
1088 {
1089 HGVS_ASSERT_RULE(i, eID_seq_loc);
1090 TIterator it = i->children.begin();
1091
1092 bool flip_strand = false;
1093 if(i->children.size() == 3) {
1094 //first child is 'o' - opposite
1095 flip_strand = true;
1096 ++it;
1097 }
1098
1099 CContext local_context = x_header(it, context);
1100 ++it;
1101 CRef<CVariantPlacement> p = x_location(it, local_context);
1102
1103 if(flip_strand) {
1104 p->SetLoc().FlipStrand();
1105 }
1106
1107 if(p->IsSetStop_offset() || p->IsSetStart_offset()) {
1108 HGVS_THROW(eSemantic, "Intronic seq-locs are not supported in this context");
1109 }
1110
1111 CRef<CSeq_loc> loc(new CSeq_loc);
1112 loc->Assign(p->GetLoc());
1113 return loc;
1114 }
1115
x_raw_seq_or_len(TIterator const & i,const CContext & context)1116 CRef<CSeq_literal> CHgvsParser::x_raw_seq_or_len(TIterator const& i, const CContext& context)
1117 {
1118 HGVS_ASSERT_RULE(i, eID_raw_seq_or_len);
1119
1120 CRef<CSeq_literal> literal;
1121 TIterator it = i->children.begin();
1122
1123 if(it == i->children.end()) {
1124 HGVS_THROW(eLogic, "Unexpected parse-tree state when parsing " + context.GetHgvs());
1125 }
1126
1127 if(it->value.id() == SGrammar::eID_raw_seq) {
1128 literal = x_raw_seq(it, context);
1129 } else if(it->value.id() == SGrammar::eID_int_fuzz) {
1130 SFuzzyInt int_fuzz = x_int_fuzz(it, context);
1131 literal.Reset(new CSeq_literal);
1132 literal->SetLength(int_fuzz.value);
1133 if(int_fuzz.fuzz.IsNull()) {
1134 ;//no-fuzz;
1135 } else if(int_fuzz.IsPureFuzz()) {
1136 //unknown length (no value) - will represent as length=0 with gt fuzz
1137 literal->SetFuzz().SetLim(CInt_fuzz::eLim_gt);
1138 } else {
1139 literal->SetFuzz(*int_fuzz.fuzz);
1140 }
1141 } else {
1142 HGVS_ASSERT_RULE(it, eID_NONE);
1143 }
1144 return literal;
1145 }
1146
x_seq_ref(TIterator const & i,const CContext & context)1147 CHgvsParser::TDelta CHgvsParser::x_seq_ref(TIterator const& i, const CContext& context)
1148 {
1149 HGVS_ASSERT_RULE(i, eID_seq_ref);
1150 CHgvsParser::TDelta delta(new TDelta::TObjectType);
1151 TIterator it = i->children.begin();
1152
1153 if(it->value.id() == SGrammar::eID_seq_loc) {
1154 CRef<CSeq_loc> loc = x_seq_loc(it, context);
1155 delta->SetSeq().SetLoc(*loc);
1156 } else if(it->value.id() == SGrammar::eID_nuc_range || it->value.id() == SGrammar::eID_prot_range) {
1157 CRef<CVariantPlacement> p = x_range(it, context);
1158 if(p->IsSetStart_offset() || p->IsSetStop_offset()) {
1159 HGVS_THROW(eSemantic, "Intronic loc is not supported in this context");
1160 }
1161 delta->SetSeq().SetLoc().Assign(p->GetLoc());
1162 } else if(it->value.id() == SGrammar::eID_raw_seq_or_len) {
1163 CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
1164 delta->SetSeq().SetLiteral(*literal);
1165 } else {
1166 HGVS_ASSERT_RULE(it, eID_NONE);
1167 }
1168
1169 return delta;
1170 }
1171
1172
s_hgvsaa2ncbieaa(const string & hgvsaa,string & out)1173 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, string& out)
1174 {
1175 //try to interpret sequence that was matched by either of aminoacid1, aminoacid, or aminoacid3
1176 string tmp_out(""); //so that the caller may pass same variable for in and out
1177 bool ret = s_hgvsaa2ncbieaa(hgvsaa, true, tmp_out);
1178 if(!ret) {
1179 ret = s_hgvsaa2ncbieaa(hgvsaa, false, tmp_out);
1180 }
1181 if(!ret) {
1182 ret = s_hgvs_iupacaa2ncbieaa(hgvsaa, tmp_out);
1183 }
1184
1185 out = tmp_out;
1186 return ret;
1187 }
1188
s_hgvs_iupacaa2ncbieaa(const string & hgvsaa,string & out)1189 bool CHgvsParser::s_hgvs_iupacaa2ncbieaa(const string& hgvsaa, string& out)
1190 {
1191 out = hgvsaa;
1192
1193 //"X" used to mean "Ter" in HGVS; now it means "unknown aminoacid"
1194 //Still, we'll interpret it as Ter, simply beacuse it is more likely
1195 //that the submitter is using legacy representation.
1196 NStr::ReplaceInPlace(out, "X", "*");
1197 NStr::ReplaceInPlace(out, "?", "X");
1198 return true;
1199 }
1200
s_hgvsaa2ncbieaa(const string & hgvsaa,bool uplow,string & out)1201 bool CHgvsParser::s_hgvsaa2ncbieaa(const string& hgvsaa, bool uplow, string& out)
1202 {
1203 string in = hgvsaa;
1204 out = "";
1205 while(in != "") {
1206 bool found = false;
1207 for(size_t i_ncbistdaa = 0; i_ncbistdaa < 28; i_ncbistdaa++) {
1208 string iupac3 = CSeqportUtil::GetIupacaa3(i_ncbistdaa);
1209 if(NStr::StartsWith(in, uplow ? iupac3 : NStr::ToUpper(iupac3))) {
1210 size_t i_ncbieaa = CSeqportUtil::GetMapToIndex(CSeq_data::e_Ncbistdaa,
1211 CSeq_data::e_Ncbieaa,
1212 i_ncbistdaa);
1213 out += CSeqportUtil::GetCode(CSeq_data::e_Ncbieaa, i_ncbieaa);
1214 found = true;
1215 break;
1216 }
1217 }
1218 if(found) {
1219 in = in.substr(3);
1220 } else if(NStr::StartsWith(in, "*")) { out.push_back('*'); in = in.substr(1);
1221 } else if(NStr::StartsWith(in, "X")) { out.push_back('*'); in = in.substr(1);
1222 } else if(NStr::StartsWith(in, "?")) { out.push_back('X'); in = in.substr(1);
1223 //} else if(NStr::StartsWith(in, "STOP", NStr::eNocase)) { out.push_back('X'); in = in.substr(4); //VAR-283
1224 } else {
1225 out = hgvsaa;
1226 return false;
1227 }
1228 }
1229 return true;
1230 }
1231
1232
1233
x_raw_seq(TIterator const & i,const CContext & context)1234 CRef<CSeq_literal> CHgvsParser::x_raw_seq(TIterator const& i, const CContext& context)
1235 {
1236 HGVS_ASSERT_RULE(i, eID_raw_seq);
1237 TIterator it = i->children.begin();
1238
1239 string seq_str(it->value.begin(), it->value.end());
1240
1241 CRef<CSeq_literal>literal(new CSeq_literal);
1242 if(context.GetPlacement().GetMol() == CVariantPlacement::eMol_protein) {
1243 s_hgvsaa2ncbieaa(seq_str, seq_str);
1244 literal->SetSeq_data().SetNcbieaa().Set(seq_str);
1245 } else {
1246 seq_str = NStr::ToUpper(seq_str);
1247 NStr::ReplaceInPlace(seq_str, "U", "T");
1248 literal->SetSeq_data().SetIupacna().Set(seq_str);
1249 }
1250
1251 literal->SetLength(seq_str.size());
1252
1253 vector<TSeqPos> bad;
1254 CSeqportUtil::Validate(literal->GetSeq_data(), &bad);
1255
1256 if(bad.size() > 0) {
1257 HGVS_THROW(eSemantic, "Invalid sequence at pos " + NStr::IntToString(bad[0]) + " in " + seq_str);
1258 }
1259
1260 return literal;
1261 }
1262
1263
GetDeltaLength(const CDelta_item & delta,int loc_len)1264 int GetDeltaLength(const CDelta_item& delta, int loc_len)
1265 {
1266 int len = !delta.IsSetSeq() ? 0
1267 : delta.GetSeq().IsLiteral() ? delta.GetSeq().GetLiteral().GetLength()
1268 : delta.GetSeq().IsLoc() ? sequence::GetLength(delta.GetSeq().GetLoc(), NULL)
1269 : delta.GetSeq().IsThis() ? loc_len
1270 : 0;
1271 if(delta.IsSetMultiplier()) {
1272 len *= delta.GetMultiplier();
1273 }
1274 return len;
1275 }
1276
GetDelInsSubtype(int del_len,int ins_len)1277 CVariation_inst::EType GetDelInsSubtype(int del_len, int ins_len)
1278 {
1279 return del_len > 0 && ins_len == 0 ? CVariation_inst::eType_del
1280 : del_len == 0 && ins_len > 0 ? CVariation_inst::eType_ins
1281 : del_len == ins_len && del_len != 1 ? CVariation_inst::eType_mnp
1282 : del_len == ins_len && del_len == 1 ? CVariation_inst::eType_snv
1283 : CVariation_inst::eType_delins;
1284 }
1285
x_delins(TIterator const & i,const CContext & context)1286 CRef<CVariation> CHgvsParser::x_delins(TIterator const& i, const CContext& context)
1287 {
1288 HGVS_ASSERT_RULE(i, eID_delins);
1289 TIterator it = i->children.begin();
1290 CRef<CVariation> del_vr = x_deletion(it, context);
1291 ++it;
1292 CRef<CVariation> ins_vr = x_insertion(it, context, false);
1293 //note: don't verify location, as it must be len=2 for pure-insertion only
1294
1295 //The resulting delins variation has deletion's placement (with asserted seq, if any),
1296 //and insertion's inst, except action type is "replace" (default) rather than "ins-before",
1297 //so we reset action
1298
1299 int placement_len = CVariationUtil::s_GetLength(SetFirstPlacement(*del_vr), NULL);
1300 int del_len = GetDeltaLength(*del_vr->GetData().GetInstance().GetDelta().front(), placement_len);
1301 int ins_len = GetDeltaLength(*ins_vr->GetData().GetInstance().GetDelta().front(), placement_len);
1302 del_vr->SetData().SetInstance().SetType(GetDelInsSubtype(del_len, ins_len));
1303
1304 del_vr->SetData().SetInstance().SetDelta() = ins_vr->SetData().SetInstance().SetDelta();
1305 del_vr->SetData().SetInstance().SetDelta().front()->ResetAction();
1306
1307 if(ins_len == 1 && del_len == 1) {
1308 CRef<CVariationException> ex(new CVariationException);
1309 ex->SetCode(CVariationException::eCode_hgvs_parsing);
1310 ex->SetMessage("delins used for single-nt substitution");
1311 SetFirstPlacement(*del_vr).SetExceptions().push_back(ex);
1312 }
1313
1314 return del_vr;
1315 }
1316
x_deletion(TIterator const & i,const CContext & context)1317 CRef<CVariation> CHgvsParser::x_deletion(TIterator const& i, const CContext& context)
1318 {
1319 HGVS_ASSERT_RULE(i, eID_deletion);
1320 TIterator it = i->children.begin();
1321 CRef<CVariation> vr(new CVariation);
1322 CVariation_inst& var_inst = vr->SetData().SetInstance();
1323
1324 var_inst.SetType(CVariation_inst::eType_del);
1325 CVariantPlacement& p = SetFirstPlacement(*vr);
1326 p.Assign(context.GetPlacement());
1327
1328 CRef<CDelta_item> di(new CDelta_item);
1329 di->SetAction(CDelta_item::eAction_del_at);
1330 di->SetSeq().SetThis();
1331 var_inst.SetDelta().push_back(di);
1332
1333 ++it;
1334
1335 if(it != i->children.end() && it->value.id() == SGrammar::eID_raw_seq_or_len) {
1336 CRef<CSeq_literal> literal = x_raw_seq_or_len(it, context);
1337 ++it;
1338 SetFirstPlacement(*vr).SetSeq(*literal);
1339
1340 if(literal->GetLength() != CVariationUtil::s_GetLength(p, NULL)) {
1341 CRef<CVariationException> ex(new CVariationException);
1342 ex->SetCode(CVariationException::eCode_hgvs_parsing);
1343 ex->SetMessage("Sequence length is inconsistent with location length");
1344 p.SetExceptions().push_back(ex);
1345 }
1346 }
1347
1348 var_inst.SetDelta();
1349 return vr;
1350 }
1351
1352
x_insertion(TIterator const & i,const CContext & context,bool check_loc)1353 CRef<CVariation> CHgvsParser::x_insertion(TIterator const& i, const CContext& context, bool check_loc)
1354 {
1355 HGVS_ASSERT_RULE(i, eID_insertion);
1356 TIterator it = i->children.begin();
1357 ++it; //skip ins
1358 CRef<CVariation> vr(new CVariation);
1359 CVariation_inst& var_inst = vr->SetData().SetInstance();
1360
1361 var_inst.SetType(CVariation_inst::eType_ins);
1362
1363 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1364
1365 if(check_loc && CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != 2) {
1366 HGVS_THROW(eSemantic, "Location must be a dinucleotide");
1367 }
1368
1369 TDelta delta_ins = x_seq_ref(it, context);
1370
1371 //todo:
1372 //alternative representation: if delta is literal, might use action=morph and prefix/suffix the insertion with the flanking nucleotides.
1373 delta_ins->SetAction(CDelta_item::eAction_ins_before);
1374
1375 var_inst.SetDelta().push_back(delta_ins);
1376
1377 return vr;
1378 }
1379
1380
x_duplication(TIterator const & i,const CContext & context)1381 CRef<CVariation> CHgvsParser::x_duplication(TIterator const& i, const CContext& context)
1382 {
1383 HGVS_ASSERT_RULE(i, eID_duplication);
1384 TIterator it = i->children.begin();
1385 CRef<CVariation> vr(new CVariation);
1386 CVariation_inst& var_inst = vr->SetData().SetInstance();
1387 var_inst.SetType(CVariation_inst::eType_ins); //replace seq @ location with this*2
1388
1389 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1390
1391 TDelta delta(new TDelta::TObjectType);
1392 delta->SetSeq().SetThis(); //delta->SetSeq().SetLoc(vr->SetLocation());
1393 delta->SetMultiplier(2);
1394 var_inst.SetDelta().push_back(delta);
1395
1396 ++it; //skip dup
1397
1398 //the next node is either expected length or expected sequence
1399 if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
1400 TDelta dup_seq = x_seq_ref(it, context);
1401 if(dup_seq->GetSeq().IsLiteral()) {
1402 SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
1403
1404 if(CVariationUtil::s_GetLength(*vr->GetPlacements().front(), NULL) != dup_seq->GetSeq().GetLiteral().GetLength()) {
1405 HGVS_THROW(eSemantic, "Location length and asserted sequence length differ");
1406 }
1407 }
1408 }
1409
1410 return vr;
1411 }
1412
1413
x_no_change(TIterator const & i,const CContext & context)1414 CRef<CVariation> CHgvsParser::x_no_change(TIterator const& i, const CContext& context)
1415 {
1416 HGVS_ASSERT_RULE(i, eID_no_change);
1417 TIterator it = i->children.begin();
1418 CRef<CVariation> vr(new CVariation);
1419 CVariation_inst& var_inst = vr->SetData().SetInstance();
1420
1421 CVariantPlacement& p = SetFirstPlacement(*vr);
1422 p.Assign(context.GetPlacement());
1423
1424 //VAR-574: The no-change variation is interpreted as X>X
1425
1426 if(it->value.id() == SGrammar::eID_raw_seq) {
1427 CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
1428 p.SetSeq(*seq_from);
1429 ++it;
1430 } else if(p.GetLoc().IsWhole()) {
1431 // will fall-back 'identity' inst-type instead of whole-seq mnp
1432 } else {
1433 CVariationUtil util(context.GetScope());
1434 util.AttachSeq(p);
1435 p.ResetExceptions();
1436 // if could not attach seq (e.g. too-large or intronic context)
1437 // will fall-back on 'identity' inst-type below, so ignoring
1438 // the exceptions.
1439 }
1440
1441 var_inst.SetType(
1442 p.GetMol() == CVariantPlacement::eMol_protein ? CVariation_inst::eType_prot_silent
1443 : !p.IsSetSeq() ? CVariation_inst::eType_identity
1444 : p.GetSeq().GetLength() == 1 ? CVariation_inst::eType_snv
1445 : CVariation_inst::eType_mnp);
1446
1447 TDelta delta(new TDelta::TObjectType);
1448 if(p.IsSetSeq()) {
1449 delta->SetSeq().SetLiteral().Assign(p.GetSeq());
1450 } else {
1451 delta->SetSeq().SetThis();
1452 }
1453 var_inst.SetDelta().push_back(delta);
1454
1455 return vr;
1456 }
1457
1458
x_nuc_subst(TIterator const & i,const CContext & context)1459 CRef<CVariation> CHgvsParser::x_nuc_subst(TIterator const& i, const CContext& context)
1460 {
1461 HGVS_ASSERT_RULE(i, eID_nuc_subst);
1462 TIterator it = i->children.begin();
1463 CRef<CVariation> vr(new CVariation);
1464 CVariation_inst& var_inst = vr->SetData().SetInstance();
1465
1466 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1467
1468 if(it->value.id() == SGrammar::eID_raw_seq) {
1469 CRef<CSeq_literal> seq_from = x_raw_seq(it, context);
1470 SetFirstPlacement(*vr).SetSeq(*seq_from);
1471 ++it;
1472 }
1473
1474 ++it;//skip ">"
1475
1476 CRef<CSeq_literal> seq_to = x_raw_seq(it, context);
1477 TDelta delta(new TDelta::TObjectType);
1478 delta->SetSeq().SetLiteral(*seq_to);
1479 var_inst.SetDelta().push_back(delta);
1480
1481 var_inst.SetType(
1482 GetDelInsSubtype(
1483 CVariationUtil::s_GetLength(SetFirstPlacement(*vr), NULL),
1484 seq_to->GetLength()));
1485
1486 return vr;
1487 }
1488
1489
x_nuc_inv(TIterator const & i,const CContext & context)1490 CRef<CVariation> CHgvsParser::x_nuc_inv(TIterator const& i, const CContext& context)
1491 {
1492 HGVS_ASSERT_RULE(i, eID_nuc_inv);
1493
1494 TIterator it = i->children.begin();
1495 CRef<CVariation> vr(new CVariation);
1496 CVariation_inst& var_inst = vr->SetData().SetInstance();
1497 var_inst.SetType(CVariation_inst::eType_inv);
1498
1499 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1500
1501 #if 0
1502 TDelta delta(new TDelta::TObjectType);
1503 delta->SetSeq().SetLoc().Assign(*loc);
1504 delta->SetSeq().SetLoc().FlipStrand();
1505 var_inst.SetDelta().push_back(delta);
1506 #else
1507 //don't put anything in the delta, as the inversion sequence is placement-specific, not variation-specific
1508 var_inst.SetDelta();
1509 #endif
1510
1511 ++it;
1512
1513 //capture asserted seq
1514 if(it != i->children.end() && it->value.id() == SGrammar::eID_seq_ref) {
1515 TDelta dup_seq = x_seq_ref(it, context);
1516 if(dup_seq->GetSeq().IsLiteral()) {
1517 SetFirstPlacement(*vr).SetSeq(dup_seq->SetSeq().SetLiteral());
1518 }
1519 }
1520
1521 return vr;
1522 }
1523
1524
x_ssr(TIterator const & i,const CContext & context)1525 CRef<CVariation> CHgvsParser::x_ssr(TIterator const& i, const CContext& context)
1526 {
1527 HGVS_ASSERT_RULE(i, eID_ssr);
1528 TIterator it = i->children.begin();
1529 CRef<CVariation> vr(new CVariation);
1530 vr->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
1531
1532
1533 CRef<CSeq_literal> literal;
1534 if(it->value.id() == SGrammar::eID_raw_seq) {
1535 literal = x_raw_seq(it, context);
1536 ++it;
1537 }
1538
1539 CVariantPlacement& p = SetFirstPlacement(*vr);
1540 p.Assign(context.GetPlacement());
1541
1542 // The location may either specify a repeat unit,
1543 // or point to the first base of a repeat unit.
1544 // We normalize it so it is alwas the repeat unit.
1545 #if 1
1546 if( !literal.IsNull()
1547 && literal->IsSetSeq_data()
1548 && literal->GetSeq_data().IsIupacna()
1549 && !p.IsSetStart_offset()
1550 && !p.IsSetStop_offset())
1551 {
1552 CRef<CSeq_loc> ssr_loc = FindSSRLoc(
1553 p.GetLoc(),
1554 literal->GetSeq_data().GetIupacna(),
1555 context.GetScope());
1556 p.SetLoc().Assign(*ssr_loc);
1557 } else if(p.IsSetStart_offset() && !p.IsSetStop_offset()) {
1558 p.SetStop_offset(p.GetStart_offset()
1559 + (literal.IsNull() ? 0 : literal->GetLength() - 1));
1560 }
1561 #else
1562 if(SetFirstPlacement(*vr).GetLoc().IsPnt() && !literal.IsNull()) {
1563 ExtendDownstream(SetFirstPlacement(*vr), literal->GetLength() - 1);
1564 }
1565 #endif
1566
1567
1568 if(it->value.id() == SGrammar::eID_ssr) { // list('['>>int_p>>']', '+') with '[',']','+' nodes discarded;
1569 //Note: see ssr grammar in the header for reasons why we have to match all alleles here
1570 //rather than match them separately as mut_insts
1571
1572 vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_genotype);
1573 for(; it != i->children.end(); ++it) {
1574 string s1(it->value.begin(), it->value.end());
1575 CRef<CVariation> vr2(new CVariation);
1576 vr2->SetData().SetInstance().SetType(CVariation_inst::eType_microsatellite);
1577
1578 TDelta delta(new TDelta::TObjectType);
1579 if(!literal.IsNull()) {
1580 delta->SetSeq().SetLiteral().Assign(*literal);
1581 } else {
1582 delta->SetSeq().SetThis();
1583 }
1584 delta->SetMultiplier(NStr::StringToInt(s1));
1585
1586 vr2->SetData().SetInstance().SetDelta().push_back(delta);
1587 vr->SetData().SetSet().SetVariations().push_back(vr2);
1588 }
1589 vr = x_unwrap_iff_singleton(*vr);
1590 } else {
1591 TDelta delta(new TDelta::TObjectType);
1592 if(!literal.IsNull()) {
1593 delta->SetSeq().SetLiteral().Assign(*literal);
1594 } else {
1595 delta->SetSeq().SetThis();
1596 }
1597
1598 SFuzzyInt int_fuzz = x_int_fuzz(it, context);
1599 delta->SetMultiplier(int_fuzz.value);
1600 if(int_fuzz.fuzz.IsNull()) {
1601 ;
1602 } else {
1603 delta->SetMultiplier_fuzz(*int_fuzz.fuzz);
1604 }
1605 vr->SetData().SetInstance().SetDelta().push_back(delta);
1606 }
1607
1608 return vr;
1609 }
1610
1611
x_translocation(TIterator const & i,const CContext & context)1612 CRef<CVariation> CHgvsParser::x_translocation(TIterator const& i, const CContext& context)
1613 {
1614 HGVS_ASSERT_RULE(i, eID_translocation);
1615 TIterator it = i->children.end() - 1; //note: seq-loc follows iscn expression, i.e. last child
1616 CRef<CVariation> vr(new CVariation);
1617 CVariation_inst& var_inst = vr->SetData().SetInstance();
1618 var_inst.SetType(CVariation_inst::eType_translocation);
1619
1620 CRef<CSeq_loc> loc = x_seq_loc(it, context);
1621 SetFirstPlacement(*vr).SetLoc().Assign(*loc);
1622 CVariationUtil util(context.GetScope());
1623 SetFirstPlacement(*vr).SetMol(util.GetMolType(sequence::GetId(*loc, NULL)));
1624
1625 it = i->children.begin();
1626 string iscn_expr(it->value.begin(), it->value.end());
1627 vr->SetSynonyms().push_back("ISCN:" + iscn_expr);
1628 var_inst.SetDelta(); //no delta contents
1629
1630 return vr;
1631 }
1632
1633
x_conversion(TIterator const & i,const CContext & context)1634 CRef<CVariation> CHgvsParser::x_conversion(TIterator const& i, const CContext& context)
1635 {
1636 HGVS_ASSERT_RULE(i, eID_conversion);
1637 TIterator it = i->children.begin();
1638 CRef<CVariation> vr(new CVariation);
1639 CVariation_inst& var_inst = vr->SetData().SetInstance();
1640 var_inst.SetType(CVariation_inst::eType_transposon);
1641
1642 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1643
1644 ++it;
1645 CRef<CSeq_loc> loc_other = x_seq_loc(it, context);
1646
1647 TDelta delta(new TDelta::TObjectType);
1648 delta->SetSeq().SetLoc().Assign(*loc_other);
1649 var_inst.SetDelta().push_back(delta);
1650
1651 return vr;
1652 }
1653
1654
x_prot_fs(TIterator const & i,const CContext & context)1655 CRef<CVariation> CHgvsParser::x_prot_fs(TIterator const& i, const CContext& context)
1656 {
1657 HGVS_ASSERT_RULE(i, eID_prot_fs);
1658 TIterator it = i->children.begin();
1659 CRef<CVariation> vr(new CVariation);
1660
1661 if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1662 HGVS_THROW(eContext, "Frameshift can only be specified in protein context");
1663 }
1664
1665 vr->SetData().SetNote("Frameshift");
1666 vr->SetFrameshift();
1667
1668 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1669
1670 ++it; //skip 'fs'
1671 if(it != i->children.end()) {
1672 //fsX# description: the remaining tokens are 'X' and integer
1673 ++it; //skip 'X'
1674 if(it != i->children.end()) {
1675 string s(it->value.begin(), it->value.end());
1676 int x_length = NStr::StringToInt(s);
1677 vr->SetFrameshift().SetX_length(x_length);
1678 }
1679 }
1680
1681 return vr;
1682 }
1683
1684
x_prot_ext(TIterator const & i,const CContext & context)1685 CRef<CVariation> CHgvsParser::x_prot_ext(TIterator const& i, const CContext& context)
1686 {
1687 HGVS_ASSERT_RULE(i, eID_prot_ext);
1688 TIterator it = i->children.begin();
1689
1690 if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1691 HGVS_THROW(eContext, "Expected protein context");
1692 }
1693
1694 CRef<CVariation> vr(new CVariation);
1695 CVariation_inst& var_inst = vr->SetData().SetInstance();
1696 var_inst.SetType(CVariation_inst::eType_prot_other);
1697 string ext_type_str(it->value.begin(), it->value.end());
1698 ++it;
1699 string ext_len_str(it->value.begin(), it->value.end());
1700 int ext_len = NStr::StringToInt(ext_len_str);
1701
1702 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1703 SetFirstPlacement(*vr).SetLoc().SetPnt().SetId().Assign(context.GetId());
1704 SetFirstPlacement(*vr).SetLoc().SetPnt().SetStrand(eNa_strand_plus);
1705
1706 TDelta delta(new TDelta::TObjectType);
1707 delta->SetSeq().SetLiteral().SetLength(abs(ext_len) + 1);
1708 //extension of Met or X by N bases = replacing first or last AA with (N+1) AAs
1709
1710 if(ext_type_str == "extMet") {
1711 if(ext_len > 0) {
1712 HGVS_THROW(eSemantic, "extMet must be followed by a negative integer");
1713 }
1714 SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(0);
1715 //extension precedes first AA
1716 var_inst.SetDelta().push_back(delta);
1717 } else if(ext_type_str == "extX"
1718 || ext_type_str == "ext*"
1719 || ext_type_str == "extTer")
1720 {
1721 if(ext_len < 0) {
1722 HGVS_THROW(eSemantic, "exX must be followed by a non-negative integer");
1723 }
1724
1725 SetFirstPlacement(*vr).SetLoc().SetPnt().SetPoint(context.GetBioseqHandle().GetInst_Length() - 1);
1726 //extension follows last AA
1727 var_inst.SetDelta().push_back(delta);
1728 } else {
1729 HGVS_THROW(eGrammatic, "Unexpected ext_type: " + ext_type_str);
1730 }
1731
1732 return vr;
1733 }
1734
1735
x_prot_missense(TIterator const & i,const CContext & context)1736 CRef<CVariation> CHgvsParser::x_prot_missense(TIterator const& i, const CContext& context)
1737 {
1738 HGVS_ASSERT_RULE(i, eID_prot_missense);
1739 TIterator it = i->children.begin();
1740
1741 CRef<CSeq_literal> prot_literal = x_raw_seq(it, context);
1742
1743 if(context.GetPlacement().GetMol() != CVariantPlacement::eMol_protein) {
1744 HGVS_THROW(eContext, "Expected protein context");
1745 }
1746
1747 CRef<CVariation> vr(new CVariation);
1748 CVariation_inst& var_inst = vr->SetData().SetInstance();
1749 var_inst.SetType(prot_literal->GetLength() == 1 ?
1750 CVariation_inst::eType_prot_missense
1751 : CVariation_inst::eType_prot_other);
1752
1753 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1754
1755 TDelta delta(new TDelta::TObjectType);
1756 delta->SetSeq().SetLiteral(*prot_literal);
1757 var_inst.SetDelta().push_back(delta);
1758
1759 return vr;
1760 }
1761
1762
x_string_content(TIterator const & i,const CContext & context)1763 CRef<CVariation> CHgvsParser::x_string_content(TIterator const& i, const CContext& context)
1764 {
1765 CRef<CVariation> vr(new CVariation);
1766 string s(i->value.begin(), i->value.end());
1767 s = s.substr(1); //truncate the leading pipe
1768 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1769 vr->SetData().SetNote(s);
1770 return vr;
1771 }
1772
1773
x_mut_inst(TIterator const & i,const CContext & context)1774 CRef<CVariation> CHgvsParser::x_mut_inst(TIterator const& i, const CContext& context)
1775 {
1776 HGVS_ASSERT_RULE(i, eID_mut_inst);
1777
1778 TIterator it = i->children.begin();
1779
1780 CRef<CVariation> vr(new CVariation);
1781 if(it->value.id() == SGrammar::eID_mut_inst) {
1782 string s(it->value.begin(), it->value.end());
1783 if(s == "?") {
1784 vr->SetData().SetUnknown();
1785 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1786 } else {
1787 vr = x_string_content(it, context);
1788 }
1789 } else {
1790 vr =
1791 it->value.id() == SGrammar::eID_no_change ? x_no_change(it, context)
1792 : it->value.id() == SGrammar::eID_delins ? x_delins(it, context)
1793 : it->value.id() == SGrammar::eID_deletion ? x_deletion(it, context)
1794 : it->value.id() == SGrammar::eID_insertion ? x_insertion(it, context, true)
1795 : it->value.id() == SGrammar::eID_duplication ? x_duplication(it, context)
1796 : it->value.id() == SGrammar::eID_nuc_subst ? x_nuc_subst(it, context)
1797 : it->value.id() == SGrammar::eID_nuc_inv ? x_nuc_inv(it, context)
1798 : it->value.id() == SGrammar::eID_ssr ? x_ssr(it, context)
1799 : it->value.id() == SGrammar::eID_conversion ? x_conversion(it, context)
1800 : it->value.id() == SGrammar::eID_prot_ext ? x_prot_ext(it, context)
1801 : it->value.id() == SGrammar::eID_prot_fs ? x_prot_fs(it, context)
1802 : it->value.id() == SGrammar::eID_prot_missense ? x_prot_missense(it, context)
1803 : it->value.id() == SGrammar::eID_translocation ? x_translocation(it, context)
1804 : CRef<CVariation>(NULL);
1805
1806 if(vr.IsNull()) {
1807 HGVS_ASSERT_RULE(it, eID_NONE);
1808 }
1809 }
1810
1811 return vr;
1812 }
1813
x_expr1(TIterator const & i,const CContext & context)1814 CRef<CVariation> CHgvsParser::x_expr1(TIterator const& i, const CContext& context)
1815 {
1816 HGVS_ASSERT_RULE(i, eID_expr1);
1817 TIterator it = i->children.begin();
1818 CRef<CVariation> vr;
1819
1820 string s(it->value.begin(), it->value.end());
1821 if(it->value.id() == i->value.id() && s == "(") {
1822 ++it;
1823 vr = x_expr1(it, context);
1824 SetComputational(*vr);
1825 } else if(it->value.id() == SGrammar::eID_list1a) {
1826 vr = x_list(it, context);
1827 } else if(it->value.id() == SGrammar::eID_header) {
1828 CContext local_ctx = x_header(it, context);
1829 ++it;
1830 vr = x_expr2(it, local_ctx);
1831 } else if(it->value.id() == SGrammar::eID_translocation) {
1832 vr = x_translocation(it, context);
1833 } else {
1834 HGVS_ASSERT_RULE(it, eID_NONE);
1835 }
1836
1837 return vr;
1838 }
1839
x_expr2(TIterator const & i,const CContext & context)1840 CRef<CVariation> CHgvsParser::x_expr2(TIterator const& i, const CContext& context)
1841 {
1842 HGVS_ASSERT_RULE(i, eID_expr2);
1843 TIterator it = i->children.begin();
1844 CRef<CVariation> vr;
1845
1846 string s(it->value.begin(), it->value.end());
1847 if(it->value.id() == i->value.id() && s == "(") {
1848 ++it;
1849 vr = x_expr2(it, context);
1850 SetComputational(*vr);
1851 } else if(it->value.id() == SGrammar::eID_list2a) {
1852 vr = x_list(it, context);
1853 } else if(it->value.id() == SGrammar::eID_location) {
1854 CContext local_context(context);
1855 CRef<CVariantPlacement> placement = x_location(it, local_context);
1856 local_context.SetPlacement().Assign(*placement);
1857 ++it;
1858 vr = x_expr3(it, local_context);
1859 } else if(it->value.id() == SGrammar::eID_prot_ext) {
1860 vr = x_prot_ext(it, context);
1861 } else if(it->value.id() == SGrammar::eID_no_change) {
1862 vr = x_no_change(it, context);
1863 } else if(it->value.id() == i->value.id()) {
1864 vr.Reset(new CVariation);
1865 SetFirstPlacement(*vr).Assign(context.GetPlacement());
1866
1867 if(s == "?") {
1868 vr->SetData().SetUnknown();
1869 //SetFirstPlacement(*vr).SetLoc().SetEmpty().Assign(context.GetId());
1870 } else if(s == "0?" || s == "0") {
1871 //loss of product: represent as deletion of the whole product sequence.
1872 SetFirstPlacement(*vr).SetLoc().SetWhole().Assign(context.GetId());
1873 CVariation_inst& var_inst = vr->SetData().SetInstance();
1874 var_inst.SetType(CVariation_inst::eType_del);
1875 CRef<CDelta_item> di(new CDelta_item);
1876 di->SetAction(CDelta_item::eAction_del_at);
1877 di->SetSeq().SetThis();
1878 var_inst.SetDelta().push_back(di);
1879
1880 if(s == "0?") {
1881 SetComputational(*vr);
1882 }
1883 } else {
1884 HGVS_THROW(eGrammatic, "Unexpected expr terminal: " + s);
1885 }
1886 } else {
1887 HGVS_ASSERT_RULE(it, eID_NONE);
1888 }
1889
1890 return vr;
1891 }
1892
1893
x_expr3(TIterator const & i,const CContext & context)1894 CRef<CVariation> CHgvsParser::x_expr3(TIterator const& i, const CContext& context)
1895 {
1896 HGVS_ASSERT_RULE(i, eID_expr3);
1897 TIterator it = i->children.begin();
1898 CRef<CVariation> vr;
1899
1900 string s(it->value.begin(), it->value.end());
1901 if(it->value.id() == i->value.id() && s == "(") {
1902 ++it;
1903 vr = x_expr3(it, context);
1904 SetComputational(*vr);
1905 } else if(it->value.id() == SGrammar::eID_list3a) {
1906 vr = x_list(it, context);
1907 } else if(it->value.id() == SGrammar::eID_mut_inst) {
1908 vr.Reset(new CVariation);
1909 vr->SetData().SetSet().SetType(CVariation::TData::TSet::eData_set_type_compound);
1910 for(; it != i->children.end(); ++it) {
1911 CRef<CVariation> inst_ref = x_mut_inst(it, context);
1912
1913 if(inst_ref->GetData().IsNote()
1914 && inst_ref->GetData().GetNote() == "Frameshift")
1915 {
1916 if(vr->SetData().SetSet().SetVariations().size() > 0) {
1917 //if inst_ref is a frameshift subexpression, we need to attach it as attribute of the
1918 //previous variation in a compound inst-list, since frameshift is not a subtype of
1919 //Variation.data, and thus not represented as a separate subvariation.
1920 //e.g. p.Ile20Serfs - frameshift an attribute of Ile20Ser
1921 vr->SetData().SetSet().SetVariations().back()->SetFrameshift().Assign(inst_ref->GetFrameshift());
1922 } else {
1923 //in updated HGVS spec fs can be a standalone, e.g. p.Ile20fs.
1924 //In this case we'll represent it as p.Ile20Xaafs
1925 CVariation_inst& inst = inst_ref->SetData().SetInstance();
1926 inst.SetType(CVariation_inst::eType_prot_other);
1927
1928 CRef<CDelta_item> delta(new CDelta_item);
1929 delta->SetSeq().SetLiteral().SetLength(1);
1930 delta->SetSeq().SetLiteral().SetSeq_data().SetNcbieaa().Set("X");
1931 inst.SetDelta().push_back(delta);
1932 vr->SetData().SetSet().SetVariations().push_back(inst_ref);
1933 }
1934 } else {
1935 vr->SetData().SetSet().SetVariations().push_back(inst_ref);
1936 }
1937 }
1938 vr = x_unwrap_iff_singleton(*vr);
1939 } else {
1940 HGVS_ASSERT_RULE(it, eID_NONE);
1941 }
1942
1943 return vr;
1944 }
1945
1946
x_list_delimiter(TIterator const & i,const CContext & context)1947 CVariation::TData::TSet::EData_set_type CHgvsParser::x_list_delimiter(TIterator const& i, const CContext& context)
1948 {
1949 HGVS_ASSERT_RULE(i, eID_list_delimiter);
1950 TIterator it = i->children.begin();
1951 string s(it->value.begin(), it->value.end());
1952
1953 return s == "//" ? CVariation::TData::TSet::eData_set_type_chimeric
1954 : s == "/" ? CVariation::TData::TSet::eData_set_type_mosaic
1955 : s == "+" ? CVariation::TData::TSet::eData_set_type_genotype
1956 : s == "," ? CVariation::TData::TSet::eData_set_type_products
1957 : s == ";" ? CVariation::TData::TSet::eData_set_type_haplotype //note that within context of list#a, ";" delimits genotype
1958 : s == "(+)" || s == "(;)" ? CVariation::TData::TSet::eData_set_type_individual
1959 : CVariation::TData::TSet::eData_set_type_unknown;
1960 }
1961
1962
x_list(TIterator const & i,const CContext & context)1963 CRef<CVariation> CHgvsParser::x_list(TIterator const& i, const CContext& context)
1964 {
1965 if(!SGrammar::s_is_list(i->value.id())) {
1966 HGVS_ASSERT_RULE(i, eID_NONE);
1967 }
1968
1969 CRef<CVariation> vr(new CVariation);
1970 TVariationSet& varset = vr->SetData().SetSet();
1971 varset.SetType(CVariation::TData::TSet::eData_set_type_unknown);
1972
1973
1974 for(TIterator it = i->children.begin(); it != i->children.end(); ++it) {
1975 //will process two elements from the children list: delimiter and following expression.
1976 //The first one does not have the delimiter. The delimiter determines the set-type.
1977 if(it != i->children.begin()) {
1978 if(SGrammar::s_is_list_a(i->value.id())) {
1979 /*
1980 * list#a is delimited by either ";"(HGVS-2.0) or "+"(HGVS_1.0);
1981 * Both represent alleles within genotype.
1982 * Note: the delimiter rule in the context is chset_p<>("+;"), i.e.
1983 * a terminal, not a rule like list_delimiter; so calling
1984 * x_list_delimiter(...) parser here would throw
1985 */
1986 varset.SetType(CVariation::TData::TSet::eData_set_type_genotype);
1987 } else {
1988 CVariation::TData::TSet::EData_set_type set_type = x_list_delimiter(it, context);
1989 if(varset.GetType() == CVariation::TData::TSet::eData_set_type_unknown) {
1990 varset.SetType(set_type);
1991 } else if(set_type != varset.GetType()) {
1992 HGVS_THROW(eSemantic, "Non-unique delimiters within a list");
1993 }
1994 }
1995 ++it;
1996 }
1997
1998 CRef<CVariation> vr;
1999 if(it->value.id() == SGrammar::eID_expr1) {
2000 vr = x_expr1(it, context);
2001 } else if(it->value.id() == SGrammar::eID_expr2) {
2002 vr = x_expr2(it, context);
2003 } else if(it->value.id() == SGrammar::eID_expr3) {
2004 vr = x_expr3(it, context);
2005 } else if(SGrammar::s_is_list(it->value.id())) {
2006 vr = x_list(it, context);
2007 } else {
2008 HGVS_ASSERT_RULE(it, eID_NONE);
2009 }
2010
2011 varset.SetVariations().push_back(vr);
2012 }
2013
2014 vr = x_unwrap_iff_singleton(*vr);
2015 return vr;
2016 }
2017
2018
x_root(TIterator const & i,const CContext & context)2019 CRef<CVariation> CHgvsParser::x_root(TIterator const& i, const CContext& context)
2020 {
2021 HGVS_ASSERT_RULE(i, eID_root);
2022
2023 CRef<CVariation> vr = x_list(i, context);
2024
2025 RepackageAssertedSequence(*vr);
2026 AdjustMoltype(*vr, context.GetScope());
2027 CVariationUtil::s_FactorOutPlacements(*vr);
2028
2029 vr->Index();
2030 return vr;
2031 }
2032
x_unwrap_iff_singleton(CVariation & v)2033 CRef<CVariation> CHgvsParser::x_unwrap_iff_singleton(CVariation& v)
2034 {
2035 if(v.GetData().IsSet() && v.GetData().GetSet().GetVariations().size() == 1) {
2036 CRef<CVariation> first = v.SetData().SetSet().SetVariations().front();
2037 if(!first->IsSetPlacements() && v.IsSetPlacements()) {
2038 first->SetPlacements() = v.SetPlacements();
2039 }
2040 return first;
2041 } else {
2042 return CRef<CVariation>(&v);
2043 }
2044 }
2045
2046
sx_AppendMoltypeExceptions(CVariation & v,CScope & scope)2047 void CHgvsParser::sx_AppendMoltypeExceptions(CVariation& v, CScope& scope)
2048 {
2049 CVariationUtil util(scope);
2050 for(CTypeIterator<CVariantPlacement> it(Begin(v)); it; ++it) {
2051 CVariantPlacement& p = *it;
2052 CVariantPlacement::TMol mol = util.GetMolType(sequence::GetId(p.GetLoc(), NULL));
2053 if(p.GetMol() != mol) {
2054 CRef<CVariantPlacement> p2(new CVariantPlacement);
2055 p2->Assign(p);
2056 p2->SetMol(mol);
2057
2058 string asserted_header = CHgvsParser::s_SeqIdToHgvsStr(p, &scope);
2059 string expected_header = CHgvsParser::s_SeqIdToHgvsStr(*p2, &scope);
2060
2061 CRef<CVariationException> ex(new CVariationException);
2062 ex->SetCode(CVariationException::eCode_inconsistent_asserted_moltype);
2063 ex->SetMessage("Inconsistent mol-type. asserted:'" + asserted_header + "'; expected:'" + expected_header + "'");
2064 p.SetExceptions().push_back(ex);
2065 }
2066 }
2067 }
2068
AsVariation(const string & hgvs,TOpFlags flags)2069 CRef<CVariation> CHgvsParser::AsVariation(const string& hgvs, TOpFlags flags)
2070 {
2071 string hgvs2 = NStr::TruncateSpaces(hgvs);
2072 tree_parse_info<> info = pt_parse(hgvs2.c_str(), *s_grammar, +space_p);
2073
2074 if(!info.full) {
2075 #if 0
2076 CNcbiOstrstream ostr;
2077 tree_to_xml(ostr, info.trees, hgvs2.c_str() , CHgvsParser::SGrammar::s_GetRuleNames());
2078 string tree_str = CNcbiOstrstreamToString(ostr);
2079 #endif
2080 HGVS_THROW(eGrammatic, "Syntax error at pos " + NStr::SizetToString(info.length + 1) + " in " + hgvs2 + "");
2081 }
2082
2083 CContext context(m_scope, m_seq_id_resolvers, hgvs);
2084 CRef<CVariation> vr = x_root(info.trees.begin(), context);
2085 vr->SetName(hgvs2);
2086 sx_AppendMoltypeExceptions(*vr, context.GetScope());
2087
2088 CVariationUtil util(context.GetScope());
2089 util.CheckAmbiguitiesInLiterals(*vr);
2090
2091 return vr;
2092 }
2093
2094
2095
AttachHgvs(CVariation & v)2096 void CHgvsParser::AttachHgvs(CVariation& v)
2097 {
2098 v.Index();
2099
2100 //compute and attach placement-specific HGVS expressions
2101 for(CTypeIterator<CVariation> it(Begin(v)); it; ++it) {
2102 CVariation& v2 = *it;
2103 if(!v2.IsSetPlacements()) {
2104 continue;
2105 }
2106 NON_CONST_ITERATE(CVariation::TPlacements, it2, v2.SetPlacements()) {
2107 CVariantPlacement& p2 = **it2;
2108
2109 if(!p2.GetLoc().GetId()) {
2110 continue;
2111 }
2112
2113 if(p2.GetMol() != CVariantPlacement::eMol_protein && v2.GetConsequenceParent()) {
2114 //if this variation is in consequnece, only compute HGVS for protein variations
2115 //(as otherwise it will throw - can't have HGVS expression for protein with nuc placement)
2116 continue;
2117 }
2118
2119 //compute hgvs-expression specific to the placement and the variation to which it is attached
2120 try {
2121 string hgvs_expression = AsHgvsExpression(v2, CConstRef<CSeq_id>(p2.GetLoc().GetId()));
2122 p2.SetHgvs_name(hgvs_expression);
2123 } catch (CException& e ) {
2124 CNcbiOstrstream ostr;
2125 ostr << MSerial_AsnText << p2;
2126 string s = CNcbiOstrstreamToString(ostr);
2127 NCBI_REPORT_EXCEPTION("Can't compute HGVS expression for " + s, e);
2128 }
2129 }
2130 }
2131
2132 //If the root variation does not have placements (e.g. a container for placement-specific subvariations)
2133 //then compute the hgvs expression for the root placement and attach it to variation itself as a synonym.
2134 if(!v.IsSetPlacements()) {
2135 string root_output_hgvs = AsHgvsExpression(v);
2136 v.SetSynonyms().push_back(root_output_hgvs);
2137 }
2138 }
2139
2140
2141
2142 };
2143
2144 END_NCBI_SCOPE
2145
2146