1 /* $Id: hgvs_parser.hpp 497570 2016-04-07 17:36:06Z moyered $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * File Description: 27 * 28 * Translate HGVS expression to Variation-ref seq-feats. 29 * HGVS nomenclature rules: http://www.hgvs.org/mutnomen/ 30 * 31 * =========================================================================== 32 */ 33 34 #ifndef HGVSPARSER_HPP_ 35 #define HGVSPARSER_HPP_ 36 37 #include <corelib/ncbiobj.hpp> 38 #include <corelib/ncbistd.hpp> 39 40 #include <boost/version.hpp> 41 #if BOOST_VERSION >= 103800 42 #include <boost/spirit/include/classic.hpp> 43 #include <boost/spirit/include/classic_core.hpp> 44 #include <boost/spirit/include/classic_ast.hpp> 45 #include <boost/spirit/include/classic_parse_tree.hpp> 46 #include <boost/spirit/include/classic_tree_to_xml.hpp> 47 48 using namespace BOOST_SPIRIT_CLASSIC_NS; 49 #else 50 //older boost 51 #include <boost/spirit.hpp> 52 #include <boost/spirit/core.hpp> 53 #include <boost/spirit/tree/ast.hpp> 54 #include <boost/spirit/tree/parse_tree.hpp> 55 #include <boost/spirit/tree/tree_to_xml.hpp> 56 using namespace boost::spirit; 57 #endif 58 59 #include <objects/seqloc/Seq_loc.hpp> 60 #include <objects/seqloc/Seq_interval.hpp> 61 #include <objects/seq/Seq_literal.hpp> 62 #include <objects/seqalign/Seq_align.hpp> 63 #include <objects/seqalign/Seq_align_set.hpp> 64 #include <objects/seq/Seq_data.hpp> 65 66 #include <objects/seqfeat/Seq_feat.hpp> 67 #include <objects/seqfeat/Variation_ref.hpp> 68 #include <objects/seqfeat/Variation_inst.hpp> 69 70 #include <objmgr/scope.hpp> 71 #include <objmgr/feat_ci.hpp> 72 #include <objmgr/seq_vector.hpp> 73 74 75 BEGIN_NCBI_SCOPE 76 USING_SCOPE(objects); 77 78 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message) 79 80 namespace variation_ref { 81 82 class CHgvsParser : public CObject 83 { 84 public: CHgvsParser(CScope & scope)85 CHgvsParser(CScope& scope) 86 : m_scope(&scope) 87 {} 88 89 enum EOpFlags 90 { 91 fOpFlags_RelaxedAA = 1 << 0, ///< try assumbing all-uppercase three-letter AA representation 92 fOpFlags_Default = fOpFlags_RelaxedAA 93 }; 94 typedef int TOpFlags; 95 96 97 CRef<CSeq_feat> AsVariationFeat(const string& hgvs_expression, TOpFlags = fOpFlags_Default); 98 string AsHgvsExpression(const CSeq_feat& feat); 99 SetScope()100 CScope& SetScope() 101 { 102 return *m_scope; 103 } 104 105 class CHgvsParserException : public CException 106 { 107 public: 108 enum EErrCode { 109 eLogic, ///<Problem with the code 110 eGrammatic, ///<Expression is not a valid language 111 eSemantic, ///<Expression is invalid in some way 112 eContext, ///<Some problem with context 113 eAlignment, ///<Some problem with getting alignment 114 ePrecondition, ///<Precondition is not met 115 eOther, 116 }; 117 GetErrCodeString(void) const118 virtual const char* GetErrCodeString(void) const 119 { 120 switch(GetErrCode()) { 121 case eLogic: return "eLogic"; 122 case eGrammatic: return "eGrammatic"; 123 case eSemantic: return "eSemantic"; 124 case eContext: return "eContext"; 125 case eAlignment: return "eAlignment"; 126 case ePrecondition: return "ePrecondition"; 127 case eOther: return "eOther"; 128 default: return CException::GetErrCodeString(); 129 } 130 } 131 NCBI_EXCEPTION_DEFAULT(CHgvsParserException, CException); 132 }; 133 134 135 protected: 136 137 struct SFuzzyInt 138 { SFuzzyIntvariation_ref::CHgvsParser::SFuzzyInt139 SFuzzyInt() 140 { 141 Reset(); 142 } 143 Assignvariation_ref::CHgvsParser::SFuzzyInt144 void Assign(const SFuzzyInt& other) { 145 value = other.value; 146 if(!other.fuzz) { 147 fuzz.Reset(); 148 } else { 149 if(!fuzz) { 150 fuzz.Reset(new CInt_fuzz); 151 } 152 fuzz->Assign(*other.fuzz); 153 } 154 } 155 Resetvariation_ref::CHgvsParser::SFuzzyInt156 void Reset() 157 { 158 value = 0; 159 fuzz.Reset(); 160 } 161 162 long value; 163 CRef<CInt_fuzz> fuzz; //can be null; 164 }; 165 166 struct SOffsetPoint 167 { SOffsetPointvariation_ref::CHgvsParser::SOffsetPoint168 SOffsetPoint() 169 { 170 Reset(); 171 } 172 IsOffsetvariation_ref::CHgvsParser::SOffsetPoint173 bool IsOffset() const { 174 return offset.value || offset.fuzz; 175 } 176 Resetvariation_ref::CHgvsParser::SOffsetPoint177 void Reset() 178 { 179 pnt.Reset(); 180 offset.Reset(); 181 } 182 Assignvariation_ref::CHgvsParser::SOffsetPoint183 void Assign(const SOffsetPoint& other) 184 { 185 offset.Assign(other.offset); 186 if(!other.pnt) { 187 pnt.Reset(); 188 } else { 189 if(!pnt) { 190 pnt.Reset(new CSeq_point); 191 } 192 pnt->Assign(*other.pnt); 193 } 194 } 195 196 string asserted_sequence; 197 CRef<CSeq_point> pnt; 198 SFuzzyInt offset; 199 }; 200 201 struct SOffsetLoc 202 { SOffsetLocvariation_ref::CHgvsParser::SOffsetLoc203 SOffsetLoc() 204 { 205 Reset(); 206 } 207 Resetvariation_ref::CHgvsParser::SOffsetLoc208 void Reset() 209 { 210 loc.Reset(); 211 start_offset.Reset(); 212 stop_offset.Reset(); 213 } 214 Assignvariation_ref::CHgvsParser::SOffsetLoc215 void Assign(const SOffsetLoc& other) 216 { 217 start_offset.Assign(other.start_offset); 218 stop_offset.Assign(other.stop_offset); 219 if(!other.loc) { 220 loc.Reset(); 221 } else { 222 if(!loc) { 223 loc.Reset(new CSeq_loc); 224 } 225 loc->Assign(*other.loc); 226 } 227 } 228 IsOffsetvariation_ref::CHgvsParser::SOffsetLoc229 bool IsOffset() const 230 { 231 return start_offset.value || start_offset.value || stop_offset.fuzz || stop_offset.fuzz; 232 } 233 234 TSeqPos GetLength() const; 235 236 string asserted_sequence; 237 CRef<CSeq_loc> loc; 238 SFuzzyInt start_offset; 239 SFuzzyInt stop_offset; 240 }; 241 242 /*! 243 * CContext encapsulates sequence or location context for an hgvs sub-expression. 244 * E.g. given an expression id:c.5_10delinsAT, when creating a variation-ref 245 * for delinsAT the context will refer to sequence "id" and location "5_10" 246 */ 247 class CContext 248 { 249 public: CContext(CRef<CScope> scope)250 CContext(CRef<CScope> scope) 251 : m_scope(scope) 252 { 253 Clear(); 254 } 255 CContext(const CContext & other)256 CContext(const CContext& other) 257 { 258 *this = other; //shallow copy will suffice. 259 } 260 261 enum EMolType { 262 eMol_not_set, 263 eMol_g, 264 eMol_c, 265 eMol_r, 266 eMol_p, 267 eMol_mt 268 }; 269 Clear()270 void Clear() 271 { 272 m_bsh.Reset(); 273 m_mol_type = eMol_not_set; 274 m_cds.Reset(); 275 m_seq_id.Reset(); 276 m_loc.Reset(); 277 } 278 GetLength() const279 TSeqPos GetLength() const 280 { 281 return m_bsh.GetBioseqLength(); 282 } 283 284 /*! 285 * Clear the context and reset it for the given seq-id. 286 * If the sequence is cdna and we're working with "c." coordinates, 287 * also find the CDS, as the coordinates are (start|stop)codon-relative. 288 */ 289 void SetId(const CSeq_id& id, EMolType mol_type); 290 Validate(const CSeq_literal & literal) const291 void Validate(const CSeq_literal& literal) const 292 { 293 if(!m_loc.IsOffset()) { 294 //Can only validate normal locs, as with offset loc the asserted 295 //allele does not correspond to the base loc. 296 Validate(literal, GetLoc()); 297 } else { 298 //LOG_POST("Ignoring validation of literal due to offset location"); 299 } 300 } 301 302 void Validate(const CSeq_literal& literal, const CSeq_loc& loc) const; 303 SetLoc(const SOffsetLoc & loc)304 void SetLoc(const SOffsetLoc& loc) 305 { 306 m_loc.Assign(loc); 307 } 308 IsSetLoc() const309 bool IsSetLoc() const 310 { 311 return !m_loc.loc.IsNull(); 312 } 313 GetScope() const314 CScope& GetScope() const 315 { 316 return *m_scope; 317 } 318 319 const CSeq_loc& GetLoc() const; 320 321 const SOffsetLoc& GetOffsetLoc() const; 322 323 const CSeq_id& GetId() const; 324 325 const CSeq_feat& GetCDS() const; 326 327 EMolType GetMolType(bool check=true) const; 328 329 330 private: 331 CBioseq_Handle m_bsh; 332 EMolType m_mol_type; 333 CRef<CSeq_feat> m_cds; 334 CRef<CSeq_id> m_seq_id; 335 SOffsetLoc m_loc; 336 mutable CRef<CScope> m_scope; 337 }; 338 339 struct SGrammar: public grammar<SGrammar> 340 { 341 /*! 342 * Deviations from the recommendations: 343 * 344 * Flattened compound lists are not supported (a list must have same delimiter type) 345 * [[[a,b];c;f](+)[d,e]] - OK 346 * [a,b;c;f(+)d,e] - NOT OK 347 * 348 * No mixing of different expression types within a list. 349 * Note: this example is not mentioned in specifications, but found it in Mutalyzer docs. 350 * AB026906:c.[1del;6_7insoAL449423.14(CDKN2A):c.[1_10;5del]] 351 * 1_10 specifies a location 352 * 5del specifies a variation instance 353 * [1_10;5del] is not a valid expression 354 * 355 * Only seq-id header containing actual seq-id (gi or acc.ver) are supported: 356 * Y : NM_004006.1:c.3G>T - uses a GenBank file as indicator 357 N : GJB2:c.76A>C - uses a HGNC-approved gene symbol as indicator 358 Y : DMD{NM_004006.1}:c.3G>T - uses both a HGNC-approved gene symbol and a GenBank file as indicator 359 N : chrX:g.32,218,983_32,984,039del - uses a chromosome indicator (here X) 360 N : rs2306220:A>G - using a dbSNP-identifier as indicator 361 N : DXS1219:g.CA[18] (or AFM297yd1:g.CA[18]) - uses marker DXS1219 / AFM297yd1 as indicator 362 Note: cases below are not described in recommendations, but found in Mutalyzer docs. 363 N : AL449423.14(CDKN2A_v003):g.10del - not in symbol{seq_id} format 364 N : CDKN2A_v003{AL449423.14}:c.1_*3352del cDNA coordinates on a genomic sequence. 365 * 366 */ 367 368 enum E_NodeIds { 369 eID_NONE = 0, 370 eID_root, 371 eID_list1a, 372 eID_list2a, 373 eID_list3a, 374 eID_list1b, 375 eID_list2b, 376 eID_list3b, 377 eID_expr1, 378 eID_expr2, 379 eID_expr3, 380 eID_translocation, 381 eID_header, 382 eID_seq_id, 383 eID_mut_list, 384 eID_mut_ref, 385 eID_mol, 386 eID_int_fuzz, 387 eID_abs_pos, 388 eID_general_pos, 389 eID_fuzzy_pos, 390 eID_pos_spec, 391 eID_location, 392 eID_nuc_range, 393 eID_prot_range, 394 eID_mut_inst, 395 eID_raw_seq, 396 eID_aminoacid, 397 eID_nuc_subst, 398 eID_deletion, 399 eID_insertion, 400 eID_delins, 401 eID_duplication, 402 eID_nuc_inv, 403 eID_ssr, 404 eID_conversion, 405 eID_seq_loc, 406 eID_seq_ref, 407 eID_prot_pos, 408 eID_prot_fs, 409 eID_prot_missense, 410 eID_prot_ext 411 }; 412 413 414 typedef std::map<parser_id, std::string> TRuleNames; 415 static TRuleNames s_rule_names; s_GetRuleNamesvariation_ref::CHgvsParser::SGrammar416 static TRuleNames& s_GetRuleNames() 417 { 418 if(s_rule_names.size() == 0) { 419 s_rule_names[eID_NONE] = "NONE"; 420 s_rule_names[eID_root] = "root"; 421 s_rule_names[eID_list1a] = "list1a"; 422 s_rule_names[eID_list2a] = "list2a"; 423 s_rule_names[eID_list3a] = "list3a"; 424 s_rule_names[eID_list1b] = "list1b"; 425 s_rule_names[eID_list2b] = "list2b"; 426 s_rule_names[eID_list3b] = "list3b"; 427 s_rule_names[eID_expr1] = "expr1"; 428 s_rule_names[eID_expr2] = "expr2"; 429 s_rule_names[eID_expr3] = "expr3"; 430 s_rule_names[eID_translocation] = "translocation"; 431 s_rule_names[eID_header] = "header"; 432 s_rule_names[eID_location] = "location"; 433 s_rule_names[eID_mol] = "mol"; 434 s_rule_names[eID_seq_id] = "seq_id"; 435 s_rule_names[eID_mut_list] = "mut_list"; 436 s_rule_names[eID_mut_ref] = "mut_ref"; 437 s_rule_names[eID_nuc_range] = "nuc_range"; 438 s_rule_names[eID_prot_range] = "prot_range"; 439 s_rule_names[eID_mut_inst] = "mut_inst"; 440 s_rule_names[eID_int_fuzz] = "int_fuzz"; 441 s_rule_names[eID_abs_pos] = "abs_pos"; 442 s_rule_names[eID_general_pos] = "general_pos"; 443 s_rule_names[eID_fuzzy_pos] = "fuzzy_pos"; 444 s_rule_names[eID_pos_spec] = "pos_spec"; 445 s_rule_names[eID_raw_seq] = "raw_seq"; 446 s_rule_names[eID_aminoacid] = "aminoacid"; 447 s_rule_names[eID_nuc_subst] = "nuc_subst"; 448 s_rule_names[eID_deletion] = "deletion"; 449 s_rule_names[eID_insertion] = "insertion"; 450 s_rule_names[eID_delins] = "delins"; 451 s_rule_names[eID_duplication] = "duplication"; 452 s_rule_names[eID_nuc_inv] = "nuc_inv"; 453 s_rule_names[eID_ssr] = "ssr"; 454 s_rule_names[eID_conversion] = "conversion"; 455 s_rule_names[eID_seq_loc] = "seq_loc"; 456 s_rule_names[eID_seq_ref] = "seq_ref"; 457 s_rule_names[eID_prot_pos] = "prot_pos"; 458 s_rule_names[eID_prot_fs] = "prot_fs"; 459 s_rule_names[eID_prot_missense] = "prot_missense"; 460 s_rule_names[eID_prot_ext] = "prot_ext"; 461 } 462 return s_rule_names; 463 } 464 465 static const string& s_GetRuleName(parser_id id); 466 467 template <typename ScannerT> 468 struct definition 469 { 470 rule<ScannerT, parser_context<>, parser_tag<eID_root> > root; 471 rule<ScannerT, parser_context<>, parser_tag<eID_list1a> > list1a; 472 rule<ScannerT, parser_context<>, parser_tag<eID_list2a> > list2a; 473 rule<ScannerT, parser_context<>, parser_tag<eID_list3a> > list3a; 474 rule<ScannerT, parser_context<>, parser_tag<eID_list1b> > list1b; 475 rule<ScannerT, parser_context<>, parser_tag<eID_list2b> > list2b; 476 rule<ScannerT, parser_context<>, parser_tag<eID_list3b> > list3b; 477 rule<ScannerT, parser_context<>, parser_tag<eID_expr1> > expr1; 478 rule<ScannerT, parser_context<>, parser_tag<eID_expr2> > expr2; 479 rule<ScannerT, parser_context<>, parser_tag<eID_expr3> > expr3; 480 rule<ScannerT, parser_context<>, parser_tag<eID_translocation> > translocation; 481 rule<ScannerT, parser_context<>, parser_tag<eID_header> > header; 482 rule<ScannerT, parser_context<>, parser_tag<eID_seq_id> > seq_id; 483 rule<ScannerT, parser_context<>, parser_tag<eID_mol> > mol; 484 rule<ScannerT, parser_context<>, parser_tag<eID_mut_list > > mut_list; 485 rule<ScannerT, parser_context<>, parser_tag<eID_mut_ref> > mut_ref; 486 rule<ScannerT, parser_context<>, parser_tag<eID_mut_inst> > mut_inst; 487 rule<ScannerT, parser_context<>, parser_tag<eID_int_fuzz> > int_fuzz; 488 rule<ScannerT, parser_context<>, parser_tag<eID_abs_pos> > abs_pos; 489 rule<ScannerT, parser_context<>, parser_tag<eID_general_pos> > general_pos; 490 rule<ScannerT, parser_context<>, parser_tag<eID_fuzzy_pos> > fuzzy_pos; 491 rule<ScannerT, parser_context<>, parser_tag<eID_pos_spec> > pos_spec; 492 rule<ScannerT, parser_context<>, parser_tag<eID_location> > location; 493 rule<ScannerT, parser_context<>, parser_tag<eID_nuc_range> > nuc_range; 494 rule<ScannerT, parser_context<>, parser_tag<eID_prot_range> > prot_range; 495 rule<ScannerT, parser_context<>, parser_tag<eID_raw_seq> > raw_seq; 496 rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid> > aminoacid; 497 rule<ScannerT, parser_context<>, parser_tag<eID_nuc_subst> > nuc_subst; 498 rule<ScannerT, parser_context<>, parser_tag<eID_deletion> > deletion; 499 rule<ScannerT, parser_context<>, parser_tag<eID_insertion> > insertion; 500 rule<ScannerT, parser_context<>, parser_tag<eID_delins> > delins; 501 rule<ScannerT, parser_context<>, parser_tag<eID_duplication> > duplication; 502 rule<ScannerT, parser_context<>, parser_tag<eID_nuc_inv> > nuc_inv; 503 rule<ScannerT, parser_context<>, parser_tag<eID_ssr> > ssr; 504 rule<ScannerT, parser_context<>, parser_tag<eID_conversion> > conversion; 505 rule<ScannerT, parser_context<>, parser_tag<eID_seq_loc> > seq_loc; 506 rule<ScannerT, parser_context<>, parser_tag<eID_seq_ref> > seq_ref; 507 rule<ScannerT, parser_context<>, parser_tag<eID_prot_pos> > prot_pos; 508 rule<ScannerT, parser_context<>, parser_tag<eID_prot_missense> > prot_missense; 509 rule<ScannerT, parser_context<>, parser_tag<eID_prot_ext> > prot_ext; 510 rule<ScannerT, parser_context<>, parser_tag<eID_prot_fs> > prot_fs; 511 definitionvariation_ref::CHgvsParser::SGrammar::definition512 definition(SGrammar const&) 513 { 514 aminoacid = str_p("Ala") 515 | str_p("Asx") 516 | str_p("Cys") 517 | str_p("Asp") 518 | str_p("Glu") 519 | str_p("Phe") 520 | str_p("Gly") 521 | str_p("His") 522 | str_p("Ile") 523 | str_p("Lys") 524 | str_p("Leu") 525 | str_p("Met") 526 | str_p("Asn") 527 | str_p("Pro") 528 | str_p("Gln") 529 | str_p("Arg") 530 | str_p("Ser") 531 | str_p("Thr") 532 | str_p("Val") 533 | str_p("Trp") 534 | str_p("Tyr") 535 | str_p("Glx") 536 | chset<>("XARNDCEQGHILKMFPSTWYV") //"X" as in p.X110GlnextX17 537 ; 538 539 raw_seq = leaf_node_d[+aminoacid | +chset<>("ACGTN") | +chset<>("acgun")]; 540 /* 541 * Note: there's no separation between protein, DNA and RNA sequences, as 542 * at the parse time it is not known without context which sequence it is - i.e. 543 * "AC" could be a dna sequence, or AlaCys. Hence, raw_seq will match any sequence 544 * and at the parse-tree transformation stage we'll create proper seq-type as 545 * the mol will be known then. 546 * 547 * Note: +aminoacid precedes +chset<>("ACGTN") so that we don't prematurely match T in Trp etc. 548 */ 549 550 551 /* 552 * Positions and Locations 553 */ 554 555 int_fuzz = ch_p('(') >> (ch_p('?')|int_p) >> ch_p('_') >> (ch_p('?')|int_p) >> ch_p(')') 556 | ch_p('(') >> int_p >> ch_p(')') //note: not ch_p('?') here as this makes grammar amgiguous 557 | (ch_p('?')|int_p); 558 559 abs_pos = !ch_p('*') >> int_fuzz; 560 //Note: '*' means the location is CDS-stop-relative; cdna context required. 561 //Otherwise it is CDS-start-relative iff in cdna-context. 562 563 general_pos = (str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz 564 | abs_pos; 565 //Warning: offset-pos must be followed by a sign because otherwise 566 //it is ambiguous whether c.123(1_2) denotes a SSR or a fuzzy intronic location 567 568 569 fuzzy_pos = discard_node_d[ch_p('(')] 570 >> general_pos 571 >> discard_node_d[ch_p('_')] 572 >> general_pos 573 >> discard_node_d[ch_p(')')]; 574 575 pos_spec = general_pos //intronic offset-pos 576 | fuzzy_pos //(generalpos_generalpos) 577 | !ch_p('o') >> header >> pos_spec; //far-loc, as in id:c.88+101_id2:c.355-1045del 578 579 580 prot_pos = raw_seq >> pos_spec; //raw_seq must be of length one (single aminoacid) 581 582 prot_range = prot_pos >> discard_node_d[ch_p('_')] >> prot_pos; 583 584 nuc_range = pos_spec >> discard_node_d[ch_p('_')] >> pos_spec; 585 586 location = nuc_range | pos_spec | prot_range | prot_pos; 587 //Note that this describes "local" coordinates within a sequence context, not a seq-loc 588 589 /* 590 * Seq-ids and seq-locs 591 */ 592 seq_id = leaf_node_d[alpha_p >> +(alnum_p | chset<>("._-|"))]; 593 594 mol = str_p("mt") | chset<>("gcrpm"); //note: for 'mt.' also supporting 'm.' 595 596 header = seq_id 597 >> !(discard_node_d[ch_p('{')] 598 >> seq_id 599 >> discard_node_d[ch_p('}')]) 600 >> discard_node_d[ch_p(':')] 601 >> mol 602 >> discard_node_d[ch_p('.')]; 603 /* 604 * A the parsing stage we'll not require that the mutations for 605 * different mol-types don't mix, i.e. protein mutation type for a 606 * mol-type "c." - this will be deferred to the semantic check that will 607 * validate the parsed expression. The reason is that the g/c/r/p specs mostly 608 * overlap, so we can avoid exploding the grammar. 609 */ 610 611 seq_loc = !ch_p('o') >> header >> location; 612 613 seq_ref = seq_loc //far-loc 614 | (nuc_range|prot_range) //local-loc of range-type, e.g. c.17_18ins5_16 http://www.hgvs.org/mutnomen/FAQ.html 615 //This is to exclude point-locs (see below) 616 | raw_seq //literal sequence 617 | int_fuzz; //unknown sequence of some length 618 // WARNING: this is ambiguous WRT local-loc! 619 // e.g. p.Glu5Valins2fsX3 - 2 in ins2 indicates sequence of length two, NOT the sequence at position 2. 620 // Hence, the local-locs above must be specified via range-types only. 621 622 nuc_subst = raw_seq >> ch_p('>') >> raw_seq; //semantic check: must be of length 1 623 624 deletion = str_p("del") >> !(raw_seq | int_p); 625 626 duplication = str_p("dup") >> !seq_ref; 627 628 insertion = str_p("ins") >> seq_ref; 629 630 conversion = str_p("con") >> seq_loc; 631 632 delins = str_p("del") >> !raw_seq >> str_p("ins") >> seq_ref; 633 634 nuc_inv = str_p("inv") >> !int_p; 635 636 ssr = !raw_seq >> ( int_fuzz - (ch_p('?')|int_p) //don't want to interpret 'id:5_6?' as ssr 637 | list_p(discard_node_d[ch_p('[')] 638 >> int_p 639 >> discard_node_d[ch_p(']')], 640 discard_node_d[ch_p('+')])); 641 /* 642 * Note: It is not correct to match [5] and [3] in NM_000815.2:c.101TC[5]+[3] 643 * individually as list3a within c.101 location context, because 644 * the location-spec for an ssr may point to the first repeat-unit base only, 645 * in which case we need to calculate the actual range-loc based on the sequence literal, 646 * and when processing "[3]" later the "TC" literal will not be in context, 647 * and it will appear as if the context is NM_000815.2:c.101 rather than c.101_102. 648 * 649 * Hence, the ssr rule will have to consume the list of multipliers [5]+[3] 650 * and generate compound variation-ref "manually" 651 */ 652 653 654 prot_fs = str_p("fs") >> !(ch_p('X') >> int_p); 655 656 prot_ext = (str_p("extMet") | str_p("extX")) >> int_p; 657 658 prot_missense = aminoacid; 659 660 //ISCN expression followed by a seq-loc 661 translocation = str_p("t(") 662 >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))] 663 >> str_p(")(") 664 >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))] 665 >> str_p(")") 666 >> seq_loc; 667 668 mut_inst = ch_p('?') //can exist within +mut_inst, e.g. p.Met1?extMet-5 669 | ch_p('=') 670 | delins //delins must precede del 671 | deletion 672 | insertion 673 | duplication 674 | nuc_subst 675 | nuc_inv 676 | ssr 677 | conversion 678 | prot_fs 679 | prot_missense 680 | prot_ext //this may occur inside location context, e.g. p.Met1ValextMet-12 681 ; 682 //Note: '?' and '=' can exist both within a location context 683 //(i.e. expr3) or within a sequence context (i.e. expr2) 684 //additionally, prot_ext may exist as expr2 (outside of location context) as well. 685 686 687 root = list_p(expr1, ch_p('+')); 688 //At the root level the '+'-delimited expressions are not required to be bracketed, i.e. 689 //NM_004004.2:c.[35delG]+NM_006783.1:c.[689_690insT] instead of 690 //[NM_004004.2:c.35delG]+[NM_006783.1:c.689_690insT] 691 692 693 expr1 = ch_p('(') >> expr1 >> ch_p(')') 694 | list1a 695 | header >> expr2 696 | translocation; 697 list1a = list_p(discard_node_d[ch_p('[')] >> list1b >> discard_node_d[ch_p(']')], ch_p('+')); 698 list1b = list_p(expr1, chset<>(",;") | str_p("(+)")); 699 700 701 expr2 = ch_p('(') >> expr2 >> ch_p(')') 702 | list2a 703 | str_p("0?") //note: precdes location>>expr3 such that not matched as unknown variation at pos=0 here 704 | ch_p('0') 705 | location >> expr3 706 | prot_ext //can also exist within location context (mut_inst) 707 | ch_p('?') //note: follows location>>expr3 such that variation at unknown pos is not partially-matched as unknown variation 708 | ch_p('='); 709 list2a = list_p(discard_node_d[ch_p('[')] >> list2b >> discard_node_d[ch_p(']')], ch_p('+')); 710 list2b = list_p(expr2, chset<>(",;") | str_p("(+)")); 711 712 713 expr3 = ch_p('(') >> expr3 >> ch_p(')') 714 | list3a 715 | +mut_inst; 716 /* 717 * Note: Multiple mut_insts that are not delimited, i.e. 718 * abc instead of [a;b;c] are legit, e.g. 719 * 720 * a) p.X110GlnextX17 721 * b) NM_012345.3:c.123+45_123+51dupinsAB012345.3:g.393_1295 722 */ 723 list3a = list_p(discard_node_d[ch_p('[')] >> list3b >> discard_node_d[ch_p(']')], ch_p('+')); 724 list3b = list_p(expr3, chset<>(",;") | str_p("(+)")); 725 726 //BOOST_SPIRIT_DEBUG_RULE(expr1); 727 } 728 startvariation_ref::CHgvsParser::SGrammar::definition729 rule<ScannerT, parser_context<>, parser_tag<eID_root> > const& start() const 730 { 731 return root; 732 } 733 }; 734 s_is_listvariation_ref::CHgvsParser::SGrammar735 static bool s_is_list(parser_id id) 736 { 737 return id == SGrammar::eID_list1a 738 || id == SGrammar::eID_list2a 739 || id == SGrammar::eID_list3a 740 || id == SGrammar::eID_list1b 741 || id == SGrammar::eID_list2b 742 || id == SGrammar::eID_list3b 743 || id == SGrammar::eID_root; 744 } 745 }; 746 static SGrammar s_grammar; 747 748 749 private: 750 typedef tree_match<char const*> TParseTreeMatch; 751 typedef TParseTreeMatch::const_tree_iterator TIterator; 752 typedef CVariation_inst::TDelta::value_type TDelta; 753 typedef CVariation_ref::TData::TSet TVariationSet; 754 755 756 static SFuzzyInt x_int_fuzz (TIterator const& i, const CContext& context); 757 758 static CRef<CSeq_point> x_abs_pos (TIterator const& i, const CContext& context); 759 static SOffsetPoint x_general_pos (TIterator const& i, const CContext& context); 760 static SOffsetPoint x_fuzzy_pos (TIterator const& i, const CContext& context); 761 static SOffsetPoint x_pos_spec (TIterator const& i, const CContext& context); 762 static SOffsetPoint x_prot_pos (TIterator const& i, const CContext& context); 763 764 static SOffsetLoc x_range (TIterator const& i, const CContext& context); 765 static SOffsetLoc x_location (TIterator const& i, const CContext& context); 766 767 static CRef<CSeq_loc> x_seq_loc (TIterator const& i, const CContext& context); 768 static CRef<CSeq_literal> x_raw_seq (TIterator const& i, const CContext& context); 769 static TDelta x_seq_ref (TIterator const& i, const CContext& context); 770 static CRef<CVariation_ref> x_identity (const CContext& context); 771 static CRef<CVariation_ref> x_delins (TIterator const& i, const CContext& context); 772 static CRef<CVariation_ref> x_deletion (TIterator const& i, const CContext& context); 773 static CRef<CVariation_ref> x_insertion (TIterator const& i, const CContext& context); 774 static CRef<CVariation_ref> x_duplication (TIterator const& i, const CContext& context); 775 static CRef<CVariation_ref> x_nuc_subst (TIterator const& i, const CContext& context); 776 static CRef<CVariation_ref> x_nuc_inv (TIterator const& i, const CContext& context); 777 static CRef<CVariation_ref> x_ssr (TIterator const& i, const CContext& context); 778 static CRef<CVariation_ref> x_conversion (TIterator const& i, const CContext& context); 779 static CRef<CVariation_ref> x_prot_ext (TIterator const& i, const CContext& context); 780 static CRef<CVariation_ref> x_prot_missense (TIterator const& i, const CContext& context); 781 static CRef<CVariation_ref> x_translocation (TIterator const& i, const CContext& context); 782 static CRef<CVariation_ref> x_mut_inst (TIterator const& i, const CContext& context); 783 static CRef<CVariation_ref> x_expr1 (TIterator const& i, const CContext& context); 784 static CRef<CVariation_ref> x_expr2 (TIterator const& i, const CContext& context); 785 static CRef<CVariation_ref> x_expr3 (TIterator const& i, const CContext& context); 786 static CRef<CVariation_ref> x_prot_fs (TIterator const& i, const CContext& context); 787 static CRef<CVariation_ref> x_list (TIterator const& i, const CContext& context); 788 static CContext x_header (TIterator const& i, const CContext& context); 789 static CRef<CSeq_feat> x_root (TIterator const& i, const CContext& context); 790 791 static CRef<CVariation_ref> x_unwrap_iff_singleton(CVariation_ref& v); 792 793 794 ///Convert HGVS amino-acid code to ncbieaa 795 static string s_hgvsaa2ncbieaa(const string& hgvsaa); 796 797 ///Convert non-HGVS compliant all-uppercase AAs to UpLow, e.g. ILECYS ->IleCys 798 static string s_hgvsUCaa2hgvsUL(const string& hgvsaa); 799 800 private: 801 //functions to create hgvs expression from a variation-ref 802 803 /// variatino must have seq-loc specified 804 string x_AsHgvsExpression(const CVariation_ref& variation, 805 const CSeq_loc& parent_loc, //if variation has seq-loc set, it will be used instead. 806 bool is_top_level); 807 808 809 //Calculate the length of the inst, multipliers taken into account. 810 TSeqPos x_GetInstLength(const CVariation_inst& inst, const CSeq_loc& this_loc); 811 812 string x_GetInstData(const CVariation_inst& inst, const CSeq_loc& this_loc); 813 814 815 /// Only subset of insts can be expressed as HGVS, this will throw otherwise. 816 /// loc is required to capture sequence at loc for snps, e.g. 'A' in 'A>G' 817 /// 818 /// loc is modified to conform to HGVS flavor (on plus strand, modify 819 /// starts/stops for ins or microsatellite locatinos to conform to HGVS 820 /// convention where differes from Variation-ref standard) 821 string x_InstToString(const CVariation_inst& inst, CSeq_loc& loc); 822 823 string x_LocToSeqStr(const CSeq_loc& loc); 824 825 /* 826 * Convert a loc to hgvs representation. 827 * 828 * 829 * if alignment is provided (spliced-seg), then it is assumed that loc is the 830 * genomic loc but the output should be represented in cdna context. 831 */ 832 string x_SeqLocToStr(const CSeq_loc& loc, bool with_header); 833 834 // force-map: remap via alignment 835 string x_SeqPntToStr(const CSeq_point& pnt, TSeqPos first_pos); 836 837 /// Convert seq-id to HGVS seq-id header, e.g. "NM_123456.7:c." or "NG_123456.7:p" 838 string x_SeqIdToHgvsHeader(const CSeq_id& id); 839 840 string x_IntWithFuzzToStr(int value, const CInt_fuzz* fuzz = NULL); 841 842 //string Delta_itemToStr(const CDelta_item& delta, bool flip_strand); 843 844 string x_SeqLiteralToStr(const CSeq_literal& literal, bool flip_strand); 845 846 847 848 static CRef<CVariation_ref> s_ProtToCdna(const CVariation_ref& vr, CScope& scope); 849 850 private: 851 CRef<CScope> m_scope; 852 }; 853 854 }; 855 856 END_NCBI_SCOPE 857 858 859 #endif /* HGVSPARSER_HPP_ */ 860