1 /*  $Id: hgvs_parser.hpp 497570 2016-04-07 17:36:06Z moyered $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 *
28 *   Translate HGVS expression to Variation-ref seq-feats.
29 *   HGVS nomenclature rules: http://www.hgvs.org/mutnomen/
30 *
31 * ===========================================================================
32 */
33 
34 #ifndef HGVSPARSER_HPP_
35 #define HGVSPARSER_HPP_
36 
37 #include <corelib/ncbiobj.hpp>
38 #include <corelib/ncbistd.hpp>
39 
40 #include <boost/version.hpp>
41 #if BOOST_VERSION >= 103800
42 #include <boost/spirit/include/classic.hpp>
43 #include <boost/spirit/include/classic_core.hpp>
44 #include <boost/spirit/include/classic_ast.hpp>
45 #include <boost/spirit/include/classic_parse_tree.hpp>
46 #include <boost/spirit/include/classic_tree_to_xml.hpp>
47 
48 using namespace BOOST_SPIRIT_CLASSIC_NS;
49 #else
50 //older boost
51 #include <boost/spirit.hpp>
52 #include <boost/spirit/core.hpp>
53 #include <boost/spirit/tree/ast.hpp>
54 #include <boost/spirit/tree/parse_tree.hpp>
55 #include <boost/spirit/tree/tree_to_xml.hpp>
56 using namespace boost::spirit;
57 #endif
58 
59 #include <objects/seqloc/Seq_loc.hpp>
60 #include <objects/seqloc/Seq_interval.hpp>
61 #include <objects/seq/Seq_literal.hpp>
62 #include <objects/seqalign/Seq_align.hpp>
63 #include <objects/seqalign/Seq_align_set.hpp>
64 #include <objects/seq/Seq_data.hpp>
65 
66 #include <objects/seqfeat/Seq_feat.hpp>
67 #include <objects/seqfeat/Variation_ref.hpp>
68 #include <objects/seqfeat/Variation_inst.hpp>
69 
70 #include <objmgr/scope.hpp>
71 #include <objmgr/feat_ci.hpp>
72 #include <objmgr/seq_vector.hpp>
73 
74 
75 BEGIN_NCBI_SCOPE
76 USING_SCOPE(objects);
77 
78 #define HGVS_THROW(err_code, message) NCBI_THROW(CHgvsParser::CHgvsParserException, err_code, message)
79 
80 namespace variation_ref {
81 
82 class CHgvsParser : public CObject
83 {
84 public:
CHgvsParser(CScope & scope)85     CHgvsParser(CScope& scope)
86        : m_scope(&scope)
87     {}
88 
89     enum EOpFlags
90     {
91         fOpFlags_RelaxedAA = 1 << 0, ///< try assumbing all-uppercase three-letter AA representation
92         fOpFlags_Default = fOpFlags_RelaxedAA
93     };
94     typedef int TOpFlags;
95 
96 
97     CRef<CSeq_feat> AsVariationFeat(const string& hgvs_expression, TOpFlags = fOpFlags_Default);
98     string AsHgvsExpression(const CSeq_feat& feat);
99 
SetScope()100     CScope& SetScope()
101     {
102         return *m_scope;
103     }
104 
105     class CHgvsParserException : public CException
106     {
107     public:
108         enum EErrCode {
109             eLogic,         ///<Problem with the code
110             eGrammatic,     ///<Expression is not a valid language
111             eSemantic,      ///<Expression is invalid in some way
112             eContext,       ///<Some problem with context
113             eAlignment,     ///<Some problem with getting alignment
114             ePrecondition,  ///<Precondition is not met
115             eOther,
116         };
117 
GetErrCodeString(void) const118         virtual const char* GetErrCodeString(void) const
119         {
120             switch(GetErrCode()) {
121             case eLogic:            return "eLogic";
122             case eGrammatic:        return "eGrammatic";
123             case eSemantic:         return "eSemantic";
124             case eContext:          return "eContext";
125             case eAlignment:        return "eAlignment";
126             case ePrecondition:     return "ePrecondition";
127             case eOther:            return "eOther";
128             default:                return CException::GetErrCodeString();
129             }
130         }
131         NCBI_EXCEPTION_DEFAULT(CHgvsParserException, CException);
132     };
133 
134 
135 protected:
136 
137     struct SFuzzyInt
138     {
SFuzzyIntvariation_ref::CHgvsParser::SFuzzyInt139         SFuzzyInt()
140         {
141             Reset();
142         }
143 
Assignvariation_ref::CHgvsParser::SFuzzyInt144         void Assign(const SFuzzyInt& other) {
145             value = other.value;
146             if(!other.fuzz) {
147                 fuzz.Reset();
148             } else {
149                 if(!fuzz) {
150                     fuzz.Reset(new CInt_fuzz);
151                 }
152                 fuzz->Assign(*other.fuzz);
153             }
154         }
155 
Resetvariation_ref::CHgvsParser::SFuzzyInt156         void Reset()
157         {
158             value = 0;
159             fuzz.Reset();
160         }
161 
162         long value;
163         CRef<CInt_fuzz> fuzz; //can be null;
164     };
165 
166     struct SOffsetPoint
167     {
SOffsetPointvariation_ref::CHgvsParser::SOffsetPoint168         SOffsetPoint()
169         {
170             Reset();
171         }
172 
IsOffsetvariation_ref::CHgvsParser::SOffsetPoint173         bool IsOffset() const {
174             return offset.value || offset.fuzz;
175         }
176 
Resetvariation_ref::CHgvsParser::SOffsetPoint177         void Reset()
178         {
179             pnt.Reset();
180             offset.Reset();
181         }
182 
Assignvariation_ref::CHgvsParser::SOffsetPoint183         void Assign(const SOffsetPoint& other)
184         {
185             offset.Assign(other.offset);
186             if(!other.pnt) {
187                 pnt.Reset();
188             } else {
189                 if(!pnt) {
190                     pnt.Reset(new CSeq_point);
191                 }
192                 pnt->Assign(*other.pnt);
193             }
194         }
195 
196         string asserted_sequence;
197         CRef<CSeq_point> pnt;
198         SFuzzyInt offset;
199     };
200 
201     struct SOffsetLoc
202     {
SOffsetLocvariation_ref::CHgvsParser::SOffsetLoc203         SOffsetLoc()
204         {
205             Reset();
206         }
207 
Resetvariation_ref::CHgvsParser::SOffsetLoc208         void Reset()
209         {
210             loc.Reset();
211             start_offset.Reset();
212             stop_offset.Reset();
213         }
214 
Assignvariation_ref::CHgvsParser::SOffsetLoc215         void Assign(const SOffsetLoc& other)
216         {
217             start_offset.Assign(other.start_offset);
218             stop_offset.Assign(other.stop_offset);
219             if(!other.loc) {
220                 loc.Reset();
221             } else {
222                 if(!loc) {
223                     loc.Reset(new CSeq_loc);
224                 }
225                 loc->Assign(*other.loc);
226             }
227         }
228 
IsOffsetvariation_ref::CHgvsParser::SOffsetLoc229         bool IsOffset() const
230         {
231             return start_offset.value || start_offset.value || stop_offset.fuzz || stop_offset.fuzz;
232         }
233 
234         TSeqPos GetLength() const;
235 
236         string asserted_sequence;
237         CRef<CSeq_loc> loc;
238         SFuzzyInt start_offset;
239         SFuzzyInt stop_offset;
240     };
241 
242     /*!
243      * CContext encapsulates sequence or location context for an hgvs sub-expression.
244      * E.g. given an expression id:c.5_10delinsAT, when creating a variation-ref
245      * for delinsAT the context will refer to sequence "id" and location "5_10"
246      */
247     class CContext
248     {
249     public:
CContext(CRef<CScope> scope)250         CContext(CRef<CScope> scope)
251           : m_scope(scope)
252         {
253             Clear();
254         }
255 
CContext(const CContext & other)256         CContext(const CContext& other)
257         {
258             *this = other; //shallow copy will suffice.
259         }
260 
261         enum EMolType {
262             eMol_not_set,
263             eMol_g,
264             eMol_c,
265             eMol_r,
266             eMol_p,
267             eMol_mt
268         };
269 
Clear()270         void Clear()
271         {
272             m_bsh.Reset();
273             m_mol_type = eMol_not_set;
274             m_cds.Reset();
275             m_seq_id.Reset();
276             m_loc.Reset();
277         }
278 
GetLength() const279         TSeqPos GetLength() const
280         {
281             return m_bsh.GetBioseqLength();
282         }
283 
284         /*!
285          * Clear the context and reset it for the given seq-id.
286          * If the sequence is cdna and we're working with "c." coordinates,
287          * also find the CDS, as the coordinates are (start|stop)codon-relative.
288          */
289         void SetId(const CSeq_id& id, EMolType mol_type);
290 
Validate(const CSeq_literal & literal) const291         void Validate(const CSeq_literal& literal) const
292         {
293             if(!m_loc.IsOffset()) {
294                 //Can only validate normal locs, as with offset loc the asserted
295                 //allele does not correspond to the base loc.
296                 Validate(literal, GetLoc());
297             } else {
298                 //LOG_POST("Ignoring validation of literal due to offset location");
299             }
300         }
301 
302         void Validate(const CSeq_literal& literal, const CSeq_loc& loc) const;
303 
SetLoc(const SOffsetLoc & loc)304         void SetLoc(const SOffsetLoc& loc)
305         {
306             m_loc.Assign(loc);
307         }
308 
IsSetLoc() const309         bool IsSetLoc() const
310         {
311             return !m_loc.loc.IsNull();
312         }
313 
GetScope() const314         CScope& GetScope() const
315         {
316             return *m_scope;
317         }
318 
319         const CSeq_loc& GetLoc() const;
320 
321         const SOffsetLoc& GetOffsetLoc() const;
322 
323         const CSeq_id& GetId() const;
324 
325         const CSeq_feat& GetCDS() const;
326 
327         EMolType GetMolType(bool check=true) const;
328 
329 
330     private:
331         CBioseq_Handle m_bsh;
332         EMolType m_mol_type;
333         CRef<CSeq_feat> m_cds;
334         CRef<CSeq_id> m_seq_id;
335         SOffsetLoc m_loc;
336         mutable CRef<CScope> m_scope;
337     };
338 
339     struct SGrammar: public grammar<SGrammar>
340     {
341         /*!
342          * Deviations from the recommendations:
343          *
344          * Flattened compound lists are not supported (a list must have same delimiter type)
345          *     [[[a,b];c;f](+)[d,e]] - OK
346          *     [a,b;c;f(+)d,e] - NOT OK
347          *
348          * No mixing of different expression types within a list.
349          * Note: this example is not mentioned in specifications, but found it in Mutalyzer docs.
350          * AB026906:c.[1del;6_7insoAL449423.14(CDKN2A):c.[1_10;5del]]
351          *    1_10 specifies a location
352          *    5del specifies a variation instance
353          *    [1_10;5del] is not a valid expression
354          *
355          * Only seq-id header containing actual seq-id (gi or acc.ver) are supported:
356          * Y : NM_004006.1:c.3G>T - uses a GenBank file as indicator
357            N : GJB2:c.76A>C - uses a HGNC-approved gene symbol as indicator
358            Y : DMD{NM_004006.1}:c.3G>T - uses both a HGNC-approved gene symbol and a GenBank file as indicator
359            N : chrX:g.32,218,983_32,984,039del - uses a chromosome indicator (here X)
360            N : rs2306220:A>G - using a dbSNP-identifier as indicator
361            N : DXS1219:g.CA[18] (or AFM297yd1:g.CA[18]) - uses marker DXS1219 / AFM297yd1 as indicator
362            Note: cases below are not described in recommendations, but found in Mutalyzer docs.
363            N : AL449423.14(CDKN2A_v003):g.10del - not in symbol{seq_id} format
364            N : CDKN2A_v003{AL449423.14}:c.1_*3352del cDNA coordinates on a genomic sequence.
365          *
366          */
367 
368         enum E_NodeIds {
369             eID_NONE = 0,
370             eID_root,
371             eID_list1a,
372             eID_list2a,
373             eID_list3a,
374             eID_list1b,
375             eID_list2b,
376             eID_list3b,
377             eID_expr1,
378             eID_expr2,
379             eID_expr3,
380             eID_translocation,
381             eID_header,
382             eID_seq_id,
383             eID_mut_list,
384             eID_mut_ref,
385             eID_mol,
386             eID_int_fuzz,
387             eID_abs_pos,
388             eID_general_pos,
389             eID_fuzzy_pos,
390             eID_pos_spec,
391             eID_location,
392             eID_nuc_range,
393             eID_prot_range,
394             eID_mut_inst,
395             eID_raw_seq,
396             eID_aminoacid,
397             eID_nuc_subst,
398             eID_deletion,
399             eID_insertion,
400             eID_delins,
401             eID_duplication,
402             eID_nuc_inv,
403             eID_ssr,
404             eID_conversion,
405             eID_seq_loc,
406             eID_seq_ref,
407             eID_prot_pos,
408             eID_prot_fs,
409             eID_prot_missense,
410             eID_prot_ext
411         };
412 
413 
414         typedef std::map<parser_id, std::string> TRuleNames;
415         static TRuleNames s_rule_names;
s_GetRuleNamesvariation_ref::CHgvsParser::SGrammar416         static TRuleNames& s_GetRuleNames()
417         {
418             if(s_rule_names.size() == 0) {
419                 s_rule_names[eID_NONE]              = "NONE";
420                 s_rule_names[eID_root]              = "root";
421                 s_rule_names[eID_list1a]            = "list1a";
422                 s_rule_names[eID_list2a]            = "list2a";
423                 s_rule_names[eID_list3a]            = "list3a";
424                 s_rule_names[eID_list1b]            = "list1b";
425                 s_rule_names[eID_list2b]            = "list2b";
426                 s_rule_names[eID_list3b]            = "list3b";
427                 s_rule_names[eID_expr1]             = "expr1";
428                 s_rule_names[eID_expr2]             = "expr2";
429                 s_rule_names[eID_expr3]             = "expr3";
430                 s_rule_names[eID_translocation]     = "translocation";
431                 s_rule_names[eID_header]            = "header";
432                 s_rule_names[eID_location]          = "location";
433                 s_rule_names[eID_mol]               = "mol";
434                 s_rule_names[eID_seq_id]            = "seq_id";
435                 s_rule_names[eID_mut_list]          = "mut_list";
436                 s_rule_names[eID_mut_ref]           = "mut_ref";
437                 s_rule_names[eID_nuc_range]         = "nuc_range";
438                 s_rule_names[eID_prot_range]        = "prot_range";
439                 s_rule_names[eID_mut_inst]          = "mut_inst";
440                 s_rule_names[eID_int_fuzz]          = "int_fuzz";
441                 s_rule_names[eID_abs_pos]           = "abs_pos";
442                 s_rule_names[eID_general_pos]       = "general_pos";
443                 s_rule_names[eID_fuzzy_pos]         = "fuzzy_pos";
444                 s_rule_names[eID_pos_spec]          = "pos_spec";
445                 s_rule_names[eID_raw_seq]           = "raw_seq";
446                 s_rule_names[eID_aminoacid]         = "aminoacid";
447                 s_rule_names[eID_nuc_subst]         = "nuc_subst";
448                 s_rule_names[eID_deletion]          = "deletion";
449                 s_rule_names[eID_insertion]         = "insertion";
450                 s_rule_names[eID_delins]            = "delins";
451                 s_rule_names[eID_duplication]       = "duplication";
452                 s_rule_names[eID_nuc_inv]           = "nuc_inv";
453                 s_rule_names[eID_ssr]               = "ssr";
454                 s_rule_names[eID_conversion]        = "conversion";
455                 s_rule_names[eID_seq_loc]           = "seq_loc";
456                 s_rule_names[eID_seq_ref]           = "seq_ref";
457                 s_rule_names[eID_prot_pos]          = "prot_pos";
458                 s_rule_names[eID_prot_fs]           = "prot_fs";
459                 s_rule_names[eID_prot_missense]     = "prot_missense";
460                 s_rule_names[eID_prot_ext]          = "prot_ext";
461             }
462             return s_rule_names;
463         }
464 
465         static const string& s_GetRuleName(parser_id id);
466 
467         template <typename ScannerT>
468         struct definition
469         {
470             rule<ScannerT, parser_context<>, parser_tag<eID_root> >             root;
471             rule<ScannerT, parser_context<>, parser_tag<eID_list1a> >           list1a;
472             rule<ScannerT, parser_context<>, parser_tag<eID_list2a> >           list2a;
473             rule<ScannerT, parser_context<>, parser_tag<eID_list3a> >           list3a;
474             rule<ScannerT, parser_context<>, parser_tag<eID_list1b> >           list1b;
475             rule<ScannerT, parser_context<>, parser_tag<eID_list2b> >           list2b;
476             rule<ScannerT, parser_context<>, parser_tag<eID_list3b> >           list3b;
477             rule<ScannerT, parser_context<>, parser_tag<eID_expr1> >            expr1;
478             rule<ScannerT, parser_context<>, parser_tag<eID_expr2> >            expr2;
479             rule<ScannerT, parser_context<>, parser_tag<eID_expr3> >            expr3;
480             rule<ScannerT, parser_context<>, parser_tag<eID_translocation> >    translocation;
481             rule<ScannerT, parser_context<>, parser_tag<eID_header> >           header;
482             rule<ScannerT, parser_context<>, parser_tag<eID_seq_id> >           seq_id;
483             rule<ScannerT, parser_context<>, parser_tag<eID_mol> >              mol;
484             rule<ScannerT, parser_context<>, parser_tag<eID_mut_list > >        mut_list;
485             rule<ScannerT, parser_context<>, parser_tag<eID_mut_ref> >          mut_ref;
486             rule<ScannerT, parser_context<>, parser_tag<eID_mut_inst> >         mut_inst;
487             rule<ScannerT, parser_context<>, parser_tag<eID_int_fuzz> >         int_fuzz;
488             rule<ScannerT, parser_context<>, parser_tag<eID_abs_pos> >          abs_pos;
489             rule<ScannerT, parser_context<>, parser_tag<eID_general_pos> >      general_pos;
490             rule<ScannerT, parser_context<>, parser_tag<eID_fuzzy_pos> >        fuzzy_pos;
491             rule<ScannerT, parser_context<>, parser_tag<eID_pos_spec> >         pos_spec;
492             rule<ScannerT, parser_context<>, parser_tag<eID_location> >         location;
493             rule<ScannerT, parser_context<>, parser_tag<eID_nuc_range> >        nuc_range;
494             rule<ScannerT, parser_context<>, parser_tag<eID_prot_range> >       prot_range;
495             rule<ScannerT, parser_context<>, parser_tag<eID_raw_seq> >          raw_seq;
496             rule<ScannerT, parser_context<>, parser_tag<eID_aminoacid> >        aminoacid;
497             rule<ScannerT, parser_context<>, parser_tag<eID_nuc_subst> >        nuc_subst;
498             rule<ScannerT, parser_context<>, parser_tag<eID_deletion> >         deletion;
499             rule<ScannerT, parser_context<>, parser_tag<eID_insertion> >        insertion;
500             rule<ScannerT, parser_context<>, parser_tag<eID_delins> >           delins;
501             rule<ScannerT, parser_context<>, parser_tag<eID_duplication> >      duplication;
502             rule<ScannerT, parser_context<>, parser_tag<eID_nuc_inv> >          nuc_inv;
503             rule<ScannerT, parser_context<>, parser_tag<eID_ssr> >              ssr;
504             rule<ScannerT, parser_context<>, parser_tag<eID_conversion> >       conversion;
505             rule<ScannerT, parser_context<>, parser_tag<eID_seq_loc> >          seq_loc;
506             rule<ScannerT, parser_context<>, parser_tag<eID_seq_ref> >          seq_ref;
507             rule<ScannerT, parser_context<>, parser_tag<eID_prot_pos> >         prot_pos;
508             rule<ScannerT, parser_context<>, parser_tag<eID_prot_missense> >    prot_missense;
509             rule<ScannerT, parser_context<>, parser_tag<eID_prot_ext> >         prot_ext;
510             rule<ScannerT, parser_context<>, parser_tag<eID_prot_fs> >          prot_fs;
511 
definitionvariation_ref::CHgvsParser::SGrammar::definition512             definition(SGrammar const&)
513             {
514                 aminoacid       = str_p("Ala")
515                                 | str_p("Asx")
516                                 | str_p("Cys")
517                                 | str_p("Asp")
518                                 | str_p("Glu")
519                                 | str_p("Phe")
520                                 | str_p("Gly")
521                                 | str_p("His")
522                                 | str_p("Ile")
523                                 | str_p("Lys")
524                                 | str_p("Leu")
525                                 | str_p("Met")
526                                 | str_p("Asn")
527                                 | str_p("Pro")
528                                 | str_p("Gln")
529                                 | str_p("Arg")
530                                 | str_p("Ser")
531                                 | str_p("Thr")
532                                 | str_p("Val")
533                                 | str_p("Trp")
534                                 | str_p("Tyr")
535                                 | str_p("Glx")
536                                 | chset<>("XARNDCEQGHILKMFPSTWYV") //"X" as in p.X110GlnextX17
537                                 ;
538 
539                 raw_seq         = leaf_node_d[+aminoacid | +chset<>("ACGTN") | +chset<>("acgun")];
540                     /*
541                      * Note: there's no separation between protein, DNA and RNA sequences, as
542                      * at the parse time it is not known without context which sequence it is - i.e.
543                      * "AC" could be a dna sequence, or AlaCys. Hence, raw_seq will match any sequence
544                      * and at the parse-tree transformation stage we'll create proper seq-type as
545                      * the mol will be known then.
546                      *
547                      * Note: +aminoacid precedes +chset<>("ACGTN") so that we don't prematurely match T in Trp etc.
548                      */
549 
550 
551                 /*
552                  * Positions and Locations
553                  */
554 
555                 int_fuzz        = ch_p('(') >> (ch_p('?')|int_p) >> ch_p('_') >> (ch_p('?')|int_p) >> ch_p(')')
556                                 | ch_p('(') >> int_p >> ch_p(')') //note: not ch_p('?') here as this makes grammar amgiguous
557                                 | (ch_p('?')|int_p);
558 
559                 abs_pos         = !ch_p('*') >> int_fuzz;
560                     //Note: '*' means the location is CDS-stop-relative; cdna context required.
561                     //Otherwise it is CDS-start-relative iff in cdna-context.
562 
563                 general_pos     = (str_p("IVS") >> int_p | abs_pos) >> sign_p >> int_fuzz
564                                 | abs_pos;
565                     //Warning: offset-pos must be followed by a sign because otherwise
566                     //it is ambiguous whether c.123(1_2) denotes a SSR or a fuzzy intronic location
567 
568 
569                 fuzzy_pos       = discard_node_d[ch_p('(')]
570                                 >> general_pos
571                                 >> discard_node_d[ch_p('_')]
572                                 >> general_pos
573                                 >> discard_node_d[ch_p(')')];
574 
575                 pos_spec        = general_pos   //intronic offset-pos
576                                 | fuzzy_pos     //(generalpos_generalpos)
577                                 | !ch_p('o') >> header >> pos_spec;               //far-loc, as in id:c.88+101_id2:c.355-1045del
578 
579 
580                 prot_pos        = raw_seq >> pos_spec; //raw_seq must be of length one (single aminoacid)
581 
582                 prot_range      = prot_pos >> discard_node_d[ch_p('_')] >> prot_pos;
583 
584                 nuc_range       = pos_spec >> discard_node_d[ch_p('_')] >> pos_spec;
585 
586                 location        = nuc_range | pos_spec | prot_range | prot_pos;
587                     //Note that this describes "local" coordinates within a sequence context, not a seq-loc
588 
589                 /*
590                  * Seq-ids and seq-locs
591                  */
592                 seq_id          = leaf_node_d[alpha_p >> +(alnum_p | chset<>("._-|"))];
593 
594                 mol             = str_p("mt") | chset<>("gcrpm"); //note: for 'mt.' also supporting 'm.'
595 
596                 header          = seq_id
597                                   >> !(discard_node_d[ch_p('{')]
598                                        >> seq_id
599                                        >> discard_node_d[ch_p('}')])
600                                   >> discard_node_d[ch_p(':')]
601                                   >> mol
602                                   >> discard_node_d[ch_p('.')];
603                    /*
604                     * A the parsing stage we'll not require that the mutations for
605                     * different mol-types don't mix, i.e. protein mutation type for a
606                     * mol-type "c." - this will be deferred to the semantic check that will
607                     * validate the parsed expression. The reason is that the g/c/r/p specs mostly
608                     * overlap, so we can avoid exploding the grammar.
609                     */
610 
611                 seq_loc         = !ch_p('o') >> header >> location;
612 
613                 seq_ref         = seq_loc       //far-loc
614                                 | (nuc_range|prot_range)      //local-loc of range-type, e.g. c.17_18ins5_16 http://www.hgvs.org/mutnomen/FAQ.html
615                                                               //This is to exclude point-locs (see below)
616                                 | raw_seq       //literal sequence
617                                 | int_fuzz;     //unknown sequence of some length
618                                                 //  WARNING: this is ambiguous WRT local-loc!
619                                                 //  e.g. p.Glu5Valins2fsX3 - 2 in ins2 indicates sequence of length two, NOT the sequence at position 2.
620                                                 //  Hence, the local-locs above must be specified via range-types only.
621 
622                 nuc_subst       = raw_seq >> ch_p('>') >> raw_seq; //semantic check: must be of length 1
623 
624                 deletion        = str_p("del") >> !(raw_seq | int_p);
625 
626                 duplication     = str_p("dup") >> !seq_ref;
627 
628                 insertion       = str_p("ins") >> seq_ref;
629 
630                 conversion      = str_p("con") >> seq_loc;
631 
632                 delins          = str_p("del") >> !raw_seq >> str_p("ins") >> seq_ref;
633 
634                 nuc_inv         = str_p("inv") >> !int_p;
635 
636                 ssr             = !raw_seq >> ( int_fuzz - (ch_p('?')|int_p) //don't want to interpret 'id:5_6?' as ssr
637                                               | list_p(discard_node_d[ch_p('[')]
638                                                     >> int_p
639                                                     >> discard_node_d[ch_p(']')],
640                                                        discard_node_d[ch_p('+')]));
641                     /*
642                      * Note: It is not correct to match [5] and [3] in NM_000815.2:c.101TC[5]+[3]
643                      * individually as list3a within c.101 location context, because
644                      * the location-spec for an ssr may point to the first repeat-unit base only,
645                      * in which case we need to calculate the actual range-loc based on the sequence literal,
646                      * and when processing "[3]" later the "TC" literal will not be in context,
647                      * and it will appear as if the context is NM_000815.2:c.101 rather than c.101_102.
648                      *
649                      * Hence, the ssr rule will have to consume the list of multipliers [5]+[3]
650                      * and generate compound variation-ref "manually"
651                      */
652 
653 
654                 prot_fs         = str_p("fs") >> !(ch_p('X') >> int_p);
655 
656                 prot_ext        = (str_p("extMet") | str_p("extX")) >> int_p;
657 
658                 prot_missense   = aminoacid;
659 
660                 //ISCN expression followed by a seq-loc
661                 translocation = str_p("t(")
662                                   >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))]
663                                   >> str_p(")(")
664                                   >> leaf_node_d[*(print_p - ch_p('(') - ch_p(')'))]
665                                   >> str_p(")")
666                                   >> seq_loc;
667 
668                 mut_inst        = ch_p('?') //can exist within +mut_inst, e.g. p.Met1?extMet-5
669                                 | ch_p('=')
670                                 | delins    //delins must precede del
671                                 | deletion
672                                 | insertion
673                                 | duplication
674                                 | nuc_subst
675                                 | nuc_inv
676                                 | ssr
677                                 | conversion
678                                 | prot_fs
679                                 | prot_missense
680                                 | prot_ext //this may occur inside location context, e.g. p.Met1ValextMet-12
681                                 ;
682                     //Note: '?' and '=' can exist both within a location context
683                     //(i.e. expr3) or within a sequence context (i.e. expr2)
684                     //additionally, prot_ext may exist as expr2 (outside of location context) as well.
685 
686 
687                 root            = list_p(expr1, ch_p('+'));
688                     //At the root level the '+'-delimited expressions are not required to be bracketed, i.e.
689                     //NM_004004.2:c.[35delG]+NM_006783.1:c.[689_690insT] instead of
690                     //[NM_004004.2:c.35delG]+[NM_006783.1:c.689_690insT]
691 
692 
693                 expr1           = ch_p('(') >> expr1 >> ch_p(')')
694                                 | list1a
695                                 | header >> expr2
696                                 | translocation;
697                 list1a          = list_p(discard_node_d[ch_p('[')] >> list1b >> discard_node_d[ch_p(']')], ch_p('+'));
698                 list1b          = list_p(expr1, chset<>(",;") | str_p("(+)"));
699 
700 
701                 expr2           = ch_p('(') >> expr2 >> ch_p(')')
702                                 | list2a
703                                 | str_p("0?") //note: precdes location>>expr3 such that not matched as unknown variation at pos=0 here
704                                 | ch_p('0')
705                                 | location >> expr3
706                                 | prot_ext    //can also exist within location context (mut_inst)
707                                 | ch_p('?')   //note: follows location>>expr3 such that variation at unknown pos is not partially-matched as unknown variation
708                                 | ch_p('=');
709                 list2a          = list_p(discard_node_d[ch_p('[')] >> list2b >> discard_node_d[ch_p(']')], ch_p('+'));
710                 list2b          = list_p(expr2, chset<>(",;") | str_p("(+)"));
711 
712 
713                 expr3           = ch_p('(') >> expr3 >> ch_p(')')
714                                 | list3a
715                                 | +mut_inst;
716                     /*
717                      * Note: Multiple mut_insts that are not delimited, i.e.
718                      * abc instead of [a;b;c] are legit, e.g.
719                      *
720                      * a) p.X110GlnextX17
721                      * b) NM_012345.3:c.123+45_123+51dupinsAB012345.3:g.393_1295
722                      */
723                 list3a          = list_p(discard_node_d[ch_p('[')] >> list3b >> discard_node_d[ch_p(']')], ch_p('+'));
724                 list3b          = list_p(expr3, chset<>(",;") | str_p("(+)"));
725 
726                 //BOOST_SPIRIT_DEBUG_RULE(expr1);
727             }
728 
startvariation_ref::CHgvsParser::SGrammar::definition729             rule<ScannerT, parser_context<>, parser_tag<eID_root> > const& start() const
730             {
731                 return root;
732             }
733         };
734 
s_is_listvariation_ref::CHgvsParser::SGrammar735         static bool s_is_list(parser_id id)
736         {
737             return id == SGrammar::eID_list1a
738                 || id == SGrammar::eID_list2a
739                 || id == SGrammar::eID_list3a
740                 || id == SGrammar::eID_list1b
741                 || id == SGrammar::eID_list2b
742                 || id == SGrammar::eID_list3b
743                 || id == SGrammar::eID_root;
744         }
745     };
746     static SGrammar s_grammar;
747 
748 
749 private:
750     typedef tree_match<char const*> TParseTreeMatch;
751     typedef TParseTreeMatch::const_tree_iterator TIterator;
752     typedef CVariation_inst::TDelta::value_type TDelta;
753     typedef CVariation_ref::TData::TSet TVariationSet;
754 
755 
756     static SFuzzyInt x_int_fuzz (TIterator const& i, const CContext& context);
757 
758     static CRef<CSeq_point>      x_abs_pos         (TIterator const& i, const CContext& context);
759     static SOffsetPoint          x_general_pos     (TIterator const& i, const CContext& context);
760     static SOffsetPoint          x_fuzzy_pos       (TIterator const& i, const CContext& context);
761     static SOffsetPoint          x_pos_spec        (TIterator const& i, const CContext& context);
762     static SOffsetPoint          x_prot_pos        (TIterator const& i, const CContext& context);
763 
764     static SOffsetLoc            x_range           (TIterator const& i, const CContext& context);
765     static SOffsetLoc            x_location        (TIterator const& i, const CContext& context);
766 
767     static CRef<CSeq_loc>        x_seq_loc         (TIterator const& i, const CContext& context);
768     static CRef<CSeq_literal>    x_raw_seq         (TIterator const& i, const CContext& context);
769     static TDelta                x_seq_ref         (TIterator const& i, const CContext& context);
770     static CRef<CVariation_ref>  x_identity        (const CContext& context);
771     static CRef<CVariation_ref>  x_delins          (TIterator const& i, const CContext& context);
772     static CRef<CVariation_ref>  x_deletion        (TIterator const& i, const CContext& context);
773     static CRef<CVariation_ref>  x_insertion       (TIterator const& i, const CContext& context);
774     static CRef<CVariation_ref>  x_duplication     (TIterator const& i, const CContext& context);
775     static CRef<CVariation_ref>  x_nuc_subst       (TIterator const& i, const CContext& context);
776     static CRef<CVariation_ref>  x_nuc_inv         (TIterator const& i, const CContext& context);
777     static CRef<CVariation_ref>  x_ssr             (TIterator const& i, const CContext& context);
778     static CRef<CVariation_ref>  x_conversion      (TIterator const& i, const CContext& context);
779     static CRef<CVariation_ref>  x_prot_ext        (TIterator const& i, const CContext& context);
780     static CRef<CVariation_ref>  x_prot_missense   (TIterator const& i, const CContext& context);
781     static CRef<CVariation_ref>  x_translocation   (TIterator const& i, const CContext& context);
782     static CRef<CVariation_ref>  x_mut_inst        (TIterator const& i, const CContext& context);
783     static CRef<CVariation_ref>  x_expr1           (TIterator const& i, const CContext& context);
784     static CRef<CVariation_ref>  x_expr2           (TIterator const& i, const CContext& context);
785     static CRef<CVariation_ref>  x_expr3           (TIterator const& i, const CContext& context);
786     static CRef<CVariation_ref>  x_prot_fs         (TIterator const& i, const CContext& context);
787     static CRef<CVariation_ref>  x_list            (TIterator const& i, const CContext& context);
788     static CContext              x_header          (TIterator const& i, const CContext& context);
789     static CRef<CSeq_feat>       x_root            (TIterator const& i, const CContext& context);
790 
791     static CRef<CVariation_ref>  x_unwrap_iff_singleton(CVariation_ref& v);
792 
793 
794     ///Convert HGVS amino-acid code to ncbieaa
795     static string s_hgvsaa2ncbieaa(const string& hgvsaa);
796 
797     ///Convert non-HGVS compliant all-uppercase AAs to UpLow, e.g. ILECYS ->IleCys
798     static string s_hgvsUCaa2hgvsUL(const string& hgvsaa);
799 
800 private:
801     //functions to create hgvs expression from a variation-ref
802 
803     /// variatino must have seq-loc specified
804     string x_AsHgvsExpression(const CVariation_ref& variation,
805                             const CSeq_loc& parent_loc, //if variation has seq-loc set, it will be used instead.
806                             bool is_top_level);
807 
808 
809     //Calculate the length of the inst, multipliers taken into account.
810     TSeqPos x_GetInstLength(const CVariation_inst& inst, const CSeq_loc& this_loc);
811 
812     string x_GetInstData(const CVariation_inst& inst, const CSeq_loc& this_loc);
813 
814 
815     /// Only subset of insts can be expressed as HGVS, this will throw otherwise.
816     /// loc is required to capture sequence at loc for snps, e.g. 'A' in 'A>G'
817     ///
818     /// loc is modified to conform to HGVS flavor (on plus strand, modify
819     /// starts/stops for ins or microsatellite locatinos to conform to HGVS
820     /// convention where differes from Variation-ref standard)
821     string x_InstToString(const CVariation_inst& inst, CSeq_loc& loc);
822 
823     string x_LocToSeqStr(const CSeq_loc& loc);
824 
825     /*
826      * Convert a loc to hgvs representation.
827      *
828      *
829      * if alignment is provided (spliced-seg), then it is assumed that loc is the
830      * genomic loc but the output should be represented in cdna context.
831      */
832     string x_SeqLocToStr(const CSeq_loc& loc, bool with_header);
833 
834     // force-map: remap via alignment
835     string x_SeqPntToStr(const CSeq_point& pnt, TSeqPos first_pos);
836 
837     /// Convert seq-id to HGVS seq-id header, e.g. "NM_123456.7:c." or "NG_123456.7:p"
838     string x_SeqIdToHgvsHeader(const CSeq_id& id);
839 
840     string x_IntWithFuzzToStr(int value, const CInt_fuzz* fuzz = NULL);
841 
842     //string Delta_itemToStr(const CDelta_item& delta, bool flip_strand);
843 
844     string x_SeqLiteralToStr(const CSeq_literal& literal, bool flip_strand);
845 
846 
847 
848     static CRef<CVariation_ref> s_ProtToCdna(const CVariation_ref& vr, CScope& scope);
849 
850 private:
851     CRef<CScope> m_scope;
852 };
853 
854 };
855 
856 END_NCBI_SCOPE
857 
858 
859 #endif /* HGVSPARSER_HPP_ */
860