1--$Revision: 460847 $
2--**********************************************************************
3--
4--  NCBI Variation container
5--  by Variation Working Group, 2011
6--
7--  The Variation type describes a sequence change at location(s),
8--  or a hierarchical combination thereof.
9--
10--  Related location-centric type is SeqFeatData.Variation-ref
11--
12--**********************************************************************
13
14NCBI-VariationPackage DEFINITIONS ::=
15BEGIN
16
17EXPORTS Variation, VariantPlacement;
18
19IMPORTS Int-fuzz, Dbtag, User-object, Object-id FROM NCBI-General
20        Population-data, Phenotype, Variation-inst, VariantProperties FROM NCBI-Variation
21        Seq-loc FROM NCBI-Seqloc
22        SubSource FROM NCBI-BioSource
23        Seq-literal, Bioseq FROM NCBI-Sequence
24        Pub-set FROM NCBI-Pub;
25
26
27VariationException ::= SEQUENCE --a problem report: an optional numeric code plus a required text message
28{
29    code INTEGER {
30        hgvs-parsing (1), --invalid hgvs expression
31        hgvs-exon-boundary (2), --anchor position in an intronic HGVS expression is not at an exon boundary
32
33
34        inconsistent-consequence (3), --consequence protein variation attached to precursor variation's consequence
35                                      --could not be derived from it.
36
37        inconsistent-asserted-allele (4), --asserted allele is inconsistent with the reference
38
39        no-mapping (5),      --could not remap
40        partial-mapping (6), --mapped location is shorter than the query
41        split-mapping (7),   --a source interval maps to multiple non-abutting intervals.
42        mismatches-in-mapping (8), --the source sequence differs from sequence at mapped loc
43        inconsistent-asserted-moltype (9), --asserted mol-type is inconsistent with seq-id (e.g. NM_12345.6:g.)
44        bioseq-state (10), --NOTE(review): meaning is not documented in this spec; confirm with the producing code
45        ambiguous-sequence (11), --NOTE(review): meaning is not documented in this spec; confirm with the producing code
46        ref-same-as-variant (12), --reference sequence at the location is same as variant sequence in the variation
47        seqfetch-too-long (13), --can't fetch sequence because location is longer than specified threshold
48        seqfetch-intronic (14), --can't fetch sequence for an intronic (anchor+offset)-based location
49        seqfetch-invalid (15),   --can't fetch sequence because location is invalid (e.g. extends past the end)
50        no-mapping-from-newer-version (16), --have mapping from older version of a sequence, but not from newer
51        source-location-overhang (17), --The source location overhangs the alignment by at least 5kb (VAR-1307)
52        hgvs-exon-boundary-induced (18)  --Similar to (2), except induced by 5'/3'-terminal or an exon extension (VAR-1309)
53    } OPTIONAL, --code may be omitted; message below is the only required member
54
55    message VisibleString
56}
57
58VariantPlacement ::= SEQUENCE --one concrete placement of a variation; only loc and mol are required members
59{
60    -- actual concrete placement we are considering
61    loc Seq-loc,
62
63    mol INTEGER { --molecule type; each value is annotated with the HGVS coordinate prefix it corresponds to
64        unknown(0),
65        genomic(1),      --"g." coordinates in HGVS
66        cdna(2),         --"c." coordinates in HGVS
67        rna(3),          --"n." coordinates in HGVS
68        protein(4),      --"p." coordinates in HGVS
69        mitochondrion(5) --"mt." coordinates in HGVS
70    },
71
72    -- location flags
73    placement-method INTEGER {
74        projected(1),
75        asserted(2),
76        aligned(3)
77    } OPTIONAL,
78
79    -- location refinements, describing offsets into introns from product coordinates.
80    -- Biological semantics: start-offset/stop-offset apply to bio-start/bio-stop respectively.
81    -- positive = downstream; negative = upstream.
82    start-offset INTEGER OPTIONAL,
83    start-offset-fuzz Int-fuzz OPTIONAL,
84    stop-offset INTEGER OPTIONAL,
85    stop-offset-fuzz Int-fuzz OPTIONAL,
86
87    -- 0-based position of bio-start relative to containing codon
88    frame INTEGER OPTIONAL,
89
90    -- for situations in which a raw location isn't sufficient
91    seq Seq-literal OPTIONAL,
92
93    -- reference to the assembly (GenColl ID) for this location
94    assembly Dbtag OPTIONAL,
95
96    hgvs-name VisibleString OPTIONAL, --presumably the HGVS expression for this placement; TODO confirm
97
98    -- the reference location for this variant (NOTE(review): this wording does not obviously describe a free-text comment member; confirm intent)
99    comment VisibleString OPTIONAL,
100
101    exceptions SET OF VariationException OPTIONAL, --VariationException records attached to this placement
102
103    dbxrefs SET OF Dbtag OPTIONAL, --e.g. rs#, that are placement-specific
104
105    ext SET OF User-object OPTIONAL, --for process-specific placement tags/labels
106
107    gene-location INTEGER OPTIONAL, --Same semantics as VariantProperties.gene-location, except placement-specific
108
109    id Object-id OPTIONAL, --presumably this placement's own id (cf. parent-id below); TODO confirm
110    parent-id Object-id OPTIONAL, --id of the placement from which this one was derived
111
112    so-terms SEQUENCE OF INTEGER OPTIONAL --Sequence Ontology terms for this placement
113}
114
115VariationMethod ::= SEQUENCE --the method(s) used (required) plus an optional validating reference location
116{
117    -- sequencing / acquisition method
118    method SET OF INTEGER {
119        unknown             (0),
120        bac-acgh            (1),
121        computational       (2),
122        curated             (3),
123        digital-array       (4),
124        expression-array    (5),
125        fish                (6),
126        flanking-sequence   (7),
127        maph                (8),
128        mcd-analysis        (9),
129        mlpa                (10),
130        oea-assembly        (11),
131        oligo-acgh          (12),
132        paired-end          (13),
133        pcr                 (14),
134        qpcr                (15),
135        read-depth          (16),
136        roma                (17),
137        rt-pcr              (18),
138        sage                (19),
139        sequence-alignment  (20),
140        sequencing          (21),
141        snp-array           (22),
142        snp-genoytyping     (23), --NOTE(review): identifier looks like a misspelling of snp-genotyping; left unchanged because renaming would alter the public value name
143        southern            (24),
144        western             (25),
145        optical-mapping     (26),
146
147        other               (255)
148    },
149
150    -- if sequence-based validation methods are used,
151    -- what reference sequence location validated the presence of this?
152    reference-location Seq-loc OPTIONAL
153}
154
155
156Variation ::= SEQUENCE --a sequence change at location(s), or a hierarchical combination thereof; data is the only required member
157{
158    -- ids (i.e., SNP rsid / ssid, dbVar nsv/nssv)
159    -- expected values include 'dbSNP|rs12334', 'dbSNP|ss12345', 'dbVar|nsv1'
160    --
161    -- we relate three kinds of IDs here:
162    --  - our current object's id
163    --  - the id of this object's parent, if it exists
164    --  - the sample ID that this item originates from
165    id        Dbtag OPTIONAL,
166    parent-id Dbtag OPTIONAL,
167    sample-id SET OF Object-id OPTIONAL,
168    other-ids SET OF Dbtag OPTIONAL,
169
170    -- names and synonyms
171    -- some variants have well-known canonical names and possible accepted
172    -- synonyms
173    name VisibleString OPTIONAL,
174    synonyms SET OF VisibleString OPTIONAL,
175
176    -- tag for comment and descriptions
177    description VisibleString OPTIONAL,
178
179    -- where this beast is seen
180    -- note that this is a set of locations, and there are no restrictions on
181    -- the contents of this set.
182    placements SEQUENCE OF VariantPlacement OPTIONAL,
183
184    -- phenotype
185    phenotype SET OF Phenotype OPTIONAL,
186
187    -- sequencing / acquisition method
188    method VariationMethod OPTIONAL,
189
190    -- Note about SNP representation and pertinent fields: allele-frequency,
191    -- population, quality-codes:
192    -- The case of multiple alleles for a SNP would be described by
193    -- parent-feature of type Variation-set.diff-alleles, where the child
194    -- features of type Variation-inst, all at the same location, would
195    -- describe individual alleles.
196
197    -- population data
198    population-data SET OF Population-data OPTIONAL,
199
200    -- variant properties bit fields
201    variant-prop VariantProperties OPTIONAL,
202
203    -- publication support; same type as in seq-feat
204    pub Pub-set OPTIONAL,
205
206    -- Reference to an external clinical test record (NOTE(review): original comment was truncated after 'external'; confirm the target database)
207    clinical-test Dbtag OPTIONAL,
208
209    data CHOICE { --required: exactly one of the alternatives below is present
210        unknown NULL,
211        note    VisibleString, --free-form
212        uniparental-disomy NULL,
213
214        -- actual sequence-edit at feat.location
215        instance        Variation-inst,
216
217        -- Set of related Variations.
218        -- Location of the set equals the union of member locations
219        set SEQUENCE {
220            type INTEGER {
221                unknown     (0),
222                compound    (1), -- complex change at the same location on the
223                                 -- same molecule
224                products    (2), -- different products arising from the same
225                                 -- variation in a precursor, e.g. r.[13g>a,
226                                 -- 13_88del]
227                haplotype   (3), -- changes on the same allele, e.g.
228                                 -- r.[13g>a;15u>c]
229                genotype    (4), -- changes on different alleles in the same
230                                 -- genotype, e.g. g.[476C>T]+[476C>T]
231                mosaic      (5), -- different genotypes in the same individual
232                individual  (6), -- same organism; allele relationship unknown,
233                                 -- e.g. g.[476C>T(+)183G>C]
234                population  (7), -- population
235                alleles     (8), -- set represents a set of observed alleles
236                package     (9), -- set represents a package of observations at
237                                 -- a given location, generally containing
238                                 -- asserted + reference
239                chimeric    (10), -- e.g. c.[1C>T//2G>T]
240                other       (255)
241            },
242            variations SET OF Variation, --recursive: members are themselves Variation objects
243            name  VisibleString OPTIONAL
244        },
245
246        -- variant is a complex and undescribed change at the location
247        -- This type of variant is known to occur in dbVar submissions
248        complex NULL,
249
250        seq Bioseq -- Sequence as it exists post-alteration
251    },
252
253    consequence SET OF CHOICE {
254        unknown     NULL,
255        splicing    NULL, --some effect on splicing
256        note        VisibleString,  --freeform
257
258        -- Describe resulting variation in the product, e.g. missense,
259        -- nonsense, silent, neutral, etc in a protein, that arises from
260        -- THIS variation.
261        variation   Variation,
262
263        loss-of-heterozygosity SEQUENCE {
264            -- In germline comparison, it will be reference genome assembly
265            -- (default) or reference/normal population. In somatic mutation,
266            -- it will be a name of the normal tissue.
267            reference VisibleString OPTIONAL,
268
269            -- Name of the testing subject type or the testing tissue.
270            test VisibleString OPTIONAL
271        }
272    } OPTIONAL,
273
274    -- Frameshift-related info. Applies only to protein-level variations.
275    -- see http://www.hgvs.org/mutnomen/recs-prot.html
276    frameshift SEQUENCE {
277       phase INTEGER OPTIONAL,
278       x-length INTEGER OPTIONAL
279    } OPTIONAL,
280
281    -- Additional undescribed extensions
282    ext             SET OF User-object OPTIONAL,
283
284    somatic-origin SET OF SEQUENCE {
285        -- description of the somatic origin itself
286        source SubSource OPTIONAL,
287        -- condition related to this origin's type
288        condition SEQUENCE {
289            description VisibleString OPTIONAL,
290            -- reference to BioTerm / other descriptive database
291            object-id SET OF Dbtag OPTIONAL
292        } OPTIONAL
293    } OPTIONAL,
294
295    exceptions SET OF VariationException OPTIONAL, --VariationException records attached to this variation as a whole
296
297    so-terms SET OF INTEGER OPTIONAL --NOTE(review): presumably Sequence Ontology terms as in VariantPlacement.so-terms, which is SEQUENCE OF rather than SET OF; confirm
298}
299
300
301END
302
303