1--$Revision: 370718 $
2--**********************************************************************
3--
4--  NCBI Sequence Alignment elements
5--  by James Ostell, 1990
6--
7--**********************************************************************
8
9NCBI-Seqalign DEFINITIONS ::=
10BEGIN
11
12EXPORTS Seq-align, Score, Score-set, Seq-align-set;
13
14IMPORTS Seq-id, Seq-loc , Na-strand FROM NCBI-Seqloc
15        User-object, Object-id FROM NCBI-General;
16
17--*** Sequence Alignment ********************************
18--*
19
20Seq-align-set ::= SET OF Seq-align   -- unordered collection of alignments (also the type of the 'disc' alternative of Seq-align.segs)
21
22Seq-align ::= SEQUENCE {          -- one alignment: type tag, optional metadata, and the segment data
23    type ENUMERATED {             -- kind of alignment
24        not-set (0) ,
25        global (1) ,
26        diags (2) ,     -- unbroken, but not ordered, diagonals
27        partial (3) ,   -- mapping pieces together
28        disc (4) ,      -- discontinuous alignment
29        other (255) } ,
30    dim INTEGER OPTIONAL ,     -- dimensionality
31    score SET OF Score OPTIONAL ,   -- for whole alignment
32    segs CHOICE {                   -- alignment data, in exactly one of these encodings
33        dendiag SEQUENCE OF Dense-diag ,   -- list of diagonals (see Dense-diag)
34        denseg              Dense-seg ,    -- densest packing (see Dense-seg)
35        std     SEQUENCE OF Std-seg ,      -- segments given as Seq-locs (see Std-seg)
36        packed              Packed-seg ,   -- per-segment presence flags (see Packed-seg)
37        disc                Seq-align-set, -- nested set of alignments
38        spliced             Spliced-seg,   -- exon-based product/genomic alignment (see Spliced-seg)
39        sparse              Sparse-seg     -- set of pairwise rows (see Sparse-seg)
40    } ,
41
42    -- regions of sequence over which the alignment
43    --  was computed
44    bounds SET OF Seq-loc OPTIONAL,
45
46    -- alignment id
47    id SEQUENCE OF Object-id OPTIONAL,
48
49    -- extra info
50    ext SEQUENCE OF User-object OPTIONAL
51}
52
53Dense-diag ::= SEQUENCE {         -- for (multiway) diagonals
54    dim INTEGER DEFAULT 2 ,    -- dimensionality; 2 is assumed when omitted
55    ids SEQUENCE OF Seq-id ,   -- sequences in order
56    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order
57    len INTEGER ,                 -- len of aligned segments (a single value for the whole diagonal)
58    strands SEQUENCE OF Na-strand OPTIONAL ,   -- strands; presumably in ids order (TODO confirm)
59    scores SET OF Score OPTIONAL }             -- scores attached to this diagonal
60
61    -- Dense-seg: the densest packing for sequence alignments only.
62    --            a start of -1 indicates a gap for that sequence of
63    --            length lens.
64    --
65    -- id=100  AAGGCCTTTTAGAGATGATGATGATGATGA
66    -- id=200  AAGGCCTTTTAG.......GATGATGATGA
67    -- id=300  ....CCTTTTAGAGATGATGAT....ATGA
68    --
69    -- dim = 3, numseg = 6, ids = { 100, 200, 300 }
70    -- starts = { 0,0,-1, 4,4,0, 12,-1,8, 19,12,15, 22,15,-1, 26,19,18 }
71    -- lens = { 4, 8, 7, 3, 4, 4 }
72    --
73
74Dense-seg ::= SEQUENCE {          -- for (multiway) global or partial alignments
75    dim INTEGER DEFAULT 2 ,       -- dimensionality (3 in the example above, which has three ids)
76    numseg INTEGER ,              -- number of segments here
77    ids SEQUENCE OF Seq-id ,      -- sequences in order
78    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order within segs; -1 marks a gap
79    lens SEQUENCE OF INTEGER ,    -- length of each segment (example above: 6 lens for numseg = 6)
80    strands SEQUENCE OF Na-strand OPTIONAL ,   -- strands; presumably laid out like starts (TODO confirm)
81    scores SEQUENCE OF Score OPTIONAL }  -- score for each seg
82
83Packed-seg ::= SEQUENCE {         -- for (multiway) global or partial alignments
84    dim INTEGER DEFAULT 2 ,       -- dimensionality; 2 is assumed when omitted
85    numseg INTEGER ,              -- number of segments here
86    ids SEQUENCE OF Seq-id ,      -- sequences in order
87    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order for whole alignment
88    present OCTET STRING ,        -- Boolean flags: whether each sequence is present or
89                                  --   absent in each segment
90    lens SEQUENCE OF INTEGER ,    -- length of each segment
91    strands SEQUENCE OF Na-strand OPTIONAL ,   -- strands; layout is not stated here (TODO confirm)
92    scores SEQUENCE OF Score OPTIONAL }  -- score for each segment
93
94Std-seg ::= SEQUENCE {            -- one segment described by Seq-locs instead of integer starts and lengths
95    dim INTEGER DEFAULT 2 ,       -- dimensionality
96    ids SEQUENCE OF Seq-id OPTIONAL ,   -- sequences; optional in this encoding
97    loc SEQUENCE OF Seq-loc ,           -- locations of the aligned pieces; presumably one per sequence (TODO confirm)
98    scores SET OF Score OPTIONAL }      -- scores attached to this segment
99
100
101Spliced-seg ::= SEQUENCE {        -- alignment of a product to a genomic sequence, described exon by exon
102    -- product is either protein or transcript (cDNA)
103    product-id Seq-id OPTIONAL,
104    genomic-id Seq-id OPTIONAL,      -- Spliced-exon declares the same optional id and strand fields
105
106    -- should be 'plus' or 'minus'
107    product-strand Na-strand OPTIONAL ,
108    genomic-strand Na-strand OPTIONAL ,
109
110    product-type ENUMERATED {        -- which kind of product is aligned
111        transcript(0),
112        protein(1)
113    },
114
115    -- set of segments involved
116    -- each segment corresponds to one exon
117    -- exons are always in biological order
118    exons SEQUENCE OF Spliced-exon ,
119
120    -- start of poly(A) tail on the transcript
121    -- For sense transcripts:
122    --   aligned product positions < poly-a <= product-length
123    --   poly-a == product-length indicates inferred poly(A) tail at transcript's end
124    -- For antisense transcripts:
125    --   -1 <= poly-a < aligned product positions
126    --   poly-a == -1 indicates inferred poly(A) tail at transcript's start
127    poly-a INTEGER OPTIONAL,
128
129    -- length of the product, in bases/residues
130    -- from this (or from poly-a if present), a 3' unaligned length can be extracted
131    product-length INTEGER OPTIONAL,
132
133    -- alignment descriptors / modifiers
134    -- kept as a set so that further modifiers can be added later
135    modifiers SET OF Spliced-seg-modifier OPTIONAL
136}
137
138Spliced-seg-modifier ::= CHOICE {   -- one flag describing a Spliced-seg alignment
139    -- protein aligns from the start and the first codon
140    -- on both product and genomic is a start codon
141    start-codon-found BOOLEAN,
142
143    -- protein aligns to its end and there is a stop codon
144    -- on the genomic right after the alignment
145    stop-codon-found BOOLEAN
146}
147
148
149-- complete or partial exon
150-- two consecutive Spliced-exons may belong to one exon
151Spliced-exon ::= SEQUENCE {
152    -- product-end >= product-start
153    product-start Product-pos ,     -- nucleotide or protein position (see Product-pos)
154    product-end Product-pos ,
155
156    -- genomic-end >= genomic-start
157    genomic-start INTEGER ,
158    genomic-end INTEGER ,
159
160    -- product is either protein or transcript (cDNA)
161    product-id Seq-id OPTIONAL ,
162    genomic-id Seq-id OPTIONAL ,    -- Spliced-seg declares the same optional id and strand fields
163
164    -- should be 'plus' or 'minus'
165    product-strand Na-strand OPTIONAL ,
166
167    -- genomic-strand represents the strand of translation
168    genomic-strand Na-strand OPTIONAL ,
169
170    -- basic segments are always in biological order
171    parts SEQUENCE OF Spliced-exon-chunk OPTIONAL ,
172
173    -- scores for this exon
174    scores Score-set OPTIONAL ,
175
176    -- splice sites
177    acceptor-before-exon Splice-site OPTIONAL,
178    donor-after-exon Splice-site OPTIONAL,
179
180    -- flag: is this exon complete or partial?
181    partial BOOLEAN OPTIONAL,
182
183    -- extra info
184    ext SEQUENCE OF User-object OPTIONAL
185}
186
187
188Product-pos ::= CHOICE {          -- position on the product, in one of two forms
189    nucpos INTEGER,               -- plain integer position
190    protpos Prot-pos              -- amino-acid position plus frame (see Prot-pos)
191}
192
193
194-- position on protein (1/3 of amino-acid resolution)
195Prot-pos ::= SEQUENCE {
196    -- amino-acid position (0-based)
197    amin INTEGER ,
198
199    -- position within codon (1-based)
200    -- 0 = not set (to be treated as 1); 0 is also the value assumed when the field is omitted
201    frame INTEGER DEFAULT 0
202}
203
204
205-- Spliced-exon-chunk: piece of an exon
206-- lengths are given in nucleotide bases (1/3 of an amino acid when the product is a
207-- protein)
208Spliced-exon-chunk ::= CHOICE {   -- the alternative names the kind of chunk; the INTEGER is its length
209    -- both sequences represented, product and genomic sequences match
210    match INTEGER ,
211
212    -- both sequences represented, product and genomic sequences do not match
213    mismatch INTEGER ,
214
215    -- both sequences are represented, there is sufficient similarity
216    -- between product and genomic sequences. Can be used to replace stretches
217    -- of matches and mismatches, mostly for protein to genomic where
218    -- definition of match or mismatch depends on translation table
219    diag INTEGER ,
220
221    -- insertion in product sequence (i.e. gap in the genomic sequence)
222    product-ins INTEGER ,
223
224    -- insertion in genomic sequence (i.e. gap in the product sequence)
225    genomic-ins INTEGER
226}
227
228
229-- site involved in splice
230Splice-site ::= SEQUENCE {        -- referenced by Spliced-exon as acceptor-before-exon and donor-after-exon
231    -- typically two bases in the intronic region, always
232    -- in IUPAC format
233    bases VisibleString
234}
235
236
237-- ==========================================================================
238--
239-- Sparse-seg follows the semantics of Dense-seg and is better suited to
240-- representing sparse multiple alignments
241--
242-- ==========================================================================
243
244
245Sparse-seg ::= SEQUENCE {         -- multiple alignment stored as a set of pairwise rows
246    master-id Seq-id OPTIONAL,    -- presumably the sequence shared by all rows (TODO confirm)
247
248    -- pairwise alignments constituting this multiple alignment
249    rows SET OF Sparse-align,
250
251    -- per-row scores
252    row-scores SET OF Score OPTIONAL,
253
254    -- index of extra items (see Sparse-seg-ext)
255    ext  SET OF Sparse-seg-ext OPTIONAL
256}
257
258Sparse-align ::= SEQUENCE {       -- one pairwise alignment; the element type of Sparse-seg.rows
259    first-id Seq-id,
260    second-id Seq-id,
261
262    numseg INTEGER,                      -- number of segments
263    first-starts SEQUENCE OF INTEGER ,   -- starts on the first sequence [numseg]
264    second-starts SEQUENCE OF INTEGER ,  -- starts on the second sequence [numseg]
265    lens SEQUENCE OF INTEGER ,           -- lengths of segments [numseg]
266    second-strands SEQUENCE OF Na-strand OPTIONAL ,   -- strands for the second sequence; presumably [numseg] (TODO confirm)
267
268    -- per-segment scores
269    seg-scores SET OF Score OPTIONAL
270}
271
272Sparse-seg-ext ::= SEQUENCE {     -- extra item for Sparse-seg.ext; currently carries only an index
273    --seg-ext SET OF {
274    --    index INTEGER,
275    --    data User-field
276    -- }
277    index INTEGER                 -- the only active field; the commented-out sketch above paired an index with a User-field
278}
279
280
281
282-- use of Score is discouraged for external ASN.1 specifications
283Score ::= SEQUENCE {              -- a single score: optional identifier plus a numeric value
284    id Object-id OPTIONAL ,       -- identifier for the score, when given
285    value CHOICE {                -- exactly one numeric representation
286        real REAL ,
287        int INTEGER
288    }
289}
290
291-- use of Score-set is encouraged for external ASN.1 specifications
292Score-set ::= SET OF Score        -- unordered collection of Score values; exported, and used by Spliced-exon.scores
293
294END
295
296