1--$Revision: 587101 $
2--**********************************************************************
3--
4--  NCBI Sequence elements
5--  by James Ostell, 1990
6--  Version 3.0 - June 1994
7--
8--**********************************************************************
9
10NCBI-Sequence DEFINITIONS ::=
11BEGIN
12
-- Types this module makes visible to the IMPORTS clauses of other modules.
-- Each symbol is listed exactly once.
13EXPORTS Annotdesc, Annot-descr, Bioseq, GIBB-mol, Heterogen, MolInfo,
14        Numbering, Pubdesc, Seq-annot, Seq-data, Seqdesc, Seq-descr, Seq-ext,
15        Seq-hist, Seq-inst, Seq-literal, Delta-ext, Seq-gap;
16
17IMPORTS Date, Int-fuzz, Dbtag, Object-id, User-object FROM NCBI-General
18        Seq-align FROM NCBI-Seqalign
19        Seq-feat, ModelEvidenceSupport FROM NCBI-Seqfeat
20        Seq-graph FROM NCBI-Seqres
21        Pub-equiv FROM NCBI-Pub
22        Org-ref FROM NCBI-Organism
23        BioSource FROM NCBI-BioSource
24        Seq-id, Seq-loc FROM NCBI-Seqloc
25        GB-block FROM GenBank-General
26        PIR-block FROM PIR-General
27        EMBL-block FROM EMBL-General
28        SP-block FROM SP-General
29        PRF-block FROM PRF-General
30        PDB-block FROM PDB-General
31        Seq-table FROM NCBI-SeqTable;
32
33--*** Sequence ********************************
34--*
35
-- Bioseq: one sequence record.  It carries a set of Seq-id identifiers and a
-- mandatory Seq-inst; the descriptor set and the annotation set may be omitted.
36Bioseq ::= SEQUENCE {
37    id SET OF Seq-id ,            -- equivalent identifiers
38    descr Seq-descr OPTIONAL , -- descriptors
39    inst Seq-inst ,            -- the sequence data
40    annot SET OF Seq-annot OPTIONAL } -- optional set of Seq-annot blocks
41
42--*** Descriptors *****************************
43--*
44
45Seq-descr ::= SET OF Seqdesc    -- unordered collection of Seqdesc descriptors
46
-- Seqdesc: a single descriptor.  Being a CHOICE, each value holds exactly one
-- of the alternatives below.  The alternatives mol-type, modif, method and org
-- are marked for removal by the NOTE that follows this definition.
47Seqdesc ::= CHOICE {
48    mol-type GIBB-mol ,          -- type of molecule
49    modif SET OF GIBB-mod ,             -- modifiers
50    method GIBB-method ,         -- sequencing method
51    name VisibleString ,         -- a name for this sequence
52    title VisibleString ,        -- a title for this sequence
53    org Org-ref ,                -- if all from one organism
54    comment VisibleString ,      -- a more extensive comment
55    num Numbering ,              -- a numbering system
56    maploc Dbtag ,               -- map location of this sequence
57    pir PIR-block ,              -- PIR specific info
58    genbank GB-block ,           -- GenBank specific info
59    pub Pubdesc ,                -- a reference to the publication
60    region VisibleString ,       -- overall region (globin locus)
61    user User-object ,           -- user defined object
62    sp SP-block ,                -- SWISSPROT specific info
63    dbxref Dbtag ,               -- xref to other databases
64    embl EMBL-block ,            -- EMBL specific information
65    create-date Date ,           -- date entry first created/released
66    update-date Date ,           -- date of last update
67    prf PRF-block ,              -- PRF specific information
68    pdb PDB-block ,              -- PDB specific information
69    het Heterogen ,              -- cofactor, etc associated but not bound
70    source BioSource ,           -- source of materials, includes Org-ref
71    molinfo MolInfo ,            -- info on the molecule and techniques
72    modelev ModelEvidenceSupport -- model evidence for XM records
73}
74
75--******* NOTE:
76--*       mol-type, modif, method, and org are consolidated and expanded
77--*       in Org-ref, BioSource, and MolInfo in this specification. They
78--*       will be removed in later specifications. Do not use them in the
79--*       future. Instead expect the new structures.
80--*
81--***************************
82
83--********************************************************************
84--
85-- MolInfo gives information on the
86-- classification of the type and quality of the sequence
87--
88-- WARNING: this will replace GIBB-mol, GIBB-mod, GIBB-method
89--
90--********************************************************************
91
-- MolInfo: three INTEGER-coded classifications (biomol, tech, completeness),
-- each with DEFAULT unknown, plus two optional free-text fields (techexp and
-- gbmoltype).  Every member is therefore omissible.
92MolInfo ::= SEQUENCE {
93    biomol INTEGER {               -- kind of molecule
94        unknown (0) ,
95        genomic (1) ,
96        pre-RNA (2) ,              -- precursor RNA of any sort really
97        mRNA (3) ,
98        rRNA (4) ,
99        tRNA (5) ,
100        snRNA (6) ,
101        scRNA (7) ,
102        peptide (8) ,
103        other-genetic (9) ,      -- other genetic material
104        genomic-mRNA (10) ,      -- reported a mix of genomic and cdna sequence
105        cRNA (11) ,              -- viral RNA genome copy intermediate
106        snoRNA (12) ,            -- small nucleolar RNA
107        transcribed-RNA (13) ,   -- transcribed RNA other than existing classes
108        ncRNA (14) ,
109        tmRNA (15) ,
110        other (255) } DEFAULT unknown ,
111    tech INTEGER {                 -- technique that produced the sequence
112        unknown (0) ,
113        standard (1) ,          -- standard sequencing
114        est (2) ,               -- Expressed Sequence Tag
115        sts (3) ,               -- Sequence Tagged Site
116        survey (4) ,            -- one-pass genomic sequence
117        genemap (5) ,           -- from genetic mapping techniques
118        physmap (6) ,           -- from physical mapping techniques
119        derived (7) ,           -- derived from other data, not a primary entity
120        concept-trans (8) ,     -- conceptual translation
121        seq-pept (9) ,          -- peptide was sequenced
122        both (10) ,             -- concept transl. w/ partial pept. seq.
123        seq-pept-overlap (11) , -- sequenced peptide, ordered by overlap
124        seq-pept-homol (12) ,   -- sequenced peptide, ordered by homology
125        concept-trans-a (13) ,  -- conceptual transl. supplied by author
126        htgs-1 (14) ,           -- unordered High Throughput sequence contig
127        htgs-2 (15) ,           -- ordered High Throughput sequence contig
128        htgs-3 (16) ,           -- finished High Throughput sequence
129        fli-cdna (17) ,         -- full length insert cDNA
130        htgs-0 (18) ,           -- single genomic reads for coordination
131        htc (19) ,              -- high throughput cDNA
132        wgs (20) ,              -- whole genome shotgun sequencing
133        barcode (21) ,          -- barcode of life project
134        composite-wgs-htgs (22) , -- composite of WGS and HTGS
135        tsa (23) ,              -- transcriptome shotgun assembly
136        targeted (24) ,         -- targeted locus sets/studies
137        other (255) }           -- explain in the techexp field below
138               DEFAULT unknown ,
139    techexp VisibleString OPTIONAL ,   -- explanation if tech not enough
140    --
141    -- Completeness is not indicated in most records.  For genomes, assume
142    -- the sequences are incomplete unless specifically marked as complete.
143    -- For mRNAs, assume the ends are not known exactly unless marked as
144    -- having the left or right end.
145    --
146    completeness INTEGER {
147      unknown (0) ,
148      complete (1) ,                   -- complete biological entity
149      partial (2) ,                    -- partial but no details given
150      no-left (3) ,                    -- missing 5' or NH3 end
151      no-right (4) ,                   -- missing 3' or COOH end
152      no-ends (5) ,                    -- missing both ends
153      has-left (6) ,                   -- 5' or NH3 end present
154      has-right (7) ,                  -- 3' or COOH end present
155      other (255) } DEFAULT unknown ,
156    gbmoltype VisibleString OPTIONAL } -- identifies particular ncRNA
157
158
-- GIBB-mol: molecule type carried by Seqdesc.mol-type.  Values 0 to 10 use the
-- same numbers as the first eleven MolInfo.biomol values, but value 2 is named
-- pre-mRNA here and pre-RNA in MolInfo.
159GIBB-mol ::= ENUMERATED {       -- type of molecule represented
160    unknown (0) ,
161    genomic (1) ,
162    pre-mRNA (2) ,              -- precursor RNA of any sort really
163    mRNA (3) ,
164    rRNA (4) ,
165    tRNA (5) ,
166    snRNA (6) ,
167    scRNA (7) ,
168    peptide (8) ,
169    other-genetic (9) ,      -- other genetic material
170    genomic-mRNA (10) ,      -- reported a mix of genomic and cdna sequence
171    other (255) }
172
-- GIBB-mod: one modifier flag.  Seqdesc.modif holds a SET OF these, so several
-- modifiers can be given for the same sequence.
173GIBB-mod ::= ENUMERATED {        -- GenInfo Backbone modifiers
174    dna (0) ,
175    rna (1) ,
176    extrachrom (2) ,
177    plasmid (3) ,
178    mitochondrial (4) ,
179    chloroplast (5) ,
180    kinetoplast (6) ,
181    cyanelle (7) ,
182    synthetic (8) ,
183    recombinant (9) ,
184    partial (10) ,
185    complete (11) ,
186    mutagen (12) ,    -- subject of mutagenesis ?
187    natmut (13) ,     -- natural mutant ?
188    transposon (14) ,
189    insertion-seq (15) ,
190    no-left (16) ,    -- missing left end (5' for na, NH2 for aa)
191    no-right (17) ,   -- missing right end (3' or COOH)
192    macronuclear (18) ,
193    proviral (19) ,
194    est (20) ,        -- expressed sequence tag
195    sts (21) ,        -- sequence tagged site
196    survey (22) ,     -- one pass survey sequence
197    chromoplast (23) ,
198    genemap (24) ,    -- is a genetic map
199    restmap (25) ,    -- is an ordered restriction map
200    physmap (26) ,    -- is a physical map (not ordered restriction map)
201    other (255) }
202
-- GIBB-method: method carried by Seqdesc.method.  No value 0 is defined.  The
-- six named methods reappear, in the same order, as MolInfo.tech values 8 to 13.
203GIBB-method ::= ENUMERATED {        -- sequencing methods
204    concept-trans (1) ,    -- conceptual translation
205    seq-pept (2) ,         -- peptide was sequenced
206    both (3) ,             -- concept transl. w/ partial pept. seq.
207    seq-pept-overlap (4) , -- sequenced peptide, ordered by overlap
208    seq-pept-homol (5) ,   -- sequenced peptide, ordered by homology
209    concept-trans-a (6) ,  -- conceptual transl. supplied by author
210    other (255) }
211
-- Numbering: selects one of four numbering schemes, each defined by its own
-- Num-* type below.
212Numbering ::= CHOICE {           -- any display numbering system
213    cont Num-cont ,              -- continuous numbering
214    enum Num-enum ,              -- enumerated names for residues
215    ref Num-ref ,                -- by reference to another sequence
216    real Num-real }              -- supports mapping to a float system
217
-- Num-cont: every member has a DEFAULT, so an empty value means numbering that
-- starts at 1, skips 0, and ascends.
218Num-cont ::= SEQUENCE {          -- continuous display numbering system
219    refnum INTEGER DEFAULT 1,         -- number assigned to first residue
220    has-zero BOOLEAN DEFAULT FALSE ,  -- 0 used?
221    ascending BOOLEAN DEFAULT TRUE }  -- ascending numbers?
222
-- Num-enum: an explicit list of residue tags preceded by its count.
-- NOTE(review): the schema does not force num to equal the number of names;
-- confirm how consumers treat a mismatch.
223Num-enum ::= SEQUENCE {          -- any tags to residues
224    num INTEGER ,                        -- number of tags to follow
225    names SEQUENCE OF VisibleString }    -- the tags
226
-- Num-ref: numbering taken from other sequences; type says where the reference
-- comes from.
227Num-ref ::= SEQUENCE {           -- by reference to other sequences
228    type ENUMERATED {            -- type of reference
229        not-set (0) ,
230        sources (1) ,            -- by segmented or const seq sources
231        aligns (2) } ,           -- by alignments given below
232    aligns Seq-align OPTIONAL }  -- presumably supplied when type is aligns; not enforced here
233
-- Num-real: linear map from integer positions to real values, using the
-- formula given on the b member.
234Num-real ::= SEQUENCE {          -- mapping to floating point system
235    a REAL ,                     -- from an integer system used by Bioseq
236    b REAL ,                     -- position = (a * int_position) + b
237    units VisibleString OPTIONAL } -- optional free-text name of the units
238
-- Pubdesc: a citation plus optional notes about how the sequence appeared in
-- that publication.  Only pub is mandatory; reftype falls back to seq.
239Pubdesc ::= SEQUENCE {              -- how sequence presented in pub
240    pub Pub-equiv ,                 -- the citation(s)
241    name VisibleString OPTIONAL ,   -- name used in paper
242    fig VisibleString OPTIONAL ,    -- figure in paper
243    num Numbering OPTIONAL ,        -- numbering from paper
244    numexc BOOLEAN OPTIONAL ,       -- numbering problem with paper
245    poly-a BOOLEAN OPTIONAL ,       -- poly A tail indicated in figure?
246    maploc VisibleString OPTIONAL , -- map location reported in paper
247    seq-raw StringStore OPTIONAL ,  -- original sequence from paper
248    align-group INTEGER OPTIONAL ,  -- this seq aligned with others in paper
249    comment VisibleString OPTIONAL, -- any comment on this pub in context
250    reftype INTEGER {           -- type of reference in a GenBank record
251        seq (0) ,               -- refers to sequence
252        sites (1) ,             -- refers to unspecified features
253        feats (2) ,             -- refers to specified features
254        no-target (3) }         -- nothing specified (EMBL)
255        DEFAULT seq }
256
-- Heterogen: plain text alias; it is the type of the het alternative of Seqdesc.
257Heterogen ::= VisibleString       -- cofactor, prosthetic group, inhibitor, etc
258
259--*** Instances of sequences *******************************
260--*
261
-- Seq-inst: repr and mol are the only mandatory members.  topology falls back
-- to linear when absent; every other member is OPTIONAL.
262Seq-inst ::= SEQUENCE {            -- the sequence data itself
263    repr ENUMERATED {              -- representation class
264        not-set (0) ,              -- empty
265        virtual (1) ,              -- no seq data
266        raw (2) ,                  -- continuous sequence
267        seg (3) ,                  -- segmented sequence
268        const (4) ,                -- constructed sequence
269        ref (5) ,                  -- reference to another sequence
270        consen (6) ,               -- consensus sequence or pattern
271        map (7) ,                  -- ordered map of any kind
272        delta (8) ,              -- sequence made by changes (delta) to others
273        other (255) } ,
274    mol ENUMERATED {               -- molecule class in living organism
275        not-set (0) ,              --   > cdna = rna
276        dna (1) ,
277        rna (2) ,
278        aa (3) ,
279        na (4) ,                   -- just a nucleic acid
280        other (255) } ,
281    length INTEGER OPTIONAL ,      -- length of sequence in residues
282    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
283    topology ENUMERATED {          -- topology of molecule
284        not-set (0) ,
285        linear (1) ,
286        circular (2) ,
287        tandem (3) ,               -- some part of tandem repeat
288        other (255) } DEFAULT linear ,
289    strand ENUMERATED {            -- strandedness in living organism
290        not-set (0) ,
291        ss (1) ,                   -- single strand
292        ds (2) ,                   -- double strand
293        mixed (3) ,
294        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
295    seq-data Seq-data OPTIONAL ,   -- the sequence
296    ext Seq-ext OPTIONAL ,         -- extensions for special types
297    hist Seq-hist OPTIONAL }       -- sequence history
298
299--*** Sequence Extensions **********************************
300--*  for representing more complex types
301--*  const type uses Seq-hist.assembly
302
-- Seq-ext: the alternative names (seg, ref, map, delta) match four of the
-- Seq-inst.repr values.
303Seq-ext ::= CHOICE {
304    seg Seg-ext ,        -- segmented sequences
305    ref Ref-ext ,        -- hot link to another sequence (a view)
306    map Map-ext ,        -- ordered map of markers
307    delta Delta-ext }    -- ordered list of Delta-seq pieces
308
309Seg-ext ::= SEQUENCE OF Seq-loc   -- ordered list of locations, one per segment
310
311Ref-ext ::= Seq-loc   -- a single location on the referenced sequence
312
313Map-ext ::= SEQUENCE OF Seq-feat   -- ordered list of features (the map markers)
314
315Delta-ext ::= SEQUENCE OF Delta-seq   -- ordered list of pieces; see Delta-seq
316
-- Delta-seq: one element of a Delta-ext; either a pointer or inline content.
317Delta-seq ::= CHOICE {
318    loc Seq-loc ,       -- point to a sequence
319    literal Seq-literal }   -- a piece of sequence
320
-- Seq-literal: an inline piece of sequence.  Only length is mandatory, so a
-- literal may state a length without carrying any residues.
321Seq-literal ::= SEQUENCE {
322    length INTEGER ,         -- must give a length in residues
323    fuzz Int-fuzz OPTIONAL , -- could be unsure
324    seq-data Seq-data OPTIONAL } -- may have the data
325
326--*** Sequence History Record ***********************************
327--** assembly = records how seq was assembled from others
328--** replaces = records sequences made obsolete by this one
329--** replaced-by = this seq is made obsolete by another(s)
330
-- Seq-hist: all four members are OPTIONAL, so an empty history is valid.
331Seq-hist ::= SEQUENCE {
332    assembly SET OF Seq-align OPTIONAL ,-- how was this assembled?
333    replaces Seq-hist-rec OPTIONAL ,    -- seq makes these seqs obsolete
334    replaced-by Seq-hist-rec OPTIONAL , -- these seqs make this one obsolete
335    deleted CHOICE {                    -- deletion marker: a flag or a date
336        bool BOOLEAN ,
337        date Date } OPTIONAL }
338
-- Seq-hist-rec: the record type used by Seq-hist.replaces and
-- Seq-hist.replaced-by.
339Seq-hist-rec ::= SEQUENCE {
340    date Date OPTIONAL ,     -- optional date of the event
341    ids SET OF Seq-id }      -- identifiers of the sequences involved
342
343--*** Various internal sequence representations ************
344--*      all are controlled, fixed length forms
345
-- Seq-data: exactly one encoding of the residues, or a gap.  The storage type
-- behind each encoding alternative is defined after Linkage-evidence.
346Seq-data ::= CHOICE {              -- sequence representations
347    iupacna IUPACna ,              -- IUPAC 1 letter nuc acid code
348    iupacaa IUPACaa ,              -- IUPAC 1 letter amino acid code
349    ncbi2na NCBI2na ,              -- 2 bit nucleic acid code
350    ncbi4na NCBI4na ,              -- 4 bit nucleic acid code
351    ncbi8na NCBI8na ,              -- 8 bit extended nucleic acid code
352    ncbipna NCBIpna ,              -- nucleic acid probabilities
353    ncbi8aa NCBI8aa ,              -- 8 bit extended amino acid codes
354    ncbieaa NCBIeaa ,              -- extended ASCII 1 letter aa codes
355    ncbipaa NCBIpaa ,              -- amino acid probabilities
356    ncbistdaa NCBIstdaa,           -- consecutive codes for std aas
357    gap Seq-gap                    -- gap types
358}
359
-- Seq-gap: describes a gap.  type is mandatory; linkage and linkage-evidence
-- are optional.
360Seq-gap ::= SEQUENCE {
361    type INTEGER {                 -- kind of gap
362        unknown(0),
363        fragment(1),               -- Deprecated. Used only for AGP 1.1
364        clone(2),                  -- Deprecated. Used only for AGP 1.1
365        short-arm(3),
366        heterochromatin(4),
367        centromere(5),
368        telomere(6),
369        repeat(7),
370        contig(8),
371        scaffold(9),
372        contamination(10),
373        other(255)
374    },
375    linkage INTEGER {              -- linkage status across the gap
376        unlinked(0),
377        linked(1),
378        other(255)
379    } OPTIONAL,
380    linkage-evidence SET OF Linkage-evidence OPTIONAL  -- zero or more evidence records
381}
382
-- Linkage-evidence: one evidence record, used in Seq-gap.linkage-evidence.
-- Its only member is the evidence type.
383Linkage-evidence ::= SEQUENCE {
384    type INTEGER {
385        paired-ends(0),
386        align-genus(1),
387        align-xgenus(2),
388        align-trnscpt(3),
389        within-clone(4),
390        clone-contig(5),
391        map(6),
392        strobe(7),
393        unspecified(8),
394        pcr(9),
395        proximity-ligation(10),
396        other(255)
397    }
398}
399
-- Storage types behind the Seq-data alternatives.  The letter-code encodings
-- (IUPACna, IUPACaa, NCBIeaa) are StringStore; all others are OCTET STRING.
400IUPACna ::= StringStore       -- IUPAC 1 letter codes, no spaces
401IUPACaa ::= StringStore       -- IUPAC 1 letter codes, no spaces
402NCBI2na ::= OCTET STRING      -- 00=A, 01=C, 10=G, 11=T
403NCBI4na ::= OCTET STRING      -- 1 bit each for agct
404                              -- 0001=A, 0010=C, 0100=G, 1000=T/U
405                              -- 0101=Purine, 1010=Pyrimidine, etc
406NCBI8na ::= OCTET STRING      -- for modified nucleic acids
407NCBIpna ::= OCTET STRING      -- 5 octets/base, prob for a,c,g,t,n
408                              -- probabilities are coded 0-255 = 0.0-1.0
409NCBI8aa ::= OCTET STRING      -- for modified amino acids
410NCBIeaa ::= StringStore       -- ASCII extended 1 letter aa codes
411                              -- IUPAC codes + U=selenocysteine
412NCBIpaa ::= OCTET STRING      -- 25 octets/aa, prob for IUPAC aas in order:
413                              -- A-Y,B,Z,X,(ter),anything
414                              -- probabilities are coded 0-255 = 0.0-1.0
415NCBIstdaa ::= OCTET STRING    -- codes 0-25, 1 per byte
416
417--*** Sequence Annotation *************************************
418--*
419
420-- This is a replica of Textseq-id
421-- This is specific for annotations, and exists to maintain a semantic
422-- difference between IDs assigned to annotations and IDs assigned to
423-- sequences
424Textannot-id ::= SEQUENCE {     -- text identifier; all four members are optional
425    name	  VisibleString OPTIONAL ,
426    accession VisibleString OPTIONAL ,
427    release   VisibleString OPTIONAL ,
428    version   INTEGER       OPTIONAL
429}
430
-- Annot-id: identifier for an annotation, in one of four forms.  It is the
-- element type of Seq-annot.id.
431Annot-id ::= CHOICE {
432    local Object-id ,        -- an Object-id value
433    ncbi INTEGER ,           -- an integer identifier
434    general Dbtag,           -- a Dbtag value
435    other Textannot-id       -- a text identifier (see Textannot-id)
436}
437
438Annot-descr ::= SET OF Annotdesc   -- unordered collection of Annotdesc descriptors
439
-- Annotdesc: a single descriptor for an annotation collection.  Being a
-- CHOICE, each value holds exactly one of the alternatives below.
440Annotdesc ::= CHOICE {
441    name VisibleString ,         -- a short name for this collection
442    title VisibleString ,        -- a title for this collection
443    comment VisibleString ,      -- a more extensive comment
444    pub Pubdesc ,                -- a reference to the publication
445    user User-object ,           -- user defined object
446    create-date Date ,           -- date entry first created/released
447    update-date Date ,           -- date of last update
448    src Seq-id ,                 -- source sequence from which annot came
449    align Align-def,             -- definition of the SeqAligns
450    region Seq-loc }             -- all contents cover this region
451
-- Align-def: classifies a set of alignments; it is the type of
-- Annotdesc.align.  align-type is mandatory and has no value 0.
452Align-def ::= SEQUENCE {
453    align-type INTEGER {         -- class of align Seq-annot
454      ref (1) ,                  -- set of alignments to the same sequence
455      alt (2) ,                  -- set of alternate alignments of the same seqs
456      blocks (3) ,               -- set of aligned blocks in the same seqs
457      other (255) } ,
458    ids SET OF Seq-id OPTIONAL } -- used for the one ref seqid for now
459
-- Seq-annot: a block of annotation.  Only data is mandatory; it is a CHOICE,
-- so one Seq-annot holds a single kind of content (features, alignments,
-- graphs, ids, locations, or a Seq-table).
460Seq-annot ::= SEQUENCE {
461    id SET OF Annot-id OPTIONAL , -- identifiers of this annotation block
462    db INTEGER {                 -- source of annotation
463        genbank (1) ,
464        embl (2) ,
465        ddbj (3) ,
466        pir  (4) ,
467        sp   (5) ,
468        bbone (6) ,
469        pdb   (7) ,
470        other (255) } OPTIONAL ,
471    name VisibleString OPTIONAL ,-- source if "other" above
472    desc Annot-descr OPTIONAL ,  -- used only for stand alone Seq-annots
473    data CHOICE {
474        ftable SET OF Seq-feat , -- set of features
475        align SET OF Seq-align , -- set of alignments
476        graph SET OF Seq-graph , -- set of graphs
477        ids SET OF Seq-id ,      -- used for communication between tools
478        locs SET OF Seq-loc ,    -- used for communication between tools
479        seq-table Seq-table } }  -- features in table form
480
481END
482
483
484