--$Revision: 370718 $
--**********************************************************************
--
--  NCBI Sequence Alignment elements
--  by James Ostell, 1990
--
--**********************************************************************

NCBI-Seqalign DEFINITIONS ::=
BEGIN

EXPORTS Seq-align, Score, Score-set, Seq-align-set;

IMPORTS Seq-id, Seq-loc , Na-strand FROM NCBI-Seqloc
        User-object, Object-id FROM NCBI-General;

--*** Sequence Alignment ********************************
--*

Seq-align-set ::= SET OF Seq-align

Seq-align ::= SEQUENCE {
    type ENUMERATED {
        not-set (0) ,
        global (1) ,
        diags (2) ,     -- unbroken, but not ordered, diagonals
        partial (3) ,   -- mapping pieces together
        disc (4) ,      -- discontinuous alignment
        other (255) } ,
    dim INTEGER OPTIONAL ,     -- dimensionality
    score SET OF Score OPTIONAL ,   -- for whole alignment
    segs CHOICE {                   -- alignment data
        dendiag SEQUENCE OF Dense-diag ,
        denseg              Dense-seg ,
        std     SEQUENCE OF Std-seg ,
        packed              Packed-seg ,
        disc                Seq-align-set,
        spliced             Spliced-seg,
        sparse              Sparse-seg
    } ,
    
    -- regions of sequence over which align
    --  was computed
    bounds SET OF Seq-loc OPTIONAL,

    -- alignment id
    id SEQUENCE OF Object-id OPTIONAL,

    --extra info
    ext SEQUENCE OF User-object OPTIONAL
}

Dense-diag ::= SEQUENCE {         -- for (multiway) diagonals
    dim INTEGER DEFAULT 2 ,    -- dimensionality
    ids SEQUENCE OF Seq-id ,   -- sequences in order
    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order
    len INTEGER ,                 -- len of aligned segments
    strands SEQUENCE OF Na-strand OPTIONAL ,
    scores SET OF Score OPTIONAL }

    -- Dense-seg: the densist packing for sequence alignments only.
    --            a start of -1 indicates a gap for that sequence of
    --            length lens.
    --
    -- id=100  AAGGCCTTTTAGAGATGATGATGATGATGA
    -- id=200  AAGGCCTTTTAG.......GATGATGATGA
    -- id=300  ....CCTTTTAGAGATGATGAT....ATGA
    --
    -- dim = 3, numseg = 6, ids = { 100, 200, 300 }
    -- starts = { 0,0,-1, 4,4,0, 12,-1,8, 19,12,15, 22,15,-1, 26,19,18 }
    -- lens = { 4, 8, 7, 3, 4, 4 }
    --

Dense-seg ::= SEQUENCE {          -- for (multiway) global or partial alignments
    dim INTEGER DEFAULT 2 ,       -- dimensionality
    numseg INTEGER ,              -- number of segments here
    ids SEQUENCE OF Seq-id ,      -- sequences in order
    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order within segs
    lens SEQUENCE OF INTEGER ,    -- lengths in ids order within segs
    strands SEQUENCE OF Na-strand OPTIONAL ,
    scores SEQUENCE OF Score OPTIONAL }  -- score for each seg

Packed-seg ::= SEQUENCE {         -- for (multiway) global or partial alignments
    dim INTEGER DEFAULT 2 ,       -- dimensionality
    numseg INTEGER ,              -- number of segments here
    ids SEQUENCE OF Seq-id ,      -- sequences in order
    starts SEQUENCE OF INTEGER ,  -- start OFFSETS in ids order for whole alignment
    present OCTET STRING ,        -- Boolean if each sequence present or absent in
                                  --   each segment
    lens SEQUENCE OF INTEGER ,    -- length of each segment
    strands SEQUENCE OF Na-strand OPTIONAL ,
    scores SEQUENCE OF Score OPTIONAL }  -- score for each segment

Std-seg ::= SEQUENCE {
    dim INTEGER DEFAULT 2 ,       -- dimensionality
    ids SEQUENCE OF Seq-id OPTIONAL ,
    loc SEQUENCE OF Seq-loc ,
    scores SET OF Score OPTIONAL }


Spliced-seg ::= SEQUENCE {
    -- product is either protein or transcript (cDNA)
    product-id Seq-id OPTIONAL,
    genomic-id Seq-id OPTIONAL,

    -- should be 'plus' or 'minus'
    product-strand Na-strand OPTIONAL ,
    genomic-strand Na-strand OPTIONAL ,
    
    product-type ENUMERATED {
        transcript(0),
        protein(1)
    },

    -- set of segments involved
    -- each segment corresponds to one exon
    -- exons are always in biological order
    exons SEQUENCE OF Spliced-exon ,

    -- start of poly(A) tail on the transcript
    -- For sense transcripts:
    --   aligned product positions < poly-a <= product-length
    --   poly-a == product-length indicates inferred poly(A) tail at transcript's end
    -- For antisense transcripts:
    --   -1 <= poly-a < aligned product positions
    --   poly-a == -1 indicates inferred poly(A) tail at transcript's start
    poly-a INTEGER OPTIONAL,

    -- length of the product, in bases/residues
    -- from this (or from poly-a if present), a 3' unaligned length can be extracted
    product-length INTEGER OPTIONAL,

    -- alignment descriptors / modifiers
    -- this provides us a set for extension
    modifiers SET OF Spliced-seg-modifier OPTIONAL
}

Spliced-seg-modifier ::= CHOICE {
    -- protein aligns from the start and the first codon 
    -- on both product and genomic is start codon
    start-codon-found BOOLEAN,
    
    -- protein aligns to it's end and there is stop codon 
    -- on the genomic right after the alignment
    stop-codon-found BOOLEAN
}


-- complete or partial exon
-- two consecutive Spliced-exons may belong to one exon
Spliced-exon ::= SEQUENCE {
    -- product-end >= product-start
    product-start Product-pos ,
    product-end Product-pos ,

    -- genomic-end >= genomic-start
    genomic-start INTEGER ,
    genomic-end INTEGER ,

    -- product is either protein or transcript (cDNA)
    product-id Seq-id OPTIONAL ,
    genomic-id Seq-id OPTIONAL ,

    -- should be 'plus' or 'minus'
    product-strand Na-strand OPTIONAL ,
    
    -- genomic-strand represents the strand of translation
    genomic-strand Na-strand OPTIONAL ,

    -- basic seqments always are in biologic order
    parts SEQUENCE OF Spliced-exon-chunk OPTIONAL ,

    -- scores for this exon
    scores Score-set OPTIONAL ,

    -- splice sites
    acceptor-before-exon Splice-site OPTIONAL,
    donor-after-exon Splice-site OPTIONAL,
    
    -- flag: is this exon complete or partial?
    partial BOOLEAN OPTIONAL,

    --extra info
    ext SEQUENCE OF User-object OPTIONAL
}


Product-pos ::= CHOICE {
    nucpos INTEGER,
    protpos Prot-pos
}


-- position on protein (1/3 of amino-acid resolution)
Prot-pos ::= SEQUENCE {
    -- amino-acid position (0-based)
    amin INTEGER ,

    -- position within codon (1-based)
    -- 0 = not set (meaning 1)
    frame INTEGER DEFAULT 0
}


-- Spliced-exon-chunk: piece of an exon
-- lengths are given in nucleotide bases (1/3 of aminoacid when product is a
-- protein)
Spliced-exon-chunk ::= CHOICE {
    -- both sequences represented, product and genomic sequences match
    match INTEGER ,

    -- both sequences represented, product and genomic sequences do not match
    mismatch INTEGER ,

    -- both sequences are represented, there is sufficient similarity 
    -- between product and genomic sequences. Can be used to replace stretches
    -- of matches and mismatches, mostly for protein to genomic where 
    -- definition of match or mismatch depends on translation table
    diag INTEGER ,

     -- insertion in product sequence (i.e. gap in the genomic sequence)
    product-ins INTEGER ,

     -- insertion in genomic sequence (i.e. gap in the product sequence)
    genomic-ins INTEGER
}


-- site involved in splice
Splice-site ::= SEQUENCE {
    -- typically two bases in the intronic region, always
    -- in IUPAC format
    bases VisibleString
}


-- ==========================================================================
--
-- Sparse-seg follows the semantics of dense-seg and is more optimal for
-- representing sparse multiple alignments
--
-- ==========================================================================


Sparse-seg ::= SEQUENCE {
    master-id Seq-id OPTIONAL,

    -- pairwise alignments constituting this multiple alignment
    rows SET OF Sparse-align,

    -- per-row scores
    row-scores SET OF Score OPTIONAL,

    -- index of extra items
    ext  SET OF Sparse-seg-ext OPTIONAL
}

Sparse-align ::= SEQUENCE {
    first-id Seq-id,
    second-id Seq-id,

    numseg INTEGER,                      --number of segments
    first-starts SEQUENCE OF INTEGER ,   --starts on the first sequence [numseg]
    second-starts SEQUENCE OF INTEGER ,  --starts on the second sequence [numseg]
    lens SEQUENCE OF INTEGER ,           --lengths of segments [numseg]
    second-strands SEQUENCE OF Na-strand OPTIONAL ,

    -- per-segment scores
    seg-scores SET OF Score OPTIONAL
}

Sparse-seg-ext ::= SEQUENCE {
    --seg-ext SET OF {
    --    index INTEGER,
    --    data User-field
    -- }
    index INTEGER
}


-- use of Score is discouraged for external ASN.1 specifications
Score ::= SEQUENCE {
    id Object-id OPTIONAL ,
    value CHOICE {
        real REAL ,
        int INTEGER
    }
}

-- use of Score-set is encouraged for external ASN.1 specifications
Score-set ::= SET OF Score

END