1/*===========================================================================
2*
3*                            PUBLIC DOMAIN NOTICE
4*               National Center for Biotechnology Information
5*
6*  This software/database is a "United States Government Work" under the
7*  terms of the United States Copyright Act.  It was written as part of
8*  the author's official duties as a United States Government employee and
9*  thus cannot be copyrighted.  This software/database is freely available
10*  to the public for use. The National Library of Medicine and the U.S.
11*  Government have not placed any restriction on its use or reproduction.
12*
13*  Although all reasonable efforts have been taken to ensure the accuracy
14*  and reliability of the software and data, the NLM and the U.S.
15*  Government do not and cannot warrant the performance or results that
16*  may be obtained by using this software or data. The NLM and the U.S.
17*  Government disclaim all warranties, express or implied, including
18*  warranties of performance, merchantability or fitness for any particular
19*  purpose.
20*
21*  Please cite the author in any work or product based on this material.
22*
23* ===========================================================================
24*
25*/
26
27/*==========================================================================
28 * INSDC types, constants
29 */
30version 1;
31
32
33/*--------------------------------------------------------------------------
34 * dna
35 *  represented in IUPAC characters
36 */
// DNA sequence text: an alias of ascii whose content is IUPAC characters
typedef ascii INSDC:dna:text;
38
39
40/*--------------------------------------------------------------------------
41 * 4na
42 *  nucleotide data with all possible ambiguity
43 *  does not represent all possible EVENTS
44 *
45 *  text encodings use the IUPAC character set
46 *  legal values: [ACMGRSVTWYHKDBNacmgrsvtwyhkdbn.]
47 *  canonical values: [ACMGRSVTWYHKDBN]
48 *
 *  binary values are 0..15 => { .ACMGRSVTWYHKDBN }
 *
 *  4na values use bits for each letter:
 *
 *       A | C | G | T
 *    ==================
 *    .    |   |   |
56 *    A  * |   |   |
57 *    C    | * |   |
58 *    M  * | * |   |
59 *    G    |   | * |
60 *    R  * |   | * |
61 *    S    | * | * |
62 *    V  * | * | * |
63 *    T    |   |   | *
64 *    W  * |   |   | *
65 *    Y    | * |   | *
66 *    H  * | * |   | *
67 *    K    |   | * | *
68 *    D  * |   | * | *
69 *    B    | * | * | *
70 *    N  * | * | * | *
71 */
// binary 4na: a U8 carrying one of the values 0..15
typedef U8 INSDC:4na:bin;

// packed 4na: an array of four B1 elements
typedef B1 INSDC:4na:packed [ 4 ];

// all sixteen 4na binary values, in ascending order
const INSDC:4na:bin INSDC:4na:map:BINSET =
    [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ];

// text characters for the 4na values, listed in the same order as BINSET
// ( '.' is first, 'N' is last )
const INSDC:dna:text INSDC:4na:map:CHARSET = ".ACMGRSVTWYHKDBN";

// legal 4na text characters: the CHARSET above plus its lower-case letters
const INSDC:dna:text INSDC:4na:accept:CHARSET = ".ACMGRSVTWYHKDBNacmgrsvtwyhkdbn";
81
82/*--------------------------------------------------------------------------
83 * 2na  - nucleotide data A,T,G,C
84 * x2na - nucleotide data extended with single ambiguity value (N)
85 *
86 *  text encodings use the IUPAC character set
87 *  legal values: [ACGTNacgtn.]
88 *  canonical values: [ACGTN]
89 *
90 *  x2na values are 0..4 => { ACGTN } or { ACGUN }
91 *
92 *  2na values exclude N:
93 *    A = 0
94 *    C = 1
95 *    G = 2
96 *    T = 3
97 */
// binary 2na ( values 0..3 ) and x2na ( values 0..4 ), both carried in a U8
typedef U8 INSDC:2na:bin;
typedef U8 INSDC:x2na:bin;

// packed 2na: an array of two B1 elements
typedef B1 INSDC:2na:packed [ 2 ];

// 2na tables: binary values, their text characters in the same order,
// and the legal text characters ( upper and lower case, no 'N' )
const INSDC:2na:bin INSDC:2na:map:BINSET = [ 0, 1, 2, 3 ];
const INSDC:dna:text INSDC:2na:map:CHARSET = "ACGT";
const INSDC:dna:text INSDC:2na:accept:CHARSET = "ACGTacgt";

// x2na tables: as for 2na, extended with value 4 / character 'N';
// '.' is legal in text but has no entry of its own in the map CHARSET
const INSDC:x2na:bin INSDC:x2na:map:BINSET = [ 0, 1, 2, 3, 4 ];
const INSDC:dna:text INSDC:x2na:map:CHARSET = "ACGTN";
const INSDC:dna:text INSDC:x2na:accept:CHARSET = "ACGTNacgtn.";
109
110
111/*--------------------------------------------------------------------------
112 * color - color-space text
113 * 2cs   - color-space data 0,1,2,3
114 * x2cs  - color-space data extended with single ambiguity value (.)
115 *
116 *  text encodings use the ASCII numeric character set
117 *  values: [0123.]
118 *
119 *  x2cs values are 0..4 = { 0123. }
120 *
121 *  2cs values exclude '.':
122 *    '0' = 0
123 *    '1' = 1
124 *    '2' = 2
125 *    '3' = 3
126 */
// color-space text: an alias of ascii holding the characters [0123.]
typedef ascii INSDC:color:text;

// binary 2cs ( values 0..3 ) and x2cs ( values 0..4 ), both carried in a U8
typedef U8 INSDC:2cs:bin;
typedef U8 INSDC:x2cs:bin;

// packed 2cs: an array of two B1 elements
typedef B1 INSDC:2cs:packed [ 2 ];

// 2cs tables: binary values, their text characters in the same order,
// and the legal text characters ( identical to the map CHARSET )
const INSDC:2cs:bin INSDC:2cs:map:BINSET = [ 0, 1, 2, 3 ];
const INSDC:color:text INSDC:2cs:map:CHARSET = "0123";
const INSDC:color:text INSDC:2cs:accept:CHARSET = "0123";

// x2cs tables: as for 2cs, extended with value 4 / character '.'
const INSDC:x2cs:bin INSDC:x2cs:map:BINSET = [ 0, 1, 2, 3, 4 ];
const INSDC:color:text INSDC:x2cs:map:CHARSET = "0123.";
const INSDC:color:text INSDC:x2cs:accept:CHARSET = "0123.";
138
// default color matrix: 25 U8 values laid out as five rows of five.
// The table is symmetric; within the first four rows and columns each
// entry equals ( row XOR column ), and every entry in the last row or
// last column is 4.
// NOTE(review): presumably rows and columns are indexed by the values
// 0..4 used elsewhere in this file - confirm against the consumers.
const U8 INSDC:color:default_matrix = [
    0, 1, 2, 3, 4,
    1, 0, 3, 2, 4,
    2, 3, 0, 1, 4,
    3, 2, 1, 0, 4,
    4, 4, 4, 4, 4
];
147
148/*--------------------------------------------------------------------------
149 * protein
150 *  represented in IUPAC characters
151 */
// protein sequence text: an alias of ascii whose content is IUPAC characters
typedef ascii INSDC:protein:text;
153
154/*--------------------------------------------------------------------------
155 * aa
156 *  protein data
157 *  text encodings use the IUPAC character set
158 */
// binary amino-acid value, carried in a U8
typedef U8 INSDC:aa:bin;

// the 27 binary values; note that numbering starts at 1, not 0
const INSDC:aa:bin INSDC:aa:map:BINSET =
[
     1,  2,  3,  4,  5,  6,  7,  8,  9,
    10, 11, 12, 13, 14, 15, 16, 17, 18,
    19, 20, 21, 22, 23, 24, 25, 26, 27
];

// the 27 text characters, listed in the same order as BINSET
// ( 'U', '*', 'O' and 'J' come last rather than in alphabetical position )
const INSDC:protein:text INSDC:aa:map:CHARSET = "ABCDEFGHIKLMNPQRSTVWXYZU*OJ";

// legal protein text characters: the same letters in upper and lower case,
// plus '*'
const INSDC:protein:text INSDC:aa:accept:CHARSET =
    "ABCDEFGHIJKLMNOPQRSTVWXYZU*abcdefghijklmnopqrstvwxyzu";
167
168
169/*--------------------------------------------------------------------------
170 * quality
171 *  quality scoring values
172 *
173 *  phred legal values: 0..63
174 */
// binary quality scores: phred is unsigned, log-odds is signed
typedef U8 INSDC:quality:phred;
typedef I8 INSDC:quality:log_odds;

// quality scores encoded as text;
// the numeric suffix is the character offset: 33 is '!' and 64 is '@'
typedef ascii INSDC:quality:text:phred_33;
typedef ascii INSDC:quality:text:phred_64;
typedef ascii INSDC:quality:text:log_odds_64;
183
184
185/*--------------------------------------------------------------------------
186 * coordinate
187 *  zero and one based coordinates
188 */
189
// coordinates are 32 bits wide: values are signed, lengths are unsigned
typedef I32 INSDC:coord:val;
typedef U32 INSDC:coord:len;

// the same value type, named for the coordinate system it is expressed in
typedef INSDC:coord:val INSDC:coord:zero;
typedef INSDC:coord:val INSDC:coord:one;

// POSITION types relate bases to their location in signal
typedef INSDC:coord:zero INSDC:position:zero;
typedef INSDC:coord:one INSDC:position:one;

// coordinate limits, written as 32-bit hex patterns;
// each one-based limit is the matching zero-based limit plus one

// zero-based
const INSDC:coord:zero INSDC:coord:min:zero = 0x80000000;
const INSDC:coord:zero INSDC:coord:max:zero = 0x3FFFFFFE;

// one-based
const INSDC:coord:one INSDC:coord:min:one = 0x80000001;
const INSDC:coord:one INSDC:coord:max:one = 0x3FFFFFFF;
209
210/*-------------------------------------------------------------------------
 * spot and read filter bits
212 */
// per-read filter: a U8 holding one of the four values below
typedef U8 INSDC:SRA:read_filter;
const INSDC:SRA:read_filter SRA_READ_FILTER_PASS     = 0;
const INSDC:SRA:read_filter SRA_READ_FILTER_REJECT   = 1;
const INSDC:SRA:read_filter SRA_READ_FILTER_CRITERIA = 2;
const INSDC:SRA:read_filter SRA_READ_FILTER_REDACTED = 3;

// per-spot filter: same four names and values as the read filter,
// declared against its own type
typedef U8 INSDC:SRA:spot_filter;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_PASS     = 0;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_REJECT   = 1;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_CRITERIA = 2;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_REDACTED = 3;
224
225/*-------------------------------------------------------------------------
226 * read type bits
227 */
// extended read type, carried in a U8
typedef U8 INSDC:SRA:xread_type;

// technical / biological occupy the values 0 and 1
const INSDC:SRA:xread_type SRA_READ_TYPE_TECHNICAL = 0;
const INSDC:SRA:xread_type SRA_READ_TYPE_BIOLOGICAL = 1;

// forward / reverse occupy the values 2 and 4
const INSDC:SRA:xread_type SRA_READ_TYPE_FORWARD = 2;
const INSDC:SRA:xread_type SRA_READ_TYPE_REVERSE = 4;

// the original read type covered only technical and biological;
// it is kept as an alias of the extended type
typedef INSDC:SRA:xread_type INSDC:SRA:read_type;
236
237