/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * INSDC types and constants
 *
 *  Declares the shared element types (nucleotide, color-space, protein,
 *  quality, coordinate) and the enumerated filter / read-type values.
 */
version 1;


/*==========================================================================
 * dna
 *  nucleotide text, spelled with IUPAC characters
 */
typedef ascii INSDC:dna:text;


/*==========================================================================
 * 4na
 *  nucleotide data carrying every possible base ambiguity
 *  (it does not represent all possible EVENTS)
 *
 *  text form uses the IUPAC character set
 *    accepted  : [ACMGRSVTWYHKDBNacmgrsvtwyhkdbn.]
 *    canonical : [ACMGRSVTWYHKDBN]
 *
 *  binary form is 0..15, one bit per base letter:
 *
 *    val  chr    A | C | G | T
 *    ===  ===   ===============
 *      0   N       |   |   |
 *      1   A     * |   |   |
 *      2   C       | * |   |
 *      3   M     * | * |   |
 *      4   G       |   | * |
 *      5   R     * |   | * |
 *      6   S       | * | * |
 *      7   V     * | * | * |
 *      8   T       |   |   | *
 *      9   W     * |   |   | *
 *     10   Y       | * |   | *
 *     11   H     * | * |   | *
 *     12   K       |   | * | *
 *     13   D     * |   | * | *
 *     14   B       | * | * | *
 *     15   N     * | * | * | *
 *
 *  NOTE(review): the table above names value 0 (no bits set) 'N', but
 *  INSDC:4na:map:CHARSET below spells position 0 as '.' - confirm which
 *  is intended before relying on either.
 */
typedef U8 INSDC:4na:bin;
typedef B1 INSDC:4na:packed [ 4 ];

const INSDC:4na:bin  INSDC:4na:map:BINSET    = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ];
const INSDC:dna:text INSDC:4na:map:CHARSET   = ".ACMGRSVTWYHKDBN";
const INSDC:dna:text INSDC:4na:accept:CHARSET = ".ACMGRSVTWYHKDBNacmgrsvtwyhkdbn";


/*==========================================================================
 * 2na  - nucleotide data restricted to A, C, G, T
 * x2na - 2na extended with one ambiguity value (N)
 *
 *  text form uses the IUPAC character set
 *    accepted  : [ACGTNacgtn.]
 *    canonical : [ACGTN]
 *
 *  x2na binary values are 0..4 => { ACGTN } or { ACGUN }
 *
 *  2na binary values leave out N:
 *    A = 0, C = 1, G = 2, T = 3
 */
typedef U8 INSDC:2na:bin;
typedef U8 INSDC:x2na:bin;
typedef B1 INSDC:2na:packed [ 2 ];

const INSDC:2na:bin  INSDC:2na:map:BINSET
    = [ 0, 1, 2, 3 ];
const INSDC:dna:text INSDC:2na:map:CHARSET
    = "ACGT";
const INSDC:dna:text INSDC:2na:accept:CHARSET
    = "ACGTacgt";

const INSDC:x2na:bin INSDC:x2na:map:BINSET
    = [ 0, 1, 2, 3, 4 ];
const INSDC:dna:text INSDC:x2na:map:CHARSET
    = "ACGTN";
const INSDC:dna:text INSDC:x2na:accept:CHARSET
    = "ACGTNacgtn.";


/*==========================================================================
 * color - color-space text
 * 2cs   - color-space data 0, 1, 2, 3
 * x2cs  - 2cs extended with one ambiguity value (.)
 *
 *  text form uses ASCII digits plus '.'
 *    values : [0123.]
 *
 *  x2cs binary values are 0..4 => { 0123. }
 *
 *  2cs binary values leave out '.':
 *    '0' = 0, '1' = 1, '2' = 2, '3' = 3
 */
typedef ascii INSDC:color:text;
typedef U8 INSDC:2cs:bin;
typedef U8 INSDC:x2cs:bin;
typedef B1 INSDC:2cs:packed [ 2 ];

const INSDC:2cs:bin    INSDC:2cs:map:BINSET
    = [ 0, 1, 2, 3 ];
const INSDC:color:text INSDC:2cs:map:CHARSET
    = "0123";
const INSDC:color:text INSDC:2cs:accept:CHARSET
    = "0123";
const INSDC:x2cs:bin   INSDC:x2cs:map:BINSET
    = [ 0, 1, 2, 3, 4 ];
const INSDC:color:text INSDC:x2cs:map:CHARSET
    = "0123.";
const INSDC:color:text INSDC:x2cs:accept:CHARSET
    = "0123.";

/* 25 entries laid out as a 5 x 5 grid.
 * Within indices 0..3 each entry equals (row XOR column);
 * every entry in row 4 or column 4 is 4.
 */
const U8 INSDC:color:default_matrix = [
    0, 1, 2, 3, 4,
    1, 0, 3, 2, 4,
    2, 3, 0, 1, 4,
    3, 2, 1, 0, 4,
    4, 4, 4, 4, 4
];


/*==========================================================================
 * protein
 *  protein text, spelled with IUPAC characters
 */
typedef ascii INSDC:protein:text;


/*==========================================================================
 * aa
 *  protein data
 *  text form uses the IUPAC character set
 *
 *  The map BINSET and map CHARSET below both hold 27 entries
 *  (values 1..27 and the letters "ABCDEFGHIKLMNPQRSTVWXYZU*OJ").
 */
typedef U8 INSDC:aa:bin;

const INSDC:aa:bin       INSDC:aa:map:BINSET = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
];
const INSDC:protein:text INSDC:aa:map:CHARSET    = "ABCDEFGHIKLMNPQRSTVWXYZU*OJ";
const INSDC:protein:text INSDC:aa:accept:CHARSET = "ABCDEFGHIJKLMNOPQRSTVWXYZU*abcdefghijklmnopqrstvwxyzu";


/*==========================================================================
 * quality
 *  quality scoring values
 *
 *  phred legal values: 0..63
 */
typedef U8 INSDC:quality:phred;
typedef I8 INSDC:quality:log_odds;

/* text encodings of quality scores;
 * the offsets are 33 = '!' and 64 = '@'
 */
typedef ascii INSDC:quality:text:phred_33;
typedef ascii INSDC:quality:text:phred_64;
typedef ascii INSDC:quality:text:log_odds_64;


/*==========================================================================
 * coordinate
 *  zero-based and one-based coordinates
 */

/* 32-bit coordinate value and length */
typedef I32 INSDC:coord:val;
typedef U32 INSDC:coord:len;

/* the same value type, tagged by coordinate system */
typedef INSDC:coord:val INSDC:coord:zero;
typedef INSDC:coord:val INSDC:coord:one;

/* POSITION types, relating bases to their location in signal */
typedef INSDC:coord:zero INSDC:position:zero;
typedef INSDC:coord:one  INSDC:position:one;

/* limits for one-based coordinates */
const INSDC:coord:one  INSDC:coord:min:one  = 0x80000001;
const INSDC:coord:one  INSDC:coord:max:one  = 0x3FFFFFFF;

/* limits for zero-based coordinates (each one less than the one-based limit) */
const INSDC:coord:zero INSDC:coord:min:zero = 0x80000000;
const INSDC:coord:zero INSDC:coord:max:zero = 0x3FFFFFFE;


/*==========================================================================
 * spot and read filter values
 *  both filters share the same four values, 0..3
 */
typedef U8 INSDC:SRA:read_filter;
const INSDC:SRA:read_filter SRA_READ_FILTER_PASS     = 0;
const INSDC:SRA:read_filter SRA_READ_FILTER_REJECT   = 1;
const INSDC:SRA:read_filter SRA_READ_FILTER_CRITERIA = 2;
const INSDC:SRA:read_filter SRA_READ_FILTER_REDACTED = 3;

typedef U8 INSDC:SRA:spot_filter;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_PASS     = 0;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_REJECT   = 1;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_CRITERIA = 2;
const INSDC:SRA:spot_filter SRA_SPOT_FILTER_REDACTED = 3;


/*==========================================================================
 * read type bits
 *  TECHNICAL is 0; BIOLOGICAL, FORWARD and REVERSE are the
 *  distinct bit values 1, 2 and 4
 */
typedef U8 INSDC:SRA:xread_type;
const INSDC:SRA:xread_type SRA_READ_TYPE_TECHNICAL  = 0;
const INSDC:SRA:xread_type SRA_READ_TYPE_BIOLOGICAL = 1;
const INSDC:SRA:xread_type SRA_READ_TYPE_FORWARD    = 2;
const INSDC:SRA:xread_type SRA_READ_TYPE_REVERSE    = 4;

/* the original read-types covered only technical and biological */
typedef INSDC:SRA:xread_type INSDC:SRA:read_type;
