/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * Sequence schema
 *
 *  Declares two tables holding reads compressed against a reference:
 *    NCBI:align:tbl:cmp_base_space   ( column CMP_READ   )
 *    NCBI:align:tbl:cmp_color_space  ( column CMP_CSREAD )
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/seq.vschema';


/* NCBI:align:tbl:cmp_base_space
 *  Compressed reads in base space; bases are stored only for
 *  unaligned reads.
 *
 *  Parents: INSDC:tbl:sequence #1.0.1, NCBI:tbl:dcmp_base_space #1
 */
table NCBI:align:tbl:cmp_base_space #1 =
    INSDC:tbl:sequence #1.0.1,
    NCBI:tbl:dcmp_base_space #1
{
    /* ------------------------------------------------------------------
     * CMP_READ - read compressed against a reference sequence.
     * One overload per supported element type.
     */

    // text overload ( the default one ):
    // reads from out_cmp_dna_text; its validate rule compares
    // in_cmp_dna_text against out_cmp_dna_text
    extern default column INSDC:dna:text CMP_READ
    {
        read     = out_cmp_dna_text;
        validate = < INSDC:dna:text > compare ( in_cmp_dna_text, out_cmp_dna_text );
    }

    // 4na overloads, unpacked and packed
    extern column INSDC:4na:bin CMP_READ = out_cmp_4na_bin;
    extern column INSDC:4na:packed CMP_READ = out_cmp_4na_packed;

    // x2na overload ( 2na plus an ambiguity code )
    extern column INSDC:x2na:bin CMP_READ = out_cmp_x2na_bin;

    // 2na overloads ( no ambiguity ), unpacked and packed
    extern column INSDC:2na:bin CMP_READ = out_cmp_2na_bin;
    extern column INSDC:2na:packed CMP_READ = out_cmp_2na_packed;


    /* ------------------------------------------------------------------
     * input side: productions named in_cmp_*
     */

    // text input, step 1:
    // '.' is rewritten to 'N', the listed lower-case letters
    // ( including 'u' ) are rewritten to their upper-case form
    INSDC:dna:text in_cmp_dnarna_text =
        < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbnu','NACMGRSVTWYHKDBNU' > ( CMP_READ );

    // text input, step 2: change U to T
    INSDC:dna:text in_cmp_dna_text = NCBI:SRA:setRnaFlag ( in_cmp_dnarna_text );

    // 4na bin input, alternatives in declaration order:
    //   1. 4na bin written directly, range-validated to 0..15
    //   2. unpacked from the packed 4na input
    //   3. mapped from the text input
    //   4. mapped from x2na input using [ 1, 2, 4, 8, 15 ]
    INSDC:4na:bin in_cmp_4na_bin =
          < INSDC:4na:bin > range_validate < 0, 15 > ( CMP_READ )
        | ( INSDC:4na:bin ) unpack ( in_cmp_4na_packed )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_cmp_dna_text )
        | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_cmp_x2na_bin );

    // packed 4na input is taken straight from the column
    INSDC:4na:packed in_cmp_4na_packed = CMP_READ;

    // x2na bin input:
    //   1. x2na bin written directly, range-validated to 0..4
    //   2. mapped from 4na bin
    //      ( assuming INSDC:4na:map:BINSET lists 0..15 in order - TODO confirm:
    //        1, 2, 4, 8 become 0, 1, 2, 3 and every other code becomes 4 )
    INSDC:x2na:bin in_cmp_x2na_bin =
          < INSDC:x2na:bin > range_validate < 0, 4 > ( CMP_READ )
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_cmp_4na_bin );

    // 2na bin input:
    //   1. 2na bin written directly, range-validated to 0..3
    //   2. unpacked from the packed 2na input
    //   3. derived from 4na bin by INSDC:SEQ:rand_4na_2na
    INSDC:2na:bin in_cmp_2na_bin =
          < INSDC:2na:bin > range_validate < 0, 3 > ( CMP_READ )
        | ( INSDC:2na:bin ) unpack ( in_cmp_2na_packed )
        | INSDC:SEQ:rand_4na_2na ( in_cmp_4na_bin );

    // packed 2na input is taken straight from the column
    INSDC:2na:packed in_cmp_2na_packed = CMP_READ;

    // 4na alt-read ( ambiguities ), mapped from 4na bin
    // ( assuming INSDC:4na:map:BINSET lists 0..15 in order - TODO confirm:
    //   0 becomes 15; 1, 2, 4, 8 become 0; all other codes are kept )
    INSDC:4na:bin in_cmp_alt_4na_bin =
        < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_cmp_4na_bin );

    // feed for the stats column: the 2na bin input typed as U8
    U8 in_cmp_stats_bin = in_cmp_2na_bin;


    /* ------------------------------------------------------------------
     * physical columns
     */

    // packed 2na: taken as given, otherwise packed from 2na bin
    physical column INSDC:2na:packed .CMP_READ =
          in_cmp_2na_packed
        | ( INSDC:2na:packed ) pack ( in_cmp_2na_bin );

    // alt-read: in_cmp_alt_4na_bin run through trim < 0, 0 >,
    // stored with zip_encoding
    physical column < INSDC:4na:bin > zip_encoding .CMP_ALTREAD =
        < INSDC:4na:bin > trim < 0, 0 > ( in_cmp_alt_4na_bin );


    /* ------------------------------------------------------------------
     * output side: productions named out_cmp_*
     */

    // packed 2na as stored
    INSDC:2na:packed out_cmp_2na_packed = .CMP_READ;

    // 2na bin: unpacked form of the stored column
    INSDC:2na:bin out_cmp_2na_bin =
        ( INSDC:2na:bin ) unpack ( out_cmp_2na_packed );

    // x2na bin: mapped from the 4na bin output, same table as on input
    INSDC:x2na:bin out_cmp_x2na_bin =
        < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_cmp_4na_bin );

    // 2na bin mapped to 4na codes using [ 1, 2, 4, 8 ]
    INSDC:4na:bin out_cmp_2na_4na_bin =
        < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_cmp_2na_bin );

    // 4na bin:
    //   1. the 2na-derived codes bit_or'ed with .CMP_ALTREAD
    //   2. the 2na-derived codes alone
    //      ( presumably used when .CMP_ALTREAD is not available - TODO confirm )
    INSDC:4na:bin out_cmp_4na_bin =
          < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_cmp_2na_4na_bin, .CMP_ALTREAD )
        | out_cmp_2na_4na_bin;

    // packed 4na, synthesized from 4na bin
    INSDC:4na:packed out_cmp_4na_packed =
        ( INSDC:4na:packed ) pack ( out_cmp_4na_bin );

    // text: 4na bin mapped to characters, then passed
    // through NCBI:SRA:useRnaFlag
    INSDC:dna:text out_cmp_dnarna_text =
        < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_cmp_4na_bin );
    INSDC:dna:text out_cmp_dna_text =
        NCBI:SRA:useRnaFlag ( out_cmp_dnarna_text );


    /* ------------------------------------------------------------------
     * decompressed sequences
     *  the source is out_dcmp_4na_bin - a virtual production
     */

    // x2na bin, 2na bin ( x2na 4 becomes 0 ) and packed 2na
    INSDC:x2na:bin out_dcmp_x2na_bin =
        < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin );
    INSDC:2na:bin out_dcmp_2na_bin =
        < INSDC:x2na:bin, INSDC:2na:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2na_bin );
    INSDC:2na:packed out_dcmp_2na_packed =
        ( INSDC:2na:packed ) pack ( out_dcmp_2na_bin );


    /* INSDC:tbl:sequence inherited productions
     *  cs_native
     *  out_cs_key
     *  out_signal
     *  out_2cs_bin
     *  out_2na_bin
     *  out_4na_bin
     *  out_dna_text
     *  out_x2cs_bin
     *  out_x2na_bin
     *  out_2cs_packed
     *  out_2na_packed
     *  out_4na_packed
     *  out_color_text
     *  out_color_matrix
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_4na_bin
     */
}


/* NCBI:align:tbl:cmp_color_space
 *  Compressed reads in color space; colors are stored only for
 *  unaligned reads.
 *
 *  Parents: INSDC:tbl:sequence #1.0.1, NCBI:tbl:dcmp_color_space #1
 */
table NCBI:align:tbl:cmp_color_space #1 =
    INSDC:tbl:sequence #1.0.1,
    NCBI:tbl:dcmp_color_space #1
{
    /* ------------------------------------------------------------------
     * CMP_CSREAD - read compressed against a reference sequence.
     * One overload per supported element type.
     */

    // color text overload ( the default one )
    extern default column INSDC:color:text CMP_CSREAD = out_cmp_color_text;

    // x2cs overload ( 2cs plus an ambiguity code )
    extern column INSDC:x2cs:bin CMP_CSREAD = out_cmp_x2cs_bin;

    // 2cs overloads ( no ambiguity ), unpacked and packed
    extern column INSDC:2cs:bin CMP_CSREAD = out_cmp_2cs_bin;
    extern column INSDC:2cs:packed CMP_CSREAD = out_cmp_2cs_packed;


    /* ------------------------------------------------------------------
     * input side: productions named in_cmp_*
     */

    // color text input is taken straight from the column
    INSDC:color:text in_cmp_color_text = CMP_CSREAD;

    // x2cs bin input:
    //   1. x2cs bin written directly, range-validated to 0..4
    //   2. mapped from the color text input
    INSDC:x2cs:bin in_cmp_x2cs_bin =
          < INSDC:x2cs:bin > range_validate < 0, 4 > ( CMP_CSREAD )
        | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_cmp_color_text );

    // 2cs bin input:
    //   1. 2cs bin written directly, range-validated to 0..3
    //   2. unpacked from the packed 2cs input
    //   3. mapped from x2cs bin using [ 0, 1, 2, 3, 0 ]
    INSDC:2cs:bin in_cmp_2cs_bin =
          < INSDC:2cs:bin > range_validate < 0, 3 > ( CMP_CSREAD )
        | ( INSDC:2cs:bin ) unpack ( in_cmp_2cs_packed )
        | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin );

    // packed 2cs input is taken straight from the column
    INSDC:2cs:packed in_cmp_2cs_packed = CMP_CSREAD;

    // x2cs alt-read ( ambiguities ), mapped from x2cs bin
    // using [ 0, 0, 0, 0, 4 ]
    INSDC:x2cs:bin in_cmp_alt_x2cs_bin =
        < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin );

    // feed for the stats column: the 2cs bin input typed as U8
    U8 in_cmp_stats_bin = in_cmp_2cs_bin;


    /* ------------------------------------------------------------------
     * physical columns
     */

    // packed 2cs: taken as given, otherwise packed from 2cs bin
    physical column INSDC:2cs:packed .CMP_CSREAD =
          in_cmp_2cs_packed
        | ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin );

    // alt-read: in_cmp_alt_x2cs_bin run through trim < 0, 0 >,
    // stored with zip_encoding
    physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD =
        < INSDC:x2cs:bin > trim < 0, 0 > ( in_cmp_alt_x2cs_bin );


    /* ------------------------------------------------------------------
     * output side: productions named out_cmp_*
     */

    // packed 2cs as stored
    INSDC:2cs:packed out_cmp_2cs_packed = .CMP_CSREAD;

    // 2cs bin: unpacked form of the stored column
    INSDC:2cs:bin out_cmp_2cs_bin =
        ( INSDC:2cs:bin ) unpack ( out_cmp_2cs_packed );

    // x2cs bin:
    //   1. 2cs bin bit_or'ed with .CMP_ALTCSREAD, cast to x2cs
    //   2. 2cs bin alone, cast to x2cs
    //      ( presumably used when .CMP_ALTCSREAD is not available - TODO confirm )
    INSDC:x2cs:bin out_cmp_x2cs_bin =
          ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, .CMP_ALTCSREAD )
        | ( INSDC:x2cs:bin ) out_cmp_2cs_bin;

    // color text: x2cs bin mapped to characters
    INSDC:color:text out_cmp_color_text =
        < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin );


    /* ------------------------------------------------------------------
     * decompressed sequences
     *  the source is out_dcmp_x2cs_bin - a virtual production
     */

    // 2cs bin ( x2cs 4 becomes 0 ) and packed 2cs
    INSDC:2cs:bin out_dcmp_2cs_bin =
        < INSDC:x2cs:bin, INSDC:2cs:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2cs_bin );
    INSDC:2cs:packed out_dcmp_2cs_packed =
        ( INSDC:2cs:packed ) pack ( out_dcmp_2cs_bin );


    /* INSDC:tbl:sequence inherited productions
     *  cs_native
     *  out_cs_key
     *  out_signal
     *  out_2cs_bin
     *  out_2na_bin
     *  out_4na_bin
     *  out_dna_text
     *  out_x2cs_bin
     *  out_x2na_bin
     *  out_2cs_packed
     *  out_2na_packed
     *  out_4na_packed
     *  out_color_text
     *  out_qual_phred
     *  out_color_matrix
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_x2cs_bin
     */
}