/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * Sequence schema
 */
// schema version declaration
version 1;

// schema files this one includes; the parent tables named below
// ( INSDC:tbl:sequence, NCBI:tbl:dcmp_base_space, NCBI:tbl:dcmp_color_space )
// are not declared in this file - presumably they come from these includes
include 'vdb/vdb.vschema';
include 'ncbi/seq.vschema';


/* cmp_base_space
 *  table representing compressed reads in base space,
 *  where the bases are only stored for unaligned reads
 *
 *  Data flow, as declared below:
 *    - CMP_READ may be written in text, 4na, x2na or 2na form;
 *    - the input rules reduce it to packed 2na for the physical column
 *      .CMP_READ and derive a separate 4na "alt-read" for .CMP_ALTREAD;
 *    - the output rules rebuild 4na from those two physical columns and
 *      synthesize every other external representation from it.
 *
 *  Where a production lists several expressions separated by '|', they are
 *  alternatives; NOTE(review): the order in which they are tried is a schema
 *  language rule, not something this file shows - confirm against the docs.
 */
table NCBI:align:tbl:cmp_base_space #1
    = INSDC:tbl:sequence #1.0.1
    , NCBI:tbl:dcmp_base_space #1
{
    /* CMP_READ
     *  read compressed against a reference sequence
     */

    // default is IUPAC character representation
    // "validate" compares the text derived from the written input against
    // the text produced by the output rules
    extern default column INSDC:dna:text CMP_READ
    {
        read = out_cmp_dna_text;
        validate = < INSDC:dna:text > compare ( in_cmp_dna_text, out_cmp_dna_text );
    }

    // 4na representation
    extern column INSDC:4na:bin CMP_READ = out_cmp_4na_bin;
    extern column INSDC:4na:packed CMP_READ = out_cmp_4na_packed;

    // x2na representation - 2na with ambiguity
    extern column INSDC:x2na:bin CMP_READ = out_cmp_x2na_bin;

    // 2na representation - 2na with no ambiguity
    extern column INSDC:2na:bin CMP_READ = out_cmp_2na_bin;
    extern column INSDC:2na:packed CMP_READ = out_cmp_2na_packed;


    /* input processing rules
     */

    // compressed input text
    // the map turns '.' into 'N' and each listed lower-case code into its
    // upper-case form
    INSDC:dna:text in_cmp_dnarna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbnu','NACMGRSVTWYHKDBNU' > ( CMP_READ );
    INSDC:dna:text in_cmp_dna_text = NCBI:SRA:setRnaFlag ( in_cmp_dnarna_text ); // change U to T

    // compressed input 4na bin
    // alternatives: range-checked 4na as written, unpacked 4na, text mapped
    // through the 4na character set, or x2na mapped to 4na
    // ( x2na values are mapped to 1, 2, 4, 8, 15 in BINSET order )
    INSDC:4na:bin in_cmp_4na_bin
        = < INSDC:4na:bin > range_validate < 0, 15 > ( CMP_READ )
        | ( INSDC:4na:bin ) unpack ( in_cmp_4na_packed )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_cmp_dna_text )
        | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_cmp_x2na_bin );

    // compressed input 4na packed
    INSDC:4na:packed in_cmp_4na_packed = CMP_READ;

    // compressed input x2na bin
    // the 16-entry table sends 4na 1, 2, 4, 8 to x2na 0, 1, 2, 3 and every
    // other value to 4 - assuming INSDC:4na:map:BINSET lists 0..15 in order
    // ( declared elsewhere; TODO confirm )
    INSDC:x2na:bin in_cmp_x2na_bin
        = < INSDC:x2na:bin > range_validate < 0, 4 > ( CMP_READ )
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_cmp_4na_bin );

    // compressed input 2na bin
    // NOTE(review): INSDC:SEQ:rand_4na_2na is declared elsewhere; by its name
    // it presumably picks a 2na base for each 4na value - confirm
    INSDC:2na:bin in_cmp_2na_bin
        = < INSDC:2na:bin > range_validate < 0, 3 > ( CMP_READ )
        | ( INSDC:2na:bin ) unpack ( in_cmp_2na_packed )
        | INSDC:SEQ:rand_4na_2na ( in_cmp_4na_bin );

    // compressed input 2na packed
    INSDC:2na:packed in_cmp_2na_packed = CMP_READ;

    // input 4na alt-read ( ambiguities )
    // the table sends 4na 1, 2, 4, 8 to 0, sends 0 to 15 and leaves every
    // other value unchanged - again assuming BINSET lists 0..15 in order
    INSDC:4na:bin in_cmp_alt_4na_bin
        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_cmp_4na_bin );

    // preparing a feed into stats column
    U8 in_cmp_stats_bin = in_cmp_2na_bin;


    /* physical columns
     */

    // stored bases: packed 2na, taken as written or packed from 2na bin
    physical column INSDC:2na:packed .CMP_READ
        = in_cmp_2na_packed
        | ( INSDC:2na:packed ) pack ( in_cmp_2na_bin );

    // stored ambiguity information: the alt-read after trim < 0, 0 >
    // NOTE(review): presumably trim drops zero runs at the row ends - confirm
    physical column < INSDC:4na:bin > zip_encoding .CMP_ALTREAD
        = < INSDC:4na:bin > trim < 0, 0 > ( in_cmp_alt_4na_bin );


    /* output processing rules
     */

    // output 2na packed
    INSDC:2na:packed out_cmp_2na_packed = .CMP_READ;

    // unambiguous unpacked 2na
    INSDC:2na:bin out_cmp_2na_bin
        = ( INSDC:2na:bin ) unpack ( out_cmp_2na_packed );

    // output x2na bin
    // same 4na -> x2na table as used on the input side
    INSDC:x2na:bin out_cmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_cmp_4na_bin );

    // output 2na->4na bin
    // 2na values are mapped to 1, 2, 4, 8 in BINSET order
    INSDC:4na:bin out_cmp_2na_4na_bin
        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_cmp_2na_bin );

    // output 4na bin
    // stored bases OR-ed with the stored alt-read; the second alternative
    // is the plain 2na->4na conversion without .CMP_ALTREAD
    INSDC:4na:bin out_cmp_4na_bin
        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_cmp_2na_4na_bin, .CMP_ALTREAD )
        | out_cmp_2na_4na_bin;

    // synthesized packed 4na
    INSDC:4na:packed out_cmp_4na_packed
        = ( INSDC:4na:packed ) pack ( out_cmp_4na_bin );

    // output text
    INSDC:dna:text out_cmp_dnarna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_cmp_4na_bin );
    INSDC:dna:text out_cmp_dna_text
        = NCBI:SRA:useRnaFlag ( out_cmp_dnarna_text );


    /* decompressed sequences
     *   source is out_dcmp_4na_bin - a virtual production
     */

    // synthesize x2na_bin, 2na_bin and 2na_packed
    // the x2na -> 2na table sends the x2na value 4 to 0
    INSDC:x2na:bin out_dcmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin );
    INSDC:2na:bin out_dcmp_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2na_bin );
    INSDC:2na:packed out_dcmp_2na_packed
        = ( INSDC:2na:packed ) pack ( out_dcmp_2na_bin );


    /* INSDC:tbl:sequence inherited productions
     *  cs_native
     *  out_cs_key
     *  out_signal
     *  out_2cs_bin
     *  out_2na_bin
     *  out_4na_bin
     *  out_dna_text
     *  out_x2cs_bin
     *  out_x2na_bin
     *  out_2cs_packed
     *  out_2na_packed
     *  out_4na_packed
     *  out_color_text
     *  out_color_matrix
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_4na_bin
     */
}


/* cmp_color_space
 *  table representing compressed reads in color space,
 *  where the colors are only stored for unaligned reads
 *
 *  Data flow, as declared below:
 *    - CMP_CSREAD may be written in text, x2cs or 2cs form;
 *    - the input rules reduce it to packed 2cs for the physical column
 *      .CMP_CSREAD and derive a separate x2cs "alt-read" for .CMP_ALTCSREAD;
 *    - the output rules rebuild x2cs from those two physical columns and
 *      synthesize the text representation from it.
 *
 *  Where a production lists several expressions separated by '|', they are
 *  alternatives; NOTE(review): the order in which they are tried is a schema
 *  language rule, not something this file shows - confirm against the docs.
 */
table NCBI:align:tbl:cmp_color_space #1 =
    INSDC:tbl:sequence #1.0.1, NCBI:tbl:dcmp_color_space #1
{
    /* CMP_CSREAD
     *  read compressed against a reference sequence
     */

    // default is color-space text representation
    extern default column INSDC:color:text CMP_CSREAD = out_cmp_color_text;

    // x2cs representation - 2cs with ambiguity
    extern column INSDC:x2cs:bin CMP_CSREAD = out_cmp_x2cs_bin;

    // 2cs representation - 2cs with no ambiguity
    extern column INSDC:2cs:bin CMP_CSREAD = out_cmp_2cs_bin;
    extern column INSDC:2cs:packed CMP_CSREAD = out_cmp_2cs_packed;


    /* input processing rules
     */

    // compressed input text
    INSDC:color:text in_cmp_color_text = CMP_CSREAD;

    // compressed input x2cs bin
    // alternatives: range-checked x2cs as written, or text mapped through
    // the x2cs character set
    INSDC:x2cs:bin in_cmp_x2cs_bin
        = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CMP_CSREAD )
        | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_cmp_color_text );

    // compressed input 2cs bin
    // the last alternative maps x2cs to 0, 1, 2, 3, 0 in BINSET order, so
    // the fifth x2cs value is stored as color 0
    INSDC:2cs:bin in_cmp_2cs_bin
        = < INSDC:2cs:bin > range_validate < 0, 3 > ( CMP_CSREAD )
        | ( INSDC:2cs:bin ) unpack ( in_cmp_2cs_packed )
        | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin );

    // compressed input 2cs packed
    INSDC:2cs:packed in_cmp_2cs_packed = CMP_CSREAD;

    // compressed input x2cs alt-read ( ambiguities )
    // maps x2cs to 0, 0, 0, 0, 4 in BINSET order: only the fifth x2cs value
    // yields a non-zero entry
    INSDC:x2cs:bin in_cmp_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin );

    // preparing a feed into stats column
    U8 in_cmp_stats_bin = in_cmp_2cs_bin;


    /* physical columns
     */

    // stored colors: packed 2cs, taken as written or packed from 2cs bin
    physical column INSDC:2cs:packed .CMP_CSREAD
        = in_cmp_2cs_packed
        | ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin );

    // stored ambiguity information: the alt-read after trim < 0, 0 >
    // NOTE(review): presumably trim drops zero runs at the row ends - confirm
    physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD
        = < INSDC:x2cs:bin > trim < 0, 0 > ( in_cmp_alt_x2cs_bin );


    /* output processing rules
     */

    // compressed output 2cs packed
    INSDC:2cs:packed out_cmp_2cs_packed = .CMP_CSREAD;

    // unambiguous unpacked 2cs
    INSDC:2cs:bin out_cmp_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( out_cmp_2cs_packed );

    // unpacked 2cs with ambiguity
    // stored colors OR-ed with the stored alt-read, cast to x2cs; the
    // second alternative is the plain 2cs cast without .CMP_ALTCSREAD
    INSDC:x2cs:bin out_cmp_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, .CMP_ALTCSREAD )
        | ( INSDC:x2cs:bin ) out_cmp_2cs_bin;

    // output text
    INSDC:color:text out_cmp_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin );


    /* decompressed sequences
     *   source is out_dcmp_x2cs_bin - a virtual production
     */

    // synthesize 2cs_bin and 2cs_packed
    // the x2cs -> 2cs table sends the x2cs value 4 to 0
    INSDC:2cs:bin out_dcmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2cs_bin );
    INSDC:2cs:packed out_dcmp_2cs_packed
        = ( INSDC:2cs:packed ) pack ( out_dcmp_2cs_bin );


    /* INSDC:tbl:sequence inherited productions
     *  cs_native
     *  out_cs_key
     *  out_signal
     *  out_2cs_bin
     *  out_2na_bin
     *  out_4na_bin
     *  out_dna_text
     *  out_x2cs_bin
     *  out_x2na_bin
     *  out_2cs_packed
     *  out_2na_packed
     *  out_4na_packed
     *  out_color_text
     *  out_qual_phred
     *  out_color_matrix
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_x2cs_bin
     */
}
