1 /*
2  * Copyright (c) 2007-2009 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef _SRF_H_
35 #define _SRF_H_
36 
37 #include "io_lib/hash_table.h"
38 #include "io_lib/ztr.h"
39 #include "io_lib/mFILE.h"
40 
41 #define SRF_MAGIC		"SSRF"
42 #define SRF_VERSION             "1.3"
43 
44 #define SRFB_CONTAINER 		'S'
45 #define SRFB_XML		'X'
46 #define SRFB_TRACE_HEADER	'H'
47 #define SRFB_TRACE_BODY		'R'
48 #define SRFB_INDEX		'I'
49 
50 /* Lack of index => 8 zero bytes at end of file to indicate zero length */
51 #define SRFB_NULL_INDEX		'\0'
52 
53 /*--- Public structures */
54 
55 /* Container header - several per file */
56 typedef struct {
57     int block_type;
58     char version[256];
59     char container_type;
60     char base_caller[256];
61     char base_caller_version[256];
62 } srf_cont_hdr_t;
63 
64 /* Trace header - several per container */
65 typedef struct {
66     int block_type;
67     char read_prefix_type;
68     char id_prefix[256];
69     uint32_t trace_hdr_size;
70     unsigned char *trace_hdr;
71 } srf_trace_hdr_t;
72 
73 /* Trace body - several per trace header */
74 typedef struct {
75     int block_type;
76     int read_id_length;
77     char read_id[256];
78     unsigned char flags;
79     uint32_t trace_size;
80     unsigned char *trace;
81 } srf_trace_body_t;
82 
83 /* XML - NCBI TraceInfo data block */
84 typedef struct {
85     uint32_t xml_len;
86     char *xml;
87 } srf_xml_t;
88 
89 #define SRF_READ_FLAG_BAD_MASK       (1<<0)
90 #define SRF_READ_FLAG_WITHDRAWN_MASK (1<<1)
91 #define SRF_READ_FLAG_USER_MASK      (7<<5)
92 
93 /* Indexing */
94 typedef struct {
95     char     magic[4];
96     char     version[4];
97     uint64_t size;
98     uint32_t n_container;
99     uint32_t n_data_block_hdr;
100     uint64_t n_buckets;
101     int8_t   index_type;
102     int8_t   dbh_pos_stored_sep;
103     char     dbh_file[256];
104     char     cont_file[256];
105     int      index_hdr_sz; /* size of the above data on disk */
106 } srf_index_hdr_t;
107 
108 /* In-memory index itself */
109 #define SRF_INDEX_NAME_BLOCK_SIZE 10000000
110 
111 typedef struct {
112   size_t  used;
113   size_t  space;
114   char   *names;
115 } srf_name_block_t;
116 
117 typedef struct {
118     char ch_file[PATH_MAX+1];
119     char th_file[PATH_MAX+1];
120     Array ch_pos;
121     Array th_pos;
122     Array name_blocks;
123     int dbh_pos_stored_sep;
124     HashTable *db_hash;
125 } srf_index_t;
126 
127 /* Master SRF object */
128 typedef struct {
129     FILE *fp;
130 
131     /* Cached copies of each of the most recent chunk types loaded */
132     srf_cont_hdr_t    ch;
133     srf_trace_hdr_t   th;
134     srf_trace_body_t  tb;
135     srf_xml_t         xml;
136     srf_index_hdr_t   hdr;
137 
138     /* Private: cached data for use by srf_next_ztr */
139     ztr_t *ztr;
140     mFILE *mf;
141     long mf_pos, mf_end;
142 } srf_t;
143 
144 #define SRF_INDEX_MAGIC    "Ihsh"
145 #define SRF_INDEX_VERSION  "1.01"
146 
147 
148 /*--- Initialisation */
149 srf_t *srf_create(FILE *fp);
150 srf_t *srf_open(char *fn, char *mode);
151 void srf_destroy(srf_t *srf, int auto_close);
152 
153 /*--- Base type I/O methods */
154 
155 int srf_write_pstring(srf_t *srf, char *str);
156 int srf_write_pstringb(srf_t *srf, char *str, int length);
157 int srf_read_pstring(srf_t *srf, char *str);
158 
159 int srf_read_uint32(srf_t *srf, uint32_t *val);
160 int srf_write_uint32(srf_t *srf, uint32_t val);
161 
162 int srf_read_uint64(srf_t *srf, uint64_t *val);
163 int srf_write_uint64(srf_t *srf, uint64_t val);
164 
165 
166 /*--- Mid level I/O - srf block */
167 srf_cont_hdr_t *srf_construct_cont_hdr(srf_cont_hdr_t *ch,
168 				       char *bc,
169 				       char *bc_version);
170 void srf_destroy_cont_hdr(srf_cont_hdr_t *ch);
171 int srf_read_cont_hdr(srf_t *srf, srf_cont_hdr_t *ch);
172 int srf_write_cont_hdr(srf_t *srf, srf_cont_hdr_t *ch);
173 
174 int srf_read_xml(srf_t *srf, srf_xml_t *xml);
175 int srf_write_xml(srf_t *srf, srf_xml_t *xml);
176 
177 srf_trace_hdr_t *srf_construct_trace_hdr(srf_trace_hdr_t *th,
178 					 char *prefix,
179 					 unsigned char *header,
180 					 uint32_t header_sz);
181 void srf_destroy_trace_hdr(srf_trace_hdr_t *th);
182 int srf_read_trace_hdr(srf_t *srf, srf_trace_hdr_t *th);
183 int srf_write_trace_hdr(srf_t *srf, srf_trace_hdr_t *th);
184 
185 srf_trace_body_t *srf_construct_trace_body(srf_trace_body_t *th,
186 					   char *suffix,
187 					   int suffix_len,
188 					   unsigned char *body,
189 					   uint32_t body_size,
190 					   unsigned char flags);
191 void srf_destroy_trace_body(srf_trace_body_t *th);
192 int srf_write_trace_body(srf_t *srf, srf_trace_body_t *th);
193 int srf_read_trace_body(srf_t *srf, srf_trace_body_t *th, int no_trace);
194 
195 int srf_read_index_hdr(srf_t *srf, srf_index_hdr_t *hdr, int no_seek);
196 int srf_write_index_hdr(srf_t *srf, srf_index_hdr_t *hdr);
197 srf_index_t *srf_index_create(char *ch_file, char *th_file, int dbh_sep);
198 void srf_index_destroy(srf_index_t *idx);
199 void srf_index_stats(srf_index_t *idx, FILE *fp);
200 int srf_index_add_cont_hdr(srf_index_t *idx, uint64_t pos);
201 int srf_index_add_trace_hdr(srf_index_t *idx, uint64_t pos);
202 int srf_index_add_trace_body(srf_index_t *idx, char *name, uint64_t pos);
203 int srf_index_write(srf_t *srf, srf_index_t *idx);
204 
205 /*--- Higher level I/O functions */
206 mFILE *srf_next_trace(srf_t *srf, char *name);
207 ztr_t *srf_next_ztr_flags(srf_t *srf, char *name, int filter_mask, int *flags);
208 ztr_t *srf_next_ztr(srf_t *srf, char *name, int filter_mask);
209 
210 ztr_t *partial_decode_ztr(srf_t *srf, mFILE *mf, ztr_t *z);
211 ztr_t *ztr_dup(ztr_t *src);
212 
213 int srf_next_block_type(srf_t *srf); /* peek ahead */
214 int srf_next_block_details(srf_t *srf, uint64_t *pos, char *name);
215 
216 int srf_find_trace(srf_t *srf, char *trace,
217 		   uint64_t *cpos, uint64_t *hpos, uint64_t *dpos);
218 
219 int construct_trace_name(char *fmt,
220 			 unsigned char *suffix, int suffix_len,
221 			 char *name, int name_len);
222 
223 #endif /* _SRF_H_ */
224