1 #ifndef _h_libs_blast_run_set_
2 #define _h_libs_blast_run_set_
3 
4 /*===========================================================================
5  *
6  *                            PUBLIC DOMAIN NOTICE
7  *               National Center for Biotechnology Information
8  *
9  *  This software/database is a "United States Government Work" under the
10  *  terms of the United States Copyright Act.  It was written as part of
11  *  the author's official duties as a United States Government employee and
12  *  thus cannot be copyrighted.  This software/database is freely available
13  *  to the public for use. The National Library of Medicine and the U.S.
14  *  Government have not placed any restriction on its use or reproduction.
15  *
16  *  Although all reasonable efforts have been taken to ensure the accuracy
17  *  and reliability of the software and data, the NLM and the U.S.
18  *  Government do not and cannot warrant the performance or results that
19  *  may be obtained by using this software or data. The NLM and the U.S.
20  *  Government disclaim all warranties, express or implied, including
21  *  warranties of performance, merchantability or fitness for any particular
22  *  purpose.
23  *
24  *  Please cite the author in any work or product based on this material.
25  *
26  * ===========================================================================
27  *
28  */
29 
30 #include "blast-mgr.h" /* BTableType */
31 
32 #ifndef _h_insdc_insdc_
33 #include <insdc/insdc.h> /* INSDC_coord_len */
34 #endif
35 
36 #ifndef _h_insdc_sra_
37 #include <insdc/sra.h> /* INSDC_SRA_platform_id */
38 #endif
39 
40 #ifndef _h_klib_container_
41 #include <klib/container.h> /* BSTree */
42 #endif
43 
44 #ifndef _h_klib_refcount_
45 #include <klib/refcount.h> /* KRefcount */
46 #endif
47 
48 #ifndef _h_ncbi_vdb_blast_
49 #include <ncbi/vdb-blast.h> /* VdbBlastStatus */
50 #endif
51 
52 #include <stdbool.h> /* bool */
53 #include <stddef.h> /* size_t */
54 #include <stdint.h> /* uint32_t */
55 
56 #ifdef __cplusplus
57 extern "C" {
58 #endif
59 
/* EColType
 *  state of a column; used by RunDesc for its readTypeStatic,
 *  readLenStatic and rdFilterStatic members.
 *
 *  Enumerator values are implicit (0..3, in declaration order).
 */
typedef enum {
    eColTypeError,     /* 0 */
    eColTypeAbsent,    /* 1 */
    eColTypeStatic,    /* 2 */
    eColTypeNonStatic  /* 3 */
} EColType;
66 
/* EReadIdType
 *  kind of read-id scheme; stored in ReadIdDesc.idType and returned by
 *  _VdbBlastRunSetGetReadIdType.
 *
 *  No trailing comma after the last enumerator: this header is also
 *  consumed by C++ (see the extern "C" guard), where it is a pedantic error
 *  before C++11.
 */
typedef enum {
    eFixedReadN,  /* 0 */
    eFactor10     /* 1 */
} EReadIdType;
71 
72 typedef struct {
73     EReadIdType idType;
74     uint32_t runBits;
75     bool varReadN;
76 } ReadIdDesc;
77 
78 typedef struct {
79     uint32_t index;
80 
81     uint64_t spotCount;
82     uint32_t spotBits;
83 
84     uint8_t nReads;
85     uint8_t nBioReads; /* knowing filtering (if static) and min_read_len info */
86     uint64_t bioLen; /* per read. is assigned just when allStatic */
87     INSDC_SRA_platform_id platform;
88 
89     uint64_t bioBaseCount; /* BIO_BASE_COUNT, ~0 if not found */
90     uint64_t cmpBaseCount; /* CMP_BASE_COUNT, ~0 if not found */
91 
92     INSDC_read_type *readType;
93     EColType readTypeStatic;
94 
95     uint32_t *readLen;
96     EColType readLenStatic;
97 
98     uint8_t *rdFilter;
99     EColType rdFilterStatic;
100 
101     bool varReadLen;
102     ReadIdDesc readIdDesc;
103 } RunDesc;
104 
/* VdbBlastDb
 *  VDB objects opened for a run: the database, its tables, and the cursors
 *  with the column indices that go with them.
 *  Referenced from VdbBlastRun through its "obj" member.
 */
typedef struct {
    const struct VDatabase *db;

    const struct VTable *seqTbl;
    const struct VTable *prAlgnTbl;
    const struct VTable *refTbl;

    /* WGS */
    const struct VCursor *cursACCESSION;
    uint32_t col_ACCESSION;

    /* SRA_PLATFORM_PACBIO_SMRT : variable read number */
    const struct VCursor *cursSeq;
    uint32_t col_READ_FILTER;
    uint32_t col_READ_LEN;
    uint32_t col_READ_TYPE;
    uint32_t col_TRIM_LEN;
    uint32_t col_TRIM_START;
} VdbBlastDb;
124 
125 typedef struct {
126     /* rundesc; */
127     char *acc;
128     char *path;
129 
130     VdbBlastDb *obj;
131     BTableType type;
132     bool cSra;
133 
134    /* bioReads = numSequences = number-of-spots * number-of-bio-reads-in-spot */
135     uint64_t bioReads;
136 
137     uint64_t alignments; /* rows number in PRIMARY_ALIGNMENT table */
138 
139     bool bioReadsTooExpensive; /* numSequences is TooExpensive */
140     uint64_t bioReadsApprox;   /* numSequencesApprox; */
141 
142     uint64_t bioBases;         /* length; */
143     bool bioBasesTooExpensive; /* totalLength is TooExpensive */
144     uint64_t bioBasesApprox;   /* lengthApprox; */
145 
146     RunDesc rd;
147 
148     uint32_t min_read_length;
149 } VdbBlastRun;
150 
151 typedef struct VdbBlastRef VdbBlastRef;
152 typedef struct {
153     VdbBlastRef  *rfd;
154     size_t        rfdk; /* Number of rfd members */
155     size_t        rfdn; /* Allocated rfd members */
156 
157     uint64_t  totalLen; /* Total number of bases in reference set.
158                            Base count for circular references is doubled. */
159 
160     BSTree tRuns;       /* rundesc-s */
161     BSTree tExtRefs;    /* SEQ_ID-s of external references */
162     BSTree tIntRefs;    /* SEQ_ID-s of external references */
163 } RefSet;
164 
165 void _RefSetFini(RefSet *self);
166 
167 typedef struct RunSet {
168     VdbBlastRun *run;
169     uint32_t krun; /* number of run-s */
170     uint32_t nrun; /* sizeof of run-s */
171 
172     RefSet refs;
173 } RunSet;
174 
175 typedef struct {
176     const VdbBlastRun *prev;
177 
178     VdbBlastRun *run;
179 
180     uint32_t tableId;
181  /* VDB_READ_UNALIGNED, VDB_READ_ALIGNED or VDB_READ_DIRECT */
182 
183     uint64_t spot; /* 1-based */
184     uint32_t read; /* 1-based */
185     uint32_t nReads; /* is variable in SRA_PLATFORM_PACBIO_SMRT */
186 
187     uint64_t read_id; /* BioReadId in RunSet */
188 
189     bool circular;
190                /* we are going to return a circular reference the second time */
191 } ReadDesc;
192 
193 typedef struct {
194     uint32_t col_PRIMARY_ALIGNMENT_ID;
195     uint32_t col_READ_FILTER;
196     uint32_t col_READ_LEN;
197     uint32_t col_TRIM_LEN;
198     uint32_t col_TRIM_START;
199 
200     int64_t *primary_alignment_id;
201     uint8_t *read_filter;
202     uint32_t *read_len;
203     INSDC_coord_len TRIM_LEN;
204     INSDC_coord_val TRIM_START;
205 
206     uint8_t nReadsAllocated;
207 } ReaderCols;
208 
/* cSRA READ mode : is ignored for non-cSRA runs
 *
 *  The mode is carried as a uint32_t; its values are the constants of the
 *  anonymous enum below.
 *  No trailing comma after the last enumerator: this header is also
 *  consumed by C++ (see the extern "C" guard), where it is a pedantic error
 *  before C++11.
 */
typedef uint32_t KVdbBlastReadMode;
enum {
    VDB_READ_UNALIGNED =       1, /* return unaligned reads */
    VDB_READ_ALIGNED   =       2, /* return aligned reads */
    VDB_READ_REFERENCE =       3  /* return reference sequence */
};
216 
217 struct References;
218 
219 typedef struct {
220     bool eor;
221     ReadDesc desc;
222     uint32_t col_READ;
223     const struct VCursor *curs;
224     size_t starting_base; /* 0-based, in current read */
225     ReaderCols cols;
226     KVdbBlastReadMode mode;
227     const struct References *refs;
228 } Reader2na;
229 
230 typedef struct Core2na {
231     uint32_t min_read_length;
232     bool hasReader;
233     struct KLock *mutex;
234     uint64_t initial_read_id;
235     uint32_t irun; /* index in RunSet */
236     bool eos;
237     Reader2na reader;
238 } Core2na;
239 
240 typedef struct Core4na {
241     uint32_t min_read_length;
242     struct KLock *mutex;
243     ReadDesc desc;
244     const struct VCursor *curs;
245     const struct VBlob *blob; /* TODO */
246     ReaderCols cols;
247     uint32_t col_READ;
248     KVdbBlastReadMode mode;
249 } Core4na;
250 
251 struct VdbBlastRunSet {
252     KRefcount refcount;
253     bool protein;
254     VdbBlastMgr *mgr;
255 
256     RunSet runs;
257 
258     bool beingRead;
259     ReadIdDesc readIdDesc;
260 
261     Core2na core2na;
262     Core4na core4na;
263 
264     Core2na core2naRef;
265     Core4na core4naRef;
266 
267     uint64_t minSeqLen;
268     uint64_t avgSeqLen;
269     uint64_t maxSeqLen;
270 };
271 
272 rc_t _VTableMakeCursor(const struct VTable *self, const struct VCursor **curs,
273     uint32_t *col_idx, const char *col_name, const char *acc);
274 
275 rc_t _ReadDescFindNextRead(ReadDesc *self, bool *found);
276 VdbBlastStatus _ReadDescFixReadId(ReadDesc *self);
277 
278 uint64_t _VdbBlastRunAdjustSequencesAmountForAlignments(VdbBlastRun *self,
279     VdbBlastStatus *status);
280 
281 #ifdef TEST_VdbBlastRunFillReadDesc
282 VDB_EXTERN
283 #endif
284 uint32_t _VdbBlastRunFillReadDesc(VdbBlastRun *self,
285     uint64_t read_id, ReadDesc *desc);
286 
287 uint64_t _VdbBlastRunGetNumAlignments(VdbBlastRun *self,
288     VdbBlastStatus *status);
289 
290 bool _VdbBlastRunVarReadNum(const VdbBlastRun *self);
291 
292 uint32_t _RunSetFindReadDesc(const struct RunSet *self,
293     uint64_t read_id, ReadDesc *desc);
294 
295 uint64_t _VdbBlastRunSet2naRead(const VdbBlastRunSet *self,
296     VdbBlastStatus *status, uint64_t *read_id, size_t *starting_base,
297     uint8_t *buffer, size_t buffer_size, KVdbBlastReadMode mode);
298 
299 void _VdbBlastRunSetBeingRead(const VdbBlastRunSet *self);
300 
301 VdbBlastStatus _VdbBlastRunSetFindFirstRead
302     (const VdbBlastRunSet *self, uint64_t *read_id, bool useGetFirstRead);
303 
304 uint64_t _VdbBlastRunSetGetAllReads(const VdbBlastRunSet *self, uint32_t run);
305 
306 EReadIdType _VdbBlastRunSetGetReadIdType(const VdbBlastRunSet *self);
307 
308 
309 #ifdef __cplusplus
310 }
311 #endif
312 
313 #endif /* _h_libs_blast_run_set_ */
314