1 #ifndef _h_libs_blast_run_set_ 2 #define _h_libs_blast_run_set_ 3 4 /*=========================================================================== 5 * 6 * PUBLIC DOMAIN NOTICE 7 * National Center for Biotechnology Information 8 * 9 * This software/database is a "United States Government Work" under the 10 * terms of the United States Copyright Act. It was written as part of 11 * the author's official duties as a United States Government employee and 12 * thus cannot be copyrighted. This software/database is freely available 13 * to the public for use. The National Library of Medicine and the U.S. 14 * Government have not placed any restriction on its use or reproduction. 15 * 16 * Although all reasonable efforts have been taken to ensure the accuracy 17 * and reliability of the software and data, the NLM and the U.S. 18 * Government do not and cannot warrant the performance or results that 19 * may be obtained by using this software or data. The NLM and the U.S. 20 * Government disclaim all warranties, express or implied, including 21 * warranties of performance, merchantability or fitness for any particular 22 * purpose. 23 * 24 * Please cite the author in any work or product based on this material. 25 * 26 * =========================================================================== 27 * 28 */ 29 30 #include "blast-mgr.h" /* BTableType */ 31 32 #ifndef _h_insdc_insdc_ 33 #include <insdc/insdc.h> /* INSDC_coord_len */ 34 #endif 35 36 #ifndef _h_insdc_sra_ 37 #include <insdc/sra.h> /* INSDC_SRA_platform_id */ 38 #endif 39 40 #ifndef _h_klib_container_ 41 #include <klib/container.h> /* BSTree */ 42 #endif 43 44 #ifndef _h_klib_refcount_ 45 #include <klib/refcount.h> /* KRefcount */ 46 #endif 47 48 #ifndef _h_ncbi_vdb_blast_ 49 #include <ncbi/vdb-blast.h> /* VdbBlastStatus */ 50 #endif 51 52 #include <stdbool.h> /* bool */ 53 #include <stddef.h> /* size_t */ 54 #include <stdint.h> /* uint32_t */ 55 56 #ifdef __cplusplus 57 extern "C" { 58 #endif 59 60 typedef enum { 61 eColTypeError, 62 eColTypeAbsent, 63 eColTypeStatic, 64 eColTypeNonStatic 65 } EColType; 66 67 typedef enum { 68 eFixedReadN, 69 eFactor10, 70 } EReadIdType; 71 72 typedef struct { 73 EReadIdType idType; 74 uint32_t runBits; 75 bool varReadN; 76 } ReadIdDesc; 77 78 typedef struct { 79 uint32_t index; 80 81 uint64_t spotCount; 82 uint32_t spotBits; 83 84 uint8_t nReads; 85 uint8_t nBioReads; /* knowing filtering (if static) and min_read_len info */ 86 uint64_t bioLen; /* per read. is assigned just when allStatic */ 87 INSDC_SRA_platform_id platform; 88 89 uint64_t bioBaseCount; /* BIO_BASE_COUNT, ~0 if not found */ 90 uint64_t cmpBaseCount; /* CMP_BASE_COUNT, ~0 if not found */ 91 92 INSDC_read_type *readType; 93 EColType readTypeStatic; 94 95 uint32_t *readLen; 96 EColType readLenStatic; 97 98 uint8_t *rdFilter; 99 EColType rdFilterStatic; 100 101 bool varReadLen; 102 ReadIdDesc readIdDesc; 103 } RunDesc; 104 105 typedef struct { 106 const struct VDatabase *db; 107 108 const struct VTable *seqTbl; 109 const struct VTable *prAlgnTbl; 110 const struct VTable *refTbl; 111 112 /* WGS */ 113 const struct VCursor *cursACCESSION; 114 uint32_t col_ACCESSION; 115 116 /* SRA_PLATFORM_PACBIO_SMRT : variable read number */ 117 const struct VCursor *cursSeq; 118 uint32_t col_READ_FILTER; 119 uint32_t col_READ_LEN; 120 uint32_t col_READ_TYPE; 121 uint32_t col_TRIM_LEN; 122 uint32_t col_TRIM_START; 123 } VdbBlastDb; 124 125 typedef struct { 126 /* rundesc; */ 127 char *acc; 128 char *path; 129 130 VdbBlastDb *obj; 131 BTableType type; 132 bool cSra; 133 134 /* bioReads = numSequences = number-of-spots * number-of-bio-reads-in-spot */ 135 uint64_t bioReads; 136 137 uint64_t alignments; /* rows number in PRIMARY_ALIGNMENT table */ 138 139 bool bioReadsTooExpensive; /* numSequences is TooExpensive */ 140 uint64_t bioReadsApprox; /* numSequencesApprox; */ 141 142 uint64_t bioBases; /* length; */ 143 bool bioBasesTooExpensive; /* totalLength is TooExpensive */ 144 uint64_t bioBasesApprox; /* lengthApprox; */ 145 146 RunDesc rd; 147 148 uint32_t min_read_length; 149 } VdbBlastRun; 150 151 typedef struct VdbBlastRef VdbBlastRef; 152 typedef struct { 153 VdbBlastRef *rfd; 154 size_t rfdk; /* Number of rfd members */ 155 size_t rfdn; /* Allocated rfd members */ 156 157 uint64_t totalLen; /* Total number of bases in reference set. 158 Base count for circular references is doubled. */ 159 160 BSTree tRuns; /* rundesc-s */ 161 BSTree tExtRefs; /* SEQ_ID-s of external references */ 162 BSTree tIntRefs; /* SEQ_ID-s of external references */ 163 } RefSet; 164 165 void _RefSetFini(RefSet *self); 166 167 typedef struct RunSet { 168 VdbBlastRun *run; 169 uint32_t krun; /* number of run-s */ 170 uint32_t nrun; /* sizeof of run-s */ 171 172 RefSet refs; 173 } RunSet; 174 175 typedef struct { 176 const VdbBlastRun *prev; 177 178 VdbBlastRun *run; 179 180 uint32_t tableId; 181 /* VDB_READ_UNALIGNED, VDB_READ_ALIGNED or VDB_READ_DIRECT */ 182 183 uint64_t spot; /* 1-based */ 184 uint32_t read; /* 1-based */ 185 uint32_t nReads; /* is variable in SRA_PLATFORM_PACBIO_SMRT */ 186 187 uint64_t read_id; /* BioReadId in RunSet */ 188 189 bool circular; 190 /* we are going to return a circular reference the second time */ 191 } ReadDesc; 192 193 typedef struct { 194 uint32_t col_PRIMARY_ALIGNMENT_ID; 195 uint32_t col_READ_FILTER; 196 uint32_t col_READ_LEN; 197 uint32_t col_TRIM_LEN; 198 uint32_t col_TRIM_START; 199 200 int64_t *primary_alignment_id; 201 uint8_t *read_filter; 202 uint32_t *read_len; 203 INSDC_coord_len TRIM_LEN; 204 INSDC_coord_val TRIM_START; 205 206 uint8_t nReadsAllocated; 207 } ReaderCols; 208 209 /* cSRA READ mode : is ignored for non-cSRA runs */ 210 typedef uint32_t KVdbBlastReadMode; 211 enum { 212 VDB_READ_UNALIGNED = 1, /* return unaligned reads */ 213 VDB_READ_ALIGNED = 2, /* return aligned reads */ 214 VDB_READ_REFERENCE = 3, /* return reference sequence */ 215 }; 216 217 struct References; 218 219 typedef struct { 220 bool eor; 221 ReadDesc desc; 222 uint32_t col_READ; 223 const struct VCursor *curs; 224 size_t starting_base; /* 0-based, in current read */ 225 ReaderCols cols; 226 KVdbBlastReadMode mode; 227 const struct References *refs; 228 } Reader2na; 229 230 typedef struct Core2na { 231 uint32_t min_read_length; 232 bool hasReader; 233 struct KLock *mutex; 234 uint64_t initial_read_id; 235 uint32_t irun; /* index in RunSet */ 236 bool eos; 237 Reader2na reader; 238 } Core2na; 239 240 typedef struct Core4na { 241 uint32_t min_read_length; 242 struct KLock *mutex; 243 ReadDesc desc; 244 const struct VCursor *curs; 245 const struct VBlob *blob; /* TODO */ 246 ReaderCols cols; 247 uint32_t col_READ; 248 KVdbBlastReadMode mode; 249 } Core4na; 250 251 struct VdbBlastRunSet { 252 KRefcount refcount; 253 bool protein; 254 VdbBlastMgr *mgr; 255 256 RunSet runs; 257 258 bool beingRead; 259 ReadIdDesc readIdDesc; 260 261 Core2na core2na; 262 Core4na core4na; 263 264 Core2na core2naRef; 265 Core4na core4naRef; 266 267 uint64_t minSeqLen; 268 uint64_t avgSeqLen; 269 uint64_t maxSeqLen; 270 }; 271 272 rc_t _VTableMakeCursor(const struct VTable *self, const struct VCursor **curs, 273 uint32_t *col_idx, const char *col_name, const char *acc); 274 275 rc_t _ReadDescFindNextRead(ReadDesc *self, bool *found); 276 VdbBlastStatus _ReadDescFixReadId(ReadDesc *self); 277 278 uint64_t _VdbBlastRunAdjustSequencesAmountForAlignments(VdbBlastRun *self, 279 VdbBlastStatus *status); 280 281 #ifdef TEST_VdbBlastRunFillReadDesc 282 VDB_EXTERN 283 #endif 284 uint32_t _VdbBlastRunFillReadDesc(VdbBlastRun *self, 285 uint64_t read_id, ReadDesc *desc); 286 287 uint64_t _VdbBlastRunGetNumAlignments(VdbBlastRun *self, 288 VdbBlastStatus *status); 289 290 bool _VdbBlastRunVarReadNum(const VdbBlastRun *self); 291 292 uint32_t _RunSetFindReadDesc(const struct RunSet *self, 293 uint64_t read_id, ReadDesc *desc); 294 295 uint64_t _VdbBlastRunSet2naRead(const VdbBlastRunSet *self, 296 VdbBlastStatus *status, uint64_t *read_id, size_t *starting_base, 297 uint8_t *buffer, size_t buffer_size, KVdbBlastReadMode mode); 298 299 void _VdbBlastRunSetBeingRead(const VdbBlastRunSet *self); 300 301 VdbBlastStatus _VdbBlastRunSetFindFirstRead 302 (const VdbBlastRunSet *self, uint64_t *read_id, bool useGetFirstRead); 303 304 uint64_t _VdbBlastRunSetGetAllReads(const VdbBlastRunSet *self, uint32_t run); 305 306 EReadIdType _VdbBlastRunSetGetReadIdType(const VdbBlastRunSet *self); 307 308 309 #ifdef __cplusplus 310 } 311 #endif 312 313 #endif /* _h_libs_blast_run_set_ */ 314