1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <vdb/extern.h>
28
29 #include "reader.h" /* _RunSetFindReadDesc */
30 #include "blast-mgr.h" /* BTableType */
31 #include "run-set.h" /* VdbBlastRunSet */
32
33 #include <kdb/kdb-priv.h> /* KTableGetPath */
34 #include <kdb/table.h> /* KTable */
35
36 #include <klib/debug.h> /* DBGMSG */
37 #include <klib/log.h> /* LOGERR */
38 #include <klib/rc.h> /* RC */
39 #include <klib/refcount.h> /* KRefcount */
40 #include <klib/status.h> /* STSMSG */
41
42 #include <kproc/lock.h> /* KLockMake */
43
44 #include <ncbi/vdb-blast.h> /* VdbBlastRunSet */
45
46 #include <vdb/cursor.h> /* VCursor */
47 #include <vdb/database.h> /* VDatabase */
48 #include <vdb/table.h> /* VTable */
49 #include <vdb/vdb-priv.h> /* VTableOpenKTableRead */
50
51 #include <sysalloc.h>
52
53 #include <string.h> /* memset */
54 #include <time.h> /* time_t */
55
56 #include <limits.h> /* PATH_MAX */
57 #ifndef PATH_MAX
58 #define PATH_MAX 4096
59 #endif
60
61 void _Core2naFini(Core2na *self);
62 void _Core4naFini(Core4na *self);
63
64 /******************************************************************************/
65
_NotImplementedP(const char * func)66 static void *_NotImplementedP(const char *func) {
67 PLOGERR(klogErr, (klogErr, -1,
68 "$(func): is not implemented", "func=%s", func));
69 return 0;
70 }
71
_NotImplemented(const char * func)72 static size_t _NotImplemented(const char *func)
73 { return (size_t)_NotImplementedP(func); }
74
Min(uint64_t cand,uint64_t champ,int64_t minRead,bool * done)75 static uint64_t CC Min(uint64_t cand,
76 uint64_t champ, int64_t minRead, bool *done)
77 {
78 assert(done);
79 if (minRead >= 0) {
80 if (cand < minRead) {
81 return champ;
82 }
83 else if (cand == minRead) {
84 *done = true;
85 return minRead;
86 }
87 }
88 return cand < champ ? cand : champ;
89 }
90
Max(uint64_t cand,uint64_t champ,int64_t minRead,bool * done)91 static uint64_t CC Max(uint64_t cand,
92 uint64_t champ, int64_t minRead, bool *done)
93 {
94 return cand > champ ? cand : champ;
95 }
96
_CanonocalName(const char * name)97 static char* _CanonocalName(const char *name) {
98 size_t noExtSize = 0;
99 size_t namelen = 0;
100 const char ext[] = ".sra";
101
102 if (name == NULL) {
103 return NULL;
104 }
105
106 noExtSize = namelen = string_size(name);
107
108 if (namelen >= sizeof(ext)) {
109 const char *tail = NULL;
110 noExtSize -= sizeof ext - 1;
111 tail = name + noExtSize;
112 if (string_cmp(ext, sizeof ext - 1, tail, sizeof ext - 1, 99) != 0) {
113 noExtSize = namelen;
114 }
115 }
116
117 return string_dup(name, noExtSize);
118 }
119
120 /******************************************************************************/
121
_VCursorCellDataDirect(const VCursor * self,int64_t row_id,uint32_t col_idx,uint32_t elemBits,const void ** base,uint32_t nreads,const char * name)122 static rc_t _VCursorCellDataDirect(const VCursor *self,
123 int64_t row_id, uint32_t col_idx, uint32_t elemBits,
124 const void **base, uint32_t nreads, const char *name)
125 {
126 uint32_t elem_bits = 0;
127 uint32_t row_len = 0;
128 uint32_t boff = 0;
129
130 rc_t rc = VCursorCellDataDirect(self, row_id, col_idx,
131 &elem_bits, base, &boff, &row_len);
132
133 if (rc != 0) {
134 PLOGERR(klogInt, (klogInt, rc,
135 "Error during VCursorCellDataDirect($(name), $(spot))",
136 "name=%s,spot=%lu", name, col_idx));
137 }
138 else if (boff != 0 || elem_bits != elemBits) {
139 rc = RC(rcSRA, rcCursor, rcReading, rcData, rcUnexpected);
140 PLOGERR(klogInt, (klogInt, rc,
141 "Bad VCursorCellDataDirect($(name), $(spot)) result: "
142 "boff=$(boff), elem_bits=$(elem_bits)",
143 "name=%s,spot=%lu,boff=%u,elem_bits=%u",
144 name, col_idx, boff, elem_bits));
145 }
146 else if (row_len != nreads) {
147 rc = RC(rcSRA, rcCursor, rcReading, rcData, rcUnexpected);
148 PLOGERR(klogInt, (klogInt, rc,
149 "Bad VCursorCellDataDirect($(name), $(spot)) result: "
150 "row_len=$(row_len)",
151 "name=%s,spot=%lu,row_len=%u", name, col_idx, row_len));
152 }
153
154 return rc;
155 }
156
157 /******************************************************************************/
158
_VDatabaseOpenAlignmentTable(const VDatabase * self,const char * path,const VTable ** tbl)159 static VdbBlastStatus _VDatabaseOpenAlignmentTable(const VDatabase *self,
160 const char *path,
161 const VTable **tbl)
162 {
163 const char *table = "PRIMARY_ALIGNMENT";
164
165 rc_t rc = VDatabaseOpenTableRead(self, tbl, table);
166 if (rc != 0) {
167 PLOGERR(klogInt, (klogInt, rc,
168 "Error in VDatabaseOpenTableRead($(name), $(tbl))",
169 "name=%s,tbl=%s", path, table));
170 STSMSG(1, ("Error: failed to open DB table '%s/%s'", path, table));
171 }
172 else {
173 STSMSG(1, ("Opened DB table '%s/%s'", path, table));
174 }
175
176 return rc != 0 ? eVdbBlastErr : eVdbBlastNoErr;;
177 }
178
179 /******************************************************************************/
180
181 typedef struct {
182 const VCursor *curs;
183 uint32_t colREAD_LEN;
184 uint32_t colREAD_TYPE;
185 uint64_t techBasesPerSpot;
186 bool techBasesPerSpotEquals;
187 uint64_t bioBasesCnt;
188 } ApprCnt;
_ApprCntInit(ApprCnt * self,const VTable * tbl,const char * acc)189 static rc_t _ApprCntInit(ApprCnt *self, const VTable *tbl, const char *acc) {
190 rc_t rc = 0;
191
192 assert(self);
193
194 memset(self, 0, sizeof *self);
195
196 self->techBasesPerSpotEquals = true;
197 self->techBasesPerSpot = ~0;
198
199 if (rc == 0) {
200 rc = VTableCreateCursorRead(tbl, &self->curs);
201 if (rc != 0) {
202 PLOGERR(klogInt, (klogInt, rc,
203 "Error during VTableCreateCursorRead for $(acc)",
204 "acc=%s", acc));
205 }
206 }
207
208 if (rc == 0) {
209 const char name[] = "READ_LEN";
210 rc = VCursorAddColumn(self->curs, &self->colREAD_LEN, name);
211 if (rc != 0) {
212 PLOGERR(klogInt, (klogInt, rc,
213 "Error during VCursorAddColumn($(name))", "name=%s", name));
214 }
215 }
216
217 if (rc == 0) {
218 const char name[] = "READ_TYPE";
219 rc = VCursorAddColumn(self->curs, &self->colREAD_TYPE, name);
220 if (rc != 0) {
221 PLOGERR(klogInt, (klogInt, rc,
222 "Error during VCursorAddColumn($(name))", "name=%s", name));
223 }
224 }
225
226 if (rc == 0) {
227 rc = VCursorOpen(self->curs);
228 if (rc != 0) {
229 LOGERR(klogInt, rc, "Error during VCursorOpen");
230 }
231 }
232
233 return rc;
234 }
235
_ApprCntFini(ApprCnt * self)236 static rc_t _ApprCntFini(ApprCnt *self) {
237 rc_t rc = 0;
238 assert(self);
239 RELEASE(VCursor, self->curs);
240 memset(self, 0, sizeof *self);
241 return rc;
242 }
243
_ApprCntChunk(ApprCnt * self,uint64_t chunk,uint64_t l,uint64_t nspots,uint32_t nreads)244 static rc_t _ApprCntChunk(ApprCnt *self,
245 uint64_t chunk, uint64_t l, uint64_t nspots, uint32_t nreads)
246 {
247 rc_t rc = 0;
248 uint64_t start = nspots / 10 * chunk + 1;
249 uint64_t spot = 0;
250 uint64_t end = start + l;
251 assert(self);
252 if (end - 1 > nspots) {
253 end = nspots + 1;
254 }
255 for (spot = start; spot < end; ++spot) {
256 uint64_t techBases = 0;
257 uint64_t bioBases = 0;
258 uint32_t read = 0;
259 const uint32_t *readLen = NULL;
260 const INSDC_read_type *readType = NULL;
261 const void *base = NULL;
262 rc = _VCursorCellDataDirect(self->curs, spot, self->colREAD_LEN,
263 32, &base, nreads, "READ_LEN");
264 if (rc != 0) {
265 return rc;
266 }
267 readLen = base;
268 rc = _VCursorCellDataDirect(self->curs, spot, self->colREAD_TYPE,
269 8, &base, nreads, "READ_TYPE");
270 if (rc != 0) {
271 return rc;
272 }
273 readType = base;
274 for (read = 0; read < nreads; ++read) {
275 INSDC_read_type type = readType[read] & 1;
276 if (type == READ_TYPE_BIOLOGICAL) {
277 bioBases += readLen[read];
278 }
279 else {
280 techBases += readLen[read];
281 }
282 }
283 if (self->techBasesPerSpotEquals) {
284 if (self->techBasesPerSpot != techBases) {
285 if (self->techBasesPerSpot == ~0) {
286 self->techBasesPerSpot = techBases;
287 }
288 else {
289 self->techBasesPerSpotEquals = false;
290 }
291 }
292 }
293 self->bioBasesCnt += bioBases;
294 }
295 return rc;
296 }
297
298 /******************************************************************************/
299
_VTableLogRowData(const VTable * self,const char * column,void * buffer,uint32_t blen)300 static rc_t _VTableLogRowData(const VTable *self,
301 const char *column, void *buffer, uint32_t blen)
302 {
303 rc_t rc = 0;
304
305 #if _DEBUGGING
306 if (buffer && blen == 64) {
307 uint64_t data = *((uint64_t*)buffer);
308 const KTable *ktbl = NULL;
309 rc_t rc = VTableOpenKTableRead(self, &ktbl);
310 if (rc == 0) {
311 const char *path = NULL;
312 rc = KTableGetPath(ktbl, &path);
313 if (rc == 0) {
314 DBGMSG(DBG_BLAST, DBG_FLAG(DBG_BLAST_BLAST),
315 ("%s: %s: %lu\n", path, column, data));
316 }
317 }
318
319 KTableRelease(ktbl);
320 }
321 #endif
322
323 if (rc != 0)
324 { PLOGERR(klogInt, (klogInt, rc, "Error in $(f)", "f=%s", __func__)); }
325
326 return rc;
327 }
328
_VTableMakeCursorImpl(const VTable * self,const VCursor ** curs,uint32_t * col_idx,const char * col_name,bool canBeMissed,const char * acc)329 static rc_t _VTableMakeCursorImpl(const VTable *self, const VCursor **curs,
330 uint32_t *col_idx, const char *col_name, bool canBeMissed, const char *acc)
331 {
332 rc_t rc = 0;
333
334 assert(curs && col_name);
335
336 if (rc == 0) {
337 rc = VTableCreateCursorRead(self, curs);
338 if (rc != 0) {
339 LOGERR(klogInt, rc, "Error during VTableCreateCursorRead");
340 }
341 }
342
343 if (rc == 0) {
344 VCursorPermitPostOpenAdd(*curs);
345 if (rc != 0) {
346 LOGERR(klogInt, rc, "Error during VCursorPermitPostOpenAdd");
347 }
348 }
349
350 if (rc == 0) {
351 rc = VCursorOpen(*curs);
352 if (rc != 0) {
353 PLOGERR(klogInt, (klogInt, rc,
354 "Error in VCursorOpen($(name))", "name=%s", col_name));
355 }
356 }
357
358 if (rc == 0) {
359 assert(*curs);
360 rc = VCursorAddColumn(*curs, col_idx, "%s", col_name);
361 if (rc != 0 && !canBeMissed) {
362 PLOGERR(klogInt, (klogInt, rc,
363 "Error in VCursorAddColumn($(name))", "name=%s", col_name));
364 }
365 }
366
367 STSMSG(2, ("Prepared a VCursor to read '%s'", col_name));
368
369 return rc;
370 }
371
_VTableMakeCursor(const VTable * self,const VCursor ** curs,uint32_t * col_idx,const char * col_name,const char * acc)372 rc_t _VTableMakeCursor(const VTable *self, const VCursor **curs,
373 uint32_t *col_idx, const char *col_name, const char *acc)
374 {
375 return _VTableMakeCursorImpl(self, curs, col_idx, col_name, false, acc);
376 }
377
378 /*static*/
_VTableReadFirstRowImpl(const VTable * self,const char * name,void * buffer,uint32_t blen,EColType * is_static,bool canBeMissed,uint32_t * row_len,const char * acc)379 uint32_t _VTableReadFirstRowImpl(const VTable *self, const char *name,
380 void *buffer, uint32_t blen, EColType *is_static, bool canBeMissed,
381 uint32_t *row_len, const char *acc)
382 {
383 VdbBlastStatus status = eVdbBlastNoErr;
384
385 rc_t rc = 0;
386
387 const VCursor *curs = NULL;
388 uint32_t idx = 0;
389
390 uint32_t dummy = 0;
391 if (row_len == NULL) {
392 row_len = &dummy;
393 }
394
395 assert(self && name);
396
397 blen *= 8;
398
399 rc = _VTableMakeCursorImpl(self, &curs, &idx, name, canBeMissed, acc);
400 if (rc != 0) {
401 if (rc ==
402 SILENT_RC(rcVDB, rcCursor, rcOpening, rcColumn, rcUndefined)
403 || rc ==
404 SILENT_RC(rcVDB, rcCursor, rcUpdating, rcColumn, rcNotFound))
405 {
406 if (!canBeMissed) {
407 PLOGMSG(klogInfo, (klogInfo, "$(f): Column '$(name)' not found",
408 "f=%s,name=%s", __func__, name));
409 }
410 if (is_static != NULL) {
411 *is_static = eColTypeAbsent;
412 }
413 status = eVdbBlastTooExpensive;
414 }
415 else {
416 status = eVdbBlastErr;
417 if (is_static != NULL) {
418 *is_static = eColTypeError;
419 }
420 }
421 }
422
423 if (status == eVdbBlastNoErr && rc == 0) {
424 rc = VCursorOpenRow(curs);
425 if (rc != 0) {
426 PLOGERR(klogInt, (klogInt, rc,
427 "Error in VCursorOpenRow($(name))", "name=%s", name));
428 }
429 }
430
431 if (status == eVdbBlastNoErr && rc == 0 && buffer != NULL && blen > 0) {
432 rc = VCursorRead(curs, idx, 8, buffer, blen, row_len);
433 if (rc != 0) {
434 PLOGERR(klogInt, (klogInt, rc,
435 "Error in VCursorRead($(name))", "name=%s", name));
436 }
437 }
438
439 /* TODO: needs to be verified what row_len is expected
440 if (*row_len != 1) return eVdbBlastErr; */
441
442 STSMSG(2, ("Read the first row of '%s'", name));
443
444 if (status == eVdbBlastNoErr && rc == 0) {
445 if (blen == 64) {
446 _VTableLogRowData(self, name, buffer, blen);
447 }
448 if (is_static != NULL) {
449 bool isStatic = false;
450 rc = VCursorIsStaticColumn(curs, idx, &isStatic);
451 if (rc != 0) {
452 PLOGERR(klogInt, (klogInt, rc,
453 "Error in VCursorIsStaticColumn($(name))",
454 "name=%s", name));
455 }
456 else {
457 *is_static = isStatic ? eColTypeStatic : eColTypeNonStatic;
458 }
459 }
460 }
461
462 VCursorRelease(curs);
463
464 if (status == eVdbBlastNoErr && rc != 0) {
465 status = eVdbBlastErr;
466 }
467 return status;
468 }
469
_VTableReadFirstRow(const VTable * self,const char * name,void * buffer,uint32_t blen,EColType * is_static,const char * acc)470 static VdbBlastStatus _VTableReadFirstRow(const VTable *self, const char *name,
471 void *buffer, uint32_t blen, EColType *is_static, const char *acc)
472 {
473 return _VTableReadFirstRowImpl
474 (self, name, buffer, blen, is_static, false, NULL, acc);
475 }
476
_VTableReadPLATFORMoptional(const VTable * self,const char * acc)477 static INSDC_SRA_platform_id _VTableReadPLATFORMoptional
478 (const VTable *self, const char *acc)
479 {
480 INSDC_SRA_platform_id platform = SRA_PLATFORM_UNDEFINED;
481
482 VdbBlastStatus status = _VTableReadFirstRowImpl(self,
483 "PLATFORM", &platform, sizeof platform, NULL, true, NULL, acc);
484
485 if (status == eVdbBlastNoErr) {
486 return platform;
487 }
488 else {
489 return SRA_PLATFORM_UNDEFINED;
490 }
491 }
492
_VTableVarReadNum(const VTable * self,const char * acc)493 static bool _VTableVarReadNum(const VTable *self, const char *acc) {
494 return _VTableReadPLATFORMoptional(self, acc) == SRA_PLATFORM_PACBIO_SMRT;
495 }
496
497 static uint64_t BIG = 10000 /*11*/;
_VTableBioBaseCntApprox(const VTable * self,uint64_t nspots,uint32_t nreads,uint64_t * bio_base_count,const char * acc)498 static rc_t _VTableBioBaseCntApprox(const VTable *self,
499 uint64_t nspots, uint32_t nreads, uint64_t *bio_base_count, const char *acc)
500 {
501 rc_t rc = 0;
502 uint64_t c = nspots;
503 uint64_t d = 1;
504 ApprCnt ac;
505 rc = _ApprCntInit(&ac, self, acc);
506 assert(bio_base_count);
507 if (rc == 0) {
508 if (nspots < BIG) {
509 STSMSG(2,
510 ("VdbBlastRunSetGetTotalLengthApprox: counting all reads\n"));
511 rc = _ApprCntChunk(&ac, 0, nspots, nspots, nreads);
512 }
513 else {
514 uint64_t i = 0;
515 uint64_t l = 0;
516 uint64_t n = 10 /* 3 */;
517 while (c > BIG) {
518 c /= 2;
519 d *= 2;
520 }
521 l = c / n;
522 if (l == 0) {
523 l = 1;
524 }
525 for (i = 1; i < n - 1 && rc == 0; ++i) {
526 rc = _ApprCntChunk(&ac, i, l, nspots, nreads);
527 }
528 }
529 }
530 if (rc == 0) {
531 VdbBlastStatus status = eVdbBlastNoErr;
532 if (nspots < BIG) {
533 *bio_base_count = ac.bioBasesCnt;
534 }
535 else {
536 if (ac.techBasesPerSpotEquals && ac.techBasesPerSpot == ~0) {
537 ac.techBasesPerSpotEquals = false;
538 }
539 if (ac.techBasesPerSpotEquals ) {
540 uint64_t baseCount = 0;
541 status = _VTableReadFirstRow(self,
542 "BASE_COUNT", &baseCount, sizeof baseCount, NULL, acc);
543 if (status == eVdbBlastNoErr) {
544 STSMSG(2, ("VdbBlastRunSetGetTotalLengthApprox: "
545 "fixed technical reads length\n"));
546 *bio_base_count = baseCount - nspots * ac.techBasesPerSpot;
547 }
548 else {
549 STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox: "
550 "cannot read BASE_COUNT\n"));
551 }
552 }
553 if (!ac.techBasesPerSpotEquals || status != eVdbBlastNoErr) {
554 /* double dd = nspots / c; */
555 STSMSG(2, ("VdbBlastRunSetGetTotalLengthApprox: "
556 "extrapolating variable read length\n"));
557 *bio_base_count = ac.bioBasesCnt * d;
558 }
559 }
560 }
561 _ApprCntFini(&ac);
562 return rc;
563 }
564
565 static
_VTableGetNReads(const VTable * self,uint32_t * nreads,const char * acc)566 uint32_t _VTableGetNReads(const VTable *self, uint32_t *nreads, const char *acc)
567 {
568 rc_t rc = 0;
569 VdbBlastStatus status = eVdbBlastNoErr;
570 const VCursor *curs = NULL;
571 uint32_t idx = 0;
572
573 const char name[] = "READ_LEN";
574
575 assert(nreads);
576
577 rc = _VTableMakeCursor(self, &curs, &idx, name, acc);
578 if (rc != 0) {
579 status = eVdbBlastErr;
580 if (rc ==
581 SILENT_RC(rcVDB, rcCursor, rcOpening, rcColumn, rcUndefined))
582 {
583 PLOGMSG(klogInfo, (klogInfo, "$(f): Column '$(name)' not found",
584 "f=%s,name=%s", __func__, name));
585 }
586 else {
587 PLOGMSG(klogInfo, (klogInfo, "$(f): Cannot open column '$(name)'",
588 "f=%s,name=%s", __func__, name));
589 }
590 }
591
592 if (status == eVdbBlastNoErr) {
593 uint32_t elem_bits, elem_off, elem_cnt;
594 const void *base = NULL;
595 rc = VCursorCellDataDirect(curs, 1, idx,
596 &elem_bits, &base, &elem_off, &elem_cnt);
597 if (rc != 0) {
598 status = eVdbBlastErr;
599 PLOGMSG(klogInfo, (klogInfo,
600 "$(f): Cannot '$(name)' CellDataDirect",
601 "f=%s,name=%s", __func__, name));
602 }
603 else if (elem_off != 0 || elem_bits != 32) {
604 status = eVdbBlastErr;
605 PLOGERR(klogInt, (klogInt, rc,
606 "Bad VCursorCellDataDirect(READ_LEN) result: "
607 "boff=$(elem_off), elem_bits=$(elem_bits)",
608 "elem_off=%u, elem_bits=%u", elem_off, elem_bits));
609 }
610 else {
611 *nreads = elem_cnt;
612 }
613 }
614
615 RELEASE(VCursor, curs);
616
617 return status;
618 }
619
_VTableCSra(const VTable * self)620 static bool _VTableCSra(const VTable *self) {
621 bool cSra = false;
622
623 KNamelist *names = NULL;
624
625 uint32_t i = 0, count = 0;
626
627 rc_t rc = VTableListPhysColumns(self, &names);
628
629 if (rc == 0) {
630 rc = KNamelistCount(names, &count);
631 }
632
633 for (i = 0 ; i < count && rc == 0; ++i) {
634 const char *name = NULL;
635 rc = KNamelistGet(names, i, &name);
636 if (rc == 0) {
637 const char b[] = "CMP_READ";
638 if (string_cmp(name, string_measure(name, NULL),
639 b, sizeof b - 1, sizeof b - 1) == 0)
640 {
641 cSra = true;
642 break;
643 }
644 }
645 }
646
647 RELEASE(KNamelist, names);
648
649 return cSra;
650 }
651
652 /******************************************************************************/
653
_RunDescFini(RunDesc * self)654 static void _RunDescFini(RunDesc *self) {
655 assert(self);
656 free(self->readLen);
657 free(self->readType);
658 free(self->rdFilter);
659 memset(self, 0, sizeof *self);
660 }
661
662 static
_RunDescMakeReadId(const RunDesc * self,uint64_t spot,uint8_t read,uint64_t read_id,VdbBlastStatus * status)663 uint64_t _RunDescMakeReadId(const RunDesc *self, uint64_t spot, uint8_t read,
664 uint64_t read_id, VdbBlastStatus *status)
665 {
666 bool overflow = false;
667
668 assert(self && status);
669
670 *status = eVdbBlastNoErr;
671
672 if (self->readIdDesc.idType == eFixedReadN) {
673 return read_id;
674 }
675
676 assert(self->readIdDesc.idType == eFactor10);
677
678 read_id = read * self->spotBits + spot;
679 if (read_id < self->spotBits) {
680 overflow = true;
681 }
682
683 if (self->readIdDesc.runBits > 0) {
684 uint64_t prev = read_id;
685 read_id *= self->readIdDesc.runBits;
686 read_id += self->index;
687 if (read_id < prev) {
688 overflow = true;
689 }
690 }
691
692 if (overflow) {
693 *status = eVdbBlastErr;
694 }
695
696 return read_id;
697 }
698
699 /******************************************************************************/
700 /* VdbBlastDb */
_VdbBlastDbWhack(VdbBlastDb * self)701 static void _VdbBlastDbWhack(VdbBlastDb *self) {
702 if (self == NULL) {
703 return;
704 }
705
706 VCursorRelease(self->cursACCESSION);
707 VCursorRelease(self->cursSeq);
708
709 VTableRelease(self->prAlgnTbl);
710 VTableRelease(self->refTbl);
711 VTableRelease(self->seqTbl);
712
713 VDatabaseRelease(self->db);
714
715 memset(self, 0, sizeof *self);
716
717 free(self);
718 }
719
_VdbBlastDbOpenSeqCurs(VdbBlastDb * self,const char * acc)720 static rc_t _VdbBlastDbOpenSeqCurs(VdbBlastDb *self, const char *acc) {
721 rc_t rc = 0;
722 const VCursor *curs = NULL;
723 assert(self);
724 if (self->cursSeq != NULL) {
725 return 0;
726 }
727 rc = VTableCreateCursorRead(self->seqTbl, &self->cursSeq);
728 if (rc != 0) {
729 PLOGERR(klogInt, (klogInt, rc,
730 "Error during VTableCreateCursorRead for $(acc)", "acc=%s", acc));
731 }
732 else {
733 curs = self->cursSeq;
734 }
735 if (rc == 0) {
736 const char name[] = "READ_FILTER";
737 rc = VCursorAddColumn(curs, &self->col_READ_FILTER, name);
738 if (rc != 0) {
739 PLOGERR(klogInt, (klogInt, rc,
740 "Error during VCursorAddColumn($(name))", "name=%s", name));
741 }
742 }
743 if (rc == 0) {
744 const char name[] = "READ_LEN";
745 rc = VCursorAddColumn(curs, &self->col_READ_LEN, name);
746 if (rc != 0) {
747 PLOGERR(klogInt, (klogInt, rc,
748 "Error during VCursorAddColumn($(name))", "name=%s", name));
749 }
750 }
751 if (rc == 0) {
752 const char name[] = "READ_TYPE";
753 rc = VCursorAddColumn(curs, &self->col_READ_TYPE, name);
754 if (rc != 0) {
755 PLOGERR(klogInt, (klogInt, rc,
756 "Error during VCursorAddColumn($(name))", "name=%s", name));
757 }
758 }
759 if (rc == 0) {
760 const char name[] = "TRIM_LEN";
761 rc = VCursorAddColumn(curs, &self->col_TRIM_LEN, name);
762 if (rc != 0) {
763 PLOGERR(klogInt, (klogInt, rc,
764 "Error during VCursorAddColumn($(name))", "name=%s", name));
765 }
766 }
767 if (rc == 0) {
768 const char name[] = "TRIM_START";
769 rc = VCursorAddColumn(curs, &self->col_TRIM_START, name);
770 if (rc != 0) {
771 PLOGERR(klogInt, (klogInt, rc,
772 "Error during VCursorAddColumn($(name))", "name=%s", name));
773 }
774 }
775 if (rc == 0) {
776 rc = VCursorOpen(curs);
777 if (rc != 0) {
778 LOGERR(klogInt, rc, "Error during VCursorOpen");
779 }
780 }
781 return rc;
782 }
783
784 typedef enum {
785 eFirstRead,
786 eNextRead,
787 eAllReads,
788 } TReadType;
_VdbBlastDbFindRead(const VdbBlastDb * self,ReadDesc * rd,TReadType type,bool * found)789 static rc_t _VdbBlastDbFindRead
790 (const VdbBlastDb *self, ReadDesc *rd, TReadType type, bool *found)
791 {
792 rc_t rc = 0;
793
794 int64_t first = 0;
795 uint64_t count = 0;
796
797 const VCursor *curs = NULL;
798
799 assert(self && rd && found);
800
801 *found = false;
802
803 if (type == eAllReads) {
804 memset(rd, 0, sizeof *rd);
805 }
806
807 curs = self->cursSeq;
808
809 rc = VCursorIdRange(curs, self->col_READ_LEN, &first, &count);
810
811 if (rc == 0) {
812 int64_t i = 0;
813 uint32_t c = 0;
814 if (type == eNextRead) {
815 if (rd->spot < first) {
816 return -1;
817 }
818 i = rd->spot - first;
819 c = rd->read;
820 if (c + 1 >= rd->nReads) {
821 c = 0;
822 ++i;
823 }
824 }
825 for (; i < count; ++i) {
826 int64_t spot = first + i;
827
828 const void *base = NULL;
829 uint32_t elem_bits = 0;
830 uint32_t elem_cnt = 0;
831 uint32_t elem_off = 0;
832 rc = VCursorCellDataDirect(curs, spot, self->col_READ_TYPE,
833 &elem_bits, &base, &elem_off, &elem_cnt);
834 if (rc != 0) {
835 break;
836 }
837 else {
838 INSDC_read_type *aRt = (INSDC_read_type*)base;
839 for (; c < elem_cnt; ++c) {
840 INSDC_read_type *rt = aRt + c;
841 assert(rt);
842
843 if (*rt & SRA_READ_TYPE_BIOLOGICAL) {
844 *found = true;
845 if (type == eAllReads) {
846 ++rd->spot;
847 }
848 else {
849 rd->spot = spot;
850 rd->read = c + 1;
851 rd->nReads = elem_cnt;
852 return rc;
853 }
854 }
855 }
856 c = 0;
857 }
858 }
859 }
860
861 return rc;
862 }
863
_VdbBlastDbFindFirstRead(const VdbBlastDb * self,ReadDesc * rd,bool * found)864 static rc_t _VdbBlastDbFindFirstRead
865 (const VdbBlastDb *self, ReadDesc *rd, bool *found)
866 { return _VdbBlastDbFindRead(self, rd, eFirstRead, found); }
_VdbBlastDbFindNextRead(const VdbBlastDb * self,ReadDesc * rd,bool * found)867 static rc_t _VdbBlastDbFindNextRead
868 (const VdbBlastDb *self, ReadDesc *rd, bool *found)
869 { return _VdbBlastDbFindRead(self, rd, eNextRead , found); }
870
_ReadDescFindNextRead(ReadDesc * self,bool * found)871 rc_t _ReadDescFindNextRead(ReadDesc *self, bool *found) {
872 assert(self && self->run && self->run);
873 return _VdbBlastDbFindNextRead(self->run->obj, self, found);
874 }
875
_VdbBlastDbOpenCursAndGetFirstRead(VdbBlastDb * self,ReadDesc * desc,bool * found,const char * acc)876 static rc_t _VdbBlastDbOpenCursAndGetFirstRead
877 (VdbBlastDb *self, ReadDesc *desc, bool *found, const char *acc)
878 {
879 rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
880 if (rc == 0) {
881 rc = _VdbBlastDbFindFirstRead(self, desc, found);
882 }
883 return rc;
884 }
885
_VdbBlastDbOpenCursAndGetAllReads(VdbBlastDb * self,ReadDesc * desc,const char * acc)886 static rc_t _VdbBlastDbOpenCursAndGetAllReads
887 (VdbBlastDb *self, ReadDesc *desc, const char *acc)
888 {
889 rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
890 if (rc == 0) {
891 bool found = false;
892 rc = _VdbBlastDbFindRead(self, desc, eAllReads, &found);
893 }
894 return rc;
895 }
896
_VdbBlastRunSetGetAllReads(const VdbBlastRunSet * self,uint32_t run)897 uint64_t _VdbBlastRunSetGetAllReads
898 (const VdbBlastRunSet *self, uint32_t run)
899 {
900 ReadDesc desc;
901
902 assert(self);
903
904 if (run >= self->runs.krun) {
905 return 0;
906 }
907
908 if (_VdbBlastDbOpenCursAndGetAllReads
909 (self->runs.run[run].obj, &desc, self->runs.run[run].acc))
910 {
911 return 0;
912 }
913
914 return desc.spot;
915 }
916
_VdbBlastDbGetNReads(VdbBlastDb * self,uint64_t spot,uint32_t * nReads,const char * acc)917 static rc_t _VdbBlastDbGetNReads
918 (VdbBlastDb *self, uint64_t spot, uint32_t *nReads, const char *acc)
919 {
920 rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
921 assert(self && nReads);
922 *nReads = 0;
923 if (rc == 0) {
924 const void *base = NULL;
925 uint32_t elem_bits = 0;
926 uint32_t elem_cnt = 0;
927 uint32_t elem_off = 0;
928 rc = VCursorCellDataDirect(self->cursSeq, spot, self->col_READ_TYPE,
929 &elem_bits, &base, &elem_off, &elem_cnt);
930 if (rc == 0) {
931 *nReads = elem_cnt;
932 }
933 else {
934 PLOGERR(klogInt, (klogInt, rc, "Cannot get "
935 "CellDataDirect for $(acc)/READ_TYPE/$(spot)",
936 "acc=%s,spot=%lu", acc, spot));
937 }
938 }
939 return rc;
940 }
941
942 /******************************************************************************/
943 /*VdbBlastRun*/
944
_VdbBlastRunFini(VdbBlastRun * self)945 static void _VdbBlastRunFini(VdbBlastRun *self) {
946 if (self == NULL) {
947 return;
948 }
949
950 _VdbBlastDbWhack(self->obj);
951
952 free(self->acc);
953 free(self->path);
954
955 _RunDescFini(&self->rd);
956
957 memset(self, 0, sizeof *self);
958 }
959
_VdbBlastRunInit(VdbBlastRun * self,VdbBlastDb * obj,const char * rundesc,BTableType type,const KDirectory * dir,char * fullpath,uint32_t min_read_length,uint32_t index)960 static VdbBlastStatus _VdbBlastRunInit(VdbBlastRun *self, VdbBlastDb *obj,
961 const char *rundesc, BTableType type, const KDirectory *dir,
962 char *fullpath, uint32_t min_read_length, uint32_t index)
963 {
964 rc_t rc = 0;
965 const char *acc = rundesc;
966 char rbuff[4096] = "";
967 size_t size = 0;
968
969 char slash = '/';
970
971 assert(!dir && self && obj && type != btpUndefined && rundesc);
972
973 {
974 KDirectory *dir = NULL;
975 if (KDirectoryNativeDir(&dir) != 0) {
976 LOGERR(klogInt, rc, "Error during KDirectoryNativeDir");
977 return eVdbBlastErr;
978 }
979 /* TODO This is obsolete and incorrect */
980 rc = KDirectoryResolvePath(dir, true,
981 rbuff, sizeof rbuff, "%s", rundesc);
982 KDirectoryRelease(dir);
983 if (rc != 0) {
984 PLOGERR(klogInt, (klogInt, rc,
985 "Error during KDirectoryResolvePath($(path))",
986 "path=%s", rundesc));
987 return eVdbBlastErr;
988 }
989 }
990
991 memset(self, 0, sizeof *self);
992
993 self->rd.index = index;
994 self->obj = obj;
995 self->type = type;
996
997 self->alignments
998 = self->bioBases = self->bioBasesApprox
999 = self->bioReads = self->bioReadsApprox = ~0;
1000
1001 acc = strrchr(rbuff, slash);
1002 if (acc == NULL) {
1003 acc = rbuff;
1004 }
1005 else if (string_measure(acc, &size) > 1) {
1006 ++acc;
1007 }
1008 else {
1009 acc = rbuff;
1010 }
1011
1012 if (fullpath == NULL) {
1013 self->path = string_dup(rbuff, sizeof(rbuff));
1014 if (self->path == NULL) {
1015 return eVdbBlastMemErr;
1016 }
1017 }
1018 else {
1019 self->path = fullpath;
1020 }
1021 self->acc = _CanonocalName(acc);
1022 if (self->acc == NULL) {
1023 return eVdbBlastMemErr;
1024 }
1025
1026 self->min_read_length = min_read_length;
1027
1028 self->cSra = _VTableCSra(self->obj->seqTbl);
1029
1030 return eVdbBlastNoErr;
1031 }
1032
Bits(uint64_t n,EReadIdType idType)1033 static uint32_t Bits(uint64_t n, EReadIdType idType) {
1034 uint32_t bits = 1;
1035
1036 if (idType == eFixedReadN) {
1037 return 0;
1038 }
1039
1040 assert(idType == eFactor10);
1041
1042 if (n == 0) {
1043 return 0;
1044 }
1045
1046 while (n > 0) {
1047 bits *= 10;
1048 n /= 10;
1049 }
1050
1051 return bits;
1052 }
1053
_VdbBlastRunFillRunDesc(VdbBlastRun * self)1054 /*static*/ VdbBlastStatus _VdbBlastRunFillRunDesc(VdbBlastRun *self) {
1055 VdbBlastStatus status = eVdbBlastNoErr;
1056 RunDesc *rd = NULL;
1057
1058 int i = 0;
1059 const char *col = NULL;
1060
1061 assert(self);
1062
1063 rd = &self->rd;
1064
1065 if (rd->spotCount || rd->readType || rd->nReads || rd->nBioReads) {
1066 if (self->cSra && rd->cmpBaseCount == ~0) {
1067 rc_t rc = RC(rcSRA, rcTable, rcReading, rcColumn, rcNotFound);
1068 PLOGERR(klogInt, (klogInt, rc,
1069 "Cannot read CMP_BASE_COUNT column for $(p)",
1070 "p=%s", self->path));
1071 STSMSG(1, ("Error: failed to read %s/%s",
1072 self->path, "CMP_BASE_COUNT"));
1073 return eVdbBlastErr;
1074 }
1075 else {
1076 S
1077 return eVdbBlastNoErr;
1078 }
1079 }
1080
1081 assert(self->path && self->obj);
1082
1083 col = "SPOT_COUNT";
1084 status = _VTableReadFirstRow(self->obj->seqTbl,
1085 col, &rd->spotCount, sizeof rd->spotCount, NULL, self->acc);
1086 if (status != eVdbBlastNoErr) {
1087 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1088 return status;
1089 }
1090
1091 rd->spotBits = Bits(rd->spotCount, rd->readIdDesc.idType);
1092
1093 if (self->type == btpWGS) {
1094 S
1095 status = eVdbBlastNoErr;
1096 rd->nReads = rd->spotCount > 0 ? 1 : 0;
1097 }
1098 else if (self->type == btpREFSEQ) {
1099 S
1100 status = eVdbBlastNoErr;
1101 rd->nReads = 1;
1102 }
1103 else {
1104 uint32_t nreads = 0;
1105 status = _VTableGetNReads(self->obj->seqTbl, &nreads, self->acc);
1106 if (status == eVdbBlastNoErr) {
1107 rd->nReads = nreads;
1108 }
1109 }
1110
1111 switch (self->type) {
1112 case btpSRA:
1113 col = "PLATFORM";
1114 status = _VTableReadFirstRow(self->obj->seqTbl,
1115 col, &rd->platform, sizeof rd->platform, NULL, self->acc);
1116 if (status != eVdbBlastNoErr) {
1117 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1118 return status;
1119 }
1120 switch (rd->platform) { /* TODO */
1121 case SRA_PLATFORM_ILLUMINA:
1122 case SRA_PLATFORM_ABSOLID:
1123 case SRA_PLATFORM_COMPLETE_GENOMICS:
1124 rd->varReadLen = false;
1125 break;
1126 case SRA_PLATFORM_UNDEFINED:
1127 case SRA_PLATFORM_454:
1128 case SRA_PLATFORM_HELICOS:
1129 case SRA_PLATFORM_PACBIO_SMRT:
1130 case SRA_PLATFORM_ION_TORRENT:
1131 case SRA_PLATFORM_CAPILLARY:
1132 case SRA_PLATFORM_OXFORD_NANOPORE:
1133 default:
1134 rd->varReadLen = true;
1135 break;
1136 }
1137 break;
1138 case btpWGS:
1139 rd->varReadLen = true;
1140 break;
1141 case btpREFSEQ:
1142 break;
1143 default:
1144 assert(0);
1145 break;
1146 }
1147
1148 col = "READ_TYPE";
1149 if (rd->readType == NULL) {
1150 rd->readType = calloc(rd->nReads, sizeof *rd->readType);
1151 if (rd->readType == NULL)
1152 { return eVdbBlastMemErr; }
1153 }
1154 {
1155 bool optional = self->type == btpREFSEQ;
1156 status = _VTableReadFirstRowImpl(self->obj->seqTbl, col, rd->readType,
1157 rd->nReads * sizeof *rd->readType, &rd->readTypeStatic,
1158 optional, NULL, self->acc);
1159 }
1160 /* TODO: check case when ($#READ_TYPE == 0 && nreads > 0) */
1161 if (status != eVdbBlastNoErr) {
1162 if (status == eVdbBlastTooExpensive) {
1163 status = eVdbBlastNoErr;
1164 }
1165 else {
1166 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1167 return status;
1168 }
1169 }
1170
1171 col = "READ_LEN";
1172 if (rd->readLen == NULL) {
1173 rd->readLen = calloc(rd->nReads, sizeof *rd->readLen);
1174 if (rd->readLen == NULL)
1175 { return eVdbBlastMemErr; }
1176 }
1177 status = _VTableReadFirstRow(self->obj->seqTbl, col, rd->readLen,
1178 rd->nReads * sizeof *rd->readLen, &rd->readLenStatic, self->acc);
1179 /* TODO: check case when ($#READ_TYPE == 0 && nreads > 0) */
1180 if (status != eVdbBlastNoErr) {
1181 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1182 return status;
1183 }
1184
1185 col = "READ_FILTER"; /* col = "RD_FILTER"; */
1186 if (rd->rdFilter == NULL) {
1187 rd->rdFilter = calloc(rd->nReads, sizeof *rd->rdFilter);
1188 if (rd->rdFilter == NULL)
1189 { return eVdbBlastMemErr; }
1190 }
1191 status = _VTableReadFirstRow(self->obj->seqTbl, col, rd->rdFilter,
1192 rd->nReads * sizeof *rd->rdFilter, &rd->rdFilterStatic, self->acc);
1193 if (status != eVdbBlastNoErr) {
1194 if (status == eVdbBlastTooExpensive) {
1195 status = eVdbBlastNoErr;
1196 }
1197 else {
1198 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1199 return status;
1200 }
1201 }
1202
1203 col = "BIO_BASE_COUNT";
1204 status = _VTableReadFirstRowImpl(self->obj->seqTbl, col, &rd->bioBaseCount,
1205 sizeof rd->bioBaseCount, NULL, true, NULL, self->acc);
1206 /* Do not generate error message when BIO_BASE_COUNT is not found */
1207
1208 if (status != eVdbBlastNoErr) {
1209 if (status == eVdbBlastTooExpensive) {
1210 status = eVdbBlastNoErr;
1211 rd->bioBaseCount = ~0;
1212 }
1213 else {
1214 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1215 return status;
1216 }
1217 }
1218
1219 col = "CMP_BASE_COUNT";
1220 status = _VTableReadFirstRow(self->obj->seqTbl, col,
1221 &rd->cmpBaseCount, sizeof rd->cmpBaseCount, NULL, self->acc);
1222 if (status != eVdbBlastNoErr) {
1223 if (status == eVdbBlastTooExpensive) {
1224 /* CMP_BASE_COUNT should be always found */
1225 rc_t rc = RC(rcSRA, rcTable, rcReading, rcColumn, rcNotFound);
1226 PLOGERR(klogInt, (klogInt, rc,
1227 "Cannot read CMP_BASE_COUNT column for $(p)",
1228 "p=%s", self->path));
1229 STSMSG(1, ("Error: failed to read %s/%s",
1230 self->path, "CMP_BASE_COUNT"));
1231 rd->cmpBaseCount = ~0;
1232 return eVdbBlastErr;
1233 }
1234 else {
1235 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1236 return status;
1237 }
1238 }
1239
1240 if (self->type == btpREFSEQ) {
1241 rd->nBioReads = 1;
1242 /* rd->bioLen is set incorrectly for refseq-s */
1243 }
1244 else {
1245 for (rd->nBioReads = 0, i = 0; i < rd->nReads; ++i) {
1246 S
1247 if (rd->readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
1248 if ((rd->rdFilterStatic == eColTypeStatic &&
1249 rd->rdFilter[i] == READ_FILTER_PASS)
1250 || (rd->rdFilterStatic == eColTypeAbsent))
1251 {
1252 ++rd->nBioReads;
1253 rd->bioLen += rd->readLen[i];
1254 }
1255 else {
1256 ++rd->nBioReads;
1257 }
1258 }
1259 }
1260 }
1261 S /* LOG nBioReads */
1262
1263 return status;
1264 }
1265
_VdbBlastRunVarReadNum(const VdbBlastRun * self)1266 bool _VdbBlastRunVarReadNum(const VdbBlastRun *self) {
1267 VdbBlastStatus status = _VdbBlastRunFillRunDesc((VdbBlastRun*)self);
1268 if (status != eVdbBlastNoErr) {
1269 LOGERR(klogInt, 1, "Cannot fill run descriptor");
1270 }
1271
1272 assert(self);
1273
1274 return self->rd.platform == SRA_PLATFORM_PACBIO_SMRT;
1275 }
1276
1277 /* _VdbBlastRunGetNumSequences
1278 returns (number of spots) * (number of biological reads in spot).
1279 If read_filter is not static: some reads could be filtered,
1280 so status is set to eVdbBlastTooExpensive */
1281 /*static*/
_VdbBlastRunGetNumSequences(VdbBlastRun * self,VdbBlastStatus * status)1282 uint64_t _VdbBlastRunGetNumSequences(VdbBlastRun *self,
1283 VdbBlastStatus *status)
1284 {
1285 assert(self && status);
1286
1287 *status = eVdbBlastNoErr;
1288
1289 if (self->bioReads == ~0) {
1290 RunDesc *rd = NULL;
1291
1292 if (self->type == btpREFSEQ) {
1293 S
1294 self->bioReads = 1;
1295 }
1296 else if (_VdbBlastRunVarReadNum(self)) {
1297 rc_t rc = 0;
1298 ReadDesc desc;
1299 memset(&desc, 0, sizeof desc);
1300 rc = _VdbBlastDbOpenCursAndGetAllReads(self->obj, &desc, self->acc);
1301 if (rc != 0) {
1302 *status = eVdbBlastErr;
1303 }
1304 else {
1305 self->bioReads = desc.spot;
1306 }
1307 }
1308 else {
1309 *status = _VdbBlastRunFillRunDesc(self);
1310 if (*status != eVdbBlastNoErr) {
1311 S
1312 return 0;
1313 }
1314
1315 rd = &self->rd;
1316
1317 if (rd->rdFilterStatic != eColTypeStatic) {
1318 self->bioReadsTooExpensive = true;
1319 }
1320
1321 if (self->cSra) {
1322 self->bioReadsTooExpensive = true;
1323 }
1324
1325 if (_VdbBlastRunVarReadNum(self)) {
1326 self->bioReads = 0;
1327 S
1328 }
1329 else {
1330 self->bioReads = rd->spotCount * rd->nBioReads;
1331 S
1332 }
1333 }
1334 }
1335 else {
1336 S
1337 }
1338
1339 if (*status == eVdbBlastNoErr && self->bioReadsTooExpensive) {
1340 *status = eVdbBlastTooExpensive;
1341 }
1342 return self->bioReads;
1343 }
1344
_VdbBlastRunCountBioBaseCount(VdbBlastRun * self,VdbBlastStatus * status)1345 static uint64_t _VdbBlastRunCountBioBaseCount(VdbBlastRun *self,
1346 VdbBlastStatus *status)
1347 {
1348 uint64_t bio_base_count = 0;
1349
1350 rc_t rc = _VTableBioBaseCntApprox(self->obj->seqTbl,
1351 self->rd.spotCount, self->rd.nReads, &bio_base_count, self->acc);
1352
1353 if (rc != 0) {
1354 *status = eVdbBlastErr;
1355 }
1356
1357 return bio_base_count;
1358 }
1359
_VdbBlastSraRunGetLengthApprox(VdbBlastRun * self,VdbBlastStatus * status)1360 static uint64_t _VdbBlastSraRunGetLengthApprox(VdbBlastRun *self,
1361 VdbBlastStatus *status)
1362 {
1363 assert(self && status);
1364
1365 *status = eVdbBlastNoErr;
1366
1367 if (self->bioBasesApprox == ~0) {
1368 if (! self->bioBasesTooExpensive && self->bioBases != ~0) {
1369 self->bioBasesApprox = self->bioBases;
1370 }
1371 else {
1372 RunDesc *rd = NULL;
1373 *status = _VdbBlastRunFillRunDesc(self);
1374 if (*status != eVdbBlastNoErr) {
1375 S
1376 return 0;
1377 }
1378
1379 rd = &self->rd;
1380 if (rd->nReads == 0) {
1381 S
1382 self->bioBasesApprox = 0;
1383 }
1384 else if (rd->varReadLen) {
1385 S
1386 self->bioBasesApprox
1387 = _VdbBlastRunCountBioBaseCount(self, status);
1388 }
1389 else {
1390 if (self->type == btpREFSEQ) {
1391 if (rd->bioBaseCount == ~0) {
1392 S
1393 *status = eVdbBlastErr;
1394 }
1395 else {
1396 self->bioBasesApprox = rd->bioBaseCount;
1397 }
1398 }
1399 else {
1400 uint8_t read = 0;
1401 S
1402 for (read = 0, self->bioBasesApprox = 0;
1403 read < rd->nReads; ++read)
1404 {
1405 if (rd->readType[read] & SRA_READ_TYPE_BIOLOGICAL) {
1406 self->bioBasesApprox += rd->readLen[read];
1407 }
1408 }
1409 self->bioBasesApprox *= rd->spotCount;
1410 }
1411 }
1412 }
1413 }
1414
1415 return self->bioBasesApprox;
1416 }
1417
_VdbBlastRunGetNumSequencesApprox(VdbBlastRun * self,VdbBlastStatus * status)1418 static uint64_t _VdbBlastRunGetNumSequencesApprox(VdbBlastRun *self,
1419 VdbBlastStatus *status)
1420 {
1421
1422 assert(self && status);
1423
1424 *status = eVdbBlastNoErr;
1425
1426 if (self->bioReadsApprox == ~0) {
1427 RunDesc *rd = NULL;
1428
1429 if (! self->bioReadsTooExpensive && self->bioReads != ~0) {
1430 self->bioReadsApprox = self->bioReads;
1431 }
1432 else if (self->type == btpREFSEQ) {
1433 S
1434 self->bioReadsApprox = 1;
1435 }
1436 else if (self->cSra) {
1437 /* Number of Bio Reads for cSra == number of CMP reads
1438 = Number of all bio reads * CMP_BASE_COUNT / BIO_BASE_COUNT
1439 Number of all bio reads = nSpots * n of bio reads per spot */
1440 double r = 0;
1441 uint64_t n = 0;
1442
1443 *status = _VdbBlastRunFillRunDesc(self);
1444 if (*status != eVdbBlastNoErr) {
1445 S
1446 return 0;
1447 }
1448
1449 rd = &self->rd;
1450
1451 n = _VdbBlastSraRunGetLengthApprox(self, status);
1452 if (*status != eVdbBlastNoErr) {
1453 S
1454 return 0;
1455 }
1456 r = rd->cmpBaseCount * rd->spotCount * rd->nBioReads;
1457 r /= n;
1458 self->bioReadsApprox = r;
1459 }
1460 else if (_VdbBlastRunVarReadNum(self)) {
1461 self->bioReads = _VdbBlastRunGetNumSequences(self, status);
1462 }
1463 else {
1464 *status = _VdbBlastRunFillRunDesc(self);
1465 if (*status != eVdbBlastNoErr) {
1466 S
1467 return 0;
1468 }
1469
1470 rd = &self->rd;
1471
1472 self->bioReadsApprox = rd->spotCount * rd->nBioReads;
1473 S
1474 }
1475 }
1476
1477 return self->bioReadsApprox;
1478 }
1479
_VdbBlastRunGetLength(VdbBlastRun * self,VdbBlastStatus * status)1480 static uint64_t _VdbBlastRunGetLength
1481 (VdbBlastRun *self, VdbBlastStatus *status)
1482 {
1483 VdbBlastStatus dummy = eVdbBlastNoErr;
1484 if (status == NULL)
1485 { status = &dummy; }
1486
1487 *status = eVdbBlastNoErr;
1488
1489 if (self->bioBasesTooExpensive) {
1490 *status = eVdbBlastTooExpensive;
1491 return 0;
1492 }
1493 else if (self->bioBases == ~0) {
1494 if (self->cSra) {
1495 *status = _VdbBlastRunFillRunDesc(self);
1496 if (*status != eVdbBlastNoErr) {
1497 S
1498 return 0;
1499 }
1500 self->bioBases = self->rd.cmpBaseCount;
1501 }
1502 else {
1503 /* if BIO_BASE_COUNT is not found then status is set to eVdbBlastTooExpensive */
1504 *status = _VTableReadFirstRowImpl(self->obj->seqTbl,
1505 "BIO_BASE_COUNT", &self->bioBases,
1506 sizeof self->bioBases, NULL, true, NULL, self->acc);
1507 if (*status == eVdbBlastTooExpensive) {
1508 self->bioBasesTooExpensive = true;
1509 }
1510 }
1511 }
1512
1513 if (*status == eVdbBlastNoErr) {
1514 S
1515 return self->bioBases;
1516 }
1517 else {
1518 S
1519 return 0;
1520 }
1521 }
1522
_VdbBlastRunGetLengthApprox(VdbBlastRun * self,VdbBlastStatus * status)1523 static uint64_t _VdbBlastRunGetLengthApprox(VdbBlastRun *self,
1524 VdbBlastStatus *status)
1525 {
1526 if (self->cSra) {
1527 return _VdbBlastRunGetLength(self, status);
1528 }
1529 else {
1530 return _VdbBlastSraRunGetLengthApprox(self, status);
1531 }
1532 }
1533
_VdbBlastRunScan(const VdbBlastRun * self,uint64_t (* cmp)(uint64_t cand,uint64_t champ,int64_t minRead,bool * done),uint64_t minRead,uint64_t start,VdbBlastStatus * status)1534 static uint64_t _VdbBlastRunScan(const VdbBlastRun *self,
1535 uint64_t (*cmp)
1536 (uint64_t cand, uint64_t champ, int64_t minRead, bool *done),
1537 uint64_t minRead, uint64_t start, VdbBlastStatus *status)
1538 {
1539 uint64_t res = start;
1540 uint64_t bad = start;
1541 uint64_t spot = 0;
1542 bool done = false;
1543 rc_t rc = 0;
1544 const VCursor *curs = NULL;
1545 uint32_t idxREAD_LEN = 0;
1546 uint32_t idxREAD_TYPE = 0;
1547 assert(self && status &&
1548 (self->rd.spotCount || self->rd.readType ||
1549 self->rd.nReads || self->rd.nBioReads));
1550 rc = _VTableMakeCursor
1551 (self->obj->seqTbl, &curs, &idxREAD_LEN, "READ_LEN", self->acc);
1552 if (rc != 0) {
1553 return bad;
1554 }
1555 if (_VdbBlastRunVarReadNum(self)) {
1556 rc = VCursorAddColumn(curs, &idxREAD_TYPE, "READ_TYPE");
1557 if (rc != 0) {
1558 return bad;
1559 }
1560 }
1561 for (spot = 1;
1562 spot <= self->rd.spotCount && *status == eVdbBlastNoErr && !done;
1563 ++spot)
1564 {
1565 uint32_t elem_bits, elem_off, elem_cnt;
1566 const void *base = NULL;
1567 if (_VdbBlastRunVarReadNum(self)) {
1568 rc = VCursorCellDataDirect(curs, spot, idxREAD_TYPE,
1569 &elem_bits, &base, &elem_off, &elem_cnt);
1570 if (rc != 0) {
1571 *status = eVdbBlastErr;
1572 PLOGMSG(klogInfo, (klogInfo,
1573 "$(f): Cannot '$(name)' CellDataDirect",
1574 "f=%s,name=%s", __func__, "READ_TYPE"));
1575 res = bad;
1576 break;
1577 }
1578 else {
1579 INSDC_read_type *aRt = (INSDC_read_type*)base;
1580 uint32_t c = 0;
1581 for (c = 0; c < elem_cnt; ++c) {
1582 INSDC_read_type *rt = aRt + c;
1583 assert(rt);
1584
1585 if (*rt & SRA_READ_TYPE_BIOLOGICAL) {
1586 uint32_t elem_bits, elem_off, elem_cnt;
1587 const void *base = NULL;
1588 rc = VCursorCellDataDirect(curs, spot, idxREAD_LEN,
1589 &elem_bits, &base, &elem_off, &elem_cnt);
1590 if (rc != 0) {
1591 *status = eVdbBlastErr;
1592 PLOGMSG(klogInfo, (klogInfo,
1593 "$(f): Cannot '$(name)' CellDataDirect",
1594 "f=%s,name=%s", __func__, "READ_LEN"));
1595 res = bad;
1596 break;
1597 }
1598 else if (elem_off != 0 || elem_bits != 32) {
1599 *status = eVdbBlastErr;
1600 PLOGERR(klogInt, (klogInt, rc,
1601 "Bad VCursorCellDataDirect(READ_LEN) result: "
1602 "boff=$(elem_off), elem_bits=$(elem_bits)",
1603 "elem_off=%u,elem_bits=%u",
1604 elem_off, elem_bits));
1605 res = bad;
1606 break;
1607 }
1608 else {
1609 uint8_t read = c;
1610 const uint32_t *readLen = base;
1611 res = cmp(readLen[read], res, minRead, &done);
1612 }
1613 }
1614 }
1615 }
1616 }
1617 else {
1618 rc = VCursorCellDataDirect(curs, spot, idxREAD_LEN,
1619 &elem_bits, &base, &elem_off, &elem_cnt);
1620 if (rc != 0) {
1621 *status = eVdbBlastErr;
1622 PLOGMSG(klogInfo, (klogInfo,
1623 "$(f): Cannot '$(name)' CellDataDirect",
1624 "f=%s,name=%s", __func__, "READ_LEN"));
1625 res = bad;
1626 break;
1627 }
1628 else if (elem_off != 0 || elem_bits != 32) {
1629 *status = eVdbBlastErr;
1630 PLOGERR(klogInt, (klogInt, rc,
1631 "Bad VCursorCellDataDirect(READ_LEN) result: "
1632 "boff=$(elem_off), elem_bits=$(elem_bits)",
1633 "elem_off=%u,elem_bits=%u", elem_off, elem_bits));
1634 res = bad;
1635 break;
1636 }
1637 else {
1638 uint8_t read = 0;
1639 const uint32_t *readLen = base;
1640 assert(self->rd.readType && self->rd.readLen);
1641 assert(self->rd.nReads == elem_cnt);
1642 for (read = 0; read < self->rd.nReads; ++read) {
1643 if (self->rd.readType[read] & SRA_READ_TYPE_BIOLOGICAL) {
1644 res = cmp(readLen[read], res, minRead, &done);
1645 }
1646 }
1647 }
1648 }
1649 }
1650 RELEASE(VCursor, curs);
1651 return res;
1652 }
1653
1654 static
_VdbBlastRunGetWgsAccession(VdbBlastRun * self,int64_t spot,char * name_buffer,size_t bsize,size_t * num_required)1655 VdbBlastStatus _VdbBlastRunGetWgsAccession(VdbBlastRun *self, int64_t spot,
1656 char *name_buffer, size_t bsize, size_t *num_required)
1657 {
1658 rc_t rc = 0;
1659 uint32_t row_len = 0;
1660
1661 assert(num_required);
1662
1663 if (self == NULL || spot <= 0 || name_buffer == NULL || bsize == 0) {
1664 STSMSG(0, ("Error: some of %s parameters is NULL or 0", __func__));
1665 return eVdbBlastErr;
1666 }
1667 assert(self->obj);
1668 if (self->obj->seqTbl == NULL) {
1669 STSMSG(0, ("Error: %s: VTable is NULL in VdbBlastRun", __func__));
1670 return eVdbBlastErr;
1671 }
1672
1673 if (self->obj->cursACCESSION == NULL) {
1674 rc = _VTableMakeCursor(self->obj->seqTbl, &self->obj->cursACCESSION,
1675 &self->obj->col_ACCESSION, "ACCESSION", self->acc);
1676 if (rc != 0) {
1677 VCursorRelease(self->obj->cursACCESSION);
1678 self->obj->cursACCESSION = NULL;
1679 return eVdbBlastErr;
1680 }
1681 }
1682
1683 assert(self->obj->cursACCESSION && rc == 0);
1684
1685 rc = VCursorReadDirect(self->obj->cursACCESSION, spot,
1686 self->obj->col_ACCESSION, 8, name_buffer, bsize, &row_len);
1687 *num_required = row_len;
1688 if (row_len > 0) /* include ending '\0' */
1689 { ++(*num_required); }
1690 if (rc == 0) {
1691 if (bsize > row_len)
1692 { name_buffer[row_len] = '\0'; }
1693 return eVdbBlastNoErr;
1694 }
1695 else if (rc == SILENT_RC
1696 (rcVDB, rcCursor, rcReading, rcBuffer, rcInsufficient))
1697 { return eVdbBlastNoErr; }
1698 else {
1699 assert(self->path);
1700 PLOGERR(klogInt, (klogInt, rc, "Error in VCursorReadDirect"
1701 "$(path), ACCESSION, spot=$(spot))",
1702 "path=%s,spot=%ld", self->path, spot));
1703 return eVdbBlastErr;
1704 }
1705 }
1706
_VdbBlastRunGetNumAlignments(VdbBlastRun * self,VdbBlastStatus * status)1707 uint64_t _VdbBlastRunGetNumAlignments(VdbBlastRun *self,
1708 VdbBlastStatus *status)
1709 {
1710 assert(status);
1711
1712 *status = eVdbBlastNoErr;
1713
1714 if (self->alignments == ~0) {
1715 assert(self->obj);
1716
1717 if (self->obj->prAlgnTbl == NULL) {
1718 self->alignments = 0;
1719 }
1720 else {
1721 const char col[] = "SPOT_COUNT";
1722 *status = _VTableReadFirstRow(self->obj->prAlgnTbl, col,
1723 &self->alignments, sizeof self->alignments, NULL, self->acc);
1724 if (*status != eVdbBlastNoErr) {
1725 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1726 return 0;
1727 }
1728 }
1729 }
1730
1731 assert(self->alignments != ~0);
1732
1733 return self->alignments;
1734 }
1735
1736 #ifdef TEST_VdbBlastRunFillReadDesc
1737 LIB_EXPORT
1738 #endif
_VdbBlastRunFillReadDesc(VdbBlastRun * self,uint64_t read_id,ReadDesc * desc)1739 VdbBlastStatus _VdbBlastRunFillReadDesc(VdbBlastRun *self,
1740 uint64_t read_id, ReadDesc *desc)
1741 {
1742 VdbBlastStatus status = eVdbBlastNoErr;
1743
1744 const VdbBlastRun *prev = NULL;
1745 const RunDesc *rd = NULL;
1746
1747 int bioIdx = 0;
1748
1749 if (self == NULL || desc == NULL) {
1750 S
1751 return eVdbBlastErr;
1752 }
1753
1754 prev = desc->run;
1755 memset(desc, 0, sizeof *desc);
1756 desc->prev = prev;
1757 desc->run = self;
1758
1759 rd = &self->rd;
1760
1761 if (rd->nReads == 0 || rd->readType == NULL) {
1762 status = _VdbBlastRunFillRunDesc(self);
1763 if (status != eVdbBlastNoErr)
1764 { return status; }
1765 assert(rd->nReads && rd->readType);
1766 }
1767
1768 if (rd->readIdDesc.idType == eFixedReadN || read_id == 0) {
1769 if (!_VdbBlastRunVarReadNum(self)) {
1770 desc->nReads = rd->nReads;
1771 desc->spot = read_id / rd->nBioReads + 1;
1772 if (desc->spot <= rd->spotCount) {
1773 int idInSpot
1774 = read_id - (desc->spot - 1) * rd->nBioReads; /* 0-based */
1775
1776 int i = 0;
1777 for (i = 0; i < rd->nReads; ++i) {
1778 if (rd->readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
1779 if (bioIdx++ == idInSpot) {
1780 S
1781 desc->tableId = VDB_READ_UNALIGNED;
1782 desc->read = i + 1;
1783 desc->read_id = _RunDescMakeReadId(rd,
1784 desc->spot, desc->read, read_id, &status);
1785 return status;
1786 }
1787 }
1788 }
1789 S
1790 }
1791 else {
1792 uint64_t alignments = 0;
1793 S
1794 desc->spot -= rd->spotCount;
1795 alignments = _VdbBlastRunGetNumAlignments(self, &status);
1796 if (status != eVdbBlastNoErr) {
1797 return status;
1798 }
1799 if (desc->spot <= alignments) {
1800 desc->tableId = VDB_READ_ALIGNED;
1801 desc->read = 1;
1802 return eVdbBlastNoErr;
1803 }
1804 S
1805 }
1806 }
1807 else {
1808 bool found = false;
1809 if (_VdbBlastDbOpenCursAndGetFirstRead
1810 (self->obj, desc, &found, self->acc) == 0)
1811 {
1812 return eVdbBlastNoErr;
1813 }
1814 }
1815 }
1816 else if (rd->readIdDesc.idType == eFactor10) {
1817 if (rd->readIdDesc.runBits > 0) {
1818 read_id /= rd->readIdDesc.runBits;
1819 }
1820 if (rd->spotBits > 0) {
1821 rc_t rc = 0;
1822 desc->read = read_id / rd->spotBits;
1823 desc->spot = read_id % rd->spotBits;
1824 if (desc->read > 0 && desc->spot > 0) {
1825 rc = _VdbBlastDbGetNReads
1826 (self->obj, desc->spot, &desc->nReads, self->acc);
1827 if (rc == 0) {
1828 desc->tableId = VDB_READ_UNALIGNED;
1829 S
1830 return eVdbBlastNoErr;
1831 }
1832 else {
1833 S
1834 }
1835 }
1836 else {
1837 S
1838 }
1839 }
1840 }
1841 else {
1842 assert(0);
1843 }
1844
1845 memset(desc, 0, sizeof *desc);
1846 return eVdbBlastErr;
1847 }
1848
_VdbBlastRunSetFindFirstRead(const VdbBlastRunSet * self,uint64_t * read_id,bool useGetFirstRead)1849 VdbBlastStatus _VdbBlastRunSetFindFirstRead(const VdbBlastRunSet *self,
1850 uint64_t *read_id, bool useGetFirstRead)
1851 {
1852 VdbBlastStatus status = eVdbBlastErr;
1853 const RunSet *rs = NULL;
1854 ReadDesc desc;
1855 assert(self);
1856 memset(&desc, 0, sizeof desc);
1857 rs = &self->runs;
1858 _VdbBlastRunSetBeingRead(self);
1859 if (useGetFirstRead) {
1860 bool found = false;
1861 uint32_t i = 0;
1862 for (i = 0; i < rs->krun; ++i) {
1863 VdbBlastRun *run = &rs->run[i];
1864 rc_t rc = _VdbBlastDbOpenCursAndGetFirstRead
1865 (rs->run[i].obj, &desc, &found, rs->run[i].acc);
1866 if (rc != 0) {
1867 break;
1868 }
1869 else if (!found) {
1870 continue;
1871 }
1872
1873 if (desc.spot == 0) {
1874 if (rs->krun > 1) {
1875 desc.prev = run;
1876 }
1877 }
1878 else {
1879 desc.run = run;
1880 status = _ReadDescFixReadId(&desc);
1881 break;
1882 }
1883 }
1884 }
1885 else {
1886 status = _RunSetFindReadDesc(rs, 0, &desc);
1887 if (status == eVdbBlastNoErr) {
1888 status = _ReadDescFixReadId(&desc);
1889 }
1890 }
1891 if (status == eVdbBlastNoErr) {
1892 assert(read_id);
1893 *read_id = desc.read_id;
1894 }
1895 return status;
1896 }
1897
_VdbBlastRunSetGetReadIdType(const VdbBlastRunSet * self)1898 EReadIdType _VdbBlastRunSetGetReadIdType(const VdbBlastRunSet *self) {
1899 assert(self);
1900
1901 return self->readIdDesc.idType;
1902 }
1903
_VdbBlastRunGetReadId(VdbBlastRun * self,const char * acc,uint64_t spot,uint32_t read,uint64_t * read_id)1904 static VdbBlastStatus _VdbBlastRunGetReadId(VdbBlastRun *self, const char *acc,
1905 uint64_t spot, /* 1-based */
1906 uint32_t read, /* 1-based */
1907 uint64_t *read_id)
1908 {
1909 uint64_t id = ~0;
1910 VdbBlastStatus status = eVdbBlastErr;
1911 size_t size;
1912
1913 assert(self && acc && read_id && self->acc);
1914 assert(memcmp(self->acc, acc, string_measure(self->acc, &size)) == 0);
1915
1916 if ((spot <= 0 && read > 0) || (spot > 0 && read <= 0)) {
1917 S
1918 return eVdbBlastErr;
1919 }
1920
1921 if (spot > 0) {
1922 if (self->type != btpSRA) {
1923 return eVdbBlastErr;
1924 }
1925
1926 if (self->rd.readIdDesc.idType == eFixedReadN) {
1927 for (id = (spot - 1) * self->rd.nBioReads; ; ++id) {
1928 ReadDesc desc;
1929 status = _VdbBlastRunFillReadDesc(self, id, &desc);
1930 if (status != eVdbBlastNoErr)
1931 { return status; }
1932 if (desc.spot < spot) {
1933 S
1934 return eVdbBlastErr;
1935 }
1936 if (desc.spot > spot) {
1937 S
1938 return eVdbBlastErr;
1939 }
1940 if (desc.read == read) {
1941 *read_id = id;
1942 return eVdbBlastNoErr;
1943 }
1944 }
1945 S
1946 return eVdbBlastErr;
1947 }
1948 else if (self->rd.readIdDesc.idType == eFactor10) {
1949 if (spot <= self->rd.spotCount) {
1950 if (self->rd.readIdDesc.varReadN) {
1951 /* TODO How to make sure this read exists in this spot?*/
1952 }
1953 else {
1954 if (read > self->rd.nReads) {
1955 S
1956 return eVdbBlastErr;
1957 }
1958 else {
1959 /* TODO how to check it when READ_TYPE is variable
1960 withing a run (is it possible?) */
1961 if (read == 0) {
1962 S
1963 return eVdbBlastErr;
1964 }
1965 if (! (self->rd.readType[read - 1]
1966 & SRA_READ_TYPE_BIOLOGICAL))
1967 {
1968 S
1969 return eVdbBlastErr;
1970 }
1971 }
1972 }
1973 id = _RunDescMakeReadId(&self->rd, spot, read, ~0, &status);
1974 *read_id = id;
1975 return status;
1976 }
1977 else {
1978 S
1979 return eVdbBlastErr;
1980 }
1981 }
1982 else {
1983 assert(0);
1984 return eVdbBlastErr;
1985 }
1986 }
1987 else {
1988 uint64_t n = ~0;
1989 uint64_t i = ~0;
1990 if (self->type == btpSRA)
1991 { return eVdbBlastErr; }
1992 else if (self->type == btpREFSEQ) {
1993 *read_id = 0;
1994 return eVdbBlastNoErr;
1995 }
1996 else if (self->type == btpWGS) {
1997 n = _VdbBlastRunGetNumSequences(self, &status);
1998 if (status != eVdbBlastNoErr
1999 && status != eVdbBlastTooExpensive)
2000 {
2001 return status;
2002 }
2003 /* TODO optimize: avoid full run scan */
2004 for (i = 0; i < n ; ++i) {
2005 size_t need = ~0;
2006 #define SZ 4096
2007 char name_buffer[SZ + 1];
2008 if (string_measure(acc, &size) > SZ) {
2009 S
2010 return eVdbBlastErr;
2011 }
2012 #undef SZ
2013 status = _VdbBlastRunGetWgsAccession(
2014 self, i + 1, name_buffer, sizeof name_buffer, &need);
2015 if (need > sizeof name_buffer) {
2016 S
2017 return eVdbBlastErr;
2018 }
2019 if (strcmp(name_buffer, acc) == 0) {
2020 i = _RunDescMakeReadId(&self->rd, i + 1, 1, i, &status);
2021 *read_id = i;
2022 return status;
2023 }
2024 }
2025 }
2026 else { assert(0); }
2027 return eVdbBlastErr;
2028 }
2029 }
2030
_VdbBlastRunGetSequencesAmount(VdbBlastRun * self,VdbBlastStatus * status)2031 static uint64_t _VdbBlastRunGetSequencesAmount(
2032 VdbBlastRun *self, VdbBlastStatus *status)
2033 {
2034 uint64_t n = _VdbBlastRunGetNumSequences(self, status);
2035
2036 assert(status);
2037
2038 if (*status == eVdbBlastNoErr) {
2039 n += _VdbBlastRunGetNumAlignments(self, status);
2040 }
2041
2042 return n;
2043 }
2044
2045 /******************************************************************************/
2046
_ReadDescFixReadId(ReadDesc * self)2047 VdbBlastStatus _ReadDescFixReadId(ReadDesc *self) {
2048 VdbBlastStatus status = eVdbBlastNoErr;
2049
2050 assert(self && self->run);
2051
2052 self->read_id = _RunDescMakeReadId
2053 (&self->run->rd, self->spot, self->read, self->read_id, &status);
2054
2055 return status;
2056 }
2057
2058 /******************************************************************************/
2059
_RunSetFini(RunSet * self)2060 static void _RunSetFini(RunSet *self) {
2061 assert(self);
2062
2063 if (self->run) {
2064 uint32_t i = 0;
2065
2066 for (i = 0; i < self->krun; ++i) {
2067 _VdbBlastRunFini(&self->run[i]);
2068 }
2069
2070 free(self->run);
2071 }
2072
2073 _RefSetFini(&self->refs);
2074
2075 memset(self, 0, sizeof *self);
2076 }
2077
_RunSetAllocTbl(RunSet * self)2078 static VdbBlastStatus _RunSetAllocTbl(RunSet *self) {
2079 size_t nmemb = 16;
2080
2081 if (self == NULL)
2082 { return eVdbBlastErr; }
2083
2084 if (self->run && self->krun < self->nrun) {
2085 return eVdbBlastNoErr;
2086 }
2087
2088 if (self->run == NULL) {
2089 self->run = calloc(nmemb, sizeof *self->run);
2090 if (self->run == NULL)
2091 { return eVdbBlastMemErr; }
2092 S
2093 }
2094 else {
2095 void *p = NULL;
2096 nmemb += self->nrun;
2097 p = realloc(self->run, nmemb * sizeof *self->run);
2098 if (p == NULL)
2099 { return eVdbBlastMemErr; }
2100 self->run = p;
2101 S
2102 }
2103
2104 self->nrun = nmemb;
2105 return eVdbBlastNoErr;
2106 }
2107
_RunSetAddObj(RunSet * self,VdbBlastDb * obj,const char * rundesc,BTableType type,const KDirectory * dir,char * fullpath,uint32_t min_read_length)2108 static VdbBlastStatus _RunSetAddObj(RunSet *self, VdbBlastDb *obj,
2109 const char *rundesc, BTableType type, const KDirectory *dir,
2110 char *fullpath, uint32_t min_read_length)
2111 {
2112
2113 VdbBlastRun* run = NULL;
2114
2115 VdbBlastStatus status = _RunSetAllocTbl(self);
2116 if (status != eVdbBlastNoErr) {
2117 return status;
2118 }
2119
2120 assert(self && self->run);
2121
2122 run = &self->run[self->krun];
2123
2124 status = _VdbBlastRunInit(run,
2125 obj, rundesc, type, dir, fullpath, min_read_length, self->krun++);
2126
2127 return status;
2128 }
2129
_RunSetGetNumSequences(const RunSet * self,VdbBlastStatus * aStatus)2130 static uint64_t _RunSetGetNumSequences
2131 (const RunSet *self, VdbBlastStatus *aStatus)
2132 {
2133 uint64_t num = 0;
2134 uint32_t i = 0;
2135
2136 assert(self && aStatus);
2137
2138 *aStatus = eVdbBlastNoErr;
2139
2140 for (i = 0; i < self->krun; ++i) {
2141 VdbBlastStatus status = eVdbBlastNoErr;
2142 VdbBlastRun *run = NULL;
2143
2144 assert(self->run);
2145
2146 run = &self->run[i];
2147
2148 num += _VdbBlastRunGetNumSequences(run, &status);
2149 if (status != eVdbBlastNoErr) {
2150 assert(run->path);
2151
2152 if (*aStatus == eVdbBlastNoErr) {
2153 *aStatus = status;
2154 }
2155
2156 if (status != eVdbBlastTooExpensive) {
2157 STSMSG(1, (
2158 "Error: failed to GetNumSequences(on run %s)", run->path));
2159 return 0;
2160 }
2161
2162 assert(*aStatus == eVdbBlastTooExpensive);
2163 }
2164 }
2165
2166 STSMSG(1, ("_RunSetGetNumSequences = %ld", num));
2167
2168 return num;
2169 }
2170
2171 static
_RunSetGetNumSequencesApprox(const RunSet * self,VdbBlastStatus * status)2172 uint64_t _RunSetGetNumSequencesApprox(const RunSet *self,
2173 VdbBlastStatus *status)
2174 {
2175 uint64_t num = 0;
2176 uint32_t i = 0;
2177
2178 assert(self && status);
2179
2180 *status = eVdbBlastNoErr;
2181
2182 for (i = 0; i < self->krun; ++i) {
2183 VdbBlastRun *run = NULL;
2184
2185 assert(self->run);
2186
2187 run = &self->run[i];
2188
2189 num += _VdbBlastRunGetNumSequencesApprox(run, status);
2190 if (*status != eVdbBlastNoErr) {
2191 assert(run->path);
2192
2193 STSMSG(1, (
2194 "Error: failed to GetNumSequencesApprox(on run %s)",
2195 run->path));
2196
2197 return 0;
2198 }
2199 }
2200
2201 STSMSG(1, ("_RunSetGetNumSequencesApprox = %ld", num));
2202
2203 return num;
2204 }
2205
2206 static
_RunSetGetTotalLength(const RunSet * self,VdbBlastStatus * status)2207 uint64_t _RunSetGetTotalLength(const RunSet *self,
2208 VdbBlastStatus *status)
2209 {
2210 uint64_t num = 0;
2211 uint32_t i = 0;
2212 assert(self && status);
2213
2214 if (self->krun)
2215 { assert(self->run); }
2216
2217 for (i = 0; i < self->krun; ++i) {
2218 VdbBlastRun *run = &self->run[i];
2219 assert(run && run->path);
2220 num += _VdbBlastRunGetLength(run, status);
2221 if (*status != eVdbBlastNoErr) {
2222 STSMSG(1, (
2223 "Error: failed to _RunSetGetTotalLength(on run %s)",
2224 run->path));
2225 return 0;
2226 }
2227 }
2228
2229 STSMSG(1, ("_RunSetGetTotalLength = %ld", num));
2230
2231 return num;
2232 }
2233
_RunSetGetTotalLengthApprox(const RunSet * self,VdbBlastStatus * status)2234 static uint64_t _RunSetGetTotalLengthApprox(const RunSet *self,
2235 VdbBlastStatus *status)
2236 {
2237 uint64_t num = 0;
2238
2239 uint32_t i = 0;
2240
2241 assert(self && status);
2242
2243 for (num = 0, i = 0; i < self->krun; ++i) {
2244 VdbBlastRun *run = NULL;
2245
2246 assert(self->run);
2247
2248 run = &self->run[i];
2249
2250 num += _VdbBlastRunGetLengthApprox(run, status);
2251 if (*status != eVdbBlastNoErr) {
2252 STSMSG(1, ("Error: failed "
2253 "to _VdbBlastRunGetLengthApprox(on run %s)", run->path));
2254 return 0;
2255 }
2256 }
2257
2258 STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox = %ld", num));
2259
2260 return num;
2261 }
2262
_RunSetGetName(const RunSet * self,VdbBlastStatus * status,char * name_buffer,size_t bsize)2263 static size_t _RunSetGetName(const RunSet *self,
2264 VdbBlastStatus *status, char *name_buffer, size_t bsize)
2265 {
2266 size_t need = 0, idx = 0;
2267 int i = 0;
2268 size_t size;
2269
2270 assert(self && status);
2271 for (i = 0; i < self->krun; ++i) {
2272 VdbBlastRun *run = &self->run[i];
2273 if (run && run->acc) {
2274 if (i)
2275 { ++need; }
2276 need += string_measure(run->acc, &size);
2277 }
2278 else {
2279 S
2280 return 0;
2281 }
2282 }
2283
2284 if (name_buffer == NULL || bsize == 0) {
2285 S
2286 return need;
2287 }
2288
2289 for (i = 0; i < self->krun; ++i) {
2290 VdbBlastRun *run = &self->run[i];
2291 if (run && run->acc) {
2292 if (i)
2293 { name_buffer[idx++] = '|'; }
2294 if (idx >= bsize) {
2295 S
2296 return need;
2297 }
2298 string_copy(name_buffer + idx, bsize - idx,
2299 run->acc, string_size(run->acc));
2300 idx += string_measure(run->acc, &size);
2301 if (idx >= bsize) {
2302 S
2303 return need;
2304 }
2305 }
2306 }
2307 name_buffer[idx++] = '\0';
2308 *status = eVdbBlastNoErr;
2309
2310 S
2311 return need;
2312 }
2313
2314 typedef struct {
2315 uint64_t read_id;
2316 } ReadId;
_RunSetFindReadDescContinuous(const RunSet * self,uint64_t read_id,ReadDesc * desc)2317 static VdbBlastStatus _RunSetFindReadDescContinuous(
2318 const RunSet *self, uint64_t read_id, ReadDesc *desc)
2319 {
2320 uint64_t i = 0;
2321 uint64_t prev = 0;
2322 uint64_t crnt = 0;
2323
2324 assert(self && desc);
2325
2326 for (i = 0, prev = 0; i < self->krun; ++i) {
2327 VdbBlastStatus status = eVdbBlastNoErr;
2328 VdbBlastRun *run = NULL;
2329 uint64_t l = 0;
2330
2331 if (prev > 0 && i < prev) {
2332 S
2333 return eVdbBlastErr;
2334 }
2335
2336 run = &self->run[i];
2337 if (run == NULL) {
2338 S
2339 return eVdbBlastErr;
2340 }
2341
2342 l = _VdbBlastRunGetSequencesAmount(run, &status);
2343 if (status != eVdbBlastNoErr &&
2344 status != eVdbBlastTooExpensive)
2345 {
2346 S
2347 return status;
2348 }
2349
2350 if (crnt + l <= read_id) {
2351 crnt += l;
2352 }
2353 else {
2354 uint64_t a_read_id = read_id - crnt;
2355 status = _VdbBlastRunFillReadDesc(run, a_read_id, desc);
2356 if (status == eVdbBlastNoErr) {
2357 S
2358 desc->read_id = read_id;
2359 }
2360 else
2361 { S }
2362
2363 return status;
2364 }
2365
2366 prev = i;
2367 }
2368
2369 S
2370 return eVdbBlastErr;
2371 }
2372
_RunSetFindReadDescFactor10(const RunSet * self,uint64_t read_id,ReadDesc * desc)2373 static VdbBlastStatus _RunSetFindReadDescFactor10(
2374 const RunSet *self, uint64_t read_id, ReadDesc *desc)
2375 {
2376 VdbBlastStatus status = eVdbBlastErr;
2377
2378 VdbBlastRun *run = NULL;
2379 uint64_t i = 0;
2380
2381 assert(self && desc && self->run);
2382
2383 if (self->run[0].rd.readIdDesc.runBits > 0) {
2384 i = read_id % self->run[0].rd.readIdDesc.runBits;
2385 }
2386 else {
2387 i = 0;
2388 }
2389
2390 if (i >= self->krun) {
2391 S
2392 return eVdbBlastErr;
2393 }
2394
2395 run = &self->run[i];
2396
2397 status = _VdbBlastRunFillReadDesc(run, read_id, desc);
2398 if (status == eVdbBlastNoErr) {
2399 S
2400 desc->read_id = read_id;
2401 }
2402 else
2403 { S }
2404
2405 return status;
2406 }
2407
_RunSetFindReadDesc(const RunSet * self,uint64_t read_id,ReadDesc * desc)2408 VdbBlastStatus _RunSetFindReadDesc(const RunSet *self,
2409 uint64_t read_id, ReadDesc *desc)
2410 {
2411 if (self == NULL || desc == NULL || self->krun == 0) {
2412 S
2413 return eVdbBlastErr;
2414 }
2415
2416 switch (self->run[0].rd.readIdDesc.idType) {
2417 case eFixedReadN:
2418 return _RunSetFindReadDescContinuous(self, read_id, desc);
2419 case eFactor10:
2420 return _RunSetFindReadDescFactor10(self, read_id, desc);
2421 default:
2422 assert(0);
2423 return eVdbBlastErr;
2424 }
2425 }
2426
2427 /******************************************************************************/
2428
2429 static const char VDB_BLAST_RUN_SET[] = "VdbBlastRunSet";
2430
_VdbBlastRunSetWhack(VdbBlastRunSet * self)2431 static void _VdbBlastRunSetWhack(VdbBlastRunSet *self) {
2432 assert(self);
2433
2434 STSMSG(1, ("Deleting VdbBlastRunSet(min_read_length=%d, protein=%s)",
2435 self->core2na.min_read_length, self->protein ? "true" : "false"));
2436
2437 VdbBlastMgrRelease(self->mgr);
2438
2439 _RunSetFini(&self->runs);
2440 _Core2naFini(&self->core2na);
2441 _Core4naFini(&self->core4na);
2442 _Core2naFini(&self->core2naRef);
2443 _Core4naFini(&self->core4naRef);
2444
2445 memset(self, 0, sizeof *self);
2446 free(self);
2447 }
2448
2449 LIB_EXPORT
VdbBlastMgrMakeRunSet(const VdbBlastMgr * cself,VdbBlastStatus * status,uint32_t min_read_length,bool protein)2450 VdbBlastRunSet *VdbBlastMgrMakeRunSet(const VdbBlastMgr *cself,
2451 VdbBlastStatus *status,
2452 uint32_t min_read_length,
2453 bool protein)
2454 {
2455 rc_t rc = 0;
2456 VdbBlastRunSet *item = NULL;
2457 VdbBlastMgr *self = (VdbBlastMgr*)cself;
2458
2459 VdbBlastStatus dummy = eVdbBlastNoErr;
2460 if (status == NULL)
2461 { status = &dummy; }
2462
2463 *status = eVdbBlastNoErr;
2464
2465 item = calloc(1, sizeof *item);
2466 if (item == NULL) {
2467 *status = eVdbBlastMemErr;
2468 return item;
2469 }
2470
2471 item->protein = protein;
2472
2473 item->core2na.min_read_length = min_read_length;
2474
2475 item->core2naRef.min_read_length = min_read_length;
2476
2477 item->core4na.min_read_length = min_read_length;
2478 item->core4na.mode = VDB_READ_UNALIGNED;
2479
2480 item->core4naRef.min_read_length = min_read_length;
2481 item->core4naRef.mode = VDB_READ_REFERENCE;
2482
2483 item->minSeqLen = ~0;
2484 item->avgSeqLen = ~0;
2485 item->maxSeqLen = ~0;
2486
2487 if (rc == 0) {
2488 rc = KLockMake(&item->core2na.mutex);
2489 if (rc != 0) {
2490 LOGERR(klogInt, rc, "Error in KLockMake");
2491 }
2492 }
2493 if (rc == 0) {
2494 rc = KLockMake(&item->core2naRef.mutex);
2495 if (rc != 0) {
2496 LOGERR(klogInt, rc, "Error in KLockMake");
2497 }
2498 }
2499 if (rc == 0) {
2500 rc = KLockMake(&item->core4na.mutex);
2501 if (rc != 0) {
2502 LOGERR(klogInt, rc, "Error in KLockMake");
2503 }
2504 }
2505 if (rc == 0) {
2506 rc = KLockMake(&item->core4naRef.mutex);
2507 if (rc != 0) {
2508 LOGERR(klogInt, rc, "Error in KLockMake");
2509 }
2510 }
2511 if (rc == 0) {
2512 item->mgr = VdbBlastMgrAddRef(self);
2513 if (item->mgr) {
2514 KRefcountInit(&item->refcount,
2515 1, VDB_BLAST_RUN_SET, __func__, "set");
2516 STSMSG(1, ("Created VdbBlastRunSet(min_read_length=%d, protein=%s)",
2517 min_read_length, protein ? "true" : "false"));
2518 return item;
2519 }
2520 }
2521
2522 STSMSG(1, ("Error: failed to create VdbBlastRunSet"));
2523 _VdbBlastRunSetWhack(item);
2524
2525 *status = eVdbBlastErr;
2526
2527 return NULL;
2528 }
2529
VdbBlastRunSetRelease(VdbBlastRunSet * self)2530 LIB_EXPORT void CC VdbBlastRunSetRelease(VdbBlastRunSet *self) {
2531 if (self == NULL) {
2532 return;
2533 }
2534
2535 STSMSG(1, ("VdbBlastRunSetRelease"));
2536 if (KRefcountDrop(&self->refcount, VDB_BLAST_RUN_SET) != krefWhack)
2537 { return; }
2538
2539 _VdbBlastRunSetWhack(self);
2540 }
2541
2542 LIB_EXPORT
VdbBlastRunSetAddRef(VdbBlastRunSet * self)2543 VdbBlastRunSet* CC VdbBlastRunSetAddRef(VdbBlastRunSet *self)
2544 {
2545 if (self == NULL) {
2546 STSMSG(1, ("VdbBlastRunSetAddRef(NULL)"));
2547 return self;
2548 }
2549
2550 if (KRefcountAdd(&self->refcount, VDB_BLAST_RUN_SET) == krefOkay) {
2551 STSMSG(1, ("VdbBlastRunSetAddRef"));
2552 return self;
2553 }
2554
2555 STSMSG(1, ("Error: failed to VdbBlastRunSetAddRef"));
2556 return NULL;
2557 }
2558
2559 LIB_EXPORT
VdbBlastRunSetAddRun(VdbBlastRunSet * self,const char * native)2560 VdbBlastStatus CC VdbBlastRunSetAddRun(VdbBlastRunSet *self,
2561 const char *native)
2562 {
2563 rc_t rc = 0;
2564 char rundesc[PATH_MAX] = "";
2565 VdbBlastStatus status = eVdbBlastNoErr;
2566 BTableType type = btpUndefined;
2567
2568 /* allocated in _VdbBlastMgrFindNOpenSeqTable()
2569 in _RunSetAddObj() is assigned to VdbBlastRun::path
2570 freed during VdbBlastRun release */
2571 char *fullpath = NULL;
2572
2573 VdbBlastDb *obj = NULL;
2574 if (self == NULL || self->mgr == NULL || self->beingRead) {
2575 S
2576 return eVdbBlastErr;
2577 }
2578
2579 obj = calloc(1, sizeof *obj);
2580 if (obj == NULL) {
2581 return eVdbBlastMemErr;
2582 }
2583
2584 rc = _VdbBlastMgrNativeToPosix(self->mgr, native, rundesc, sizeof rundesc);
2585 if (rc != 0) {
2586 S
2587 status = eVdbBlastErr;
2588 }
2589
2590 status = _VdbBlastMgrFindNOpenSeqTable(self->mgr,
2591 rundesc, &obj->seqTbl, &type, &fullpath, &obj->db);
2592 if (status != eVdbBlastNoErr) {
2593 S
2594 PLOGMSG(klogInfo,
2595 (klogInfo, "failed to open $(rundesc)", "rundesc=%s", rundesc));
2596 }
2597 else {
2598 S
2599 PLOGMSG(klogInfo,
2600 (klogInfo, "opened $(rundesc)", "rundesc=%s", rundesc));
2601 }
2602
2603 if (status == eVdbBlastNoErr && _VTableCSra(obj->seqTbl)) {
2604 if (obj->db == NULL) {
2605 S
2606 status = eVdbBlastErr;
2607 }
2608 else {
2609 status = _VDatabaseOpenAlignmentTable(
2610 obj->db, rundesc, &obj->prAlgnTbl);
2611 }
2612 }
2613
2614 if (status == eVdbBlastNoErr) {
2615 if (_VTableVarReadNum(obj->seqTbl, rundesc)) {
2616 self->readIdDesc.varReadN = true;
2617 self->readIdDesc.idType = eFactor10;
2618 }
2619
2620 status = _RunSetAddObj(&self->runs, obj, rundesc, type,
2621 NULL, fullpath, self->core2na.min_read_length);
2622 S
2623 }
2624
2625 return status;
2626 }
2627
_VdbBlastRunSetBeingRead(const VdbBlastRunSet * cself)2628 void _VdbBlastRunSetBeingRead(const VdbBlastRunSet *cself) {
2629 uint32_t i = 0;
2630 uint32_t runBits = 0;
2631 VdbBlastRunSet *self = (VdbBlastRunSet*)cself;
2632 EReadIdType idType = eFixedReadN;
2633
2634 if (self == NULL || self->beingRead) {
2635 return;
2636 }
2637
2638 idType = self->readIdDesc.idType;
2639 runBits = Bits(self->runs.krun - 1, idType);
2640
2641 for (i = 0; i < self->runs.krun; ++i) {
2642 VdbBlastRun *r = &self->runs.run[i];
2643 assert(r);
2644 r->rd.readIdDesc.idType = idType;
2645 r->rd.readIdDesc.runBits = runBits;
2646 r->rd.readIdDesc.varReadN = _VdbBlastRunVarReadNum(r);
2647 }
2648
2649 self->beingRead = true;
2650 }
2651
2652 LIB_EXPORT
VdbBlastRunSetIsProtein(const VdbBlastRunSet * self)2653 bool CC VdbBlastRunSetIsProtein(const VdbBlastRunSet *self)
2654 {
2655 if (self == NULL) {
2656 STSMSG(1, ("VdbBlastRunSetIsProtein(self=NULL)"));
2657 return false;
2658 }
2659
2660 STSMSG(1, (
2661 "VdbBlastRunSetIsProtein = %s", self->protein ? "true" : "false"));
2662
2663 return self->protein;
2664 }
2665
2666 LIB_EXPORT
VdbBlastRunSetGetNumSequences(const VdbBlastRunSet * self,VdbBlastStatus * status)2667 uint64_t CC VdbBlastRunSetGetNumSequences(const VdbBlastRunSet *self,
2668 VdbBlastStatus *status)
2669 {
2670 VdbBlastStatus dummy = eVdbBlastNoErr;
2671 if (status == NULL)
2672 { status = &dummy; }
2673
2674 if (self == NULL) {
2675 *status = eVdbBlastErr;
2676 return 0;
2677 }
2678
2679 _VdbBlastRunSetBeingRead(self);
2680
2681 return _RunSetGetNumSequences(&self->runs, status);
2682 }
2683
VdbBlastRunSetGetNumSequencesApprox(const VdbBlastRunSet * self)2684 LIB_EXPORT uint64_t CC VdbBlastRunSetGetNumSequencesApprox(
2685 const VdbBlastRunSet *self)
2686 {
2687 uint64_t num = 0;
2688 VdbBlastStatus status = eVdbBlastNoErr;
2689
2690 _VdbBlastRunSetBeingRead(self);
2691
2692 num = _RunSetGetNumSequencesApprox(&self->runs, &status);
2693
2694 STSMSG(1, ("VdbBlastRunSetGetNumSequencesApprox=%lu", num));
2695
2696 return num;
2697 }
2698
2699 LIB_EXPORT
VdbBlastRunSetGetTotalLength(const VdbBlastRunSet * self,VdbBlastStatus * status)2700 uint64_t CC VdbBlastRunSetGetTotalLength(const VdbBlastRunSet *self,
2701 VdbBlastStatus *status)
2702 {
2703 VdbBlastStatus dummy = eVdbBlastNoErr;
2704 if (status == NULL)
2705 { status = &dummy; }
2706
2707 if (self == NULL) {
2708 *status = eVdbBlastErr;
2709 return 0;
2710 }
2711
2712 _VdbBlastRunSetBeingRead(self);
2713
2714 return _RunSetGetTotalLength(&self->runs, status);
2715 }
2716
2717 LIB_EXPORT
VdbBlastRunSetGetTotalLengthApprox(const VdbBlastRunSet * self)2718 uint64_t CC VdbBlastRunSetGetTotalLengthApprox(
2719 const VdbBlastRunSet *self)
2720 {
2721 VdbBlastStatus status = eVdbBlastNoErr;
2722
2723 if (self == NULL) {
2724 STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox(self=NULL)"));
2725 return 0;
2726 }
2727
2728 _VdbBlastRunSetBeingRead(self);
2729
2730 return _RunSetGetTotalLengthApprox(&self->runs, &status);
2731 }
2732
2733 LIB_EXPORT
VdbBlastRunSetGetMinSeqLen(const VdbBlastRunSet * self)2734 uint64_t CC VdbBlastRunSetGetMinSeqLen(const VdbBlastRunSet *self)
2735 {
2736 if (self->minSeqLen == ~0) {
2737 bool empty = true;
2738 VdbBlastStatus status = eVdbBlastNoErr;
2739 uint64_t res = ~0;
2740 uint32_t i = 0;
2741 _VdbBlastRunSetBeingRead(self);
2742 for (i = 0; i < self->runs.krun; ++i) {
2743 VdbBlastRun *run = &self->runs.run[i];
2744 assert(run);
2745 if (run->type == btpREFSEQ) {
2746 uint64_t cand = _VdbBlastRunGetLengthApprox(run, &status);
2747 if (status != eVdbBlastNoErr) {
2748 S
2749 return ~0;
2750 }
2751 if (cand < res && cand >= self->core2na.min_read_length) {
2752 res = cand;
2753 }
2754 }
2755 else {
2756 status = _VdbBlastRunFillRunDesc(run);
2757 if (status != eVdbBlastNoErr) {
2758 S
2759 return ~0;
2760 }
2761 if (!run->rd.varReadLen) {
2762 uint8_t read = 0;
2763 assert(run->rd.readType && run->rd.readLen);
2764 for (read = 0; read < run->rd.nReads; ++read) {
2765 if (run->rd.readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
2766 if (run->rd.readLen[i]
2767 == self->core2na.min_read_length)
2768 {
2769 ((VdbBlastRunSet*)self)->minSeqLen
2770 = self->core2na.min_read_length;
2771 return self->minSeqLen;
2772 }
2773 else if (run->rd.readLen[i]
2774 > self->core2na.min_read_length
2775 && run->rd.readLen[i] < res)
2776 {
2777 res = run->rd.readLen[i];
2778 empty = false;
2779 }
2780 }
2781 }
2782 }
2783 else {
2784 res = _VdbBlastRunScan(
2785 run, Min, self->core2na.min_read_length, res, &status);
2786 if (status != eVdbBlastNoErr) {
2787 S
2788 return ~0;
2789 }
2790 }
2791 }
2792 }
2793 if (empty && res == ~0) {
2794 res = 0;
2795 }
2796 ((VdbBlastRunSet*)self)->minSeqLen = res;
2797 }
2798 return self->minSeqLen;
2799 }
2800
2801 LIB_EXPORT
VdbBlastRunSetGetMaxSeqLen(const VdbBlastRunSet * self)2802 uint64_t CC VdbBlastRunSetGetMaxSeqLen(const VdbBlastRunSet *self)
2803 {
2804 if (self->maxSeqLen == ~0) {
2805 VdbBlastStatus status = eVdbBlastNoErr;
2806 uint64_t res = 0;
2807 uint32_t i = 0;
2808 _VdbBlastRunSetBeingRead(self);
2809 for (i = 0; i < self->runs.krun; ++i) {
2810 VdbBlastRun *run = &self->runs.run[i];
2811 assert(run);
2812 if (run->type == btpREFSEQ) {
2813 uint64_t cand = _VdbBlastRunGetLengthApprox(run, &status);
2814 if (status != eVdbBlastNoErr) {
2815 S
2816 return ~0;
2817 }
2818 if (cand > res) {
2819 res = cand;
2820 }
2821 }
2822 else {
2823 status = _VdbBlastRunFillRunDesc(run);
2824 if (status != eVdbBlastNoErr) {
2825 S
2826 return ~0;
2827 }
2828 if (!run->rd.varReadLen) {
2829 uint8_t read = 0;
2830 assert(run->rd.readType && run->rd.readLen);
2831 for (read = 0; read < run->rd.nReads; ++read) {
2832 if (run->rd.readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
2833 if (run->rd.readLen[i] > res) {
2834 res = run->rd.readLen[i];
2835 }
2836 }
2837 }
2838 }
2839 else {
2840 res = _VdbBlastRunScan(
2841 run, Max, -1, res, &status);
2842 if (status != eVdbBlastNoErr) {
2843 S
2844 return ~0;
2845 }
2846 }
2847 }
2848 }
2849 ((VdbBlastRunSet*)self)->maxSeqLen = res;
2850 }
2851 return self->maxSeqLen;
2852 }
2853
2854 LIB_EXPORT
VdbBlastRunSetGetAvgSeqLen(const VdbBlastRunSet * self)2855 uint64_t CC VdbBlastRunSetGetAvgSeqLen(const VdbBlastRunSet *self)
2856 {
2857 uint64_t num = 0;
2858
2859 if (self == NULL) {
2860 STSMSG(1, ("VdbBlastRunSetGetAvgSeqLen(self=NULL)"));
2861 return 0;
2862 }
2863
2864 if (self->avgSeqLen == ~0) {
2865 uint64_t n = 0;
2866 _VdbBlastRunSetBeingRead(self);
2867
2868 n = VdbBlastRunSetGetNumSequencesApprox(self);
2869 if (n != 0) {
2870 num = VdbBlastRunSetGetTotalLengthApprox(self) / n;
2871 }
2872 else {
2873 num = n;
2874 }
2875 ((VdbBlastRunSet*)self)->avgSeqLen = num;
2876 }
2877
2878 STSMSG(1, ("VdbBlastRunSetGetAvgSeqLen = %ld", num));
2879 return self->avgSeqLen;
2880 }
2881
VdbBlastRunSetGetName(const VdbBlastRunSet * self,VdbBlastStatus * status,char * name_buffer,size_t bsize)2882 LIB_EXPORT size_t CC VdbBlastRunSetGetName(const VdbBlastRunSet *self,
2883 VdbBlastStatus *status, char *name_buffer, size_t bsize)
2884 {
2885 size_t sz = 0;
2886
2887 VdbBlastStatus dummy = eVdbBlastNoErr;
2888 if (status == NULL)
2889 { status = &dummy; }
2890
2891 *status = eVdbBlastErr;
2892
2893 if (self == NULL)
2894 { return 0; }
2895
2896 _VdbBlastRunSetBeingRead(self);
2897
2898 sz = _RunSetGetName(&self->runs, status, name_buffer, bsize);
2899
2900 STSMSG(1, ("VdbBlastRunSetGetName = '%.*s'", bsize, name_buffer));
2901
2902 return sz;
2903 }
2904
2905 LIB_EXPORT
VdbBlastRunSetLastUpdatedDate(const VdbBlastRunSet * self)2906 time_t CC VdbBlastRunSetLastUpdatedDate(const VdbBlastRunSet *self)
2907 {
2908 _VdbBlastRunSetBeingRead(self);
2909 return _NotImplemented(__func__);
2910 }
2911
2912 LIB_EXPORT
VdbBlastRunSetGetReadName(const VdbBlastRunSet * self,uint64_t read_id,char * name_buffer,size_t bsize)2913 size_t CC VdbBlastRunSetGetReadName(const VdbBlastRunSet *self,
2914 uint64_t read_id, /* 0-based in RunSet */
2915 char *name_buffer,
2916 size_t bsize)
2917 {
2918 rc_t rc = 0;
2919 VdbBlastStatus status = eVdbBlastNoErr;
2920 size_t need = 0;
2921 size_t num_writ = 0;
2922
2923 ReadDesc desc;
2924 memset(&desc, 0, sizeof desc);
2925
2926 if (name_buffer && bsize)
2927 { name_buffer[0] = '\0'; }
2928
2929 if (self == NULL) {
2930 STSMSG(1, ("VdbBlastRunSetGetReadName(self=NULL)"));
2931 return 0;
2932 }
2933
2934 _VdbBlastRunSetBeingRead(self);
2935
2936 status = _RunSetFindReadDesc(&self->runs, read_id, &desc);
2937 if (status != eVdbBlastNoErr) {
2938 STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadName: "
2939 "cannot find RunSet ReadDesc"));
2940 return 0;
2941 }
2942
2943 assert(desc.run && desc.run->path && desc.run->acc && desc.spot
2944 && desc.read);
2945
2946 if (desc.run->type == btpUndefined) {
2947 desc.run->type
2948 = _VdbBlastMgrBTableType(self->mgr, desc.run->path);
2949 assert(desc.run->type != btpUndefined);
2950 }
2951 if (desc.run->type == btpWGS) {
2952 if (desc.read != 1) {
2953 STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadName: "
2954 "Unexpected read='%u' for run '%s', spot='%lu'",
2955 desc.read, desc.run->path, desc.spot));
2956 return 0;
2957 }
2958 status = _VdbBlastRunGetWgsAccession(
2959 desc.run, desc.spot, name_buffer, bsize, &need);
2960 if (status != eVdbBlastNoErr)
2961 { need = 0; }
2962 return need;
2963 }
2964 else if (desc.run->type == btpREFSEQ) {
2965 rc = string_printf(name_buffer, bsize, &num_writ, "%s", desc.run->acc);
2966 if (rc == 0) {
2967 S
2968 need = num_writ;
2969 }
2970 else if (GetRCObject(rc) == (enum RCObject)rcBuffer
2971 && GetRCState(rc) == rcInsufficient)
2972 {
2973 size_t size;
2974 S
2975 need = string_measure(desc.run->acc, &size) + 1;
2976 }
2977 }
2978 else {
2979 rc = string_printf(name_buffer, bsize, &num_writ,
2980 "%s.%lu.%u", desc.run->acc, desc.spot, desc.read);
2981 if (rc == 0) {
2982 S
2983 need = num_writ;
2984 }
2985 else if (GetRCObject(rc) == (enum RCObject)rcBuffer
2986 && GetRCState(rc) == rcInsufficient)
2987 {
2988 int i = 0;
2989 size_t size;
2990 S
2991 need = string_measure(desc.run->acc, &size) + 2;
2992 i = desc.spot;
2993 while (i > 0) {
2994 ++need;
2995 i /= 10;
2996 }
2997 i = desc.read;
2998 while (i > 0) {
2999 ++need;
3000 i /= 10;
3001 }
3002 }
3003 else
3004 { LOGERR(klogInt, rc, "Unexpecter error in string_printf"); }
3005 }
3006
3007 STSMSG(1, ("VdbBlastRunSetGetName = '%.*s'", bsize, name_buffer));
3008 return need;
3009 }
3010
VdbBlastRunSetGetReadId(const VdbBlastRunSet * self,const char * name_buffer,size_t bsize,uint64_t * read_id)3011 LIB_EXPORT uint32_t CC VdbBlastRunSetGetReadId(const VdbBlastRunSet *self,
3012 const char *name_buffer, size_t bsize, uint64_t *read_id)
3013 {
3014 VdbBlastStatus status = eVdbBlastNoErr;
3015 bool found = false;
3016
3017 uint64_t result = 0;
3018 char *acc = NULL;
3019 uint64_t spot = 0;
3020 uint32_t read = 0;
3021 uint32_t i = ~0;
3022 if (self == NULL || name_buffer == NULL || name_buffer[0] == '\0' ||
3023 bsize == 0 || read_id == 0)
3024 { return eVdbBlastErr; }
3025
3026 {
3027 size_t n = bsize;
3028 const char *end = name_buffer + bsize;
3029 char *dot2 = NULL;
3030 char *dot1 = memchr(name_buffer, '.', bsize);
3031 if (dot1 != NULL) {
3032 if (dot1 == name_buffer)
3033 { return eVdbBlastErr; }
3034 if (dot1 - name_buffer + 1 >= bsize)
3035 { return eVdbBlastErr; }
3036 n -= (dot1 - name_buffer + 1);
3037 dot2 = memchr(dot1 + 1, '.', n);
3038 if (dot2 != NULL) {
3039 if (dot2 - name_buffer + 1 >= bsize)
3040 { return eVdbBlastErr; }
3041 acc = string_dup(name_buffer, dot1 - name_buffer + 1);
3042 if (acc == NULL)
3043 { return eVdbBlastMemErr; }
3044 acc[dot1 - name_buffer] = '\0';
3045 while (++dot1 < dot2) {
3046 char c = *dot1;
3047 if (c < '0' || c > '9') {
3048 S
3049 status = eVdbBlastErr;
3050 break;
3051 }
3052 spot = spot * 10 + c - '0';
3053 }
3054 while (status == eVdbBlastNoErr && ++dot2 < end) {
3055 char c = *dot2;
3056 if (c < '0' || c > '9') {
3057 S
3058 status = eVdbBlastErr;
3059 break;
3060 }
3061 read = read * 10 + c - '0';
3062 }
3063 }
3064 else {
3065 acc = malloc(bsize + 1);
3066 if (acc == NULL)
3067 { return eVdbBlastMemErr; }
3068 string_copy(acc, bsize + 1, name_buffer, bsize);
3069 acc[bsize] = '\0';
3070 }
3071 }
3072 else {
3073 acc = malloc(bsize + 1);
3074 if (acc == NULL)
3075 { return eVdbBlastMemErr; }
3076 string_copy(acc, bsize + 1, name_buffer, bsize);
3077 acc[bsize] = '\0';
3078 }
3079 }
3080
3081 _VdbBlastRunSetBeingRead(self);
3082
3083 for (i = 0; i < self->runs.krun && status == eVdbBlastNoErr; ++i) {
3084 uint64_t id = ~0;
3085 VdbBlastRun *run = self->runs.run + i;
3086 size_t size;
3087 assert(run && run->acc);
3088 if (string_measure(run->acc, &size)
3089 == string_measure(acc, &size))
3090 {
3091 if (strcmp(run->acc, acc) == 0) {
3092 status = _VdbBlastRunGetReadId(run, acc, spot, read, &id);
3093 if (status == eVdbBlastNoErr) {
3094 if (run->rd.readIdDesc.idType == eFixedReadN) {
3095 id += result;
3096 }
3097 *read_id = id;
3098 found = true;
3099 }
3100 break;
3101 }
3102 }
3103 else if ((string_measure(run->acc, &size) < string_measure(acc, &size))
3104 && (run->type == btpWGS)
3105 && (memcmp(run->acc, acc, string_measure(run->acc, &size))
3106 == 0))
3107 {
3108 status = _VdbBlastRunGetReadId(run, acc, spot, read, &id);
3109 if (status == eVdbBlastNoErr) {
3110 if (run->rd.readIdDesc.idType == eFixedReadN) {
3111 result += id;
3112 }
3113 else {
3114 result = id;
3115 }
3116 *read_id = result;
3117 found = true;
3118 }
3119 break;
3120 }
3121 result += _VdbBlastRunGetSequencesAmount(run, &status);
3122 if (status != eVdbBlastNoErr) {
3123 if (status == eVdbBlastTooExpensive) {
3124 status = eVdbBlastNoErr;
3125 }
3126 else {
3127 break;
3128 }
3129 }
3130 }
3131
3132 if (status == eVdbBlastNoErr && !found) {
3133 S
3134 status = eVdbBlastErr;
3135 }
3136
3137 free (acc);
3138 acc = NULL;
3139
3140 return status;
3141 }
3142
3143 /* TODO: make sure
3144 ReadLength is correct when there are multiple reads in the same spot */
3145 LIB_EXPORT
VdbBlastRunSetGetReadLength(const VdbBlastRunSet * self,uint64_t read_id)3146 uint64_t CC VdbBlastRunSetGetReadLength(const VdbBlastRunSet *self,
3147 uint64_t read_id)
3148 {
3149 rc_t rc = 0;
3150 const VCursor *curs = NULL;
3151 uint32_t col_idx = 0;
3152 char buffer[84] = "";
3153 uint32_t row_len = 0;
3154 ReadDesc desc;
3155 VdbBlastStatus status = eVdbBlastErr;
3156
3157 if (self == NULL) {
3158 STSMSG(1, ("VdbBlastRunSetGetReadLength(self=NULL)"));
3159 return 0;
3160 }
3161
3162 status = _RunSetFindReadDesc(&self->runs, read_id, &desc);
3163 if (status != eVdbBlastNoErr) {
3164 STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadLength: "
3165 "cannot find RunSet ReadDesc"));
3166 return 0;
3167 }
3168 assert(desc.run && desc.spot && desc.run->path);
3169
3170 _VdbBlastRunSetBeingRead(self);
3171
3172 if (rc == 0) {
3173 rc = _VTableMakeCursor
3174 (desc.run->obj->seqTbl, &curs, &col_idx, "READ", desc.run->acc);
3175 if (rc != 0) {
3176 PLOGERR(klogInt, (klogInt, rc, "Error in _VTableMakeCursor"
3177 "$(path), READ)", "path=%s", desc.run->path));
3178 }
3179 }
3180 if (rc == 0) {
3181 rc = VCursorReadDirect
3182 (curs, desc.spot, col_idx, 8, buffer, sizeof buffer, &row_len);
3183 if (rc != 0) {
3184 PLOGERR(klogInt, (klogInt, rc, "Error in VCursorReadDirect"
3185 "$(path), READ, spot=$(spot))",
3186 "path=%s,spot=%ld", desc.run->path, desc.spot));
3187 }
3188 }
3189 VCursorRelease(curs);
3190 curs = NULL;
3191 if (rc == 0) {
3192 STSMSG(1, ("VdbBlastRunSetGetReadLength = %ld", row_len));
3193 return row_len;
3194 }
3195 else {
3196 STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadLength"));
3197 return 0;
3198 }
3199 }
3200
_VdbBlastRunSet2naRead(const VdbBlastRunSet * self,VdbBlastStatus * status,uint64_t * read_id,size_t * starting_base,uint8_t * buffer,size_t buffer_size,KVdbBlastReadMode mode)3201 uint64_t _VdbBlastRunSet2naRead(const VdbBlastRunSet *self,
3202 VdbBlastStatus *status, uint64_t *read_id, size_t *starting_base,
3203 uint8_t *buffer, size_t buffer_size, KVdbBlastReadMode mode)
3204 {
3205 uint64_t n = 0;
3206 rc_t rc = 0;
3207 assert(self && status);
3208
3209 rc = KLockAcquire(self->core2na.mutex);
3210 if (rc != 0) {
3211 LOGERR(klogInt, rc, "Error in KLockAcquire");
3212 }
3213 else {
3214 const Core2na *c
3215 = mode != VDB_READ_REFERENCE ? &self->core2na : &self->core2naRef;
3216 n = _Core2naRead((Core2na*)c, &self->runs, status,
3217 read_id, starting_base, buffer, buffer_size);
3218 if (n == 0 && self->core2na.eos) {
3219 *read_id = ~0;
3220 }
3221 rc = KLockUnlock(self->core2na.mutex);
3222 if (rc != 0) {
3223 LOGERR(klogInt, rc, "Error in KLockUnlock");
3224 }
3225 }
3226 if (rc != 0) {
3227 *status = eVdbBlastErr;
3228 }
3229
3230 if (*status == eVdbBlastNoErr || *status == eVdbBlastCircularSequence) {
3231 if (read_id != NULL && starting_base != NULL) {
3232 STSMSG(3, (
3233 "VdbBlast2naReaderRead(read_id=%ld, starting_base=%ld) = %ld",
3234 *read_id, *starting_base, n));
3235 }
3236 else {
3237 STSMSG(2, ("VdbBlast2naReaderRead = %ld", n));
3238 }
3239 }
3240 else {
3241 if (read_id != NULL && starting_base != NULL) {
3242 STSMSG(1, ("Error: failed to VdbBlast2naReaderRead(%ld)", n));
3243 }
3244 else {
3245 STSMSG(1, ("Error: failed to VdbBlast2naReaderRead"));
3246 }
3247 }
3248 return n;
3249 }
3250
3251 /******************************************************************************/
3252