1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include <vdb/extern.h>
28 
29 #include "reader.h" /* _RunSetFindReadDesc */
30 #include "blast-mgr.h" /* BTableType */
31 #include "run-set.h" /* VdbBlastRunSet */
32 
33 #include <kdb/kdb-priv.h> /* KTableGetPath */
34 #include <kdb/table.h> /* KTable */
35 
36 #include <klib/debug.h> /* DBGMSG */
37 #include <klib/log.h> /* LOGERR */
38 #include <klib/rc.h> /* RC */
39 #include <klib/refcount.h> /* KRefcount */
40 #include <klib/status.h> /* STSMSG */
41 
42 #include <kproc/lock.h> /* KLockMake */
43 
44 #include <ncbi/vdb-blast.h> /* VdbBlastRunSet */
45 
46 #include <vdb/cursor.h> /* VCursor */
47 #include <vdb/database.h> /* VDatabase */
48 #include <vdb/table.h> /* VTable */
49 #include <vdb/vdb-priv.h> /* VTableOpenKTableRead */
50 
51 #include <sysalloc.h>
52 
53 #include <string.h> /* memset */
54 #include <time.h> /* time_t */
55 
56 #include <limits.h> /* PATH_MAX */
57 #ifndef PATH_MAX
58 #define PATH_MAX 4096
59 #endif
60 
61 void _Core2naFini(Core2na *self);
62 void _Core4naFini(Core4na *self);
63 
64 /******************************************************************************/
65 
_NotImplementedP(const char * func)66 static void *_NotImplementedP(const char *func) {
67     PLOGERR(klogErr, (klogErr, -1,
68         "$(func): is not implemented", "func=%s", func));
69     return 0;
70 }
71 
_NotImplemented(const char * func)72 static size_t _NotImplemented(const char *func)
73 {   return (size_t)_NotImplementedP(func); }
74 
Min(uint64_t cand,uint64_t champ,int64_t minRead,bool * done)75 static uint64_t CC Min(uint64_t cand,
76     uint64_t champ, int64_t minRead, bool *done)
77 {
78     assert(done);
79     if (minRead >= 0) {
80         if (cand < minRead) {
81             return champ;
82         }
83         else if (cand == minRead) {
84             *done = true;
85             return minRead;
86         }
87     }
88     return cand < champ ? cand : champ;
89 }
90 
Max(uint64_t cand,uint64_t champ,int64_t minRead,bool * done)91 static uint64_t CC Max(uint64_t cand,
92     uint64_t champ, int64_t minRead, bool *done)
93 {
94     return cand > champ ? cand : champ;
95 }
96 
_CanonocalName(const char * name)97 static char* _CanonocalName(const char *name) {
98     size_t noExtSize = 0;
99     size_t namelen = 0;
100     const char ext[] = ".sra";
101 
102     if (name == NULL) {
103         return NULL;
104     }
105 
106     noExtSize = namelen = string_size(name);
107 
108     if (namelen >= sizeof(ext)) {
109         const char *tail = NULL;
110         noExtSize -= sizeof ext - 1;
111         tail = name + noExtSize;
112         if (string_cmp(ext, sizeof ext - 1, tail, sizeof ext - 1, 99) != 0) {
113             noExtSize = namelen;
114         }
115     }
116 
117     return string_dup(name, noExtSize);
118 }
119 
120 /******************************************************************************/
121 
_VCursorCellDataDirect(const VCursor * self,int64_t row_id,uint32_t col_idx,uint32_t elemBits,const void ** base,uint32_t nreads,const char * name)122 static rc_t _VCursorCellDataDirect(const VCursor *self,
123     int64_t row_id, uint32_t col_idx, uint32_t elemBits,
124     const void **base, uint32_t nreads, const char *name)
125 {
126     uint32_t elem_bits = 0;
127     uint32_t row_len = 0;
128     uint32_t boff = 0;
129 
130     rc_t rc = VCursorCellDataDirect(self, row_id, col_idx,
131         &elem_bits, base, &boff, &row_len);
132 
133     if (rc != 0) {
134         PLOGERR(klogInt, (klogInt, rc,
135             "Error during VCursorCellDataDirect($(name), $(spot))",
136             "name=%s,spot=%lu", name, col_idx));
137     }
138     else if (boff != 0 || elem_bits != elemBits) {
139         rc = RC(rcSRA, rcCursor, rcReading, rcData, rcUnexpected);
140         PLOGERR(klogInt, (klogInt, rc,
141             "Bad VCursorCellDataDirect($(name), $(spot)) result: "
142             "boff=$(boff), elem_bits=$(elem_bits)",
143             "name=%s,spot=%lu,boff=%u,elem_bits=%u",
144             name, col_idx, boff, elem_bits));
145     }
146     else if (row_len != nreads) {
147         rc = RC(rcSRA, rcCursor, rcReading, rcData, rcUnexpected);
148         PLOGERR(klogInt, (klogInt, rc,
149             "Bad VCursorCellDataDirect($(name), $(spot)) result: "
150             "row_len=$(row_len)",
151             "name=%s,spot=%lu,row_len=%u", name, col_idx, row_len));
152     }
153 
154     return rc;
155 }
156 
157 /******************************************************************************/
158 
_VDatabaseOpenAlignmentTable(const VDatabase * self,const char * path,const VTable ** tbl)159 static VdbBlastStatus _VDatabaseOpenAlignmentTable(const VDatabase *self,
160     const char *path,
161     const VTable **tbl)
162 {
163     const char *table = "PRIMARY_ALIGNMENT";
164 
165     rc_t rc = VDatabaseOpenTableRead(self, tbl, table);
166     if (rc != 0) {
167         PLOGERR(klogInt, (klogInt, rc,
168             "Error in VDatabaseOpenTableRead($(name), $(tbl))",
169             "name=%s,tbl=%s", path, table));
170         STSMSG(1, ("Error: failed to open DB table '%s/%s'", path, table));
171     }
172     else {
173         STSMSG(1, ("Opened DB table '%s/%s'", path, table));
174     }
175 
176     return rc != 0 ? eVdbBlastErr : eVdbBlastNoErr;;
177 }
178 
179 /******************************************************************************/
180 
181 typedef struct {
182     const VCursor *curs;
183     uint32_t colREAD_LEN;
184     uint32_t colREAD_TYPE;
185     uint64_t techBasesPerSpot;
186     bool techBasesPerSpotEquals;
187     uint64_t bioBasesCnt;
188 } ApprCnt;
_ApprCntInit(ApprCnt * self,const VTable * tbl,const char * acc)189 static rc_t _ApprCntInit(ApprCnt *self, const VTable *tbl, const char *acc) {
190     rc_t rc = 0;
191 
192     assert(self);
193 
194     memset(self, 0, sizeof *self);
195 
196     self->techBasesPerSpotEquals = true;
197     self->techBasesPerSpot = ~0;
198 
199     if (rc == 0) {
200         rc = VTableCreateCursorRead(tbl, &self->curs);
201         if (rc != 0) {
202             PLOGERR(klogInt, (klogInt, rc,
203                 "Error during VTableCreateCursorRead for $(acc)",
204                 "acc=%s", acc));
205         }
206     }
207 
208     if (rc == 0) {
209         const char name[] = "READ_LEN";
210         rc = VCursorAddColumn(self->curs, &self->colREAD_LEN, name);
211         if (rc != 0) {
212             PLOGERR(klogInt, (klogInt, rc,
213                 "Error during VCursorAddColumn($(name))", "name=%s", name));
214         }
215     }
216 
217     if (rc == 0) {
218         const char name[] = "READ_TYPE";
219         rc = VCursorAddColumn(self->curs, &self->colREAD_TYPE, name);
220         if (rc != 0) {
221             PLOGERR(klogInt, (klogInt, rc,
222                 "Error during VCursorAddColumn($(name))", "name=%s", name));
223         }
224     }
225 
226     if (rc == 0) {
227         rc = VCursorOpen(self->curs);
228         if (rc != 0) {
229             LOGERR(klogInt, rc, "Error during VCursorOpen");
230         }
231     }
232 
233     return rc;
234 }
235 
_ApprCntFini(ApprCnt * self)236 static rc_t _ApprCntFini(ApprCnt *self) {
237     rc_t rc = 0;
238     assert(self);
239     RELEASE(VCursor, self->curs);
240     memset(self, 0, sizeof *self);
241     return rc;
242 }
243 
_ApprCntChunk(ApprCnt * self,uint64_t chunk,uint64_t l,uint64_t nspots,uint32_t nreads)244 static rc_t _ApprCntChunk(ApprCnt *self,
245     uint64_t chunk, uint64_t l, uint64_t nspots, uint32_t nreads)
246 {
247     rc_t rc = 0;
248     uint64_t start = nspots / 10 * chunk + 1;
249     uint64_t spot = 0;
250     uint64_t end = start + l;
251     assert(self);
252     if (end - 1 > nspots) {
253         end = nspots + 1;
254     }
255     for (spot = start; spot < end; ++spot) {
256         uint64_t techBases = 0;
257         uint64_t bioBases = 0;
258         uint32_t read = 0;
259         const uint32_t *readLen = NULL;
260         const INSDC_read_type *readType = NULL;
261         const void *base = NULL;
262         rc = _VCursorCellDataDirect(self->curs, spot, self->colREAD_LEN,
263             32, &base, nreads, "READ_LEN");
264         if (rc != 0) {
265             return rc;
266         }
267         readLen = base;
268         rc = _VCursorCellDataDirect(self->curs, spot, self->colREAD_TYPE,
269             8, &base, nreads, "READ_TYPE");
270         if (rc != 0) {
271             return rc;
272         }
273         readType = base;
274         for (read = 0; read < nreads; ++read) {
275             INSDC_read_type type = readType[read] & 1;
276             if (type == READ_TYPE_BIOLOGICAL) {
277                 bioBases += readLen[read];
278             }
279             else {
280                 techBases += readLen[read];
281             }
282         }
283         if (self->techBasesPerSpotEquals) {
284             if (self->techBasesPerSpot != techBases) {
285                 if (self->techBasesPerSpot == ~0) {
286                     self->techBasesPerSpot = techBases;
287                 }
288                 else {
289                     self->techBasesPerSpotEquals = false;
290                 }
291             }
292         }
293         self->bioBasesCnt += bioBases;
294     }
295     return rc;
296 }
297 
298 /******************************************************************************/
299 
_VTableLogRowData(const VTable * self,const char * column,void * buffer,uint32_t blen)300 static rc_t _VTableLogRowData(const VTable *self,
301     const char *column, void *buffer, uint32_t blen)
302 {
303     rc_t rc = 0;
304 
305 #if _DEBUGGING
306     if (buffer && blen == 64) {
307         uint64_t data = *((uint64_t*)buffer);
308         const KTable *ktbl = NULL;
309         rc_t rc = VTableOpenKTableRead(self, &ktbl);
310         if (rc == 0) {
311             const char *path = NULL;
312             rc = KTableGetPath(ktbl, &path);
313             if (rc == 0) {
314                 DBGMSG(DBG_BLAST, DBG_FLAG(DBG_BLAST_BLAST),
315                     ("%s: %s: %lu\n", path, column, data));
316             }
317         }
318 
319         KTableRelease(ktbl);
320     }
321 #endif
322 
323     if (rc != 0)
324     {   PLOGERR(klogInt, (klogInt, rc, "Error in $(f)", "f=%s", __func__)); }
325 
326     return rc;
327 }
328 
_VTableMakeCursorImpl(const VTable * self,const VCursor ** curs,uint32_t * col_idx,const char * col_name,bool canBeMissed,const char * acc)329 static rc_t _VTableMakeCursorImpl(const VTable *self, const VCursor **curs,
330     uint32_t *col_idx, const char *col_name, bool canBeMissed, const char *acc)
331 {
332     rc_t rc = 0;
333 
334     assert(curs && col_name);
335 
336     if (rc == 0) {
337         rc = VTableCreateCursorRead(self, curs);
338         if (rc != 0) {
339             LOGERR(klogInt, rc, "Error during VTableCreateCursorRead");
340         }
341     }
342 
343     if (rc == 0) {
344         VCursorPermitPostOpenAdd(*curs);
345         if (rc != 0) {
346             LOGERR(klogInt, rc, "Error during VCursorPermitPostOpenAdd");
347         }
348     }
349 
350     if (rc == 0) {
351         rc = VCursorOpen(*curs);
352         if (rc != 0) {
353             PLOGERR(klogInt, (klogInt, rc,
354                 "Error in VCursorOpen($(name))", "name=%s", col_name));
355         }
356     }
357 
358     if (rc == 0) {
359         assert(*curs);
360         rc = VCursorAddColumn(*curs, col_idx, "%s", col_name);
361         if (rc != 0 && !canBeMissed) {
362             PLOGERR(klogInt, (klogInt, rc,
363                 "Error in VCursorAddColumn($(name))", "name=%s", col_name));
364         }
365     }
366 
367     STSMSG(2, ("Prepared a VCursor to read '%s'", col_name));
368 
369     return rc;
370 }
371 
_VTableMakeCursor(const VTable * self,const VCursor ** curs,uint32_t * col_idx,const char * col_name,const char * acc)372 rc_t _VTableMakeCursor(const VTable *self, const VCursor **curs,
373     uint32_t *col_idx, const char *col_name, const char *acc)
374 {
375     return _VTableMakeCursorImpl(self, curs, col_idx, col_name, false, acc);
376 }
377 
378 /*static*/
_VTableReadFirstRowImpl(const VTable * self,const char * name,void * buffer,uint32_t blen,EColType * is_static,bool canBeMissed,uint32_t * row_len,const char * acc)379 uint32_t _VTableReadFirstRowImpl(const VTable *self, const char *name,
380     void *buffer, uint32_t blen, EColType *is_static, bool canBeMissed,
381     uint32_t *row_len, const char *acc)
382 {
383     VdbBlastStatus status = eVdbBlastNoErr;
384 
385     rc_t rc = 0;
386 
387     const VCursor *curs = NULL;
388     uint32_t idx = 0;
389 
390     uint32_t dummy = 0;
391     if (row_len == NULL) {
392         row_len = &dummy;
393     }
394 
395     assert(self && name);
396 
397     blen *= 8;
398 
399     rc = _VTableMakeCursorImpl(self, &curs, &idx, name, canBeMissed, acc);
400     if (rc != 0) {
401         if (rc ==
402             SILENT_RC(rcVDB, rcCursor, rcOpening, rcColumn, rcUndefined)
403          || rc ==
404             SILENT_RC(rcVDB, rcCursor, rcUpdating, rcColumn, rcNotFound))
405         {
406             if (!canBeMissed) {
407                 PLOGMSG(klogInfo, (klogInfo, "$(f): Column '$(name)' not found",
408                     "f=%s,name=%s", __func__, name));
409             }
410             if (is_static != NULL) {
411                 *is_static = eColTypeAbsent;
412             }
413             status = eVdbBlastTooExpensive;
414         }
415         else {
416             status = eVdbBlastErr;
417             if (is_static != NULL) {
418                 *is_static = eColTypeError;
419             }
420         }
421     }
422 
423     if (status == eVdbBlastNoErr && rc == 0) {
424         rc = VCursorOpenRow(curs);
425         if (rc != 0) {
426             PLOGERR(klogInt, (klogInt, rc,
427                 "Error in VCursorOpenRow($(name))", "name=%s", name));
428         }
429     }
430 
431     if (status == eVdbBlastNoErr && rc == 0 && buffer != NULL && blen > 0) {
432         rc = VCursorRead(curs, idx, 8, buffer, blen, row_len);
433         if (rc != 0) {
434             PLOGERR(klogInt, (klogInt, rc,
435                 "Error in VCursorRead($(name))", "name=%s", name));
436         }
437     }
438 
439 /* TODO: needs to be verified what row_len is expected
440     if (*row_len != 1) return eVdbBlastErr; */
441 
442     STSMSG(2, ("Read the first row of '%s'", name));
443 
444     if (status == eVdbBlastNoErr && rc == 0) {
445         if (blen == 64) {
446             _VTableLogRowData(self, name, buffer, blen);
447         }
448         if (is_static != NULL) {
449             bool isStatic = false;
450             rc = VCursorIsStaticColumn(curs, idx, &isStatic);
451             if (rc != 0) {
452                 PLOGERR(klogInt, (klogInt, rc,
453                     "Error in VCursorIsStaticColumn($(name))",
454                     "name=%s", name));
455             }
456             else {
457                 *is_static = isStatic ? eColTypeStatic : eColTypeNonStatic;
458             }
459         }
460     }
461 
462     VCursorRelease(curs);
463 
464     if (status == eVdbBlastNoErr && rc != 0) {
465         status = eVdbBlastErr;
466     }
467     return status;
468 }
469 
_VTableReadFirstRow(const VTable * self,const char * name,void * buffer,uint32_t blen,EColType * is_static,const char * acc)470 static VdbBlastStatus _VTableReadFirstRow(const VTable *self, const char *name,
471     void *buffer, uint32_t blen, EColType *is_static, const char *acc)
472 {
473     return _VTableReadFirstRowImpl
474         (self, name, buffer, blen, is_static, false, NULL, acc);
475 }
476 
_VTableReadPLATFORMoptional(const VTable * self,const char * acc)477 static INSDC_SRA_platform_id _VTableReadPLATFORMoptional
478     (const VTable *self, const char *acc)
479 {
480     INSDC_SRA_platform_id platform = SRA_PLATFORM_UNDEFINED;
481 
482     VdbBlastStatus status = _VTableReadFirstRowImpl(self,
483         "PLATFORM", &platform, sizeof platform, NULL, true, NULL, acc);
484 
485     if (status == eVdbBlastNoErr) {
486         return platform;
487     }
488     else {
489         return SRA_PLATFORM_UNDEFINED;
490     }
491 }
492 
_VTableVarReadNum(const VTable * self,const char * acc)493 static bool _VTableVarReadNum(const VTable *self, const char *acc) {
494     return _VTableReadPLATFORMoptional(self, acc) == SRA_PLATFORM_PACBIO_SMRT;
495 }
496 
497 static uint64_t BIG = 10000 /*11*/;
_VTableBioBaseCntApprox(const VTable * self,uint64_t nspots,uint32_t nreads,uint64_t * bio_base_count,const char * acc)498 static rc_t _VTableBioBaseCntApprox(const VTable *self,
499     uint64_t nspots, uint32_t nreads, uint64_t *bio_base_count, const char *acc)
500 {
501     rc_t rc = 0;
502     uint64_t c = nspots;
503     uint64_t d = 1;
504     ApprCnt ac;
505     rc = _ApprCntInit(&ac, self, acc);
506     assert(bio_base_count);
507     if (rc == 0) {
508         if (nspots < BIG) {
509             STSMSG(2,
510                 ("VdbBlastRunSetGetTotalLengthApprox: counting all reads\n"));
511             rc = _ApprCntChunk(&ac, 0, nspots, nspots, nreads);
512         }
513         else {
514             uint64_t i = 0;
515             uint64_t l = 0;
516             uint64_t n = 10 /* 3 */;
517             while (c > BIG) {
518                 c /= 2;
519                 d *= 2;
520             }
521             l = c / n;
522             if (l == 0) {
523                 l = 1;
524             }
525             for (i = 1; i < n - 1 && rc == 0; ++i) {
526                 rc = _ApprCntChunk(&ac, i, l, nspots, nreads);
527             }
528         }
529     }
530     if (rc == 0) {
531         VdbBlastStatus status = eVdbBlastNoErr;
532         if (nspots < BIG) {
533             *bio_base_count = ac.bioBasesCnt;
534         }
535         else {
536             if (ac.techBasesPerSpotEquals && ac.techBasesPerSpot == ~0) {
537                 ac.techBasesPerSpotEquals = false;
538             }
539             if (ac.techBasesPerSpotEquals ) {
540                 uint64_t baseCount = 0;
541                 status = _VTableReadFirstRow(self,
542                     "BASE_COUNT", &baseCount, sizeof baseCount, NULL, acc);
543                 if (status == eVdbBlastNoErr) {
544                     STSMSG(2, ("VdbBlastRunSetGetTotalLengthApprox: "
545                         "fixed technical reads length\n"));
546                     *bio_base_count = baseCount - nspots * ac.techBasesPerSpot;
547                 }
548                 else {
549                     STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox: "
550                         "cannot read BASE_COUNT\n"));
551                 }
552             }
553             if (!ac.techBasesPerSpotEquals || status != eVdbBlastNoErr) {
554 /*              double dd = nspots / c; */
555                 STSMSG(2, ("VdbBlastRunSetGetTotalLengthApprox: "
556                     "extrapolating variable read length\n"));
557                 *bio_base_count = ac.bioBasesCnt * d;
558             }
559         }
560     }
561     _ApprCntFini(&ac);
562     return rc;
563 }
564 
565 static
_VTableGetNReads(const VTable * self,uint32_t * nreads,const char * acc)566 uint32_t _VTableGetNReads(const VTable *self, uint32_t *nreads, const char *acc)
567 {
568     rc_t rc = 0;
569     VdbBlastStatus status = eVdbBlastNoErr;
570     const VCursor *curs = NULL;
571     uint32_t idx = 0;
572 
573     const char name[] = "READ_LEN";
574 
575     assert(nreads);
576 
577     rc = _VTableMakeCursor(self, &curs, &idx, name, acc);
578     if (rc != 0) {
579         status = eVdbBlastErr;
580         if (rc ==
581             SILENT_RC(rcVDB, rcCursor, rcOpening, rcColumn, rcUndefined))
582         {
583             PLOGMSG(klogInfo, (klogInfo, "$(f): Column '$(name)' not found",
584                 "f=%s,name=%s", __func__, name));
585         }
586         else {
587             PLOGMSG(klogInfo, (klogInfo, "$(f): Cannot open column '$(name)'",
588                 "f=%s,name=%s", __func__, name));
589         }
590     }
591 
592     if (status == eVdbBlastNoErr) {
593         uint32_t elem_bits, elem_off, elem_cnt;
594         const void *base = NULL;
595         rc = VCursorCellDataDirect(curs, 1, idx,
596                 &elem_bits, &base, &elem_off, &elem_cnt);
597         if (rc != 0) {
598             status = eVdbBlastErr;
599             PLOGMSG(klogInfo, (klogInfo,
600                 "$(f): Cannot '$(name)' CellDataDirect",
601                 "f=%s,name=%s", __func__, name));
602         }
603         else if (elem_off != 0 || elem_bits != 32) {
604             status = eVdbBlastErr;
605             PLOGERR(klogInt, (klogInt, rc,
606                 "Bad VCursorCellDataDirect(READ_LEN) result: "
607                 "boff=$(elem_off), elem_bits=$(elem_bits)",
608                 "elem_off=%u, elem_bits=%u", elem_off, elem_bits));
609         }
610         else {
611             *nreads = elem_cnt;
612         }
613     }
614 
615     RELEASE(VCursor, curs);
616 
617     return status;
618 }
619 
_VTableCSra(const VTable * self)620 static bool _VTableCSra(const VTable *self) {
621     bool cSra = false;
622 
623     KNamelist *names = NULL;
624 
625     uint32_t i = 0, count = 0;
626 
627     rc_t rc = VTableListPhysColumns(self, &names);
628 
629     if (rc == 0) {
630         rc = KNamelistCount(names, &count);
631     }
632 
633     for (i = 0 ; i < count && rc == 0; ++i) {
634         const char *name = NULL;
635         rc = KNamelistGet(names, i, &name);
636         if (rc == 0) {
637             const char b[] = "CMP_READ";
638             if (string_cmp(name, string_measure(name, NULL),
639                 b, sizeof b - 1, sizeof b - 1) == 0)
640             {
641                 cSra = true;
642                 break;
643             }
644         }
645     }
646 
647     RELEASE(KNamelist, names);
648 
649     return cSra;
650 }
651 
652 /******************************************************************************/
653 
_RunDescFini(RunDesc * self)654 static void _RunDescFini(RunDesc *self) {
655     assert(self);
656     free(self->readLen);
657     free(self->readType);
658     free(self->rdFilter);
659     memset(self, 0, sizeof *self);
660 }
661 
662 static
_RunDescMakeReadId(const RunDesc * self,uint64_t spot,uint8_t read,uint64_t read_id,VdbBlastStatus * status)663 uint64_t _RunDescMakeReadId(const RunDesc *self, uint64_t spot, uint8_t read,
664     uint64_t read_id, VdbBlastStatus *status)
665 {
666     bool overflow = false;
667 
668     assert(self && status);
669 
670     *status = eVdbBlastNoErr;
671 
672     if (self->readIdDesc.idType == eFixedReadN) {
673         return read_id;
674     }
675 
676     assert(self->readIdDesc.idType == eFactor10);
677 
678     read_id = read * self->spotBits + spot;
679     if (read_id < self->spotBits) {
680         overflow = true;
681     }
682 
683     if (self->readIdDesc.runBits > 0) {
684         uint64_t prev = read_id;
685         read_id *= self->readIdDesc.runBits;
686         read_id += self->index;
687         if (read_id < prev) {
688             overflow = true;
689         }
690     }
691 
692     if (overflow) {
693         *status = eVdbBlastErr;
694     }
695 
696     return read_id;
697 }
698 
699 /******************************************************************************/
700 /* VdbBlastDb */
_VdbBlastDbWhack(VdbBlastDb * self)701 static void _VdbBlastDbWhack(VdbBlastDb *self) {
702     if (self == NULL) {
703         return;
704     }
705 
706     VCursorRelease(self->cursACCESSION);
707     VCursorRelease(self->cursSeq);
708 
709     VTableRelease(self->prAlgnTbl);
710     VTableRelease(self->refTbl);
711     VTableRelease(self->seqTbl);
712 
713     VDatabaseRelease(self->db);
714 
715     memset(self, 0, sizeof *self);
716 
717     free(self);
718 }
719 
_VdbBlastDbOpenSeqCurs(VdbBlastDb * self,const char * acc)720 static rc_t _VdbBlastDbOpenSeqCurs(VdbBlastDb *self, const char *acc) {
721     rc_t rc = 0;
722     const VCursor *curs = NULL;
723     assert(self);
724     if (self->cursSeq != NULL) {
725         return 0;
726     }
727     rc = VTableCreateCursorRead(self->seqTbl, &self->cursSeq);
728     if (rc != 0) {
729         PLOGERR(klogInt, (klogInt, rc,
730             "Error during VTableCreateCursorRead for $(acc)", "acc=%s", acc));
731     }
732     else {
733         curs = self->cursSeq;
734     }
735     if (rc == 0) {
736         const char name[] = "READ_FILTER";
737         rc = VCursorAddColumn(curs, &self->col_READ_FILTER, name);
738         if (rc != 0) {
739             PLOGERR(klogInt, (klogInt, rc,
740                 "Error during VCursorAddColumn($(name))", "name=%s", name));
741         }
742     }
743     if (rc == 0) {
744         const char name[] = "READ_LEN";
745         rc = VCursorAddColumn(curs, &self->col_READ_LEN, name);
746         if (rc != 0) {
747             PLOGERR(klogInt, (klogInt, rc,
748                 "Error during VCursorAddColumn($(name))", "name=%s", name));
749         }
750     }
751     if (rc == 0) {
752         const char name[] = "READ_TYPE";
753         rc = VCursorAddColumn(curs, &self->col_READ_TYPE, name);
754         if (rc != 0) {
755             PLOGERR(klogInt, (klogInt, rc,
756                 "Error during VCursorAddColumn($(name))", "name=%s", name));
757         }
758     }
759     if (rc == 0) {
760         const char name[] = "TRIM_LEN";
761         rc = VCursorAddColumn(curs, &self->col_TRIM_LEN, name);
762         if (rc != 0) {
763             PLOGERR(klogInt, (klogInt, rc,
764                 "Error during VCursorAddColumn($(name))", "name=%s", name));
765         }
766     }
767     if (rc == 0) {
768         const char name[] = "TRIM_START";
769         rc = VCursorAddColumn(curs, &self->col_TRIM_START, name);
770         if (rc != 0) {
771             PLOGERR(klogInt, (klogInt, rc,
772                 "Error during VCursorAddColumn($(name))", "name=%s", name));
773         }
774     }
775     if (rc == 0) {
776         rc = VCursorOpen(curs);
777         if (rc != 0) {
778             LOGERR(klogInt, rc, "Error during VCursorOpen");
779         }
780     }
781     return rc;
782 }
783 
784 typedef enum {
785     eFirstRead,
786     eNextRead,
787     eAllReads,
788 } TReadType;
_VdbBlastDbFindRead(const VdbBlastDb * self,ReadDesc * rd,TReadType type,bool * found)789 static rc_t _VdbBlastDbFindRead
790     (const VdbBlastDb *self, ReadDesc *rd, TReadType type, bool *found)
791 {
792     rc_t rc = 0;
793 
794     int64_t first = 0;
795     uint64_t count = 0;
796 
797     const VCursor *curs = NULL;
798 
799     assert(self && rd && found);
800 
801     *found = false;
802 
803     if (type == eAllReads) {
804         memset(rd, 0, sizeof *rd);
805     }
806 
807     curs = self->cursSeq;
808 
809     rc = VCursorIdRange(curs, self->col_READ_LEN, &first, &count);
810 
811     if (rc == 0) {
812         int64_t i = 0;
813         uint32_t c = 0;
814         if (type == eNextRead) {
815             if (rd->spot < first) {
816                 return -1;
817             }
818             i = rd->spot - first;
819             c = rd->read;
820             if (c + 1 >= rd->nReads) {
821                 c = 0;
822                 ++i;
823             }
824         }
825         for (; i < count; ++i) {
826             int64_t spot = first + i;
827 
828             const void *base = NULL;
829             uint32_t elem_bits = 0;
830             uint32_t elem_cnt = 0;
831             uint32_t elem_off = 0;
832             rc = VCursorCellDataDirect(curs, spot, self->col_READ_TYPE,
833                 &elem_bits, &base, &elem_off, &elem_cnt);
834             if (rc != 0) {
835                 break;
836             }
837             else {
838                 INSDC_read_type *aRt = (INSDC_read_type*)base;
839                 for (; c < elem_cnt; ++c) {
840                     INSDC_read_type *rt = aRt + c;
841                     assert(rt);
842 
843                     if (*rt & SRA_READ_TYPE_BIOLOGICAL) {
844                         *found = true;
845                         if (type == eAllReads) {
846                             ++rd->spot;
847                         }
848                         else {
849                             rd->spot = spot;
850                             rd->read = c + 1;
851                             rd->nReads = elem_cnt;
852                             return rc;
853                         }
854                     }
855                 }
856                 c = 0;
857             }
858         }
859     }
860 
861     return rc;
862 }
863 
_VdbBlastDbFindFirstRead(const VdbBlastDb * self,ReadDesc * rd,bool * found)864 static rc_t _VdbBlastDbFindFirstRead
865     (const VdbBlastDb *self, ReadDesc *rd, bool *found)
866 {   return _VdbBlastDbFindRead(self, rd, eFirstRead, found); }
_VdbBlastDbFindNextRead(const VdbBlastDb * self,ReadDesc * rd,bool * found)867 static rc_t _VdbBlastDbFindNextRead
868     (const VdbBlastDb *self, ReadDesc *rd, bool *found)
869 {   return _VdbBlastDbFindRead(self, rd, eNextRead , found); }
870 
_ReadDescFindNextRead(ReadDesc * self,bool * found)871 rc_t _ReadDescFindNextRead(ReadDesc *self, bool *found) {
872     assert(self && self->run && self->run);
873     return _VdbBlastDbFindNextRead(self->run->obj, self, found);
874 }
875 
_VdbBlastDbOpenCursAndGetFirstRead(VdbBlastDb * self,ReadDesc * desc,bool * found,const char * acc)876 static rc_t _VdbBlastDbOpenCursAndGetFirstRead
877     (VdbBlastDb *self, ReadDesc *desc, bool *found, const char *acc)
878 {
879     rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
880     if (rc == 0) {
881         rc = _VdbBlastDbFindFirstRead(self, desc, found);
882     }
883     return rc;
884 }
885 
_VdbBlastDbOpenCursAndGetAllReads(VdbBlastDb * self,ReadDesc * desc,const char * acc)886 static rc_t _VdbBlastDbOpenCursAndGetAllReads
887     (VdbBlastDb *self, ReadDesc *desc, const char *acc)
888 {
889     rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
890     if (rc == 0) {
891         bool found = false;
892         rc = _VdbBlastDbFindRead(self, desc, eAllReads, &found);
893     }
894     return rc;
895 }
896 
_VdbBlastRunSetGetAllReads(const VdbBlastRunSet * self,uint32_t run)897 uint64_t _VdbBlastRunSetGetAllReads
898     (const VdbBlastRunSet *self, uint32_t run)
899 {
900     ReadDesc desc;
901 
902     assert(self);
903 
904     if (run >= self->runs.krun) {
905         return 0;
906     }
907 
908     if (_VdbBlastDbOpenCursAndGetAllReads
909         (self->runs.run[run].obj, &desc, self->runs.run[run].acc))
910     {
911         return 0;
912     }
913 
914     return desc.spot;
915 }
916 
_VdbBlastDbGetNReads(VdbBlastDb * self,uint64_t spot,uint32_t * nReads,const char * acc)917 static rc_t _VdbBlastDbGetNReads
918     (VdbBlastDb *self, uint64_t spot, uint32_t *nReads, const char *acc)
919 {
920     rc_t rc = _VdbBlastDbOpenSeqCurs(self, acc);
921     assert(self && nReads);
922     *nReads = 0;
923     if (rc == 0) {
924         const void *base = NULL;
925         uint32_t elem_bits = 0;
926         uint32_t elem_cnt = 0;
927         uint32_t elem_off = 0;
928         rc = VCursorCellDataDirect(self->cursSeq, spot, self->col_READ_TYPE,
929             &elem_bits, &base, &elem_off, &elem_cnt);
930         if (rc == 0) {
931             *nReads = elem_cnt;
932         }
933         else {
934             PLOGERR(klogInt, (klogInt, rc, "Cannot get "
935                 "CellDataDirect for $(acc)/READ_TYPE/$(spot)",
936                 "acc=%s,spot=%lu", acc, spot));
937         }
938     }
939     return rc;
940 }
941 
942 /******************************************************************************/
943 /*VdbBlastRun*/
944 
_VdbBlastRunFini(VdbBlastRun * self)945 static void _VdbBlastRunFini(VdbBlastRun *self) {
946     if (self == NULL) {
947         return;
948     }
949 
950     _VdbBlastDbWhack(self->obj);
951 
952     free(self->acc);
953     free(self->path);
954 
955     _RunDescFini(&self->rd);
956 
957     memset(self, 0, sizeof *self);
958 }
959 
_VdbBlastRunInit(VdbBlastRun * self,VdbBlastDb * obj,const char * rundesc,BTableType type,const KDirectory * dir,char * fullpath,uint32_t min_read_length,uint32_t index)960 static VdbBlastStatus _VdbBlastRunInit(VdbBlastRun *self, VdbBlastDb *obj,
961     const char *rundesc, BTableType type, const KDirectory *dir,
962     char *fullpath, uint32_t min_read_length, uint32_t index)
963 {
964     rc_t rc = 0;
965     const char *acc = rundesc;
966     char rbuff[4096] = "";
967     size_t size = 0;
968 
969     char slash = '/';
970 
971     assert(!dir && self && obj && type != btpUndefined && rundesc);
972 
973     {
974         KDirectory *dir = NULL;
975         if (KDirectoryNativeDir(&dir) != 0) {
976             LOGERR(klogInt, rc, "Error during KDirectoryNativeDir");
977             return eVdbBlastErr;
978         }
979 /* TODO This is obsolete and incorrect */
980         rc = KDirectoryResolvePath(dir, true,
981             rbuff, sizeof rbuff, "%s", rundesc);
982         KDirectoryRelease(dir);
983         if (rc != 0) {
984             PLOGERR(klogInt, (klogInt, rc,
985                 "Error during KDirectoryResolvePath($(path))",
986                 "path=%s", rundesc));
987             return eVdbBlastErr;
988         }
989     }
990 
991     memset(self, 0, sizeof *self);
992 
993     self->rd.index = index;
994     self->obj = obj;
995     self->type = type;
996 
997     self->alignments
998         = self->bioBases = self->bioBasesApprox
999         = self->bioReads = self->bioReadsApprox = ~0;
1000 
1001     acc = strrchr(rbuff, slash);
1002     if (acc == NULL) {
1003         acc = rbuff;
1004     }
1005     else if (string_measure(acc, &size) > 1) {
1006         ++acc;
1007     }
1008     else {
1009         acc = rbuff;
1010     }
1011 
1012     if (fullpath == NULL) {
1013         self->path = string_dup(rbuff, sizeof(rbuff));
1014         if (self->path == NULL) {
1015             return eVdbBlastMemErr;
1016         }
1017     }
1018     else {
1019         self->path = fullpath;
1020     }
1021     self->acc = _CanonocalName(acc);
1022     if (self->acc == NULL) {
1023         return eVdbBlastMemErr;
1024     }
1025 
1026     self->min_read_length = min_read_length;
1027 
1028     self->cSra = _VTableCSra(self->obj->seqTbl);
1029 
1030     return eVdbBlastNoErr;
1031 }
1032 
Bits(uint64_t n,EReadIdType idType)1033 static uint32_t Bits(uint64_t n, EReadIdType idType) {
1034     uint32_t bits = 1;
1035 
1036     if (idType == eFixedReadN) {
1037         return 0;
1038     }
1039 
1040     assert(idType == eFactor10);
1041 
1042     if (n == 0) {
1043         return 0;
1044     }
1045 
1046     while (n > 0) {
1047         bits *= 10;
1048         n /= 10;
1049     }
1050 
1051     return bits;
1052 }
1053 
_VdbBlastRunFillRunDesc(VdbBlastRun * self)1054 /*static*/ VdbBlastStatus _VdbBlastRunFillRunDesc(VdbBlastRun *self) {
1055     VdbBlastStatus status = eVdbBlastNoErr;
1056     RunDesc *rd = NULL;
1057 
1058     int i = 0;
1059     const char *col = NULL;
1060 
1061     assert(self);
1062 
1063     rd = &self->rd;
1064 
1065     if (rd->spotCount || rd->readType || rd->nReads || rd->nBioReads) {
1066         if (self->cSra && rd->cmpBaseCount == ~0) {
1067             rc_t rc = RC(rcSRA, rcTable, rcReading, rcColumn, rcNotFound);
1068             PLOGERR(klogInt, (klogInt, rc,
1069                 "Cannot read CMP_BASE_COUNT column for $(p)",
1070                 "p=%s", self->path));
1071             STSMSG(1, ("Error: failed to read %s/%s",
1072                 self->path, "CMP_BASE_COUNT"));
1073             return eVdbBlastErr;
1074         }
1075         else {
1076             S
1077             return eVdbBlastNoErr;
1078         }
1079     }
1080 
1081     assert(self->path && self->obj);
1082 
1083     col = "SPOT_COUNT";
1084     status = _VTableReadFirstRow(self->obj->seqTbl,
1085         col, &rd->spotCount, sizeof rd->spotCount, NULL, self->acc);
1086     if (status != eVdbBlastNoErr) {
1087         STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1088         return status;
1089     }
1090 
1091     rd->spotBits = Bits(rd->spotCount, rd->readIdDesc.idType);
1092 
1093     if (self->type == btpWGS) {
1094         S
1095         status = eVdbBlastNoErr;
1096         rd->nReads = rd->spotCount > 0 ? 1 : 0;
1097     }
1098     else if (self->type == btpREFSEQ) {
1099         S
1100         status = eVdbBlastNoErr;
1101         rd->nReads = 1;
1102     }
1103     else {
1104         uint32_t nreads = 0;
1105         status = _VTableGetNReads(self->obj->seqTbl, &nreads, self->acc);
1106         if (status == eVdbBlastNoErr) {
1107             rd->nReads = nreads;
1108         }
1109     }
1110 
1111     switch (self->type) {
1112         case btpSRA:
1113             col = "PLATFORM";
1114             status = _VTableReadFirstRow(self->obj->seqTbl,
1115                 col, &rd->platform, sizeof rd->platform, NULL, self->acc);
1116             if (status != eVdbBlastNoErr) {
1117                 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1118                 return status;
1119             }
1120             switch (rd->platform) { /* TODO */
1121                 case SRA_PLATFORM_ILLUMINA:
1122                 case SRA_PLATFORM_ABSOLID:
1123                 case SRA_PLATFORM_COMPLETE_GENOMICS:
1124                     rd->varReadLen = false;
1125                     break;
1126                 case SRA_PLATFORM_UNDEFINED:
1127                 case SRA_PLATFORM_454:
1128                 case SRA_PLATFORM_HELICOS:
1129                 case SRA_PLATFORM_PACBIO_SMRT:
1130                 case SRA_PLATFORM_ION_TORRENT:
1131                 case SRA_PLATFORM_CAPILLARY:
1132                 case SRA_PLATFORM_OXFORD_NANOPORE:
1133                 default:
1134                     rd->varReadLen = true;
1135                     break;
1136             }
1137             break;
1138         case btpWGS:
1139             rd->varReadLen = true;
1140             break;
1141         case btpREFSEQ:
1142             break;
1143         default:
1144             assert(0);
1145             break;
1146     }
1147 
1148     col = "READ_TYPE";
1149     if (rd->readType == NULL) {
1150         rd->readType = calloc(rd->nReads, sizeof *rd->readType);
1151         if (rd->readType == NULL)
1152         {   return eVdbBlastMemErr; }
1153     }
1154     {
1155         bool optional = self->type == btpREFSEQ;
1156         status = _VTableReadFirstRowImpl(self->obj->seqTbl, col, rd->readType,
1157             rd->nReads * sizeof *rd->readType, &rd->readTypeStatic,
1158             optional, NULL, self->acc);
1159     }
1160     /* TODO: check case when ($#READ_TYPE == 0 && nreads > 0) */
1161     if (status != eVdbBlastNoErr) {
1162         if (status == eVdbBlastTooExpensive) {
1163             status = eVdbBlastNoErr;
1164         }
1165         else {
1166             STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1167             return status;
1168         }
1169     }
1170 
1171     col = "READ_LEN";
1172     if (rd->readLen == NULL) {
1173         rd->readLen = calloc(rd->nReads, sizeof *rd->readLen);
1174         if (rd->readLen == NULL)
1175         {   return eVdbBlastMemErr; }
1176     }
1177     status = _VTableReadFirstRow(self->obj->seqTbl, col, rd->readLen,
1178         rd->nReads * sizeof *rd->readLen, &rd->readLenStatic, self->acc);
1179     /* TODO: check case when ($#READ_TYPE == 0 && nreads > 0) */
1180     if (status != eVdbBlastNoErr) {
1181         STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1182         return status;
1183     }
1184 
1185     col = "READ_FILTER"; /* col = "RD_FILTER"; */
1186     if (rd->rdFilter == NULL) {
1187         rd->rdFilter = calloc(rd->nReads, sizeof *rd->rdFilter);
1188         if (rd->rdFilter == NULL)
1189         {   return eVdbBlastMemErr; }
1190     }
1191     status = _VTableReadFirstRow(self->obj->seqTbl, col, rd->rdFilter,
1192         rd->nReads * sizeof *rd->rdFilter, &rd->rdFilterStatic, self->acc);
1193     if (status != eVdbBlastNoErr) {
1194         if (status == eVdbBlastTooExpensive) {
1195             status = eVdbBlastNoErr;
1196         }
1197         else {
1198             STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1199             return status;
1200         }
1201     }
1202 
1203     col = "BIO_BASE_COUNT";
1204     status = _VTableReadFirstRowImpl(self->obj->seqTbl, col, &rd->bioBaseCount,
1205         sizeof rd->bioBaseCount, NULL, true, NULL, self->acc);
1206      /* Do not generate error message when BIO_BASE_COUNT is not found */
1207 
1208     if (status != eVdbBlastNoErr) {
1209         if (status == eVdbBlastTooExpensive) {
1210             status = eVdbBlastNoErr;
1211             rd->bioBaseCount = ~0;
1212         }
1213         else {
1214             STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1215             return status;
1216         }
1217     }
1218 
1219     col = "CMP_BASE_COUNT";
1220     status = _VTableReadFirstRow(self->obj->seqTbl, col,
1221         &rd->cmpBaseCount, sizeof rd->cmpBaseCount, NULL, self->acc);
1222     if (status != eVdbBlastNoErr) {
1223         if (status == eVdbBlastTooExpensive) {
1224             /* CMP_BASE_COUNT should be always found */
1225             rc_t rc = RC(rcSRA, rcTable, rcReading, rcColumn, rcNotFound);
1226             PLOGERR(klogInt, (klogInt, rc,
1227                 "Cannot read CMP_BASE_COUNT column for $(p)",
1228                 "p=%s", self->path));
1229             STSMSG(1, ("Error: failed to read %s/%s",
1230                 self->path, "CMP_BASE_COUNT"));
1231             rd->cmpBaseCount = ~0;
1232             return eVdbBlastErr;
1233         }
1234         else {
1235             STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1236             return status;
1237         }
1238     }
1239 
1240     if (self->type == btpREFSEQ) {
1241         rd->nBioReads = 1;
1242         /* rd->bioLen is set incorrectly for refseq-s */
1243     }
1244     else {
1245         for (rd->nBioReads = 0, i = 0; i < rd->nReads; ++i) {
1246             S
1247             if (rd->readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
1248                 if ((rd->rdFilterStatic == eColTypeStatic &&
1249                      rd->rdFilter[i] == READ_FILTER_PASS)
1250                     || (rd->rdFilterStatic == eColTypeAbsent))
1251                 {
1252                     ++rd->nBioReads;
1253                     rd->bioLen += rd->readLen[i];
1254                 }
1255                 else {
1256                     ++rd->nBioReads;
1257                 }
1258             }
1259         }
1260     }
1261     S /* LOG nBioReads */
1262 
1263     return status;
1264 }
1265 
_VdbBlastRunVarReadNum(const VdbBlastRun * self)1266 bool _VdbBlastRunVarReadNum(const VdbBlastRun *self) {
1267     VdbBlastStatus status = _VdbBlastRunFillRunDesc((VdbBlastRun*)self);
1268     if (status != eVdbBlastNoErr) {
1269         LOGERR(klogInt, 1, "Cannot fill run descriptor");
1270     }
1271 
1272     assert(self);
1273 
1274     return self->rd.platform == SRA_PLATFORM_PACBIO_SMRT;
1275 }
1276 
1277 /* _VdbBlastRunGetNumSequences
1278     returns (number of spots) * (number of biological reads in spot).
1279     If read_filter is not static: some reads could be filtered,
1280     so status is set to eVdbBlastTooExpensive */
1281 /*static*/
_VdbBlastRunGetNumSequences(VdbBlastRun * self,VdbBlastStatus * status)1282 uint64_t _VdbBlastRunGetNumSequences(VdbBlastRun *self,
1283     VdbBlastStatus *status)
1284 {
1285     assert(self && status);
1286 
1287     *status = eVdbBlastNoErr;
1288 
1289     if (self->bioReads == ~0) {
1290         RunDesc *rd = NULL;
1291 
1292         if (self->type == btpREFSEQ) {
1293             S
1294             self->bioReads = 1;
1295         }
1296         else if (_VdbBlastRunVarReadNum(self)) {
1297             rc_t rc = 0;
1298             ReadDesc desc;
1299             memset(&desc, 0, sizeof desc);
1300             rc = _VdbBlastDbOpenCursAndGetAllReads(self->obj, &desc, self->acc);
1301             if (rc != 0) {
1302                 *status = eVdbBlastErr;
1303             }
1304             else {
1305                 self->bioReads = desc.spot;
1306             }
1307         }
1308         else {
1309             *status = _VdbBlastRunFillRunDesc(self);
1310             if (*status != eVdbBlastNoErr) {
1311                 S
1312                 return 0;
1313             }
1314 
1315             rd = &self->rd;
1316 
1317             if (rd->rdFilterStatic != eColTypeStatic) {
1318                 self->bioReadsTooExpensive = true;
1319             }
1320 
1321             if (self->cSra) {
1322                 self->bioReadsTooExpensive = true;
1323             }
1324 
1325             if (_VdbBlastRunVarReadNum(self)) {
1326                 self->bioReads = 0;
1327                 S
1328             }
1329             else {
1330                 self->bioReads = rd->spotCount * rd->nBioReads;
1331                 S
1332             }
1333         }
1334     }
1335     else {
1336         S
1337     }
1338 
1339     if (*status == eVdbBlastNoErr && self->bioReadsTooExpensive) {
1340         *status = eVdbBlastTooExpensive;
1341     }
1342     return self->bioReads;
1343 }
1344 
_VdbBlastRunCountBioBaseCount(VdbBlastRun * self,VdbBlastStatus * status)1345 static uint64_t _VdbBlastRunCountBioBaseCount(VdbBlastRun *self,
1346     VdbBlastStatus *status)
1347 {
1348     uint64_t bio_base_count = 0;
1349 
1350     rc_t rc = _VTableBioBaseCntApprox(self->obj->seqTbl,
1351         self->rd.spotCount, self->rd.nReads, &bio_base_count, self->acc);
1352 
1353     if (rc != 0) {
1354         *status = eVdbBlastErr;
1355     }
1356 
1357     return bio_base_count;
1358 }
1359 
_VdbBlastSraRunGetLengthApprox(VdbBlastRun * self,VdbBlastStatus * status)1360 static uint64_t _VdbBlastSraRunGetLengthApprox(VdbBlastRun *self,
1361     VdbBlastStatus *status)
1362 {
1363     assert(self && status);
1364 
1365     *status = eVdbBlastNoErr;
1366 
1367     if (self->bioBasesApprox == ~0) {
1368         if (! self->bioBasesTooExpensive && self->bioBases != ~0) {
1369             self->bioBasesApprox = self->bioBases;
1370         }
1371         else {
1372             RunDesc *rd = NULL;
1373             *status = _VdbBlastRunFillRunDesc(self);
1374             if (*status != eVdbBlastNoErr) {
1375                 S
1376                 return 0;
1377             }
1378 
1379             rd = &self->rd;
1380             if (rd->nReads == 0) {
1381                 S
1382                 self->bioBasesApprox = 0;
1383             }
1384             else if (rd->varReadLen) {
1385                 S
1386                 self->bioBasesApprox
1387                     = _VdbBlastRunCountBioBaseCount(self, status);
1388             }
1389             else {
1390                 if (self->type == btpREFSEQ) {
1391                     if (rd->bioBaseCount == ~0) {
1392                         S
1393                         *status = eVdbBlastErr;
1394                     }
1395                     else {
1396                         self->bioBasesApprox = rd->bioBaseCount;
1397                     }
1398                 }
1399                 else {
1400                     uint8_t read = 0;
1401                     S
1402                     for (read = 0, self->bioBasesApprox = 0;
1403                         read < rd->nReads; ++read)
1404                     {
1405                         if (rd->readType[read] & SRA_READ_TYPE_BIOLOGICAL) {
1406                             self->bioBasesApprox += rd->readLen[read];
1407                         }
1408                     }
1409                     self->bioBasesApprox *= rd->spotCount;
1410                 }
1411             }
1412         }
1413     }
1414 
1415     return self->bioBasesApprox;
1416 }
1417 
_VdbBlastRunGetNumSequencesApprox(VdbBlastRun * self,VdbBlastStatus * status)1418 static uint64_t _VdbBlastRunGetNumSequencesApprox(VdbBlastRun *self,
1419     VdbBlastStatus *status)
1420 {
1421 
1422     assert(self && status);
1423 
1424     *status = eVdbBlastNoErr;
1425 
1426     if (self->bioReadsApprox == ~0) {
1427         RunDesc *rd = NULL;
1428 
1429         if (! self->bioReadsTooExpensive && self->bioReads != ~0) {
1430             self->bioReadsApprox = self->bioReads;
1431         }
1432         else if (self->type == btpREFSEQ) {
1433             S
1434             self->bioReadsApprox = 1;
1435         }
1436         else if (self->cSra) {
1437 /* Number of Bio Reads for cSra == number of CMP reads
1438     = Number of all bio reads * CMP_BASE_COUNT / BIO_BASE_COUNT
1439    Number of all bio reads = nSpots * n of bio reads per spot */
1440             double r = 0;
1441             uint64_t n = 0;
1442 
1443             *status = _VdbBlastRunFillRunDesc(self);
1444             if (*status != eVdbBlastNoErr) {
1445                 S
1446                 return 0;
1447             }
1448 
1449             rd = &self->rd;
1450 
1451             n = _VdbBlastSraRunGetLengthApprox(self, status);
1452             if (*status != eVdbBlastNoErr) {
1453                 S
1454                 return 0;
1455             }
1456             r = rd->cmpBaseCount * rd->spotCount * rd->nBioReads;
1457             r /= n;
1458             self->bioReadsApprox = r;
1459         }
1460         else if (_VdbBlastRunVarReadNum(self)) {
1461             self->bioReads = _VdbBlastRunGetNumSequences(self, status);
1462         }
1463         else {
1464             *status = _VdbBlastRunFillRunDesc(self);
1465             if (*status != eVdbBlastNoErr) {
1466                 S
1467                 return 0;
1468             }
1469 
1470             rd = &self->rd;
1471 
1472             self->bioReadsApprox = rd->spotCount * rd->nBioReads;
1473             S
1474         }
1475     }
1476 
1477     return self->bioReadsApprox;
1478 }
1479 
_VdbBlastRunGetLength(VdbBlastRun * self,VdbBlastStatus * status)1480 static uint64_t _VdbBlastRunGetLength
1481     (VdbBlastRun *self, VdbBlastStatus *status)
1482 {
1483     VdbBlastStatus dummy = eVdbBlastNoErr;
1484     if (status == NULL)
1485     {   status = &dummy; }
1486 
1487     *status = eVdbBlastNoErr;
1488 
1489     if (self->bioBasesTooExpensive) {
1490         *status = eVdbBlastTooExpensive;
1491         return 0;
1492     }
1493     else if (self->bioBases == ~0) {
1494         if (self->cSra) {
1495             *status = _VdbBlastRunFillRunDesc(self);
1496             if (*status != eVdbBlastNoErr) {
1497                 S
1498                 return 0;
1499             }
1500             self->bioBases = self->rd.cmpBaseCount;
1501         }
1502         else {
1503 /* if BIO_BASE_COUNT is not found then status is set to eVdbBlastTooExpensive */
1504             *status = _VTableReadFirstRowImpl(self->obj->seqTbl,
1505                 "BIO_BASE_COUNT", &self->bioBases,
1506                 sizeof self->bioBases, NULL, true, NULL, self->acc);
1507             if (*status == eVdbBlastTooExpensive) {
1508                 self->bioBasesTooExpensive = true;
1509             }
1510         }
1511     }
1512 
1513     if (*status == eVdbBlastNoErr) {
1514         S
1515         return self->bioBases;
1516     }
1517     else {
1518         S
1519         return 0;
1520     }
1521 }
1522 
_VdbBlastRunGetLengthApprox(VdbBlastRun * self,VdbBlastStatus * status)1523 static uint64_t _VdbBlastRunGetLengthApprox(VdbBlastRun *self,
1524     VdbBlastStatus *status)
1525 {
1526     if (self->cSra) {
1527         return _VdbBlastRunGetLength(self, status);
1528     }
1529     else {
1530         return _VdbBlastSraRunGetLengthApprox(self, status);
1531     }
1532 }
1533 
_VdbBlastRunScan(const VdbBlastRun * self,uint64_t (* cmp)(uint64_t cand,uint64_t champ,int64_t minRead,bool * done),uint64_t minRead,uint64_t start,VdbBlastStatus * status)1534 static uint64_t _VdbBlastRunScan(const VdbBlastRun *self,
1535     uint64_t (*cmp)
1536         (uint64_t cand, uint64_t champ, int64_t minRead, bool *done),
1537     uint64_t minRead, uint64_t start, VdbBlastStatus *status)
1538 {
1539     uint64_t res = start;
1540     uint64_t bad = start;
1541     uint64_t spot = 0;
1542     bool done = false;
1543     rc_t rc = 0;
1544     const VCursor *curs = NULL;
1545     uint32_t idxREAD_LEN = 0;
1546     uint32_t idxREAD_TYPE = 0;
1547     assert(self && status &&
1548         (self->rd.spotCount || self->rd.readType ||
1549          self->rd.nReads    || self->rd.nBioReads));
1550     rc = _VTableMakeCursor
1551         (self->obj->seqTbl, &curs, &idxREAD_LEN, "READ_LEN", self->acc);
1552     if (rc != 0) {
1553         return bad;
1554     }
1555     if (_VdbBlastRunVarReadNum(self)) {
1556         rc = VCursorAddColumn(curs, &idxREAD_TYPE, "READ_TYPE");
1557         if (rc != 0) {
1558             return bad;
1559         }
1560     }
1561     for (spot = 1;
1562         spot <= self->rd.spotCount && *status == eVdbBlastNoErr && !done;
1563         ++spot)
1564     {
1565         uint32_t elem_bits, elem_off, elem_cnt;
1566         const void *base = NULL;
1567         if (_VdbBlastRunVarReadNum(self)) {
1568             rc = VCursorCellDataDirect(curs, spot, idxREAD_TYPE,
1569                 &elem_bits, &base, &elem_off, &elem_cnt);
1570             if (rc != 0) {
1571                 *status = eVdbBlastErr;
1572                 PLOGMSG(klogInfo, (klogInfo,
1573                     "$(f): Cannot '$(name)' CellDataDirect",
1574                     "f=%s,name=%s", __func__, "READ_TYPE"));
1575                 res = bad;
1576                 break;
1577             }
1578             else {
1579                 INSDC_read_type *aRt = (INSDC_read_type*)base;
1580                 uint32_t c = 0;
1581                 for (c = 0; c < elem_cnt; ++c) {
1582                     INSDC_read_type *rt = aRt + c;
1583                     assert(rt);
1584 
1585                     if (*rt & SRA_READ_TYPE_BIOLOGICAL) {
1586                         uint32_t elem_bits, elem_off, elem_cnt;
1587                         const void *base = NULL;
1588                         rc = VCursorCellDataDirect(curs, spot, idxREAD_LEN,
1589                             &elem_bits, &base, &elem_off, &elem_cnt);
1590                         if (rc != 0) {
1591                             *status = eVdbBlastErr;
1592                             PLOGMSG(klogInfo, (klogInfo,
1593                                 "$(f): Cannot '$(name)' CellDataDirect",
1594                                 "f=%s,name=%s", __func__, "READ_LEN"));
1595                             res = bad;
1596                             break;
1597                         }
1598                         else if (elem_off != 0 || elem_bits != 32) {
1599                             *status = eVdbBlastErr;
1600                             PLOGERR(klogInt, (klogInt, rc,
1601                                 "Bad VCursorCellDataDirect(READ_LEN) result: "
1602                                 "boff=$(elem_off), elem_bits=$(elem_bits)",
1603                                 "elem_off=%u,elem_bits=%u",
1604                                 elem_off, elem_bits));
1605                             res = bad;
1606                             break;
1607                         }
1608                         else {
1609                             uint8_t read = c;
1610                             const uint32_t *readLen = base;
1611                             res = cmp(readLen[read], res, minRead, &done);
1612                         }
1613                     }
1614                 }
1615             }
1616         }
1617         else {
1618             rc = VCursorCellDataDirect(curs, spot, idxREAD_LEN,
1619                 &elem_bits, &base, &elem_off, &elem_cnt);
1620             if (rc != 0) {
1621                 *status = eVdbBlastErr;
1622                 PLOGMSG(klogInfo, (klogInfo,
1623                     "$(f): Cannot '$(name)' CellDataDirect",
1624                     "f=%s,name=%s", __func__, "READ_LEN"));
1625                 res = bad;
1626                 break;
1627             }
1628             else if (elem_off != 0 || elem_bits != 32) {
1629                 *status = eVdbBlastErr;
1630                 PLOGERR(klogInt, (klogInt, rc,
1631                     "Bad VCursorCellDataDirect(READ_LEN) result: "
1632                     "boff=$(elem_off), elem_bits=$(elem_bits)",
1633                     "elem_off=%u,elem_bits=%u", elem_off, elem_bits));
1634                 res = bad;
1635                 break;
1636             }
1637             else {
1638                 uint8_t read = 0;
1639                 const uint32_t *readLen = base;
1640                 assert(self->rd.readType && self->rd.readLen);
1641                 assert(self->rd.nReads == elem_cnt);
1642                 for (read = 0; read < self->rd.nReads; ++read) {
1643                     if (self->rd.readType[read] & SRA_READ_TYPE_BIOLOGICAL) {
1644                        res = cmp(readLen[read], res, minRead, &done);
1645                     }
1646                 }
1647             }
1648         }
1649     }
1650     RELEASE(VCursor, curs);
1651     return res;
1652 }
1653 
1654 static
_VdbBlastRunGetWgsAccession(VdbBlastRun * self,int64_t spot,char * name_buffer,size_t bsize,size_t * num_required)1655 VdbBlastStatus _VdbBlastRunGetWgsAccession(VdbBlastRun *self, int64_t spot,
1656     char *name_buffer, size_t bsize, size_t *num_required)
1657 {
1658     rc_t rc = 0;
1659     uint32_t row_len = 0;
1660 
1661     assert(num_required);
1662 
1663     if (self == NULL || spot <= 0 || name_buffer == NULL || bsize == 0) {
1664         STSMSG(0, ("Error: some of %s parameters is NULL or 0", __func__));
1665         return eVdbBlastErr;
1666     }
1667     assert(self->obj);
1668     if (self->obj->seqTbl == NULL) {
1669         STSMSG(0, ("Error: %s: VTable is NULL in VdbBlastRun", __func__));
1670         return eVdbBlastErr;
1671     }
1672 
1673     if (self->obj->cursACCESSION == NULL) {
1674         rc = _VTableMakeCursor(self->obj->seqTbl, &self->obj->cursACCESSION,
1675             &self->obj->col_ACCESSION, "ACCESSION", self->acc);
1676         if (rc != 0) {
1677             VCursorRelease(self->obj->cursACCESSION);
1678             self->obj->cursACCESSION = NULL;
1679             return eVdbBlastErr;
1680         }
1681     }
1682 
1683     assert(self->obj->cursACCESSION && rc == 0);
1684 
1685     rc = VCursorReadDirect(self->obj->cursACCESSION, spot,
1686         self->obj->col_ACCESSION, 8, name_buffer, bsize, &row_len);
1687     *num_required = row_len;
1688     if (row_len > 0) /* include ending '\0' */
1689     {   ++(*num_required); }
1690     if (rc == 0) {
1691         if (bsize > row_len)
1692         { name_buffer[row_len] = '\0'; }
1693         return eVdbBlastNoErr;
1694     }
1695     else if (rc == SILENT_RC
1696         (rcVDB, rcCursor, rcReading, rcBuffer, rcInsufficient))
1697     {   return eVdbBlastNoErr; }
1698     else {
1699         assert(self->path);
1700         PLOGERR(klogInt, (klogInt, rc, "Error in VCursorReadDirect"
1701             "$(path), ACCESSION, spot=$(spot))",
1702             "path=%s,spot=%ld", self->path, spot));
1703         return eVdbBlastErr;
1704     }
1705 }
1706 
_VdbBlastRunGetNumAlignments(VdbBlastRun * self,VdbBlastStatus * status)1707 uint64_t _VdbBlastRunGetNumAlignments(VdbBlastRun *self,
1708     VdbBlastStatus *status)
1709 {
1710     assert(status);
1711 
1712     *status = eVdbBlastNoErr;
1713 
1714     if (self->alignments == ~0) {
1715         assert(self->obj);
1716 
1717         if (self->obj->prAlgnTbl == NULL) {
1718             self->alignments = 0;
1719         }
1720         else {
1721             const char col[] = "SPOT_COUNT";
1722             *status = _VTableReadFirstRow(self->obj->prAlgnTbl, col,
1723                 &self->alignments, sizeof self->alignments, NULL, self->acc);
1724             if (*status != eVdbBlastNoErr) {
1725                 STSMSG(1, ("Error: failed to read %s/%s", self->path, col));
1726                 return 0;
1727             }
1728         }
1729     }
1730 
1731     assert(self->alignments != ~0);
1732 
1733     return self->alignments;
1734 }
1735 
1736 #ifdef TEST_VdbBlastRunFillReadDesc
1737 LIB_EXPORT
1738 #endif
_VdbBlastRunFillReadDesc(VdbBlastRun * self,uint64_t read_id,ReadDesc * desc)1739 VdbBlastStatus _VdbBlastRunFillReadDesc(VdbBlastRun *self,
1740     uint64_t read_id, ReadDesc *desc)
1741 {
1742     VdbBlastStatus status = eVdbBlastNoErr;
1743 
1744     const VdbBlastRun *prev = NULL;
1745     const RunDesc *rd = NULL;
1746 
1747     int bioIdx = 0;
1748 
1749     if (self == NULL || desc == NULL) {
1750         S
1751         return eVdbBlastErr;
1752     }
1753 
1754     prev = desc->run;
1755     memset(desc, 0, sizeof *desc);
1756     desc->prev = prev;
1757     desc->run = self;
1758 
1759     rd = &self->rd;
1760 
1761     if (rd->nReads == 0 || rd->readType == NULL) {
1762         status = _VdbBlastRunFillRunDesc(self);
1763         if (status != eVdbBlastNoErr)
1764         {   return status; }
1765         assert(rd->nReads && rd->readType);
1766     }
1767 
1768     if (rd->readIdDesc.idType == eFixedReadN || read_id == 0) {
1769         if (!_VdbBlastRunVarReadNum(self)) {
1770             desc->nReads = rd->nReads;
1771             desc->spot = read_id / rd->nBioReads + 1;
1772             if (desc->spot <= rd->spotCount) {
1773                 int idInSpot
1774                     = read_id - (desc->spot - 1) * rd->nBioReads; /* 0-based */
1775 
1776                 int i = 0;
1777                 for (i = 0; i < rd->nReads; ++i) {
1778                     if (rd->readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
1779                         if (bioIdx++ == idInSpot) {
1780                             S
1781                             desc->tableId = VDB_READ_UNALIGNED;
1782                             desc->read = i + 1;
1783                             desc->read_id = _RunDescMakeReadId(rd,
1784                                 desc->spot, desc->read, read_id, &status);
1785                             return status;
1786                         }
1787                     }
1788                 }
1789                 S
1790             }
1791             else {
1792                 uint64_t alignments = 0;
1793                 S
1794                 desc->spot -= rd->spotCount;
1795                 alignments = _VdbBlastRunGetNumAlignments(self, &status);
1796                 if (status != eVdbBlastNoErr) {
1797                     return status;
1798                 }
1799                 if (desc->spot <= alignments) {
1800                     desc->tableId = VDB_READ_ALIGNED;
1801                     desc->read = 1;
1802                     return eVdbBlastNoErr;
1803                 }
1804                 S
1805             }
1806         }
1807         else {
1808             bool found = false;
1809             if (_VdbBlastDbOpenCursAndGetFirstRead
1810                 (self->obj, desc, &found, self->acc) == 0)
1811             {
1812                 return eVdbBlastNoErr;
1813             }
1814         }
1815     }
1816     else if (rd->readIdDesc.idType == eFactor10) {
1817         if (rd->readIdDesc.runBits > 0) {
1818             read_id /= rd->readIdDesc.runBits;
1819         }
1820         if (rd->spotBits > 0) {
1821             rc_t rc = 0;
1822             desc->read = read_id / rd->spotBits;
1823             desc->spot = read_id % rd->spotBits;
1824             if (desc->read > 0 && desc->spot > 0) {
1825                 rc = _VdbBlastDbGetNReads
1826                     (self->obj, desc->spot, &desc->nReads, self->acc);
1827                 if (rc == 0) {
1828                     desc->tableId = VDB_READ_UNALIGNED;
1829                     S
1830                     return eVdbBlastNoErr;
1831                 }
1832                 else {
1833                     S
1834                 }
1835             }
1836             else {
1837                 S
1838             }
1839         }
1840     }
1841     else {
1842         assert(0);
1843     }
1844 
1845     memset(desc, 0, sizeof *desc);
1846     return eVdbBlastErr;
1847 }
1848 
_VdbBlastRunSetFindFirstRead(const VdbBlastRunSet * self,uint64_t * read_id,bool useGetFirstRead)1849 VdbBlastStatus _VdbBlastRunSetFindFirstRead(const VdbBlastRunSet *self,
1850     uint64_t *read_id, bool useGetFirstRead)
1851 {
1852     VdbBlastStatus status = eVdbBlastErr;
1853     const RunSet *rs = NULL;
1854     ReadDesc desc;
1855     assert(self);
1856     memset(&desc, 0, sizeof desc);
1857     rs = &self->runs;
1858     _VdbBlastRunSetBeingRead(self);
1859     if (useGetFirstRead) {
1860         bool found = false;
1861         uint32_t i = 0;
1862         for (i = 0; i < rs->krun; ++i) {
1863             VdbBlastRun *run = &rs->run[i];
1864             rc_t rc = _VdbBlastDbOpenCursAndGetFirstRead
1865                 (rs->run[i].obj, &desc, &found, rs->run[i].acc);
1866             if (rc != 0) {
1867                 break;
1868             }
1869             else if (!found) {
1870                 continue;
1871             }
1872 
1873             if (desc.spot == 0) {
1874                 if (rs->krun > 1) {
1875                     desc.prev = run;
1876                 }
1877             }
1878             else {
1879                 desc.run = run;
1880                 status = _ReadDescFixReadId(&desc);
1881                 break;
1882             }
1883         }
1884     }
1885     else {
1886         status = _RunSetFindReadDesc(rs, 0, &desc);
1887         if (status == eVdbBlastNoErr) {
1888             status = _ReadDescFixReadId(&desc);
1889         }
1890     }
1891     if (status == eVdbBlastNoErr) {
1892         assert(read_id);
1893         *read_id = desc.read_id;
1894     }
1895     return status;
1896 }
1897 
_VdbBlastRunSetGetReadIdType(const VdbBlastRunSet * self)1898 EReadIdType _VdbBlastRunSetGetReadIdType(const VdbBlastRunSet *self) {
1899     assert(self);
1900 
1901     return self->readIdDesc.idType;
1902 }
1903 
_VdbBlastRunGetReadId(VdbBlastRun * self,const char * acc,uint64_t spot,uint32_t read,uint64_t * read_id)1904 static VdbBlastStatus _VdbBlastRunGetReadId(VdbBlastRun *self, const char *acc,
1905     uint64_t spot, /* 1-based */
1906     uint32_t read, /* 1-based */
1907     uint64_t *read_id)
1908 {
1909     uint64_t id = ~0;
1910     VdbBlastStatus status = eVdbBlastErr;
1911     size_t size;
1912 
1913     assert(self && acc && read_id && self->acc);
1914     assert(memcmp(self->acc, acc, string_measure(self->acc, &size)) == 0);
1915 
1916     if ((spot <= 0 && read > 0) || (spot > 0 && read <= 0)) {
1917         S
1918         return eVdbBlastErr;
1919     }
1920 
1921     if (spot > 0) {
1922         if (self->type != btpSRA) {
1923             return eVdbBlastErr;
1924         }
1925 
1926         if (self->rd.readIdDesc.idType == eFixedReadN) {
1927             for (id = (spot - 1) * self->rd.nBioReads; ; ++id) {
1928                 ReadDesc desc;
1929                 status = _VdbBlastRunFillReadDesc(self, id, &desc);
1930                 if (status != eVdbBlastNoErr)
1931                 {   return status; }
1932                 if (desc.spot < spot) {
1933                     S
1934                     return eVdbBlastErr;
1935                 }
1936                 if (desc.spot > spot) {
1937                     S
1938                     return eVdbBlastErr;
1939                 }
1940                 if (desc.read == read) {
1941                     *read_id = id;
1942                     return eVdbBlastNoErr;
1943                 }
1944             }
1945             S
1946             return eVdbBlastErr;
1947         }
1948         else if (self->rd.readIdDesc.idType == eFactor10) {
1949             if (spot <= self->rd.spotCount) {
1950                 if (self->rd.readIdDesc.varReadN) {
1951                     /* TODO How to make sure this read exists in this spot?*/
1952                 }
1953                 else {
1954                     if (read > self->rd.nReads) {
1955                         S
1956                         return eVdbBlastErr;
1957                     }
1958                     else {
1959                         /* TODO how to check it when READ_TYPE is variable
1960                                 withing a run (is it possible?) */
1961                         if (read == 0) {
1962                             S
1963                             return eVdbBlastErr;
1964                         }
1965                         if (! (self->rd.readType[read - 1]
1966                                 & SRA_READ_TYPE_BIOLOGICAL))
1967                         {
1968                             S
1969                             return eVdbBlastErr;
1970                         }
1971                     }
1972                 }
1973                 id = _RunDescMakeReadId(&self->rd, spot, read, ~0, &status);
1974                 *read_id = id;
1975                 return status;
1976             }
1977             else {
1978                 S
1979                 return eVdbBlastErr;
1980             }
1981         }
1982         else {
1983             assert(0);
1984             return eVdbBlastErr;
1985         }
1986     }
1987     else {
1988         uint64_t n = ~0;
1989         uint64_t i = ~0;
1990         if (self->type == btpSRA)
1991         {   return eVdbBlastErr; }
1992         else if (self->type == btpREFSEQ) {
1993             *read_id = 0;
1994             return eVdbBlastNoErr;
1995         }
1996         else if (self->type == btpWGS) {
1997             n = _VdbBlastRunGetNumSequences(self, &status);
1998             if (status != eVdbBlastNoErr
1999                 && status != eVdbBlastTooExpensive)
2000             {
2001                 return status;
2002             }
2003             /* TODO optimize: avoid full run scan */
2004             for (i = 0; i < n ; ++i) {
2005                 size_t need = ~0;
2006 #define SZ 4096
2007                 char name_buffer[SZ + 1];
2008                 if (string_measure(acc, &size) > SZ) {
2009                     S
2010                     return eVdbBlastErr;
2011                 }
2012 #undef SZ
2013                 status = _VdbBlastRunGetWgsAccession(
2014                     self, i + 1, name_buffer, sizeof name_buffer, &need);
2015                 if (need > sizeof name_buffer) {
2016                     S
2017                     return eVdbBlastErr;
2018                 }
2019                 if (strcmp(name_buffer, acc) == 0) {
2020                     i = _RunDescMakeReadId(&self->rd, i + 1, 1, i, &status);
2021                     *read_id = i;
2022                     return status;
2023                 }
2024             }
2025         }
2026         else { assert(0); }
2027         return eVdbBlastErr;
2028     }
2029 }
2030 
_VdbBlastRunGetSequencesAmount(VdbBlastRun * self,VdbBlastStatus * status)2031 static uint64_t _VdbBlastRunGetSequencesAmount(
2032     VdbBlastRun *self, VdbBlastStatus *status)
2033 {
2034     uint64_t n = _VdbBlastRunGetNumSequences(self, status);
2035 
2036     assert(status);
2037 
2038     if (*status == eVdbBlastNoErr) {
2039         n += _VdbBlastRunGetNumAlignments(self, status);
2040     }
2041 
2042     return n;
2043 }
2044 
2045 /******************************************************************************/
2046 
_ReadDescFixReadId(ReadDesc * self)2047 VdbBlastStatus _ReadDescFixReadId(ReadDesc *self) {
2048     VdbBlastStatus status = eVdbBlastNoErr;
2049 
2050     assert(self && self->run);
2051 
2052     self->read_id = _RunDescMakeReadId
2053         (&self->run->rd, self->spot, self->read, self->read_id, &status);
2054 
2055     return status;
2056 }
2057 
2058 /******************************************************************************/
2059 
_RunSetFini(RunSet * self)2060 static  void _RunSetFini(RunSet *self) {
2061     assert(self);
2062 
2063     if (self->run) {
2064         uint32_t i = 0;
2065 
2066         for (i = 0; i < self->krun; ++i) {
2067             _VdbBlastRunFini(&self->run[i]);
2068         }
2069 
2070         free(self->run);
2071     }
2072 
2073     _RefSetFini(&self->refs);
2074 
2075     memset(self, 0, sizeof *self);
2076 }
2077 
_RunSetAllocTbl(RunSet * self)2078 static VdbBlastStatus _RunSetAllocTbl(RunSet *self) {
2079     size_t nmemb = 16;
2080 
2081     if (self == NULL)
2082     {   return eVdbBlastErr; }
2083 
2084     if (self->run && self->krun < self->nrun) {
2085         return eVdbBlastNoErr;
2086     }
2087 
2088     if (self->run == NULL) {
2089         self->run = calloc(nmemb, sizeof *self->run);
2090         if (self->run == NULL)
2091         {   return eVdbBlastMemErr; }
2092         S
2093     }
2094     else {
2095         void *p = NULL;
2096         nmemb += self->nrun;
2097         p = realloc(self->run, nmemb * sizeof *self->run);
2098         if (p == NULL)
2099         {   return eVdbBlastMemErr; }
2100         self->run = p;
2101         S
2102     }
2103 
2104     self->nrun = nmemb;
2105     return eVdbBlastNoErr;
2106 }
2107 
_RunSetAddObj(RunSet * self,VdbBlastDb * obj,const char * rundesc,BTableType type,const KDirectory * dir,char * fullpath,uint32_t min_read_length)2108 static VdbBlastStatus _RunSetAddObj(RunSet *self, VdbBlastDb *obj,
2109     const char *rundesc, BTableType type, const KDirectory *dir,
2110     char *fullpath, uint32_t min_read_length)
2111 {
2112 
2113     VdbBlastRun* run = NULL;
2114 
2115     VdbBlastStatus status = _RunSetAllocTbl(self);
2116     if (status != eVdbBlastNoErr) {
2117         return status;
2118     }
2119 
2120     assert(self && self->run);
2121 
2122     run = &self->run[self->krun];
2123 
2124     status = _VdbBlastRunInit(run,
2125         obj, rundesc, type, dir, fullpath, min_read_length, self->krun++);
2126 
2127     return status;
2128 }
2129 
_RunSetGetNumSequences(const RunSet * self,VdbBlastStatus * aStatus)2130 static uint64_t _RunSetGetNumSequences
2131     (const RunSet *self, VdbBlastStatus *aStatus)
2132 {
2133     uint64_t num = 0;
2134     uint32_t i = 0;
2135 
2136     assert(self && aStatus);
2137 
2138     *aStatus = eVdbBlastNoErr;
2139 
2140     for (i = 0; i < self->krun; ++i) {
2141         VdbBlastStatus status = eVdbBlastNoErr;
2142         VdbBlastRun *run = NULL;
2143 
2144         assert(self->run);
2145 
2146         run = &self->run[i];
2147 
2148         num += _VdbBlastRunGetNumSequences(run, &status);
2149         if (status != eVdbBlastNoErr) {
2150             assert(run->path);
2151 
2152             if (*aStatus == eVdbBlastNoErr) {
2153                 *aStatus = status;
2154             }
2155 
2156             if (status != eVdbBlastTooExpensive) {
2157                 STSMSG(1, (
2158                     "Error: failed to GetNumSequences(on run %s)", run->path));
2159                 return 0;
2160             }
2161 
2162             assert(*aStatus == eVdbBlastTooExpensive);
2163         }
2164     }
2165 
2166     STSMSG(1, ("_RunSetGetNumSequences = %ld", num));
2167 
2168     return num;
2169 }
2170 
2171 static
_RunSetGetNumSequencesApprox(const RunSet * self,VdbBlastStatus * status)2172 uint64_t _RunSetGetNumSequencesApprox(const RunSet *self,
2173     VdbBlastStatus *status)
2174 {
2175     uint64_t num = 0;
2176     uint32_t i = 0;
2177 
2178     assert(self && status);
2179 
2180     *status = eVdbBlastNoErr;
2181 
2182     for (i = 0; i < self->krun; ++i) {
2183         VdbBlastRun *run = NULL;
2184 
2185         assert(self->run);
2186 
2187         run = &self->run[i];
2188 
2189         num += _VdbBlastRunGetNumSequencesApprox(run, status);
2190         if (*status != eVdbBlastNoErr) {
2191             assert(run->path);
2192 
2193             STSMSG(1, (
2194                 "Error: failed to GetNumSequencesApprox(on run %s)",
2195                 run->path));
2196 
2197             return 0;
2198         }
2199     }
2200 
2201     STSMSG(1, ("_RunSetGetNumSequencesApprox = %ld", num));
2202 
2203     return num;
2204 }
2205 
2206 static
_RunSetGetTotalLength(const RunSet * self,VdbBlastStatus * status)2207 uint64_t _RunSetGetTotalLength(const RunSet *self,
2208     VdbBlastStatus *status)
2209 {
2210     uint64_t num = 0;
2211     uint32_t i = 0;
2212     assert(self && status);
2213 
2214     if (self->krun)
2215     {   assert(self->run); }
2216 
2217     for (i = 0; i < self->krun; ++i) {
2218         VdbBlastRun *run = &self->run[i];
2219         assert(run && run->path);
2220         num += _VdbBlastRunGetLength(run, status);
2221         if (*status != eVdbBlastNoErr) {
2222             STSMSG(1, (
2223                 "Error: failed to _RunSetGetTotalLength(on run %s)",
2224                 run->path));
2225             return 0;
2226         }
2227     }
2228 
2229     STSMSG(1, ("_RunSetGetTotalLength = %ld", num));
2230 
2231     return num;
2232 }
2233 
_RunSetGetTotalLengthApprox(const RunSet * self,VdbBlastStatus * status)2234 static uint64_t _RunSetGetTotalLengthApprox(const RunSet *self,
2235     VdbBlastStatus *status)
2236 {
2237     uint64_t num = 0;
2238 
2239     uint32_t i = 0;
2240 
2241     assert(self && status);
2242 
2243     for (num = 0, i = 0; i < self->krun; ++i) {
2244         VdbBlastRun *run = NULL;
2245 
2246         assert(self->run);
2247 
2248         run = &self->run[i];
2249 
2250         num += _VdbBlastRunGetLengthApprox(run, status);
2251         if (*status != eVdbBlastNoErr) {
2252             STSMSG(1, ("Error: failed "
2253                 "to _VdbBlastRunGetLengthApprox(on run %s)", run->path));
2254             return 0;
2255         }
2256     }
2257 
2258     STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox = %ld", num));
2259 
2260     return num;
2261 }
2262 
_RunSetGetName(const RunSet * self,VdbBlastStatus * status,char * name_buffer,size_t bsize)2263 static size_t _RunSetGetName(const RunSet *self,
2264     VdbBlastStatus *status, char *name_buffer, size_t bsize)
2265 {
2266     size_t need = 0, idx = 0;
2267     int i = 0;
2268     size_t size;
2269 
2270     assert(self && status);
2271     for (i = 0; i < self->krun; ++i) {
2272         VdbBlastRun *run = &self->run[i];
2273         if (run && run->acc) {
2274             if (i)
2275             {   ++need; }
2276             need += string_measure(run->acc, &size);
2277         }
2278         else {
2279             S
2280             return 0;
2281         }
2282     }
2283 
2284     if (name_buffer == NULL || bsize == 0) {
2285         S
2286         return need;
2287     }
2288 
2289     for (i = 0; i < self->krun; ++i) {
2290         VdbBlastRun *run = &self->run[i];
2291         if (run && run->acc) {
2292             if (i)
2293             {   name_buffer[idx++] = '|'; }
2294             if (idx >= bsize) {
2295                 S
2296                 return need;
2297             }
2298             string_copy(name_buffer + idx, bsize - idx,
2299                 run->acc, string_size(run->acc));
2300             idx += string_measure(run->acc, &size);
2301             if (idx >= bsize) {
2302                 S
2303                 return need;
2304             }
2305         }
2306     }
2307     name_buffer[idx++] = '\0';
2308     *status = eVdbBlastNoErr;
2309 
2310     S
2311     return need;
2312 }
2313 
2314 typedef struct {
2315     uint64_t read_id;
2316 } ReadId;
_RunSetFindReadDescContinuous(const RunSet * self,uint64_t read_id,ReadDesc * desc)2317 static VdbBlastStatus _RunSetFindReadDescContinuous(
2318     const RunSet *self, uint64_t read_id, ReadDesc *desc)
2319 {
2320     uint64_t i = 0;
2321     uint64_t prev = 0;
2322     uint64_t crnt = 0;
2323 
2324     assert(self && desc);
2325 
2326     for (i = 0, prev = 0; i < self->krun; ++i) {
2327         VdbBlastStatus status = eVdbBlastNoErr;
2328         VdbBlastRun *run = NULL;
2329         uint64_t l = 0;
2330 
2331         if (prev > 0 && i < prev) {
2332             S
2333             return eVdbBlastErr;
2334         }
2335 
2336         run = &self->run[i];
2337         if (run == NULL) {
2338             S
2339             return eVdbBlastErr;
2340         }
2341 
2342         l = _VdbBlastRunGetSequencesAmount(run, &status);
2343         if (status != eVdbBlastNoErr &&
2344             status != eVdbBlastTooExpensive)
2345         {
2346             S
2347             return status;
2348         }
2349 
2350         if (crnt + l <= read_id) {
2351             crnt += l;
2352         }
2353         else {
2354             uint64_t a_read_id = read_id - crnt;
2355             status = _VdbBlastRunFillReadDesc(run, a_read_id, desc);
2356             if (status == eVdbBlastNoErr) {
2357                 S
2358                 desc->read_id = read_id;
2359             }
2360             else
2361             {   S }
2362 
2363             return status;
2364         }
2365 
2366         prev = i;
2367     }
2368 
2369     S
2370     return eVdbBlastErr;
2371 }
2372 
_RunSetFindReadDescFactor10(const RunSet * self,uint64_t read_id,ReadDesc * desc)2373 static VdbBlastStatus _RunSetFindReadDescFactor10(
2374     const RunSet *self, uint64_t read_id, ReadDesc *desc)
2375 {
2376     VdbBlastStatus status = eVdbBlastErr;
2377 
2378     VdbBlastRun *run = NULL;
2379     uint64_t i = 0;
2380 
2381     assert(self && desc && self->run);
2382 
2383     if (self->run[0].rd.readIdDesc.runBits > 0) {
2384         i = read_id % self->run[0].rd.readIdDesc.runBits;
2385     }
2386     else {
2387         i = 0;
2388     }
2389 
2390     if (i >= self->krun) {
2391         S
2392         return eVdbBlastErr;
2393     }
2394 
2395     run = &self->run[i];
2396 
2397     status = _VdbBlastRunFillReadDesc(run, read_id, desc);
2398     if (status == eVdbBlastNoErr) {
2399         S
2400         desc->read_id = read_id;
2401     }
2402     else
2403     {   S }
2404 
2405     return status;
2406 }
2407 
_RunSetFindReadDesc(const RunSet * self,uint64_t read_id,ReadDesc * desc)2408 VdbBlastStatus _RunSetFindReadDesc(const RunSet *self,
2409     uint64_t read_id, ReadDesc *desc)
2410 {
2411     if (self == NULL || desc == NULL || self->krun == 0) {
2412         S
2413         return eVdbBlastErr;
2414     }
2415 
2416     switch (self->run[0].rd.readIdDesc.idType) {
2417         case eFixedReadN:
2418             return _RunSetFindReadDescContinuous(self, read_id, desc);
2419         case eFactor10:
2420             return _RunSetFindReadDescFactor10(self, read_id, desc);
2421         default:
2422             assert(0);
2423             return eVdbBlastErr;
2424     }
2425 }
2426 
2427 /******************************************************************************/
2428 
2429 static const char VDB_BLAST_RUN_SET[] = "VdbBlastRunSet";
2430 
_VdbBlastRunSetWhack(VdbBlastRunSet * self)2431 static void _VdbBlastRunSetWhack(VdbBlastRunSet *self) {
2432     assert(self);
2433 
2434     STSMSG(1, ("Deleting VdbBlastRunSet(min_read_length=%d, protein=%s)",
2435         self->core2na.min_read_length, self->protein ? "true" : "false"));
2436 
2437     VdbBlastMgrRelease(self->mgr);
2438 
2439     _RunSetFini(&self->runs);
2440     _Core2naFini(&self->core2na);
2441     _Core4naFini(&self->core4na);
2442     _Core2naFini(&self->core2naRef);
2443     _Core4naFini(&self->core4naRef);
2444 
2445     memset(self, 0, sizeof *self);
2446     free(self);
2447 }
2448 
2449 LIB_EXPORT
VdbBlastMgrMakeRunSet(const VdbBlastMgr * cself,VdbBlastStatus * status,uint32_t min_read_length,bool protein)2450 VdbBlastRunSet *VdbBlastMgrMakeRunSet(const VdbBlastMgr *cself,
2451     VdbBlastStatus *status,
2452     uint32_t min_read_length,
2453     bool protein)
2454 {
2455     rc_t rc = 0;
2456     VdbBlastRunSet *item = NULL;
2457     VdbBlastMgr *self = (VdbBlastMgr*)cself;
2458 
2459     VdbBlastStatus dummy = eVdbBlastNoErr;
2460     if (status == NULL)
2461     {   status = &dummy; }
2462 
2463     *status = eVdbBlastNoErr;
2464 
2465     item = calloc(1, sizeof *item);
2466     if (item == NULL) {
2467         *status = eVdbBlastMemErr;
2468         return item;
2469     }
2470 
2471     item->protein = protein;
2472 
2473     item->core2na.min_read_length = min_read_length;
2474 
2475     item->core2naRef.min_read_length = min_read_length;
2476 
2477     item->core4na.min_read_length = min_read_length;
2478     item->core4na.mode = VDB_READ_UNALIGNED;
2479 
2480     item->core4naRef.min_read_length = min_read_length;
2481     item->core4naRef.mode = VDB_READ_REFERENCE;
2482 
2483     item->minSeqLen = ~0;
2484     item->avgSeqLen = ~0;
2485     item->maxSeqLen = ~0;
2486 
2487     if (rc == 0) {
2488         rc = KLockMake(&item->core2na.mutex);
2489         if (rc != 0) {
2490             LOGERR(klogInt, rc, "Error in KLockMake");
2491         }
2492     }
2493     if (rc == 0) {
2494         rc = KLockMake(&item->core2naRef.mutex);
2495         if (rc != 0) {
2496             LOGERR(klogInt, rc, "Error in KLockMake");
2497         }
2498     }
2499     if (rc == 0) {
2500         rc = KLockMake(&item->core4na.mutex);
2501         if (rc != 0) {
2502             LOGERR(klogInt, rc, "Error in KLockMake");
2503         }
2504     }
2505     if (rc == 0) {
2506         rc = KLockMake(&item->core4naRef.mutex);
2507         if (rc != 0) {
2508             LOGERR(klogInt, rc, "Error in KLockMake");
2509         }
2510     }
2511     if (rc == 0) {
2512         item->mgr = VdbBlastMgrAddRef(self);
2513         if (item->mgr) {
2514             KRefcountInit(&item->refcount,
2515                 1, VDB_BLAST_RUN_SET, __func__, "set");
2516             STSMSG(1, ("Created VdbBlastRunSet(min_read_length=%d, protein=%s)",
2517                 min_read_length, protein ? "true" : "false"));
2518             return item;
2519         }
2520     }
2521 
2522     STSMSG(1, ("Error: failed to create VdbBlastRunSet"));
2523     _VdbBlastRunSetWhack(item);
2524 
2525     *status = eVdbBlastErr;
2526 
2527     return NULL;
2528 }
2529 
VdbBlastRunSetRelease(VdbBlastRunSet * self)2530 LIB_EXPORT void CC VdbBlastRunSetRelease(VdbBlastRunSet *self) {
2531     if (self == NULL) {
2532         return;
2533     }
2534 
2535     STSMSG(1, ("VdbBlastRunSetRelease"));
2536     if (KRefcountDrop(&self->refcount, VDB_BLAST_RUN_SET) != krefWhack)
2537     {   return; }
2538 
2539     _VdbBlastRunSetWhack(self);
2540 }
2541 
2542 LIB_EXPORT
VdbBlastRunSetAddRef(VdbBlastRunSet * self)2543 VdbBlastRunSet* CC VdbBlastRunSetAddRef(VdbBlastRunSet *self)
2544 {
2545     if (self == NULL) {
2546         STSMSG(1, ("VdbBlastRunSetAddRef(NULL)"));
2547         return self;
2548     }
2549 
2550     if (KRefcountAdd(&self->refcount, VDB_BLAST_RUN_SET) == krefOkay) {
2551         STSMSG(1, ("VdbBlastRunSetAddRef"));
2552         return self;
2553     }
2554 
2555     STSMSG(1, ("Error: failed to VdbBlastRunSetAddRef"));
2556     return NULL;
2557 }
2558 
2559 LIB_EXPORT
VdbBlastRunSetAddRun(VdbBlastRunSet * self,const char * native)2560 VdbBlastStatus CC VdbBlastRunSetAddRun(VdbBlastRunSet *self,
2561     const char *native)
2562 {
2563     rc_t rc = 0;
2564     char rundesc[PATH_MAX] = "";
2565     VdbBlastStatus status = eVdbBlastNoErr;
2566     BTableType type = btpUndefined;
2567 
2568     /* allocated in _VdbBlastMgrFindNOpenSeqTable()
2569        in _RunSetAddObj() is assigned to VdbBlastRun::path
2570        freed during VdbBlastRun release */
2571     char *fullpath = NULL;
2572 
2573     VdbBlastDb *obj = NULL;
2574     if (self == NULL || self->mgr == NULL || self->beingRead) {
2575         S
2576         return eVdbBlastErr;
2577     }
2578 
2579     obj = calloc(1, sizeof *obj);
2580     if (obj == NULL) {
2581         return eVdbBlastMemErr;
2582     }
2583 
2584     rc = _VdbBlastMgrNativeToPosix(self->mgr, native, rundesc, sizeof rundesc);
2585     if (rc != 0) {
2586         S
2587         status = eVdbBlastErr;
2588     }
2589 
2590     status = _VdbBlastMgrFindNOpenSeqTable(self->mgr,
2591         rundesc, &obj->seqTbl, &type, &fullpath, &obj->db);
2592     if (status != eVdbBlastNoErr) {
2593         S
2594         PLOGMSG(klogInfo,
2595             (klogInfo, "failed to open $(rundesc)", "rundesc=%s", rundesc));
2596     }
2597     else {
2598         S
2599         PLOGMSG(klogInfo,
2600             (klogInfo, "opened $(rundesc)", "rundesc=%s", rundesc));
2601     }
2602 
2603     if (status == eVdbBlastNoErr && _VTableCSra(obj->seqTbl)) {
2604         if (obj->db == NULL) {
2605             S
2606             status = eVdbBlastErr;
2607         }
2608         else {
2609             status = _VDatabaseOpenAlignmentTable(
2610                 obj->db, rundesc, &obj->prAlgnTbl);
2611         }
2612     }
2613 
2614     if (status == eVdbBlastNoErr) {
2615         if (_VTableVarReadNum(obj->seqTbl, rundesc)) {
2616             self->readIdDesc.varReadN = true;
2617             self->readIdDesc.idType   = eFactor10;
2618         }
2619 
2620         status = _RunSetAddObj(&self->runs, obj, rundesc, type,
2621             NULL, fullpath, self->core2na.min_read_length);
2622         S
2623     }
2624 
2625     return status;
2626 }
2627 
_VdbBlastRunSetBeingRead(const VdbBlastRunSet * cself)2628 void _VdbBlastRunSetBeingRead(const VdbBlastRunSet *cself) {
2629     uint32_t i = 0;
2630     uint32_t runBits = 0;
2631     VdbBlastRunSet *self = (VdbBlastRunSet*)cself;
2632     EReadIdType idType = eFixedReadN;
2633 
2634     if (self == NULL || self->beingRead) {
2635         return;
2636     }
2637 
2638     idType = self->readIdDesc.idType;
2639     runBits = Bits(self->runs.krun - 1, idType);
2640 
2641     for (i = 0; i < self->runs.krun; ++i) {
2642         VdbBlastRun *r = &self->runs.run[i];
2643         assert(r);
2644         r->rd.readIdDesc.idType = idType;
2645         r->rd.readIdDesc.runBits = runBits;
2646         r->rd.readIdDesc.varReadN = _VdbBlastRunVarReadNum(r);
2647     }
2648 
2649     self->beingRead = true;
2650 }
2651 
2652 LIB_EXPORT
VdbBlastRunSetIsProtein(const VdbBlastRunSet * self)2653 bool CC VdbBlastRunSetIsProtein(const VdbBlastRunSet *self)
2654 {
2655     if (self == NULL) {
2656         STSMSG(1, ("VdbBlastRunSetIsProtein(self=NULL)"));
2657         return false;
2658     }
2659 
2660     STSMSG(1, (
2661         "VdbBlastRunSetIsProtein = %s", self->protein ? "true" : "false"));
2662 
2663     return self->protein;
2664 }
2665 
2666 LIB_EXPORT
VdbBlastRunSetGetNumSequences(const VdbBlastRunSet * self,VdbBlastStatus * status)2667 uint64_t CC VdbBlastRunSetGetNumSequences(const VdbBlastRunSet *self,
2668     VdbBlastStatus *status)
2669 {
2670     VdbBlastStatus dummy = eVdbBlastNoErr;
2671     if (status == NULL)
2672     {   status = &dummy; }
2673 
2674     if (self == NULL) {
2675         *status = eVdbBlastErr;
2676         return 0;
2677     }
2678 
2679     _VdbBlastRunSetBeingRead(self);
2680 
2681     return _RunSetGetNumSequences(&self->runs, status);
2682 }
2683 
VdbBlastRunSetGetNumSequencesApprox(const VdbBlastRunSet * self)2684 LIB_EXPORT uint64_t CC VdbBlastRunSetGetNumSequencesApprox(
2685     const VdbBlastRunSet *self)
2686 {
2687     uint64_t num = 0;
2688     VdbBlastStatus status = eVdbBlastNoErr;
2689 
2690     _VdbBlastRunSetBeingRead(self);
2691 
2692     num = _RunSetGetNumSequencesApprox(&self->runs, &status);
2693 
2694     STSMSG(1, ("VdbBlastRunSetGetNumSequencesApprox=%lu", num));
2695 
2696     return num;
2697 }
2698 
2699 LIB_EXPORT
VdbBlastRunSetGetTotalLength(const VdbBlastRunSet * self,VdbBlastStatus * status)2700 uint64_t CC VdbBlastRunSetGetTotalLength(const VdbBlastRunSet *self,
2701     VdbBlastStatus *status)
2702 {
2703     VdbBlastStatus dummy = eVdbBlastNoErr;
2704     if (status == NULL)
2705     {   status = &dummy; }
2706 
2707     if (self == NULL) {
2708         *status = eVdbBlastErr;
2709         return 0;
2710     }
2711 
2712     _VdbBlastRunSetBeingRead(self);
2713 
2714     return _RunSetGetTotalLength(&self->runs, status);
2715 }
2716 
2717 LIB_EXPORT
VdbBlastRunSetGetTotalLengthApprox(const VdbBlastRunSet * self)2718 uint64_t CC VdbBlastRunSetGetTotalLengthApprox(
2719     const VdbBlastRunSet *self)
2720 {
2721     VdbBlastStatus status = eVdbBlastNoErr;
2722 
2723     if (self == NULL) {
2724         STSMSG(1, ("VdbBlastRunSetGetTotalLengthApprox(self=NULL)"));
2725         return 0;
2726     }
2727 
2728     _VdbBlastRunSetBeingRead(self);
2729 
2730     return _RunSetGetTotalLengthApprox(&self->runs, &status);
2731 }
2732 
2733 LIB_EXPORT
VdbBlastRunSetGetMinSeqLen(const VdbBlastRunSet * self)2734 uint64_t CC VdbBlastRunSetGetMinSeqLen(const VdbBlastRunSet *self)
2735 {
2736     if (self->minSeqLen == ~0) {
2737         bool empty = true;
2738         VdbBlastStatus status = eVdbBlastNoErr;
2739         uint64_t res = ~0;
2740         uint32_t i = 0;
2741         _VdbBlastRunSetBeingRead(self);
2742         for (i = 0; i < self->runs.krun; ++i) {
2743             VdbBlastRun *run = &self->runs.run[i];
2744             assert(run);
2745             if (run->type == btpREFSEQ) {
2746                 uint64_t cand = _VdbBlastRunGetLengthApprox(run, &status);
2747                 if (status != eVdbBlastNoErr) {
2748                     S
2749                     return ~0;
2750                 }
2751                 if (cand < res && cand >= self->core2na.min_read_length) {
2752                     res = cand;
2753                 }
2754             }
2755             else {
2756                 status = _VdbBlastRunFillRunDesc(run);
2757                 if (status != eVdbBlastNoErr) {
2758                     S
2759                     return ~0;
2760                 }
2761                 if (!run->rd.varReadLen) {
2762                     uint8_t read = 0;
2763                     assert(run->rd.readType && run->rd.readLen);
2764                     for (read = 0; read < run->rd.nReads; ++read) {
2765                         if (run->rd.readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
2766                             if (run->rd.readLen[i]
2767                                 == self->core2na.min_read_length)
2768                             {
2769                                 ((VdbBlastRunSet*)self)->minSeqLen
2770                                     = self->core2na.min_read_length;
2771                                 return self->minSeqLen;
2772                             }
2773                             else if (run->rd.readLen[i]
2774                                     > self->core2na.min_read_length
2775                                 && run->rd.readLen[i] < res)
2776                             {
2777                                 res = run->rd.readLen[i];
2778                                 empty = false;
2779                             }
2780                         }
2781                     }
2782                 }
2783                 else {
2784                     res = _VdbBlastRunScan(
2785                         run, Min, self->core2na.min_read_length, res, &status);
2786                     if (status != eVdbBlastNoErr) {
2787                         S
2788                         return ~0;
2789                     }
2790                 }
2791             }
2792         }
2793         if (empty && res == ~0) {
2794             res = 0;
2795         }
2796         ((VdbBlastRunSet*)self)->minSeqLen = res;
2797     }
2798     return self->minSeqLen;
2799 }
2800 
2801 LIB_EXPORT
VdbBlastRunSetGetMaxSeqLen(const VdbBlastRunSet * self)2802 uint64_t CC VdbBlastRunSetGetMaxSeqLen(const VdbBlastRunSet *self)
2803 {
2804     if (self->maxSeqLen == ~0) {
2805         VdbBlastStatus status = eVdbBlastNoErr;
2806         uint64_t res = 0;
2807         uint32_t i = 0;
2808         _VdbBlastRunSetBeingRead(self);
2809         for (i = 0; i < self->runs.krun; ++i) {
2810             VdbBlastRun *run = &self->runs.run[i];
2811             assert(run);
2812             if (run->type == btpREFSEQ) {
2813                 uint64_t cand = _VdbBlastRunGetLengthApprox(run, &status);
2814                 if (status != eVdbBlastNoErr) {
2815                     S
2816                     return ~0;
2817                 }
2818                 if (cand > res) {
2819                     res = cand;
2820                 }
2821             }
2822             else {
2823                 status = _VdbBlastRunFillRunDesc(run);
2824                 if (status != eVdbBlastNoErr) {
2825                     S
2826                     return ~0;
2827                 }
2828                 if (!run->rd.varReadLen) {
2829                     uint8_t read = 0;
2830                     assert(run->rd.readType && run->rd.readLen);
2831                     for (read = 0; read < run->rd.nReads; ++read) {
2832                         if (run->rd.readType[i] & SRA_READ_TYPE_BIOLOGICAL) {
2833                             if (run->rd.readLen[i] > res) {
2834                                 res = run->rd.readLen[i];
2835                             }
2836                         }
2837                     }
2838                 }
2839                 else {
2840                     res = _VdbBlastRunScan(
2841                         run, Max, -1, res, &status);
2842                     if (status != eVdbBlastNoErr) {
2843                         S
2844                         return ~0;
2845                     }
2846                 }
2847             }
2848         }
2849         ((VdbBlastRunSet*)self)->maxSeqLen = res;
2850     }
2851     return self->maxSeqLen;
2852 }
2853 
2854 LIB_EXPORT
VdbBlastRunSetGetAvgSeqLen(const VdbBlastRunSet * self)2855 uint64_t CC VdbBlastRunSetGetAvgSeqLen(const VdbBlastRunSet *self)
2856 {
2857     uint64_t num = 0;
2858 
2859     if (self == NULL) {
2860         STSMSG(1, ("VdbBlastRunSetGetAvgSeqLen(self=NULL)"));
2861         return 0;
2862     }
2863 
2864     if (self->avgSeqLen == ~0) {
2865         uint64_t n = 0;
2866         _VdbBlastRunSetBeingRead(self);
2867 
2868         n = VdbBlastRunSetGetNumSequencesApprox(self);
2869         if (n != 0) {
2870             num = VdbBlastRunSetGetTotalLengthApprox(self) / n;
2871         }
2872         else {
2873             num = n;
2874         }
2875         ((VdbBlastRunSet*)self)->avgSeqLen = num;
2876     }
2877 
2878     STSMSG(1, ("VdbBlastRunSetGetAvgSeqLen = %ld", num));
2879     return self->avgSeqLen;
2880 }
2881 
VdbBlastRunSetGetName(const VdbBlastRunSet * self,VdbBlastStatus * status,char * name_buffer,size_t bsize)2882 LIB_EXPORT size_t CC VdbBlastRunSetGetName(const VdbBlastRunSet *self,
2883     VdbBlastStatus *status, char *name_buffer, size_t bsize)
2884 {
2885     size_t sz = 0;
2886 
2887     VdbBlastStatus dummy = eVdbBlastNoErr;
2888     if (status == NULL)
2889     {   status = &dummy; }
2890 
2891     *status = eVdbBlastErr;
2892 
2893     if (self == NULL)
2894     {   return 0; }
2895 
2896     _VdbBlastRunSetBeingRead(self);
2897 
2898     sz = _RunSetGetName(&self->runs, status, name_buffer, bsize);
2899 
2900     STSMSG(1, ("VdbBlastRunSetGetName = '%.*s'", bsize, name_buffer));
2901 
2902     return sz;
2903 }
2904 
2905 LIB_EXPORT
VdbBlastRunSetLastUpdatedDate(const VdbBlastRunSet * self)2906 time_t CC VdbBlastRunSetLastUpdatedDate(const VdbBlastRunSet *self)
2907 {
2908     _VdbBlastRunSetBeingRead(self);
2909     return _NotImplemented(__func__);
2910 }
2911 
2912 LIB_EXPORT
VdbBlastRunSetGetReadName(const VdbBlastRunSet * self,uint64_t read_id,char * name_buffer,size_t bsize)2913 size_t CC VdbBlastRunSetGetReadName(const VdbBlastRunSet *self,
2914     uint64_t read_id, /* 0-based in RunSet */
2915     char *name_buffer,
2916     size_t bsize)
2917 {
2918     rc_t rc = 0;
2919     VdbBlastStatus status = eVdbBlastNoErr;
2920     size_t need = 0;
2921     size_t num_writ = 0;
2922 
2923     ReadDesc desc;
2924     memset(&desc, 0, sizeof desc);
2925 
2926     if (name_buffer && bsize)
2927     {   name_buffer[0] = '\0'; }
2928 
2929     if (self == NULL) {
2930         STSMSG(1, ("VdbBlastRunSetGetReadName(self=NULL)"));
2931         return 0;
2932     }
2933 
2934     _VdbBlastRunSetBeingRead(self);
2935 
2936     status = _RunSetFindReadDesc(&self->runs, read_id, &desc);
2937     if (status != eVdbBlastNoErr) {
2938         STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadName: "
2939             "cannot find RunSet ReadDesc"));
2940         return 0;
2941     }
2942 
2943     assert(desc.run && desc.run->path && desc.run->acc && desc.spot
2944         && desc.read);
2945 
2946     if (desc.run->type == btpUndefined) {
2947         desc.run->type
2948             = _VdbBlastMgrBTableType(self->mgr, desc.run->path);
2949         assert(desc.run->type != btpUndefined);
2950     }
2951     if (desc.run->type == btpWGS) {
2952         if (desc.read != 1) {
2953             STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadName: "
2954                 "Unexpected read='%u' for run '%s', spot='%lu'",
2955                 desc.read, desc.run->path, desc.spot));
2956             return 0;
2957         }
2958         status = _VdbBlastRunGetWgsAccession(
2959             desc.run, desc.spot, name_buffer, bsize, &need);
2960         if (status != eVdbBlastNoErr)
2961         {   need = 0; }
2962         return need;
2963     }
2964     else if (desc.run->type == btpREFSEQ) {
2965         rc = string_printf(name_buffer, bsize, &num_writ, "%s", desc.run->acc);
2966         if (rc == 0) {
2967             S
2968             need = num_writ;
2969         }
2970         else if (GetRCObject(rc) == (enum RCObject)rcBuffer
2971             && GetRCState(rc) == rcInsufficient)
2972         {
2973             size_t size;
2974             S
2975             need = string_measure(desc.run->acc, &size) + 1;
2976         }
2977     }
2978     else {
2979         rc = string_printf(name_buffer, bsize, &num_writ,
2980             "%s.%lu.%u", desc.run->acc, desc.spot, desc.read);
2981         if (rc == 0) {
2982             S
2983             need = num_writ;
2984         }
2985         else if (GetRCObject(rc) == (enum RCObject)rcBuffer
2986             && GetRCState(rc) == rcInsufficient)
2987         {
2988             int i = 0;
2989             size_t size;
2990             S
2991             need = string_measure(desc.run->acc, &size) + 2;
2992             i = desc.spot;
2993             while (i > 0) {
2994                 ++need;
2995                 i /= 10;
2996             }
2997             i = desc.read;
2998             while (i > 0) {
2999                 ++need;
3000                 i /= 10;
3001             }
3002         }
3003         else
3004         {   LOGERR(klogInt, rc, "Unexpecter error in string_printf"); }
3005     }
3006 
3007     STSMSG(1, ("VdbBlastRunSetGetName = '%.*s'", bsize, name_buffer));
3008     return need;
3009 }
3010 
VdbBlastRunSetGetReadId(const VdbBlastRunSet * self,const char * name_buffer,size_t bsize,uint64_t * read_id)3011 LIB_EXPORT uint32_t CC VdbBlastRunSetGetReadId(const VdbBlastRunSet *self,
3012     const char *name_buffer, size_t bsize, uint64_t *read_id)
3013 {
3014     VdbBlastStatus status = eVdbBlastNoErr;
3015     bool found = false;
3016 
3017     uint64_t result = 0;
3018     char *acc = NULL;
3019     uint64_t spot = 0;
3020     uint32_t read = 0;
3021     uint32_t i = ~0;
3022     if (self == NULL || name_buffer == NULL || name_buffer[0] == '\0' ||
3023         bsize == 0 || read_id == 0)
3024     {   return eVdbBlastErr; }
3025 
3026     {
3027         size_t n = bsize;
3028         const char *end = name_buffer + bsize;
3029         char *dot2 = NULL;
3030         char *dot1 = memchr(name_buffer, '.', bsize);
3031         if (dot1 != NULL) {
3032             if (dot1 == name_buffer)
3033             {   return eVdbBlastErr; }
3034             if (dot1 - name_buffer + 1 >= bsize)
3035             {   return eVdbBlastErr; }
3036             n -= (dot1 - name_buffer + 1);
3037             dot2 = memchr(dot1 + 1, '.', n);
3038             if (dot2 != NULL) {
3039                 if (dot2 - name_buffer + 1 >= bsize)
3040                 {   return eVdbBlastErr; }
3041                 acc = string_dup(name_buffer, dot1 - name_buffer + 1);
3042                 if (acc == NULL)
3043                 {   return eVdbBlastMemErr; }
3044                 acc[dot1 - name_buffer] = '\0';
3045                 while (++dot1 < dot2) {
3046                     char c = *dot1;
3047                     if (c < '0' || c > '9') {
3048                         S
3049                         status = eVdbBlastErr;
3050                         break;
3051                     }
3052                     spot = spot * 10 + c - '0';
3053                 }
3054                 while (status == eVdbBlastNoErr && ++dot2 < end) {
3055                     char c = *dot2;
3056                     if (c < '0' || c > '9') {
3057                         S
3058                         status = eVdbBlastErr;
3059                         break;
3060                     }
3061                     read = read * 10 + c - '0';
3062                 }
3063             }
3064             else {
3065                 acc = malloc(bsize + 1);
3066                 if (acc == NULL)
3067                 {   return eVdbBlastMemErr; }
3068                 string_copy(acc, bsize + 1, name_buffer, bsize);
3069                 acc[bsize] = '\0';
3070             }
3071         }
3072         else {
3073             acc = malloc(bsize + 1);
3074             if (acc == NULL)
3075             {   return eVdbBlastMemErr; }
3076             string_copy(acc, bsize + 1, name_buffer, bsize);
3077             acc[bsize] = '\0';
3078         }
3079     }
3080 
3081     _VdbBlastRunSetBeingRead(self);
3082 
3083     for (i = 0; i < self->runs.krun && status == eVdbBlastNoErr; ++i) {
3084         uint64_t id = ~0;
3085         VdbBlastRun *run = self->runs.run + i;
3086         size_t size;
3087         assert(run && run->acc);
3088         if (string_measure(run->acc, &size)
3089             == string_measure(acc, &size))
3090         {
3091             if (strcmp(run->acc, acc) == 0) {
3092                 status = _VdbBlastRunGetReadId(run, acc, spot, read, &id);
3093                 if (status == eVdbBlastNoErr) {
3094                     if (run->rd.readIdDesc.idType == eFixedReadN) {
3095                         id += result;
3096                     }
3097                     *read_id = id;
3098                     found = true;
3099                 }
3100                 break;
3101             }
3102         }
3103         else if ((string_measure(run->acc, &size) < string_measure(acc, &size))
3104             && (run->type == btpWGS)
3105             && (memcmp(run->acc, acc, string_measure(run->acc, &size))
3106                 == 0))
3107         {
3108             status = _VdbBlastRunGetReadId(run, acc, spot, read, &id);
3109             if (status == eVdbBlastNoErr) {
3110                 if (run->rd.readIdDesc.idType == eFixedReadN) {
3111                     result += id;
3112                 }
3113                 else {
3114                     result = id;
3115                 }
3116                 *read_id = result;
3117                 found = true;
3118             }
3119             break;
3120         }
3121         result += _VdbBlastRunGetSequencesAmount(run, &status);
3122         if (status != eVdbBlastNoErr) {
3123             if (status == eVdbBlastTooExpensive) {
3124                 status = eVdbBlastNoErr;
3125             }
3126             else {
3127                 break;
3128             }
3129         }
3130     }
3131 
3132     if (status == eVdbBlastNoErr && !found) {
3133         S
3134         status = eVdbBlastErr;
3135     }
3136 
3137     free (acc);
3138     acc = NULL;
3139 
3140     return status;
3141 }
3142 
3143 /* TODO: make sure
3144          ReadLength is correct when there are multiple reads in the same spot */
3145 LIB_EXPORT
VdbBlastRunSetGetReadLength(const VdbBlastRunSet * self,uint64_t read_id)3146 uint64_t CC VdbBlastRunSetGetReadLength(const VdbBlastRunSet *self,
3147     uint64_t read_id)
3148 {
3149     rc_t rc = 0;
3150     const VCursor *curs = NULL;
3151     uint32_t col_idx = 0;
3152     char buffer[84] = "";
3153     uint32_t row_len = 0;
3154     ReadDesc desc;
3155     VdbBlastStatus status = eVdbBlastErr;
3156 
3157     if (self == NULL) {
3158         STSMSG(1, ("VdbBlastRunSetGetReadLength(self=NULL)"));
3159         return 0;
3160     }
3161 
3162     status = _RunSetFindReadDesc(&self->runs, read_id, &desc);
3163     if (status != eVdbBlastNoErr) {
3164         STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadLength: "
3165             "cannot find RunSet ReadDesc"));
3166         return 0;
3167     }
3168     assert(desc.run && desc.spot && desc.run->path);
3169 
3170     _VdbBlastRunSetBeingRead(self);
3171 
3172     if (rc == 0) {
3173         rc = _VTableMakeCursor
3174             (desc.run->obj->seqTbl, &curs, &col_idx, "READ", desc.run->acc);
3175         if (rc != 0) {
3176             PLOGERR(klogInt, (klogInt, rc, "Error in _VTableMakeCursor"
3177                 "$(path), READ)", "path=%s", desc.run->path));
3178         }
3179     }
3180     if (rc == 0) {
3181         rc = VCursorReadDirect
3182             (curs, desc.spot, col_idx, 8, buffer, sizeof buffer, &row_len);
3183         if (rc != 0) {
3184             PLOGERR(klogInt, (klogInt, rc, "Error in VCursorReadDirect"
3185                 "$(path), READ, spot=$(spot))",
3186                 "path=%s,spot=%ld", desc.run->path, desc.spot));
3187         }
3188     }
3189     VCursorRelease(curs);
3190     curs = NULL;
3191     if (rc == 0) {
3192         STSMSG(1, ("VdbBlastRunSetGetReadLength = %ld", row_len));
3193         return row_len;
3194     }
3195     else {
3196         STSMSG(1, ("Error: failed to VdbBlastRunSetGetReadLength"));
3197         return 0;
3198     }
3199 }
3200 
_VdbBlastRunSet2naRead(const VdbBlastRunSet * self,VdbBlastStatus * status,uint64_t * read_id,size_t * starting_base,uint8_t * buffer,size_t buffer_size,KVdbBlastReadMode mode)3201 uint64_t _VdbBlastRunSet2naRead(const VdbBlastRunSet *self,
3202     VdbBlastStatus *status, uint64_t *read_id, size_t *starting_base,
3203     uint8_t *buffer, size_t buffer_size, KVdbBlastReadMode mode)
3204 {
3205     uint64_t n = 0;
3206     rc_t rc = 0;
3207     assert(self && status);
3208 
3209     rc = KLockAcquire(self->core2na.mutex);
3210     if (rc != 0) {
3211         LOGERR(klogInt, rc, "Error in KLockAcquire");
3212     }
3213     else {
3214         const Core2na *c
3215             = mode != VDB_READ_REFERENCE ? &self->core2na : &self->core2naRef;
3216         n = _Core2naRead((Core2na*)c, &self->runs, status,
3217             read_id, starting_base, buffer, buffer_size);
3218         if (n == 0 && self->core2na.eos) {
3219             *read_id = ~0;
3220         }
3221         rc = KLockUnlock(self->core2na.mutex);
3222         if (rc != 0) {
3223             LOGERR(klogInt, rc, "Error in KLockUnlock");
3224         }
3225     }
3226     if (rc != 0) {
3227         *status = eVdbBlastErr;
3228     }
3229 
3230     if (*status == eVdbBlastNoErr || *status == eVdbBlastCircularSequence) {
3231         if (read_id != NULL && starting_base != NULL) {
3232             STSMSG(3, (
3233                 "VdbBlast2naReaderRead(read_id=%ld, starting_base=%ld) = %ld",
3234                 *read_id, *starting_base, n));
3235         }
3236         else {
3237             STSMSG(2, ("VdbBlast2naReaderRead = %ld", n));
3238         }
3239     }
3240     else {
3241         if (read_id != NULL && starting_base != NULL) {
3242             STSMSG(1, ("Error: failed to VdbBlast2naReaderRead(%ld)", n));
3243         }
3244         else {
3245             STSMSG(1, ("Error: failed to VdbBlast2naReaderRead"));
3246         }
3247     }
3248     return n;
3249 }
3250 
3251 /******************************************************************************/
3252