1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <kapp/main.h>
27 #include <kapp/args.h>
28 #include <klib/container.h>
29 #include <klib/log.h>
30 #include <klib/out.h>
31 #include <klib/status.h>
32 #include <klib/checksum.h>
33 #include <klib/rc.h>
34 #include <kdb/manager.h>
35 #include <kdb/table.h>
36 #include <kdb/meta.h>
37 #include <kdb/index.h>
38 
39 #include <sra/wsradb.h>
40 #include <sra/sradb-priv.h>
41 #include <sra/fastq.h>
42 #include <sra/sff.h>
43 
44 #include "zlib-simple.h"
45 #include "debug.h"
46 
47 #include <stdlib.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <errno.h>
51 
/* Size threshold, in bytes, at which an index block is closed (32 KiB by default). */
uint32_t g_file_block_sz = 32 * 1024;

/* Accession handed to the SFF/FASTQ readers and stored in the "accession" metadata node. */
const char *g_accession = NULL;

/* When true, every chunk of generated file content is also written to stderr. */
bool g_dump = false;

/* When true, MakeIndexes builds every index flavour; otherwise only the ".gz"
 * flavours are built and the plain ones are dropped. */
bool g_ungzip = false;
56 
57 typedef struct SIndexObj_struct {
58     KMDataNode* meta;
59     const char* const file;
60     const char* const format;
61     const char* const index;
62     rc_t (*func)(const SRATable* sratbl, struct SIndexObj_struct* obj, char* buffer, const size_t buffer_sz);
63     uint64_t file_size;
64     uint32_t buffer_sz;
65     uint64_t minSpotId;
66     uint64_t maxSpotId;
67     SLList li;
68     MD5State md5;
69     uint8_t md5_digest[16];
70 } SIndexObj;
71 
72 typedef struct SIndexNode_struct {
73     SLNode n;
74     uint64_t key;
75     uint64_t key_size;
76     int64_t id;
77     uint64_t id_qty;
78 } SIndexNode;
79 
80 typedef struct SIndexData_struct {
81     rc_t rc;
82     KIndex* kidx;
83 } SIndexData;
84 
85 static
InsertIndexData(SLNode * node,void * data)86 bool InsertIndexData( SLNode *node, void *data )
87 {
88     SIndexNode* n = (SIndexNode*)node;
89     SIndexData* d = (SIndexData*)data;
90 
91     d->rc = KIndexInsertU64(d->kidx, true, n->key, n->key_size, n->id, n->id_qty);
92     return d->rc == 0 ? false : true;
93 }
94 
95 static
WhackIndexData(SLNode * n,void * data)96 void WhackIndexData( SLNode *n, void *data )
97 {
98     free(n);
99 }
100 
101 static
CommitIndex(KTable * ktbl,const char * name,const SLList * li)102 rc_t CommitIndex(KTable* ktbl, const char* name, const SLList* li)
103 {
104     SIndexData data;
105 
106     STSMSG(0, ("Saving index %s", name));
107     data.rc = KTableCreateIndex(ktbl, &data.kidx, kitU64, kcmInit, name);
108     if( data.rc == 0 ) {
109         if( !SLListDoUntil(li, InsertIndexData, &data) ) {
110             data.rc = KIndexCommit(data.kidx);
111         }
112         KIndexRelease(data.kidx);
113     }
114     return data.rc;
115 }
116 
WriteFileMeta(SIndexObj * obj)117 rc_t WriteFileMeta(SIndexObj* obj)
118 {
119     rc_t rc = 0;
120     KMDataNode* nd = NULL;
121 
122     PLOGMSG(klogInfo, (klogInfo, "Meta $(f) on index $(i): file size $(s), buffer $(b)",
123         PLOG_4(PLOG_S(f),PLOG_S(i),PLOG_U64(s),PLOG_U32(b)), obj->file, obj->index, obj->file_size, obj->buffer_sz));
124 
125     if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Format")) == 0 ) {
126         KMDataNode* opt = NULL;
127         rc = KMDataNodeWriteCString(nd, obj->format);
128         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(nd, &opt, "Options")) == 0 ) {
129             KMDataNode* ond = NULL;
130             if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "accession")) == 0 ) {
131                 rc = KMDataNodeWriteCString(ond, g_accession);
132                 KMDataNodeRelease(ond);
133             }
134             if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "minSpotId")) == 0 ) {
135                 rc = KMDataNodeWriteB64(ond, &obj->minSpotId);
136                 KMDataNodeRelease(ond);
137             }
138             if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "maxSpotId")) == 0 ) {
139                 rc = KMDataNodeWriteB64(ond, &obj->maxSpotId);
140                 KMDataNodeRelease(ond);
141             }
142             KMDataNodeRelease(opt);
143         }
144         KMDataNodeRelease(nd);
145     }
146 
147     if( rc == 0 && obj->file_size > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Size")) == 0 ) {
148         rc = KMDataNodeWriteB64(nd, &obj->file_size);
149         KMDataNodeRelease(nd);
150     }
151 
152     if( rc == 0 && obj->buffer_sz > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Buffer")) == 0 ) {
153         rc = KMDataNodeWriteB32(nd, &obj->buffer_sz);
154         KMDataNodeRelease(nd);
155     }
156 
157     if( rc == 0 && strlen(obj->index) > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Index")) == 0 ) {
158         rc = KMDataNodeWriteCString(nd, obj->index);
159         KMDataNodeRelease(nd);
160     }
161 
162     if( rc == 0 && obj->file_size > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "md5")) == 0 ) {
163         char x[5];
164         int i;
165         for( i = 0; rc == 0 && i < sizeof(obj->md5_digest); i++ ) {
166             int l = snprintf(x, 4, "%02x", obj->md5_digest[i]);
167             rc = KMDataNodeAppend(nd, x, l);
168         }
169         KMDataNodeRelease(nd);
170     }
171     return rc;
172 }
173 
174 static
SFF_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)175 rc_t SFF_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
176 {
177     rc_t rc = 0;
178     const SFFReader* reader = NULL;
179 
180     if( (rc = SFFReaderMake(&reader, sratbl, g_accession, obj->minSpotId, obj->maxSpotId)) != 0 ) {
181         return rc;
182     } else {
183         size_t written = 0;
184         uint32_t blk = 0;
185         SIndexNode* inode = NULL;
186 
187         while( rc == 0 ) {
188             rc = SFFReader_GetNextSpotData(reader, buffer, buffer_sz, &written);
189             if( blk >= g_file_block_sz || (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted) ) {
190                 inode->key_size = blk;
191                 SLListPushTail(&obj->li, &inode->n);
192                 DEBUG_MSG(5, ("SFF index closed spots %lu, offset %lu, block size %lu\n", inode->id_qty, inode->key, inode->key_size));
193                 inode = NULL;
194                 if( blk > obj->buffer_sz ) {
195                     obj->buffer_sz = blk;
196                 }
197                 blk = 0;
198             }
199             if( GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted ) {
200                 rc = 0;
201                 break;
202             }
203             if( inode == NULL ) {
204                 spotid_t spotid = 0;
205                 if( (rc = SFFReaderCurrentSpot(reader, &spotid)) != 0 ) {
206                     break;
207                 }
208                 inode = malloc(sizeof(SIndexNode));
209                 if( inode == NULL ) {
210                     rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
211                     break;
212                 }
213                 inode->key = obj->file_size;
214                 inode->key_size = 0;
215                 inode->id = spotid;
216                 inode->id_qty = 0;
217                 DEBUG_MSG(5, ("SFF index opened spot %ld, offset %lu\n", inode->id, inode->key));
218                 if( spotid == 1 ) {
219                     char hd[10240];
220                     size_t hd_sz = 0;
221                     if( (rc = SFFReaderHeader(reader, 0, hd, sizeof(hd), &hd_sz)) == 0 ) {
222                         obj->file_size += hd_sz;
223                         blk += hd_sz;
224                         MD5StateAppend(&obj->md5, hd, hd_sz);
225                         if( g_dump ) {
226                             fwrite(hd, hd_sz, 1, stderr);
227                         }
228                     }
229                 }
230             }
231             obj->file_size += written;
232             blk += written;
233             inode->id_qty++;
234             MD5StateAppend(&obj->md5, buffer, written);
235             if( g_dump ) {
236                 fwrite(buffer, written, 1, stderr);
237             }
238         }
239         rc = rc ? rc : Quitting();
240         if( rc != 0 ) {
241             spotid_t spot = 0;
242             SFFReaderCurrentSpot(reader, &spot);
243             PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
244         }
245     }
246     SFFReaderWhack(reader);
247     return rc;
248 }
249 
250 static
SFFGzip_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)251 rc_t SFFGzip_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
252 {
253     rc_t rc = 0;
254     uint16_t zlib_ver = ZLIB_VERNUM;
255     const SFFReader* reader = NULL;
256 
257     if( (rc = SFFReaderMake(&reader, sratbl, g_accession, obj->minSpotId, obj->maxSpotId)) != 0 ) {
258         return rc;
259     } else {
260         size_t written = 0;
261         uint32_t blk = 0, spots_per_block = 0, proj_id_qty = 0;
262         SIndexNode* inode = NULL;
263         size_t z_blk = 0;
264         size_t spots_buf_sz = g_file_block_sz * 100;
265         size_t zbuf_sz = spots_buf_sz + 100;
266 
267         char* zbuf = malloc(zbuf_sz);
268         char* spots_buf = malloc(spots_buf_sz);
269         bool eof = false;
270 
271         if( zbuf == NULL || spots_buf == NULL ) {
272             rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
273         }
274         while( rc == 0 ) {
275             if( (rc = SFFReader_GetNextSpotData(reader, buffer, buffer_sz, &written)) == 0 ) {
276                 if( inode == NULL ) {
277                     spotid_t spotid = 0;
278                     if( (rc = SFFReaderCurrentSpot(reader, &spotid)) != 0 ) {
279                         break;
280                     }
281                     inode = malloc(sizeof(SIndexNode));
282                     if( inode == NULL ) {
283                         rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
284                         break;
285                     }
286                     inode->key = obj->file_size;
287                     inode->key_size = 0;
288                     inode->id = spotid;
289                     inode->id_qty = 0;
290                     DEBUG_MSG(5, ("%s open key: spot %ld, offset %lu\n", obj->index, inode->id, inode->key));
291                     if( spotid == 1 ) {
292                         char hd[10240];
293                         size_t hd_sz = 0;
294                         if( (rc = SFFReaderHeader(reader, 0, hd, sizeof(hd), &hd_sz)) == 0 ) {
295                             if( hd_sz + written > spots_buf_sz ) {
296                                 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
297                                 break;
298                             }
299                             memmove(&spots_buf[blk], hd, hd_sz);
300                             blk += hd_sz;
301                             if( g_dump ) {
302                                 fwrite(hd, hd_sz, 1, stderr);
303                             }
304                         }
305                     }
306 
307                 }
308                 if( blk + written > spots_buf_sz ) {
309                     rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
310                     break;
311                 }
312                 inode->id_qty++;
313                 memmove(&spots_buf[blk], buffer, written);
314                 blk += written;
315                 if( g_dump ) {
316                     fwrite(buffer, written, 1, stderr);
317                 }
318             }
319             if( (eof = (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted)) ) {
320                 rc = 0;
321                 if( inode == NULL ) {
322                     break;
323                 }
324             }
325             if( rc == 0 && (eof ||
326                             (proj_id_qty == 0 && inode->id_qty > (spots_per_block * 0.95)) ||
327                             (proj_id_qty > 0 && inode->id_qty >= proj_id_qty) ) ) {
328                 rc = ZLib_DeflateBlock(spots_buf, blk, zbuf, zbuf_sz, &z_blk);
329                 if( z_blk < g_file_block_sz ) {
330                     /* project needed id_qty */
331                     proj_id_qty = g_file_block_sz * inode->id_qty / z_blk * 1.05;
332                     DEBUG_MSG(5, ("%s: project id qty %lu\n", obj->index, proj_id_qty));
333                 } else {
334                     DEBUG_MSG(10, ("%s: no projection %lu > %lu\n", obj->index, z_blk, g_file_block_sz));
335                 }
336             }
337             if( rc == 0 && (eof || z_blk >= g_file_block_sz) ) {
338                 obj->file_size += z_blk;
339                 MD5StateAppend(&obj->md5, zbuf, z_blk);
340                 inode->key_size = z_blk;
341                 SLListPushTail(&obj->li, &inode->n);
342                 DEBUG_MSG(5, ("%s close key: spots %lu, size %lu, ratio %hu%%, raw %lu\n",
343                          obj->index, inode->id_qty, inode->key_size, (uint16_t)(((float)(blk - z_blk)/blk)*100), blk));
344                 spots_per_block = inode->id_qty;
345                 inode = NULL;
346                 if( blk > obj->buffer_sz ) {
347                     obj->buffer_sz = blk;
348                 }
349                 blk = 0;
350                 z_blk = 0;
351                 proj_id_qty = 0;
352             }
353             if( eof ) {
354                 break;
355             }
356         }
357         rc = rc ? rc : Quitting();
358         if( rc != 0 ) {
359             spotid_t spot = 0;
360             SFFReaderCurrentSpot(reader, &spot);
361             PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
362         }
363         free(zbuf);
364         free(spots_buf);
365     }
366     if( rc == 0 ) {
367         KMDataNode* opt = NULL, *nd = NULL;
368 
369         if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
370             return rc;
371         }
372         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "ZlibVersion")) == 0 ) {
373             rc = KMDataNodeWriteB16(nd, &zlib_ver);
374             KMDataNodeRelease(nd);
375         }
376         KMDataNodeRelease(opt);
377     }
378     SFFReaderWhack(reader);
379     return rc;
380 }
381 
382 static
Fastq_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)383 rc_t Fastq_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
384 {
385     rc_t rc = 0;
386     const FastqReader* reader = NULL;
387 
388     uint8_t colorSpace = false;
389     char* colorSpaceKey = "\0";
390     uint8_t origFormat = false;
391     uint8_t printLabel = true;
392     uint8_t printReadId = true;
393     uint8_t clipQuality = true;
394     uint32_t minReadLen = 0;
395     uint16_t qualityOffset = 0;
396 
397     {{
398         const SRAColumn* c = NULL;
399         const uint8_t *platform = SRA_PLATFORM_UNDEFINED;
400         bitsz_t o, z;
401 
402         if( (rc = SRATableOpenColumnRead(sratbl, &c, "PLATFORM", sra_platform_id_t)) != 0 ) {
403             return rc;
404         }
405         if( (rc = SRAColumnRead(c, 1, (const void **)&platform, &o, &z)) != 0 ) {
406             return rc;
407         }
408         if( *platform == SRA_PLATFORM_ABSOLID ) {
409             colorSpace = true;
410         }
411         SRAColumnRelease(c);
412     }}
413 
414     if( (rc = FastqReaderMake(&reader, sratbl, g_accession,
415                         colorSpace, origFormat, false, printLabel, printReadId,
416                         !clipQuality, minReadLen, qualityOffset, colorSpaceKey[0],
417                         obj->minSpotId, obj->maxSpotId)) != 0 ) {
418         return rc;
419     } else {
420         KMDataNode* opt = NULL, *nd = NULL;
421 
422         if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
423             return rc;
424         }
425         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpace")) == 0 ) {
426             rc = KMDataNodeWriteB8(nd, &colorSpace);
427             KMDataNodeRelease(nd);
428         }
429         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpaceKey")) == 0 ) {
430             rc = KMDataNodeWrite(nd, colorSpaceKey, 1);
431             KMDataNodeRelease(nd);
432         }
433         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "origFormat")) == 0 ) {
434             rc = KMDataNodeWriteB8(nd, &origFormat);
435             KMDataNodeRelease(nd);
436         }
437         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printLabel")) == 0 ) {
438             rc = KMDataNodeWriteB8(nd, &printLabel);
439             KMDataNodeRelease(nd);
440         }
441         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printReadId")) == 0 ) {
442             rc = KMDataNodeWriteB8(nd, &printReadId);
443             KMDataNodeRelease(nd);
444         }
445         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "clipQuality")) == 0 ) {
446             rc = KMDataNodeWriteB8(nd, &clipQuality);
447             KMDataNodeRelease(nd);
448         }
449         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "minReadLen")) == 0 ) {
450             rc = KMDataNodeWriteB32(nd, &minReadLen);
451             KMDataNodeRelease(nd);
452         }
453         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "qualityOffset")) == 0 ) {
454             rc = KMDataNodeWriteB16(nd, &qualityOffset);
455             KMDataNodeRelease(nd);
456         }
457         KMDataNodeRelease(opt);
458     }
459 
460     if( rc == 0 ) {
461         size_t written = 0;
462         uint32_t blk = 0;
463         SIndexNode* inode = NULL;
464 
465         while( rc == 0 ) {
466             rc = FastqReader_GetNextSpotSplitData(reader, buffer, buffer_sz, &written);
467             if( blk >= g_file_block_sz || (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted) ) {
468                 inode->key_size = blk;
469                 SLListPushTail(&obj->li, &inode->n);
470                 DEBUG_MSG(5, ("Fastq index closed spots %lu, offset %lu, block size %lu\n",
471                                                             inode->id_qty, inode->key, inode->key_size));
472                 inode = NULL;
473                 if( blk > obj->buffer_sz ) {
474                     obj->buffer_sz = blk;
475                 }
476                 blk = 0;
477             }
478             if( GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted ) {
479                 rc = 0;
480                 break;
481             }
482             if( inode == NULL ) {
483                 spotid_t spotid = 0;
484                 if( (rc = FastqReaderCurrentSpot(reader, &spotid)) != 0 ) {
485                     break;
486                 }
487                 inode = malloc(sizeof(SIndexNode));
488                 if( inode == NULL ) {
489                     rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
490                     break;
491                 }
492                 inode->key = obj->file_size;
493                 inode->key_size = 0;
494                 inode->id = spotid;
495                 inode->id_qty = 0;
496                 DEBUG_MSG(5, ("Fastq index opened spot %ld, offset %lu\n", inode->id, inode->key));
497             }
498             inode->id_qty++;
499             obj->file_size += written;
500             blk += written;
501             MD5StateAppend(&obj->md5, buffer, written);
502             if( g_dump ) {
503                 fwrite(buffer, written, 1, stderr);
504             }
505         }
506         rc = rc ? rc : Quitting();
507         if( rc != 0 ) {
508             spotid_t spot = 0;
509             FastqReaderCurrentSpot(reader, &spot);
510             PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
511         }
512     }
513     FastqReaderWhack(reader);
514     return rc;
515 }
516 
517 static
FastqGzip_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)518 rc_t FastqGzip_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
519 {
520     rc_t rc = 0;
521     const FastqReader* reader = NULL;
522 
523     uint16_t zlib_ver = ZLIB_VERNUM;
524     uint8_t colorSpace = false;
525     char* colorSpaceKey = "\0";
526     uint8_t origFormat = false;
527     uint8_t printLabel = true;
528     uint8_t printReadId = true;
529     uint8_t clipQuality = true;
530     uint32_t minReadLen = 0;
531     uint16_t qualityOffset = 0;
532 
533     {{
534         const SRAColumn* c = NULL;
535         const uint8_t *platform = SRA_PLATFORM_UNDEFINED;
536         bitsz_t o, z;
537 
538         if( (rc = SRATableOpenColumnRead(sratbl, &c, "PLATFORM", sra_platform_id_t)) != 0 ) {
539             return rc;
540         }
541         if( (rc = SRAColumnRead(c, 1, (const void **)&platform, &o, &z)) != 0 ) {
542             return rc;
543         }
544         if( *platform == SRA_PLATFORM_ABSOLID ) {
545             colorSpace = true;
546         }
547         SRAColumnRelease(c);
548     }}
549 
550     if( (rc = FastqReaderMake(&reader, sratbl, g_accession,
551                         colorSpace, origFormat, false, printLabel, printReadId,
552                         !clipQuality, minReadLen, qualityOffset, colorSpaceKey[0],
553                         obj->minSpotId, obj->maxSpotId)) != 0 ) {
554         return rc;
555     } else {
556         size_t written = 0;
557         uint32_t blk = 0, spots_per_block = 0, proj_id_qty = 0;
558         SIndexNode* inode = NULL;
559         size_t z_blk = 0;
560         size_t spots_buf_sz = g_file_block_sz * 100;
561         size_t zbuf_sz = spots_buf_sz + 100;
562         char* zbuf = malloc(zbuf_sz);
563         char* spots_buf = malloc(spots_buf_sz);
564         bool eof = false;
565 
566         if( zbuf == NULL || spots_buf == NULL ) {
567             rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
568         }
569         while( rc == 0 ) {
570             if( (rc = FastqReader_GetNextSpotSplitData(reader, buffer, buffer_sz, &written)) == 0 ) {
571                 if( inode == NULL ) {
572                     spotid_t spotid = 0;
573                     if( (rc = FastqReaderCurrentSpot(reader, &spotid)) != 0 ) {
574                         break;
575                     }
576                     inode = malloc(sizeof(SIndexNode));
577                     if( inode == NULL ) {
578                         rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
579                         break;
580                     }
581                     inode->key = obj->file_size;
582                     inode->key_size = 0;
583                     inode->id = spotid;
584                     inode->id_qty = 0;
585                     DEBUG_MSG(5, ("%s open key: spot %ld, offset %lu\n", obj->index, inode->id, inode->key));
586                 }
587                 if( blk + written > spots_buf_sz ) {
588                     rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
589                     break;
590                 }
591                 inode->id_qty++;
592                 memmove(&spots_buf[blk], buffer, written);
593                 blk += written;
594                 if( g_dump ) {
595                     fwrite(buffer, written, 1, stderr);
596                 }
597             }
598             if( (eof = (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted)) ) {
599                 rc = 0;
600                 if( inode == NULL ) {
601                     break;
602                 }
603             }
604             if( rc == 0 && (eof ||
605                             (proj_id_qty == 0 && inode->id_qty > (spots_per_block * 0.95)) ||
606                             (proj_id_qty > 0 && inode->id_qty >= proj_id_qty) ) ) {
607                 rc = ZLib_DeflateBlock(spots_buf, blk, zbuf, zbuf_sz, &z_blk);
608                 if( z_blk < g_file_block_sz ) {
609                     /* project needed id_qty */
610                     proj_id_qty = g_file_block_sz * inode->id_qty / z_blk * 1.05;
611                     DEBUG_MSG(5, ("%s: project id qty %u\n", obj->index, proj_id_qty));
612                 } else {
613                     DEBUG_MSG(10, ("%s: no projection %u > %u\n", obj->index, z_blk, g_file_block_sz));
614                 }
615             }
616             if( rc == 0 && (eof || z_blk >= g_file_block_sz) ) {
617                 obj->file_size += z_blk;
618                 MD5StateAppend(&obj->md5, zbuf, z_blk);
619                 inode->key_size = z_blk;
620                 SLListPushTail(&obj->li, &inode->n);
621                 DEBUG_MSG(5, ("%s close key: spots %lu, size %lu, ratio %hu%%, raw %u\n",
622                          obj->index, inode->id_qty, inode->key_size, (uint16_t)(((float)(blk - z_blk)/blk)*100), blk ));
623                 spots_per_block = inode->id_qty;
624                 inode = NULL;
625                 if( blk > obj->buffer_sz ) {
626                     obj->buffer_sz = blk;
627                 }
628                 blk = 0;
629                 z_blk = 0;
630                 proj_id_qty = 0;
631             }
632             if( eof ) {
633                 break;
634             }
635         }
636         rc = rc ? rc : Quitting();
637         if( rc != 0 ) {
638             spotid_t spot = 0;
639             FastqReaderCurrentSpot(reader, &spot);
640             PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
641         }
642         free(zbuf);
643         free(spots_buf);
644     }
645     if( rc == 0 ) {
646         KMDataNode* opt = NULL, *nd = NULL;
647 
648         if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
649             return rc;
650         }
651         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "ZlibVersion")) == 0 ) {
652             rc = KMDataNodeWriteB16(nd, &zlib_ver);
653             KMDataNodeRelease(nd);
654         }
655         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpace")) == 0 ) {
656             rc = KMDataNodeWriteB8(nd, &colorSpace);
657             KMDataNodeRelease(nd);
658         }
659         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpaceKey")) == 0 ) {
660             rc = KMDataNodeWrite(nd, colorSpaceKey, 1);
661             KMDataNodeRelease(nd);
662         }
663         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "origFormat")) == 0 ) {
664             rc = KMDataNodeWriteB8(nd, &origFormat);
665             KMDataNodeRelease(nd);
666         }
667         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printLabel")) == 0 ) {
668             rc = KMDataNodeWriteB8(nd, &printLabel);
669             KMDataNodeRelease(nd);
670         }
671         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printReadId")) == 0 ) {
672             rc = KMDataNodeWriteB8(nd, &printReadId);
673             KMDataNodeRelease(nd);
674         }
675         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "clipQuality")) == 0 ) {
676             rc = KMDataNodeWriteB8(nd, &clipQuality);
677             KMDataNodeRelease(nd);
678         }
679         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "minReadLen")) == 0 ) {
680             rc = KMDataNodeWriteB32(nd, &minReadLen);
681             KMDataNodeRelease(nd);
682         }
683         if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "qualityOffset")) == 0 ) {
684             rc = KMDataNodeWriteB16(nd, &qualityOffset);
685             KMDataNodeRelease(nd);
686         }
687         KMDataNodeRelease(opt);
688     }
689     FastqReaderWhack(reader);
690     return rc;
691 }
692 
693 static
MakeIndexes(const SRATable * stbl,KTable * ktbl,KMetadata * meta)694 rc_t MakeIndexes(const SRATable* stbl, KTable* ktbl, KMetadata* meta)
695 {
696     rc_t rc = 0;
697     int i;
698     char* buffer = NULL;
699     size_t buffer_sz = g_file_block_sz * 100;
700 
701     SIndexObj idx[] = {
702      /*  meta, file,        format,         index,          func,    file_size, buffer_sz, minSpotId, maxSpotId */
703         {NULL, "fastq",    "fastq",      "fuse-fastq",    Fastq_Idx,     0, 0, 0, 0},
704         {NULL, "sff",      "SFF",        "fuse-sff",      SFF_Idx,       0, 0, 0, 0},
705         {NULL, "fastq.gz", "fastq-gzip", "fuse-fastq-gz", FastqGzip_Idx, 0, 0, 0, 0},
706         {NULL, "sff.gz",   "SFF-gzip",   "fuse-sff-gz",   SFFGzip_Idx,   0, 0, 0, 0}
707     };
708 
709     for(i = 0; rc == 0 && i < sizeof(idx) / sizeof(idx[0]); i++) {
710         KMDataNode* parent = NULL;
711         if( (rc = KMetadataOpenNodeUpdate(meta, &parent, "/FUSE")) == 0 ) {
712             KMDataNodeDropChild(parent, "root"); /* drop old stuff */
713             if( g_ungzip || strcmp(&idx[i].file[strlen(idx[i].file) - 3], ".gz") == 0 ) {
714                 STSMSG(0, ("Preparing index %s", idx[i].index));
715                 MD5StateInit(&idx[i].md5);
716                 SLListInit(&idx[i].li);
717                 KMDataNodeDropChild(parent, "%s.tmp", idx[i].file);
718                 if( (rc = KMDataNodeOpenNodeUpdate(parent, &idx[i].meta, "%s.tmp", idx[i].file)) == 0 ) {
719                     if( idx[i].func != NULL ) {
720                         if( buffer == NULL ) {
721                             if( (buffer = malloc(buffer_sz)) == NULL ) {
722                                 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
723                                 break;
724                             }
725                         }
726                         rc = idx[i].func(stbl, &idx[i], buffer, buffer_sz);
727                         if( rc == 0 ) {
728                             MD5StateFinish(&idx[i].md5, idx[i].md5_digest);
729                             rc = CommitIndex(ktbl, idx[i].index, &idx[i].li);
730                         }
731                     }
732                     if( rc == 0 ) {
733                         rc = WriteFileMeta(&idx[i]);
734                     }
735                     KMDataNodeRelease(idx[i].meta);
736                 }
737                 if( GetRCState(rc) == rcUnsupported ) {
738                     KMDataNodeDropChild(parent, "%s", idx[i].file);
739                     PLOGERR(klogWarn, (klogWarn, rc, "Index $(i) is not supported for this table", PLOG_S(i), idx[i].index));
740                     rc = 0;
741                 } else if( rc == 0 ) {
742                     char f[4096];
743                     strcpy(f, idx[i].file);
744                     strcat(f, ".tmp");
745                     KMDataNodeDropChild(parent, "%s", idx[i].file);
746                     rc = KMDataNodeRenameChild(parent, f, idx[i].file);
747                 }
748             } else if( !g_ungzip ) {
749                 KTableDropIndex(ktbl, idx[i].index);
750                 KMDataNodeDropChild(parent, "%s", idx[i].file);
751             }
752             KMDataNodeDropChild(parent, "%s.tmp", idx[i].file);
753             KMDataNodeRelease(parent);
754         }
755         SLListWhack(&idx[i].li, WhackIndexData, NULL);
756     }
757     free(buffer);
758     return rc;
759 }
760 
/* NULL-terminated text for the block-size option (referenced from MainArgs) */
const char* blocksize_usage[] = {
    "Index block size",
    NULL
};
/* NULL-terminated text for the accession option (referenced from MainArgs) */
const char* accession_usage[] = {
    "Accession",
    NULL
};
763 
/* Positions of the entries in the MainArgs array below.
 * The enumerators must stay in the same order as that array. */
enum OptDefIndex {
    eopt_BlockSize,     /* first enumerator, value 0 */
    eopt_Accession,
    eopt_DumpIndex,
    eopt_noGzip
};
771 
772 OptDef MainArgs[] =
773 {
774     /* if you change order in this array, rearrange enum above accordingly! */
775     {"block-size", "b", NULL, blocksize_usage, 1, true, false},
776     {"accession", "a", NULL, accession_usage, 1, true, false},
777     {"hidden-dump", "d", NULL, NULL, 1, false, false},
778     {"hidden-nogzip", "g", NULL, NULL, 1, false, false}
779 };
780 const char* MainParams[] =
781 {
782     /* if you change order in this array, rearrange enum above accordingly! */
783     "size",
784     "accession",
785     NULL,
786     NULL
787 };
788 const size_t MainArgsQty = sizeof(MainArgs) / sizeof(MainArgs[0]);
789 
790 const char UsageDefaultName[] = "sra-makeidx";
791 
UsageSummary(const char * name)792 rc_t CC UsageSummary (const char * name)
793 {
794     return 0;
795 }
796 
Usage(const Args * args)797 rc_t CC Usage(const Args* args)
798 {
799     const char * progname = UsageDefaultName;
800     const char * fullpath = UsageDefaultName;
801     rc_t rc;
802     int i;
803 
804     if (args == NULL)
805         rc = RC (rcApp, rcArgv, rcAccessing, rcSelf, rcNull);
806     else
807         rc = ArgsProgram (args, &fullpath, &progname);
808 
809     OUTMSG(( "\nUsage:\n\t%s [options] <table>\n\n", progname));
810 
811     for(i = 0; i < MainArgsQty; i++ ) {
812         if( MainArgs[i].required && MainArgs[i].help ) {
813             HelpOptionLine(MainArgs[i].aliases, MainArgs[i].name, MainParams[i], MainArgs[i].help);
814         }
815     }
816     OUTMSG(("\nOptions:\n"));
817     for(i = 0; i < MainArgsQty; i++ ) {
818         if( !MainArgs[i].required && MainArgs[i].help ) {
819             HelpOptionLine(MainArgs[i].aliases, MainArgs[i].name, MainParams[i], MainArgs[i].help);
820         }
821     }
822     OUTMSG(("\n"));
823     HelpOptionsStandard();
824     HelpVersion(fullpath, KAppVersion());
825     return rc;
826 }
KMain(int argc,char * argv[])827 rc_t KMain(int argc, char *argv[])
828 {
829     rc_t rc = 0;
830     Args* args = NULL;
831     const char* errmsg = NULL, *table_dir = NULL;
832     char accn[1024];
833 
834     if( (rc = ArgsMakeAndHandle(&args, argc, argv, 1, MainArgs, MainArgsQty)) == 0 ) {
835         const char* blksz = NULL;
836         uint32_t count, dump = 0, gzip = 0;
837 
838         if( (rc = ArgsParamCount(args, &count)) != 0 || count != 1 ) {
839             rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, count > 1 ? rcExcessive : rcInsufficient);
840             errmsg = "table";
841 
842         } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_BlockSize].name, &count)) != 0 || count > 1 ) {
843             rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, rcExcessive);
844             errmsg = MainArgs[eopt_BlockSize].name;
845         } else if( count > 0 && (rc = ArgsOptionValue(args, MainArgs[eopt_BlockSize].name, 0, (const void **)&blksz)) != 0 ) {
846             errmsg = MainArgs[eopt_BlockSize].name;
847 
848         } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_Accession].name, &count)) != 0 || count > 1 ) {
849             rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, rcExcessive);
850             errmsg = MainArgs[eopt_Accession].name;
851         } else if( count > 0 && (rc = ArgsOptionValue(args, MainArgs[eopt_Accession].name, 0, (const void **)&g_accession)) != 0 ) {
852             errmsg = MainArgs[eopt_Accession].name;
853 
854         } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_DumpIndex].name, &dump)) != 0 ) {
855             errmsg = MainArgs[eopt_DumpIndex].name;
856 
857         } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_noGzip].name, &gzip)) != 0 ) {
858             errmsg = MainArgs[eopt_noGzip].name;
859         }
860         while( rc == 0 ) {
861             long val = 0;
862             char* end = NULL;
863 
864             if( blksz != NULL ) {
865                 errno = 0;
866                 val = strtol(blksz, &end, 10);
867                 if( errno != 0 || blksz == end || *end != '\0' || val <= 0 ) {
868                     rc = RC(rcExe, rcArgv, rcReading, rcParam, rcInvalid);
869                     errmsg = MainArgs[eopt_BlockSize].name;
870                     break;
871                 } else if( val <= 128 || val > (1024 * 1024 * 1024) ) {
872                     rc = RC(rcExe, rcArgv, rcValidating, rcParam, rcEmpty);
873                     errmsg = "block size invalid";
874                     break;
875                 }
876                 g_file_block_sz = val;
877             }
878             if( (rc = ArgsParamValue(args, 0, (const void **)&table_dir)) != 0 ) {
879                 errmsg = "table";
880                 break;
881             }
882             if( g_accession == NULL ) {
883                 const char* p = strchr(table_dir, '/');
884                 size_t l = 0;
885 
886                 g_accession = accn;
887                 if( p == NULL ) {
888                     p = strchr(table_dir, '\\');
889                 }
890                 strncpy(accn, p == NULL ? table_dir : p + 1, sizeof(accn) - 1);
891                 if( accn[0] == '\0' ) {
892                     rc = RC(rcExe, rcArgv, rcValidating, rcParam, rcEmpty);
893                     errmsg = "accession";
894                 }
895                 l = strlen(accn);
896                 if( accn[l - 1] == '/' || accn[l - 1] == '\\') {
897                     accn[--l] = '\0';
898                 }
899                 if( strncmp(&accn[l - 9], ".lite.sra", 9) == 0 ) {
900                     accn[l - 9] = '\0';
901                 } else if( strncmp(&accn[l - 4], ".sra", 4) == 0 ) {
902                     accn[l - 4] = '\0';
903                 }
904             }
905             g_dump = dump > 0;
906             g_ungzip = gzip > 0;
907             break;
908         }
909     }
910     if( rc == 0 ) {
911         SRAMgr* smgr = NULL;
912         KDBManager* kmgr = NULL;
913 
914         DEBUG_MSG(5, ("table %s, accession %s\n", table_dir, g_accession));
915         if( (rc = SRAMgrMakeUpdate(&smgr, NULL)) == 0 ) {
916             if( (rc = KDBManagerMakeUpdate(&kmgr, NULL)) == 0 ) {
917                 bool relock = true;
918                 if( (rc = KDBManagerUnlock(kmgr, table_dir)) != 0 ) {
919                     relock = false;
920                     rc = GetRCState(rc) == rcUnlocked ? 0 : rc;
921                 } else {
922                     PLOGMSG(klogInfo, (klogInfo, "Table $(p) locked, unlocking", PLOG_S(p), table_dir));
923                 }
924                 if( rc == 0 ) {
925                     KTable* ktbl = NULL;
926                     if( (rc = KDBManagerOpenTableUpdate(kmgr, &ktbl, table_dir)) == 0 ) {
927                         KMetadata* meta = NULL;
928                         if( (rc = KTableOpenMetadataUpdate(ktbl, &meta)) == 0 ) {
929                             const SRATable* stbl = NULL;
930                             if( (rc = SRAMgrOpenTableRead(smgr, &stbl, table_dir)) == 0 ) {
931                                 rc = MakeIndexes(stbl, ktbl, meta);
932                                 SRATableRelease(stbl);
933                             }
934                         }
935                         KMetadataRelease(meta);
936                     }
937                     KTableRelease(ktbl);
938                 }
939                 if( rc == 0 && relock ) {
940                     rc = KDBManagerLock(kmgr, table_dir);
941                 }
942                 KDBManagerRelease(kmgr);
943             }
944             SRAMgrRelease(smgr);
945         }
946     }
947     if( rc != 0 && rc != KLogLastErrorCode() ) {
948         if( errmsg ) {
949             Usage(args);
950         }
951         LOGERR(klogErr, rc, errmsg ? errmsg : "stop");
952     }
953     ArgsWhack(args);
954     return rc;
955 }
956