1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 #include <kapp/main.h>
27 #include <kapp/args.h>
28 #include <klib/container.h>
29 #include <klib/log.h>
30 #include <klib/out.h>
31 #include <klib/status.h>
32 #include <klib/checksum.h>
33 #include <klib/rc.h>
34 #include <kdb/manager.h>
35 #include <kdb/table.h>
36 #include <kdb/meta.h>
37 #include <kdb/index.h>
38
39 #include <sra/wsradb.h>
40 #include <sra/sradb-priv.h>
41 #include <sra/fastq.h>
42 #include <sra/sff.h>
43
44 #include "zlib-simple.h"
45 #include "debug.h"
46
47 #include <stdlib.h>
48 #include <string.h>
49 #include <stdio.h>
50 #include <errno.h>
51
/* Threshold, in bytes, at which the index builders close the current block
 * and start a new one.  Defaults to 32 KiB; KMain overwrites it from the
 * "block-size" option. */
52 uint32_t g_file_block_sz = 32 * 1024;
/* Accession string handed to the SFF/FASTQ readers and stored in metadata.
 * KMain sets it from the "accession" option or derives it from the table path. */
53 const char* g_accession = NULL;
/* When true, every generated chunk of file data is also written to stderr. */
54 bool g_dump = false;
/* Set by KMain when "hidden-nogzip" is given; MakeIndexes then also builds
 * the indexes whose file name does not end in ".gz". */
55 bool g_ungzip = false;
56
/* Description and running state of one virtual file being indexed.
 * NOTE: MakeIndexes fills the leading members with a positional initializer,
 * so the member order here must not change. */
57 typedef struct SIndexObj_struct {
/* metadata node of this file (opened by MakeIndexes as "<file>.tmp") */
58 KMDataNode* meta;
/* name of the metadata child node for this file */
59 const char* const file;
/* value written to the "Format" metadata node */
60 const char* const format;
/* name of the table index to create */
61 const char* const index;
/* builder that produces the file data and fills 'li' */
62 rc_t (*func)(const SRATable* sratbl, struct SIndexObj_struct* obj, char* buffer, const size_t buffer_sz);
/* total number of bytes produced so far; also the offset of the next block */
63 uint64_t file_size;
/* largest uncompressed block size seen; written to the "Buffer" node */
64 uint32_t buffer_sz;
/* spot range passed to the readers and stored under Format/Options */
65 uint64_t minSpotId;
66 uint64_t maxSpotId;
/* list of SIndexNode records, one per closed block */
67 SLList li;
/* running MD5 over the produced file data and its final digest */
68 MD5State md5;
69 uint8_t md5_digest[16];
70 } SIndexObj;
71
/* One block of the virtual file, queued for insertion into the index. */
72 typedef struct SIndexNode_struct {
73 SLNode n;
/* file offset at which the block starts */
74 uint64_t key;
/* size of the block in bytes (compressed size for the gzip builders) */
75 uint64_t key_size;
/* id of the first spot in the block */
76 int64_t id;
/* number of reader records (spots) in the block */
77 uint64_t id_qty;
78 } SIndexNode;
79
/* Context passed to InsertIndexData while a list is written to an index. */
80 typedef struct SIndexData_struct {
/* status of the most recent index operation */
81 rc_t rc;
/* index being populated */
82 KIndex* kidx;
83 } SIndexData;
84
85 static
InsertIndexData(SLNode * node,void * data)86 bool InsertIndexData( SLNode *node, void *data )
87 {
88 SIndexNode* n = (SIndexNode*)node;
89 SIndexData* d = (SIndexData*)data;
90
91 d->rc = KIndexInsertU64(d->kidx, true, n->key, n->key_size, n->id, n->id_qty);
92 return d->rc == 0 ? false : true;
93 }
94
95 static
WhackIndexData(SLNode * n,void * data)96 void WhackIndexData( SLNode *n, void *data )
97 {
98 free(n);
99 }
100
101 static
CommitIndex(KTable * ktbl,const char * name,const SLList * li)102 rc_t CommitIndex(KTable* ktbl, const char* name, const SLList* li)
103 {
104 SIndexData data;
105
106 STSMSG(0, ("Saving index %s", name));
107 data.rc = KTableCreateIndex(ktbl, &data.kidx, kitU64, kcmInit, name);
108 if( data.rc == 0 ) {
109 if( !SLListDoUntil(li, InsertIndexData, &data) ) {
110 data.rc = KIndexCommit(data.kidx);
111 }
112 KIndexRelease(data.kidx);
113 }
114 return data.rc;
115 }
116
WriteFileMeta(SIndexObj * obj)117 rc_t WriteFileMeta(SIndexObj* obj)
118 {
119 rc_t rc = 0;
120 KMDataNode* nd = NULL;
121
122 PLOGMSG(klogInfo, (klogInfo, "Meta $(f) on index $(i): file size $(s), buffer $(b)",
123 PLOG_4(PLOG_S(f),PLOG_S(i),PLOG_U64(s),PLOG_U32(b)), obj->file, obj->index, obj->file_size, obj->buffer_sz));
124
125 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Format")) == 0 ) {
126 KMDataNode* opt = NULL;
127 rc = KMDataNodeWriteCString(nd, obj->format);
128 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(nd, &opt, "Options")) == 0 ) {
129 KMDataNode* ond = NULL;
130 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "accession")) == 0 ) {
131 rc = KMDataNodeWriteCString(ond, g_accession);
132 KMDataNodeRelease(ond);
133 }
134 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "minSpotId")) == 0 ) {
135 rc = KMDataNodeWriteB64(ond, &obj->minSpotId);
136 KMDataNodeRelease(ond);
137 }
138 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &ond, "maxSpotId")) == 0 ) {
139 rc = KMDataNodeWriteB64(ond, &obj->maxSpotId);
140 KMDataNodeRelease(ond);
141 }
142 KMDataNodeRelease(opt);
143 }
144 KMDataNodeRelease(nd);
145 }
146
147 if( rc == 0 && obj->file_size > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Size")) == 0 ) {
148 rc = KMDataNodeWriteB64(nd, &obj->file_size);
149 KMDataNodeRelease(nd);
150 }
151
152 if( rc == 0 && obj->buffer_sz > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Buffer")) == 0 ) {
153 rc = KMDataNodeWriteB32(nd, &obj->buffer_sz);
154 KMDataNodeRelease(nd);
155 }
156
157 if( rc == 0 && strlen(obj->index) > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "Index")) == 0 ) {
158 rc = KMDataNodeWriteCString(nd, obj->index);
159 KMDataNodeRelease(nd);
160 }
161
162 if( rc == 0 && obj->file_size > 0 && (rc = KMDataNodeOpenNodeUpdate(obj->meta, &nd, "md5")) == 0 ) {
163 char x[5];
164 int i;
165 for( i = 0; rc == 0 && i < sizeof(obj->md5_digest); i++ ) {
166 int l = snprintf(x, 4, "%02x", obj->md5_digest[i]);
167 rc = KMDataNodeAppend(nd, x, l);
168 }
169 KMDataNodeRelease(nd);
170 }
171 return rc;
172 }
173
174 static
SFF_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)175 rc_t SFF_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
176 {
177 rc_t rc = 0;
178 const SFFReader* reader = NULL;
179
180 if( (rc = SFFReaderMake(&reader, sratbl, g_accession, obj->minSpotId, obj->maxSpotId)) != 0 ) {
181 return rc;
182 } else {
183 size_t written = 0;
184 uint32_t blk = 0;
185 SIndexNode* inode = NULL;
186
187 while( rc == 0 ) {
188 rc = SFFReader_GetNextSpotData(reader, buffer, buffer_sz, &written);
189 if( blk >= g_file_block_sz || (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted) ) {
190 inode->key_size = blk;
191 SLListPushTail(&obj->li, &inode->n);
192 DEBUG_MSG(5, ("SFF index closed spots %lu, offset %lu, block size %lu\n", inode->id_qty, inode->key, inode->key_size));
193 inode = NULL;
194 if( blk > obj->buffer_sz ) {
195 obj->buffer_sz = blk;
196 }
197 blk = 0;
198 }
199 if( GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted ) {
200 rc = 0;
201 break;
202 }
203 if( inode == NULL ) {
204 spotid_t spotid = 0;
205 if( (rc = SFFReaderCurrentSpot(reader, &spotid)) != 0 ) {
206 break;
207 }
208 inode = malloc(sizeof(SIndexNode));
209 if( inode == NULL ) {
210 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
211 break;
212 }
213 inode->key = obj->file_size;
214 inode->key_size = 0;
215 inode->id = spotid;
216 inode->id_qty = 0;
217 DEBUG_MSG(5, ("SFF index opened spot %ld, offset %lu\n", inode->id, inode->key));
218 if( spotid == 1 ) {
219 char hd[10240];
220 size_t hd_sz = 0;
221 if( (rc = SFFReaderHeader(reader, 0, hd, sizeof(hd), &hd_sz)) == 0 ) {
222 obj->file_size += hd_sz;
223 blk += hd_sz;
224 MD5StateAppend(&obj->md5, hd, hd_sz);
225 if( g_dump ) {
226 fwrite(hd, hd_sz, 1, stderr);
227 }
228 }
229 }
230 }
231 obj->file_size += written;
232 blk += written;
233 inode->id_qty++;
234 MD5StateAppend(&obj->md5, buffer, written);
235 if( g_dump ) {
236 fwrite(buffer, written, 1, stderr);
237 }
238 }
239 rc = rc ? rc : Quitting();
240 if( rc != 0 ) {
241 spotid_t spot = 0;
242 SFFReaderCurrentSpot(reader, &spot);
243 PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
244 }
245 }
246 SFFReaderWhack(reader);
247 return rc;
248 }
249
250 static
SFFGzip_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)251 rc_t SFFGzip_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
252 {
253 rc_t rc = 0;
254 uint16_t zlib_ver = ZLIB_VERNUM;
255 const SFFReader* reader = NULL;
256
257 if( (rc = SFFReaderMake(&reader, sratbl, g_accession, obj->minSpotId, obj->maxSpotId)) != 0 ) {
258 return rc;
259 } else {
260 size_t written = 0;
261 uint32_t blk = 0, spots_per_block = 0, proj_id_qty = 0;
262 SIndexNode* inode = NULL;
263 size_t z_blk = 0;
264 size_t spots_buf_sz = g_file_block_sz * 100;
265 size_t zbuf_sz = spots_buf_sz + 100;
266
267 char* zbuf = malloc(zbuf_sz);
268 char* spots_buf = malloc(spots_buf_sz);
269 bool eof = false;
270
271 if( zbuf == NULL || spots_buf == NULL ) {
272 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
273 }
274 while( rc == 0 ) {
275 if( (rc = SFFReader_GetNextSpotData(reader, buffer, buffer_sz, &written)) == 0 ) {
276 if( inode == NULL ) {
277 spotid_t spotid = 0;
278 if( (rc = SFFReaderCurrentSpot(reader, &spotid)) != 0 ) {
279 break;
280 }
281 inode = malloc(sizeof(SIndexNode));
282 if( inode == NULL ) {
283 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
284 break;
285 }
286 inode->key = obj->file_size;
287 inode->key_size = 0;
288 inode->id = spotid;
289 inode->id_qty = 0;
290 DEBUG_MSG(5, ("%s open key: spot %ld, offset %lu\n", obj->index, inode->id, inode->key));
291 if( spotid == 1 ) {
292 char hd[10240];
293 size_t hd_sz = 0;
294 if( (rc = SFFReaderHeader(reader, 0, hd, sizeof(hd), &hd_sz)) == 0 ) {
295 if( hd_sz + written > spots_buf_sz ) {
296 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
297 break;
298 }
299 memmove(&spots_buf[blk], hd, hd_sz);
300 blk += hd_sz;
301 if( g_dump ) {
302 fwrite(hd, hd_sz, 1, stderr);
303 }
304 }
305 }
306
307 }
308 if( blk + written > spots_buf_sz ) {
309 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
310 break;
311 }
312 inode->id_qty++;
313 memmove(&spots_buf[blk], buffer, written);
314 blk += written;
315 if( g_dump ) {
316 fwrite(buffer, written, 1, stderr);
317 }
318 }
319 if( (eof = (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted)) ) {
320 rc = 0;
321 if( inode == NULL ) {
322 break;
323 }
324 }
325 if( rc == 0 && (eof ||
326 (proj_id_qty == 0 && inode->id_qty > (spots_per_block * 0.95)) ||
327 (proj_id_qty > 0 && inode->id_qty >= proj_id_qty) ) ) {
328 rc = ZLib_DeflateBlock(spots_buf, blk, zbuf, zbuf_sz, &z_blk);
329 if( z_blk < g_file_block_sz ) {
330 /* project needed id_qty */
331 proj_id_qty = g_file_block_sz * inode->id_qty / z_blk * 1.05;
332 DEBUG_MSG(5, ("%s: project id qty %lu\n", obj->index, proj_id_qty));
333 } else {
334 DEBUG_MSG(10, ("%s: no projection %lu > %lu\n", obj->index, z_blk, g_file_block_sz));
335 }
336 }
337 if( rc == 0 && (eof || z_blk >= g_file_block_sz) ) {
338 obj->file_size += z_blk;
339 MD5StateAppend(&obj->md5, zbuf, z_blk);
340 inode->key_size = z_blk;
341 SLListPushTail(&obj->li, &inode->n);
342 DEBUG_MSG(5, ("%s close key: spots %lu, size %lu, ratio %hu%%, raw %lu\n",
343 obj->index, inode->id_qty, inode->key_size, (uint16_t)(((float)(blk - z_blk)/blk)*100), blk));
344 spots_per_block = inode->id_qty;
345 inode = NULL;
346 if( blk > obj->buffer_sz ) {
347 obj->buffer_sz = blk;
348 }
349 blk = 0;
350 z_blk = 0;
351 proj_id_qty = 0;
352 }
353 if( eof ) {
354 break;
355 }
356 }
357 rc = rc ? rc : Quitting();
358 if( rc != 0 ) {
359 spotid_t spot = 0;
360 SFFReaderCurrentSpot(reader, &spot);
361 PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
362 }
363 free(zbuf);
364 free(spots_buf);
365 }
366 if( rc == 0 ) {
367 KMDataNode* opt = NULL, *nd = NULL;
368
369 if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
370 return rc;
371 }
372 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "ZlibVersion")) == 0 ) {
373 rc = KMDataNodeWriteB16(nd, &zlib_ver);
374 KMDataNodeRelease(nd);
375 }
376 KMDataNodeRelease(opt);
377 }
378 SFFReaderWhack(reader);
379 return rc;
380 }
381
382 static
Fastq_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)383 rc_t Fastq_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
384 {
385 rc_t rc = 0;
386 const FastqReader* reader = NULL;
387
388 uint8_t colorSpace = false;
389 char* colorSpaceKey = "\0";
390 uint8_t origFormat = false;
391 uint8_t printLabel = true;
392 uint8_t printReadId = true;
393 uint8_t clipQuality = true;
394 uint32_t minReadLen = 0;
395 uint16_t qualityOffset = 0;
396
397 {{
398 const SRAColumn* c = NULL;
399 const uint8_t *platform = SRA_PLATFORM_UNDEFINED;
400 bitsz_t o, z;
401
402 if( (rc = SRATableOpenColumnRead(sratbl, &c, "PLATFORM", sra_platform_id_t)) != 0 ) {
403 return rc;
404 }
405 if( (rc = SRAColumnRead(c, 1, (const void **)&platform, &o, &z)) != 0 ) {
406 return rc;
407 }
408 if( *platform == SRA_PLATFORM_ABSOLID ) {
409 colorSpace = true;
410 }
411 SRAColumnRelease(c);
412 }}
413
414 if( (rc = FastqReaderMake(&reader, sratbl, g_accession,
415 colorSpace, origFormat, false, printLabel, printReadId,
416 !clipQuality, minReadLen, qualityOffset, colorSpaceKey[0],
417 obj->minSpotId, obj->maxSpotId)) != 0 ) {
418 return rc;
419 } else {
420 KMDataNode* opt = NULL, *nd = NULL;
421
422 if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
423 return rc;
424 }
425 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpace")) == 0 ) {
426 rc = KMDataNodeWriteB8(nd, &colorSpace);
427 KMDataNodeRelease(nd);
428 }
429 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpaceKey")) == 0 ) {
430 rc = KMDataNodeWrite(nd, colorSpaceKey, 1);
431 KMDataNodeRelease(nd);
432 }
433 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "origFormat")) == 0 ) {
434 rc = KMDataNodeWriteB8(nd, &origFormat);
435 KMDataNodeRelease(nd);
436 }
437 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printLabel")) == 0 ) {
438 rc = KMDataNodeWriteB8(nd, &printLabel);
439 KMDataNodeRelease(nd);
440 }
441 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printReadId")) == 0 ) {
442 rc = KMDataNodeWriteB8(nd, &printReadId);
443 KMDataNodeRelease(nd);
444 }
445 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "clipQuality")) == 0 ) {
446 rc = KMDataNodeWriteB8(nd, &clipQuality);
447 KMDataNodeRelease(nd);
448 }
449 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "minReadLen")) == 0 ) {
450 rc = KMDataNodeWriteB32(nd, &minReadLen);
451 KMDataNodeRelease(nd);
452 }
453 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "qualityOffset")) == 0 ) {
454 rc = KMDataNodeWriteB16(nd, &qualityOffset);
455 KMDataNodeRelease(nd);
456 }
457 KMDataNodeRelease(opt);
458 }
459
460 if( rc == 0 ) {
461 size_t written = 0;
462 uint32_t blk = 0;
463 SIndexNode* inode = NULL;
464
465 while( rc == 0 ) {
466 rc = FastqReader_GetNextSpotSplitData(reader, buffer, buffer_sz, &written);
467 if( blk >= g_file_block_sz || (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted) ) {
468 inode->key_size = blk;
469 SLListPushTail(&obj->li, &inode->n);
470 DEBUG_MSG(5, ("Fastq index closed spots %lu, offset %lu, block size %lu\n",
471 inode->id_qty, inode->key, inode->key_size));
472 inode = NULL;
473 if( blk > obj->buffer_sz ) {
474 obj->buffer_sz = blk;
475 }
476 blk = 0;
477 }
478 if( GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted ) {
479 rc = 0;
480 break;
481 }
482 if( inode == NULL ) {
483 spotid_t spotid = 0;
484 if( (rc = FastqReaderCurrentSpot(reader, &spotid)) != 0 ) {
485 break;
486 }
487 inode = malloc(sizeof(SIndexNode));
488 if( inode == NULL ) {
489 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
490 break;
491 }
492 inode->key = obj->file_size;
493 inode->key_size = 0;
494 inode->id = spotid;
495 inode->id_qty = 0;
496 DEBUG_MSG(5, ("Fastq index opened spot %ld, offset %lu\n", inode->id, inode->key));
497 }
498 inode->id_qty++;
499 obj->file_size += written;
500 blk += written;
501 MD5StateAppend(&obj->md5, buffer, written);
502 if( g_dump ) {
503 fwrite(buffer, written, 1, stderr);
504 }
505 }
506 rc = rc ? rc : Quitting();
507 if( rc != 0 ) {
508 spotid_t spot = 0;
509 FastqReaderCurrentSpot(reader, &spot);
510 PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
511 }
512 }
513 FastqReaderWhack(reader);
514 return rc;
515 }
516
517 static
FastqGzip_Idx(const SRATable * sratbl,SIndexObj * obj,char * buffer,const size_t buffer_sz)518 rc_t FastqGzip_Idx(const SRATable* sratbl, SIndexObj* obj, char* buffer, const size_t buffer_sz)
519 {
520 rc_t rc = 0;
521 const FastqReader* reader = NULL;
522
523 uint16_t zlib_ver = ZLIB_VERNUM;
524 uint8_t colorSpace = false;
525 char* colorSpaceKey = "\0";
526 uint8_t origFormat = false;
527 uint8_t printLabel = true;
528 uint8_t printReadId = true;
529 uint8_t clipQuality = true;
530 uint32_t minReadLen = 0;
531 uint16_t qualityOffset = 0;
532
533 {{
534 const SRAColumn* c = NULL;
535 const uint8_t *platform = SRA_PLATFORM_UNDEFINED;
536 bitsz_t o, z;
537
538 if( (rc = SRATableOpenColumnRead(sratbl, &c, "PLATFORM", sra_platform_id_t)) != 0 ) {
539 return rc;
540 }
541 if( (rc = SRAColumnRead(c, 1, (const void **)&platform, &o, &z)) != 0 ) {
542 return rc;
543 }
544 if( *platform == SRA_PLATFORM_ABSOLID ) {
545 colorSpace = true;
546 }
547 SRAColumnRelease(c);
548 }}
549
550 if( (rc = FastqReaderMake(&reader, sratbl, g_accession,
551 colorSpace, origFormat, false, printLabel, printReadId,
552 !clipQuality, minReadLen, qualityOffset, colorSpaceKey[0],
553 obj->minSpotId, obj->maxSpotId)) != 0 ) {
554 return rc;
555 } else {
556 size_t written = 0;
557 uint32_t blk = 0, spots_per_block = 0, proj_id_qty = 0;
558 SIndexNode* inode = NULL;
559 size_t z_blk = 0;
560 size_t spots_buf_sz = g_file_block_sz * 100;
561 size_t zbuf_sz = spots_buf_sz + 100;
562 char* zbuf = malloc(zbuf_sz);
563 char* spots_buf = malloc(spots_buf_sz);
564 bool eof = false;
565
566 if( zbuf == NULL || spots_buf == NULL ) {
567 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
568 }
569 while( rc == 0 ) {
570 if( (rc = FastqReader_GetNextSpotSplitData(reader, buffer, buffer_sz, &written)) == 0 ) {
571 if( inode == NULL ) {
572 spotid_t spotid = 0;
573 if( (rc = FastqReaderCurrentSpot(reader, &spotid)) != 0 ) {
574 break;
575 }
576 inode = malloc(sizeof(SIndexNode));
577 if( inode == NULL ) {
578 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
579 break;
580 }
581 inode->key = obj->file_size;
582 inode->key_size = 0;
583 inode->id = spotid;
584 inode->id_qty = 0;
585 DEBUG_MSG(5, ("%s open key: spot %ld, offset %lu\n", obj->index, inode->id, inode->key));
586 }
587 if( blk + written > spots_buf_sz ) {
588 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcInsufficient);
589 break;
590 }
591 inode->id_qty++;
592 memmove(&spots_buf[blk], buffer, written);
593 blk += written;
594 if( g_dump ) {
595 fwrite(buffer, written, 1, stderr);
596 }
597 }
598 if( (eof = (GetRCObject(rc) == rcRow && GetRCState(rc) == rcExhausted)) ) {
599 rc = 0;
600 if( inode == NULL ) {
601 break;
602 }
603 }
604 if( rc == 0 && (eof ||
605 (proj_id_qty == 0 && inode->id_qty > (spots_per_block * 0.95)) ||
606 (proj_id_qty > 0 && inode->id_qty >= proj_id_qty) ) ) {
607 rc = ZLib_DeflateBlock(spots_buf, blk, zbuf, zbuf_sz, &z_blk);
608 if( z_blk < g_file_block_sz ) {
609 /* project needed id_qty */
610 proj_id_qty = g_file_block_sz * inode->id_qty / z_blk * 1.05;
611 DEBUG_MSG(5, ("%s: project id qty %u\n", obj->index, proj_id_qty));
612 } else {
613 DEBUG_MSG(10, ("%s: no projection %u > %u\n", obj->index, z_blk, g_file_block_sz));
614 }
615 }
616 if( rc == 0 && (eof || z_blk >= g_file_block_sz) ) {
617 obj->file_size += z_blk;
618 MD5StateAppend(&obj->md5, zbuf, z_blk);
619 inode->key_size = z_blk;
620 SLListPushTail(&obj->li, &inode->n);
621 DEBUG_MSG(5, ("%s close key: spots %lu, size %lu, ratio %hu%%, raw %u\n",
622 obj->index, inode->id_qty, inode->key_size, (uint16_t)(((float)(blk - z_blk)/blk)*100), blk ));
623 spots_per_block = inode->id_qty;
624 inode = NULL;
625 if( blk > obj->buffer_sz ) {
626 obj->buffer_sz = blk;
627 }
628 blk = 0;
629 z_blk = 0;
630 proj_id_qty = 0;
631 }
632 if( eof ) {
633 break;
634 }
635 }
636 rc = rc ? rc : Quitting();
637 if( rc != 0 ) {
638 spotid_t spot = 0;
639 FastqReaderCurrentSpot(reader, &spot);
640 PLOGERR(klogErr, (klogErr, rc, "spot $(s)", PLOG_U32(s), spot));
641 }
642 free(zbuf);
643 free(spots_buf);
644 }
645 if( rc == 0 ) {
646 KMDataNode* opt = NULL, *nd = NULL;
647
648 if( (rc = KMDataNodeOpenNodeUpdate(obj->meta, &opt, "Format/Options")) != 0 ) {
649 return rc;
650 }
651 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "ZlibVersion")) == 0 ) {
652 rc = KMDataNodeWriteB16(nd, &zlib_ver);
653 KMDataNodeRelease(nd);
654 }
655 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpace")) == 0 ) {
656 rc = KMDataNodeWriteB8(nd, &colorSpace);
657 KMDataNodeRelease(nd);
658 }
659 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "colorSpaceKey")) == 0 ) {
660 rc = KMDataNodeWrite(nd, colorSpaceKey, 1);
661 KMDataNodeRelease(nd);
662 }
663 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "origFormat")) == 0 ) {
664 rc = KMDataNodeWriteB8(nd, &origFormat);
665 KMDataNodeRelease(nd);
666 }
667 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printLabel")) == 0 ) {
668 rc = KMDataNodeWriteB8(nd, &printLabel);
669 KMDataNodeRelease(nd);
670 }
671 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "printReadId")) == 0 ) {
672 rc = KMDataNodeWriteB8(nd, &printReadId);
673 KMDataNodeRelease(nd);
674 }
675 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "clipQuality")) == 0 ) {
676 rc = KMDataNodeWriteB8(nd, &clipQuality);
677 KMDataNodeRelease(nd);
678 }
679 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "minReadLen")) == 0 ) {
680 rc = KMDataNodeWriteB32(nd, &minReadLen);
681 KMDataNodeRelease(nd);
682 }
683 if( rc == 0 && (rc = KMDataNodeOpenNodeUpdate(opt, &nd, "qualityOffset")) == 0 ) {
684 rc = KMDataNodeWriteB16(nd, &qualityOffset);
685 KMDataNodeRelease(nd);
686 }
687 KMDataNodeRelease(opt);
688 }
689 FastqReaderWhack(reader);
690 return rc;
691 }
692
693 static
MakeIndexes(const SRATable * stbl,KTable * ktbl,KMetadata * meta)694 rc_t MakeIndexes(const SRATable* stbl, KTable* ktbl, KMetadata* meta)
695 {
696 rc_t rc = 0;
697 int i;
698 char* buffer = NULL;
699 size_t buffer_sz = g_file_block_sz * 100;
700
701 SIndexObj idx[] = {
702 /* meta, file, format, index, func, file_size, buffer_sz, minSpotId, maxSpotId */
703 {NULL, "fastq", "fastq", "fuse-fastq", Fastq_Idx, 0, 0, 0, 0},
704 {NULL, "sff", "SFF", "fuse-sff", SFF_Idx, 0, 0, 0, 0},
705 {NULL, "fastq.gz", "fastq-gzip", "fuse-fastq-gz", FastqGzip_Idx, 0, 0, 0, 0},
706 {NULL, "sff.gz", "SFF-gzip", "fuse-sff-gz", SFFGzip_Idx, 0, 0, 0, 0}
707 };
708
709 for(i = 0; rc == 0 && i < sizeof(idx) / sizeof(idx[0]); i++) {
710 KMDataNode* parent = NULL;
711 if( (rc = KMetadataOpenNodeUpdate(meta, &parent, "/FUSE")) == 0 ) {
712 KMDataNodeDropChild(parent, "root"); /* drop old stuff */
713 if( g_ungzip || strcmp(&idx[i].file[strlen(idx[i].file) - 3], ".gz") == 0 ) {
714 STSMSG(0, ("Preparing index %s", idx[i].index));
715 MD5StateInit(&idx[i].md5);
716 SLListInit(&idx[i].li);
717 KMDataNodeDropChild(parent, "%s.tmp", idx[i].file);
718 if( (rc = KMDataNodeOpenNodeUpdate(parent, &idx[i].meta, "%s.tmp", idx[i].file)) == 0 ) {
719 if( idx[i].func != NULL ) {
720 if( buffer == NULL ) {
721 if( (buffer = malloc(buffer_sz)) == NULL ) {
722 rc = RC(rcExe, rcIndex, rcConstructing, rcMemory, rcExhausted);
723 break;
724 }
725 }
726 rc = idx[i].func(stbl, &idx[i], buffer, buffer_sz);
727 if( rc == 0 ) {
728 MD5StateFinish(&idx[i].md5, idx[i].md5_digest);
729 rc = CommitIndex(ktbl, idx[i].index, &idx[i].li);
730 }
731 }
732 if( rc == 0 ) {
733 rc = WriteFileMeta(&idx[i]);
734 }
735 KMDataNodeRelease(idx[i].meta);
736 }
737 if( GetRCState(rc) == rcUnsupported ) {
738 KMDataNodeDropChild(parent, "%s", idx[i].file);
739 PLOGERR(klogWarn, (klogWarn, rc, "Index $(i) is not supported for this table", PLOG_S(i), idx[i].index));
740 rc = 0;
741 } else if( rc == 0 ) {
742 char f[4096];
743 strcpy(f, idx[i].file);
744 strcat(f, ".tmp");
745 KMDataNodeDropChild(parent, "%s", idx[i].file);
746 rc = KMDataNodeRenameChild(parent, f, idx[i].file);
747 }
748 } else if( !g_ungzip ) {
749 KTableDropIndex(ktbl, idx[i].index);
750 KMDataNodeDropChild(parent, "%s", idx[i].file);
751 }
752 KMDataNodeDropChild(parent, "%s.tmp", idx[i].file);
753 KMDataNodeRelease(parent);
754 }
755 SLListWhack(&idx[i].li, WhackIndexData, NULL);
756 }
757 free(buffer);
758 return rc;
759 }
760
/* Help texts for the documented options; NULL-terminated line arrays. */
761 const char* blocksize_usage[] = {"Index block size", NULL};
762 const char* accession_usage[] = {"Accession", NULL};
763
764 /* this enum must have same order as MainArgs array below */
765 enum OptDefIndex {
766 eopt_BlockSize = 0,
767 eopt_Accession,
768 eopt_DumpIndex,
769 eopt_noGzip
770 };
771
/* Command-line option table passed to ArgsMakeAndHandle by KMain.
 * The two "hidden-" options have no help text, so Usage does not list them.
 * NOTE(review): the meaning of each positional OptDef field is declared in
 * kapp/args.h, which is not visible here. */
772 OptDef MainArgs[] =
773 {
774 /* if you change order in this array, rearrange enum above accordingly! */
775 {"block-size", "b", NULL, blocksize_usage, 1, true, false},
776 {"accession", "a", NULL, accession_usage, 1, true, false},
777 {"hidden-dump", "d", NULL, NULL, 1, false, false},
778 {"hidden-nogzip", "g", NULL, NULL, 1, false, false}
779 };
/* Parameter names shown by Usage for each option, indexed like MainArgs. */
780 const char* MainParams[] =
781 {
782 /* if you change order in this array, rearrange enum above accordingly! */
783 "size",
784 "accession",
785 NULL,
786 NULL
787 };
/* Number of entries in MainArgs. */
788 const size_t MainArgsQty = sizeof(MainArgs) / sizeof(MainArgs[0]);
789
/* Program name used by Usage when none can be obtained from the arguments. */
790 const char UsageDefaultName[] = "sra-makeidx";
791
UsageSummary(const char * name)792 rc_t CC UsageSummary (const char * name)
793 {
794 return 0;
795 }
796
Usage(const Args * args)797 rc_t CC Usage(const Args* args)
798 {
799 const char * progname = UsageDefaultName;
800 const char * fullpath = UsageDefaultName;
801 rc_t rc;
802 int i;
803
804 if (args == NULL)
805 rc = RC (rcApp, rcArgv, rcAccessing, rcSelf, rcNull);
806 else
807 rc = ArgsProgram (args, &fullpath, &progname);
808
809 OUTMSG(( "\nUsage:\n\t%s [options] <table>\n\n", progname));
810
811 for(i = 0; i < MainArgsQty; i++ ) {
812 if( MainArgs[i].required && MainArgs[i].help ) {
813 HelpOptionLine(MainArgs[i].aliases, MainArgs[i].name, MainParams[i], MainArgs[i].help);
814 }
815 }
816 OUTMSG(("\nOptions:\n"));
817 for(i = 0; i < MainArgsQty; i++ ) {
818 if( !MainArgs[i].required && MainArgs[i].help ) {
819 HelpOptionLine(MainArgs[i].aliases, MainArgs[i].name, MainParams[i], MainArgs[i].help);
820 }
821 }
822 OUTMSG(("\n"));
823 HelpOptionsStandard();
824 HelpVersion(fullpath, KAppVersion());
825 return rc;
826 }
KMain(int argc,char * argv[])827 rc_t KMain(int argc, char *argv[])
828 {
829 rc_t rc = 0;
830 Args* args = NULL;
831 const char* errmsg = NULL, *table_dir = NULL;
832 char accn[1024];
833
834 if( (rc = ArgsMakeAndHandle(&args, argc, argv, 1, MainArgs, MainArgsQty)) == 0 ) {
835 const char* blksz = NULL;
836 uint32_t count, dump = 0, gzip = 0;
837
838 if( (rc = ArgsParamCount(args, &count)) != 0 || count != 1 ) {
839 rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, count > 1 ? rcExcessive : rcInsufficient);
840 errmsg = "table";
841
842 } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_BlockSize].name, &count)) != 0 || count > 1 ) {
843 rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, rcExcessive);
844 errmsg = MainArgs[eopt_BlockSize].name;
845 } else if( count > 0 && (rc = ArgsOptionValue(args, MainArgs[eopt_BlockSize].name, 0, (const void **)&blksz)) != 0 ) {
846 errmsg = MainArgs[eopt_BlockSize].name;
847
848 } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_Accession].name, &count)) != 0 || count > 1 ) {
849 rc = rc ? rc : RC(rcExe, rcArgv, rcParsing, rcParam, rcExcessive);
850 errmsg = MainArgs[eopt_Accession].name;
851 } else if( count > 0 && (rc = ArgsOptionValue(args, MainArgs[eopt_Accession].name, 0, (const void **)&g_accession)) != 0 ) {
852 errmsg = MainArgs[eopt_Accession].name;
853
854 } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_DumpIndex].name, &dump)) != 0 ) {
855 errmsg = MainArgs[eopt_DumpIndex].name;
856
857 } else if( (rc = ArgsOptionCount(args, MainArgs[eopt_noGzip].name, &gzip)) != 0 ) {
858 errmsg = MainArgs[eopt_noGzip].name;
859 }
860 while( rc == 0 ) {
861 long val = 0;
862 char* end = NULL;
863
864 if( blksz != NULL ) {
865 errno = 0;
866 val = strtol(blksz, &end, 10);
867 if( errno != 0 || blksz == end || *end != '\0' || val <= 0 ) {
868 rc = RC(rcExe, rcArgv, rcReading, rcParam, rcInvalid);
869 errmsg = MainArgs[eopt_BlockSize].name;
870 break;
871 } else if( val <= 128 || val > (1024 * 1024 * 1024) ) {
872 rc = RC(rcExe, rcArgv, rcValidating, rcParam, rcEmpty);
873 errmsg = "block size invalid";
874 break;
875 }
876 g_file_block_sz = val;
877 }
878 if( (rc = ArgsParamValue(args, 0, (const void **)&table_dir)) != 0 ) {
879 errmsg = "table";
880 break;
881 }
882 if( g_accession == NULL ) {
883 const char* p = strchr(table_dir, '/');
884 size_t l = 0;
885
886 g_accession = accn;
887 if( p == NULL ) {
888 p = strchr(table_dir, '\\');
889 }
890 strncpy(accn, p == NULL ? table_dir : p + 1, sizeof(accn) - 1);
891 if( accn[0] == '\0' ) {
892 rc = RC(rcExe, rcArgv, rcValidating, rcParam, rcEmpty);
893 errmsg = "accession";
894 }
895 l = strlen(accn);
896 if( accn[l - 1] == '/' || accn[l - 1] == '\\') {
897 accn[--l] = '\0';
898 }
899 if( strncmp(&accn[l - 9], ".lite.sra", 9) == 0 ) {
900 accn[l - 9] = '\0';
901 } else if( strncmp(&accn[l - 4], ".sra", 4) == 0 ) {
902 accn[l - 4] = '\0';
903 }
904 }
905 g_dump = dump > 0;
906 g_ungzip = gzip > 0;
907 break;
908 }
909 }
910 if( rc == 0 ) {
911 SRAMgr* smgr = NULL;
912 KDBManager* kmgr = NULL;
913
914 DEBUG_MSG(5, ("table %s, accession %s\n", table_dir, g_accession));
915 if( (rc = SRAMgrMakeUpdate(&smgr, NULL)) == 0 ) {
916 if( (rc = KDBManagerMakeUpdate(&kmgr, NULL)) == 0 ) {
917 bool relock = true;
918 if( (rc = KDBManagerUnlock(kmgr, table_dir)) != 0 ) {
919 relock = false;
920 rc = GetRCState(rc) == rcUnlocked ? 0 : rc;
921 } else {
922 PLOGMSG(klogInfo, (klogInfo, "Table $(p) locked, unlocking", PLOG_S(p), table_dir));
923 }
924 if( rc == 0 ) {
925 KTable* ktbl = NULL;
926 if( (rc = KDBManagerOpenTableUpdate(kmgr, &ktbl, table_dir)) == 0 ) {
927 KMetadata* meta = NULL;
928 if( (rc = KTableOpenMetadataUpdate(ktbl, &meta)) == 0 ) {
929 const SRATable* stbl = NULL;
930 if( (rc = SRAMgrOpenTableRead(smgr, &stbl, table_dir)) == 0 ) {
931 rc = MakeIndexes(stbl, ktbl, meta);
932 SRATableRelease(stbl);
933 }
934 }
935 KMetadataRelease(meta);
936 }
937 KTableRelease(ktbl);
938 }
939 if( rc == 0 && relock ) {
940 rc = KDBManagerLock(kmgr, table_dir);
941 }
942 KDBManagerRelease(kmgr);
943 }
944 SRAMgrRelease(smgr);
945 }
946 }
947 if( rc != 0 && rc != KLogLastErrorCode() ) {
948 if( errmsg ) {
949 Usage(args);
950 }
951 LOGERR(klogErr, rc, errmsg ? errmsg : "stop");
952 }
953 ArgsWhack(args);
954 return rc;
955 }
956