1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "cmn_iter.h"
28 #include "sorter.h"
29 #include "merge_sorter.h"
30 #include "join.h"
31 #include "tbl_join.h"
32 #include "concatenator.h"
33 #include "cleanup_task.h"
34 #include "lookup_reader.h"
35 #include "raw_read_iter.h"
36 #include "temp_dir.h"
37
38 #include <kapp/main.h>
39 #include <kapp/args.h>
40
41 #include <kfg/config.h> /* KConfigSetNgcFile */
42
43 #include <klib/out.h>
44 #include <klib/printf.h>
45 #include <search/grep.h>
46 #include <kfs/directory.h>
47 #include <kproc/procmgr.h>
48 #include <vdb/manager.h>
49
50 #include <stdio.h>
51 #include <os-native.h>
52 #include <sysalloc.h>
53
54 /* ---------------------------------------------------------------------------------- */
55
/* Command-line options of the tool.
   For each option there is:
     - a NULL-terminated array of help-text lines ( xxx_usage )
     - the long option-name ( OPTION_xxx )
     - optionally a one-character alias ( ALIAS_xxx )
   These are put together in the ToolOptions-table below. */

static const char * format_usage[] = { "format (special, fastq, lookup, default=special)", NULL };
#define OPTION_FORMAT "format"
#define ALIAS_FORMAT "F"

static const char * outputf_usage[] = { "output-file", NULL };
#define OPTION_OUTPUT_F "outfile"
#define ALIAS_OUTPUT_F "o"

static const char * outputd_usage[] = { "output-dir", NULL };
#define OPTION_OUTPUT_D "outdir"
#define ALIAS_OUTPUT_D "O"

static const char * progress_usage[] = { "show progress", NULL };
#define OPTION_PROGRESS "progress"
#define ALIAS_PROGRESS "p"

static const char * bufsize_usage[] = { "size of file-buffer dflt=1MB", NULL };
#define OPTION_BUFSIZE "bufsize"
#define ALIAS_BUFSIZE "b"

/* NOTE(review): the help-text says 10MB, but DFLT_CUR_CACHE further down in this file
   is 5MB - confirm which value is intended */
static const char * curcache_usage[] = { "size of cursor-cache dflt=10MB", NULL };
#define OPTION_CURCACHE "curcache"
#define ALIAS_CURCACHE "c"

/* NOTE(review): the help-text says 100MB, but DFLT_MEM_LIMIT further down in this file
   is 50MB - confirm which value is intended */
static const char * mem_usage[] = { "memory limit for sorting dflt=100MB", NULL };
#define OPTION_MEM "mem"
#define ALIAS_MEM "m"

static const char * temp_usage[] = { "where to put temp. files dflt=curr dir", NULL };
#define OPTION_TEMP "temp"
#define ALIAS_TEMP "t"

static const char * threads_usage[] = { "how many thread dflt=6", NULL };
#define OPTION_THREADS "threads"
#define ALIAS_THREADS "e"

static const char * detail_usage[] = { "print details", NULL };
#define OPTION_DETAILS "details"
#define ALIAS_DETAILS "x"

static const char * split_spot_usage[] = { "split spots into reads", NULL };
#define OPTION_SPLIT_SPOT "split-spot"
#define ALIAS_SPLIT_SPOT "s"

static const char * split_file_usage[] = { "write reads into different files", NULL };
#define OPTION_SPLIT_FILE "split-files"
#define ALIAS_SPLIT_FILE "S"

static const char * split_3_usage[] = { "writes single reads in special file", NULL };
#define OPTION_SPLIT_3 "split-3"
#define ALIAS_SPLIT_3 "3"

/* no alias for this option */
static const char * whole_spot_usage[] = { "writes whole spots into one file", NULL };
#define OPTION_WHOLE_SPOT "concatenate-reads"

static const char * stdout_usage[] = { "print output to stdout", NULL };
#define OPTION_STDOUT "stdout"
#define ALIAS_STDOUT "Z"

/* the following options are currently disabled ( see also the ToolOptions-table )
static const char * gzip_usage[] = { "compress output using gzip", NULL };
#define OPTION_GZIP "gzip"
#define ALIAS_GZIP "g"

static const char * bzip2_usage[] = { "compress output using bzip2", NULL };
#define OPTION_BZIP2 "bzip2"
#define ALIAS_BZIP2 "z"

static const char * maxfd_usage[] = { "maximal number of file-descriptors", NULL };
#define OPTION_MAXFD "maxfd"
#define ALIAS_MAXFD "a"
*/

static const char * force_usage[] = { "force to overwrite existing file(s)", NULL };
#define OPTION_FORCE "force"
#define ALIAS_FORCE "f"

static const char * ridn_usage[] = { "use row-id as name", NULL };
#define OPTION_RIDN "rowid-as-name"
#define ALIAS_RIDN "N"

/* no alias for this option */
static const char * skip_tech_usage[] = { "skip technical reads", NULL };
#define OPTION_SKIP_TECH "skip-technical"

/* no alias for this option */
static const char * incl_tech_usage[] = { "include technical reads", NULL };
#define OPTION_INCL_TECH "include-technical"

static const char * print_read_nr[] = { "print read-numbers", NULL };
#define OPTION_PRNR "print-read-nr"
#define ALIAS_PRNR "P"

static const char * min_rl_usage[] = { "filter by sequence-len", NULL };
#define OPTION_MINRDLEN "min-read-len"
#define ALIAS_MINRDLEN "M"

static const char * base_flt_usage[] = { "filter by bases", NULL };
#define OPTION_BASE_FLT "bases"
#define ALIAS_BASE_FLT "B"

/* no alias for this option */
static const char * table_usage[] = { "which seq-table to use in case of pacbio", NULL };
#define OPTION_TABLE "table"

/* no alias for this option */
static const char * strict_usage[] = { "terminate on invalid read", NULL };
#define OPTION_STRICT "strict"

static const char * append_usage[] = { "append to output-file", NULL };
#define OPTION_APPEND "append"
#define ALIAS_APPEND "A"

/* no alias for this option */
static const char * ngc_usage[] = { "PATH to ngc file", NULL };
#define OPTION_NGC "ngc"
167
168 /* ---------------------------------------------------------------------------------- */
169
/* The table of all options the tool accepts.
   Usage() reads the members name, aliases and help of each entry.
   NOTE(review): the remaining columns are presumably
   ( help-generator, max-count, needs-value, required ) as declared in kapp/args.h - confirm.
   Usage() starts printing at index 1, so the first entry ( format ) is not shown in the help. */
OptDef ToolOptions[] =
{
    { OPTION_FORMAT,    ALIAS_FORMAT,    NULL, format_usage,     1, true,  false },
    { OPTION_OUTPUT_F,  ALIAS_OUTPUT_F,  NULL, outputf_usage,    1, true,  false },
    { OPTION_OUTPUT_D,  ALIAS_OUTPUT_D,  NULL, outputd_usage,    1, true,  false },
    { OPTION_BUFSIZE,   ALIAS_BUFSIZE,   NULL, bufsize_usage,    1, true,  false },
    { OPTION_CURCACHE,  ALIAS_CURCACHE,  NULL, curcache_usage,   1, true,  false },
    { OPTION_MEM,       ALIAS_MEM,       NULL, mem_usage,        1, true,  false },
    { OPTION_TEMP,      ALIAS_TEMP,      NULL, temp_usage,       1, true,  false },
    { OPTION_THREADS,   ALIAS_THREADS,   NULL, threads_usage,    1, true,  false },
    { OPTION_PROGRESS,  ALIAS_PROGRESS,  NULL, progress_usage,   1, false, false },
    { OPTION_DETAILS,   ALIAS_DETAILS,   NULL, detail_usage,     1, false, false },
    { OPTION_SPLIT_SPOT,ALIAS_SPLIT_SPOT,NULL, split_spot_usage, 1, false, false },
    { OPTION_SPLIT_FILE,ALIAS_SPLIT_FILE,NULL, split_file_usage, 1, false, false },
    { OPTION_SPLIT_3,   ALIAS_SPLIT_3,   NULL, split_3_usage,    1, false, false },
    { OPTION_WHOLE_SPOT, NULL,           NULL, whole_spot_usage, 1, false, false },
    { OPTION_STDOUT,    ALIAS_STDOUT,    NULL, stdout_usage,     1, false, false },
    /* { OPTION_GZIP, ALIAS_GZIP, NULL, gzip_usage, 1, false, false }, */
    /* { OPTION_BZIP2, ALIAS_BZIP2, NULL, bzip2_usage, 1, false, false }, */
    /* { OPTION_MAXFD, ALIAS_MAXFD, NULL, maxfd_usage, 1, true, false }, */
    { OPTION_FORCE,     ALIAS_FORCE,     NULL, force_usage,      1, false, false },
    { OPTION_RIDN,      ALIAS_RIDN,      NULL, ridn_usage,       1, false, false },
    { OPTION_SKIP_TECH, NULL,            NULL, skip_tech_usage,  1, false, false },
    { OPTION_INCL_TECH, NULL,            NULL, incl_tech_usage,  1, false, false },
    { OPTION_PRNR,      ALIAS_PRNR,      NULL, print_read_nr,    1, false, false },
    { OPTION_MINRDLEN,  ALIAS_MINRDLEN,  NULL, min_rl_usage,     1, true,  false },
    { OPTION_TABLE,     NULL,            NULL, table_usage,      1, true,  false },
    { OPTION_STRICT,    NULL,            NULL, strict_usage,     1, false, false },
    { OPTION_BASE_FLT,  ALIAS_BASE_FLT,  NULL, base_flt_usage,   10, true, false },
    { OPTION_APPEND,    ALIAS_APPEND,    NULL, append_usage,     1, false, false },
    { OPTION_NGC,       NULL,            NULL, ngc_usage,        1, true,  false },
};
202
203 /* ----------------------------------------------------------------------------------- */
204
/* program-name used by Usage() if the name cannot be taken from the arguments */
const char UsageDefaultName[] = "fasterq-dump";
206
207 /* ----------------------------------------------------------------------------------- */
208
UsageSummary(const char * progname)209 rc_t CC UsageSummary( const char * progname )
210 {
211 return KOutMsg( "\n"
212 "Usage:\n"
213 " %s <path> [options]\n"
214 "\n", progname );
215 }
216
217 /* ----------------------------------------------------------------------------------- */
218
Usage(const Args * args)219 rc_t CC Usage ( const Args * args )
220 {
221 rc_t rc;
222 uint32_t idx, count = ( sizeof ToolOptions ) / ( sizeof ToolOptions[ 0 ] );
223 const char * progname = UsageDefaultName;
224 const char * fullpath = UsageDefaultName;
225
226 if ( NULL == args )
227 {
228 rc = RC( rcApp, rcArgv, rcAccessing, rcSelf, rcNull );
229 }
230 else
231 {
232 rc = ArgsProgram( args, &fullpath, &progname );
233 }
234
235 if ( 0 != rc )
236 {
237 progname = fullpath = UsageDefaultName;
238 }
239
240 UsageSummary( progname );
241
242 KOutMsg( "Options:\n" );
243 for ( idx = 1; idx < count; ++idx ) /* start with 1, do not advertize row-range-option*/
244 {
245 const OptDef * opt = &ToolOptions[ idx ];
246 const char * param = NULL;
247
248 assert( opt );
249 if ( 0 == strcmp( opt -> name, OPTION_NGC ) )
250 {
251 param = "PATH";
252 }
253 HelpOptionLine( opt -> aliases, opt -> name, param, opt -> help );
254 }
255
256 KOutMsg("\n");
257 HelpOptionsStandard();
258 HelpVersion( fullpath, KAppVersion() );
259 return rc;
260 }
261
262 /* -------------------------------------------------------------------------------------------- */
263
/* size of the path-buffers embedded in tool_ctx_t */
#define DFLT_PATH_LEN 4096

/* The context of one run of the tool: the user-input, the environment and
   the objects derived from both. It is filled by populate_tool_ctx(). */
typedef struct tool_ctx_t
{
    KDirectory * dir;   /* set via KDirectoryNativeDir() in get_environment() */
    const VDBManager * vdb_mgr; /* created, but unused to avoid race-condition in threads */

    const char * requested_temp_path;   /* value of the temp-option ( NULL is passed as default ) */
    const char * accession_path;        /* the first positional parameter */
    const char * accession_short;       /* result of extract_acc2() / extract_acc() on accession_path */
    const char * output_filename;       /* value of the outfile-option, or pointing to dflt_output */
    const char * output_dirname;        /* value of the outdir-option ( NULL is passed as default ) */
    const char * seq_tbl_name;          /* value of the table-option, default is "SEQUENCE" */

    struct temp_dir * temp_dir; /* temp_dir.h */

    char lookup_filename[ DFLT_PATH_LEN ];  /* filled by generate_lookup_filename() */
    char index_filename[ DFLT_PATH_LEN ];   /* lookup_filename with ".idx" appended */
    char dflt_output[ DFLT_PATH_LEN ];      /* buffer for an output-filename composed by the tool */

    struct KFastDumpCleanupTask * cleanup_task; /* cleanup_task.h */

    size_t cursor_cache, buf_size, mem_limit;   /* values of the curcache-, bufsize- and mem-option */

    uint32_t num_threads /*, max_fds */;
    uint64_t total_ram;     /* filled by KAppGetTotalRam() in get_environment() */

    format_t fmt; /* helper.h */

    compress_t compress; /* helper.h */

    bool force, show_progress, show_details, append, use_stdout;

    join_options join_options; /* helper.h */
} tool_ctx_t;
299
300 /* taken form libs/kapp/main-priv.h */
301 rc_t KAppGetTotalRam ( uint64_t * totalRam );
302
get_environment(tool_ctx_t * tool_ctx)303 static rc_t get_environment( tool_ctx_t * tool_ctx )
304 {
305 rc_t rc = KAppGetTotalRam ( &( tool_ctx -> total_ram ) );
306 if ( 0 != rc )
307 {
308 ErrMsg( "KAppGetTotalRam() -> %R", rc );
309 }
310 else
311 {
312 rc = KDirectoryNativeDir( &( tool_ctx -> dir ) );
313 if ( 0 != rc )
314 {
315 ErrMsg( "KDirectoryNativeDir() -> %R", rc );
316 }
317 }
318 return rc;
319 }
320
show_details(tool_ctx_t * tool_ctx)321 static rc_t show_details( tool_ctx_t * tool_ctx )
322 {
323 rc_t rc = KOutMsg( "cursor-cache : %,ld bytes\n", tool_ctx -> cursor_cache );
324 if ( 0 == rc )
325 {
326 rc = KOutMsg( "buf-size : %,ld bytes\n", tool_ctx -> buf_size );
327 }
328 if ( 0 == rc )
329 {
330 rc = KOutMsg( "mem-limit : %,ld bytes\n", tool_ctx -> mem_limit );
331 }
332 if ( 0 == rc )
333 {
334 rc = KOutMsg( "threads : %d\n", tool_ctx -> num_threads );
335 }
336 if ( 0 == rc )
337 {
338 rc = KOutMsg( "scratch-path : '%s'\n", get_temp_dir( tool_ctx -> temp_dir ) );
339 }
340 if ( 0 == rc )
341 {
342 rc = KOutMsg( "output-format: " );
343 }
344 if ( 0 == rc )
345 {
346 switch ( tool_ctx -> fmt )
347 {
348 case ft_special : rc = KOutMsg( "SPECIAL\n" ); break;
349 case ft_whole_spot : rc = KOutMsg( "FASTQ whole spot\n" ); break;
350 case ft_fastq_split_spot : rc = KOutMsg( "FASTQ split spot\n" ); break;
351 case ft_fastq_split_file : rc = KOutMsg( "FASTQ split file\n" ); break;
352 case ft_fastq_split_3 : rc = KOutMsg( "FASTQ split 3\n" ); break;
353 default : rc = KOutMsg( "unknow format\n" ); break;
354 }
355 }
356 if ( 0 == rc )
357 {
358 rc = KOutMsg( "output-file : '%s'\n", tool_ctx -> output_filename );
359 }
360 if ( 0 == rc )
361 {
362 rc = KOutMsg( "output-dir : '%s'\n", tool_ctx -> output_dirname );
363 }
364 if ( 0 == rc )
365 {
366 rc = KOutMsg( "append-mode : '%s'\n", tool_ctx -> append ? "YES" : "NO" );
367 }
368 if ( 0 == rc )
369 {
370 rc = KOutMsg( "stdout-mode : '%s'\n", tool_ctx -> append ? "YES" : "NO" );
371 }
372 return rc;
373 }
374
375 static const char * dflt_seq_tabl_name = "SEQUENCE";
376
377 #define DFLT_CUR_CACHE ( 5 * 1024 * 1024 )
378 #define DFLT_BUF_SIZE ( 1024 * 1024 )
379 #define DFLT_MEM_LIMIT ( 1024L * 1024 * 50 )
380 #define DFLT_NUM_THREADS 6
get_user_input(tool_ctx_t * tool_ctx,const Args * args)381 static void get_user_input( tool_ctx_t * tool_ctx, const Args * args )
382 {
383 bool split_spot, split_file, split_3, whole_spot;
384
385 #if 0
386 tool_ctx -> compress = get_compress_t( get_bool_option( args, OPTION_GZIP ),
387 get_bool_option( args, OPTION_BZIP2 ) );
388 #endif
389 tool_ctx -> compress = ct_none;
390
391 tool_ctx -> cursor_cache = get_size_t_option( args, OPTION_CURCACHE, DFLT_CUR_CACHE );
392 tool_ctx -> show_progress = get_bool_option( args, OPTION_PROGRESS );
393 tool_ctx -> show_details = get_bool_option( args, OPTION_DETAILS );
394 tool_ctx -> requested_temp_path = get_str_option( args, OPTION_TEMP, NULL );
395 tool_ctx -> force = get_bool_option( args, OPTION_FORCE );
396 tool_ctx -> output_filename = get_str_option( args, OPTION_OUTPUT_F, NULL );
397 tool_ctx -> output_dirname = get_str_option( args, OPTION_OUTPUT_D, NULL );
398 tool_ctx -> buf_size = get_size_t_option( args, OPTION_BUFSIZE, DFLT_BUF_SIZE );
399 tool_ctx -> mem_limit = get_size_t_option( args, OPTION_MEM, DFLT_MEM_LIMIT );
400 tool_ctx -> num_threads = get_uint32_t_option( args, OPTION_THREADS, DFLT_NUM_THREADS );
401
402 tool_ctx -> join_options . rowid_as_name = get_bool_option( args, OPTION_RIDN );
403 tool_ctx -> join_options . skip_tech = !( get_bool_option( args, OPTION_INCL_TECH ) );
404 tool_ctx -> join_options . print_read_nr = get_bool_option( args, OPTION_PRNR );
405 tool_ctx -> join_options . print_name = true;
406 tool_ctx -> join_options . min_read_len = get_uint32_t_option( args, OPTION_MINRDLEN, 0 );
407 tool_ctx -> join_options . filter_bases = get_str_option( args, OPTION_BASE_FLT, NULL );
408
409 #if 0
410 tool_ctx -> join_options . terminate_on_invalid = get_bool_option( args, OPTION_STRICT );
411 #else
412 tool_ctx -> join_options . terminate_on_invalid = true;
413 #endif
414
415 split_spot = get_bool_option( args, OPTION_SPLIT_SPOT );
416 split_file = get_bool_option( args, OPTION_SPLIT_FILE );
417 split_3 = get_bool_option( args, OPTION_SPLIT_3 );
418 whole_spot = get_bool_option( args, OPTION_WHOLE_SPOT );
419
420 tool_ctx -> fmt = get_format_t( get_str_option( args, OPTION_FORMAT, NULL ),
421 split_spot, split_file, split_3, whole_spot ); /* helper.c */
422 if ( ft_fastq_split_3 == tool_ctx -> fmt )
423 {
424 tool_ctx -> join_options . skip_tech = true;
425 }
426
427 tool_ctx -> seq_tbl_name = get_str_option( args, OPTION_TABLE, dflt_seq_tabl_name );
428 tool_ctx -> append = get_bool_option( args, OPTION_APPEND );
429 tool_ctx -> use_stdout = get_bool_option( args, OPTION_STDOUT );
430
431 {
432 const char * ngc = get_str_option( args, OPTION_NGC, NULL );
433 if ( NULL != ngc )
434 {
435 KConfigSetNgcFile( ngc );
436 }
437 }
438 }
439
440 #define DFLT_MAX_FD 32
441 #define MIN_NUM_THREADS 2
442 #define MIN_MEM_LIMIT ( 1024L * 1024 * 5 )
443 #define MAX_BUF_SIZE ( 1024L * 1024 * 1024 )
encforce_constrains(tool_ctx_t * tool_ctx)444 static void encforce_constrains( tool_ctx_t * tool_ctx )
445 {
446 if ( tool_ctx -> num_threads < MIN_NUM_THREADS )
447 {
448 tool_ctx -> num_threads = MIN_NUM_THREADS;
449 }
450
451 if ( tool_ctx -> mem_limit < MIN_MEM_LIMIT )
452 {
453 tool_ctx -> mem_limit = MIN_MEM_LIMIT;
454 }
455
456 if ( tool_ctx -> buf_size > MAX_BUF_SIZE )
457 {
458 tool_ctx -> buf_size = MAX_BUF_SIZE;
459 }
460
461 if ( tool_ctx -> use_stdout )
462 {
463 switch( tool_ctx -> fmt )
464 {
465 case ft_unknown : break;
466 case ft_special : break;
467 case ft_whole_spot : break;
468 case ft_fastq_split_spot : break;
469 case ft_fastq_split_file : tool_ctx -> use_stdout = false; break;
470 case ft_fastq_split_3 : tool_ctx -> use_stdout = false; break;
471 }
472 }
473
474 if ( tool_ctx -> use_stdout )
475 {
476 tool_ctx -> compress = ct_none;
477 //tool_ctx -> show_progress = false;
478 tool_ctx -> force = false;
479 tool_ctx -> append = false;
480 }
481 }
482
handle_accession(tool_ctx_t * tool_ctx)483 static rc_t handle_accession( tool_ctx_t * tool_ctx )
484 {
485 rc_t rc = 0;
486 tool_ctx -> accession_short = extract_acc2( tool_ctx -> accession_path ); /* helper.c */
487
488 // in case something goes wrong with acc-extraction via VFS-manager
489 if ( NULL == tool_ctx -> accession_short )
490 {
491 tool_ctx -> accession_short = extract_acc( tool_ctx -> accession_path ); /* helper.c */
492 }
493
494 if ( NULL == tool_ctx -> accession_short )
495 {
496 rc = RC( rcApp, rcArgv, rcAccessing, rcParam, rcInvalid );
497 ErrMsg( "accession '%s' invalid", tool_ctx -> accession_path );
498 }
499 return rc;
500 }
501
handle_lookup_path(tool_ctx_t * tool_ctx)502 static rc_t handle_lookup_path( tool_ctx_t * tool_ctx )
503 {
504 rc_t rc = generate_lookup_filename( tool_ctx -> temp_dir,
505 &tool_ctx -> lookup_filename[ 0 ],
506 sizeof tool_ctx -> lookup_filename );
507 if ( 0 != rc )
508 {
509 ErrMsg( "fasterq-dump.c handle_lookup_path( lookup_filename ) -> %R", rc );
510 }
511 else
512 {
513 size_t num_writ;
514 /* generate the full path of the lookup-index-table */
515 rc = string_printf( &tool_ctx -> index_filename[ 0 ], sizeof tool_ctx -> index_filename,
516 &num_writ,
517 "%s.idx",
518 &tool_ctx -> lookup_filename[ 0 ] );
519 if ( 0 != rc )
520 {
521 ErrMsg( "fasterq-dump.c handle_lookup_path( index_filename ) -> %R", rc );
522 }
523 }
524 return rc;
525 }
526
527 /* we have NO output-dir and NO output-file */
make_output_filename_from_accession(tool_ctx_t * tool_ctx)528 static rc_t make_output_filename_from_accession( tool_ctx_t * tool_ctx )
529 {
530 /* we DO NOT have a output-directory : build output-filename from the accession */
531 /* generate the full path of the output-file, if not given */
532 size_t num_writ;
533 rc_t rc = string_printf( &tool_ctx -> dflt_output[ 0 ], sizeof tool_ctx -> dflt_output,
534 &num_writ,
535 "%s.fastq",
536 tool_ctx -> accession_short );
537 if ( 0 != rc )
538 {
539 ErrMsg( "string_printf( output-filename ) -> %R", rc );
540 }
541 else
542 {
543 tool_ctx -> output_filename = tool_ctx -> dflt_output;
544 }
545 return rc;
546 }
547
548 /* we have an output-dir and NO output-file */
make_output_filename_from_dir_and_accession(tool_ctx_t * tool_ctx)549 static rc_t make_output_filename_from_dir_and_accession( tool_ctx_t * tool_ctx )
550 {
551 size_t num_writ;
552 bool es = ends_in_slash( tool_ctx -> output_dirname ); /* helper.c */
553 rc_t rc = string_printf( tool_ctx -> dflt_output, sizeof tool_ctx -> dflt_output,
554 &num_writ,
555 es ? "%s%s.fastq" : "%s/%s.fastq",
556 tool_ctx -> output_dirname,
557 tool_ctx -> accession_short );
558 if ( 0 != rc )
559 {
560 ErrMsg( "string_printf( output-filename ) -> %R", rc );
561 }
562 else
563 {
564 tool_ctx -> output_filename = tool_ctx -> dflt_output;
565 }
566 return rc;
567 }
568
optionally_create_paths_in_output_filename(tool_ctx_t * tool_ctx)569 static rc_t optionally_create_paths_in_output_filename( tool_ctx_t * tool_ctx )
570 {
571 rc_t rc = 0;
572 String path;
573 if ( extract_path( tool_ctx -> output_filename, &path ) )
574 {
575 /* the output-filename contains a path... */
576 if ( !dir_exists( tool_ctx -> dir, "%S", &path ) )
577 {
578 /* this path does not ( yet ) exist, create it... */
579 rc = create_this_dir( tool_ctx -> dir, &path, true );
580 }
581 }
582 return rc;
583 }
584
adjust_output_filename(tool_ctx_t * tool_ctx)585 static rc_t adjust_output_filename( tool_ctx_t * tool_ctx )
586 {
587 rc_t rc = 0;
588 /* we do have a output-filename : use it */
589 if ( dir_exists( tool_ctx -> dir, "%s", tool_ctx -> output_filename ) ) /* helper.c */
590 {
591 /* the given output-filename is an existing directory ! */
592 rc = RC( rcVDB, rcNoTarg, rcConstructing, rcParam, rcInvalid );
593 ErrMsg( "string_printf( output-filename ) -> %R", rc );
594 }
595 else
596 {
597 rc = optionally_create_paths_in_output_filename( tool_ctx );
598 }
599 return rc;
600 }
601
adjust_output_filename_by_dir(tool_ctx_t * tool_ctx)602 static rc_t adjust_output_filename_by_dir( tool_ctx_t * tool_ctx )
603 {
604 size_t num_writ;
605 bool es = ends_in_slash( tool_ctx -> output_dirname ); /* helper.c */
606 rc_t rc = string_printf( tool_ctx -> dflt_output, sizeof tool_ctx -> dflt_output,
607 &num_writ,
608 es ? "%s%s" : "%s/%s",
609 tool_ctx -> output_dirname,
610 tool_ctx -> output_filename );
611 if ( 0 != rc )
612 {
613 ErrMsg( "string_printf( output-filename ) -> %R", rc );
614 }
615 else
616 {
617 tool_ctx -> output_filename = tool_ctx -> dflt_output;
618 rc = optionally_create_paths_in_output_filename( tool_ctx );
619 }
620 return rc;
621 }
622
populate_tool_ctx(tool_ctx_t * tool_ctx,const Args * args)623 static rc_t populate_tool_ctx( tool_ctx_t * tool_ctx, const Args * args )
624 {
625 rc_t rc = ArgsParamValue( args, 0, ( const void ** )&( tool_ctx -> accession_path ) );
626 if ( 0 != rc )
627 {
628 ErrMsg( "ArgsParamValue() -> %R", rc );
629 }
630 else
631 {
632 tool_ctx -> lookup_filename[ 0 ] = 0;
633 tool_ctx -> index_filename[ 0 ] = 0;
634 tool_ctx -> dflt_output[ 0 ] = 0;
635
636 get_user_input( tool_ctx, args );
637 encforce_constrains( tool_ctx );
638 get_environment( tool_ctx );
639
640 rc = make_temp_dir( &tool_ctx -> temp_dir,
641 tool_ctx -> requested_temp_path,
642 tool_ctx -> dir );
643 }
644
645 if ( 0 == rc )
646 {
647 rc = handle_accession( tool_ctx );
648 }
649
650 if ( 0 == rc )
651 {
652 rc = handle_lookup_path( tool_ctx );
653 }
654
655 if ( 0 == rc && NULL != tool_ctx -> output_dirname )
656 {
657 if ( !dir_exists( tool_ctx -> dir, "%s", tool_ctx -> output_dirname ) )
658 {
659 rc = create_this_dir_2( tool_ctx -> dir, tool_ctx -> output_dirname, true );
660 }
661 }
662
663 if ( rc == 0 )
664 {
665 if ( NULL == tool_ctx -> output_filename )
666 {
667 if ( NULL == tool_ctx -> output_dirname )
668 {
669 rc = make_output_filename_from_accession( tool_ctx );
670 }
671 else
672 {
673 rc = make_output_filename_from_dir_and_accession( tool_ctx );
674 }
675 }
676 else
677 {
678 if ( NULL == tool_ctx -> output_dirname )
679 {
680 rc = adjust_output_filename( tool_ctx );
681 }
682 else
683 {
684 rc = adjust_output_filename_by_dir( tool_ctx );
685 }
686 }
687 }
688
689 if ( 0 == rc )
690 {
691 rc = Make_FastDump_Cleanup_Task ( &( tool_ctx -> cleanup_task ) ); /* cleanup_task.c */
692 }
693
694 if ( 0 == rc )
695 {
696 rc = Add_Directory_to_Cleanup_Task ( tool_ctx -> cleanup_task,
697 get_temp_dir( tool_ctx -> temp_dir ) );
698 }
699
700 if ( 0 == rc )
701 {
702 rc = VDBManagerMakeRead( &( tool_ctx -> vdb_mgr ), tool_ctx -> dir );
703 if ( 0 != rc )
704 {
705 ErrMsg( "fasterq-dump.c populate_tool_ctx().VDBManagerMakeRead() -> %R\n", rc );
706 }
707 }
708 return rc;
709 }
710
print_stats(const join_stats * stats)711 static rc_t print_stats( const join_stats * stats )
712 {
713 KOutHandlerSetStdErr();
714 rc_t rc = KOutMsg( "spots read : %,lu\n", stats -> spots_read );
715 if ( 0 == rc )
716 {
717 rc = KOutMsg( "reads read : %,lu\n", stats -> reads_read );
718 }
719 if ( 0 == rc )
720 {
721 rc = KOutMsg( "reads written : %,lu\n", stats -> reads_written );
722 }
723 if ( 0 == rc && stats -> reads_zero_length > 0 )
724 {
725 rc = KOutMsg( "reads 0-length : %,lu\n", stats -> reads_zero_length );
726 }
727 if ( 0 == rc && stats -> reads_technical > 0 )
728 {
729 rc = KOutMsg( "technical reads : %,lu\n", stats -> reads_technical );
730 }
731 if ( 0 == rc && stats -> reads_too_short > 0 )
732 {
733 rc = KOutMsg( "reads too short : %,lu\n", stats -> reads_too_short );
734 }
735 if ( 0 == rc && stats -> reads_invalid > 0 )
736 {
737 rc = KOutMsg( "reads invalid : %,lu\n", stats -> reads_invalid );
738 }
739 KOutHandlerSetStdOut();
740 return rc;
741 }
742
743 /* --------------------------------------------------------------------------------------------
744 produce special-output ( SPOT_ID,READ,SPOT_GROUP ) by iterating over the SEQUENCE - table:
745 produce fastq-output by iterating over the SEQUENCE - table:
746 --------------------------------------------------------------------------------------------
747 each thread iterates over a slice of the SEQUENCE-table
748 for each SPOT it may look up an entry in the lookup-table to get the READ
749 if it is not stored in the SEQ-tbl
750 -------------------------------------------------------------------------------------------- */
751
/* timeout handed to both background-mergers */
static const uint32_t queue_timeout = 200; /* ms */

/* produces the lookup-file and its index-file:
   a background-file-merger and a background-vector-merger are created,
   the lookup-production feeds the vector-merger, then both mergers are waited for.
   If progress is requested a bg_update-object is created and handed to the mergers.
   NOTE(review): both wait_for_and_release-calls are only made if no error happened before,
   on the error-path the mergers created so far are not released here - confirm whether
   that leaks them */
static rc_t produce_lookup_files( tool_ctx_t * tool_ctx )
{
    rc_t rc = 0;
    struct bg_update * gap = NULL;
    struct background_file_merger * bg_file_merger;
    struct background_vector_merger * bg_vec_merger;

    /* gap stays NULL if no progress is requested */
    if ( tool_ctx -> show_progress )
    {
        rc = bg_update_make( &gap, 0 );
    }

    /* the background-file-merger catches the files produced by
       the background-vector-merger */
    if ( 0 == rc )
    {
        rc = make_background_file_merger( &bg_file_merger,
                                          tool_ctx -> dir,
                                          tool_ctx -> temp_dir,
                                          tool_ctx -> cleanup_task,
                                          &tool_ctx -> lookup_filename[ 0 ],
                                          &tool_ctx -> index_filename[ 0 ],
                                          tool_ctx -> num_threads,
                                          queue_timeout,
                                          tool_ctx -> buf_size,
                                          gap ); /* merge_sorter.c */
    }

    /* the background-vector-merger catches the KVectors produced by
       the lookup-producer */
    if ( 0 == rc )
    {
        rc = make_background_vector_merger( &bg_vec_merger,
                                            tool_ctx -> dir,
                                            tool_ctx -> temp_dir,
                                            tool_ctx -> cleanup_task,
                                            bg_file_merger,
                                            tool_ctx -> num_threads,
                                            queue_timeout,
                                            tool_ctx -> buf_size,
                                            gap ); /* merge_sorter.c */
    }

    /* --------------------------------------------------------------------------------------------
    produce the lookup-table by iterating over the PRIMARY_ALIGNMENT - table:
    --------------------------------------------------------------------------------------------
    reading SEQ_SPOT_ID, SEQ_READ_ID and RAW_READ
    SEQ_SPOT_ID and SEQ_READ_ID is merged into a 64-bit-key
    RAW_READ is read as 4na-unpacked ( Schema does not provide 4na-packed for this column )
    these key-pairs are temporarily stored in a KVector until a limit is reached
    after that limit is reached they are pushed to the background-vector-merger
    This KVector looks like this:
    content: [KEY][RAW_READ]
    KEY... 64-bit value as SEQ_SPOT_ID shifted left by 1 bit, zero-bit contains SEQ_READ_ID
    RAW_READ... 16-bit binary-chunk-length, followed by n bytes of packed 4na
    -------------------------------------------------------------------------------------------- */
    /* the lookup-producer is the source of the chain */
    if ( 0 == rc )
    {
        rc = execute_lookup_production( tool_ctx -> dir,
                                        tool_ctx -> vdb_mgr,
                                        tool_ctx -> accession_short,
                                        tool_ctx -> accession_path,
                                        bg_vec_merger, /* drives the bg_file_merger */
                                        tool_ctx -> cursor_cache,
                                        tool_ctx -> buf_size,
                                        tool_ctx -> mem_limit,
                                        tool_ctx -> num_threads,
                                        tool_ctx -> show_progress ); /* sorter.c */
    }
    /* called unconditionally, gap is NULL if no progress is requested */
    bg_update_start( gap, "merge : " ); /* progress_thread.c ...start showing the activity... */

    if ( 0 == rc )
    {
        rc = wait_for_and_release_background_vector_merger( bg_vec_merger ); /* merge_sorter.c */
    }

    if ( 0 == rc )
    {
        rc = wait_for_and_release_background_file_merger( bg_file_merger ); /* merge_sorter.c */
    }

    bg_update_release( gap );

    if ( 0 != rc )
    {
        ErrMsg( "fasterq-dump.c produce_lookup_files() -> %R", rc );
    }

    return rc;
}
845
846
847 /* -------------------------------------------------------------------------------------------- */
848
849
produce_final_db_output(tool_ctx_t * tool_ctx)850 static rc_t produce_final_db_output( tool_ctx_t * tool_ctx )
851 {
852 struct temp_registry * registry = NULL;
853 join_stats stats;
854
855 rc_t rc = make_temp_registry( ®istry, tool_ctx -> cleanup_task ); /* temp_registry.c */
856
857 clear_join_stats( &stats ); /* helper.c */
858 /* join SEQUENCE-table with lookup-table === this is the actual purpos of the tool === */
859
860 /* --------------------------------------------------------------------------------------------
861 produce special-output ( SPOT_ID,READ,SPOT_GROUP ) by iterating over the SEQUENCE - table:
862 produce fastq-output by iterating over the SEQUENCE - table:
863 --------------------------------------------------------------------------------------------
864 each thread iterates over a slice of the SEQUENCE-table
865 for each SPOT it may look up an entry in the lookup-table to get the READ
866 if it is not stored in the SEQ-tbl
867 -------------------------------------------------------------------------------------------- */
868
869 if ( rc == 0 )
870 {
871 rc = execute_db_join( tool_ctx -> dir,
872 tool_ctx -> vdb_mgr,
873 tool_ctx -> accession_path,
874 tool_ctx -> accession_short,
875 &stats,
876 &tool_ctx -> lookup_filename[ 0 ],
877 &tool_ctx -> index_filename[ 0 ],
878 tool_ctx -> temp_dir,
879 registry,
880 tool_ctx -> cursor_cache,
881 tool_ctx -> buf_size,
882 tool_ctx -> num_threads,
883 tool_ctx -> show_progress,
884 tool_ctx -> fmt,
885 & tool_ctx -> join_options ); /* join.c */
886 }
887
888 /* from now on we do not need the lookup-file and it's index any more... */
889 if ( 0 != tool_ctx -> lookup_filename[ 0 ] )
890 {
891 KDirectoryRemove( tool_ctx -> dir, true, "%s", &tool_ctx -> lookup_filename[ 0 ] );
892 }
893
894 if ( 0 != tool_ctx -> index_filename[ 0 ] )
895 {
896 KDirectoryRemove( tool_ctx -> dir, true, "%s", &tool_ctx -> index_filename[ 0 ] );
897 }
898
899 /* STEP 4 : concatenate output-chunks */
900 if ( 0 == rc )
901 {
902 if ( tool_ctx -> use_stdout )
903 {
904 rc = temp_registry_to_stdout( registry,
905 tool_ctx -> dir,
906 tool_ctx -> buf_size ); /* temp_registry.c */
907 }
908 else
909 {
910 rc = temp_registry_merge( registry,
911 tool_ctx -> dir,
912 tool_ctx -> output_filename,
913 tool_ctx -> buf_size,
914 tool_ctx -> show_progress,
915 tool_ctx -> force,
916 tool_ctx -> compress,
917 tool_ctx -> append ); /* temp_registry.c */
918 }
919 }
920
921 /* in case some of the partial results have not been deleted be the concatenator */
922 if ( NULL != registry )
923 {
924 destroy_temp_registry( registry ); /* temp_registry.c */
925 }
926
927 if ( 0 == rc )
928 {
929 print_stats( &stats ); /* above */
930 }
931
932 return rc;
933 }
934
935 /* -------------------------------------------------------------------------------------------- */
936
/* returns true if a file with the output-filename does exist */
static bool output_exists_whole( tool_ctx_t * tool_ctx )
{
    bool found = file_exists( tool_ctx -> dir, "%s", tool_ctx -> output_filename );
    return found;
}
941
/* Reports whether the idx-numbered variant of the output-file is already present.
   The variant name is built by split_filename_insert_idx() from the configured
   output_filename and idx; if building that name fails, the answer is false. */
static bool output_exists_idx( tool_ctx_t * tool_ctx, uint32_t idx )
{
    SBuffer indexed_name;
    bool found;

    if ( 0 != split_filename_insert_idx( &indexed_name, 4096,
                                         tool_ctx -> output_filename, idx ) ) /* helper.c */
    {
        return false;
    }

    found = file_exists( tool_ctx -> dir, "%S", &( indexed_name . S ) ); /* helper.c */
    /* the name-buffer is only needed for the lookup above */
    release_SBuffer( &indexed_name ); /* helper.c */
    return found;
}
955
/* Reports whether any output-file of a split run is already present:
   the plain output-file, or its variants with index 1 or index 2.
   The checks run in that order and stop at the first hit. */
static bool output_exists_split( tool_ctx_t * tool_ctx )
{
    return output_exists_whole( tool_ctx )
        || output_exists_idx( tool_ctx, 1 )
        || output_exists_idx( tool_ctx, 2 );
}
969
check_output_exits(tool_ctx_t * tool_ctx)970 static rc_t check_output_exits( tool_ctx_t * tool_ctx )
971 {
972 rc_t rc = 0;
973 /* check if the output-file(s) do already exist, in case we are not overwriting */
974 if ( !( tool_ctx -> force ) && !( tool_ctx -> append ) )
975 {
976 bool exists = false;
977 switch( tool_ctx -> fmt )
978 {
979 case ft_unknown : break;
980 case ft_special : exists = output_exists_whole( tool_ctx ); break;
981 case ft_whole_spot : exists = output_exists_whole( tool_ctx ); break;
982 case ft_fastq_split_spot : exists = output_exists_whole( tool_ctx ); break;
983 case ft_fastq_split_file : exists = output_exists_split( tool_ctx ); break;
984 case ft_fastq_split_3 : exists = output_exists_split( tool_ctx ); break;
985 }
986 if ( exists )
987 {
988 rc = RC( rcExe, rcFile, rcPacking, rcName, rcExists );
989 ErrMsg( "fasterq-dump.c fastdump_csra() checking ouput-file '%s' -> %R",
990 tool_ctx -> output_filename, rc );
991 }
992 }
993 return rc;
994 }
995
fastdump_csra(tool_ctx_t * tool_ctx)996 static rc_t fastdump_csra( tool_ctx_t * tool_ctx )
997 {
998 rc_t rc = 0;
999
1000 if ( tool_ctx -> show_details )
1001 {
1002 rc = show_details( tool_ctx ); /* above */
1003 }
1004
1005 if ( 0 == rc )
1006 {
1007 rc = check_output_exits( tool_ctx ); /* above */
1008 }
1009
1010 if ( 0 == rc )
1011 {
1012 rc = produce_lookup_files( tool_ctx ); /* above */
1013 }
1014
1015 if ( 0 == rc )
1016 {
1017 rc = produce_final_db_output( tool_ctx ); /* above */
1018 }
1019 return rc;
1020 }
1021
1022
1023 /* -------------------------------------------------------------------------------------------- */
1024
fastdump_table(tool_ctx_t * tool_ctx,const char * tbl_name)1025 static rc_t fastdump_table( tool_ctx_t * tool_ctx, const char * tbl_name )
1026 {
1027 rc_t rc = 0;
1028 struct temp_registry * registry = NULL;
1029 join_stats stats;
1030
1031 clear_join_stats( &stats ); /* helper.c */
1032
1033 if ( tool_ctx -> show_details )
1034 {
1035 rc = show_details( tool_ctx ); /* above */
1036 }
1037
1038 if ( 0 == rc )
1039 {
1040 rc = check_output_exits( tool_ctx ); /* above */
1041 }
1042
1043 if ( 0 == rc )
1044 {
1045 rc = make_temp_registry( ®istry, tool_ctx -> cleanup_task ); /* temp_registry.c */
1046 }
1047
1048 if ( 0 == rc )
1049 {
1050 rc = execute_tbl_join( tool_ctx -> dir,
1051 tool_ctx -> vdb_mgr,
1052 tool_ctx -> accession_short,
1053 tool_ctx -> accession_path,
1054 &stats,
1055 tbl_name,
1056 tool_ctx -> temp_dir,
1057 registry,
1058 tool_ctx -> cursor_cache,
1059 tool_ctx -> buf_size,
1060 tool_ctx -> num_threads,
1061 tool_ctx -> show_progress,
1062 tool_ctx -> fmt,
1063 & tool_ctx -> join_options ); /* tbl_join.c */
1064 }
1065
1066 if ( 0 == rc )
1067 {
1068 if ( tool_ctx -> use_stdout )
1069 {
1070 rc = temp_registry_to_stdout( registry,
1071 tool_ctx -> dir,
1072 tool_ctx -> buf_size ); /* temp_registry.c */
1073 }
1074 else
1075 {
1076 rc = temp_registry_merge( registry,
1077 tool_ctx -> dir,
1078 tool_ctx -> output_filename,
1079 tool_ctx -> buf_size,
1080 tool_ctx -> show_progress,
1081 tool_ctx -> force,
1082 tool_ctx -> compress,
1083 tool_ctx -> append ); /* temp_registry.c */
1084 }
1085 }
1086
1087 if ( NULL != registry )
1088 {
1089 destroy_temp_registry( registry ); /* temp_registry.c */
1090 }
1091
1092 if ( 0 == rc )
1093 {
1094 print_stats( &stats ); /* above */
1095 }
1096
1097 return rc;
1098 }
1099
1100 static const char * consensus_table = "CONSENSUS";
1101
get_db_seq_tbl_name(tool_ctx_t * tool_ctx)1102 static const char * get_db_seq_tbl_name( tool_ctx_t * tool_ctx )
1103 {
1104 const char * res = tool_ctx -> seq_tbl_name;
1105 VNamelist * tables = cmn_get_table_names( tool_ctx -> dir, tool_ctx -> vdb_mgr,
1106 tool_ctx -> accession_short,
1107 tool_ctx -> accession_path ); /* cmn_iter.c */
1108 if ( NULL != tables )
1109 {
1110 int32_t idx;
1111 rc_t rc = VNamelistContainsStr( tables, consensus_table, &idx );
1112 if ( 0 == rc && idx > -1 )
1113 {
1114 res = consensus_table;
1115 }
1116 VNamelistRelease ( tables );
1117 }
1118 return res;
1119 }
1120
1121 /* -------------------------------------------------------------------------------------------- */
1122
perform_tool(tool_ctx_t * tool_ctx)1123 static rc_t perform_tool( tool_ctx_t * tool_ctx )
1124 {
1125 acc_type_t acc_type; /* cmn_iter.h */
1126 rc_t rc = cmn_get_acc_type( tool_ctx -> dir, tool_ctx -> vdb_mgr,
1127 tool_ctx -> accession_short, tool_ctx -> accession_path,
1128 &acc_type ); /* cmn_iter.c */
1129 if ( 0 == rc )
1130 {
1131 /* =================================================== */
1132 switch( acc_type )
1133 {
1134 case acc_csra : rc = fastdump_csra( tool_ctx ); /* above */
1135 break;
1136
1137 case acc_pacbio : ErrMsg( "accession '%s' is PACBIO, please use fastq-dump instead", tool_ctx -> accession_path );
1138 rc = 3; /* signal to main() that the accession is not-processed */
1139 break;
1140
1141 case acc_sra_flat : rc = fastdump_table( tool_ctx, NULL ); /* above */
1142 break;
1143
1144 case acc_sra_db : rc = fastdump_table( tool_ctx, get_db_seq_tbl_name( tool_ctx ) ); /* above */
1145 break;
1146
1147 default : ErrMsg( "invalid accession '%s'", tool_ctx -> accession_path );
1148 rc = 3; /* signal to main() that the accession is not-found/invalid */
1149 break;
1150 }
1151 /* =================================================== */
1152 }
1153 else
1154 {
1155 ErrMsg( "invalid accession '%s'", tool_ctx -> accession_path );
1156 }
1157
1158 return rc;
1159 }
1160
1161 /* -------------------------------------------------------------------------------------------- */
1162
KMain(int argc,char * argv[])1163 rc_t CC KMain ( int argc, char *argv [] )
1164 {
1165 Args * args;
1166 uint32_t num_options = sizeof ToolOptions / sizeof ToolOptions [ 0 ];
1167
1168 rc_t rc = ArgsMakeAndHandle ( &args, argc, argv, 1, ToolOptions, num_options );
1169 if ( 0 != rc )
1170 {
1171 ErrMsg( "ArgsMakeAndHandle() -> %R", rc );
1172 }
1173 else
1174 {
1175 uint32_t param_count;
1176 rc = ArgsParamCount( args, ¶m_count );
1177 if ( 0 != rc )
1178 {
1179 ErrMsg( "ArgsParamCount() -> %R", rc );
1180 }
1181 else
1182 {
1183 /* in case we are given no or more than one accessions/files to process */
1184 if ( param_count == 0 || param_count > 1 )
1185 {
1186 Usage ( args );
1187 /* will make the caller of this function aka KMane() in man.c return
1188 error code of 3 */
1189 rc = 3;
1190 }
1191 else
1192 {
1193 tool_ctx_t tool_ctx;
1194 rc = populate_tool_ctx( &tool_ctx, args ); /* above */
1195 if ( 0 == rc )
1196 {
1197 rc = perform_tool( &tool_ctx ); /* above */
1198
1199 {
1200 rc_t rc2 = KDirectoryRelease( tool_ctx . dir );
1201 if ( 0 != rc2 )
1202 {
1203 ErrMsg( "KDirectoryRelease() -> %R", rc2 );
1204 rc = ( 0 == rc ) ? rc2 : rc;
1205 }
1206 }
1207 destroy_temp_dir( tool_ctx . temp_dir ); /* temp_dir.c */
1208 {
1209 rc_t rc2 = VDBManagerRelease( tool_ctx . vdb_mgr );
1210 if ( 0 != rc2 )
1211 {
1212 ErrMsg( "VDBManagerRelease() -> %R", rc2 );
1213 rc = ( 0 == rc ) ? rc2 : rc;
1214 }
1215 }
1216 }
1217 if ( NULL != tool_ctx . accession_short )
1218 {
1219 free( ( char * )tool_ctx . accession_short );
1220 }
1221 }
1222 }
1223 }
1224 return rc;
1225 }
1226