1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "cmdline_cmn.h"
28
29 #include <kapp/args.h>
30
31 #include <vdb/report.h> /* ReportResetTable */
32
33 #include <klib/rc.h>
34 #include <klib/log.h>
35 #include <klib/out.h>
36
37 #include <sra/srapath.h>
38
39 #include <vfs/manager.h>
40 #include <vfs/path.h>
41 #include <vfs/path-priv.h>
42
43 #include <stdlib.h>
44 #include <stdio.h>
45 #include <string.h>
46 #include <ctype.h>
47
48 #include <os-native.h>
49 #include <sysalloc.h>
50
/* Help texts of the common options.
   Every array holds the lines of one help message and ends with a NULL entry. */

/* reference / region filter */
const char * ref_usage[] =
{
    "Filter by position on genome.",
    "Name can either be file specific name",
    "(ex: \"chr1\" or \"1\").",
    "\"from\" and \"to\" are 1-based coordinates",
    NULL
};

/* redirect output into a file */
const char * outf_usage[] =
{
    "Output will be written to this file",
    "instead of std-out",
    NULL
};

/* selection of the alignment table(s) */
const char * table_usage[] =
{
    "Which alignment table(s) to use (p|s|e):",
    "p - primary, s - secondary, e - evidence-interval",
    "(default = p)",
    NULL
};

/* output compression */
const char * gzip_usage[] = { "Compress output using gzip", NULL };
const char * bzip_usage[] = { "Compress output using bzip2", NULL };

/* file containing parameters / options */
const char * inf_usage[] = { "File with all input-parameters / options", NULL };

/* schema file */
const char * schema_usage[] = { "optional schema-file to be used", NULL };

/* multithreading switch */
const char * no_mt_usage[] = { "disable multithreading", NULL };

/* timing log */
const char * timing_usage[] = { "write timing log-file", NULL };
75
/* long names of the common options */
#define OPTION_OUTF     "outfile"
#define OPTION_TABLE    "table"
#define OPTION_GZIP     "gzip"
#define OPTION_BZIP     "bzip2"
#define OPTION_INF      "infile"
#define OPTION_SCHEMA   "schema"
#define OPTION_NO_MT    "disable-multithreading"
#define OPTION_TIMING   "timing"

/* aliases of the common options, NULL where an option has no alias */
#define ALIAS_OUTF      "o"
#define ALIAS_TABLE     "t"
#define ALIAS_GZIP      NULL
#define ALIAS_BZIP      NULL
#define ALIAS_INF       "f"
#define ALIAS_SCHEMA    "S"
96
97 OptDef CommonOptions[] =
98 {
99 /*name, alias, hfkt, usage-help, maxcount, needs value, required */
100 { OPTION_REF, ALIAS_REF, NULL, ref_usage, 0, true, false },
101 { OPTION_OUTF, ALIAS_OUTF, NULL, outf_usage, 1, true, false },
102 { OPTION_TABLE, ALIAS_TABLE, NULL, table_usage, 1, true, false },
103 { OPTION_GZIP, ALIAS_GZIP, NULL, gzip_usage, 1, false, false },
104 { OPTION_BZIP, ALIAS_BZIP, NULL, bzip_usage, 1, false, false },
105 { OPTION_INF, ALIAS_INF, NULL, inf_usage, 0, true, false },
106 { OPTION_SCHEMA, ALIAS_SCHEMA, NULL, schema_usage, 1, true, false },
107 { OPTION_NO_MT, NULL, NULL, no_mt_usage, 1, false, false },
108 { OPTION_TIMING, NULL, NULL, timing_usage, 1, true, false }
109 };
110
111
112 /* =========================================================================================== */
113
get_str_option(const Args * args,const char * name,const char ** res)114 static rc_t get_str_option( const Args *args, const char *name, const char ** res )
115 {
116 uint32_t count;
117 rc_t rc = ArgsOptionCount( args, name, &count );
118 *res = NULL;
119 if ( rc != 0 )
120 {
121 LOGERR( klogInt, rc, "ArgsOptionCount() failed" );
122 }
123 else
124 {
125 if ( count > 0 )
126 {
127 rc = ArgsOptionValue( args, name, 0, (const void **)res );
128 if ( rc != 0 )
129 {
130 LOGERR( klogInt, rc, "ArgsOptionValue() failed" );
131 }
132 }
133 }
134 return rc;
135 }
136
137
get_bool_option(const Args * args,const char * name,bool * res,const bool def)138 static rc_t get_bool_option( const Args *args, const char *name, bool *res, const bool def )
139 {
140 uint32_t count;
141 rc_t rc = ArgsOptionCount( args, name, &count );
142 if ( rc == 0 && count > 0 )
143 {
144 *res = true;
145 }
146 else
147 {
148 *res = def;
149 }
150 return rc;
151 }
152
153 /* =========================================================================================== */
154
get_common_options(Args * args,common_options * opts)155 rc_t get_common_options( Args * args, common_options *opts )
156 {
157 rc_t rc = get_str_option( args, OPTION_OUTF, &opts->output_file );
158
159 if ( rc == 0 )
160 rc = get_str_option( args, OPTION_INF, &opts->input_file );
161
162 if ( rc == 0 )
163 rc = get_bool_option( args, OPTION_GZIP, &opts->gzip_output, false );
164
165 if ( rc == 0 )
166 rc = get_bool_option( args, OPTION_BZIP, &opts->bzip_output, false );
167
168 if ( rc == 0 )
169 rc = get_bool_option( args, OPTION_NO_MT, &opts->no_mt, false );
170
171 if ( rc == 0 )
172 rc = get_str_option( args, OPTION_SCHEMA, &opts->schema_file );
173
174 if ( rc == 0 )
175 rc = get_str_option( args, OPTION_TIMING, &opts->timing_file );
176
177 if ( rc == 0 )
178 {
179 const char * table2use = NULL;
180 rc = get_str_option( args, OPTION_TABLE, &table2use );
181 opts->tab_select = primary_ats;
182 if ( rc == 0 && table2use != NULL )
183 {
184 size_t l = string_size ( table2use );
185 opts->tab_select = 0;
186 if ( ( string_chr ( table2use, l, 'p' ) != NULL )||
187 ( string_chr ( table2use, l, 'P' ) != NULL ) )
188 { opts->tab_select |= primary_ats; };
189
190 if ( ( string_chr ( table2use, l, 's' ) != NULL )||
191 ( string_chr ( table2use, l, 'S' ) != NULL ) )
192 { opts->tab_select |= secondary_ats; };
193
194 if ( ( string_chr ( table2use, l, 'e' ) != NULL )||
195 ( string_chr ( table2use, l, 'E' ) != NULL ) )
196 { opts->tab_select |= evidence_ats; };
197 }
198 }
199
200 return rc;
201 }
202
print_common_helplines(void)203 void print_common_helplines( void )
204 {
205 HelpOptionLine ( ALIAS_REF, OPTION_REF, "name[:from-to]", ref_usage );
206 HelpOptionLine ( ALIAS_OUTF, OPTION_OUTF, "output-file", outf_usage );
207 HelpOptionLine ( ALIAS_TABLE, OPTION_TABLE, "shortcut", table_usage );
208 HelpOptionLine ( ALIAS_BZIP, OPTION_BZIP, NULL, bzip_usage );
209 HelpOptionLine ( ALIAS_GZIP, OPTION_GZIP, NULL, gzip_usage );
210 HelpOptionLine ( NULL, OPTION_NO_MT, NULL, no_mt_usage );
211 HelpOptionLine ( NULL, OPTION_TIMING, NULL, timing_usage );
212 }
213
214
CommonOptions_ptr(void)215 OptDef * CommonOptions_ptr( void )
216 {
217 return &CommonOptions[ 0 ];
218 }
219
CommonOptions_count(void)220 size_t CommonOptions_count( void )
221 {
222 return ( sizeof CommonOptions / sizeof CommonOptions [ 0 ] );
223 }
224
225
226 /* =========================================================================================== */
227
#if 0
/* Currently compiled out.
   Compares two C-strings via string_cmp(), looking at as many characters
   as the longer of the two has; yields 0 if one of the pointers is NULL. */
static int cmp_pchar( const char * a, const char * b )
{
    size_t len_a, len_b, longest;

    if ( a == NULL || b == NULL )
        return 0;

    len_a = string_size( a );
    len_b = string_size( b );
    longest = ( len_a < len_b ) ? len_b : len_a;
    return string_cmp ( a, len_a, b, len_b, longest );
}
#endif
241
242 /* =========================================================================================== */
243
244
init_ref_regions(BSTree * tree,Args * args)245 rc_t init_ref_regions( BSTree * tree, Args * args )
246 {
247 uint32_t count;
248 rc_t rc;
249
250 BSTreeInit( tree );
251 rc = ArgsOptionCount( args, OPTION_REF, &count );
252 if ( rc != 0 )
253 {
254 LOGERR( klogInt, rc, "ArgsOptionCount() failed" );
255 }
256 else
257 {
258 uint32_t i;
259 for ( i = 0; i < count && rc == 0; ++i )
260 {
261 const char * s;
262 rc = ArgsOptionValue( args, OPTION_REF, i, (const void **)&s );
263 if ( rc != 0 )
264 LOGERR( klogInt, rc, "ArgsOptionValue() failed" );
265 else
266 rc = parse_and_add_region( tree, s );
267 }
268 }
269 return rc;
270 }
271
272
273 /* =========================================================================================== */
274
#if TOOLS_USE_SRAPATH != 0
/* Heuristic: the argument is treated as a filesystem path as soon as it
   contains a dot, a slash or a backslash. */
static bool is_this_a_filesystem_path( const char * path )
{
    size_t pos;
    const size_t len = string_size ( path );

    for ( pos = 0; pos < len; ++pos )
    {
        switch ( path[ pos ] )
        {
            case '.'  :
            case '/'  :
            case '\\' : return true;
            default   : break;
        }
    }
    return false;
}
#endif
288
#if TOOLS_USE_SRAPATH != 0
/* Asks SRAPathFind() to translate 'accession' into a path.
   The result is written into a zero-initialized heap buffer of 'bufsize'
   bytes; whenever SRAPathFind() reports rcInsufficient the attempt is
   repeated with a buffer twice as big.
   Returns the buffer ( to be released with free() ) on success, NULL if the
   allocation failed, the accession was not found or any other error occurred. */
static char *translate_accession( SRAPath *my_sra_path,
                                  const char *accession,
                                  const size_t bufsize )
{
    size_t capacity = bufsize;

    for ( ; ; )
    {
        rc_t rc;
        char * buffer = calloc( 1, capacity );
        if ( buffer == NULL )
            return NULL;

        rc = SRAPathFind( my_sra_path, accession, buffer, capacity );
        if ( GetRCState( rc ) == rcInsufficient )
        {
            /* buffer too small: try again with the doubled size */
            free( buffer );
            capacity = capacity * 2;
            continue;
        }

        if ( GetRCState( rc ) == rcNotFound || rc != 0 )
        {
            free( buffer );
            return NULL;
        }

        return buffer;
    }
}
#endif
317
#if TOOLS_USE_SRAPATH != 0
/* Tries to replace the accession in *path by the path it resolves to.
   Nothing is done if *path contains a slash.
   If SRAPathTest() does not accept *path and translate_accession() delivers
   a translation, the old string is released and *path points to the new
   heap string.
   A SRAPathMake() failure with state rcNotFound and target rcDylib is not
   treated as an error; every other failure is logged and returned. */
static rc_t resolve_accession( const KDirectory *my_dir, char ** path )
{
    SRAPath * sra_path;
    rc_t rc;

    if ( strchr ( *path, '/' ) != NULL )
        return 0;

    rc = SRAPathMake( &sra_path, my_dir );
    if ( rc != 0 )
    {
        if ( GetRCState ( rc ) == rcNotFound && GetRCTarget ( rc ) == rcDylib )
            return 0;

        LOGERR( klogInt, rc, "SRAPathMake() failed" );
        return rc;
    }

    if ( !SRAPathTest( sra_path, *path ) )
    {
        char * resolved = translate_accession( sra_path, *path, 64 );
        if ( resolved != NULL )
        {
            free( (char*)(*path) );
            *path = resolved;
        }
    }
    SRAPathRelease( sra_path );
    return rc;
}
#endif
356
357
358 /* =========================================================================================== */
359
360
361 /****************************************************************************************
362 splits an argument
363
364 example: "/path/file=grp1" into path = "/path/file" and attribute = "grp1"
365 or
366 example: "/path/file" into path = "/path/file" and attribute = NULL
367
368 ****************************************************************************************/
split_argument(const char * argument,char ** path,char ** attribute,char delim)369 static rc_t split_argument( const char *argument, char ** path, char ** attribute, char delim )
370 {
371 if ( argument == NULL || path == NULL || attribute == NULL )
372 return RC( rcApp, rcNoTarg, rcConstructing, rcParam, rcNull );
373 else
374 {
375 char * delim_ptr = string_chr ( argument, string_size ( argument ), delim );
376 if ( delim_ptr == NULL )
377 {
378 *path = string_dup_measure( argument, NULL );
379 *attribute = NULL;
380 }
381 else
382 {
383 size_t len = string_size( argument );
384 size_t len1 = ( delim_ptr - argument );
385 *path = string_dup ( argument, len1 );
386 if ( delim_ptr < argument + len - 1 )
387 *attribute = string_dup ( delim_ptr + 1, len - ( len1 + 1 ) );
388 else
389 *attribute = NULL;
390 }
391 }
392 return 0;
393 }
394
395
split_vpath_into_path_and_readgroup(VPath * vpath,const char * argument,char ** path,char ** attribute)396 static rc_t split_vpath_into_path_and_readgroup( VPath *vpath, const char *argument, char ** path, char ** attribute )
397 {
398 size_t zz;
399 char readgroup_buffer[ 256 ];
400 rc_t rc1 = VPathOption( vpath, vpopt_readgroup, readgroup_buffer, sizeof readgroup_buffer - 1, &zz );
401 if ( rc1 == 0 )
402 *attribute = string_dup( readgroup_buffer, zz );
403 *path = string_dup( argument, string_size( argument ) );
404 return 0;
405 }
406
407
test_split_vpath_into_path_and_readgroup(VPath * vpath,const char * argument,char ** path,char ** attribute)408 static rc_t test_split_vpath_into_path_and_readgroup( VPath *vpath, const char *argument, char ** path, char ** attribute )
409 {
410 rc_t rc = 0;
411 #if 1
412 if ( VPathFromUri ( vpath ) )
413 rc = split_vpath_into_path_and_readgroup ( vpath, argument, path, attribute );
414 else
415 rc = split_argument ( argument, path, attribute, '=' );
416 #else
417 VPUri_t uri_type = VPathGetUri_t( vpath );
418 switch ( uri_type )
419 {
420 default:
421 case vpuri_invalid:
422 rc = RC( rcExe, rcParam, rcAccessing, rcPath, rcInvalid );
423 break;
424
425 case vpuri_not_supported:
426 rc = RC( rcExe, rcParam, rcAccessing, rcPath, rcUnsupported );
427 break;
428
429 case vpuri_none:
430 rc = split_argument( argument, path, attribute, '=' );
431 break;
432
433 case vpuri_ncbi_vfs:
434 case vpuri_file:
435 case vpuri_ncbi_acc:
436 case vpuri_http:
437 rc = split_vpath_into_path_and_readgroup( vpath, argument, path, attribute );
438 break;
439 }
440 #endif
441 return rc;
442 }
443
444
split_argument_into_path_and_readgroup(const char * argument,char ** path,char ** attribute)445 static rc_t split_argument_into_path_and_readgroup( const char *argument, char ** path, char ** attribute )
446 {
447 rc_t rc;
448 char * colon_ptr = string_chr ( argument, string_size ( argument ), ':' );
449 if ( colon_ptr == NULL )
450 {
451 /* we do not have a colon in the argument, that means: there is no uri-syntax involved
452 ---> we can split the "old fashioned way" at the equal-sign */
453 rc = split_argument( argument, path, attribute, '=' );
454 }
455 else
456 {
457 VFSManager * mgr;
458 rc_t rc = VFSManagerMake ( & mgr );
459
460 *path = NULL;
461 *attribute = NULL;
462
463 if ( rc == 0 )
464 {
465 VPath * vpath;
466 rc = VFSManagerMakePath ( mgr, &vpath, "%s", argument );
467 if ( rc == 0 )
468 {
469 rc = test_split_vpath_into_path_and_readgroup( vpath, argument, path, attribute );
470 VPathRelease( vpath );
471 }
472
473 VFSManagerRelease ( mgr );
474 }
475 }
476 return rc;
477 }
478
479
480 /* =========================================================================================== */
481
482
/* Calls 'on_argument' for every parameter of the command line.
   args ............ parsed command line
   dir ............. directory handed to resolve_accession()
                     ( only used if TOOLS_USE_SRAPATH != 0 )
   div_by_spotgrp .. if true and the parameter carries no spot-group,
                     an empty string is passed as spot-group
   empty ........... optional, receives true if there are no parameters
   on_argument ..... callback, receives path, spot-group ( may be NULL ) and 'data'
   data ............ passed through to the callback
   Each parameter is split into path and spot-group; both strings are owned
   by this function and released after the callback returned.
   The loop stops at the first failure, its rc is returned. */
rc_t foreach_argument( Args * args, KDirectory *dir, bool div_by_spotgrp, bool * empty,
    rc_t ( CC * on_argument ) ( const char * path, const char * spot_group, void * data ), void * data )
{
    uint32_t count;
    uint32_t idx;
    rc_t rc = ArgsParamCount( args, &count );
    if ( rc != 0 )
    {
        LOGERR( klogInt, rc, "ArgsParamCount() failed" );
        return rc;
    }

    if ( empty != NULL )
    {
        *empty = ( count == 0 );
    }

    for ( idx = 0; idx < count && rc == 0; ++idx )
    {
        const char * param = NULL;
        char * path = NULL;
        char * spot_group = NULL;

        rc = ArgsParamValue( args, idx, (const void **)&param );
        if ( rc != 0 )
        {
            LOGERR( klogInt, rc, "ArgsParamvalue() failed" );
            break;
        }

        rc = split_argument_into_path_and_readgroup( param, &path, &spot_group );
        if ( rc == 0 && path != NULL )
        {
            /* in case there is no spotgroup-override from the commandline AND
               the option to divide by spot-group is set, let spot_group point
               to an empty string ---> divide by original spot-group! */
            if ( spot_group == NULL && div_by_spotgrp )
            {
                spot_group = calloc( 1, 1 );
                /* without this check a failed allocation would silently
                   turn off the division by spot-group */
                if ( spot_group == NULL )
                    rc = RC( rcApp, rcNoTarg, rcConstructing, rcMemory, rcExhausted );
            }

#if TOOLS_USE_SRAPATH != 0
            if ( rc == 0 && !is_this_a_filesystem_path( path ) )
            {
                rc = resolve_accession( dir, &path );
            }
#endif

            if ( rc == 0 )
            {
                rc = on_argument( path, spot_group, data );
            }
        }

        /* released on every path: a spot-group that was duplicated while
           the duplication of the path failed would leak otherwise */
        free( path );
        free( spot_group );
    }
    return rc;
}
545
546
547 /* =========================================================================================== */
548
549
prepare_whole_file(prepare_ctx * ctx)550 static rc_t prepare_whole_file( prepare_ctx * ctx )
551 {
552 rc_t rc = 0;
553 if ( ctx->reflist != NULL )
554 {
555 uint32_t count;
556 rc = ReferenceList_Count( ctx->reflist, &count );
557 if ( rc != 0 )
558 {
559 LOGERR( klogInt, rc, "ReferenceList_Count() failed" );
560 }
561 else
562 {
563 uint32_t idx;
564 for ( idx = 0; idx < count && rc == 0; ++idx )
565 {
566 rc = ReferenceList_Get( ctx->reflist, &ctx->refobj, idx );
567 if ( rc != 0 )
568 {
569 LOGERR( klogInt, rc, "ReferenceList_Get() failed" );
570 }
571 else
572 {
573 rc = ctx->on_section( ctx, NULL );
574 if ( rc == 0 )
575 ReferenceObj_Release( ctx->refobj );
576 }
577 }
578 }
579 }
580 else
581 {
582 ctx->refobj = NULL;
583 rc = ctx->on_section( ctx, NULL );
584 }
585 return rc;
586 }
587
588
prepare_region_cb(const char * name,const struct reference_range * range,void * data)589 static rc_t CC prepare_region_cb( const char * name, const struct reference_range * range, void * data )
590 {
591 prepare_ctx * ctx = ( prepare_ctx * )data;
592 rc_t rc = ReferenceList_Find( ctx->reflist, &ctx->refobj, name, string_size( name ) );
593 if ( rc != 0 )
594 {
595 rc = 0;
596 }
597 else
598 {
599 rc = ctx->on_section( ctx, range );
600 if ( rc == 0 )
601 ReferenceObj_Release( ctx->refobj );
602 }
603 return rc;
604 }
605
606
prepare_db_table(prepare_ctx * ctx,const VDBManager * vdb_mgr,VSchema * vdb_schema,const char * path)607 static rc_t prepare_db_table( prepare_ctx *ctx,
608 const VDBManager *vdb_mgr,
609 VSchema *vdb_schema,
610 const char * path )
611 {
612 rc_t rc;
613 ctx->db = NULL;
614 ctx->seq_tab = NULL;
615
616 rc = VDBManagerOpenDBRead ( vdb_mgr, &ctx->db, vdb_schema, "%s", path );
617 if ( rc != 0 )
618 {
619 rc = VDBManagerOpenTableRead ( vdb_mgr, &ctx->seq_tab, NULL, "%s", path );
620 if ( rc != 0 )
621 {
622 PLOGERR( klogErr, ( klogErr, rc, "failed to open '$(path)'", "path=%s", path ) );
623 }
624 else {
625 ReportResetTable(path, ctx->seq_tab);
626 }
627 }
628 else
629 {
630 rc = VDatabaseOpenTableRead( ctx->db, &ctx->seq_tab, "SEQUENCE" );
631 if ( rc != 0 )
632 {
633 LOGERR( klogInt, rc, "VDatabaseOpenTableRead( SEQUENCE ) failed" );
634 }
635 else
636 {
637 ReportResetDatabase( path, ctx->db );
638 }
639 }
640 return rc;
641 }
642
643
prepare_reflist(prepare_ctx * ctx)644 static rc_t prepare_reflist( prepare_ctx *ctx )
645 {
646 rc_t rc = 0;
647 ctx->reflist = NULL;
648 if ( ctx->db != NULL )
649 {
650 uint32_t reflist_options = ereferencelist_4na;
651
652 if ( ctx->use_primary_alignments )
653 reflist_options |= ereferencelist_usePrimaryIds;
654
655 if ( ctx->use_secondary_alignments )
656 reflist_options |= ereferencelist_useSecondaryIds;
657
658 if ( ctx->use_evidence_alignments )
659 reflist_options |= ereferencelist_useEvidenceIds;
660
661 rc = ReferenceList_MakeDatabase( &ctx->reflist, ctx->db, reflist_options, 0, NULL, 0 );
662 if ( rc != 0 )
663 {
664 LOGERR( klogInt, rc, "ReferenceList_MakeDatabase() failed" );
665 }
666 }
667 return rc;
668 }
669
670
prepare_ref_iter(prepare_ctx * ctx,const VDBManager * vdb_mgr,VSchema * vdb_schema,const char * path,BSTree * regions)671 rc_t prepare_ref_iter( prepare_ctx *ctx,
672 const VDBManager *vdb_mgr,
673 VSchema *vdb_schema,
674 const char * path,
675 BSTree * regions )
676 {
677 rc_t rc = prepare_db_table( ctx, vdb_mgr, vdb_schema, path );
678 if ( rc == 0 )
679 {
680 rc = prepare_reflist( ctx );
681 if( rc == 0 )
682 {
683 if ( ctx->reflist == NULL || count_ref_regions( regions ) == 0 )
684 {
685 /* the user has not specified a reference-range : use the whole file... */
686 rc = prepare_whole_file( ctx );
687 }
688 else
689 {
690 /* pick only the requested ranges... */
691 rc = foreach_ref_region( regions, prepare_region_cb, ctx ); /* ref_regions.c */
692 }
693 }
694 if ( ctx->reflist != NULL )
695 {
696 ReferenceList_Release( ctx->reflist );
697 }
698 }
699 VTableRelease ( ctx->seq_tab );
700 VDatabaseRelease ( ctx->db );
701 return rc;
702 }
703
704
705 /* =========================================================================================== */
706
707
parse_inf_file(Args * args)708 rc_t parse_inf_file( Args * args )
709 {
710 return Args_parse_inf_file( args, OPTION_INF );
711 }
712