1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 
27 #include <kapp/main.h>
28 #include <kapp/args.h>
29 #include <klib/text.h>
30 #include <klib/log.h>
31 #include <klib/out.h>
32 #include <klib/status.h>
33 #include <klib/rc.h>
34 #include <klib/printf.h>
35 #include <kfs/directory.h>
36 #include <kfs/file.h>
37 #include <kapp/log-xml.h>
38 #include <align/writer-refseq.h>
39 
40 #include <stdlib.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <ctype.h>
44 #include <assert.h>
45 #include <limits.h>
46 
47 #include "Globals.h"
48 #include "loader-imp.h"
49 
50 /*: ARGS
51 Summary:
52 	Load a BAM formatted data file
53 
54 Usage:
55     --help                          display this text and quit
56     --version                       display the version string and quit
57     [global-options] [options] <file...> [ --remap [options] <file>... ]...
58 
59 //:global-options
60 Global Options:
61 
* options affecting logging
63   log-level <level>                 logging level values: [fatal|sys|int|err|warn|info|0-5] default: info
64   xml-log <filename>                produce an XML-formatted log file
65 
* options affecting performance optimisation
  tmpfs <directory>                 where to store temporary files, default: '/tmp'
  cache-size <mbytes>               the limit in MB for temporary files
69 
* options affecting error limits
71   max-err-count <number>            the maximum number of errors to ignore
72   max-warning-dup-flag <count>      the limit for number of duplicate flag mismatch warnings
73 
74 //:options
75 Options:
76   output <name>                     name of the output, required
77   config <file>                     reference configuration file (See Configuration)
78   header <file>                     file containing the SAM header
79   remap                             special option to enable processing sets of remapped files. remap MUST be given between each set, all regular options can be respecified, in fact the output must be unique for each set. This is for when a set of reads are aligned multiple times, for example against different reference builds or with different aligners. This mode ensures that spot ids are the same across the several outputs.
80 
81 Debugging Options:
82   only-verify                       exit after verifying existence of references
83   max-rec-count <number>            exit after processing this many records (per file)
84   nomatch-log <path>                log alignments with no matching bases
85 
86 Filtering Options:
87   minimum-match <number>            minimum number of matches for an alignment
88   no-secondary                      ignore alignments marked as secondary
89   accept-dups                       accept spots with inconsistent PCR duplicate flags
90   accept-nomatch                    accept alignments with no matching bases
91   ref-config                        limit processing to references in the config file, ignoring all others
92   ref-filter <name>                 limit processing to the given reference, ignoring all others
93   min-mapq <number>                 filter secondary alignments by minimum mapping quality
94 
95 Rare or Esoteric Options:
96   input <directory>                 where to find fasta files
97   ref-file <file>                   fasta file with references
98   unsorted                          expect unsorted input (requires more memory)
99   sorted                            require sorted input
100   TI                                look for trace id optional tag
101   unaligned <file>                  file without aligned reads
102 
103 Deprecated Options:
104   use-OQ                            use OQ option column for quality values instead of QUAL
105   no-verify                         skip verify existence of references from the BAM file
106   accept-hard-clip                  allow hard clipping in CIGAR
107   allow-multi-map                   allow the same reference to be mapped to multiple names in the input files
108   edit-aligned-qual <number>        convert quality at aligned positions to this value
109   cs                                turn on awareness of colorspace
110   qual-quant                        quality scores quantization level
  keep-mismatch-qual                don't quantize quality at mismatched positions
112 
113 
114 Example:
115     bam-load -o /tmp/SRZ123456 -k analysis.bam.cfg 123456.bam
116 */
117 
118 /* MARK: Arguments and Usage */
/*
 * Option names understood by the loader.  Every name below appears once in
 * the Options table; several are also looked up by name while the command
 * line is being processed.
 */
static const char option_input[]              = "input";
static const char option_output[]             = "output";
static const char option_tmpfs[]              = "tmpfs";
static const char option_config[]             = "config";
static const char option_min_mapq[]           = "min-mapq";
static const char option_qual_compress[]      = "qual-quant";
static const char option_cache_size[]         = "cache-size";
static const char option_unsorted[]           = "unsorted";
static const char option_sorted[]             = "sorted";
static const char option_max_err_count[]      = "max-err-count";
static const char option_max_rec_count[]      = "max-rec-count";
static const char option_no_verify[]          = "no-verify";
static const char option_only_verify[]        = "only-verify";
static const char option_use_qual[]           = "use-QUAL";
static const char option_ref_filter[]         = "ref-filter";
static const char option_ref_config[]         = "ref-config";
static const char option_edit_aligned_qual[]  = "edit-aligned-qual";
static const char option_unaligned[]          = "unaligned";
static const char option_accept_dup[]         = "accept-dups";
static const char option_accept_nomatch[]     = "accept-nomatch";
static const char option_nomatch_log[]        = "nomatch-log";
static const char option_keep_mismatch_qual[] = "keep-mismatch-qual";
static const char option_min_match[]          = "minimum-match";
static const char option_header[]             = "header";
static const char option_no_cs[]              = "no-cs";
static const char option_no_secondary[]       = "no-secondary";
static const char option_ref_file[]           = "ref-file";
static const char option_TI[]                 = "TI";
static const char option_max_warn_dup_flag[]  = "max-warning-dup-flag";
static const char option_accept_hard_clip[]   = "accept-hard-clip";
static const char option_allow_multi_map[]    = "allow-multi-map";
static const char option_allow_secondary[]    = "make-spots-with-secondary";
static const char option_defer_secondary[]    = "defer-secondary";
152 
/*
 * Upper-case spellings for a subset of the option name arrays.  Each macro
 * expands to the array itself, so both spellings denote the same storage.
 */
#define OPTION_INPUT              option_input
#define OPTION_OUTPUT             option_output
#define OPTION_TMPFS              option_tmpfs
#define OPTION_CONFIG             option_config
#define OPTION_MINMAPQ            option_min_mapq
#define OPTION_QCOMP              option_qual_compress
#define OPTION_CACHE_SIZE         option_cache_size
#define OPTION_MAX_ERR_COUNT      option_max_err_count
#define OPTION_MAX_REC_COUNT      option_max_rec_count
#define OPTION_UNALIGNED          option_unaligned
#define OPTION_ACCEPT_DUP         option_accept_dup
#define OPTION_ACCEPT_NOMATCH     option_accept_nomatch
#define OPTION_NOMATCH_LOG        option_nomatch_log
#define OPTION_MIN_MATCH          option_min_match
#define OPTION_HEADER             option_header
#define OPTION_NO_CS              option_no_cs
#define OPTION_NO_SECONDARY       option_no_secondary
#define OPTION_REF_FILE           option_ref_file
#define OPTION_TI                 option_TI
#define OPTION_MAX_WARN_DUP_FLAG  option_max_warn_dup_flag
#define OPTION_ACCEPT_HARD_CLIP   option_accept_hard_clip
#define OPTION_ALLOW_MULTI_MAP    option_allow_multi_map
#define OPTION_ALLOW_SECONDARY    option_allow_secondary
#define OPTION_DEFER_SECONDARY    option_defer_secondary
177 
/* One-character aliases for the options that have one in the Options table. */
#define ALIAS_INPUT          "i"
#define ALIAS_OUTPUT         "o"
#define ALIAS_TMPFS          "t"
#define ALIAS_CONFIG         "k"
#define ALIAS_MINMAPQ        "q"
#define ALIAS_QCOMP          "Q"
#define ALIAS_MAX_ERR_COUNT  "E"
#define ALIAS_UNALIGNED      "u"
#define ALIAS_ACCEPT_DUP     "d"
#define ALIAS_NO_SECONDARY   "P"
#define ALIAS_REF_FILE       "r"
189 
/*
 * Help text for individual options.  Each array is a NULL-terminated list of
 * lines and is referenced from the matching row of the Options table.
 */

/* input */
static char const *input_usage[] = { "Path where to get fasta files from.", NULL };

/* output */
static char const *output_usage[] = { "Path and Name of the output database.", NULL };

/* tmpfs */
static char const *tmpfs_usage[] = { "Path to be used for scratch files.", NULL };

/* config */
static char const *config_usage[] = {
    "Path to configuration file:",
    "maps the input BAM file's reference names to the equivalent GenBank accession.",
    "It is a tab-delimited text file with unix line endings (\\n - LF) with the following fields in this order:",
    "#1 reference name as it occurs in the BAM file's SN field of @SQ header record;",
    "#2 INSDC reference ID",
    NULL
};

/* min-mapq */
static char const *min_mapq_usage[] = { "Filter secondary alignments by minimum mapping quality.", NULL };

/* qual-quant */
static char const *qcomp_usage[] = {
    "Quality scores quantization level, can be a number (0: none, 1: 2bit, 2: 1bit), or a string like '1:10,10:20,20:30,30:-' (which is equivalent to 1) (nb. the endpoint is exclusive).",
    NULL
};

/* unsorted */
static char const *unsorted_usage[] = { "Tell the loader to expect unsorted input (requires more memory)", NULL };

/* sorted */
static char const *sorted_usage[] = { "Tell the loader to require sorted input", NULL };

/* cache-size */
static char const *cache_size_usage[] = { "Set the cache size in MB for the temporary indices", NULL };

/* max-rec-count */
static char const *mrc_usage[] = { "Set the maximum number of records to process from the BAM file", NULL };

/* max-err-count */
static char const *mec_usage[] = { "Set the maximum number of errors to ignore from the BAM file", NULL };

/* no-verify */
static char const *no_verify_usage[] = { "Skip verify existence of references from the BAM file", NULL };

/* only-verify */
static char const *only_verify_usage[] = { "Exit after verifying existence of references from the BAM file", NULL };

/* use-QUAL */
static char const *use_QUAL_usage[] = {
    "use QUAL column for quality values (default is to use OQ if it is available)",
    NULL
};

/* ref-filter */
static char const *use_ref_filter[] = { "Only process alignments to the given reference", NULL };

/* ref-config */
static char const *use_ref_config[] = { "Only process alignments to references in the config file", NULL };

/* edit-aligned-qual */
static char const *use_edit_aligned_qual[] = { "Convert quality at aligned positions to this value", NULL };
312 
/*
 * Help text (NULL-terminated line lists) for keep-mismatch-qual, unaligned
 * and accept-dups; referenced from the matching rows of the Options table.
 * The wording of the first and third entries was corrected: they previously
 * read "Don't quantized ..." and "Accept spots inconsistent ...".
 */

/* keep-mismatch-qual */
static
char const * use_keep_mismatch_qual[] =
{
    "Don't quantize quality at mismatched positions",
    NULL
};

/* unaligned */
static
char const * use_unaligned[] =
{
    "Specify file without aligned reads",
    NULL
};

/* accept-dups */
static
char const * use_accept_dups[] =
{
    "Accept spots with inconsistent PCR duplicate flags",
    NULL
};
333 
/*
 * Help text (NULL-terminated line lists) for a further group of options;
 * each array is referenced from the matching row of the Options table.
 */

/* accept-nomatch */
static char const *use_accept_nomatch[] = { "Accept alignments with no matching bases", NULL };

/* nomatch-log */
static char const *use_nomatch_log[] = { "Where to write info for alignments with no matching bases", NULL };

/* minimum-match */
static char const *use_min_match[] = { "minimum number of matches for an alignment", NULL };

/* header */
static char const *use_header[] = {
    "path to a file containing the SAM header to store in the resulting cSRA, recommended in case of multiple input BAMs",
    NULL
};

/* no-cs */
static char const *use_no_cs[] = { "turn off awareness of colorspace", NULL };

/* no-secondary */
static char const *use_no_secondary[] = { "ignore alignments marked as secondary", NULL };

/* ref-file */
static char const *use_ref_file[] = { "path to a fasta file with references", NULL };

/* TI */
static char const *use_TI[] = { "for trace alignments", NULL };

/* max-warning-dup-flag */
static char const *use_max_dup_warnings[] = { "set limit for number of duplicate flag mismatch warnings", NULL };

/* accept-hard-clip */
static char const *use_accept_hard_clip[] = { "accept hard clipping in CIGAR", NULL };
403 
/*
 * Help text for allow-multi-map (NULL-terminated list of lines), referenced
 * from the matching row of the Options table.  The second line previously
 * read "old behaviors was"; the grammar was corrected.
 */
static
char const * use_allow_multi_map[] =
{
    "allow the same reference to be mapped to multiple names in the input files",
    "(default is disallow, old behavior was to allow it)",
    NULL
};
411 
/*
 * Help text (NULL-terminated line lists) for the two options dealing with
 * secondary alignments; referenced from the Options table.
 */

/* make-spots-with-secondary */
static char const *use_allow_secondary[] = { "use secondary alignments for constructing spots", NULL };

/* defer-secondary */
static char const *use_defer_secondary[] = {
    "defer processing of secondary alignments until the end of the file",
    NULL
};
425 
426 OptDef Options[] =
427 {
428     /* order here is same as in param array below!!! */
429     { OPTION_INPUT, ALIAS_INPUT,  NULL,  input_usage, 1, true,  false },
430     { OPTION_OUTPUT, ALIAS_OUTPUT, NULL, output_usage, 1, true,  true },
431     { OPTION_CONFIG, ALIAS_CONFIG,  NULL,  config_usage, 1, true, false },
432     { OPTION_HEADER, NULL, NULL, use_header, 1, true, false },
433     { OPTION_TMPFS, ALIAS_TMPFS, NULL, tmpfs_usage, 1, true,  false },
434     { OPTION_UNALIGNED, ALIAS_UNALIGNED, NULL, use_unaligned, 256, true, false },
435     { OPTION_ACCEPT_DUP, ALIAS_ACCEPT_DUP, NULL, use_accept_dups, 1, false, false },
436     { OPTION_ACCEPT_NOMATCH, NULL, NULL, use_accept_nomatch, 1, false, false },
437     { OPTION_NOMATCH_LOG, NULL, NULL, use_nomatch_log, 1, true, false },
438     { OPTION_QCOMP, ALIAS_QCOMP, NULL, qcomp_usage, 1, true,  false },
439     { OPTION_MINMAPQ, ALIAS_MINMAPQ, NULL, min_mapq_usage, 1, true,  false },
440     { OPTION_CACHE_SIZE, NULL, NULL, cache_size_usage, 1, true,  false },
441     { OPTION_NO_CS, NULL, NULL, use_no_cs, 1, false,  false },
442     { OPTION_MIN_MATCH, NULL, NULL, use_min_match, 1, true, false },
443     { OPTION_NO_SECONDARY, ALIAS_NO_SECONDARY, NULL, use_no_secondary, 1, false, false },
444     { option_unsorted, NULL, NULL, unsorted_usage, 1, false,  false },
445     { option_sorted, NULL, NULL, sorted_usage, 1, false,  false },
446     { option_no_verify, NULL, NULL, no_verify_usage, 1, false,  false },
447     { option_only_verify, NULL, NULL, only_verify_usage, 1, false,  false },
448     { option_use_qual, NULL, NULL, use_QUAL_usage, 1, false,  false },
449     { option_ref_config, NULL, NULL, use_ref_config, 1, false,  false },
450     { option_ref_filter, NULL, NULL, use_ref_filter, 1, true,  false },
451     { option_edit_aligned_qual, NULL, NULL, use_edit_aligned_qual, 1, true, false },
452     { option_keep_mismatch_qual, NULL, NULL, use_keep_mismatch_qual, 1, false,  false },
453     { OPTION_MAX_REC_COUNT, NULL, NULL, mrc_usage, 1, true,  false },
454     { OPTION_MAX_ERR_COUNT, ALIAS_MAX_ERR_COUNT, NULL, mec_usage, 1, true,  false },
455     { OPTION_REF_FILE, ALIAS_REF_FILE, NULL, use_ref_file, 0, true, false },
456     { OPTION_TI, NULL, NULL, use_TI, 1, false, false },
457     { OPTION_MAX_WARN_DUP_FLAG, NULL, NULL, use_max_dup_warnings, 1, true, false },
458     { OPTION_ACCEPT_HARD_CLIP, NULL, NULL, use_accept_hard_clip, 1, false, false },
459     { OPTION_ALLOW_MULTI_MAP, NULL, NULL, use_allow_multi_map, 1, false, false },
460     { OPTION_ALLOW_SECONDARY, NULL, NULL, use_allow_secondary, 1, false, false },
461     { OPTION_DEFER_SECONDARY, NULL, NULL, use_defer_secondary, 1, false, false }
462 };
463 
/*
 * Placeholder shown for each option's value in the help output, or NULL for
 * options listed without one.  Entry i belongs to Options[i]; Usage() indexes
 * both arrays with the same counter, so the two must stay in the same order.
 */
const char *OptHelpParam[] =
{
    /*  0 input                  */ "path",
    /*  1 output                 */ "path",
    /*  2 config                 */ "path-to-file",
    /*  3 header                 */ "path-to-file",
    /*  4 tmpfs                  */ "path",
    /*  5 unaligned              */ "path-to-file",
    /*  6 accept dups            */ NULL,
    /*  7 accept no-match        */ NULL,
    /*  8 no-match log           */ "path-to-file",
    /*  9 quality compression    */ "level",
    /* 10 min. mapq              */ "phred-score",
    /* 11 cache size             */ "mbytes",
    /* 12 no colorspace          */ NULL,
    /* 13 min. match count       */ "count",
    /* 14 no secondary           */ NULL,
    /* 15 unsorted               */ NULL,
    /* 16 sorted                 */ NULL,
    /* 17 no verify ref's        */ NULL,
    /* 18 quit after verify      */ NULL,
    /* 19 force QUAL             */ NULL,
    /* 20 ref's from config      */ NULL,
    /* 21 only this ref          */ "name",
    /* 22 aligned quality value  */ "new-value",
    /* 23 keep mismatch quality  */ NULL,
    /* 24 max. record count      */ "number",
    /* 25 max. error count       */ "number",
    /* 26 reference fasta file   */ "path-to-file",
    /* 27 use XT->TI             */ NULL,
    /* 28 max. dup warning count */ "count",
    /* 29 allow hard clipping    */ NULL,
    /* 30 allow multimapping     */ NULL,
    /* 31 allow secondary        */ NULL,
    /* 32 defer secondary        */ NULL
};
501 
UsageSummary(char const * progname)502 rc_t UsageSummary (char const * progname)
503 {
504     return KOutMsg (
505         "Usage:\n"
506         "\t%s [options] <bam-file>\n"
507         "\n"
508         "Summary:\n"
509         "\tLoad a BAM formatted data file\n"
510         "\n"
511         "Example:\n"
512         "\t%s -o /tmp/SRZ123456 -k analysis.bam.cfg 123456.bam\n"
513         "\n"
514         ,progname, progname);
515 }
516 
/* Program name used by Usage() when the real one cannot be obtained from the parsed arguments. */
const char UsageDefaultName[] = "bam-load";
518 
Usage(const Args * args)519 rc_t CC Usage (const Args * args)
520 {
521     rc_t rc;
522     int i;
523     const char * progname = UsageDefaultName;
524     const char * fullpath = UsageDefaultName;
525     const size_t argsQty = sizeof(Options) / sizeof(Options[0]);
526 
527     if (args == NULL)
528         rc = RC (rcApp, rcArgv, rcAccessing, rcSelf, rcNull);
529     else
530         rc = ArgsProgram (args, &fullpath, &progname);
531     if (rc)
532         progname = fullpath = UsageDefaultName;
533 
534     UsageSummary (progname);
535 
536     for(i = 0; i < argsQty; i++ ) {
537         if( Options[i].required && Options[i].help[0] != NULL ) {
538             HelpOptionLine(Options[i].aliases, Options[i].name, OptHelpParam[i], Options[i].help);
539         }
540     }
541     OUTMSG(("\nOptions:\n"));
542     for(i = 0; i < argsQty; i++ ) {
543         if( !Options[i].required && Options[i].help[0] != NULL ) {
544             HelpOptionLine(Options[i].aliases, Options[i].name, OptHelpParam[i], Options[i].help);
545         }
546     }
547     XMLLogger_Usage();
548     OUTMSG(("\n"));
549     HelpOptionsStandard ();
550     HelpVersion (fullpath, KAppVersion());
551     return rc;
552 }
553 
554 /* MARK: Definitions and Globals */
555 
556 #define SCHEMAFILE "align/align.vschema"
557 
558 Globals G;
559 
560 #ifdef _WIN32
561 #include <process.h>
562 #else
563 #include <unistd.h>
564 #endif
set_pid(void)565 static void set_pid(void)
566 {
567     G.pid = getpid();
568 }
569 
PathWithBasePath(char rslt[],size_t sz,char const path[],char const base[])570 static rc_t PathWithBasePath(char rslt[], size_t sz, char const path[], char const base[])
571 {
572     size_t const plen = strlen(path);
573     bool const hasBase = base && base[0];
574     bool const isBareName = strchr(path, '/') == NULL;
575 
576     if (isBareName && hasBase) {
577         if (string_printf(rslt, sz, NULL, "%s/%s", base, path) == 0)
578             return 0;
579     }
580     else if (plen < sz) {
581         strncpy(rslt, path, sz);
582         return 0;
583     }
584     {
585         rc_t const rc = RC(rcApp, rcArgv, rcAccessing, rcBuffer, rcInsufficient);
586         (void)LOGERR(klogErr, rc, "The path to the file is too long");
587         return rc;
588     }
589 }
590 
OpenFile(KFile const ** kf,char const path[],char const base[])591 static rc_t OpenFile(KFile const **kf, char const path[], char const base[])
592 {
593     char fname[4096];
594     rc_t rc = PathWithBasePath(fname, sizeof(fname), path, base);
595 
596     if (rc == 0) {
597         KDirectory *dir;
598 
599         rc = KDirectoryNativeDir(&dir);
600         if (rc == 0) {
601             rc = KDirectoryOpenFileRead(dir, kf, "%s", fname);
602             KDirectoryRelease(dir);
603         }
604     }
605     return rc;
606 }
607 
LoadHeader(char const ** rslt,char const path[],char const base[])608 static rc_t LoadHeader(char const **rslt, char const path[], char const base[])
609 {
610     KFile const *kf;
611     rc_t rc = OpenFile(&kf, path, base);
612 
613     *rslt = NULL;
614     if (rc == 0) {
615         uint64_t fsize;
616         rc = KFileSize(kf, &fsize);
617         if (rc == 0) {
618             char *fdata = malloc(fsize+1);
619 
620             if (fdata) {
621                 size_t nread;
622                 rc = KFileRead(kf, 0, fdata, fsize, &nread);
623                 if (rc == 0) {
624                     if (nread) {
625                         fdata[nread] = '\0';
626                         *rslt = fdata;
627                     }
628                     else {
629                         free(fdata);
630                         rc = RC(rcApp, rcArgv, rcAccessing, rcFile, rcEmpty);
631                         (void)PLOGERR(klogErr, (klogErr, rc, "File '$(file)' is empty", "file=%s", path));
632                     }
633                 }
634                 else {
635                     (void)PLOGERR(klogErr, (klogErr, rc, "Failed to read file '$(file)'", "file=%s", path));
636                 }
637             }
638             else {
639                 rc = RC(rcApp, rcArgv, rcAccessing, rcMemory, rcExhausted);
640                 (void)PLOGERR(klogErr, (klogErr, rc, "Failed to read file '$(file)'", "file=%s", path));
641             }
642         }
643         KFileRelease(kf);
644     }
645     else {
646         (void)PLOGERR(klogErr, (klogErr, rc, "Failed to open file '$(file)'", "file=%s", path));
647     }
648     return rc;
649 }
650 
main_help_vers(int argc,char * argv[])651 static rc_t main_help_vers(int argc, char * argv[])
652 {
653     Args *args = NULL;
654     rc_t const rc = ArgsMakeAndHandle (&args, argc, argv, 2, Options,
655         sizeof Options / sizeof (OptDef), XMLLogger_Args, XMLLogger_ArgsQty);
656     ArgsWhack(args);
657     return rc;
658 }
659 
getArgValue(Args * const args,char const * name,int index,char const ** result)660 static rc_t getArgValue(Args *const args, char const *name, int index, char const **result)
661 {
662     void const *value;
663     rc_t const rc = ArgsOptionValue(args, name, index, &value);
664     if (rc) return rc;
665     free((void *)*result);
666     *result = strdup(value);
667     assert(*result);
668     return 0;
669 }
670 
main_1(int argc,char * argv[],bool const continuing,unsigned const load)671 static rc_t main_1(int argc, char *argv[], bool const continuing, unsigned const load)
672 {
673     Args *args;
674     rc_t rc;
675     unsigned n_aligned = 0;
676     unsigned n_unalgnd = 0;
677     char *aligned[256];
678     char *unalgnd[256];
679     char *name_buffer = NULL;
680     unsigned next_name = 0;
681     unsigned nbsz = 0;
682     char const *value;
683     char *dummy;
684 
685     rc = ArgsMakeAndHandle (&args, argc, argv, 1, Options, sizeof(Options)/sizeof(Options[0]));
686     while (rc == 0) {
687         uint32_t pcount;
688 
689         rc = ArgsOptionCount(args, option_only_verify, &pcount);
690         if (rc)
691             break;
692         G.onlyVerifyReferences |= (pcount > 0);
693 
694         rc = ArgsOptionCount(args, option_no_verify, &pcount);
695         if (rc)
696             break;
697         G.noVerifyReferences |= (pcount > 0);
698 
699         rc = ArgsOptionCount(args, option_use_qual, &pcount);
700         if (rc)
701             break;
702         G.useQUAL |= (pcount > 0);
703 
704         rc = ArgsOptionCount(args, option_ref_config, &pcount);
705         if (rc)
706             break;
707         G.limit2config |= (pcount > 0);
708 
709         rc = ArgsOptionCount(args, OPTION_REF_FILE, &pcount);
710         if (rc)
711             break;
712         if (pcount && G.refFiles) {
713             int i;
714 
715             for (i = 0; G.refFiles[i]; ++i)
716                 free((void *)G.refFiles[i]);
717             free((void *)G.refFiles);
718         }
719         G.refFiles = calloc(pcount + 1, sizeof(*(G.refFiles)));
720         if (!G.refFiles) {
721             rc = RC(rcApp, rcArgv, rcAccessing, rcMemory, rcExhausted);
722             break;
723         }
724         while(pcount-- > 0) {
725             rc = getArgValue(args, OPTION_REF_FILE, pcount, &G.refFiles[pcount]);
726             if (rc)
727                 break;
728         }
729 
730         rc = ArgsOptionCount (args, OPTION_TMPFS, &pcount);
731         if (rc)
732             break;
733         if (pcount == 1)
734         {
735             rc = getArgValue(args, OPTION_TMPFS, 0, &G.tmpfs);
736             if (rc)
737                 break;
738         }
739         else if (pcount > 1)
740         {
741             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
742             OUTMSG (("Single parameter required\n"));
743             MiniUsage (args);
744             break;
745         }
746 
747         rc = ArgsOptionCount (args, OPTION_INPUT, &pcount);
748         if (rc)
749             break;
750         if (pcount == 1)
751         {
752             rc = getArgValue(args, OPTION_INPUT, 0, &G.inpath);
753             if (rc)
754                 break;
755         }
756         else if (pcount > 1)
757         {
758             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
759             OUTMSG (("Single input parameter required\n"));
760             MiniUsage (args);
761             break;
762         }
763 
764         rc = ArgsOptionCount (args, option_ref_filter, &pcount);
765         if (rc)
766             break;
767         if (pcount == 1)
768         {
769             rc = getArgValue(args, option_ref_filter, 0, &G.refFilter);
770             if (rc)
771                 break;
772         }
773         else if (pcount > 1)
774         {
775             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
776             OUTMSG (("Single parameter required\n"));
777             MiniUsage (args);
778             break;
779         }
780 
781         rc = ArgsOptionCount (args, OPTION_CONFIG, &pcount);
782         if (rc)
783             break;
784         if (pcount == 1)
785         {
786             rc = getArgValue(args, OPTION_CONFIG, 0, &G.refXRefPath);
787             if (rc)
788                 break;
789         }
790         else if (pcount > 1)
791         {
792             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
793             OUTMSG (("Single input parameter required\n"));
794             MiniUsage (args);
795             break;
796         }
797 
798         rc = ArgsOptionCount (args, OPTION_OUTPUT, &pcount);
799         if (rc)
800             break;
801         if (pcount == 1)
802         {
803             rc = getArgValue(args, OPTION_OUTPUT, 0, &G.outpath);
804             if (rc)
805                 break;
806             if (load == 0) {
807                 G.firstOut = strdup(G.outpath);
808             }
809             value = strrchr(G.outpath, '/');
810             G.outname = value ? (value + 1) : G.outpath;
811         }
812         else if (pcount > 1)
813         {
814             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
815             OUTMSG (("Single output parameter required\n"));
816             MiniUsage (args);
817             break;
818         }
819         else if (!G.onlyVerifyReferences) {
820             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcInsufficient);
821             OUTMSG (("Output parameter required\n"));
822             MiniUsage (args);
823             break;
824         }
825 
826         rc = ArgsOptionCount (args, OPTION_MINMAPQ, &pcount);
827         if (rc)
828             break;
829         if (pcount == 1)
830         {
831             rc = ArgsOptionValue(args, OPTION_MINMAPQ, 0, (const void **)&value);
832             if (rc)
833                 break;
834             G.minMapQual = strtoul(value, &dummy, 0);
835         }
836 
837         rc = ArgsOptionCount (args, OPTION_QCOMP, &pcount);
838         if (rc)
839             break;
840         if (pcount == 1)
841         {
842             rc = getArgValue(args, OPTION_QCOMP, 0, &G.QualQuantizer);
843             if (rc)
844                 break;
845         }
846 
847         rc = ArgsOptionCount (args, option_edit_aligned_qual, &pcount);
848         if (rc)
849             break;
850         if (pcount == 1)
851         {
852             rc = ArgsOptionValue (args, option_edit_aligned_qual, 0, (const void **)&value);
853             if (rc)
854                 break;
855             G.alignedQualValue = strtoul(value, &dummy, 0);
856             if (G.alignedQualValue == 0) {
857                 rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcIncorrect);
858                 OUTMSG (("edit-aligned-qual: bad value\n"));
859                 MiniUsage (args);
860                 break;
861             }
862             G.editAlignedQual = true;
863         }
864 
865         rc = ArgsOptionCount (args, OPTION_CACHE_SIZE, &pcount);
866         if (rc)
867             break;
868         if (pcount == 1)
869         {
870             rc = ArgsOptionValue (args, OPTION_CACHE_SIZE, 0, (const void **)&value);
871             if (rc)
872                 break;
873             G.cache_size = strtoul(value, &dummy, 0) * 1024UL * 1024UL;
874             if (G.cache_size == 0) {
875                 rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcIncorrect);
876                 OUTMSG (("cache-size: bad value\n"));
877                 MiniUsage (args);
878                 break;
879             }
880         }
881 
882         rc = ArgsOptionCount (args, OPTION_MAX_WARN_DUP_FLAG, &pcount);
883         if (rc)
884             break;
885         if (pcount == 1)
886         {
887             rc = ArgsOptionValue (args, OPTION_MAX_WARN_DUP_FLAG, 0, (const void **)&value);
888             if (rc)
889                 break;
890             G.maxWarnCount_DupConflict = strtoul(value, &dummy, 0);
891         }
892 
893         rc = ArgsOptionCount (args, option_unsorted, &pcount);
894         if (rc)
895             break;
896         G.expectUnsorted |= (pcount > 0);
897 
898         rc = ArgsOptionCount (args, option_sorted, &pcount);
899         if (rc)
900             break;
901         G.requireSorted |= (pcount > 0);
902 
903         rc = ArgsOptionCount (args, OPTION_MAX_REC_COUNT, &pcount);
904         if (rc)
905             break;
906         if (pcount == 1)
907         {
908             rc = ArgsOptionValue (args, OPTION_MAX_REC_COUNT, 0, (const void **)&value);
909             if (rc)
910                 break;
911             G.maxAlignCount = strtoul(value, &dummy, 0);
912         }
913 
914         rc = ArgsOptionCount (args, OPTION_MAX_ERR_COUNT, &pcount);
915         if (rc)
916             break;
917         if (pcount == 1)
918         {
919             rc = ArgsOptionValue (args, OPTION_MAX_ERR_COUNT, 0, (const void **)&value);
920             if (rc)
921                 break;
922             G.maxErrCount = strtoul(value, &dummy, 0);
923         }
924 
925         rc = ArgsOptionCount (args, OPTION_MIN_MATCH, &pcount);
926         if (rc)
927             break;
928         if (pcount == 1)
929         {
930             rc = ArgsOptionValue (args, OPTION_MIN_MATCH, 0, (const void **)&value);
931             if (rc)
932                 break;
933             G.minMatchCount = strtoul(value, &dummy, 0);
934         }
935 
936         rc = ArgsOptionCount (args, OPTION_ACCEPT_DUP, &pcount);
937         if (rc)
938             break;
939         G.acceptBadDups |= (pcount > 0);
940 
941         rc = ArgsOptionCount (args, OPTION_ACCEPT_NOMATCH, &pcount);
942         if (rc)
943             break;
944         G.acceptNoMatch |= (pcount > 0);
945 
946         rc = ArgsOptionCount (args, option_keep_mismatch_qual, &pcount);
947         if (rc)
948             break;
949         G.keepMismatchQual |= (pcount > 0);
950 
951         rc = ArgsOptionCount (args, OPTION_NO_CS, &pcount);
952         if (rc)
953             break;
954         G.noColorSpace |= (pcount > 0);
955 
956         rc = ArgsOptionCount (args, OPTION_NO_SECONDARY, &pcount);
957         if (rc)
958             break;
959         G.noSecondary |= (pcount > 0);
960 
961         rc = ArgsOptionCount (args, OPTION_TI, &pcount);
962         if (rc)
963             break;
964         G.hasTI |= (pcount > 0);
965 
966         rc = ArgsOptionCount (args, OPTION_ACCEPT_HARD_CLIP, &pcount);
967         if (rc)
968             break;
969         G.acceptHardClip |= (pcount > 0);
970 
971         rc = ArgsOptionCount (args, OPTION_ALLOW_MULTI_MAP, &pcount);
972         if (rc)
973             break;
974         G.allowMultiMapping |= (pcount > 0);
975 
976         rc = ArgsOptionCount (args, OPTION_ALLOW_SECONDARY, &pcount);
977         if (rc)
978             break;
979         G.assembleWithSecondary |= (pcount > 0);
980 
981         rc = ArgsOptionCount (args, OPTION_DEFER_SECONDARY, &pcount);
982         if (rc)
983             break;
984         G.deferSecondary |= (pcount > 0);
985 
986         rc = ArgsOptionCount (args, OPTION_NOMATCH_LOG, &pcount);
987         if (rc)
988             break;
989         if (pcount == 1)
990         {
991             KDirectory *dir;
992 
993             rc = ArgsOptionValue (args, OPTION_NOMATCH_LOG, 0, (const void **)&value);
994             if (rc) break;
995             rc = KDirectoryNativeDir(&dir);
996             if (rc) break;
997             rc = KDirectoryCreateFile(dir, &G.noMatchLog, 0, 0664, kcmInit, "%s", value);
998             KDirectoryRelease(dir);
999             if (rc) break;
1000         }
1001 
1002         rc = ArgsOptionCount (args, OPTION_HEADER, &pcount);
1003         if (rc)
1004             break;
1005         if (pcount == 1) {
1006             rc = ArgsOptionValue (args, OPTION_HEADER, 0, (const void **)&value);
1007             if (rc) break;
1008             free((void *)G.headerText);
1009             rc = LoadHeader(&G.headerText, value, G.inpath);
1010             if (rc) break;
1011         }
1012 
1013         rc = ArgsParamCount (args, &pcount);
1014         if (rc) break;
1015         if (pcount == 0)
1016         {
1017             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcInsufficient);
1018             MiniUsage (args);
1019             break;
1020         }
1021         else if (pcount > sizeof(aligned)/sizeof(aligned[0])) {
1022             rc = RC(rcApp, rcArgv, rcAccessing, rcParam, rcExcessive);
1023             (void)PLOGERR(klogErr, (klogErr, rc, "$(count) input files is too many, $(max) is the limit",
1024                         "count=%u,max=%u", (unsigned)pcount, (unsigned)(sizeof(aligned)/sizeof(aligned[0]))));
1025             break;
1026         }
1027         else {
1028             unsigned need = G.inpath ? (strlen(G.inpath) + 1) * pcount : 0;
1029             unsigned i;
1030 
1031             for (i = 0; i < pcount; ++i) {
1032                 rc = ArgsParamValue(args, i, (const void **)&value);
1033                 if (rc) break;
1034                 need += strlen(value) + 1;
1035             }
1036             nbsz = need;
1037         }
1038 
1039         rc = ArgsOptionCount (args, OPTION_UNALIGNED, &pcount);
1040         if (rc)
1041             break;
1042         if (pcount > 0)
1043         {
1044             unsigned need = G.inpath ? (strlen(G.inpath) + 1) * pcount : 0;
1045             unsigned i;
1046 
1047             for (i = 0; i < pcount; ++i) {
1048                 rc = ArgsOptionValue(args, OPTION_UNALIGNED, i, (const void **)&value);
1049                 if (rc) break;
1050                 need += strlen(value) + 1;
1051             }
1052             if (rc) break;
1053             nbsz += need;
1054         }
1055 
1056         name_buffer = malloc(nbsz);
1057         if (name_buffer == NULL) {
1058             rc = RC(rcApp, rcArgv, rcAccessing, rcMemory, rcExhausted);
1059             break;
1060         }
1061 
1062         rc = ArgsOptionCount (args, OPTION_UNALIGNED, &pcount);
1063         if (rc == 0) {
1064             unsigned i;
1065 
1066             for (i = 0; i < pcount; ++i) {
1067                 rc = ArgsOptionValue(args, OPTION_UNALIGNED, i, (const void **)&value);
1068                 if (rc) break;
1069 
1070                 unalgnd[n_unalgnd++] = name_buffer + next_name;
1071                 rc = PathWithBasePath(name_buffer + next_name, nbsz - next_name, value, G.inpath);
1072                 if (rc) break;
1073                 next_name += strlen(name_buffer + next_name) + 1;
1074             }
1075             if (rc) break;
1076         }
1077         else
1078             break;
1079 
1080         rc = ArgsParamCount (args, &pcount);
1081         if (rc == 0) {
1082             unsigned i;
1083 
1084             for (i = 0; i < pcount; ++i) {
1085                 rc = ArgsParamValue(args, i, (const void **)&value);
1086                 if (rc) break;
1087 
1088                 aligned[n_aligned++] = name_buffer + next_name;
1089                 rc = PathWithBasePath(name_buffer + next_name, nbsz - next_name, value, G.inpath);
1090                 if (rc) break;
1091                 next_name += strlen(name_buffer + next_name) + 1;
1092             }
1093         }
1094         else
1095             break;
1096 
1097         rc = run(argv[0], n_aligned, (char const **)aligned, n_unalgnd, (char const **)unalgnd, continuing);
1098         break;
1099     }
1100     free(name_buffer);
1101 
1102     if (rc) {
1103         (void)PLOGERR(klogErr, (klogErr, rc, "load failed",
1104                 "severity=total,status=failure,accession=%s,errors=%u", G.outname, G.errCount));
1105     } else {
1106         (void)PLOGMSG(klogInfo, (klogInfo, "loaded",
1107                 "severity=total,status=success,accession=%s,errors=%u", G.outname, G.errCount));
1108     }
1109     ArgsWhack(args);
1110     return rc;
1111 }
1112 
cleanupGlobal(void)1113 static void cleanupGlobal(void)
1114 {
1115     if (G.refFiles) {
1116         int i;
1117 
1118         for (i = 0; G.refFiles[i]; ++i)
1119             free((void *)G.refFiles[i]);
1120         free((void *)G.refFiles);
1121     }
1122     free((void *)G.tmpfs);
1123     free((void *)G.inpath);
1124     free((void *)G.refFilter);
1125     free((void *)G.refXRefPath);
1126     free((void *)G.outpath);
1127     free((void *)G.firstOut);
1128     free((void *)G.headerText);
1129     free((void *)G.QualQuantizer);
1130     free((void *)G.schemaPath);
1131 }
1132 
/* find_arg
 *  Searches argv[first .. argc-1] for the first argument that is exactly
 *  equal to any string in the NULL-terminated list "query".
 *
 *  query  NULL-terminated array of option spellings to look for
 *  first  index of the first argv element to examine
 *  argc   number of elements in argv
 *  argv   argument vector
 *
 *  Returns the index of the first matching argument, or 0 when nothing
 *  matches (so a match at index 0 cannot be told apart from "not found").
 */
static int find_arg(char const *const *const query, int const first, int const argc, char **const argv)
{
    int pos = first;

    while (pos < argc) {
        char const *const *candidate;

        for (candidate = query; *candidate != NULL; ++candidate) {
            if (strcmp(argv[pos], *candidate) == 0)
                return pos;
        }
        ++pos;
    }
    return 0;
}
1147 
/* has_arg
 *  Reports whether any spelling in the NULL-terminated list "query"
 *  appears among argv[1 .. argc-1]; argv[0] is never examined.
 */
static bool has_arg(char const *const *const query, int const argc, char **const argv)
{
    int const where = find_arg(query, 1, argc, argv);

    return where > 0;
}
1152 
1153 static const char *logger_options[] = { "--xml-log-fd", "--xml-log", "-z" };
make_logger(int * argc,char * argv[])1154 static XMLLogger const *make_logger(int *argc, char *argv[])
1155 {
1156     XMLLogger const *rslt = NULL;
1157     char *argf[4];
1158     int i;
1159 
1160     argf[0] = argv[0];
1161     argf[1] = NULL;
1162     argf[2] = NULL;
1163     argf[3] = NULL;
1164 
1165     for (i = 1; i < *argc; ++i) {
1166         int remove = 0;
1167 
1168         if (strcmp(argv[i], logger_options[2]) == 0) {
1169             argf[1] = argv[i];
1170             argf[2] = argv[i + 1];
1171             remove = 2;
1172         }
1173         else {
1174             int j;
1175 
1176             for (j = 0; j < 2; ++j) {
1177                 if (strstr(argv[i], logger_options[j]) == argv[i]) {
1178                     int const n = strlen(logger_options[j]);
1179 
1180                     if (argv[i][n] == '\0') {
1181                         argf[1] = argv[i];
1182                         argf[2] = argv[i + 1];
1183                         remove = 2;
1184                     }
1185                     else if (argv[i][n] == '=') {
1186                         argv[i][n] = '\0';
1187                         argf[1] = argv[i];
1188                         argf[2] = argv[i] + n + 1;
1189                         remove = 1;
1190                     }
1191                     break;
1192                 }
1193             }
1194         }
1195         if (argf[1] != NULL) {
1196             Args *args = NULL;
1197 
1198             ArgsMakeAndHandle(&args, 3, argf, 1, XMLLogger_Args, XMLLogger_ArgsQty);
1199             if (args) {
1200                 XMLLogger_Make(&rslt, NULL, args);
1201                 ArgsWhack(args);
1202             }
1203         }
1204         if (remove) {
1205             *argc -= remove;
1206             memmove(argv + i, argv + i + remove, (*argc + 1) * sizeof(argv[0]));
1207             break;
1208         }
1209     }
1210     return rslt;
1211 }
1212 
KMain(int argc,char * argv[])1213 rc_t CC KMain(int argc, char *argv[])
1214 {
1215     static const char *help[] = { "--help", "-h", "-?", NULL };
1216     static const char *vers[] = { "--version", "-V", NULL };
1217 
1218     bool const has_help = has_arg(help, argc, argv);
1219     bool const has_vers = has_arg(vers, argc, argv);
1220     XMLLogger const *logger = NULL;
1221     int argfirst = 0;
1222     int arglast = 0;
1223     rc_t rc = 0;
1224     unsigned load = 0;
1225 
1226     if (has_help) {
1227         argc = 2;
1228         argv[1] = "--help";
1229         return main_help_vers(argc, argv);
1230     }
1231     if (has_vers) {
1232         argc = 2;
1233         argv[1] = "--version";
1234         return main_help_vers(argc, argv);
1235     }
1236 
1237     logger = make_logger(&argc, argv);
1238 
1239     memset(&G, 0, sizeof(G));
1240     G.mode = mode_Archive;
1241     G.globalMode = mode_Archive;
1242     G.maxSeqLen = TableWriterRefSeq_MAX_SEQ_LEN;
1243     G.schemaPath = strdup(SCHEMAFILE);
1244     G.omit_aligned_reads = true;
1245     G.omit_reference_reads = true;
1246     G.minMapQual = 0; /* accept all */
1247     G.tmpfs = strdup("/tmp");
1248     G.cache_size = ((size_t)16) << 30;
1249     G.maxErrCount = 1000;
1250     G.minMatchCount = 10;
1251 
1252     set_pid();
1253 
1254     for (arglast = 1; arglast < argc; ++arglast) {
1255         if (strcmp(argv[arglast], "--remap") == 0) {
1256             argv[arglast] = argv[0];
1257             G.globalMode = mode_Remap;
1258             rc = main_1(arglast - argfirst, argv + argfirst, true, load);
1259             if (rc)
1260                 break;
1261             G.mode = mode_Remap;
1262             argfirst = arglast;
1263             ++load;
1264         }
1265     }
1266     rc = main_1(arglast - argfirst, argv + argfirst, false, load);
1267     XMLLogger_Release(logger);
1268     cleanupGlobal();
1269     return rc;
1270 }
1271