1 static char rcsid[] = "$Id: gsnap.c 222931 2020-06-29 16:10:56Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #ifdef USE_MPI
7 #include <mpi.h>
8 #include "mpidebug.h"
9 #endif
10 
11 #ifdef HAVE_SYS_TYPES_H
12 #include <sys/types.h>		/* Needed to define pthread_t on Solaris */
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>		/* For strcpy */
18 #include <strings.h>		/* For rindex */
19 #include <ctype.h>
20 #include <math.h>		/* For rint */
21 #ifdef HAVE_SSE2
22 #include <emmintrin.h>
23 #endif
24 #ifdef HAVE_SSE4_1
25 #include <smmintrin.h>
26 #endif
27 #if !defined(HAVE_SSE4_2)
28 /* Skip popcnt */
29 #elif defined(HAVE_POPCNT)
30 #include <immintrin.h>
31 #endif
32 
33 #if !defined(HAVE_SSE4_2)
34 /* Skip mm_popcnt */
35 #elif defined(HAVE_MM_POPCNT)
36 #include <nmmintrin.h>
37 #endif
38 
39 
40 #if !defined(HAVE_SSE4_2)
41 /* Skip lzcnt/tzcnt */
42 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
43 #include <immintrin.h>
44 #endif
45 
46 #ifdef HAVE_PTHREAD
47 #include <pthread.h>
48 #endif
49 
50 #ifdef HAVE_ZLIB
51 #include <zlib.h>
52 #define GZBUFFER_SIZE 131072
53 #endif
54 
55 #ifdef HAVE_BZLIB
56 #include "bzip2.h"
57 #endif
58 
59 #include <signal.h>
60 
61 #include "assert.h"
62 #include "except.h"
63 #include "mem.h"
64 #include "bool.h"
65 #include "types.h"
66 #include "univcoord.h"
67 #include "fopen.h"
68 #include "filesuffix.h"
69 
70 #include "mode.h"
71 #include "shortread.h"		/* For Shortread_setup */
72 #include "stopwatch.h"
73 #include "transcriptome.h"
74 #include "transcript.h"		/* For Transcript_setup */
75 #include "genome.h"
76 #include "genome128_hr.h"	/* For Genome_hr_setup */
77 #include "genome128_consec.h"	/* For Genome_consec_setup */
78 #include "genome_sites.h"	/* For Genome_sites_setup */
79 #include "maxent_hr.h"		/* For Maxent_hr_setup */
80 #include "knownsplicing.h"
81 #include "mapq.h"
82 #include "substring.h"
83 #include "stage3hr.h"
84 #include "concordance.h"	/* For Concordance_setup */
85 #include "splice.h"		/* For Splice_setup */
86 #include "oligo.h"		/* For Oligo_setup */
87 /* #include "oligoindex_hr.h" */	/* For Oligoindex_hr_setup */
88 /* #include "oligoindex_localdb.h" --  For Oligoindex_local_setup */
89 /* #include "pairpool.h" */
90 /* #include "diagpool.h" */
91 /* #include "cellpool.h" */
92 #include "listpool.h"
93 #include "univdiagpool.h"
94 #include "hitlistpool.h"
95 
96 /* #include "stage2.h" */		/* For Stage2_setup */
97 #include "path-solve.h"
98 #include "kmer-search.h"
99 #include "extension-search.h"
100 #include "segment-search.h"
101 #include "terminal.h"
102 #include "distant-rna.h"
103 #include "distant-dna.h"
104 #include "indel.h"		/* For Indel_setup */
105 #include "stage1hr.h"
106 #include "indexdb.h"
107 #include "regiondb.h"
108 #include "resulthr.h"
109 #include "request.h"
110 #include "intlist.h"
111 #include "list.h"
112 #include "iit-read.h"
113 #include "iit-read-univ.h"
114 #include "datadir.h"
115 #include "samprint.h"		/* For SAM_setup */
116 #include "cigar.h"		/* For Cigar_setup */
117 
118 #include "filestring.h"
119 #include "output.h"
120 #include "inbuffer.h"
121 #include "outbuffer.h"
122 #ifdef USE_MPI
123 #include "master.h"
124 #endif
125 
126 #include "simplepair.h"
127 #include "getopt.h"
128 
129 
130 #define ONE_GB 1000000000
131 
132 #define MIN_INDEXDB_SIZE_THRESHOLD 1000
133 
134 #define MAX_QUERYLENGTH_FOR_ALLOC    100000
135 #define MAX_GENOMICLENGTH_FOR_ALLOC 1000000
136 
137 
138 /* MPI Processing */
139 #ifdef DEBUGM
140 #define debugm(x) x
141 #else
142 #define debugm(x)
143 #endif
144 
145 /* File open/close.  Want to turn on in shortread.c also. */
146 #ifdef DEBUGF
147 #define debugf(x) x
148 #else
149 #define debugf(x)
150 #endif
151 
152 
153 #ifdef DEBUG
154 #define debug(x) x
155 #else
156 #define debug(x)
157 #endif
158 
159 
160 /************************************************************************
161  *   GMAP parameters
162  ************************************************************************/
163 
164 /* GMAP_IMPROVEMENT shouldn't be necessary anymore with localdb */
165 /* Might reinstate GMAP_ENDS if we move localdb to GMAP */
166 #if 0
167 static int gmap_mode = GMAP_IMPROVEMENT;  /* GMAP_PAIRSEARCH | GMAP_ENDS | GMAP_IMPROVEMENT; */
168 static int gmap_min_nconsecutive = 20;
169 static int nullgap = 600;
170 static int maxpeelback = 20;	/* Now controlled by defect_rate */
171 static int maxpeelback_distalmedial = 24;
172 static int extramaterial_end = 10;
173 static int extramaterial_paired = 8;
174 static int sufflookback = 60;
175 static int nsufflookback = 5;
176 static int extraband_single = 3;
177 static int extraband_end = 3;  /* Shouldn't differ from 0, since onesidegapp is true? */
178 static int extraband_paired = 7;
179 static int ngap = 3;  /* 0? */
180 
181 static int max_gmap_pairsearch = 50; /* Will perform GMAP on up to this many hits5 or hits3 */
182 static int max_gmap_terminal = 50;   /* Will perform GMAP on up to this many terminals5 or terminals3 */
183 static int max_gmap_improvement = 5;
184 
185 static int trigger_score_for_gmap = 5;
186 static int gmap_allowance = 3;
187 /* static int trigger_score_for_terminals = 5; -- obsolete */
188 #endif
189 
190 static int min_intronlength = 9;
191 /* static int max_deletionlength = 50; -- Now depends on querylength */
192 
193 
194 /************************************************************************
195  *   Global parameters
196  ************************************************************************/
197 
198 /* Transcriptome */
199 static Transcriptome_T transcriptome = NULL;
200 static Univ_IIT_T transcript_iit = NULL;
201 static Trcoord_T transcriptomelength = 0;
202 static int ntranscripts = 0;
203 
204 /* Genome */
205 static Univ_IIT_T chromosome_iit = NULL;
206 static int circular_typeint = -1;
207 static Univcoord_T genomelength;
208 static int nchromosomes = 0;
209 static bool *circularp = NULL;
210 static bool any_circular_p;
211 
212 static bool *altlocp = NULL;
213 static Univcoord_T *alias_starts = NULL;
214 static Univcoord_T *alias_ends = NULL;
215 
216 static Indexdb_T indexdb = NULL;
217 static Indexdb_T indexdb2 = NULL; /* For cmet or atoi */
218 static Indexdb_T indexdb_tr = NULL;
219 
220 static bool user_localdb_p = false;
221 static bool use_localdb_p = false;
222 static Regiondb_T regiondb = NULL;
223 static Regiondb_T regiondb2 = NULL;
224 
225 static Genome_T genomecomp = NULL;
226 static Genome_T genomecomp_alt = NULL;
227 static Genome_T genomebits = NULL;
228 static Genome_T genomebits_alt = NULL;
229 
230 static Genome_T transcriptomebits = NULL;
231 static Genome_T transcriptomebits_alt = NULL;
232 
233 static bool use_only_transcriptome_p = false;
234 
235 #if 0
236 static char STANDARD_CHARTABLE[4] = {'A','C','G','T'};
237 static char CMET_FWD_CHARTABLE[4] = {'A','T','G','T'}; /* CT */
238 static char CMET_REV_CHARTABLE[4] = {'A','C','A','T'}; /* GA */
239 static char ATOI_FWD_CHARTABLE[4] = {'G','C','G','T'};     /* AG */
240 static char ATOI_REV_CHARTABLE[4] = {'A','C','G','C'};     /* TC */
241 #endif
242 
243 
244 static bool fastq_format_p = false;
245 static bool want_random_p = true; /* randomize among equivalent scores */
246 static Stopwatch_T stopwatch = NULL;
247 
248 /************************************************************************
249  *   Program options
250  ************************************************************************/
251 
252 /* Input options */
253 static char *user_transcriptomedir = NULL;
254 static char *transcriptome_dbroot = NULL;
255 
256 static char *user_genomedir = NULL;
257 static char *genome_dbroot = NULL;
258 
259 static int part_modulus = 0;
260 static int part_interval = 1;
261 static int barcode_length = 0;
262 static int endtrim_length = 0;
263 static bool invert_first_p = false;
264 static bool invert_second_p = true;
265 static int acc_fieldi_start = 0;
266 static int acc_fieldi_end = 0;
267 static bool force_single_end_p = false;
268 static bool filter_chastity_p = false;
269 static bool allow_paired_end_mismatch_p = false;
270 static bool filter_if_both_p = false;
271 
272 static char *read_files_command = NULL;
273 static bool gunzip_p = false;
274 static bool bunzip2_p = false;
275 static bool interleavedp = false;
276 
277 /* Compute options */
278 static bool chop_primers_p = false;
279 static bool query_unk_mismatch_p = false;
280 static bool genome_unk_mismatch_p = true;
281 static bool novelsplicingp = false;
282 static bool splicingp = false;
283 static bool find_dna_chimeras_p = true; /* On by default, because it improves even RNA-Seq reads */
284 
285 static bool user_trim_indel_score_p = false;
286 static int trim_indel_score = -2; /* was -4 */
287 
288 
289 static bool multiple_sequences_p;
290 static bool sharedp = false;
291 static bool preload_shared_memory_p = false;
292 static bool unload_shared_memory_p = false;
293 static bool expand_offsets_p = false;
294 
295 #ifdef HAVE_MMAP
296 /* Level 4 is now default */
297 static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
298 static Access_mode_T positions_access = USE_ALLOCATE;
299 static Access_mode_T regiondb_access = USE_ALLOCATE;
300 static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
301 static Access_mode_T locpositions_access = USE_ALLOCATE;
302 
303 static Access_mode_T genome_access = USE_ALLOCATE;
304 
305 #else
306 static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
307 static Access_mode_T positions_access = USE_ALLOCATE;
308 static Access_mode_T regiondb_access = USE_ALLOCATE;
309 static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
310 static Access_mode_T locpositions_access = USE_ALLOCATE;
311 
312 static Access_mode_T genome_access = USE_ALLOCATE;
313 
314 #endif
315 
316 static int pairmax_transcriptome;
317 static int pairmax_linear;
318 static int pairmax_circular;
319 static int pairmax_dna = 2000;
320 static int pairmax_rna = 200000;
321 static int expected_pairlength = 500;
322 static int pairlength_deviation = 100;
323 
324 #ifdef USE_MPI
325 static int nranks, n_slave_ranks, myid, provided;
326 static int exclude_ranks[1];
327 static MPI_Comm workers_comm;
328 static MPI_Group world_group, workers_group;
329 static int nthreads0;
330 static bool master_is_worker_p = false; /* default behavior */
331 #endif
332 
333 #ifdef HAVE_PTHREAD
334 static pthread_t output_thread_id, *worker_thread_ids;
335 #ifdef USE_MPI
336 static pthread_t write_stdout_thread_id, parser_thread_id, mpi_interface_thread_id;
337 #endif
338 static pthread_key_t global_request_key;
339 static int nthreads = 1;	/* (int) sysconf(_SC_NPROCESSORS_ONLN) */
340 #else
341 static int nthreads = 0;	/* (int) sysconf(_SC_NPROCESSORS_ONLN) */
342 #endif
343 
344 /* static Masktype_T masktype = MASK_REPETITIVE; */
345 static int subopt_levels = 0;
346 
347 /* If negative, then hasn't been specified by user.  If between 0 and
348    1, then treated as a fraction of the querylength.  Else, treated as
349    an integer */
350 static double user_mismatches_refalt_float = -1.0;
351 static double user_mismatches_ref_float = -1.0;
352 static double user_mincoverage_float = 0.5;
353 
354 /* Really have only one indel penalty */
355 static int indel_penalty_middle = 2;
356 static int indel_penalty_end = 2;
357 
358 static bool allow_end_indels_p = true;
359 static double max_middle_insertions_float = 0.2;
360 static double max_middle_deletions_float= 0.2;
361 static int max_end_insertions = 3;
362 static int max_end_deletions = 6;
363 static int min_indel_end_matches = 4;
364 static Chrpos_T shortsplicedist = 200000;
365 static Chrpos_T shortsplicedist_known;
366 static Chrpos_T shortsplicedist_novelend = 80000;
367 static int localsplicing_penalty = 0;
368 static int distantsplicing_penalty = 1;
369 static int min_distantsplicing_end_matches = 20; /* Should be about index1part */
370 static double min_distantsplicing_identity = 0.95;
371 static int min_shortend = 2;
372 /* static bool find_novel_doublesplices_p = true; */
373 static int antistranded_penalty = 0; /* Most RNA-Seq is non-stranded */
374 
375 static Width_T index1part = 15;
376 static Width_T index1part_tr = 15;
377 static Width_T required_index1part = 0;
378 static Width_T index1interval;
379 static Width_T index1interval_tr;
380 static Width_T required_index1interval = 0;
381 
382 static Width_T local1part = 8;
383 static Width_T required_local1part = 0;
384 static Width_T local1interval;
385 static Width_T required_local1interval = 0;
386 
387 static Width_T region1part = 6;
388 static Width_T required_region1part = 0;
389 static Width_T region1interval;
390 static Width_T required_region1interval = 0;
391 
392 static int indexdb_size_threshold;
393 
394 /* Option for GSNAP complete set algorithm.  Affects speed significantly. */
395 static int max_anchors = 10;
396 
397 
398 /* Genes IIT */
399 static char *genes_file = (char *) NULL;
400 static IIT_T genes_iit = NULL;	    /* For genome alignment */
401 static int *genes_divint_crosstable = NULL;
402 static int *genes_chrnum_crosstable = NULL;
403 static bool favor_multiexon_p = false;
404 
405 
406 /* Splicing IIT */
407 static bool knownsplicingp = false;
408 static bool distances_observed_p = false;
409 static char *user_splicingdir = (char *) NULL;
410 static char *splicing_file = (char *) NULL;
411 static IIT_T splicing_iit = NULL;
412 static bool amb_clip_p = true;
413 
414 static int donor_typeint = -1;		/* for splicing_iit */
415 static int acceptor_typeint = -1;	/* for splicing_iit */
416 
417 static int *splicing_divint_crosstable = NULL;
418 static Genomecomp_T *splicecomp = NULL;
419 static Univcoord_T *splicesites = NULL;
420 static Splicetype_T *splicetypes = NULL;
421 static Chrpos_T *splicedists = NULL; /* maximum observed splice distance for given splice site */
422 static int nsplicesites = 0;
423 
424 
425 /* Cmet and AtoI */
426 static char *user_modedir = NULL;  /* user_cmetdir, user_atoidir */
427 static Mode_T mode = STANDARD;
428 
429 /* Masked (e.g., introns) genome */
430 static char *masked_root = (char *) NULL;
431 static bool maskedp = false;
432 
433 
434 /* SNPs IIT */
435 static char *user_snpsdir = NULL;
436 static char *snps_root = (char *) NULL;
437 static IIT_T snps_iit = NULL;
438 static int *snps_divint_crosstable = NULL;
439 
440 
441 /* Tally IIT */
442 static char *user_tallydir = NULL;
443 static char *tally_root = (char *) NULL;
444 static IIT_T tally_iit = NULL;
445 static int *tally_divint_crosstable = NULL;
446 
447 static char *user_runlengthdir = NULL;
448 static char *runlength_root = (char *) NULL;
449 static IIT_T runlength_iit = NULL;
450 static int *runlength_divint_crosstable = NULL;
451 
452 
453 /* Output options */
454 static unsigned int output_buffer_size = 1000;
455 static Outputtype_T output_type = STD_OUTPUT;
456 
457 /* For Illumina, subtract 64.  For Sanger, subtract 33. */
458 /* static int quality_score_adj = 64;  -- Stored in mapq.c */
459 
460 static bool user_quality_score_adj = false;
461 static bool user_quality_shift = false;
462 static int quality_shift = 0;   /* For printing, may want -31 */
463 
464 static bool exception_raise_p = true;
465 static bool add_paired_nomappers_p = false;
466 static bool paired_flag_means_concordant_p = false;
467 static bool quiet_if_excessive_p = false;
468 static int maxpaths_search = 1000;
469 static int maxpaths_report = 100;
470 static bool orderedp = false;
471 static bool failsonlyp = false;
472 static bool nofailsp = false;
473 
474 static bool print_nsnpdiffs_p = false;
475 static bool print_snplabels_p = false;
476 
477 static bool show_refdiff_p = false;
478 static bool clip_overlap_p = false;
479 static bool merge_overlap_p = false;
480 static bool merge_samechr_p = false;
481 static bool method_print_p = false;
482 
483 /* SAM */
484 static int sam_headers_batch = -1;
485 static bool sam_headers_p = true;
486 static bool sam_hardclip_use_S_p = false;
487 static bool sam_insert_0M_p = false;
488 static bool sam_cigar_extended_p = false;
489 static bool sam_multiple_primaries_p = false;
490 static bool sam_sparse_secondaries_p = false;
491 static char *sam_read_group_id = NULL;
492 static char *sam_read_group_name = NULL;
493 static char *sam_read_group_library = NULL;
494 static char *sam_read_group_platform = NULL;
495 static bool force_xs_direction_p = false;
496 static bool md_lowercase_variant_p = false;
497 static bool hide_soft_clips_p = false;
498 static Cigar_action_T cigar_action = CIGAR_ACTION_WARNING;
499 
500 static bool filter_within_trims_p;
501 static bool specified_filter_within_trims_p = false;
502 static bool user_filter_within_trims_p;
503 
504 static bool omit_concordant_uniq_p = false;
505 static bool omit_concordant_mult_p = false;
506 
507 
508 /* Input/output */
509 static char *split_output_root = NULL;
510 static char *output_file = NULL;
511 static char *failedinput_root = NULL;
512 static bool appendp = false;
513 static Outbuffer_T outbuffer;
514 static Inbuffer_T inbuffer;
515 static unsigned int inbuffer_nspaces = 1000;
516 static bool timingp = false;
517 static bool unloadp = false;
518 
519 
520 /* Alignment options */
521 static bool uncompressedp = false;
522 
523 /* getopt used alphabetically: AaBCcDdEeGgiJjKklMmNnOoQqstVvwYyZz7 */
524 
525 static struct option long_options[] = {
526   /* Input options */
527   {"transcriptdir", required_argument, 0, 'C'},	/* user_transcriptomedir */
528   {"transcriptdb", required_argument, 0, 'c'}, /* transcriptome_dbroot */
529   {"dir", required_argument, 0, 'D'},	/* user_genomedir */
530   {"db", required_argument, 0, 'd'}, /* genome_dbroot */
531   {"use-localdb", required_argument, 0, 0}, /* user_localdb_p, use_localdb_p */
532   {"use-transcriptome-only", no_argument, 0, 0}, /* use_only_transcriptome_p */
533   {"kmer", required_argument, 0, 'k'}, /* required_index1part, index1part */
534   {"sampling", required_argument, 0, 0}, /* required_index1interval, index1interval */
535   {"genomefull", no_argument, 0, 'G'}, /* uncompressedp */
536   {"part", required_argument, 0, 'q'}, /* part_modulus, part_interval */
537   {"orientation", required_argument, 0, 0}, /* invert_first_p, invert_second_p */
538   {"input-buffer-size", required_argument, 0, 0}, /* inbuffer_nspaces */
539   {"barcode-length", required_argument, 0, 0},	  /* barcode_length */
540   {"endtrim-length", required_argument, 0, 0},	  /* endtrim_length */
541   {"fastq-id-start", required_argument, 0, 0},	  /* acc_fieldi_start */
542   {"fastq-id-end", required_argument, 0, 0},	  /* acc_fieldi_end */
543   {"force-single-end", no_argument, 0, 0},	  /* force_single_end_p */
544   {"filter-chastity", required_argument, 0, 0},	/* filter_chastity_p, filter_if_both_p */
545   {"allow-pe-name-mismatch", no_argument, 0, 0}, /* allow_paired_end_mismatch_p */
546 
547   {"read-files-command", required_argument, 0, 0}, /* read_files_command */
548 #ifdef HAVE_ZLIB
549   {"gunzip", no_argument, 0, 0}, /* gunzip_p */
550 #endif
551 
552 #ifdef HAVE_BZLIB
553   {"bunzip2", no_argument, 0, 0}, /* bunzip2_p */
554 #endif
555 
556   {"interleaved", no_argument, 0, 0}, /* interleavedp */
557 
558   /* Compute options */
559   {"use-shared-memory", required_argument, 0, 0}, /* sharedp */
560   {"preload-shared-memory", no_argument, 0, 0},	  /* preload_shared_memory_p */
561   {"unload-shared-memory", no_argument, 0, 0},	  /* unload_shared_memory_p */
562 #ifdef HAVE_MMAP
563   {"batch", required_argument, 0, 'B'}, /* offsetsstrm_access, positions_access, genome_access */
564 #endif
565   {"expand-offsets", required_argument, 0, 0}, /* expand_offsets_p */
566   {"pairmax-dna", required_argument, 0, 0}, /* pairmax_dna */
567   {"pairmax-rna", required_argument, 0, 0}, /* pairmax_rna */
568   {"pairexpect", required_argument, 0, 0},  /* expected_pairlength */
569   {"pairdev", required_argument, 0, 0},  /* pairlength_deviation */
570 
571   {"nthreads", required_argument, 0, 't'}, /* nthreads */
572   {"adapter-strip", required_argument, 0, 'a'},	/* chop_primers_p */
573 
574   {"query-unk-mismatch", required_argument, 0, 0}, /* query_unk_mismatch_p */
575   {"genome-unk-mismatch", required_argument, 0, 0}, /* genome_unk_mismatch_p */
576 
577   {"trim-mismatch-score", required_argument, 0, 0}, /* trim_mismatch_score, user_trim_mismatch_score_p */
578   {"trim-indel-score", required_argument, 0, 0}, /* trim_indel_score, user_trim_mismatch_score_p */
579   {"novelsplicing", required_argument, 0, 'N'}, /* novelsplicingp */
580   {"find-dna-chimeras", required_argument, 0, 0}, /* find_dna_chimeras */
581 
582   {"max-mismatches", required_argument, 0, 'm'}, /* user_mismatches_refalt_float */
583   {"max-ref-mismatches", required_argument, 0, 0}, /* user_mismatches_ref_float */
584   {"min-coverage", required_argument, 0, 0}, /* user_mincoverage_float */
585   {"filter-within-trims", required_argument, 0, 0}, /* filter_within_trims_p */
586 
587 #if 0
588   {"indel-penalty-middle", required_argument, 0, 'i'}, /* indel_penalty_middle */
589   {"indel-penalty-end", required_argument, 0, 'I'}, /* indel_penalty_end */
590 #else
591   {"indel-penalty", required_argument, 0, 'i'}, /* indel_penalty_middle, indel_penalty_end */
592 #endif
593 
594   {"indel-endlength", required_argument, 0, 0}, /* min_indel_end_matches, allow_end_indels_p */
595 
596   {"max-middle-insertions", required_argument, 0, 'y'}, /* max_middle_insertions_float */
597   {"max-middle-deletions", required_argument, 0, 'z'}, /* max_middle_deletions_float */
598   {"max-end-insertions", required_argument, 0, 'Y'}, /* max_end_insertions */
599   {"max-end-deletions", required_argument, 0, 'Z'}, /* max_end_deletions */
600   {"suboptimal-levels", required_argument, 0, 'M'}, /* subopt_levels */
601 
602   {"localsplicedist", required_argument, 0, 'w'}, /* shortsplicedist */
603   {"novelend-splicedist", required_argument, 0, 0}, /* shortsplicedist_novelend */
604   {"splicingdir", required_argument, 0, 0},	  /* user_splicingdir */
605   {"use-splicing", required_argument, 0, 's'}, /* splicing_iit, knownsplicingp, find_dna_chimeras_p */
606   {"ambig-splice-noclip", no_argument, 0, 0},  /* amb_clip_p */
607   {"genes", required_argument, 0, 'g'}, /* genes_iit */
608   {"favor-multiexon", no_argument, 0, 0}, /* favor_multiexon_p */
609   {"end-detail", required_argument, 0, 0}, /* end_detail */
610 
611   {"local-splice-penalty", required_argument, 0, 0}, /* localsplicing_penalty */
612   {"distant-splice-penalty", required_argument, 0, 'E'}, /* distantsplicing_penalty */
613   {"distant-splice-endlength", required_argument, 0, 'K'}, /* min_distantsplicing_end_matches */
614   {"shortend-splice-endlength", required_argument, 0, 'l'}, /* min_shortend */
615   {"distant-splice-identity", required_argument, 0, 0}, /* min_distantsplicing_identity */
616   {"antistranded-penalty", required_argument, 0, 0},	    /* antistranded_penalty */
617   {"merge-distant-samechr", no_argument, 0, 0},		    /* merge_samechr_p */
618 
619   {"cmetdir", required_argument, 0, 0}, /* user_modedir */
620   {"atoidir", required_argument, 0, 0}, /* user_modedir */
621   {"mode", required_argument, 0, 0}, /* mode */
622 
623   {"use-mask", required_argument, 0, 'e'}, /* masked_root */
624   {"snpsdir", required_argument, 0, 'V'},   /* user_snpsdir */
625   {"use-snps", required_argument, 0, 'v'}, /* snps_root */
626 
627   {"tallydir", required_argument, 0, 0},   /* user_tallydir */
628   {"use-tally", required_argument, 0, 0}, /* tally_root */
629 
630   {"runlengthdir", required_argument, 0, 0},   /* user_runlengthdir */
631   {"use-runlength", required_argument, 0, 0}, /* runlength_root */
632 
633   {"max-anchors", required_argument, 0, 0}, /* max_anchors */
634   {"gmap-mode", required_argument, 0, 0}, /* gmap_mode */
635   {"trigger-score-for-gmap", required_argument, 0, 0}, /* trigger_score_for_gmap */
636   {"gmap-min-match-length", required_argument, 0, 0},      /* gmap_min_nconsecutive */
637   {"gmap-allowance", required_argument, 0, 0}, /* gmap_allowance */
638   {"max-gmap-pairsearch", required_argument, 0, 0}, /* max_gmap_pairsearch */
639   {"max-gmap-terminal", required_argument, 0, 0}, /* max_gmap_terminal */
640   {"max-gmap-improvement", required_argument, 0, 0}, /* max_gmap_improvement */
641 
642   /* Output options */
643   {"output-buffer-size", required_argument, 0, 0}, /* output_buffer_size */
644   {"format", required_argument, 0, 'A'}, /* output_type */
645 
646   {"quality-protocol", required_argument, 0, 0}, /* quality_score_adj, quality_shift */
647   {"quality-zero-score", required_argument, 0, 'J'}, /* quality_score_adj */
648   {"quality-print-shift", required_argument, 0, 'j'}, /* quality_shift */
649   {"sam-headers-batch", required_argument, 0, 0},	/* sam_headers_batch */
650   {"no-sam-headers", no_argument, 0, 0},	/* sam_headers_p */
651   {"sam-hardclip-use-S", no_argument, 0, 0},		/* sam_hardclip_use_S_p */
652   {"sam-use-0M", no_argument, 0, 0},		/* sam_insert_0M_p */
653   {"sam-extended-cigar", no_argument, 0, 0},	/* sam_cigar_extended_p */
654   {"sam-multiple-primaries", no_argument, 0, 0}, /* sam_multiple_primaries_p */
655   {"sam-sparse-secondaries", no_argument, 0, 0}, /* sam_sparse_secondaries_p */
656   {"read-group-id", required_argument, 0, 0},	/* sam_read_group_id */
657   {"read-group-name", required_argument, 0, 0},	/* sam_read_group_name */
658   {"read-group-library", required_argument, 0, 0},	/* sam_read_group_library */
659   {"read-group-platform", required_argument, 0, 0},	/* sam_read_group_platform */
660   {"force-xs-dir", no_argument, 0, 0},			/* force_xs_direction_p */
661   {"md-lowercase-snp", no_argument, 0, 0},		/* md_lowercase_variant_p */
662   {"extend-soft-clips", no_argument, 0, 0},		/* hide_soft_clips_p */
663   {"action-if-cigar-error", required_argument, 0, 0},	/* cigar_action */
664 
665   {"noexceptions", no_argument, 0, '0'}, /* exception_raise_p */
666 #if 0
667   {"maxsearch", required_argument, 0, 0}, /* maxpaths_search */
668 #endif
669   {"npaths", required_argument, 0, 'n'}, /* maxpaths_report */
670   {"add-paired-nomappers", no_argument, 0, 0}, /* add_paired_nomappers_p */
671   {"paired-flag-means-concordant", required_argument, 0, 0}, /* paired_flag_means_concordant_p */
672   {"quiet-if-excessive", no_argument, 0, 'Q'}, /* quiet_if_excessive_p */
673   {"ordered", no_argument, 0, 'O'}, /* orderedp */
674   {"clip-overlap", no_argument, 0, 0},	     /* clip_overlap_p */
675   {"merge-overlap", no_argument, 0, 0},	     /* merge_overlap_p */
676   {"show-method", no_argument, 0, 0},	     /* method_print_p */
677 
678   {"show-refdiff", no_argument, 0, 0},	       /* show_refdiff_p */
679   {"print-snps", no_argument, 0, 0}, /* print_snplabels_p */
680   {"failsonly", no_argument, 0, 0}, /* failsonlyp */
681   {"nofails", no_argument, 0, 0}, /* nofailsp */
682   {"output-file", required_argument, 0, 'o'}, /* output_file */
683   {"split-output", required_argument, 0, 0}, /* split_output_root */
684   {"failed-input", required_argument, 0, 0}, /* failed_input_root */
685   {"append-output", no_argument, 0, 0},	     /* appendp */
686 
687   {"order-among-best", required_argument, 0, 0}, /* want_random_p */
688 
689   {"omit-concordant-uniq", no_argument, 0, 0}, /* omit_concordant_uniq_p */
690   {"omit-concordant-mult", no_argument, 0, 0}, /* omit_concordant_mult_p */
691 
692   /* Diagnostic options */
693   {"time", no_argument, 0, 0},	/* timingp */
694   {"unload", no_argument, 0, 0},	/* unloadp */
695 
696   /* Obsolete, but included for backward compatibility */
697   {"use-sarray", required_argument, 0, 0},
698   {"terminal-threshold", required_argument, 0, 0},
699 
700   /* Help options */
701   {"check", no_argument, 0, 0}, /* check_compiler_assumptions */
702   {"version", no_argument, 0, 0}, /* print_program_version */
703   {"help", no_argument, 0, 0}, /* print_program_usage */
704   {0, 0, 0, 0}
705 };
706 
707 
708 static void
print_program_version()709 print_program_version () {
710   char *genomedir;
711 
712   fprintf(stdout,"\n");
713   fprintf(stdout,"GSNAP: Genomic Short Nucleotide Alignment Program\n");
714   fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
715   fprintf(stdout,"Build target: %s\n",TARGET);
716   fprintf(stdout,"Features: ");
717 #ifdef HAVE_PTHREAD
718   fprintf(stdout,"pthreads enabled, ");
719 #else
720   fprintf(stdout,"no pthreads, ");
721 #endif
722 #ifdef HAVE_ALLOCA
723   fprintf(stdout,"alloca available, ");
724 #else
725   fprintf(stdout,"no alloca, ");
726 #endif
727 #ifdef HAVE_ZLIB
728   fprintf(stdout,"zlib available, ");
729 #else
730   fprintf(stdout,"no zlib, ");
731 #endif
732 #ifdef HAVE_MMAP
733   fprintf(stdout,"mmap available, ");
734 #else
735   fprintf(stdout,"no mmap, ");
736 #endif
737 #ifdef WORDS_BIGENDIAN
738   fprintf(stdout,"bigendian, ");
739 #else
740   fprintf(stdout,"littleendian, ");
741 #endif
742 #ifdef HAVE_SIGACTION
743   fprintf(stdout,"sigaction available, ");
744 #else
745   fprintf(stdout,"no sigaction, ");
746 #endif
747 #ifdef HAVE_64_BIT
748   fprintf(stdout,"64 bits available");
749 #else
750   fprintf(stdout,"64 bits not available");
751 #endif
752   fprintf(stdout,"\n");
753 
754   fprintf(stdout,"Popcnt:");
755 #ifdef HAVE_POPCNT
756   fprintf(stdout," popcnt/lzcnt/tzcnt");
757 #endif
758 #ifdef HAVE_MM_POPCNT
759   fprintf(stdout," mm_popcnt");
760 #endif
761 #ifdef HAVE_BUILTIN_POPCOUNT
762   fprintf(stdout," builtin_popcount");
763 #endif
764   fprintf(stdout,"\n");
765 
766   fprintf(stdout,"Builtin functions:");
767 #ifdef HAVE_BUILTIN_CLZ
768   fprintf(stdout," builtin_clz");
769 #endif
770 #ifdef HAVE_BUILTIN_CTZ
771   fprintf(stdout," builtin_ctz");
772 #endif
773 #ifdef HAVE_BUILTIN_POPCOUNT
774   fprintf(stdout," builtin_popcount");
775 #endif
776   fprintf(stdout,"\n");
777 
778   fprintf(stdout,"SIMD functions compiled:");
779 #ifdef HAVE_ALTIVEC
780   fprintf(stdout," Altivec");
781 #endif
782 #ifdef HAVE_MMX
783   fprintf(stdout," MMX");
784 #endif
785 #ifdef HAVE_SSE
786   fprintf(stdout," SSE");
787 #endif
788 #ifdef HAVE_SSE2
789   fprintf(stdout," SSE2");
790 #endif
791 #ifdef HAVE_SSE3
792   fprintf(stdout," SSE3");
793 #endif
794 #ifdef HAVE_SSSE3
795   fprintf(stdout," SSSE3");
796 #endif
797 #ifdef HAVE_SSE4_1
798   fprintf(stdout," SSE4.1");
799 #endif
800 #ifdef HAVE_SSE4_2
801   fprintf(stdout," SSE4.2");
802 #endif
803 #ifdef HAVE_AVX2
804   fprintf(stdout," AVX2");
805 #endif
806 #ifdef HAVE_AVX512
807   fprintf(stdout," AVX512");
808 #endif
809 #ifdef HAVE_AVX512BW
810   fprintf(stdout," AVX512BW");
811 #endif
812   fprintf(stdout,"\n");
813 
814 
815   fprintf(stdout,"Sizes: off_t (%d), size_t (%d), unsigned int (%d), long int (%d), long long int (%d)\n",
816 	  (int) sizeof(off_t),(int) sizeof(size_t),(int) sizeof(unsigned int),(int) sizeof(long int),(int) sizeof(long long int));
817   fprintf(stdout,"Default gmap directory (compiled): %s\n",GMAPDB);
818   genomedir = Datadir_find_genomedir(/*user_genomedir*/NULL);
819   fprintf(stdout,"Default gmap directory (environment): %s\n",genomedir);
820   FREE(genomedir);
821   fprintf(stdout,"Maximum stack read length: %d\n",MAX_STACK_READLENGTH);
822   fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
823   fprintf(stdout,"Contact: twu@gene.com\n");
824   fprintf(stdout,"\n");
825   return;
826 }
827 
828 /* This flag is not well-supported, and therefore hidden, but
829    kept for backwards compatibility */
830 /*  -R, --rel=STRING               Release\n\ */
831 
832 
833 static void
834 print_program_usage ();
835 
836 
837 static void
check_compiler_assumptions()838 check_compiler_assumptions () {
839   unsigned int x = rand(), y = rand();
840 #ifdef HAVE_SSE2
841   int z;
842   __m128i a;
843 #ifdef HAVE_SSE4_1
844   char negx, negy;
845 #endif
846 #endif
847 
848 #ifdef HAVE_SSE2
849   /* With -mavx, compiler may use assembly instructions for _mm_set1_epi32 that don't work on non-AVX machines */
850   fprintf(stderr,"Checking compiler assumptions for SSE2: ");
851   fprintf(stderr,"%08X %08X",x,y);
852   a = _mm_xor_si128(_mm_set1_epi32(x),_mm_set1_epi32(y));
853   z = _mm_cvtsi128_si32(a);
854   fprintf(stderr," xor=%08X\n",z);
855 #endif
856 
857 
858 #ifdef HAVE_SSE4_1
859   if ((negx = (char) x) > 0) {
860     negx = -negx;
861   }
862   if ((negy = (char) y) > 0) {
863     negy = -negy;
864   }
865 
866   fprintf(stderr,"Checking compiler assumptions for SSE4.1: ");
867   fprintf(stderr,"%d %d",negx,negy);
868   a = _mm_max_epi8(_mm_set1_epi8(negx),_mm_set1_epi8(negy));
869   z = _mm_extract_epi8(a,0);
870   fprintf(stderr," max=%d => ",z);
871   if (negx > negy) {
872     if (z == (int) negx) {
873       fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
874     } else {
875       fprintf(stderr,"compiler zero extends\n");
876     }
877   } else {
878     if (z == (int) negy) {
879       fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
880     } else {
881       fprintf(stderr,"compiler zero extends\n");
882     }
883   }
884 #endif
885 
886 
887 #ifdef HAVE_SSE4_2
888   fprintf(stderr,"Checking compiler assumptions for SSE4.2 options: ");
889   fprintf(stderr,"%08X ",x);
890 #ifdef HAVE_LZCNT
891   fprintf(stderr,"_lzcnt_u32=%d ",_lzcnt_u32(x));
892 #endif
893 #ifdef HAVE_BUILTIN_CLZ
894   fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
895 #endif
896 #ifdef HAVE_TZCNT
897   fprintf(stderr,"_tzcnt_u32=%d ",_tzcnt_u32(x));
898 #endif
899 #ifdef HAVE_BUILTIN_CTZ
900   fprintf(stderr,"__builtin_ctz=%d ",__builtin_ctz(x));
901 #endif
902 
903 #ifdef HAVE_POPCNT
904   fprintf(stderr,"_popcnt32=%d ",_popcnt32(x));
905 #endif
906 #if defined(HAVE_MM_POPCNT)
907   fprintf(stderr,"_mm_popcnt_u32=%d ",_mm_popcnt_u32(x));
908 #endif
909 #if defined(HAVE_BUILTIN_POPCOUNT)
910   fprintf(stderr,"__builtin_popcount=%d ",__builtin_popcount(x));
911 #endif
912 
913   fprintf(stderr,"\n");
914 #endif
915 
916   fprintf(stderr,"Finished checking compiler assumptions\n");
917 
918   return;
919 }
920 
921 
922 /************************************************************************/
923 
924 
925 static Filestring_T
process_request(Filestring_T * fp_failedinput,Filestring_T * fp_failedinput_1,Filestring_T * fp_failedinput_2,double * worker_runtime,Request_T request,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool,Stopwatch_T worker_stopwatch)926 process_request (Filestring_T *fp_failedinput, Filestring_T *fp_failedinput_1, Filestring_T *fp_failedinput_2,
927 		 double *worker_runtime, Request_T request,
928 #if 0
929 		 Oligoindex_array_T oligoindices_minor,
930 		 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
931 		 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
932 #endif
933 		 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
934 		 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool,
935 		 Stopwatch_T worker_stopwatch) {
936   Filestring_T fp;
937   Result_T result;
938   int jobid;
939   Shortread_T queryseq1, queryseq2;
940   Stage3end_T *stage3array, *stage3array5, *stage3array3;
941   Stage3pair_T *stage3pairarray;
942 
943   int npaths_primary, npaths_altloc, npaths5_primary, npaths5_altloc, npaths3_primary, npaths3_altloc, i;
944   int first_absmq, second_absmq, first_absmq5, second_absmq5, first_absmq3, second_absmq3;
945   Pairtype_T final_pairtype;
946 
947   jobid = Request_id(request);
948   queryseq1 = Request_queryseq1(request);
949   queryseq2 = Request_queryseq2(request);
950 
951 #if 0
952   Pairpool_reset(pairpool);
953   Diagpool_reset(diagpool);
954   Cellpool_reset(cellpool);
955 #endif
956   Intlistpool_reset(intlistpool);
957   Univcoordlistpool_reset(univcoordlistpool);
958   Listpool_reset(listpool);
959   Univdiagpool_reset(univdiagpool);
960   Hitlistpool_reset(hitlistpool);
961 
962   /* printf("%s\n",Shortread_accession(queryseq1)); */
963 
964   if (worker_stopwatch != NULL) {
965     Stopwatch_start(worker_stopwatch);
966   }
967 
968   if (queryseq2 == NULL) {
969     stage3array = Stage1_single_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
970 				     queryseq1,intlistpool,univcoordlistpool,listpool,univdiagpool,
971 				     hitlistpool);
972 
973     result = Result_single_read_new(jobid,(void **) stage3array,npaths_primary,npaths_altloc,first_absmq,second_absmq);
974     fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
975     *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
976     Result_free(&result);
977     return fp;
978 
979   } else if ((stage3pairarray = Stage1_paired_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,&final_pairtype,
980 						   &stage3array5,&npaths5_primary,&npaths5_altloc,&first_absmq5,&second_absmq5,
981 						   &stage3array3,&npaths3_primary,&npaths3_altloc,&first_absmq3,&second_absmq3,
982 						   queryseq1,queryseq2,(Chrpos_T) pairmax_linear,
983 						   intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool)) != NULL) {
984     /* Paired or concordant hits found */
985     result = Result_paired_read_new(jobid,(void **) stage3pairarray,npaths_primary,npaths_altloc,first_absmq,second_absmq,
986 				    final_pairtype);
987     fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
988     *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
989     Result_free(&result);
990     return fp;
991 
992   } else if (chop_primers_p == false || Shortread_chop_primers(queryseq1,queryseq2) == false) {
993     /* No paired or concordant hits found, and no adapters found */
994     /* Report ends as unpaired */
995     result = Result_paired_as_singles_new(jobid,(void **) stage3array5,npaths5_primary,npaths5_altloc,first_absmq5,second_absmq5,
996 					  (void **) stage3array3,npaths3_primary,npaths3_altloc,first_absmq3,second_absmq3);
997     fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
998     *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
999     Result_free(&result);
1000     return fp;
1001 
1002   } else {
1003     /* Try with potential primers chopped.  queryseq1 and queryseq2 altered by Shortread_chop_primers. */
1004     for (i = 0; i < npaths5_primary + npaths5_altloc; i++) {
1005       Stage3end_free(&(stage3array5[i]));
1006     }
1007     FREE_OUT(stage3array5);
1008 
1009     for (i = 0; i < npaths3_primary + npaths3_altloc; i++) {
1010       Stage3end_free(&(stage3array3[i]));
1011     }
1012     FREE_OUT(stage3array3);
1013 
1014     if ((stage3pairarray = Stage1_paired_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,&final_pairtype,
1015 					      &stage3array5,&npaths5_primary,&npaths5_altloc,&first_absmq5,&second_absmq5,
1016 					      &stage3array3,&npaths3_primary,&npaths3_altloc,&first_absmq3,&second_absmq3,
1017 					      queryseq1,queryseq2,(Chrpos_T) pairmax_linear,
1018 					      intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool)) != NULL) {
1019       /* Paired or concordant hits found, after chopping adapters */
1020       result = Result_paired_read_new(jobid,(void **) stage3pairarray,npaths_primary,npaths_altloc,first_absmq,second_absmq,
1021 				      final_pairtype);
1022 
1023     } else {
1024       /* No paired or concordant hits found, after chopping adapters */
1025       result = Result_paired_as_singles_new(jobid,(void **) stage3array5,npaths5_primary,npaths5_altloc,first_absmq5,second_absmq5,
1026 					    (void **) stage3array3,npaths3_primary,npaths3_altloc,first_absmq3,second_absmq3);
1027     }
1028 
1029     fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
1030     *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
1031     Result_free(&result);
1032     return fp;
1033   }
1034 }
1035 
1036 
1037 
1038 #ifdef HAVE_SIGACTION
1039 static const Except_T sigfpe_error = {"SIGFPE--arithmetic exception"};
1040 static const Except_T sigsegv_error = {"SIGSEGV--segmentation violation"};
1041 static const Except_T sigtrap_error = {"SIGTRAP--hardware fault"};
1042 static const Except_T misc_signal_error = {"Miscellaneous signal"};
1043 
1044 static void
signal_handler(int sig)1045 signal_handler (int sig) {
1046   Request_T request;
1047   Shortread_T queryseq1, queryseq2;
1048 
1049   if (sig == SIGPIPE) {
1050     /* Allow pipe */
1051     return;
1052   }
1053 
1054   switch (sig) {
1055   case SIGABRT: fprintf(stderr,"Signal received: SIGABRT\n"); break;
1056   case SIGFPE: fprintf(stderr,"Signal received: SIGFPE\n"); break;
1057   case SIGHUP: fprintf(stderr,"Signal received: SIGHUP\n"); break;
1058   case SIGILL:
1059     fprintf(stderr,"Signal received: SIGILL\n");
1060     fprintf(stderr,"An illegal instruction means that this program is being run on a computer\n");
1061     fprintf(stderr,"  with different features than the computer used to compile the program\n");
1062     fprintf(stderr,"You may need to re-compile the program on the same computer type as the target machine\n");
1063     fprintf(stderr,"  or re-compile with fewer features by doing something like\n");
1064     fprintf(stderr,"  ./configure --disable-simd\n");
1065     break;
1066   case SIGINT: fprintf(stderr,"Signal received: SIGINT\n"); break;
1067   case SIGPIPE: fprintf(stderr,"Signal received: SIGPIPE\n"); break;
1068   case SIGQUIT: fprintf(stderr,"Signal received: SIGQUIT\n"); break;
1069   case SIGSEGV: fprintf(stderr,"Signal received: SIGSEGV\n"); break;
1070   case SIGSYS: fprintf(stderr,"Signal received: SIGSYS\n"); break;
1071   case SIGTERM: fprintf(stderr,"Signal received: SIGTERM\n"); break;
1072   case SIGTRAP: fprintf(stderr,"Signal received: SIGTRAP\n"); break;
1073   case SIGXCPU: fprintf(stderr,"Signal received: SIGXCPU\n"); break;
1074   case SIGXFSZ: fprintf(stderr,"Signal received: SIGXFSZ\n"); break;
1075   }
1076 
1077   Access_emergency_cleanup();
1078 
1079 #if 0
1080   /* Appears to hang */
1081 #ifdef USE_MPI
1082   MPI_Barrier(MPI_COMM_WORLD);
1083 #endif
1084 #endif
1085 
1086 #ifdef HAVE_PTHREAD
1087   request = (Request_T) pthread_getspecific(global_request_key);
1088   if (request == NULL) {
1089     /* fprintf(stderr,"Unable to retrieve request for thread\n"); */
1090   } else {
1091     queryseq1 = Request_queryseq1(request);
1092     queryseq2 = Request_queryseq2(request);
1093     if (queryseq1 == NULL) {
1094       fprintf(stderr,"Unable to retrieve queryseq for request\n");
1095     } else {
1096       fprintf(stderr,"Problem sequence: ");
1097       fprintf(stderr,"%s (%d bp)\n",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1098       if (queryseq2 == NULL) {
1099 	Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1100       } else {
1101 	Shortread_stderr_query_pairedend_fasta(queryseq1,queryseq2,invert_first_p,invert_second_p);
1102       }
1103     }
1104   }
1105 #endif
1106 
1107   exit(9);
1108 
1109   return;
1110 }
1111 #endif
1112 
1113 
1114 /* #define POOL_FREE_INTERVAL 200 */
1115 #define POOL_FREE_INTERVAL 1
1116 
1117 
1118 static void
single_thread()1119 single_thread () {
1120   Request_T request;
1121   Filestring_T fp, fp_failedinput, fp_failedinput_1, fp_failedinput_2;
1122   Shortread_T queryseq1;
1123   Stopwatch_T worker_stopwatch;
1124 
1125   /* For GMAP */
1126   /* Oligoindex_array_T oligoindices_major, oligoindices_minor; */
1127   /* Dynprog_T dynprogL, dynprogM, dynprogR; */
1128 #if 0
1129   Pairpool_T pairpool;
1130   Diagpool_T diagpool;
1131   Cellpool_T cellpool;
1132 #endif
1133   Intlistpool_T intlistpool;
1134   Univcoordlistpool_T univcoordlistpool;
1135   Listpool_T listpool;
1136   Univdiagpool_T univdiagpool;
1137   Hitlistpool_T hitlistpool;
1138   int jobid = 0;
1139   double worker_runtime;
1140 
1141 #ifdef MEMUSAGE
1142   long int memusage_constant = 0, memusage;
1143   char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
1144 #endif
1145 
1146 #if 0
1147   oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1148   oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1149   dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1150 			 /*doublep*/true);
1151   dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1152 			 /*doublep*/false);
1153   dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1154 			 /*doublep*/true);
1155 #endif
1156 
1157 #if 0
1158   pairpool = Pairpool_new();
1159   diagpool = Diagpool_new();
1160   cellpool = Cellpool_new();
1161 #endif
1162   intlistpool = Intlistpool_new();
1163   univcoordlistpool = Univcoordlistpool_new();
1164   listpool = Listpool_new();
1165   univdiagpool = Univdiagpool_new();
1166   hitlistpool = Hitlistpool_new();
1167   worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
1168 
1169   /* Except_stack_create(); -- requires pthreads */
1170 
1171 #ifdef MEMUSAGE
1172   memusage_constant += Mem_usage_report_std_heap();
1173   Genomicpos_commafmt_fill(comma0,memusage_constant);
1174   Mem_usage_reset_heap_baseline(0);
1175 #endif
1176 
1177   while ((request = Inbuffer_get_request(inbuffer)) != NULL) {
1178 #ifdef USE_MPI
1179     debug(printf("rank %d, ",myid));
1180 #endif
1181     debug(printf("single_thread got request %d\n",Request_id(request)));
1182 
1183 #ifdef MEMUSAGE
1184     queryseq1 = Request_queryseq1(request);
1185     /* fprintf(stderr,"Single thread starting %s\n",Shortread_accession(queryseq1)); */
1186     Mem_usage_reset_stack_max();
1187     Mem_usage_reset_heap_max();
1188 #endif
1189 
1190     TRY
1191       fp = process_request(&fp_failedinput,&fp_failedinput_1,&fp_failedinput_2,&worker_runtime,
1192 			   request,
1193 			   /*oligoindices_minor,dynprogL,dynprogM,dynprogR,*/
1194 			   /*pairpool,diagpool,cellpool,*/
1195 			   intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1196 			   worker_stopwatch);
1197       if (timingp == true) {
1198         queryseq1 = Request_queryseq1(request);
1199         fprintf(stderr,"%s\t%.6f\n",Shortread_accession(queryseq1),worker_runtime);
1200       }
1201 
1202     ELSE
1203       queryseq1 = Request_queryseq1(request);
1204       if (queryseq1 == NULL) {
1205 	fprintf(stderr,"NULL");
1206       } else if (Shortread_accession(queryseq1) == NULL) {
1207 	fprintf(stderr,"unnamed (%d bp)",Shortread_fulllength(queryseq1));
1208       } else {
1209 	fprintf(stderr,"Problem sequence: ");
1210 	fprintf(stderr,"%s (%d bp)",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1211       }
1212       fprintf(stderr,"\n");
1213       if (Request_queryseq2(request) == NULL) {
1214 	Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1215       } else {
1216 	Shortread_stderr_query_pairedend_fasta(queryseq1,Request_queryseq2(request),
1217 					       invert_first_p,invert_second_p);
1218       }
1219       fprintf(stderr,"\n");
1220       fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
1221 
1222       fprintf(stderr,"Exiting...\n");
1223       exit(9);
1224     RERAISE;
1225     END_TRY;
1226 
1227     Outbuffer_print_filestrings(fp,fp_failedinput,fp_failedinput_1,fp_failedinput_2);
1228 
1229     if (jobid % POOL_FREE_INTERVAL == 0) {
1230 #if 0
1231       Pairpool_free_memory(pairpool);
1232       Diagpool_free_memory(diagpool);
1233       Cellpool_free_memory(cellpool);
1234 #endif
1235       Intlistpool_free_memory(intlistpool);
1236       Univcoordlistpool_free_memory(univcoordlistpool);
1237       Listpool_free_memory(listpool);
1238       Univdiagpool_free_memory(univdiagpool);
1239       Hitlistpool_free_memory(hitlistpool);
1240     }
1241 
1242 #ifdef MEMUSAGE
1243     /* Copy acc before we free the request */
1244     queryseq1 = Request_queryseq1(request);
1245     strncpy(acc,Shortread_accession(queryseq1),100);
1246     acc[100] = '\0';
1247 #endif
1248 
1249     Request_free(&request);
1250 
1251 #ifdef MEMUSAGE
1252     Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
1253     Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
1254     Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
1255     Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
1256     Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
1257 
1258 #if 0
1259     fprintf(stderr,"Acc %s: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
1260 	    acc,comma0,comma1,comma2,comma3,comma4,comma5);
1261 #endif
1262 
1263     if ((memusage = Mem_usage_report_std_heap()) != 0) {
1264       fprintf(stderr,"Acc %s: Memory leak in single thread of %ld bytes\n",acc,memusage);
1265       fflush(stdout);
1266       exit(9);
1267     } else if (Mem_usage_report_std_heap_max() > ONE_GB) {
1268       fprintf(stderr,"Acc %s: Excessive memory usage in single thread of %s bytes\n",acc,comma1);
1269       fflush(stdout);
1270       exit(9);
1271     }
1272 #endif
1273   }
1274 
1275 #ifdef MEMUSAGE
1276   Mem_usage_std_heap_add(memusage_constant);
1277 #endif
1278 
1279   /* Except_stack_destroy(); -- requires pthreads */
1280 
1281   if (worker_stopwatch != NULL) {
1282     Stopwatch_free(&worker_stopwatch);
1283   }
1284   Hitlistpool_free(&hitlistpool);
1285   Univdiagpool_free(&univdiagpool);
1286   Listpool_free(&listpool);
1287   Univcoordlistpool_free(&univcoordlistpool);
1288   Intlistpool_free(&intlistpool);
1289 #if 0
1290   Cellpool_free(&cellpool);
1291   Diagpool_free(&diagpool);
1292   Pairpool_free(&pairpool);
1293   Dynprog_free(&dynprogR);
1294   Dynprog_free(&dynprogM);
1295   Dynprog_free(&dynprogL);
1296   Oligoindex_array_free(&oligoindices_minor);
1297   Oligoindex_array_free(&oligoindices_major);
1298 #endif
1299 
1300 #ifdef MEMUSAGE
1301   Mem_usage_set_threadname("main");
1302 #endif
1303 
1304   return;
1305 }
1306 
1307 
1308 #ifdef HAVE_PTHREAD
1309 static void *
worker_thread(void * data)1310 worker_thread (void *data) {
1311   Request_T request;
1312   Filestring_T fp, fp_failedinput, fp_failedinput_1, fp_failedinput_2;
1313   Shortread_T queryseq1;
1314   Stopwatch_T worker_stopwatch;
1315 
1316   /* For GMAP */
1317   /* Oligoindex_array_T oligoindices_major, oligoindices_minor; */
1318   /* Dynprog_T dynprogL, dynprogM, dynprogR; */
1319 #if 0
1320   Pairpool_T pairpool;
1321   Diagpool_T diagpool;
1322   Cellpool_T cellpool;
1323 #endif
1324   Intlistpool_T intlistpool;
1325   Univcoordlistpool_T univcoordlistpool;
1326   Listpool_T listpool;
1327   Univdiagpool_T univdiagpool;
1328   Hitlistpool_T hitlistpool;
1329   int worker_jobid = 0;
1330   double worker_runtime;
1331 #if defined(DEBUG) || defined(MEMUSAGE)
1332   long int worker_id = (long int) data;
1333 #endif
1334 
1335 #ifdef MEMUSAGE
1336   long int memusage_constant = 0, memusage;
1337   char threadname[12];
1338   char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
1339   sprintf(threadname,"thread-%ld",worker_id);
1340   Mem_usage_set_threadname(threadname);
1341 #endif
1342 
1343 #ifdef USE_MPI
1344   debug(fprintf(stderr,"rank %d, ",myid));
1345 #endif
1346   debug(fprintf(stderr,"worker_thread %ld starting\n",worker_id));
1347 
1348   /* Thread-specific data and storage */
1349 #if 0
1350   oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1351   oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1352   dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1353 			 /*doublep*/true);
1354   dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1355 			 /*doublep*/false);
1356   dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1357 			 /*doublep*/true);
1358   pairpool = Pairpool_new();
1359   diagpool = Diagpool_new();
1360   cellpool = Cellpool_new();
1361 #endif
1362 
1363   intlistpool = Intlistpool_new();
1364   univcoordlistpool = Univcoordlistpool_new();
1365   listpool = Listpool_new();
1366   univdiagpool = Univdiagpool_new();
1367   hitlistpool = Hitlistpool_new();
1368   worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
1369 
1370   Except_stack_create();
1371 
1372 #ifdef MEMUSAGE
1373   memusage_constant += Mem_usage_report_std_heap();
1374   Genomicpos_commafmt_fill(comma0,memusage_constant);
1375   Mem_usage_reset_heap_baseline(0);
1376 #endif
1377 
1378   while ((request = Inbuffer_get_request(inbuffer)) != NULL) {
1379 #ifdef USE_MPI
1380     debug(fprintf(stderr,"rank %d, ",myid));
1381 #endif
1382     debug(fprintf(stderr,"worker_thread %ld got request %d (%s)\n",
1383 		  worker_id,Request_id(request),Shortread_accession(Request_queryseq1(request))));
1384     pthread_setspecific(global_request_key,(void *) request);
1385 
1386 #ifdef MEMUSAGE
1387     queryseq1 = Request_queryseq1(request);
1388     /* fprintf(stderr,"Thread %d starting %s\n",worker_id,Shortread_accession(queryseq1)); */
1389     Mem_usage_reset_stack_max();
1390     Mem_usage_reset_heap_max();
1391 #endif
1392 
1393     TRY
1394       fp = process_request(&fp_failedinput,&fp_failedinput_1,&fp_failedinput_2,&worker_runtime,
1395 			   request,
1396 			   /*oligoindices_minor,dynprogL,dynprogM,dynprogR,*/
1397 			   /*pairpool,diagpool,cellpool,*/
1398 			   intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1399 			   worker_stopwatch);
1400       if (timingp == true) {
1401         queryseq1 = Request_queryseq1(request);
1402         fprintf(stderr,"%s\t%.6f\n",Shortread_accession(queryseq1),worker_runtime);
1403       }
1404 
1405     ELSE
1406       queryseq1 = Request_queryseq1(request);
1407       if (queryseq1 == NULL) {
1408 	fprintf(stderr,"NULL");
1409       } else if (Shortread_accession(queryseq1) == NULL) {
1410 	fprintf(stderr,"unnamed (%d bp)",Shortread_fulllength(queryseq1));
1411       } else {
1412 	fprintf(stderr,"Problem sequence: ");
1413 	fprintf(stderr,"%s (%d bp)",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1414       }
1415       fprintf(stderr,"\n");
1416       if (Request_queryseq2(request) == NULL) {
1417 	Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1418       } else {
1419 	Shortread_stderr_query_pairedend_fasta(queryseq1,Request_queryseq2(request),
1420 					       invert_first_p,invert_second_p);
1421       }
1422       fprintf(stderr,"\n");
1423       fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
1424 
1425       fprintf(stderr,"Exiting...\n");
1426       exit(9);
1427     RERAISE;
1428     END_TRY;
1429 
1430     Outbuffer_put_filestrings(outbuffer,fp,fp_failedinput,fp_failedinput_1,fp_failedinput_2);
1431 
1432     if (worker_jobid % POOL_FREE_INTERVAL == 0) {
1433 #if 0
1434       Pairpool_free_memory(pairpool);
1435       Diagpool_free_memory(diagpool);
1436       Cellpool_free_memory(cellpool);
1437 #endif
1438       Intlistpool_free_memory(intlistpool);
1439       Univcoordlistpool_free_memory(univcoordlistpool);
1440       Listpool_free_memory(listpool);
1441       Univdiagpool_free_memory(univdiagpool);
1442       Hitlistpool_free_memory(hitlistpool);
1443     }
1444 
1445 #ifdef MEMUSAGE
1446     /* Copy acc before we free the request */
1447     queryseq1 = Request_queryseq1(request);
1448     strncpy(acc,Shortread_accession(queryseq1),100);
1449     acc[100] = '\0';
1450 #endif
1451 
1452     Request_free(&request);
1453 
1454 #ifdef MEMUSAGE
1455     Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
1456     Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
1457     Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
1458     Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
1459     Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
1460 
1461 #if 0
1462     fprintf(stderr,"Acc %s, thread %d: constant %s  max %s  std %s  keep %s  in %s  out %s\n",
1463 	    acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);
1464 #endif
1465 
1466     if ((memusage = Mem_usage_report_std_heap()) != 0) {
1467       fprintf(stderr,"Acc %s: Memory leak in worker thread %ld of %ld bytes\n",acc,worker_id,memusage);
1468       fflush(stdout);
1469       exit(9);
1470     } else if (Mem_usage_report_std_heap_max() > ONE_GB) {
1471       fprintf(stderr,"Acc %s: Excessive memory usage in worker thread %ld of %s bytes\n",acc,worker_id,comma1);
1472       fflush(stdout);
1473       exit(9);
1474     }
1475 #endif
1476   }
1477 
1478 #ifdef MEMUSAGE
1479   Mem_usage_std_heap_add(memusage_constant);
1480 #endif
1481 
1482   Except_stack_destroy();
1483 
1484   if (worker_stopwatch != NULL) {
1485     Stopwatch_free(&worker_stopwatch);
1486   }
1487   Hitlistpool_free(&hitlistpool);
1488   Univdiagpool_free(&univdiagpool);
1489   Listpool_free(&listpool);
1490   Univcoordlistpool_free(&univcoordlistpool);
1491   Intlistpool_free(&intlistpool);
1492 
1493 #if 0
1494   Cellpool_free(&cellpool);
1495   Diagpool_free(&diagpool);
1496   Pairpool_free(&pairpool);
1497   Dynprog_free(&dynprogR);
1498   Dynprog_free(&dynprogM);
1499   Dynprog_free(&dynprogL);
1500   Oligoindex_array_free(&oligoindices_minor);
1501   Oligoindex_array_free(&oligoindices_major);
1502 #endif
1503 
1504 #ifdef MEMUSAGE
1505   Mem_usage_set_threadname("main");
1506 #endif
1507 
1508 #ifdef USE_MPI
1509   debug(fprintf(stderr,"rank %d, ",myid));
1510 #endif
1511   debug(fprintf(stderr,"worker_thread %ld finished\n",worker_id));
1512 
1513   return (void *) NULL;
1514 }
1515 #endif
1516 
1517 
1518 static void
parse_part(int * part_modulus,int * part_interval,char * string)1519 parse_part (int *part_modulus, int *part_interval, char *string) {
1520   char *p = string;
1521 
1522   if (sscanf(p,"%d",&(*part_modulus)) < 1) {
1523     fprintf(stderr,"Cannot parse first integer from %s\n",string);
1524     exit(9);
1525   }
1526 
1527   while (*p != '\0' && isdigit(*p)) {
1528     p++;
1529   }
1530   while (*p != '\0' && !isdigit(*p)) {
1531     p++;
1532   }
1533   if (sscanf(p,"%d",&(*part_interval)) < 1) {
1534     fprintf(stderr,"Cannot parse first integer from %s\n",string);
1535     exit(9);
1536   }
1537   if ((*part_modulus) >= (*part_interval)) {
1538     fprintf(stderr,"In %s, batch number %d must be less than the number of batches %d\n",
1539 	    string,*part_modulus,*part_interval);
1540     exit(9);
1541   }
1542   if (*part_interval == 0) {
1543     fprintf(stderr,"Bad batch specification %s.  Batch interval cannot be 0.\n",string);
1544     exit(9);
1545   }
1546 
1547   return;
1548 }
1549 
1550 
1551 #if 0
1552 static int
1553 add_gmap_mode (char *string) {
1554   if (!strcmp(string,"none")) {
1555     gmap_mode = 0;
1556     return 0;
1557   } else if (!strcmp(string,"all")) {
1558     gmap_mode = (GMAP_IMPROVEMENT | GMAP_ENDS | GMAP_PAIRSEARCH);
1559     return 1;
1560   } else {
1561     if (!strcmp(string,"improve")) {
1562       gmap_mode |= GMAP_IMPROVEMENT;
1563     } else if (!strcmp(string,"ends")) {
1564       gmap_mode |= GMAP_ENDS;
1565     } else if (!strcmp(string,"pairsearch")) {
1566       gmap_mode |= GMAP_PAIRSEARCH;
1567     } else {
1568       fprintf(stderr,"Don't recognize gmap-mode type %s\n",string);
1569       fprintf(stderr,"Allowed values are: none, all, improve, ends, pairsearch\n");
1570       exit(9);
1571     }
1572     return 1;
1573   }
1574 }
1575 #endif
1576 
1577 
1578 static char *
check_valid_int(char * string)1579 check_valid_int (char *string) {
1580   char *p = string;
1581 
1582   if (*p == '+' || *p == '-') {
1583     p++;
1584   }
1585 
1586   if (!isdigit(*p)) {
1587     fprintf(stderr,"value %s is not a valid int\n",string);
1588     exit(9);
1589     return NULL;
1590   }
1591   while (*p != '\0' && isdigit(*p)) {
1592     p++;
1593   }
1594 
1595   if (*p == 'e') {
1596     p++;
1597     if (*p == '+') {
1598       p++;
1599     }
1600     if (!isdigit(*p)) {
1601       return false;
1602     }
1603     while (*p != '\0' && isdigit(*p)) {
1604       p++;
1605     }
1606   }
1607 
1608   if (*p == '\0') {
1609     return string;
1610   } else {
1611     fprintf(stderr,"value %s is not a valid int\n",string);
1612     exit(9);
1613     return NULL;
1614   }
1615 }
1616 
1617 
1618 static double
check_valid_float(char * string,const char * option)1619 check_valid_float (char *string, const char *option) {
1620   double value;
1621   char *p = string;
1622 
1623   if (*p == '+' || *p == '-') {
1624     p++;
1625   }
1626 
1627   while (*p != '\0' && isdigit(*p)) {
1628     p++;
1629   }
1630   if (*p == '\0') {
1631     if ((value = atof(string)) > 1.0 || value < 0.0) {
1632       fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
1633       exit(9);
1634     } else {
1635       return value;
1636     }
1637   }
1638 
1639   if (*p == '.') {
1640     p++;
1641   }
1642 
1643   if (!isdigit(*p)) {
1644     fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1645     exit(9);
1646     return 0.0;
1647   }
1648   while (*p != '\0' && isdigit(*p)) {
1649     p++;
1650   }
1651 
1652   if (*p == 'e') {
1653     p++;
1654     if (*p == '+' || *p == '-') {
1655       p++;
1656     }
1657     if (!isdigit(*p)) {
1658       fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1659       exit(9);
1660       return 0.0;
1661     }
1662     while (*p != '\0' && isdigit(*p)) {
1663       p++;
1664     }
1665   }
1666 
1667   if (*p == '\0') {
1668     if ((value = atof(string)) > 1.0 || value < 0.0) {
1669       fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
1670       exit(9);
1671     } else {
1672       return value;
1673     }
1674   } else {
1675     fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1676     exit(9);
1677     return 0.0;
1678   }
1679 }
1680 
1681 static char *
check_valid_float_or_int(char * string)1682 check_valid_float_or_int (char *string) {
1683   char *p = string;
1684 
1685   if (*p == '+' || *p == '-') {
1686     p++;
1687   }
1688 
1689   while (*p != '\0' && isdigit(*p)) {
1690     p++;
1691   }
1692   if (*p == '\0') {
1693     return string;
1694   }
1695 
1696   if (*p == '.') {
1697     p++;
1698   }
1699 
1700   if (!isdigit(*p)) {
1701     fprintf(stderr,"value %s is not a valid float\n",string);
1702     exit(9);
1703     return NULL;
1704   }
1705   while (*p != '\0' && isdigit(*p)) {
1706     p++;
1707   }
1708 
1709   if (*p == 'e') {
1710     p++;
1711     if (*p == '+' || *p == '-') {
1712       p++;
1713     }
1714     if (!isdigit(*p)) {
1715       fprintf(stderr,"value %s is not a valid float\n",string);
1716       exit(9);
1717       return NULL;
1718     }
1719     while (*p != '\0' && isdigit(*p)) {
1720       p++;
1721     }
1722   }
1723 
1724   if (*p == '\0') {
1725     return string;
1726   } else {
1727     fprintf(stderr,"value %s is not a valid float\n",string);
1728     exit(9);
1729     return NULL;
1730   }
1731 }
1732 
1733 
1734 static int
parse_command_line(int argc,char * argv[],int optind)1735 parse_command_line (int argc, char *argv[], int optind) {
1736   int opt, c;
1737   extern char *optarg;
1738   int long_option_index = 0;
1739   const char *long_name;
1740   char **argstart;
1741   /* char *string; */
1742 
1743   fprintf(stderr,"GSNAP version %s called with args:",PACKAGE_VERSION);
1744   argstart = &(argv[-optind]);
1745   for (c = 1; c < argc + optind; c++) {
1746       fprintf(stderr," %s",argstart[c]);
1747   }
1748   fprintf(stderr,"\n");
1749 
1750   while ((opt = getopt_long(argc,argv,
1751 			    "C:c:D:d:k:Gq:o:a:N:M:m:i:y:Y:z:Z:w:e:E:J:K:l:g:s:V:v:B:t:A:j:0n:QO",
1752 			    long_options, &long_option_index)) != -1) {
1753     switch (opt) {
1754     case 0:
1755       long_name = long_options[long_option_index].name;
1756       if (!strcmp(long_name,"version")) {
1757 	print_program_version();
1758 	return 1;
1759       } else if (!strcmp(long_name,"check")) {
1760 	check_compiler_assumptions();
1761 	return 1;
1762       } else if (!strcmp(long_name,"help")) {
1763 	print_program_usage();
1764 	return 1;
1765 
1766       } else if (!strcmp(long_name,"use-sarray")) {
1767 	fprintf(stderr,"Ignoring the --use-sarray flag, since this version of GSNAP does not use suffix arrays\n");
1768 
1769       } else if (!strcmp(long_name,"terminal-threshold")) {
1770 	fprintf(stderr,"Ignoring the --terminal-threshold flag, which is obsolete\n");
1771 
1772       } else if (!strcmp(long_name,"use-localdb")) {
1773 	if (!strcmp(optarg,"1")) {
1774 	  user_localdb_p = true;
1775 	  use_localdb_p = true;
1776 	} else if (!strcmp(optarg,"0")) {
1777 	  user_localdb_p = true;
1778 	  use_localdb_p = false;
1779 	} else {
1780 	  fprintf(stderr,"--use-localdb flag must be 0 or 1\n");
1781 	  return 9;
1782 	}
1783 
1784       } else if (!strcmp(long_name,"use-transcriptome-only")) {
1785 	use_only_transcriptome_p = true;
1786 
1787       } else if (!strcmp(long_name,"use-shared-memory")) {
1788 	if (!strcmp(optarg,"1")) {
1789 	  sharedp = true;
1790 	} else if (!strcmp(optarg,"0")) {
1791 	  sharedp = false;
1792 	} else {
1793 	  fprintf(stderr,"--use-shared-memory flag must be 0 or 1\n");
1794 	  return 9;
1795 	}
1796 
1797       } else if (!strcmp(long_name,"preload-shared-memory")) {
1798 	preload_shared_memory_p = true;
1799 
1800       } else if (!strcmp(long_name,"unload-shared-memory")) {
1801 	unload_shared_memory_p = true;
1802 
1803       } else if (!strcmp(long_name,"expand-offsets")) {
1804 	fprintf(stderr,"Note: --expand-offsets flag is no longer supported.  With the latest algorithms, it doesn't improve speed much.  Ignoring this flag");
1805 
1806       } else if (!strcmp(long_name,"sampling")) {
1807 	required_index1interval = atoi(check_valid_int(optarg));
1808 
1809       } else if (!strcmp(long_name,"time")) {
1810 	timingp = true;
1811 
1812       } else if (!strcmp(long_name,"unload")) {
1813 	unloadp = true;
1814 
1815 #if 0
1816       } else if (!strcmp(long_name,"maxsearch")) {
1817 	maxpaths_search = atoi(optarg);
1818 #endif
1819 
1820       } else if (!strcmp(long_name,"mode")) {
1821 	if (!strcmp(optarg,"standard")) {
1822 	  mode = STANDARD;
1823 	} else if (!strcmp(optarg,"cmet-stranded")) {
1824 	  mode = CMET_STRANDED;
1825 	} else if (!strcmp(optarg,"cmet-nonstranded")) {
1826 	  mode = CMET_NONSTRANDED;
1827 	} else if (!strcmp(optarg,"atoi-stranded")) {
1828 	  mode = ATOI_STRANDED;
1829 	} else if (!strcmp(optarg,"atoi-nonstranded")) {
1830 	  mode = ATOI_NONSTRANDED;
1831 	} else if (!strcmp(optarg,"ttoc-stranded")) {
1832 	  mode = TTOC_STRANDED;
1833 	} else if (!strcmp(optarg,"ttoc-nonstranded")) {
1834 	  mode = TTOC_NONSTRANDED;
1835 	} else {
1836 	  fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
1837 	  return 9;
1838 	}
1839 
1840       } else if (!strcmp(long_name,"cmetdir")) {
1841 	user_modedir = optarg;
1842 
1843       } else if (!strcmp(long_name,"atoidir")) {
1844 	user_modedir = optarg;
1845 
1846       } else if (!strcmp(long_name,"novelend-splicedist")) {
1847 	shortsplicedist_novelend = (Chrpos_T) strtoul(optarg,NULL,10);
1848 
1849       } else if (!strcmp(long_name,"splicingdir")) {
1850 	user_splicingdir = optarg;
1851 
1852       } else if (!strcmp(long_name,"ambig-splice-noclip")) {
1853 	amb_clip_p = false;
1854 
1855       } else if (!strcmp(long_name,"find-dna-chimeras")) {
1856 	if (!strcmp(optarg,"1")) {
1857 	  find_dna_chimeras_p = true;
1858 	} else if (!strcmp(optarg,"0")) {
1859 	  find_dna_chimeras_p = false;
1860 	} else {
1861 	  fprintf(stderr,"--find-dna-chimeras flag must be 0 or 1\n");
1862 	  exit(9);
1863 	}
1864 
1865       } else if (!strcmp(long_name,"tallydir")) {
1866 	user_tallydir = optarg;
1867 
1868       } else if (!strcmp(long_name,"use-tally")) {
1869 	tally_root = optarg;
1870 
1871       } else if (!strcmp(long_name,"runlengthdir")) {
1872 	user_runlengthdir = optarg;
1873 
1874       } else if (!strcmp(long_name,"use-runlength")) {
1875 	runlength_root = optarg;
1876 
1877       } else if (!strcmp(long_name,"max-anchors")) {
1878 	max_anchors = atoi(check_valid_int(optarg));
1879 
1880       } else if (!strcmp(long_name,"gmap-mode")) {
1881 	fprintf(stderr,"Note: --gmap-mode is no longer used in GSNAP starting with 2019 versions.  Ignoring.\n");
1882 #if 0
1883 	gmap_mode = 0;		/* Initialize */
1884 	string = strtok(optarg,",");
1885 	if (add_gmap_mode(string) != 0) {
1886 	  while ((string = strtok(NULL,",")) != NULL && add_gmap_mode(string) != 0) {
1887 	  }
1888 	}
1889 #endif
1890 
1891       } else if (!strcmp(long_name,"trigger-score-for-gmap")) {
1892 	fprintf(stderr,"Note: --trigger-score-for-gmap is now ignored in GSNAP\n");
1893 	/* trigger_score_for_gmap = atoi(check_valid_int(optarg)); */
1894 
1895       } else if (!strcmp(long_name,"gmap-min-match-length")) {
1896 	fprintf(stderr,"Note: --gmap-min-match-length is now ignored in GSNAP\n");
1897 	/* gmap_min_nconsecutive = atoi(check_valid_int(optarg)); */
1898 
1899       } else if (!strcmp(long_name,"gmap-allowance")) {
1900 	fprintf(stderr,"Note: --gmap-allowance is now ignored in GSNAP\n");
1901 	/* gmap_allowance = atoi(check_valid_int(optarg)); */
1902 
1903       } else if (!strcmp(long_name,"max-gmap-pairsearch")) {
1904 	fprintf(stderr,"Note: --max-gmap-pairsearch is now ignored in GSNAP\n");
1905 	/* max_gmap_pairsearch = atoi(check_valid_int(optarg)); */
1906 
1907       } else if (!strcmp(long_name,"max-gmap-terminal")) {
1908 	fprintf(stderr,"Note: --max-gmap-terminal is now ignored in GSNAP\n");
1909 	/* max_gmap_terminal = atoi(check_valid_int(optarg)); */
1910 
1911       } else if (!strcmp(long_name,"max-gmap-improvement")) {
1912 	fprintf(stderr,"Note: --max-gmap-improvement is now ignored in GSNAP\n");
1913 	/* max_gmap_improvement = atoi(check_valid_int(optarg)); */
1914 
1915       } else if (!strcmp(long_name,"input-buffer-size")) {
1916 	inbuffer_nspaces = atoi(check_valid_int(optarg));
1917 
1918       } else if (!strcmp(long_name,"output-buffer-size")) {
1919 	output_buffer_size = atoi(check_valid_int(optarg));
1920 
1921       } else if (!strcmp(long_name,"barcode-length")) {
1922 	barcode_length = atoi(check_valid_int(optarg));
1923 
1924       } else if (!strcmp(long_name,"endtrim-length")) {
1925 	endtrim_length = atoi(check_valid_int(optarg));
1926 
1927       } else if (!strcmp(long_name,"fastq-id-start")) {
1928 	acc_fieldi_start = atoi(check_valid_int(optarg)) - 1;
1929 	if (acc_fieldi_start < 0) {
1930 	  fprintf(stderr,"Value for fastq-id-start must be 1 or greater\n");
1931 	  return 9;
1932 	}
1933 
1934       } else if (!strcmp(long_name,"fastq-id-end")) {
1935 	acc_fieldi_end = atoi(check_valid_int(optarg)) - 1;
1936 	if (acc_fieldi_end < 0) {
1937 	  fprintf(stderr,"Value for fastq-id-end must be 1 or greater\n");
1938 	  return 9;
1939 	}
1940 
1941       } else if (!strcmp(long_name,"force-single-end")) {
1942 	force_single_end_p = true;
1943 
1944       } else if (!strcmp(long_name,"filter-chastity")) {
1945 	if (!strcmp(optarg,"off")) {
1946 	  filter_chastity_p = false;
1947 	  filter_if_both_p = false;
1948 	} else if (!strcmp(optarg,"either")) {
1949 	  filter_chastity_p = true;
1950 	  filter_if_both_p = false;
1951 	} else if (!strcmp(optarg,"both")) {
1952 	  filter_chastity_p = true;
1953 	  filter_if_both_p = true;
1954 	} else {
1955 	  fprintf(stderr,"--filter-chastity values allowed: off, either, both\n");
1956 	  return 9;
1957 	}
1958 
1959       } else if (!strcmp(long_name,"allow-pe-name-mismatch")) {
1960 	allow_paired_end_mismatch_p = true;
1961 
1962       } else if (!strcmp(long_name,"read-files-command")) {
1963 	read_files_command = optarg;
1964 
1965 #ifdef HAVE_ZLIB
1966       } else if (!strcmp(long_name,"gunzip")) {
1967 	gunzip_p = true;
1968 #endif
1969 
1970 #ifdef HAVE_BZLIB
1971       } else if (!strcmp(long_name,"bunzip2")) {
1972 	bunzip2_p = true;
1973 #endif
1974 
1975       } else if (!strcmp(long_name,"interleaved")) {
1976 	interleavedp = true;
1977 
1978       } else if (!strcmp(long_name,"orientation")) {
1979 	if (!strcmp(optarg,"FR")) {
1980 	  invert_first_p = false;
1981 	  invert_second_p = true;
1982 	} else if (!strcmp(optarg,"RF")) {
1983 	  invert_first_p = true;
1984 	  invert_second_p = false;
1985 	} else if (!strcmp(optarg,"FF")) {
1986 	  invert_first_p = invert_second_p = false;
1987 	} else {
1988 	  fprintf(stderr,"Currently allowed values for orientation: FR (fwd-rev), RF (rev-fwd) or FF (fwd-fwd)\n");
1989 	  return 9;
1990 	}
1991 
1992       } else if (!strcmp(long_name,"split-output")) {
1993 	split_output_root = optarg;
1994 
1995       } else if (!strcmp(long_name,"failed-input")) {
1996 	failedinput_root = optarg;
1997 
1998       } else if (!strcmp(long_name,"append-output")) {
1999 	appendp = true;
2000 
2001       } else if (!strcmp(long_name,"order-among-best")) {
2002 	if (!strcmp(optarg,"genomic")) {
2003 	  want_random_p = false;
2004 	} else if (!strcmp(optarg,"random")) {
2005 	  want_random_p = true;
2006 	} else {
2007 	  fprintf(stderr,"--order-among-best values allowed: genomic, random (default)\n");
2008 	  return 9;
2009 	}
2010 
2011       } else if (!strcmp(long_name,"pairmax-dna")) {
2012 	pairmax_dna = atoi(check_valid_int(optarg));
2013 
2014       } else if (!strcmp(long_name,"pairmax-rna")) {
2015 	pairmax_rna = atoi(check_valid_int(optarg));
2016 
2017       } else if (!strcmp(long_name,"pairexpect")) {
2018 	expected_pairlength = atoi(check_valid_int(optarg));
2019 
2020       } else if (!strcmp(long_name,"pairdev")) {
2021 	pairlength_deviation = atoi(check_valid_int(optarg));
2022 
2023       } else if (!strcmp(long_name,"max-ref-mismatches")) {
2024 	user_mismatches_ref_float = atof(check_valid_float_or_int(optarg));
2025 	if (user_mismatches_ref_float > 1.0 && user_mismatches_ref_float != rint(user_mismatches_ref_float)) {
2026 	  fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mismatches_ref_float);
2027 	  return 9;
2028 	}
2029 
2030       } else if (!strcmp(long_name,"min-coverage")) {
2031 	user_mincoverage_float = atof(check_valid_float_or_int(optarg));
2032 	if (user_mincoverage_float > 1.0 && user_mincoverage_float != rint(user_mincoverage_float)) {
2033 	  fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mincoverage_float);
2034 	  return 9;
2035 	}
2036 
2037       } else if (!strcmp(long_name,"indel-endlength")) {
2038 	min_indel_end_matches = atoi(check_valid_int(optarg));
2039 	if (min_indel_end_matches > 14) {
2040 	  allow_end_indels_p = false;
2041 	}
2042 
2043       } else if (!strcmp(long_name,"antistranded-penalty")) {
2044 	antistranded_penalty = atoi(check_valid_int(optarg));
2045 
2046       } else if (!strcmp(long_name,"favor-multiexon")) {
2047 	favor_multiexon_p = true;
2048 
2049       } else if (!strcmp(long_name,"end-detail")) {
2050 	fprintf(stderr,"--end-detail is now deprecated.  GSNAP is now tuned to trim well at the ends\n");
2051 
2052       } else if (!strcmp(long_name,"merge-distant-samechr")) {
2053 	merge_samechr_p = true;
2054 
2055       } else if (!strcmp(long_name,"query-unk-mismatch")) {
2056 	if (!strcmp(optarg,"1")) {
2057 	  query_unk_mismatch_p = true;
2058 	} else if (!strcmp(optarg,"0")) {
2059 	  query_unk_mismatch_p = false;
2060 	} else {
2061 	  fprintf(stderr,"--query-unk-mismatch flag must be 0 or 1\n");
2062 	  return 9;
2063 	}
2064 
2065       } else if (!strcmp(long_name,"genome-unk-mismatch")) {
2066 	if (!strcmp(optarg,"1")) {
2067 	  genome_unk_mismatch_p = true;
2068 	} else if (!strcmp(optarg,"0")) {
2069 	  genome_unk_mismatch_p = false;
2070 	} else {
2071 	  fprintf(stderr,"--genome-unk-mismatch flag must be 0 or 1\n");
2072 	  return 9;
2073 	}
2074 
2075       } else if (!strcmp(long_name,"trim-mismatch-score")) {
2076 	fprintf(stderr,"--trim-mismatch-score is now deprecated.  GSNAP is now tuned to trim well at the ends\n");
2077 	/* trim_mismatch_score = atoi(check_valid_int(optarg)); */
2078 	/* user_trim_mismatch_score_p = true; */
2079 
2080       } else if (!strcmp(long_name,"trim-indel-score")) {
2081 	trim_indel_score = atoi(check_valid_int(optarg));
2082 	user_trim_indel_score_p = true;
2083 
2084       } else if (!strcmp(long_name,"local-splice-penalty")) {
2085 	localsplicing_penalty = atoi(check_valid_int(optarg));
2086 
2087       } else if (!strcmp(long_name,"distant-splice-penalty")) {
2088 	distantsplicing_penalty = atoi(check_valid_int(optarg));
2089 
2090       } else if (!strcmp(long_name,"distant-splice-identity")) {
2091 	min_distantsplicing_identity = check_valid_float(optarg,long_name);
2092 
2093       } else if (!strcmp(long_name,"force-xs-dir")) {
2094 	force_xs_direction_p = true;
2095 
2096       } else if (!strcmp(long_name,"show-refdiff")) {
2097 	show_refdiff_p = true;
2098 
2099       } else if (!strcmp(long_name,"clip-overlap")) {
2100 	clip_overlap_p = true;
2101 
2102       } else if (!strcmp(long_name,"merge-overlap")) {
2103 	merge_overlap_p = true;
2104 
2105       } else if (!strcmp(long_name,"show-method")) {
2106 	method_print_p = true;
2107 
2108       } else if (!strcmp(long_name,"no-sam-headers")) {
2109 	sam_headers_p = false;
2110 
2111       } else if (!strcmp(long_name,"add-paired-nomappers")) {
2112 	add_paired_nomappers_p = true;
2113 
2114       } else if (!strcmp(long_name,"paired-flag-means-concordant")) {
2115 	if (!strcmp(optarg,"1")) {
2116 	  paired_flag_means_concordant_p = true;
2117 	} else if (!strcmp(optarg,"0")) {
2118 	  paired_flag_means_concordant_p = false; /* Default */
2119 	} else {
2120 	  fprintf(stderr,"--paired-flag-means-concordant flag must be 0 or 1\n");
2121 	  return 9;
2122 	}
2123 
2124       } else if (!strcmp(long_name,"sam-headers-batch")) {
2125 	sam_headers_batch = atoi(check_valid_int(optarg));
2126 
2127       } else if (!strcmp(long_name,"sam-hardclip-use-S")) {
2128 	sam_hardclip_use_S_p = true;
2129 
2130       } else if (!strcmp(long_name,"sam-use-0M")) {
2131 	sam_insert_0M_p = true;
2132 
2133       } else if (!strcmp(long_name,"sam-extended-cigar")) {
2134 	sam_cigar_extended_p = true;
2135 
2136       } else if (!strcmp(long_name,"sam-multiple-primaries")) {
2137 	sam_multiple_primaries_p = true;
2138 
2139       } else if (!strcmp(long_name,"sam-sparse-secondaries")) {
2140 	sam_sparse_secondaries_p = true;
2141 
2142       } else if (!strcmp(long_name,"quality-protocol")) {
2143 	if (user_quality_score_adj == true) {
2144 	  fprintf(stderr,"Cannot specify both -J (--quality-zero-score) and --quality-protocol\n");
2145 	  return 9;
2146 	} else if (user_quality_shift == true) {
2147 	  fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
2148 	  return 9;
2149 	} else if (!strcmp(optarg,"illumina")) {
2150 	  MAPQ_init(/*quality_score_adj*/64);
2151 	  user_quality_score_adj = true;
2152 	  quality_shift = -31;
2153 	  user_quality_shift = true;
2154 	} else if (!strcmp(optarg,"sanger")) {
2155 	  MAPQ_init(/*quality_score_adj*/33);
2156 	  user_quality_score_adj = true;
2157 	  quality_shift = 0;
2158 	  user_quality_shift = true;
2159 	} else {
2160 	  fprintf(stderr,"The only values allowed for --quality-protocol are illumina or sanger\n");
2161 	  return 9;
2162 	}
2163 
2164       } else if (!strcmp(long_name,"md-lowercase-snp")) {
2165 	md_lowercase_variant_p = true;
2166 
2167       } else if (!strcmp(long_name,"extend-soft-clips")) {
2168 	hide_soft_clips_p = true;
2169 
2170       } else if (!strcmp(long_name,"action-if-cigar-error")) {
2171 	if (!strcmp(optarg,"ignore")) {
2172 	  cigar_action = CIGAR_ACTION_IGNORE;
2173 	} else if (!strcmp(optarg,"warning")) {
2174 	  cigar_action = CIGAR_ACTION_WARNING;
2175 	} else if (!strcmp(optarg,"noprint")) {
2176 	  cigar_action = CIGAR_ACTION_NOPRINT;
2177 	} else if (!strcmp(optarg,"abort")) {
2178 	  cigar_action = CIGAR_ACTION_ABORT;
2179 	} else {
2180 	  fprintf(stderr,"The only values allowed for --action-if-cigar-error are ignore, warning, noprint, abort\n");
2181 	  return 9;
2182 	}
2183 
2184       } else if (!strcmp(long_name,"read-group-id")) {
2185 	sam_read_group_id = optarg;
2186 
2187       } else if (!strcmp(long_name,"read-group-name")) {
2188 	sam_read_group_name = optarg;
2189 
2190       } else if (!strcmp(long_name,"read-group-library")) {
2191 	sam_read_group_library = optarg;
2192 
2193       } else if (!strcmp(long_name,"read-group-platform")) {
2194 	sam_read_group_platform = optarg;
2195 
2196 #ifdef USE_MPI
2197       } else if (!strcmp(long_name,"master-is-worker")) {
2198 	if (!strcmp(optarg,"1")) {
2199 	  master_is_worker_p = true;
2200 	} else if (!strcmp(optarg,"0")) {
2201 	  master_is_worker_p = false; /* Default */
2202 	} else {
2203 	  fprintf(stderr,"--master-is-worker flag must be 0 or 1\n");
2204 	  return 9;
2205 	}
2206 #endif
2207 
2208       } else if (!strcmp(long_name,"print-snps")) {
2209 	print_snplabels_p = true;
2210 
2211       } else if (!strcmp(long_name,"failsonly")) {
2212 	if (nofailsp == true) {
2213 	  fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
2214 	  return 9;
2215 	} else {
2216 	  failsonlyp = true;
2217 	}
2218       } else if (!strcmp(long_name,"nofails")) {
2219 	if (failsonlyp == true) {
2220 	  fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
2221 	  return 9;
2222 	} else {
2223 	  nofailsp = true;
2224 	}
2225 
2226       } else if (!strcmp(long_name,"ignore-trim-in-filtering")) {
2227 	if (!strcmp(optarg,"0")) {
2228 	  user_filter_within_trims_p = false;
2229 	  specified_filter_within_trims_p = true;
2230 	} else if (!strcmp(optarg,"1")) {
2231 	  user_filter_within_trims_p = true;
2232 	  specified_filter_within_trims_p = true;
2233 	} else {
2234 	  fprintf(stderr,"The only values allowed for --ignore-trim-in-filtering are 0, 1\n");
2235 	  return 9;
2236 	}
2237 
2238       } else if (!strcmp(long_name,"omit-concordant-uniq")) {
2239 	omit_concordant_uniq_p = true;
2240       } else if (!strcmp(long_name,"omit-concordant-mult")) {
2241 	omit_concordant_mult_p = true;
2242 
2243       } else {
2244 	/* Shouldn't reach here */
2245 	fprintf(stderr,"Don't recognize option %s.  For usage, run 'gsnap --help'",long_name);
2246 	return 9;
2247       }
2248       break;
2249 
2250     case 'C': user_transcriptomedir = optarg; break;
2251     case 'c': transcriptome_dbroot = optarg; break;
2252 
2253     case 'D': user_genomedir = optarg; break;
2254     case 'd': genome_dbroot = optarg; break;
2255 
2256     case 'k':
2257       required_index1part = atoi(check_valid_int(optarg));
2258       if (required_index1part > MAXIMUM_KMER) {
2259 	fprintf(stderr,"The value for k-mer size must be %d or less\n",MAXIMUM_KMER);
2260 	return 9;
2261       }
2262       break;
2263 
2264     case 'G': uncompressedp = true; break;
2265 
2266     case 'q': parse_part(&part_modulus,&part_interval,optarg); break;
2267     case 'o': output_file = optarg; break;
2268 
2269     case 'a':
2270       if (!strcmp(optarg,"paired")) {
2271 	chop_primers_p = true;
2272       } else if (!strcmp(optarg,"off")) {
2273 	chop_primers_p = false;
2274       } else {
2275 	fprintf(stderr,"Currently allowed values for adapter stripping (-a): off, paired\n");
2276 	return 9;
2277       }
2278       break;
2279 
2280     case 'N':
2281       if (!strcmp(optarg,"1")) {
2282 	novelsplicingp = true;
2283       } else if (!strcmp(optarg,"0")) {
2284 	novelsplicingp = false;
2285       } else {
2286 	fprintf(stderr,"Novel splicing (-N flag) must be 0 or 1\n");
2287 	return 9;
2288       }
2289       break;
2290 
2291 #if 0
2292     case 'R':
2293       if (!strcmp(optarg,"0")) {
2294 	masktype = MASK_NONE;
2295       } else if (!strcmp(optarg,"1")) {
2296 	masktype = MASK_FREQUENT;
2297       } else if (!strcmp(optarg,"2")) {
2298 	masktype = MASK_REPETITIVE;
2299       } else if (!strcmp(optarg,"3")) {
2300 	masktype = MASK_GREEDY_FREQUENT;
2301       } else if (!strcmp(optarg,"4")) {
2302 	masktype = MASK_GREEDY_REPETITIVE;
2303       } else {
2304 	fprintf(stderr,"Masking mode %s not recognized.\n",optarg);
2305 	fprintf(stderr,"Mode 0 means no masking, mode 1 masks frequent oligomers;\n");
2306 	fprintf(stderr,"  mode 2 masks frequent and repetitive oligomers;\n");
2307 	fprintf(stderr,"  mode 3 does greedy masking of frequent oligomers,\n");
2308 	fprintf(stderr,"    then no masking if necessary;\n");
2309 	fprintf(stderr,"  mode 4 does greedy masking of frequent and repetitive oligomers,\n");
2310 	fprintf(stderr,"    then no masking if necessary.\n");
2311 	return 9;
2312       }
2313       break;
2314 #endif
2315 
2316     case 'M': subopt_levels = atoi(check_valid_int(optarg)); break;
2317     case 'm':
2318       user_mismatches_refalt_float = atof(check_valid_float_or_int(optarg));
2319       if (user_mismatches_refalt_float > 1.0 && user_mismatches_refalt_float != rint(user_mismatches_refalt_float)) {
2320 	fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mismatches_refalt_float);
2321 	return 9;
2322       }
2323       break;
2324 
2325     case 'i': indel_penalty_middle = indel_penalty_end = atoi(check_valid_int(optarg)); break;
2326 
2327     case 'y':
2328       max_middle_insertions_float = atof(check_valid_float_or_int(optarg));
2329       if (max_middle_insertions_float > 1.0 && max_middle_insertions_float != rint(max_middle_insertions_float)) {
2330 	fprintf(stderr,"Cannot specify fractional value %f for --max-middle-insertions except between 0.0 and 1.0\n",max_middle_insertions_float);
2331 	return 9;
2332       }
2333       break;
2334 
2335     case 'z':
2336       max_middle_deletions_float = atof(check_valid_float_or_int(optarg));
2337       if (max_middle_deletions_float > 1.0 && max_middle_deletions_float != rint(max_middle_deletions_float)) {
2338 	fprintf(stderr,"Cannot specify fractional value %f for --max-middle-deletions except between 0.0 and 1.0\n",max_middle_deletions_float);
2339 	return 9;
2340       }
2341       break;
2342 
2343     case 'Y': max_end_insertions = atoi(check_valid_int(optarg)); break;
2344     case 'Z': max_end_deletions = atoi(check_valid_int(optarg)); break;
2345 
2346     case 'w': shortsplicedist = strtoul(optarg,NULL,10); break;
2347 
2348     case 'K': min_distantsplicing_end_matches = atoi(check_valid_int(optarg)); break;
2349     case 'l': min_shortend = atoi(check_valid_int(optarg)); break;
2350 
2351     case 'g': genes_file = optarg; break;
2352 
2353     case 's':
2354       splicing_file = optarg;
2355       knownsplicingp = true;
2356       break;
2357 
2358     case 'e':
2359       masked_root = optarg;
2360       genome_unk_mismatch_p = false;
2361       maskedp = true;
2362       print_nsnpdiffs_p = true;
2363       break;
2364 
2365     case 'E':
2366       fprintf(stderr,"In the future, please use --distant-splice-penalty instead of -E, which may be reassigned\n");
2367       distantsplicing_penalty = atoi(check_valid_int(optarg));
2368       break;
2369 
2370     case 'V': user_snpsdir = optarg; break;
2371     case 'v': snps_root = optarg; break;
2372 
2373     case 'B':
2374       if (!strcmp(optarg,"5")) {
2375 #if 0
2376 	/* Not true.  -B 5 allocates suffix array and suffix aux files */
2377 	fprintf(stderr,"Note: Batch mode 5 is now the same as batch mode 4.\n");
2378 	fprintf(stderr,"Expansion of offsets is now controlled separately by --expand-offsets (default=0).\n");
2379 #endif
2380 	offsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
2381 	positions_access = USE_ALLOCATE;
2382 	locoffsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
2383 	locpositions_access = USE_ALLOCATE;
2384 
2385 	genome_access = USE_ALLOCATE;
2386 
2387 #ifdef HAVE_MMAP
2388       } else if (!strcmp(optarg,"4")) {
2389 	offsetsstrm_access = USE_ALLOCATE;
2390 	positions_access = USE_ALLOCATE;
2391 	locoffsetsstrm_access = USE_ALLOCATE;
2392 	locpositions_access = USE_ALLOCATE;
2393 
2394 	genome_access = USE_ALLOCATE;
2395 
2396       } else if (!strcmp(optarg,"3")) {
2397 	offsetsstrm_access = USE_ALLOCATE;
2398 	positions_access = USE_ALLOCATE;
2399 	locoffsetsstrm_access = USE_ALLOCATE;
2400 	locpositions_access = USE_ALLOCATE;
2401 
2402 	genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
2403 
2404       } else if (!strcmp(optarg,"2")) {
2405 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2406 	positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2407 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2408 	locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2409 
2410 	genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
2411 
2412       } else if (!strcmp(optarg,"1")) {
2413 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2414 	positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2415 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2416 	locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2417 
2418 	genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
2419 
2420       } else if (!strcmp(optarg,"0")) {
2421 	offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2422 	positions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
2423 	locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2424 	locpositions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
2425 
2426 	genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
2427 
2428 #endif
2429       } else {
2430 #ifdef HAVE_MMAP
2431 	fprintf(stderr,"Batch mode %s not recognized.  Only allow 0-5.  Run 'gsnap --help' for more information.\n",optarg);
2432 #else
2433 	fprintf(stderr,"Batch mode %s not recognized.  Only allow 4-5, since mmap is disabled.  Run 'gsnap --help' for more information.\n",optarg);
2434 #endif
2435 	return 9;
2436       }
2437       break;
2438 
2439 #if defined(HAVE_PTHREAD)
2440     case 't': nthreads = atoi(check_valid_int(optarg)); break;
2441 #else
2442     case 't': fprintf(stderr,"This version of GSNAP has pthreads disabled, so ignoring the value of %s for -t\n",optarg); break;
2443 #endif
2444 
2445     case 'A':
2446       if (!strcmp(optarg,"standard")) {
2447 	output_type = STD_OUTPUT;
2448       } else if (!strcmp(optarg,"sam")) {
2449 	output_type = SAM_OUTPUT;
2450       } else if (!strcmp(optarg,"m8")) {
2451 	output_type = M8_OUTPUT;
2452       } else {
2453 	fprintf(stderr,"Output format %s not recognized.  Allowed values: standard (default), sam, m8\n",optarg);
2454 	return 9;
2455       }
2456       break;
2457 
2458     case 'j':
2459       if (user_quality_shift == true) {
2460 	fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
2461 	return 9;
2462       } else {
2463 	quality_shift = atoi(check_valid_int(optarg));
2464 	user_quality_shift = true;
2465       }
2466       break;
2467 
2468     case 'J':
2469       if (user_quality_score_adj == true) {
2470 	fprintf(stderr,"Cannot specify both -J (--quality-zero-score) and --quality-protocol\n");
2471 	return 9;
2472       } else {
2473 	MAPQ_init(/*quality_score_adj*/atoi(check_valid_int(optarg)));
2474 	user_quality_score_adj = true;
2475       }
2476       break;
2477 
2478     case '0': exception_raise_p = false; break; /* Allows signals to pass through */
2479     case 'n': maxpaths_report = atoi(check_valid_int(optarg)); break;
2480     case 'Q': quiet_if_excessive_p = true; break;
2481 
2482     case 'O': orderedp = true; break;
2483 
2484     case '?': fprintf(stderr,"For usage, run 'gsnap --help'\n"); return 9;
2485     default: return 9;
2486     }
2487   }
2488 
2489   /* Make inferences */
2490   if (genome_dbroot == NULL) {
2491     fprintf(stderr,"Need to specify the -d flag.  For usage, run 'gsnap --help'\n");
2492     /* print_program_usage(); */
2493     return 9;
2494   }
2495 
2496   if (acc_fieldi_end < acc_fieldi_start) {
2497     fprintf(stderr,"--fastq-id-end must be equal to or greater than --fastq-id-start\n");
2498     return 9;
2499   }
2500 
2501   if (clip_overlap_p == true && merge_overlap_p == true) {
2502     fprintf(stderr,"Cannot specify both --clip-overlap and --merge-overlap.  Please choose one.\n");
2503     return 9;
2504   }
2505 
2506   if (masked_root != NULL && genome_unk_mismatch_p == true) {
2507     fprintf(stderr,"Unexpected combination of --use-mask and --genome-unk-mismatch=true.  Are you sure?\n");
2508   }
2509 
2510   if (transcriptome_dbroot == NULL && use_only_transcriptome_p == true) {
2511     fprintf(stderr,"Flag --use-transcriptome-only was given, but no transcriptdb was specified.  Ignoring this flag\n");
2512   }
2513 
2514   if (transcriptome_dbroot != NULL) {
2515     if (novelsplicingp == false) {
2516       fprintf(stderr,"Transcriptome specified => assume reads are RNA-Seq.  Turning on novel splicing.\n");
2517       novelsplicingp = true;
2518     } else {
2519       fprintf(stderr,"Transcriptome specified => assume reads are RNA-Seq\n");
2520     }
2521     pairmax_transcriptome = pairmax_dna;
2522     pairmax_linear = pairmax_rna;
2523     pairmax_circular = pairmax_dna;
2524     shortsplicedist_known = shortsplicedist;
2525 
2526     if (specified_filter_within_trims_p == true) {
2527       filter_within_trims_p = user_filter_within_trims_p;
2528     } else {
2529       filter_within_trims_p = true;	/* Use trims by default for RNA-Seq */
2530     }
2531 
2532     if (user_localdb_p == false) {
2533       /* User did not specify whether to use localdb, so using it by default */
2534       use_localdb_p = true;
2535     }
2536 
2537 #if 0
2538   } else if (find_dna_chimeras_p == true) {
2539     /* DNA-Seq, but need to trim to find chimeras */
2540     fprintf(stderr,"Neither novel splicing (-N) nor known splicing (-s) turned on => assume reads are DNA-Seq (genomic)\n");
2541     pairmax_transcriptome = 0;
2542     pairmax_linear = pairmax_dna;
2543     pairmax_circular = pairmax_dna;
2544     shortsplicedist = shortsplicedist_known = 0U;
2545     shortsplicedist_novelend = 0U;
2546 
2547     if (specified_filter_within_trims_p == true) {
2548       filter_within_trims_p = user_filter_within_trims_p;
2549     } else {
2550       filter_within_trims_p = true;	/* Use trims by default for DNA-Seq */
2551     }
2552 #endif
2553 
2554   } else if (novelsplicingp == true && knownsplicingp == true) {
2555     fprintf(stderr,"Novel splicing (-N) and known splicing (-s) both turned on => assume reads are RNA-Seq\n");
2556     pairmax_transcriptome = 0;
2557     pairmax_linear = pairmax_rna;
2558     pairmax_circular = pairmax_dna;
2559     shortsplicedist_known = shortsplicedist;
2560 
2561     if (specified_filter_within_trims_p == true) {
2562       filter_within_trims_p = user_filter_within_trims_p;
2563     } else {
2564       filter_within_trims_p = true;	/* Use trims by default for RNA-Seq */
2565     }
2566 
2567     if (user_localdb_p == false) {
2568       /* User did not specify whether to use localdb, so using it by default */
2569       use_localdb_p = true;
2570     }
2571 
2572   } else if (knownsplicingp == true) {
2573     fprintf(stderr,"Known splicing (-s) turned on => assume reads are RNA-Seq\n");
2574     pairmax_transcriptome = 0;
2575     pairmax_linear = pairmax_rna;
2576     pairmax_circular = pairmax_dna;
2577     shortsplicedist_known = shortsplicedist;
2578 
2579     if (specified_filter_within_trims_p == true) {
2580       filter_within_trims_p = user_filter_within_trims_p;
2581     } else {
2582       filter_within_trims_p = true;	/* Use trims by default for RNA-Seq */
2583     }
2584 
2585     if (user_localdb_p == false) {
2586       /* User did not specify whether to use localdb, so using it by default */
2587       use_localdb_p = true;
2588     }
2589 
2590   } else if (novelsplicingp == true) {
2591     fprintf(stderr,"Novel splicing (-N) turned on => assume reads are RNA-Seq\n");
2592     pairmax_transcriptome = 0;
2593     pairmax_linear = pairmax_rna;
2594     pairmax_circular = pairmax_dna;
2595     shortsplicedist_known = 0;
2596 
2597     if (specified_filter_within_trims_p == true) {
2598       filter_within_trims_p = user_filter_within_trims_p;
2599     } else {
2600       filter_within_trims_p = true;	/* Use trims by default for RNA-Seq */
2601     }
2602 
2603     if (user_localdb_p == false) {
2604       /* User did not specify whether to use localdb, so using it by default */
2605       use_localdb_p = true;
2606     }
2607 
2608   } else {
2609     /* Straight DNA-Seq */
2610     fprintf(stderr,"Neither novel splicing (-N) nor known splicing (-s) turned on => assume reads are DNA-Seq (genomic)\n");
2611     pairmax_transcriptome = 0;
2612     pairmax_linear = pairmax_dna;
2613     pairmax_circular = pairmax_dna;
2614     shortsplicedist = shortsplicedist_known = 0U;
2615     shortsplicedist_novelend = 0U;
2616 
2617 #if 0
2618     /* This ignores datasets where sequence ends are indeed bad, or have primers.  User can set values to 0 if desired */
2619     if (user_trim_mismatch_score_p == false) {
2620       trim_mismatch_score = 0;
2621     }
2622     if (user_trim_indel_score_p == false) {
2623       trim_indel_score = 0;
2624     }
2625 #endif
2626 
2627     if (specified_filter_within_trims_p == true) {
2628       filter_within_trims_p = user_filter_within_trims_p;
2629     } else {
2630       filter_within_trims_p = false;	/* Ignore trims by default (i.e., extend to ends) for DNA-Seq */
2631     }
2632 
2633     if (user_localdb_p == false) {
2634       /* User did not specify whether to use localdb, so not using it by default for DNA-seq */
2635       use_localdb_p = false;
2636     }
2637 
2638 #if 0
2639     /* Large soft-clips not expected for DNA-Seq, so require good alignments */
2640     if (user_mismatches_refalt_float < 0.0) {
2641       user_mismatches_refalt_float = 0.10;
2642     }
2643 #endif
2644   }
2645 
2646 
2647   if (shortsplicedist_novelend > shortsplicedist) {
2648     fprintf(stderr,"The novelend-splicedist %d is greater than the localsplicedist %d.  Resetting novelend-splicedist to be %d\n",
2649 	    shortsplicedist_novelend,shortsplicedist,shortsplicedist);
2650     shortsplicedist_novelend = shortsplicedist;
2651   }
2652 
2653   if (distantsplicing_penalty < localsplicing_penalty) {
2654     fprintf(stderr,"The distant splicing penalty %d cannot be less than local splicing penalty %d\n",
2655 	    distantsplicing_penalty,localsplicing_penalty);
2656     return 9;
2657   }
2658 
2659   if (sam_headers_batch >= 0) {
2660     if (part_modulus == sam_headers_batch) {
2661       sam_headers_p = true;
2662     } else {
2663       sam_headers_p = false;
2664     }
2665   }
2666 
2667   if (sam_read_group_id == NULL && sam_read_group_name != NULL) {
2668     sam_read_group_id = sam_read_group_name;
2669   } else if (sam_read_group_id != NULL && sam_read_group_name == NULL) {
2670     sam_read_group_name = sam_read_group_id;
2671   }
2672 
2673   if (chop_primers_p == true) {
2674     if (invert_first_p == false && invert_second_p == true) {
2675       /* orientation FR */
2676     } else {
2677       fprintf(stderr,"Adapter stripping not currently implemented for given orientation\n");
2678       return 9;
2679     }
2680   }
2681 
2682 #ifdef USE_MPI
2683   /* Code does allow for MPI output to stdout, but appears not to work
2684      yet, and may not work if rank 0 is also a worker */
2685   if (split_output_root == NULL && output_file == NULL) {
2686     fprintf(stderr,"For MPI version, need to specify either --split-output or --output-file\n");
2687     return 9;
2688   }
2689 #endif
2690 
2691   return 0;
2692 }
2693 
2694 
2695 static bool
open_input_streams_parser(int * nextchar,int * nchars1,int * nchars2,char *** files,int * nfiles,FILE ** input,FILE ** input2,gzFile * gzipped,gzFile * gzipped2,Bzip2_T * bzipped,Bzip2_T * bzipped2,char * read_files_command,bool gunzip_p,bool bunzip2_p,bool interleavedp,int argc,char ** argv)2696 open_input_streams_parser (int *nextchar, int *nchars1, int *nchars2, char ***files, int *nfiles,
2697 			   FILE **input, FILE **input2,
2698 #ifdef HAVE_ZLIB
2699 			   gzFile *gzipped, gzFile *gzipped2,
2700 #endif
2701 #ifdef HAVE_BZLIB
2702 			   Bzip2_T *bzipped, Bzip2_T *bzipped2,
2703 #endif
2704 			   char *read_files_command, bool gunzip_p, bool bunzip2_p, bool interleavedp,
2705 			   int argc, char **argv) {
2706   bool fastq_format_p = false;
2707 
2708   *input = *input2 = NULL;
2709 #ifdef HAVE_ZLIB
2710   *gzipped = *gzipped2 = NULL;
2711 #endif
2712 #ifdef HAVE_BZLIB
2713   *bzipped = *bzipped2 = NULL;
2714 #endif
2715 
2716   /* Open input stream and peek at first char */
2717   if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
2718     /* Ignore any input */
2719     *input = stdin;
2720     *files = (char **) NULL;
2721     *nfiles = 0;
2722     *nextchar = EOF;
2723     return false;
2724 
2725   } else if (argc == 0) {
2726 #ifdef USE_MPI
2727     fprintf(stderr,"For mpi_gsnap, cannot read from stdin\n");
2728     exit(9);
2729 #else
2730     fprintf(stderr,"Reading from stdin\n");
2731     *input = stdin;
2732     *files = (char **) NULL;
2733     *nfiles = 0;
2734     if (interleavedp == true) {
2735       *nextchar = '\0';
2736     } else {
2737       *nextchar = Shortread_input_init(&(*nchars1),*input);
2738     }
2739 #endif
2740 
2741   } else {
2742     *files = argv;
2743     *nfiles = argc;
2744 
2745     if (gunzip_p == true) {
2746 #ifdef HAVE_ZLIB
2747       if ((*gzipped = gzopen((*files)[0],"rb")) == NULL) {
2748 	fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2749 	exit(9);
2750       } else {
2751 #ifdef HAVE_ZLIB_GZBUFFER
2752 	gzbuffer(*gzipped,GZBUFFER_SIZE);
2753 #endif
2754 	if (interleavedp == true) {
2755 	  *nextchar = '\0';
2756 	} else {
2757 	  *nextchar = Shortread_input_init_gzip(*gzipped);
2758 	}
2759       }
2760 #endif
2761 
2762     } else if (bunzip2_p == true) {
2763 #ifdef HAVE_BZLIB
2764       if ((*bzipped = Bzip2_new((*files)[0])) == NULL) {
2765 	fprintf(stderr,"Cannot open bzipped file %s\n",(*files)[0]);
2766 	exit(9);
2767       } else {
2768 	if (interleavedp == true) {
2769 	  *nextchar = '\0';
2770 	} else {
2771 	  *nextchar = Shortread_input_init_bzip2(*bzipped);
2772 	}
2773       }
2774 #endif
2775 
2776     } else {
2777       if ((*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2778 	fprintf(stderr,"Unable to read input\n");
2779 	exit(9);
2780       } else {
2781 	debugf(fprintf(stderr,"Master opening file %s using fopen for input\n",(*files)[0]));
2782 	if (interleavedp == true) {
2783 	  *nextchar = '\0';
2784 	} else {
2785 	  *nextchar = Shortread_input_init(&(*nchars1),*input);
2786 	}
2787       }
2788     }
2789 
2790     (*files)++;
2791     (*nfiles)--;
2792   }
2793 
2794   if (interleavedp == true) {
2795     /* Read one line at a time */
2796     return /*fastq_format_p*/false;
2797 
2798   } else if (*nextchar == EOF) {
2799     fprintf(stderr,"Warning: input is empty\n");
2800     exit(0);
2801 
2802   } else if (*nextchar == '@') {
2803     /* Looks like a FASTQ file */
2804     if (*nfiles == 0 || force_single_end_p == true) {
2805 #ifdef HAVE_ZLIB
2806       *gzipped2 = (gzFile) NULL;
2807 #endif
2808 #ifdef HAVE_BZLIB
2809       *bzipped2 = (Bzip2_T) NULL;
2810 #endif
2811       *input2 = (FILE *) NULL;
2812     } else {
2813       if (gunzip_p == true) {
2814 #ifdef HAVE_ZLIB
2815 	if ((*gzipped2 = gzopen((*files)[0],"rb")) == NULL) {
2816 	  fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2817 	  exit(9);
2818 	} else {
2819 #ifdef HAVE_ZLIB_GZBUFFER
2820 	  gzbuffer(*gzipped2,GZBUFFER_SIZE);
2821 #endif
2822 	  /* nextchar2 = */ Shortread_input_init_gzip(*gzipped2);
2823 	}
2824 #endif
2825 
2826       } else if (bunzip2_p == true) {
2827 #ifdef HAVE_BZLIB
2828 	if ((*bzipped2 = Bzip2_new((*files)[0])) == NULL) {
2829 	  fprintf(stderr,"Cannot open bzip2 file %s\n",(*files)[0]);
2830 	  exit(9);
2831 	} else {
2832 	  /* nextchar2 = */ Shortread_input_init_bzip2(*bzipped2);
2833 	}
2834 #endif
2835 
2836       } else {
2837 	if ((*input2 = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2838 	  fprintf(stderr,"Unable to read input\n");
2839 	  exit(9);
2840 	} else {
2841 	  debugf(fprintf(stderr,"Master opening file %s using fopen for input2\n",(*files)[0]));
2842 	  /* nextchar2 = */ Shortread_input_init(&(*nchars2),*input2);
2843 	}
2844       }
2845       (*files)++;
2846       (*nfiles)--;
2847     }
2848     fastq_format_p = true;
2849 
2850   } else if (*nextchar == '>') {
2851     /* Looks like a FASTA file */
2852 
2853   } else {
2854     fprintf(stderr,"First char is %c.  Expecting either '>' for FASTA or '@' for FASTQ format.\n",*nextchar);
2855     exit(9);
2856   }
2857 
2858   return fastq_format_p;
2859 }
2860 
2861 
2862 #ifdef USE_MPI
2863 static void
open_input_streams_worker(char *** files,int * nfiles,MPI_File * input,MPI_File * input2,MPI_Comm workers_comm,gzFile * gzipped,gzFile * gzipped2,Bzip2_T * bzipped,Bzip2_T * bzipped2,char * read_files_command,bool gunzip_p,bool bunzip2_p,bool interleavedp,bool fastq_format_p,int argc,char ** argv)2864 open_input_streams_worker (char ***files, int *nfiles,
2865 #if defined(USE_MPI_FILE_INPUT)
2866 			   MPI_File *input, MPI_File *input2, MPI_Comm workers_comm,
2867 #else
2868 			   FILE **input, FILE **input2,
2869 #endif
2870 #ifdef HAVE_ZLIB
2871 			   gzFile *gzipped, gzFile *gzipped2,
2872 #endif
2873 #ifdef HAVE_BZLIB
2874 			   Bzip2_T *bzipped, Bzip2_T *bzipped2,
2875 #endif
2876 			   char *read_files_command, bool gunzip_p, bool bunzip2_p,
2877 			   bool interleavedp, bool fastq_format_p, int argc, char **argv) {
2878 
2879   *input = *input2 = NULL;
2880 #ifdef HAVE_ZLIB
2881   *gzipped = *gzipped2 = NULL;
2882 #endif
2883 #ifdef HAVE_BZLIB
2884   *bzipped = *bzipped2 = NULL;
2885 #endif
2886 
2887   /* Open input stream and peek at first char */
2888   if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
2889     /* Ignore input */
2890 
2891   } else if (argc == 0) {
2892     fprintf(stderr,"For mpi_gsnap, cannot read from stdin\n");
2893     exit(9);
2894 
2895   } else {
2896     *files = argv;
2897     *nfiles = argc;
2898 
2899     if (gunzip_p == true) {
2900 #ifdef HAVE_ZLIB
2901       if ((*gzipped = gzopen((*files)[0],"rb")) == NULL) {
2902 	fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2903 	exit(9);
2904       } else {
2905 #ifdef HAVE_ZLIB_GZBUFFER
2906 	gzbuffer(*gzipped,GZBUFFER_SIZE);
2907 #endif
2908       }
2909 #endif
2910 
2911     } else if (bunzip2_p == true) {
2912 #ifdef HAVE_BZLIB
2913       if ((*bzipped = Bzip2_new((*files)[0])) == NULL) {
2914 	fprintf(stderr,"Cannot open bzipped file %s\n",(*files)[0]);
2915 	exit(9);
2916       }
2917 #endif
2918 
2919     } else {
2920 #if defined(USE_MPI_FILE_INPUT)
2921       if ((*input = MPI_fopen((*files)[0],workers_comm)) == NULL) {
2922 	fprintf(stderr,"Cannot open file %s\n",(*files)[0]);
2923 	exit(9);
2924       }
2925       debugf(fprintf(stderr,"Slave opening file %s using MPI_File_open\n",(*files)[0]));
2926 #else
2927       if ((*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2928 	fprintf(stderr,"Unable to read input\n");
2929 	exit(9);
2930       }
2931       debugf(fprintf(stderr,"Slave opening file %s using fopen\n",(*files)[0]));
2932 #endif
2933     }
2934 
2935     (*files)++;
2936     (*nfiles)--;
2937   }
2938 
2939 
2940   if (interleavedp == true || fastq_format_p == true) {
2941     /* Looks like a FASTQ file */
2942     if (*nfiles == 0 || force_single_end_p == true) {
2943 #ifdef HAVE_ZLIB
2944       *gzipped2 = (gzFile) NULL;
2945 #endif
2946 #ifdef HAVE_BZLIB
2947       *bzipped2 = (Bzip2_T) NULL;
2948 #endif
2949 #if defined(USE_MPI_FILE_INPUT)
2950       *input2 = (MPI_File) NULL;
2951 #else
2952       *input2 = (FILE *) NULL;
2953 #endif
2954     } else {
2955       if (gunzip_p == true) {
2956 #ifdef HAVE_ZLIB
2957 	if ((*gzipped2 = gzopen((*files)[0],"rb")) == NULL) {
2958 	  fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2959 	  exit(9);
2960 	} else {
2961 #ifdef HAVE_ZLIB_GZBUFFER
2962 	  gzbuffer(*gzipped2,GZBUFFER_SIZE);
2963 #endif
2964 	}
2965 #endif
2966 
2967       } else if (bunzip2_p == true) {
2968 #ifdef HAVE_BZLIB
2969 	if ((*bzipped2 = Bzip2_new((*files)[0])) == NULL) {
2970 	  fprintf(stderr,"Cannot open bzip2 file %s\n",(*files)[0]);
2971 	  exit(9);
2972 	}
2973 #endif
2974 
2975       } else {
2976 #if defined(USE_MPI_FILE_INPUT)
2977 	if ((*input2 = MPI_fopen((*files)[0],workers_comm)) == NULL) {
2978 	  fprintf(stderr,"Cannot open file %s\n",(*files)[0]);
2979 	  exit(9);
2980 	}
2981 	debugf(fprintf(stderr,"Slave opening file %s using MPI_File_open\n",(*files)[0]));
2982 #else
2983 	if ((*input2 = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2984 	  fprintf(stderr,"Unable to read input\n");
2985 	  exit(9);
2986 	}
2987 	debugf(fprintf(stderr,"Slave opening file %s using fopen\n",(*files)[0]));
2988 #endif
2989       }
2990       (*files)++;
2991       (*nfiles)--;
2992     }
2993   }
2994 
2995   return;
2996 }
2997 #endif
2998 
2999 
3000 static Univ_IIT_T
transcript_iit_setup(Trcoord_T * transcriptomelength,int * ntranscripts,char * transcriptomesubdir,char * fileroot)3001 transcript_iit_setup (Trcoord_T *transcriptomelength, int *ntranscripts,
3002 		      char *transcriptomesubdir, char *fileroot) {
3003   Univ_IIT_T transcript_iit = NULL;
3004   char *iitfile = NULL;
3005 
3006   /* Prepare transcript data */
3007 
3008   iitfile = (char *) CALLOC(strlen(transcriptomesubdir)+strlen("/")+
3009 			    strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
3010   sprintf(iitfile,"%s/%s.chromosome.iit",transcriptomesubdir,fileroot);
3011   if ((transcript_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3012     fprintf(stderr,"IIT file %s is not valid\n",iitfile);
3013     exit(9);
3014   } else {
3015     FREE(iitfile);
3016     *ntranscripts = Univ_IIT_total_nintervals(transcript_iit);
3017   }
3018 
3019   *transcriptomelength = (Trcoord_T) Univ_IIT_genomelength(transcript_iit,/*with_circular_alias_p*/false);
3020   return transcript_iit;
3021 }
3022 
3023 
3024 
3025 static Univ_IIT_T
chromosome_iit_setup(Univcoord_T * genomelength,int * nchromosomes,int * circular_typeint,bool * any_circular_p,bool ** circularp,char * genomesubdir,char * fileroot)3026 chromosome_iit_setup (Univcoord_T *genomelength, int *nchromosomes, int *circular_typeint,
3027 		      bool *any_circular_p, bool **circularp, char *genomesubdir, char *fileroot) {
3028   Univ_IIT_T chromosome_iit = NULL, altscaffold_iit = NULL;
3029   char *iitfile = NULL;
3030 
3031 
3032   /* Prepare genomic data */
3033 
3034   iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
3035 			    strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
3036   sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
3037   if ((chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3038     fprintf(stderr,"IIT file %s is not valid\n",iitfile);
3039     exit(9);
3040 #ifdef LARGE_GENOMES
3041   } else if (Univ_IIT_coord_values_8p(chromosome_iit) == false) {
3042     fprintf(stderr,"This program gsnapl is designed for large genomes.\n");
3043     fprintf(stderr,"For small genomes of less than 2^32 (4 billion) bp, please run gsnap instead.\n");
3044     exit(9);
3045 #endif
3046   } else {
3047     FREE(iitfile);
3048     *nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
3049     *circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
3050     *circularp = Univ_IIT_circularp(&(*any_circular_p),chromosome_iit);
3051 
3052     iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
3053 			      strlen(fileroot)+strlen(".altscaffold.iit")+1,sizeof(char));
3054     sprintf(iitfile,"%s/%s.altscaffold.iit",genomesubdir,fileroot);
3055     if ((altscaffold_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3056       /* fprintf(stderr,"No altscaffold file found\n"); */
3057       altlocp = (bool *) CALLOC((*nchromosomes)+1,sizeof(bool));
3058       alias_starts = (Univcoord_T *) CALLOC((*nchromosomes)+1,sizeof(Univcoord_T));
3059       alias_ends = (Univcoord_T *) CALLOC((*nchromosomes)+1,sizeof(Univcoord_T));
3060 
3061     } else {
3062       fprintf(stderr,"Found altscaffold file found\n");
3063       altlocp = Univ_IIT_altlocp(&alias_starts,&alias_ends,chromosome_iit,altscaffold_iit);
3064       Univ_IIT_free(&altscaffold_iit);
3065     }
3066 
3067     FREE(iitfile);
3068   }
3069 
3070   *genomelength = Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias_p*/true);
3071   return chromosome_iit;
3072 }
3073 
3074 
3075 static void
worker_setup(char * transcriptomesubdir,char * transcriptome_fileroot,char * genomesubdir,char * genome_fileroot)3076 worker_setup (char *transcriptomesubdir, char *transcriptome_fileroot,
3077 	      char *genomesubdir, char *genome_fileroot) {
3078   char *snpsdir = NULL, *modedir = NULL, *mapdir = NULL, *iitfile = NULL;
3079   /* Univcoord_T genome_totallength; */
3080   char *idx_filesuffix1, *idx_filesuffix2;
3081 
3082   if (snps_root == NULL && maskedp == false) {
3083     if (transcriptome_fileroot != NULL) {
3084       transcriptomebits = Genome_new(transcriptomesubdir,transcriptome_fileroot,/*snps_root*/NULL,/*genometype*/GENOME_BITS,
3085 				     uncompressedp,genome_access,sharedp);
3086     }
3087 
3088     genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3089 			    uncompressedp,genome_access,sharedp);
3090     genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3091 			    uncompressedp,genome_access,sharedp);
3092 
3093   } else if (maskedp == true) {
3094     genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3095 			    uncompressedp,genome_access,sharedp);
3096     genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3097 			    uncompressedp,genome_access,sharedp);
3098     genomecomp_alt = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/masked_root,/*genometype*/GENOME_OLIGOS,
3099 				uncompressedp,genome_access,sharedp);
3100     genomebits_alt = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/masked_root,/*genometype*/GENOME_BITS,
3101 				uncompressedp,genome_access,sharedp);
3102   } else {
3103     if (user_snpsdir == NULL) {
3104       snpsdir = genomesubdir;
3105       mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3106     } else {
3107       snpsdir = user_snpsdir;
3108       mapdir = user_snpsdir;
3109     }
3110 
3111     /* SNPs */
3112     if (transcriptome_fileroot != NULL) {
3113       transcriptomebits = Genome_new(transcriptomesubdir,transcriptome_fileroot,/*snps_root*/NULL,/*genometype*/GENOME_BITS,
3114 				     uncompressedp,genome_access,sharedp);
3115       transcriptomebits_alt = Genome_new(snpsdir,transcriptome_fileroot,snps_root,/*genometype*/GENOME_BITS,
3116 					 uncompressedp,genome_access,sharedp);
3117     }
3118 
3119     genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3120 			    uncompressedp,genome_access,sharedp);
3121     genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3122 			    uncompressedp,genome_access,sharedp);
3123     genomecomp_alt = Genome_new(snpsdir,genome_fileroot,snps_root,/*genometype*/GENOME_OLIGOS,
3124 				uncompressedp,genome_access,sharedp);
3125     genomebits_alt = Genome_new(snpsdir,genome_fileroot,snps_root,/*genometype*/GENOME_BITS,
3126 				uncompressedp,genome_access,sharedp);
3127   }
3128 
3129   /* Must be done before Knownsplicing_retrieve_via_splicesites */
3130   Genome_setup(genomecomp,genomecomp_alt,genomelength,mode,circular_typeint);
3131 
3132 
3133   if (user_modedir == NULL) {
3134     modedir = genomesubdir;
3135   } else {
3136     modedir = user_modedir;
3137   }
3138 
3139   if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3140     idx_filesuffix1 = "metct";
3141     idx_filesuffix2 = "metga";
3142   } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
3143     idx_filesuffix1 = "a2iag";
3144     idx_filesuffix2 = "a2itc";
3145   } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
3146     idx_filesuffix1 = "a2itc";
3147     idx_filesuffix2 = "a2iag";
3148   } else {
3149     idx_filesuffix1 = IDX_FILESUFFIX; /* "ref" */
3150     idx_filesuffix2 = (char *) NULL;
3151   }
3152 
3153   if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
3154 				    modedir,genome_fileroot,idx_filesuffix1,snps_root,
3155 				    required_index1part,required_index1interval,
3156 				    offsetsstrm_access,positions_access,sharedp,
3157 				    multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3158 
3159     if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3160       fprintf(stderr,"Cannot find %s index file.  Need to run cmetindex first\n",idx_filesuffix1);
3161     } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED ||
3162 	       mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
3163       fprintf(stderr,"Cannot find %s index file.  Need to run atoiindex first\n",idx_filesuffix1);
3164     } else {
3165       fprintf(stderr,"Cannot find %s index file\n",idx_filesuffix1);
3166     }
3167     exit(9);
3168   }
3169 
3170   if (idx_filesuffix2 == NULL) {
3171     indexdb2 = indexdb;
3172   } else if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
3173 					    modedir,genome_fileroot,idx_filesuffix2,snps_root,
3174 					    required_index1part,required_index1interval,
3175 					    offsetsstrm_access,positions_access,sharedp,
3176 					    multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3177     if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3178       fprintf(stderr,"Cannot find %s index file.  Need to run cmetindex first\n",idx_filesuffix2);
3179     } else {
3180       fprintf(stderr,"Cannot find %s index file.  Need to run atoiindex first\n",idx_filesuffix2);
3181     }
3182     exit(9);
3183   }
3184 
3185   /* Note: regiondb not currently SNP-tolerant */
3186   if (use_localdb_p == false) {
3187     regiondb = (Regiondb_T) NULL;
3188   } else if ((regiondb = Regiondb_new_genome(&region1part,&region1interval,
3189 					     genomesubdir,genome_fileroot,idx_filesuffix1,/*snps_root*/NULL,
3190 					     required_region1part,required_region1interval,
3191 					     regiondb_access,sharedp,
3192 					     multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3193     fprintf(stderr,"Warning: Cannot find regiondb files.  GSNAP will run on this older index, but there is a newer index format available\n");
3194     regiondb2 = (Regiondb_T) NULL;
3195   } else if (idx_filesuffix2 == NULL) {
3196     regiondb2 = regiondb;
3197   } else {
3198     regiondb2 = Regiondb_new_genome(&local1part,&local1interval,
3199 				    genomesubdir,genome_fileroot,idx_filesuffix2,/*snps_root*/NULL,
3200 				    required_local1part,required_local1interval,
3201 				    regiondb_access,sharedp,
3202 				    multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
3203   }
3204 
3205 
3206   if (snps_root != NULL) {
3207     iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3208 			      strlen(snps_root)+1,sizeof(char));
3209     sprintf(iitfile,"%s/%s",mapdir,snps_root);
3210     fprintf(stderr,"Reading SNPs file %s/%s...",mapdir,snps_root);
3211     if ((snps_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3212 			     /*divstring*/NULL,/*add_iit_p*/true)) == NULL) {
3213       fprintf(stderr,"SNPs file %s.iit not found in %s.\n",snps_root,mapdir);
3214       if (user_snpsdir == NULL) {
3215 	fprintf(stderr,"Available files:\n");
3216 	Datadir_list_directory(stderr,genomesubdir);
3217 	fprintf(stderr,"Either install file %s.iit or specify a directory for the IIT file\n",snps_root);
3218 	fprintf(stderr,"using the -M flag.\n");
3219 	exit(9);
3220       }
3221     }
3222 
3223     print_nsnpdiffs_p = true;
3224     snps_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,snps_iit);
3225 
3226     fprintf(stderr,"done\n");
3227     FREE(iitfile);
3228     if (user_snpsdir == NULL) {
3229       FREE(mapdir);
3230     }
3231   }
3232 
3233 
3234   if (min_distantsplicing_end_matches < index1part) {
3235     fprintf(stderr,"Minimum value for distant-splice-endlength is the value for -k (kmer size) %d\n",index1part);
3236     exit(9);
3237   }
3238 
3239   indexdb_size_threshold = (int) (10*Indexdb_mean_size(indexdb,mode,index1part));
3240   debug(printf("Size threshold is %d\n",indexdb_size_threshold));
3241   if (indexdb_size_threshold < MIN_INDEXDB_SIZE_THRESHOLD) {
3242     indexdb_size_threshold = MIN_INDEXDB_SIZE_THRESHOLD;
3243   }
3244 
3245   if (genes_file != NULL) {
3246     if ((genes_iit = IIT_read(genes_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3247 			      /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3248       fprintf(stderr,"Reading genes file %s locally...",genes_file);
3249     } else {
3250       mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3251       iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3252 				strlen(genes_file)+1,sizeof(char));
3253       sprintf(iitfile,"%s/%s",mapdir,genes_file);
3254       if ((genes_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3255 				/*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3256 	fprintf(stderr,"Reading genes file %s...",iitfile);
3257 	FREE(iitfile);
3258 	FREE(mapdir);
3259       } else {
3260 	fprintf(stderr,"Genes file %s.iit not found locally or in %s.  Available files:\n",genes_file,mapdir);
3261 	Datadir_list_directory(stderr,mapdir);
3262 	fprintf(stderr,"Either install file %s or specify a full directory path\n",genes_file);
3263 	exit(9);
3264       }
3265     }
3266     genes_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,genes_iit);
3267     genes_chrnum_crosstable = Univ_IIT_chrnum_crosstable(chromosome_iit,genes_iit);
3268   }
3269 
3270 
3271   if (splicing_file != NULL) {
3272     if (user_splicingdir == NULL) {
3273       if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3274 				   /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3275 	fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
3276       }
3277     } else {
3278       iitfile = (char *) CALLOC(strlen(user_splicingdir)+strlen("/")+strlen(splicing_file)+1,sizeof(char));
3279       sprintf(iitfile,"%s/%s",user_splicingdir,splicing_file);
3280       if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3281 				   /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3282 	fprintf(stderr,"Reading splicing file %s...",iitfile);
3283 	FREE(iitfile);
3284       }
3285     }
3286 
3287     if (splicing_iit == NULL) {
3288       mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3289       iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3290 				strlen(splicing_file)+1,sizeof(char));
3291       sprintf(iitfile,"%s/%s",mapdir,splicing_file);
3292       if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3293 				   /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3294 	fprintf(stderr,"Reading splicing file %s...",iitfile);
3295 	FREE(iitfile);
3296 	FREE(mapdir);
3297       } else {
3298 	fprintf(stderr,"Splicing file %s.iit not found locally or in %s.  Available files:\n",splicing_file,mapdir);
3299 	Datadir_list_directory(stderr,mapdir);
3300 	fprintf(stderr,"Either install file %s or specify a full directory path\n",splicing_file);
3301 	exit(9);
3302       }
3303     }
3304 
3305     splicing_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,splicing_iit);
3306     if ((donor_typeint = IIT_typeint(splicing_iit,"donor")) >= 0 &&
3307 	(acceptor_typeint = IIT_typeint(splicing_iit,"acceptor")) >= 0) {
3308       fprintf(stderr,"found donor and acceptor tags, so treating as splicesites file\n");
3309       splicesites = Knownsplicing_retrieve_via_splicesites(&distances_observed_p,&splicecomp,&splicetypes,&splicedists,
3310 							   &nsplicesites,splicing_iit,splicing_divint_crosstable,
3311 							   donor_typeint,acceptor_typeint,chromosome_iit,
3312 							   genomecomp,genomecomp_alt,shortsplicedist);
3313       if (nsplicesites == 0) {
3314 	fprintf(stderr,"\nWarning: No splicesites observed for genome %s.  Are you sure this splicesite file was built for this genome?  Please compare chromosomes below:\n",
3315 		genome_dbroot);
3316 	fprintf(stderr,"Chromosomes in the genome: ");
3317 	Univ_IIT_dump_labels(stderr,chromosome_iit);
3318 	fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
3319 	IIT_dump_divstrings(stderr,splicing_iit);
3320 	exit(9);
3321       }
3322 
3323     } else {
3324       fprintf(stderr,"no donor or acceptor tags found, so treating as introns file\n");
3325       fprintf(stderr,"Intron mode for splicing file no longer supported in GSNAP\n");
3326       exit(9);
3327     }
3328 
3329     /* For benchmarking purposes.  Can spend time/memory to load
3330        splicesites, but then not use them. */
3331     if (unloadp == true) {
3332       fprintf(stderr,"unloading...");
3333 
3334       if (nsplicesites > 0) {
3335 	FREE(splicedists);
3336 	FREE(splicetypes);
3337 	FREE(splicesites);
3338 	FREE(splicecomp);
3339 	nsplicesites = 0;
3340       }
3341 
3342       FREE(splicing_divint_crosstable);
3343       IIT_free(&splicing_iit);
3344       splicing_iit = NULL;
3345       knownsplicingp = false;
3346       splicing_file = (char *) NULL;
3347     }
3348 
3349     fprintf(stderr,"done\n");
3350   }
3351 
3352   if (novelsplicingp == true || knownsplicingp == true) {
3353     /* RNA-seq */
3354     splicingp = true;
3355   } else {
3356     /* DNA-seq */
3357     splicingp = false;
3358   }
3359 
3360 
3361   if (tally_root != NULL) {
3362     if (user_tallydir == NULL) {
3363       if ((tally_iit = IIT_read(tally_root,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3364 				/*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3365 	fprintf(stderr,"Reading tally file %s.iit locally...",tally_root);
3366       }
3367     } else {
3368       iitfile = (char *) CALLOC(strlen(user_tallydir)+strlen("/")+strlen(tally_root)+1,sizeof(char));
3369       sprintf(iitfile,"%s/%s",user_tallydir,tally_root);
3370       if ((tally_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3371 				/*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3372 	fprintf(stderr,"Reading tally file %s...",iitfile);
3373 	FREE(iitfile);
3374       }
3375     }
3376 
3377     if (tally_iit == NULL) {
3378       iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(tally_root)+1,sizeof(char));
3379       sprintf(iitfile,"%s/%s",genomesubdir,tally_root);
3380       if ((tally_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3381 				/*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3382 	fprintf(stderr,"Reading tally file %s...",iitfile);
3383 	FREE(iitfile);
3384       } else {
3385 	fprintf(stderr,"Tally file %s.iit not found locally",tally_root);
3386 	if (user_tallydir != NULL) {
3387 	  fprintf(stderr," or in %s",user_tallydir);
3388 	}
3389 	fprintf(stderr," or in %s\n",genomesubdir);
3390 	exit(9);
3391       }
3392     }
3393 
3394     tally_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,tally_iit);
3395     fprintf(stderr,"done\n");
3396   }
3397 
3398 
3399   if (runlength_root != NULL) {
3400     if (user_runlengthdir == NULL) {
3401       if ((runlength_iit = IIT_read(runlength_root,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3402 				    /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3403 	fprintf(stderr,"Reading runlength file %s.iit locally...",runlength_root);
3404       }
3405     } else {
3406       iitfile = (char *) CALLOC(strlen(user_runlengthdir)+strlen("/")+strlen(runlength_root)+1,sizeof(char));
3407       sprintf(iitfile,"%s/%s",user_runlengthdir,runlength_root);
3408       if ((runlength_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3409 				    /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3410 	fprintf(stderr,"Reading runlength file %s...",iitfile);
3411 	FREE(iitfile);
3412       }
3413     }
3414 
3415     if (runlength_iit == NULL) {
3416       iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(runlength_root)+1,sizeof(char));
3417       sprintf(iitfile,"%s/%s",genomesubdir,runlength_root);
3418       if ((runlength_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3419 				    /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3420 	fprintf(stderr,"Reading runlength file %s...",iitfile);
3421 	FREE(iitfile);
3422       } else {
3423 	fprintf(stderr,"Runlength file %s.iit not found locally",runlength_root);
3424 	if (user_runlengthdir != NULL) {
3425 	  fprintf(stderr," or in %s",user_runlengthdir);
3426 	}
3427 	fprintf(stderr," or in %s\n",genomesubdir);
3428 	exit(9);
3429       }
3430     }
3431 
3432     runlength_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,runlength_iit);
3433     fprintf(stderr,"done\n");
3434   }
3435 
3436 
3437   Path_solve_setup(genomelength,circularp,genomebits,genomebits_alt,regiondb,regiondb2,
3438 		   min_intronlength,novelsplicingp,splicingp,
3439 		   splicesites,splicetypes,splicedists,nsplicesites,
3440 		   index1part,index1interval,local1part,region1part);
3441   Kmer_search_setup(mode,index1part_tr,index1part,index1interval,local1part,transcriptomelength,
3442 		    min_intronlength,chromosome_iit,genomelength,circular_typeint,circularp,
3443 		    genomebits,genomebits_alt,indexdb,indexdb2,indexdb_tr,
3444 		    splicingp,splicesites,splicetypes,splicedists,nsplicesites);
3445   Extension_search_setup(mode,chromosome_iit,genomelength,circular_typeint,circularp,
3446 			 genomebits,genomebits_alt,indexdb,indexdb2,
3447 			 min_intronlength,max_end_insertions,max_end_deletions,
3448 			 index1part,index1interval,local1part);
3449   Segment_search_setup(index1part,index1interval,max_anchors,chromosome_iit,nchromosomes,
3450 		       circular_typeint,mode,splicesites,splicetypes,splicedists,nsplicesites);
3451   Terminal_setup(chromosome_iit,genomelength,circular_typeint,genomebits,genomebits_alt,
3452 		 splicingp,index1part,index1interval,subopt_levels);
3453   Distant_rna_setup(chromosome_iit,genomelength,circular_typeint,genomebits,genomebits_alt,
3454 		    index1part,index1interval,subopt_levels,shortsplicedist,
3455 		    min_distantsplicing_end_matches,min_distantsplicing_identity,
3456 		    localsplicing_penalty,distantsplicing_penalty);
3457   Distant_dna_setup(genomebits,genomebits_alt,genomelength,shortsplicedist,splicingp);
3458 
3459 
3460   if (genomebits == NULL) {
3461     fprintf(stderr,"This version of GSNAP requires the genomebits128 file for the genome\n");
3462     exit(9);
3463   } else {
3464     Genome_hr_setup(Genome_blocks(genomebits),/*snp_blocks*/genomebits_alt ? Genome_blocks(genomebits_alt) : NULL,
3465 		    query_unk_mismatch_p,genome_unk_mismatch_p,mode,maskedp);
3466     Genome_consec_setup(query_unk_mismatch_p,genome_unk_mismatch_p,mode);
3467   }
3468 
3469   Genome_sites_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
3470   Maxent_hr_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
3471 
3472   Simplepair_setup(splicingp,transcript_iit,sam_insert_0M_p,
3473 		   md_lowercase_variant_p,/*snps_p*/snps_iit ? true : false,
3474 		   sam_cigar_extended_p,cigar_action);
3475 
3476   /* Oligoindex_hr_setup(Genome_blocks(genomecomp),mode); */
3477   /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
3478 
3479   Indexdb_setup(index1part);
3480   Oligo_setup(index1part,mode);
3481   Regiondb_setup(region1part,mode);
3482   /* spansize = Spanningelt_setup(index1part,index1interval); */
3483 
3484   Knownsplicing_setup(splicecomp);
3485   Splice_setup(genomebits,genomebits_alt,min_shortend);
3486   Indel_setup(genomebits,genomebits_alt,genomelength,transcriptomelength,
3487 	      min_indel_end_matches,indel_penalty_middle,maskedp);
3488   Transcript_setup(pairmax_transcriptome,expected_pairlength);
3489 
3490   Stage1hr_setup(transcript_iit,transcriptome,transcriptomebits,use_only_transcriptome_p,
3491 		 indexdb,indexdb2,indexdb_tr,index1part_tr,index1part,index1interval,
3492 		 user_mismatches_refalt_float,user_mismatches_ref_float,user_mincoverage_float,
3493 		 max_middle_insertions_float,max_middle_deletions_float,
3494 		 max_end_insertions,max_end_deletions,
3495 		 chromosome_iit,nchromosomes,genomecomp,genomebits,genomebits_alt,
3496 		 mode,maxpaths_search,maxpaths_report,find_dna_chimeras_p,distances_observed_p,
3497 		 subopt_levels,splicingp,shortsplicedist,shortsplicedist_novelend,
3498 		 min_intronlength,expected_pairlength,pairlength_deviation,
3499 		 distantsplicing_penalty,min_distantsplicing_end_matches,min_distantsplicing_identity);
3500   Substring_setup(print_nsnpdiffs_p,print_snplabels_p,
3501 		  show_refdiff_p,snps_iit,snps_divint_crosstable,
3502 		  genomebits,genomebits_alt,chromosome_iit,genomelength,nchromosomes,circular_typeint,
3503 		  novelsplicingp,knownsplicingp,output_type,mode,maskedp);
3504   Stage3hr_setup(/*transcriptomep*/transcriptome != NULL ? true : false,
3505 		 invert_first_p,invert_second_p,genomecomp,genomebits,genomebits_alt,
3506 		 chromosome_iit,genomelength,nchromosomes,circular_typeint,
3507 		 transcriptomebits,transcriptome,transcript_iit,
3508 		 tally_iit,tally_divint_crosstable,runlength_iit,runlength_divint_crosstable,
3509 		 distances_observed_p,pairmax_linear,pairmax_circular,
3510 		 expected_pairlength,pairlength_deviation,
3511 		 localsplicing_penalty,indel_penalty_middle,antistranded_penalty,
3512 		 favor_multiexon_p,subopt_levels,circularp,altlocp,alias_starts,alias_ends,
3513 		 filter_within_trims_p,omit_concordant_uniq_p,omit_concordant_mult_p,
3514 		 failedinput_root,output_type,merge_samechr_p,method_print_p,want_random_p);
3515 
3516   /* Note: For NM_145699_163_, want to increase subopt_levels by 8 for
3517      Concordance_setup, and in pruning based on nmatches in
3518      Stage3pair_optimal_score_final */
3519   Concordance_setup(subopt_levels,pairmax_transcriptome,pairmax_linear,pairmax_circular,
3520 		    expected_pairlength,pairlength_deviation,circularp,merge_samechr_p);
3521   SAM_setup(add_paired_nomappers_p,paired_flag_means_concordant_p,
3522 	    omit_concordant_uniq_p,omit_concordant_mult_p,quiet_if_excessive_p,
3523 	    maxpaths_report,failedinput_root,fastq_format_p,hide_soft_clips_p,method_print_p,
3524 	    clip_overlap_p,merge_overlap_p,merge_samechr_p,sam_multiple_primaries_p,sam_sparse_secondaries_p,
3525 	    force_xs_direction_p,snps_iit,find_dna_chimeras_p,
3526 	    splicingp,donor_typeint,acceptor_typeint,chromosome_iit,transcript_iit);
3527   Cigar_setup(genomecomp,sam_cigar_extended_p,hide_soft_clips_p,merge_samechr_p,md_lowercase_variant_p,
3528 	      sam_hardclip_use_S_p,sam_insert_0M_p);
3529   Output_setup(chromosome_iit,nofailsp,failsonlyp,quiet_if_excessive_p,maxpaths_report,
3530 	       failedinput_root,quality_shift,
3531 	       output_type,invert_first_p,invert_second_p,
3532 	       sam_read_group_id);
3533 
3534   return;
3535 }
3536 
3537 
3538 static void
worker_cleanup()3539 worker_cleanup () {
3540 
3541   Segment_search_cleanup();
3542 
3543   if (regiondb2 != regiondb) {
3544     Regiondb_free(&regiondb2);
3545   }
3546   if (regiondb != NULL) {
3547     Regiondb_free(&regiondb);
3548   }
3549 
3550   if (indexdb2 != indexdb) {
3551     Indexdb_free(&indexdb2);
3552   }
3553   if (indexdb != NULL) {
3554     Indexdb_free(&indexdb);
3555   }
3556 
3557   if (transcriptomebits_alt != NULL) {
3558     Genome_free(&transcriptomebits_alt);
3559   }
3560   if (transcriptomebits != NULL) {
3561     Genome_free(&transcriptomebits);
3562   }
3563 
3564 
3565   if (genomecomp_alt != NULL) {
3566     Genome_free(&genomecomp_alt);
3567     Genome_free(&genomebits_alt);
3568   }
3569   if (genomebits != NULL) {
3570     Genome_free(&genomebits);
3571   }
3572   if (genomecomp != NULL) {
3573     Genome_free(&genomecomp);
3574   }
3575 
3576   if (nsplicesites > 0) {
3577     FREE(splicedists);
3578     FREE(splicetypes);
3579     FREE(splicecomp);
3580     FREE(splicesites);
3581   }
3582 
3583   if (runlength_iit != NULL) {
3584     FREE(runlength_divint_crosstable);
3585     IIT_free(&runlength_iit);
3586   }
3587 
3588   if (tally_iit != NULL) {
3589     FREE(tally_divint_crosstable);
3590     IIT_free(&tally_iit);
3591   }
3592 
3593   if (splicing_iit != NULL) {
3594     FREE(splicing_divint_crosstable);
3595     IIT_free(&splicing_iit);
3596   }
3597 
3598   if (genes_iit != NULL) {
3599     FREE(genes_chrnum_crosstable);
3600     FREE(genes_divint_crosstable);
3601     IIT_free(&genes_iit);
3602   }
3603 
3604   if (snps_iit != NULL) {
3605     FREE(snps_divint_crosstable);
3606     IIT_free(&snps_iit);
3607   }
3608 
3609   if (altlocp != NULL) {
3610     FREE(alias_ends);
3611     FREE(alias_starts);
3612     FREE(altlocp);
3613   }
3614 
3615   if (circularp != NULL) {
3616     FREE(circularp);
3617   }
3618 
3619   if (indexdb_tr != NULL) {
3620     Indexdb_free(&indexdb_tr);
3621   }
3622 
3623   if (transcriptome != NULL) {
3624     Transcriptome_free(&transcriptome);
3625   }
3626 
3627   if (transcript_iit != NULL) {
3628     Univ_IIT_free(&transcript_iit);
3629   }
3630 
3631   if (chromosome_iit != NULL) {
3632     Univ_IIT_free(&chromosome_iit);
3633   }
3634 
3635   Access_controlled_cleanup();
3636 
3637   return;
3638 }
3639 
3640 
3641 int
main(int argc,char * argv[])3642 main (int argc, char *argv[]) {
3643   int nchars1 = 0, nchars2 = 0;
3644   int cmdline_status;
3645 
3646   char *transcriptomesubdir = NULL, *genomesubdir, *transcriptome_fileroot = NULL, *genome_fileroot, *dbversion;
3647   char **files;
3648   int nfiles;
3649 #if defined(USE_MPI) && defined(USE_MPI_FILE_INPUT)
3650   MPI_File mpi_file_input, mpi_file_input_2;
3651 #endif
3652 
3653 #ifdef USE_MPI
3654   Master_T master;
3655   char **files_master;
3656   int nfiles_master;
3657   FILE *input_parser, *input2_parser;
3658 #endif
3659   FILE *input, *input2;
3660 
3661 #ifdef HAVE_ZLIB
3662 #ifdef USE_MPI
3663   gzFile gzipped_master, gzipped2_master;
3664 #endif
3665   gzFile gzipped, gzipped2;
3666 #endif
3667 
3668 #ifdef HAVE_BZLIB
3669 #ifdef USE_MPI
3670   Bzip2_T bzipped_master, bzipped2_master;
3671 #endif
3672   Bzip2_T bzipped, bzipped2;
3673 #endif
3674 
3675   long int worker_id;
3676 
3677   int nread;
3678   int nextchar = '\0';
3679   double runtime;
3680 
3681 #ifdef HAVE_PTHREAD
3682   int ret;
3683   pthread_attr_t thread_attr_join;
3684 #ifdef USE_MPI
3685   pthread_attr_t thread_attr_detach;
3686 #endif
3687 #endif
3688 
3689 #ifdef HAVE_SIGACTION
3690   struct sigaction signal_action;
3691 #endif
3692 
3693   extern int optind;
3694 
3695 #ifdef MEMUSAGE
3696   Mem_usage_init();
3697   Mem_usage_set_threadname("main");
3698 #endif
3699 
3700 
3701   cmdline_status = parse_command_line(argc,argv,optind);
3702   argc -= optind;
3703   argv += optind;
3704 
3705   if (cmdline_status == 0) {
3706     /* okay to continue */
3707   } else if (cmdline_status == 1) {
3708     exit(0);
3709   } else {
3710     exit(cmdline_status);
3711   }
3712 
3713   check_compiler_assumptions();
3714 
3715   if (exception_raise_p == false) {
3716     fprintf(stderr,"Allowing signals and exceptions to pass through.  If using shared memory, need to remove segments manually.\n");
3717     Except_inactivate();
3718   } else {
3719 #ifdef HAVE_SIGACTION
3720     signal_action.sa_handler = signal_handler;
3721     signal_action.sa_flags = 0;
3722     sigfillset(&signal_action.sa_mask); /* After first signal, block all other signals */
3723 
3724     /* Note: SIGKILL and SIGSTOP cannot be caught */
3725 
3726     sigaction(SIGABRT,&signal_action,NULL); /* abnormal termination (abort) */
3727     sigaction(SIGBUS,&signal_action,NULL);  /* bus error */
3728     sigaction(SIGFPE,&signal_action,NULL);  /* arithmetic exception */
3729     sigaction(SIGHUP,&signal_action,NULL);  /* hangup */
3730     sigaction(SIGILL,&signal_action,NULL);  /* illegal hardware instruction */
3731     sigaction(SIGINT,&signal_action,NULL);  /* terminal interruption (control-C) */
3732     sigaction(SIGPIPE,&signal_action,NULL);  /* write to pipe with no readers */
3733     sigaction(SIGQUIT,&signal_action,NULL);  /* terminal quit (control-backslash) */
3734     sigaction(SIGSEGV,&signal_action,NULL);  /* invalid memory reference */
3735     sigaction(SIGSYS,&signal_action,NULL);  /* invalid system call */
3736     sigaction(SIGTERM,&signal_action,NULL);  /* Unix kill command */
3737     sigaction(SIGTRAP,&signal_action,NULL);  /* hardware fault */
3738     sigaction(SIGXCPU,&signal_action,NULL);  /* CPU limit exceeded */
3739     sigaction(SIGXFSZ,&signal_action,NULL);  /* file size limit exceeded */
3740 #endif
3741   }
3742 
3743 #ifdef USE_MPI
3744   /* MPI_Init(&argc,&argv); */
3745   MPI_Init_thread(&argc,&argv,/*requested*/MPI_THREAD_MULTIPLE,&provided);
3746   MPI_Comm_rank(MPI_COMM_WORLD,&myid);
3747   MPI_Comm_size(MPI_COMM_WORLD,&nranks);
3748   MPI_Debug_setup(myid);
3749 
3750   nthreads0 = nthreads - 1;
3751   if (master_is_worker_p == false) {
3752     /* Default is to exclude master node from working */
3753     exclude_ranks[0] = 0;
3754     MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3755     MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
3756     MPI_Comm_create(MPI_COMM_WORLD,workers_group,&workers_comm);
3757     MPI_Group_free(&workers_group);
3758     MPI_Group_free(&world_group);
3759 
3760   } else if (nthreads0 <= 0) {
3761     /* If insufficient threads, then also exclude master node from working */
3762     exclude_ranks[0] = 0;
3763     MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3764     MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
3765     MPI_Comm_create(MPI_COMM_WORLD,workers_group,&workers_comm);
3766     MPI_Group_free(&workers_group);
3767     MPI_Group_free(&world_group);
3768     master_is_worker_p = false;
3769 
3770   } else {
3771     /* Include master rank 0 in workers group */
3772     MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3773     MPI_Comm_create(MPI_COMM_WORLD,world_group,&workers_comm);
3774     MPI_Group_free(&world_group);
3775     /* master_is_worker_p = true; */
3776   }
3777   n_slave_ranks = nranks - 1;	/* Don't include master, even if it's a worker */
3778 
3779   if (myid == 0) {
3780     nthreads = nthreads0;
3781     fastq_format_p = open_input_streams_parser(&nextchar,&nchars1,&nchars2,
3782 					       &files_master,&nfiles_master,&input_parser,&input2_parser,
3783 #ifdef HAVE_ZLIB
3784 					       &gzipped_master,&gzipped2_master,
3785 #endif
3786 #ifdef HAVE_BZLIB
3787 					       &bzipped_master,&bzipped2_master,
3788 #endif
3789 					       read_files_command,gunzip_p,bunzip2_p,interleavedp,argc,argv);
3790     master = Master_new(n_slave_ranks,nextchar,nchars1,nchars2,
3791 			input_parser,input2_parser,
3792 #ifdef HAVE_ZLIB
3793 			gzipped_master,gzipped2_master,
3794 #endif
3795 #ifdef HAVE_BZLIB
3796 			bzipped_master,bzipped2_master,
3797 #endif
3798 			files_master,nfiles_master,inbuffer_nspaces,part_modulus,part_interval);
3799   }
3800 
3801   MPI_Bcast(&fastq_format_p,1,MPI_BOOL_T,/*root*/0,MPI_COMM_WORLD);
3802   MPI_Bcast(&nextchar,1,MPI_CHAR,/*root*/0,MPI_COMM_WORLD);
3803 
3804   /* If not using MPI_File, then master already has input and input2,
3805      and does not need mpi_file_input or mpi_file_input_2 (because of the workers_comm) */
3806   if (myid > 0 || master_is_worker_p == true) {
3807     open_input_streams_worker(&files,&nfiles,
3808 #ifdef USE_MPI_FILE_INPUT
3809 			      &mpi_file_input,&mpi_file_input_2,workers_comm,
3810 #else
3811 			      &input,&input2,
3812 #endif
3813 #ifdef HAVE_ZLIB
3814 			      &gzipped,&gzipped2,
3815 #endif
3816 #ifdef HAVE_BZLIB
3817 			      &bzipped,&bzipped2,
3818 #endif
3819 			      read_files_command,gunzip_p,bunzip2_p,interleavedp,fastq_format_p,argc,argv);
3820 
3821     /* Inbuffer_master_process skips to part_modulus, so workers need it set to 0 */
3822     Inbuffer_setup(filter_if_both_p,
3823 #ifdef USE_MPI_FILE_INPUT
3824 		   workers_comm,
3825 #endif
3826 		   /*part_modulus*/0,part_interval);
3827 
3828     inbuffer = Inbuffer_new(nextchar,myid,
3829 #ifdef USE_MPI_FILE_INPUT
3830 			    mpi_file_input,mpi_file_input_2,
3831 #else
3832 			    input,input2,
3833 #endif
3834 #ifdef HAVE_ZLIB
3835 			    gzipped,gzipped2,
3836 #endif
3837 #ifdef HAVE_BZLIB
3838 			    bzipped,bzipped2,
3839 #endif
3840 			    read_files_command,files,nfiles,inbuffer_nspaces);
3841   }
3842 
3843   Shortread_setup(acc_fieldi_start,acc_fieldi_end,force_single_end_p,filter_chastity_p,
3844 		  allow_paired_end_mismatch_p,fastq_format_p,barcode_length,endtrim_length,
3845 		  invert_first_p,invert_second_p);
3846   multiple_sequences_p = true;
3847 
3848 
3849 #else
3850   /* Non-MPI version */
3851   fastq_format_p = open_input_streams_parser(&nextchar,&nchars1,&nchars2,&files,&nfiles,&input,&input2,
3852 #ifdef HAVE_ZLIB
3853 					     &gzipped,&gzipped2,
3854 #endif
3855 #ifdef HAVE_BZLIB
3856 					     &bzipped,&bzipped2,
3857 #endif
3858 					     read_files_command,gunzip_p,bunzip2_p,interleavedp,argc,argv);
3859 
3860   Inbuffer_setup(filter_if_both_p,part_modulus,part_interval);
3861 
3862   inbuffer = Inbuffer_new(nextchar,input,input2,
3863 #ifdef HAVE_ZLIB
3864 			  gzipped,gzipped2,
3865 #endif
3866 #ifdef HAVE_BZLIB
3867 			  bzipped,bzipped2,
3868 #endif
3869 			  interleavedp,read_files_command,files,nfiles,inbuffer_nspaces);
3870 
3871   Shortread_setup(acc_fieldi_start,acc_fieldi_end,force_single_end_p,filter_chastity_p,
3872 		  allow_paired_end_mismatch_p,fastq_format_p,barcode_length,endtrim_length,
3873 		  invert_first_p,invert_second_p);
3874 
3875   if ((nread = Inbuffer_fill_init(inbuffer)) > 1) {
3876     multiple_sequences_p = true;
3877   } else {
3878     multiple_sequences_p = false;
3879   }
3880 #endif
3881 
3882   if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
3883     /* No need for pre-load */
3884 
3885   } else if (multiple_sequences_p == true) {
3886 #if 0
3887     if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE) {
3888       fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
3889       fprintf(stderr,"  GSNAP can run slowly at first while the computer starts to accumulate\n");
3890       fprintf(stderr,"  pages from the hard disk into its cache.  To copy index files into RAM\n");
3891       fprintf(stderr,"  instead of memory mapping, use -B 3, -B 4, or -B 5, if you have enough RAM.\n");
3892 #ifdef HAVE_PTHREAD
3893       fprintf(stderr,"  For more speed, also try multiple threads (-t <int>), if you have multiple processors or cores.");
3894 #endif
3895       fprintf(stderr,"\n");
3896     }
3897 #endif
3898 
3899   } else {
3900     /* fprintf(stderr,"Note: only 1 sequence detected.  Ignoring batch (-B) command\n"); */
3901     expand_offsets_p = false;
3902 #ifdef HAVE_MMAP
3903     offsetsstrm_access = USE_MMAP_ONLY;
3904     positions_access = USE_MMAP_ONLY;
3905     genome_access = USE_MMAP_ONLY;
3906     locoffsetsstrm_access = USE_MMAP_ONLY;
3907     locpositions_access = USE_MMAP_ONLY;
3908 #else
3909     /* No choice, since mmap is not available */
3910     offsetsstrm_access = USE_ALLOCATE;
3911     positions_access = USE_ALLOCATE;
3912     genome_access = USE_ALLOCATE;
3913     locoffsetsstrm_access = USE_ALLOCATE;
3914     locpositions_access = USE_ALLOCATE;
3915 #endif
3916   }
3917 
3918   genomesubdir = Datadir_find_genomesubdir(&genome_fileroot,&dbversion,user_genomedir,genome_dbroot);
3919   FREE(dbversion);
3920   chromosome_iit = chromosome_iit_setup(&genomelength,&nchromosomes,&circular_typeint,&any_circular_p,&circularp,
3921 					genomesubdir,genome_fileroot);
3922   Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
3923 		  nthreads,orderedp,quiet_if_excessive_p,
3924 		  output_type,sam_headers_p,sam_read_group_id,sam_read_group_name,
3925 		  sam_read_group_library,sam_read_group_platform,
3926 		  appendp,output_file,split_output_root,failedinput_root);
3927 
3928 
3929   if (transcriptome_dbroot != NULL) {
3930     if (user_transcriptomedir == NULL && user_genomedir != NULL) {
3931       user_transcriptomedir = user_genomedir;
3932     }
3933     transcriptomesubdir = Datadir_find_genomesubdir(&transcriptome_fileroot,&dbversion,user_transcriptomedir,transcriptome_dbroot);
3934     FREE(dbversion);
3935     transcript_iit = transcript_iit_setup(&transcriptomelength,&ntranscripts,transcriptomesubdir,transcriptome_fileroot);
3936     transcriptome = Transcriptome_new(genomesubdir,genome_fileroot,transcriptome_fileroot,sharedp);
3937     indexdb_tr = Indexdb_new_transcriptome(&index1part_tr,&index1interval_tr,transcriptomesubdir,
3938 					   transcriptome_fileroot,/*idx_filesuffix*/"ref",/*snps_root*/NULL,
3939 					   /*required_index1part*/0,/*required_index1interval*/0,
3940 					   offsetsstrm_access,positions_access,sharedp,
3941 					   multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
3942   }
3943 
3944 
3945 #if defined(USE_MPI) && defined(HAVE_PTHREAD)
3946   /* Needed for Master_parser and possibly Master_write_stdout, which never terminate */
3947   pthread_attr_init(&thread_attr_detach);
3948   if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
3949     fprintf(stderr,"ERROR: pthread_attr_setdetachstate returned %d\n",ret);
3950     exit(1);
3951   }
3952 #endif
3953 
3954 #ifdef USE_MPI
3955   if (myid == 0 && master_is_worker_p == false) {
3956     FREE(transcriptomesubdir);
3957     FREE(transcriptome_fileroot);
3958     FREE(genomesubdir);
3959     FREE(genome_fileroot);
3960 
3961     /* Master rank, which is not a worker */
3962     if (output_file != NULL) {
3963       fprintf(stderr,"Starting alignment.  Writing results to %s\n",output_file);
3964     } else if (split_output_root != NULL) {
3965       fprintf(stderr,"Starting alignment.  Writing results to %s.*\n",split_output_root);
3966     } else {
3967       fprintf(stderr,"Starting alignment\n");
3968     }
3969 
3970     stopwatch = Stopwatch_new();
3971     Stopwatch_start(stopwatch);
3972 
3973     if (split_output_root == NULL && output_file == NULL) {
3974       pthread_create(&write_stdout_thread_id,&thread_attr_detach,Master_write_stdout,(void *) NULL);
3975     }
3976     pthread_create(&parser_thread_id,&thread_attr_detach,Master_parser,(void *) master);
3977     Master_mpi_interface((void *) master); /* Can run as a normal procedure, not as a thread */
3978 
3979   } else {
3980     worker_setup(transcriptomesubdir,transcriptome_fileroot,genomesubdir,genome_fileroot);
3981     FREE(transcriptomesubdir);
3982     FREE(transcriptome_fileroot);
3983     FREE(genomesubdir);
3984     FREE(genome_fileroot);
3985     MPI_Barrier(workers_comm);
3986 
3987     outbuffer = Outbuffer_new(output_buffer_size,/*nread*/0);
3988     Inbuffer_set_outbuffer(inbuffer,outbuffer);
3989     /* MPI worker ranks continue on with creating output_thread and worker_threads below */
3990 
3991     if (myid == 0) {
3992       Inbuffer_set_master(inbuffer,master);
3993 
3994       if (output_file != NULL) {
3995 	fprintf(stderr,"Starting alignment.  Writing results to %s\n",output_file);
3996       } else if (split_output_root != NULL) {
3997 	fprintf(stderr,"Starting alignment.  Writing results to %s.*\n",split_output_root);
3998       } else {
3999 	fprintf(stderr,"Starting alignment\n");
4000       }
4001       stopwatch = Stopwatch_new();
4002       Stopwatch_start(stopwatch);
4003     }
4004 
4005 #else
4006   Access_setup(preload_shared_memory_p,unload_shared_memory_p);
4007   worker_setup(transcriptomesubdir,transcriptome_fileroot,genomesubdir,genome_fileroot);
4008   FREE(transcriptomesubdir);
4009   FREE(transcriptome_fileroot);
4010   FREE(genomesubdir);
4011   FREE(genome_fileroot);
4012   if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
4013     worker_cleanup();
4014     return 0;
4015   }
4016 
4017   outbuffer = Outbuffer_new(output_buffer_size,nread);
4018   Inbuffer_set_outbuffer(inbuffer,outbuffer);
4019 
4020   if (output_file != NULL) {
4021     fprintf(stderr,"Starting alignment.  Writing results to %s\n",output_file);
4022   } else if (split_output_root != NULL) {
4023     fprintf(stderr,"Starting alignment.  Writing results to %s.*\n",split_output_root);
4024   } else {
4025     fprintf(stderr,"Starting alignment\n");
4026   }
4027   stopwatch = Stopwatch_new();
4028   Stopwatch_start(stopwatch);
4029 #endif
4030 
4031 
4032 
4033 #if !defined(HAVE_PTHREAD)
4034   /* Serial version */
4035   single_thread();
4036 
4037 #else
4038   /* Pthreads version */
4039   if (nthreads == 0) {
4040     single_thread();
4041 
4042   } else if (multiple_sequences_p == false) {
4043     single_thread();
4044 
4045   } else {
4046     pthread_attr_init(&thread_attr_join);
4047     if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
4048       fprintf(stderr,"ERROR: pthread_attr_setdetachstate returned %d\n",ret);
4049       exit(1);
4050     }
4051 
4052 #ifdef USE_MPI
4053     /* Master rank that is working or a Slave rank */
4054     if (myid == 0) {
4055       if (split_output_root == NULL && output_file == NULL) {
4056 	pthread_create(&write_stdout_thread_id,&thread_attr_detach,Master_write_stdout,(void *) NULL);
4057       }
4058       pthread_create(&parser_thread_id,&thread_attr_detach,Master_parser,(void *) master);
4059       pthread_create(&mpi_interface_thread_id,&thread_attr_join,Master_mpi_interface,(void *) master);
4060     }
4061 #endif
4062 
4063     worker_thread_ids = (pthread_t *) CALLOC(nthreads,sizeof(pthread_t));
4064     Except_init_pthread();
4065     pthread_key_create(&global_request_key,NULL);
4066 
4067     if (orderedp == true) {
4068       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
4069 		     (void *) outbuffer);
4070     } else {
4071       pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
4072 		     (void *) outbuffer);
4073     }
4074 
4075     for (worker_id = 0; worker_id < nthreads; worker_id++) {
4076       /* Need to have worker threads finish before we call Inbuffer_free() */
4077       pthread_create(&(worker_thread_ids[worker_id]),&thread_attr_join,worker_thread,(void *) worker_id);
4078     }
4079 
4080     pthread_join(output_thread_id,NULL);
4081     for (worker_id = 0; worker_id < nthreads; worker_id++) {
4082       pthread_join(worker_thread_ids[worker_id],NULL);
4083     }
4084 #ifdef USE_MPI
4085     if (myid == 0) {
4086       pthread_join(mpi_interface_thread_id,NULL);
4087     }
4088 #endif
4089 
4090     pthread_key_delete(global_request_key);
4091     /* Do not delete global_except_key, because worker threads might still need it */
4092     /* Except_term_pthread(); */
4093 
4094     FREE(worker_thread_ids);
4095 
4096   }
4097 #endif /* HAVE_PTHREAD */
4098 
4099 #ifdef USE_MPI
4100   /* MPI worker ranks finished with creating output_thread and worker_threads below */
4101   }
4102 #endif
4103 
4104 
4105   /* Note: Shortread and Sequence procedures should close their own input files */
4106 #ifdef USE_MPI
4107   if (myid == 0) {
4108     runtime = Stopwatch_stop(stopwatch);
4109     Stopwatch_free(&stopwatch);
4110 
4111     nread = Master_ntotal(master);
4112     fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
4113 	    nread,runtime,(double) nread/runtime);
4114     /* Master_free(&master); -- Master_parser thread still needs this */
4115   }
4116 
4117   if (myid > 0 || master_is_worker_p) {
4118     Outbuffer_free(&outbuffer);
4119     Inbuffer_free(&inbuffer);
4120   }
4121 
4122   Outbuffer_close_files();	/* All ranks have to close the files */
4123 
4124 #else
4125   /* Single CPU or Pthreads version */
4126   runtime = Stopwatch_stop(stopwatch);
4127   Stopwatch_free(&stopwatch);
4128 
4129   nread = Outbuffer_nread(outbuffer);
4130   /* nbeyond = Outbuffer_nbeyond(outbuffer); */
4131   fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
4132 	  nread,runtime,(double) nread/runtime);
4133 
4134   Outbuffer_free(&outbuffer);
4135   Inbuffer_free(&inbuffer);
4136 
4137   Outbuffer_close_files();
4138 #endif
4139 
4140   Outbuffer_cleanup();
4141 
4142 #ifdef USE_MPI
4143   if (myid > 0 || master_is_worker_p == true) {
4144     worker_cleanup();
4145     MPI_Comm_free(&workers_comm);
4146   }
4147   MPI_Barrier(MPI_COMM_WORLD);	/* Make sure all processes have cleaned up */
4148   MPI_Finalize();
4149 #else
4150   worker_cleanup();
4151 #endif
4152 
4153   return 0;
4154 }
4155 
4156 
4157 static void
print_program_usage()4158 print_program_usage () {
4159   fprintf(stdout,"\
4160 Usage: gsnap [OPTIONS...] <FASTA file>, or\n\
4161        cat <FASTA file> | gmap [OPTIONS...]\n\
4162 \n\
4163 ");
4164 
4165   /* Input options */
4166   fprintf(stdout,"Input options (must include -d)\n");
4167   fprintf(stdout,"\
4168   -D, --dir=directory            Genome directory.  Default (as specified by --with-gmapdb to the configure program) is\n\
4169                                    %s\n\
4170 ",GMAPDB);
4171   fprintf(stdout,"\
4172   -d, --db=STRING                Genome database\n\
4173   --use-localdb=INT              Whether to use the local hash tables, which help with finding extensions to the ends\n\
4174                                    of alignments in the presence of splicing or indels (0=no, 1=yes if available (default))\n\
4175 \n\
4176 ");
4177 
4178   /* Transcriptome-guided options */
4179   fprintf(stdout,"Transcriptome-guided options (optional)\n");
4180   fprintf(stdout,"\
4181   -C, --transcriptdir=directory  Transcriptome directory.  Default is the value for --dir above\n\
4182   -c, --transcriptdb=STRING      Transcriptome database\n\
4183   --use-transcriptome-only       Use only the transcriptome index and not the genome index\n\
4184 \n\
4185 ");
4186 
4187   /* Computation options */
4188   fprintf(stdout,"Computation options\n");
4189   fprintf(stdout,"\
4190   -k, --kmer=INT                 kmer size to use in genome database (allowed values: 16 or less)\n\
4191                                    If not specified, the program will find the highest available\n\
4192                                    kmer size in the genome database\n\
4193   --sampling=INT                 Sampling to use in genome database.  If not specified, the program\n\
4194                                    will find the smallest available sampling value in the genome database\n\
4195                                    within selected k-mer size\n\
4196   -q, --part=INT/INT             Process only the i-th out of every n sequences\n\
4197                                    e.g., 0/100 or 99/100 (useful for distributing jobs\n\
4198                                    to a computer farm).\n\
4199 ");
4200   fprintf(stdout,"\
4201   --input-buffer-size=INT        Size of input buffer (program reads this many sequences\n\
4202                                    at a time for efficiency) (default %d)\n\
4203 ",inbuffer_nspaces);
4204 
4205   fprintf(stdout,"\
4206   --barcode-length=INT           Amount of barcode to remove from start of every read before alignment\n\
4207                                    (default %d)\n\
4208 ",barcode_length);
4209 
4210   fprintf(stdout,"\
4211   --endtrim-length=INT           Amount of trim to remove from the end of every read before alignment\n\
4212                                    (default %d)\n\
4213 ",endtrim_length);
4214 
4215   fprintf(stdout,"\
4216   --orientation=STRING           Orientation of paired-end reads\n\
4217                                    Allowed values: FR (fwd-rev, or typical Illumina; default),\n\
4218                                    RF (rev-fwd, for circularized inserts), or FF (fwd-fwd, same strand)\n\
4219   --fastq-id-start=INT           Starting position of identifier in FASTQ header, space-delimited (>= 1)\n\
4220   --fastq-id-end=INT             Ending position of identifier in FASTQ header, space-delimited (>= 1)\n\
4221                                  Examples:\n\
4222                                    @HWUSI-EAS100R:6:73:941:1973#0/1\n\
4223                                       start=1, end=1 (default) => identifier is HWUSI-EAS100R:6:73:941:1973#0\n\
4224                                    @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36\n\
4225                                       start=1, end=1  => identifier is SRR001666.1\n\
4226                                       start=2, end=2  => identifier is 071112_SLXA-EAS1_s_7:5:1:817:345\n\
4227                                       start=1, end=2  => identifier is SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345\n\
4228   --force-single-end             When multiple FASTQ files are provided on the command line, GSNAP assumes\n\
4229                                     they are matching paired-end files.  This flag treats each file as single-end.\n\
4230   --filter-chastity=STRING       Skips reads marked by the Illumina chastity program.  Expecting a string\n\
4231                                    after the accession having a 'Y' after the first colon, like this:\n\
4232                                          @accession 1:Y:0:CTTGTA\n\
4233                                    where the 'Y' signifies filtering by chastity.\n\
4234                                    Values: off (default), either, both.  For 'either', a 'Y' on either end\n\
4235                                    of a paired-end read will be filtered.  For 'both', a 'Y' is required\n\
4236                                    on both ends of a paired-end read (or on the only end of a single-end read).\n\
4237   --allow-pe-name-mismatch       Allows accession names of reads to mismatch in paired-end files\n\
4238   --interleaved                  Input is in interleaved format (one read per line, tab-delimited\n\
4239 ");
4240 #ifdef HAVE_ZLIB
4241   fprintf(stdout,"\
4242   --gunzip                       Uncompress gzipped input files\n\
4243 ");
4244 #endif
4245 #ifdef HAVE_BZLIB
4246   fprintf(stdout,"\
4247   --bunzip2                      Uncompress bzip2-compressed input files\n\
4248 ");
4249 #endif
4250   fprintf(stdout,"\n");
4251 
4252   /* Computation options */
4253   fprintf(stdout,"Computation options\n");
4254 #if 0
4255   fprintf(stdout,"\
4256 \n\
4257   Note: GSNAP has an ultrafast algorithm for calculating mismatches up to and including\n\
4258 ((readlength+2)/kmer - 2) (\"ultrafast mismatches\").  The program will run fastest if\n\
4259 max-mismatches (plus suboptimal-levels) is within that value.\n\
4260 Also, indels, especially end indels, take longer to compute, although the algorithm\n\
4261 is still designed to be fast.\n\
4262 \n\
4263 ");
4264 #endif
4265 
4266 #ifdef HAVE_MMAP
4267   fprintf(stdout,"\
4268   -B, --batch=INT                Batch mode (default = 2)\n\
4269                                  Mode     Hash offsets  Hash positions  Genome          Local hash offsets  Local hash positions\n\
4270                                    0      allocate      mmap            mmap            allocate            mmap\n\
4271                                    1      allocate      mmap & preload  mmap            allocate            mmap & preload\n\
4272                                    2      allocate      mmap & preload  mmap & preload  allocate            mmap & preload\n\
4273                                    3      allocate      allocate        mmap & preload  allocate            allocate\n\
4274                       (default)    4      allocate      allocate        allocate        allocate            allocate\n\
4275                            Note: For a single sequence, all data structures use mmap\n\
4276                            A batch level of 5 means the same as 4, and is kept only for backward compatibility\n\
4277 ");
4278 #else
4279   fprintf(stdout,"\
4280   -B, --batch=INT                Batch mode (default = 4, modes 0-3 disallowed because program configured without mmap)\n\
4281                                  Mode     Hash offsets  Hash positions  Genome          Local hash offsets  Local hash positions\n\
4282                       (default)    4      allocate      allocate        allocate        allocate            allocate\n\
4283 ");
4284 #endif
4285   fprintf(stdout,"\
4286   --use-shared-memory=INT        If 1, then allocated memory is shared among all processes on this node\n\
4287                                    If 0 (default), then each process has private allocated memory\n\
4288   --preload-shared-memory        Load files indicated by --batch mode into shared memory for use by other\n\
4289                                    GMAP/GSNAP processes on this node, and then exit.  Ignore any input files.\n\
4290   --unload-shared-memory         Unload files indicated by --batch mode into shared memory, or allow them\n\
4291                                    to be unloaded when existing GMAP/GSNAP processes on this node are finished\n\
4292                                    with them.  Ignore any input files.\n\
4293 ");
4294 
4295   fprintf(stdout,"\
4296   -m, --max-mismatches=FLOAT     Maximum number of mismatches allowed (if not specified, then\n\
4297                                    GSNAP tries to find the best possible match in the genome)\n\
4298                                    If specified between 0.0 and 1.0, then treated as a fraction\n\
4299                                    of each read length.  Otherwise, treated as an integral number\n\
4300                                    of mismatches (including indel and splicing penalties).\n\
4301                                    Default is to be unspecified (to find the best possible match)\n\
4302   --max-ref-mismatches=FLOAT     If GSNAP is run under SNP-tolerant or masked genome mode, then the\n\
4303                                    --max-mismatches parameter above is for mismatches against reference and\n\
4304                                    SNPs, or against the unmasked genome, respectively.  The --max-ref-mismatches\n\
4305                                    parameter is against mismatches against the reference genome or masked genome.\n\
4306   --min-coverage=FLOAT           Minimum coverage required for an alignment.\n\
4307                                    If specified between 0.0 and 1.0, then treated as a fraction\n\
4308                                    of each read length.  Otherwise, treated as an integral number\n\
4309                                    of base pairs.  Default value is 0.5.\n\
4310   --filter-within-trims=INT      Whether to count mismatches in trimmed part of alignment (1, yes) or\n\
4311                                    mismatches to the ends of the read (0, no), when applying the\n\
4312                                    --max-mismatches and --max-ref-mismatches parameters.\n\
4313                                    Default for RNA-Seq is 1 (yes) so we can allow for\n\
4314                                    reads that align past the ends of an exon.  Default for DNA-Seq\n\
4315                                    is 0 (no).\n\
4316                                  For RNA-Seq, trimmed ends should be ignored, because trimming\n\
4317                                    is performed at probable splice sites, to allow for reads that\n\
4318                                    align past the ends of an exon.\n\
4319   --query-unk-mismatch=INT       Whether to count unknown (N) characters in the query as a mismatch\n\
4320                                    (0=no (default), 1=yes)\n\
4321   --genome-unk-mismatch=INT      Whether to count unknown (N) characters in the genome as a mismatch\n\
4322                                    (0=no, 1=yes).  If --use-mask is specified, default is no, otherwise yes.\n\
4323 ");
4324 
4325 #if 0
4326   fprintf(stdout,"\
4327   --maxsearch=INT                Maximum number of alignments to find (default %d).\n\
4328                                    Must be larger than --npaths, which is the number to report.\n\
4329                                    Keeping this number large will allow for random selection among multiple alignments.\n\
4330                                    Reducing this number can speed up the program.\n\
4331 ",maxpaths_search);
4332 #endif
4333 
4334 #if 0
4335   fprintf(stdout,"\
4336   -i, --indel-penalty-middle=INT Penalty for an indel in middle of read (default %d).\n\
4337                                    Counts against mismatches allowed.  To find indels, make\n\
4338                                    indel-penalty less than or equal to max-mismatches\n\
4339 ",indel_penalty_middle);
4340   fprintf(stdout,"\
4341   -I, --indel-penalty-end=INT    Penalty for an indel at end of read (default %d).\n\
4342                                    Counts against mismatches allowed.  To find indels, make\n\
4343                                    indel-penalty less than or equal to max-mismatches\n\
4344 ",indel_penalty_end);
4345 #endif
4346 
4347   fprintf(stdout,"\
4348   -i, --indel-penalty=INT        Penalty for an indel (default %d).\n\
4349                                    Counts against mismatches allowed.  To find indels, make\n\
4350                                    indel-penalty less than or equal to max-mismatches.\n\
4351                                    A value < 2 can lead to false positives at read ends\n\
4352 ",indel_penalty_middle);
4353 
4354 #if 0
4355   /* No longer used */
4356   fprintf(stdout,"\
4357   -R, --masking=INT              Masking of frequent/repetitive oligomers to avoid spending time\n\
4358                                    on non-unique or repetitive reads\n\
4359                                    0 = no masking (will try to find non-unique or repetitive matches)\n\
4360                                    1 = mask frequent oligomers\n\
4361                                    2 = mask frequent and repetitive oligomers (fastest) (default)\n\
4362                                    3 = greedy frequent: mask frequent oligomers first, then\n\
4363                                        try no masking if alignments not found\n\
4364                                    4 = greedy repetitive: mask frequent and repetitive oligomers first, then\n\
4365                                        try no masking if alignments not found\n\
4366 ");
4367 #endif
4368 
4369   fprintf(stdout,"\
4370   --indel-endlength=INT          Minimum length at end required for indel alignments (default %d)\n\
4371 ",min_indel_end_matches);
4372   fprintf(stdout,"\
4373   -y, --max-middle-insertions=FLOAT  Maximum number of middle insertions allowed (default is %.1f)\n\
4374                                      If specified between 0.0 and 1.0, then treated as a fraction\n\
4375                                      of each read length.  Otherwise, treated as an integral number\n\
4376                                      of base pairs\n\
4377 ",max_middle_insertions_float);
4378   fprintf(stdout,"\
4379   -z, --max-middle-deletions=FLOAT   Maximum number of middle deletions allowed (default %.1f)\n\
4380                                      If specified between 0.0 and 1.0, then treated as a fraction\n\
4381                                      of each read length.  Otherwise, treated as an integral number\n\
4382                                      of base pairs\n\
4383 ",max_middle_deletions_float);
4384   fprintf(stdout,"\
4385   -Y, --max-end-insertions=INT   Maximum number of end insertions allowed (default %d)\n\
4386 ",max_end_insertions);
4387   fprintf(stdout,"\
4388   -Z, --max-end-deletions=INT    Maximum number of end deletions allowed (default %d)\n\
4389 ",max_end_deletions);
4390   fprintf(stdout,"\
4391   -M, --suboptimal-levels=INT    Report suboptimal hits beyond best hit (default %d)\n\
4392                                    All hits with best score plus suboptimal-levels are reported\n\
4393 ",subopt_levels);
4394   fprintf(stdout,"\
4395   -a, --adapter-strip=STRING     Method for removing adapters from reads.  Currently allowed values: off, paired.\n\
4396                                    Default is \"off\".  To turn on, specify \"paired\", which removes adapters\n\
4397                                    from paired-end reads if they appear to be present.\n\
4398 ");
4399   fprintf(stdout,"\
4400   --trim-indel-score=INT         Score to use for indels when trimming at ends.  To turn off trimming,\n\
4401                                    specify 0.  Default is %d for both RNA-Seq and DNA-Seq.  Warning:\n\
4402                                    Turning trimming off in RNA-Seq can give false positive indels\n\
4403                                    at the ends of reads\n\
4404 ",trim_indel_score);
4405 
4406   fprintf(stdout,"\
4407   -e, --use-mask=STRING          Use genome containing masks (e.g. for non-exons) for scoring preference\n\
4408   -V, --snpsdir=STRING           Directory for SNPs index files (created using snpindex) (default is\n\
4409                                    location of genome index files specified using -D and -d)\n \
4410   -v, --use-snps=STRING          Use database containing known SNPs (in <STRING>.iit, built\n\
4411                                    previously using snpindex) for tolerance to SNPs\n\
4412   --cmetdir=STRING               Directory for methylcytosine index files (created using cmetindex)\n\
4413                                    (default is location of genome index files specified using -D, -V, and -d)\n\
4414   --atoidir=STRING               Directory for A-to-I RNA editing index files (created using atoiindex)\n\
4415                                    (default is location of genome index files specified using -D, -V, and -d)\n\
4416   --mode=STRING                  Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
4417                                     atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
4418                                     Non-standard modes requires you to have previously run the cmetindex\n\
4419                                     or atoiindex programs (which also cover the ttoc modes) on the genome\n\
4420 ");
4421 
4422 
4423 #if 0
4424   fprintf(stdout,"\
4425   --tallydir=STRING              Directory for tally IIT file to resolve concordant multiple alignments (default is\n\
4426                                    location of genome index files specified using -D and -d).  Note: can\n\
4427                                    just give full path name to --use-tally instead.\n\
4428   --use-tally=STRING             Use this tally IIT file to resolve concordant multiple alignments\n\
4429   --runlengthdir=STRING          Directory for runlength IIT file to resolve concordant multiple alignments (default is\n\
4430                                    location of genome index files specified using -D and -d).  Note: can\n\
4431                                    just give full path name to --use-runlength instead.\n\
4432   --use-runlength=STRING         Use this runlength IIT file to resolve concordant multiple alignments\n\
4433 ");
4434 #endif
4435 
4436 
4437 #ifdef HAVE_PTHREAD
4438   fprintf(stdout,"\
4439   -t, --nthreads=INT             Number of worker threads\n\
4440 ");
4441 #else
4442   fprintf(stdout,"\
4443   -t, --nthreads=INT             Number of worker threads.  Flag is ignored in this version of GSNAP, which has pthreads disabled\n\
4444 ");
4445 #endif
4446 
4447   fprintf(stdout,"\
4448   --max-anchors=INT              Controls number of candidate segments returned by the complete set algorithm\n\
4449                                    Default is 10.  Can be increased to higher values to solve alignments with\n\
4450                                    evenly spaced mismatches at close distances.  However, higher values will\n\
4451                                    cause GSNAP to run more slowly.  A value of 1000, for example, slows down\n\
4452                                    the program by a factor of 10 or so.  Therefore, change this value only if\n\
4453                                    absolutely necessary.\n\
4454 ");
4455 
4456 #if 0
4457   fprintf(stdout,"\n");
4458   fprintf(stdout,"Options for GMAP alignment within GSNAP\n");
4459   fprintf(stdout,"\
4460   --gmap-mode=STRING             Cases to use GMAP for complex alignments containing multiple splices or indels\n\
4461                                  Allowed values: none, all, pairsearch, ends, improve\n\
4462                                    (or multiple values, separated by commas).\n\
4463                                    Default: all, i.e., pairsearch,ends,improve\n\
4464 ");
4465   fprintf(stdout,"\
4466   --trigger-score-for-gmap=INT   Try GMAP pairsearch on nearby genomic regions if best score (the total\n\
4467                                    of both ends if paired-end) exceeds this value (default %d)\n\
4468 ",trigger_score_for_gmap);
4469   fprintf(stdout,"\
4470   --gmap-min-match-length=INT    Keep GMAP hit only if it has this many consecutive matches (default %d)\n\
4471 ",gmap_min_nconsecutive);
4472   fprintf(stdout,"\
4473   --gmap-allowance=INT           Extra mismatch/indel score allowed for GMAP alignments (default %d)\n\
4474 ",gmap_allowance);
4475   fprintf(stdout,"\
4476   --max-gmap-pairsearch=INT      Perform GMAP pairsearch on nearby genomic regions up to this many\n\
4477                                    many candidate ends (default %d).  Requires pairsearch in --gmap-mode\n\
4478 ",max_gmap_pairsearch);
4479   fprintf(stdout,"\
4480   --max-gmap-terminal=INT        Perform GMAP terminal on nearby genomic regions up to this many\n\
4481                                    candidate ends (default %d).  Requires terminal in --gmap-mode\n\
4482 ",max_gmap_terminal);
4483   fprintf(stdout,"\
4484   --max-gmap-improvement=INT     Perform GMAP improvement on nearby genomic regions up to this many\n\
4485                                    candidate ends (default %d).  Requires improve in --gmap-mode\n\
4486 ",max_gmap_improvement);
4487   fprintf(stdout,"\n");
4488 #endif
4489 
4490 #if 0
4491   fprintf(stdout,"Genes options for RNA-Seq\n");
4492   fprintf(stdout,"\
4493   -g, --genes=STRING             Look for known genes in <STRING>.iit, to be used for resolving\n\
4494                                    multiple mapping reads.  See README instructions for the correct\n\
4495                                    formatting of a genes IIT file.\n\
4496   --favor-multiexon              In resolving multiple mapping reads, overlaps with known\n\
4497                                    multi-exon genes are favored over those with known single-exon\n\
4498                                    genes.  This favors spliced genes over psuedogenes.\n\
4499 ");
4500   fprintf(stdout,"\n");
4501 #endif
4502 
4503 
4504   /* Splicing options */
4505   fprintf(stdout,"Splicing options for DNA-Seq\n");
4506   fprintf(stdout,"\
4507   --find-dna-chimeras=INT              Look for distant splicing involving poor splice sites (0=no, 1=yes (default))\n\
4508                                          Current implementation improves even RNA-Seq alignments at poor splice sites\n\
4509                                          so it is now on by default.\n\
4510 ");
4511   fprintf(stdout,"\n");
4512 
4513   /* Splicing options */
4514   fprintf(stdout,"Splicing options for RNA-Seq\n");
4515   fprintf(stdout,"\
4516   -N, --novelsplicing=INT              Look for novel splicing (0=no (default), 1=yes)\n\
4517   --splicingdir=STRING                 Directory for splicing involving known sites or known introns,\n\
4518                                          as specified by the -s or --use-splicing flag (default is\n\
4519                                          directory computed from -D and -d flags).  Note: can\n\
4520                                          just give full pathname to the -s flag instead.\n\
4521   -s, --use-splicing=STRING            Look for splicing involving known sites or known introns\n\
4522                                          (in <STRING>.iit), at short or long distances\n\
4523                                          See README instructions for the distinction between known sites\n\
4524                                          and known introns\n\
4525   --ambig-splice-noclip                For ambiguous known splicing at ends of the read, do not clip at the\n\
4526                                          splice site, but extend instead into the intron.  This flag makes\n\
4527                                          sense only if you provide the --use-splicing flag, and you are trying\n\
4528                                          to eliminate all soft clipping with --trim-mismatch-score=0\n\
4529 ");
4530   fprintf(stdout,"\
4531   -w, --localsplicedist=INT            Definition of local novel splicing event (default %d)\n\
4532 ",shortsplicedist);
4533   fprintf(stdout,"\
4534   --novelend-splicedist=INT            Distance to look for novel splices at the ends of reads (default %d)\n\
4535 ",shortsplicedist_novelend);
4536   fprintf(stdout,"\
4537   --local-splice-penalty=INT       Penalty for a local splice (default %d).  Counts against mismatches allowed\n\
4538 ",localsplicing_penalty);
4539   fprintf(stdout,"\
4540   --distant-splice-penalty=INT     Penalty for a distant splice (default %d).  A distant splice is one where\n\
4541                                          the intron length exceeds the value of -w, or --localsplicedist, or is an\n\
4542                                          inversion, scramble, or translocation between two different chromosomes\n\
4543                                          Counts against mismatches allowed\n\
4544 ",distantsplicing_penalty);
4545   fprintf(stdout,"\
4546   -K, --distant-splice-endlength=INT   Minimum length at end required for distant spliced alignments (default %d, min\n\
4547                                          allowed is the value of -k, or kmer size)\n\
4548 ",min_distantsplicing_end_matches);
4549   fprintf(stdout,"\
4550   -l, --shortend-splice-endlength=INT  Minimum length at end required for short-end spliced alignments (default %d,\n\
4551                                          but unless known splice sites are provided with the -s flag, GSNAP may still\n\
4552                                          need the end length to be the value of -k, or kmer size to find a given splice\n\
4553 ",min_shortend);
4554   fprintf(stdout,"\
4555   --distant-splice-identity=FLOAT      Minimum identity at end required for distant spliced alignments (default %.2f)\n\
4556 ",min_distantsplicing_identity);
4557   fprintf(stdout,"\
4558   --antistranded-penalty=INT           (Not currently implemented, since it leads to poor results)\n\
4559                                          Penalty for antistranded splicing when using stranded RNA-Seq protocols.\n\
4560                                          A positive value, such as 1, expects antisense on the first read\n\
4561                                          and sense on the second read.  Default is 0, which treats sense and antisense\n\
4562                                          equally well\n\
4563   --merge-distant-samechr              Report distant splices on the same chromosome as a single splice, if possible.\n\
4564                                          Will produce a single SAM line instead of two SAM lines, which is also done\n\
4565                                          for translocations, inversions, and scramble events\n\
4566 ");
4567   fprintf(stdout,"\n");
4568 
4569 
4570   /* Paired-end options */
4571   fprintf(stdout,"Options for paired-end reads\n");
4572   fprintf(stdout,"\
4573   --pairmax-dna=INT              Max total genomic length for DNA-Seq paired reads, or other reads\n\
4574                                    without splicing (default %d).  Used if -N or -s is not specified.\n\
4575                                    This value is also used for circular chromosomes when splicing in\n\
4576                                    linear chromosomes is allowed\n\
4577 ",pairmax_dna);
4578   fprintf(stdout,"\
4579   --pairmax-rna=INT              Max total genomic length for RNA-Seq paired reads, or other reads\n\
4580                                    that could have a splice (default %d).  Used if -N or -s is specified.\n\
4581                                    Should probably match the value for -w, --localsplicedist.\n\
4582 ",pairmax_rna);
4583   fprintf(stdout,"\
4584   --pairexpect=INT               Expected paired-end length, used for calling splices in medial part\n\
4585                                    of paired-end reads (default %d).  Was turned off in previous versions, but reinstated.\n\
4586 ",expected_pairlength);
4587   fprintf(stdout,"\
4588   --pairdev=INT                  Allowable deviation from expected paired-end length, used for\n\
4589                                    calling splices in medial part of paired-end reads (default %d).\n\
4590                                    Was turned off in previous versions, but reinstated.\n\
4591 ",pairlength_deviation);
4592   fprintf(stdout,"\n");
4593 
4594 
4595   /* Quality score options */
4596   fprintf(stdout,"Options for quality scores\n");
4597   fprintf(stdout,"\
4598   --quality-protocol=STRING      Protocol for input quality scores.  Allowed values:\n\
4599                                    illumina (ASCII 64-126) (equivalent to -J 64 -j -31)\n\
4600                                    sanger   (ASCII 33-126) (equivalent to -J 33 -j 0)\n\
4601                                  Default is sanger (no quality print shift)\n\
4602                                  SAM output files should have quality scores in sanger protocol\n\
4603 \n\
4604                                  Or you can customize this behavior with these flags:\n\
4605   -J, --quality-zero-score=INT   FASTQ quality scores are zero at this ASCII value\n\
4606                                    (default is 33 for sanger protocol; for Illumina, select 64)\n\
4607   -j, --quality-print-shift=INT  Shift FASTQ quality scores by this amount in output\n\
4608                                    (default is 0 for sanger protocol; to change Illumina input\n\
4609                                    to Sanger output, select -31)\n\
4610 ");
4611 
4612   /* Output options */
4613   fprintf(stdout,"Output options\n");
4614   fprintf(stdout,"\
4615   -n, --npaths=INT               Maximum number of paths to print (default %d).\n\
4616 ",maxpaths_report);
4617   fprintf(stdout,"\
4618   -Q, --quiet-if-excessive       If more than maximum number of paths are found,\n\
4619                                    then nothing is printed.\n\
4620   -O, --ordered                  Print output in same order as input (relevant\n\
4621                                    only if there is more than one worker thread)\n\
4622   --show-refdiff                 For GSNAP output in SNP-tolerant alignment, shows all differences\n\
4623                                    relative to the reference genome as lower case (otherwise, it shows\n\
4624                                    all differences relative to both the reference and alternate genome)\n\
4625   --clip-overlap                 For paired-end reads whose alignments overlap, clip the overlapping region.\n\
4626   --merge-overlap                For paired-end reads whose alignments overlap, merge the two ends into a single end (beta implementation)\n\
4627   --print-snps                   Print detailed information about SNPs in reads (works only if -v also selected)\n\
4628                                    (not fully implemented yet)\n\
4629   --failsonly                    Print only failed alignments, those with no results\n\
4630   --nofails                      Exclude printing of failed alignments\n\
4631 ");
4632 
4633   fprintf(stdout,"\
4634   -A, --format=STRING            Another format type, other than default.\n\
4635                                    Currently implemented: sam, m8 (BLAST tabular format)\n\
4636 ");
4637 
4638   fprintf(stdout,"\
4639   --split-output=STRING          Basename for multiple-file output, separately for nomapping,\n\
4640                                    halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult,\n\
4641                                    paired_uniq, paired_mult, concordant_uniq, and concordant_mult results\n\
4642   -o, --output-file=STRING       File name for a single stream of output results.\n\
4643   --failed-input=STRING          Print completely failed alignments as input FASTA or FASTQ format,\n\
4644                                     to the given file, appending .1 or .2, for paired-end data.\n\
4645                                     If the --split-output flag is also given, this file is generated\n\
4646                                     in addition to the output in the .nomapping file.\n\
4647   --append-output                When --split-output or --failed-input is given, this flag will append output\n\
4648                                     to the existing files.  Otherwise, the default is to create new files.\n\
4649   --order-among-best=STRING      Among alignments tied with the best score, order those alignments in this order.\n\
4650                                     Allowed values: genomic, random (default)\n\
4651 ");
4652   fprintf(stdout,"\
4653   --output-buffer-size=INT       Buffer size, in queries, for output thread (default %d).  When the number\n\
4654                                    of results to be printed exceeds this size, the worker threads are halted\n\
4655                                    until the backlog is cleared\n\
4656 ",output_buffer_size);
4657   fprintf(stdout,"\n");
4658 
4659   /* SAM options */
4660   fprintf(stdout,"Options for SAM output\n");
4661   fprintf(stdout,"\
4662   --no-sam-headers               Do not print headers beginning with '@'\n\
4663   --add-paired-nomappers         Add nomapper lines as needed to make all paired-end results alternate\n\
4664                                    between first end and second end\n\
4665   --paired-flag-means-concordant=INT  Whether the paired bit in the SAM flags means concordant only (1)\n\
4666                                  or paired plus concordant (0, default)\n\
4667   --sam-headers-batch=INT        Print headers only for this batch, as specified by -q\n\
4668   --sam-hardclip-use-S           Use S instead of H for hardclips\n\
4669   --sam-use-0M                   Insert 0M in CIGAR between adjacent insertions, deletions, and introns\n\
4670                                    Picard disallows 0M, other tools may require it\n\
4671   --sam-extended-cigar           Use extended CIGAR format (using X and = symbols instead of M,\n\
4672                                    to indicate matches and mismatches, respectively\n\
4673   --sam-multiple-primaries       Allows multiple alignments to be marked as primary if they\n\
4674                                    have equally good mapping scores\n\
4675   --sam-sparse-secondaries       For secondary alignments (in multiple mappings), uses '*' for SEQ\n\
4676                                    and QUAL fields, to give smaller file sizes.  However, the output\n\
4677                                    will give warnings in Picard to give warnings and may not work\n\
4678                                    with downstream tools\n\
4679   --force-xs-dir                 For RNA-Seq alignments, disallows XS:A:? when the sense direction\n\
4680                                    is unclear, and replaces this value arbitrarily with XS:A:+.\n\
4681                                    May be useful for some programs, such as Cufflinks, that cannot\n\
4682                                    handle XS:A:?.  However, if you use this flag, the reported value\n\
4683                                    of XS:A:+ in these cases will not be meaningful.\n\
4684   --md-lowercase-snp             In MD string, when known SNPs are given by the -v flag,\n\
4685                                    prints difference nucleotides as lower-case when they,\n\
4686                                    differ from reference but match a known alternate allele\n\
4687   --extend-soft-clips            Extends alignments through soft clipped regions\n\
4688   --action-if-cigar-error        Action to take if there is a disagreement between CIGAR length and sequence length\n\
4689                                    Allowed values: ignore, warning (default), noprint, abort\n\
4690                                    Note that the noprint option does not print the CIGAR string at all if there\n\
4691                                    is an error, so it may break a SAM parser\n\
4692   --read-group-id=STRING         Value to put into read-group id (RG-ID) field\n\
4693   --read-group-name=STRING       Value to put into read-group name (RG-SM) field\n\
4694   --read-group-library=STRING    Value to put into read-group library (RG-LB) field\n\
4695   --read-group-platform=STRING   Value to put into read-group library (RG-PL) field\n\
4696 ");
4697   fprintf(stdout,"\n");
4698 
4699 #ifdef USE_MPI
4700   fprintf(stdout,"Options for MPI\n");
4701   fprintf(stdout,"\
4702   --master-is-worker=INT         Determines whether master node allocates threads for performing computation\n\
4703                                    in addition to coordinating input and output.  Number of worker threads\n\
4704                                    will be --nthreads minus 2\n\
4705                                    Values: 0 (no, default), 1 (yes if enough worker threads available)\n\
4706 ");
4707   fprintf(stdout,"\n");
4708 #endif
4709 
4710   /* Help options */
4711   fprintf(stdout,"Help options\n");
4712   fprintf(stdout,"\
4713   --check                        Check compiler assumptions\n\
4714   --version                      Show version\n\
4715   --help                         Show this help message\n\
4716 ");
4717   return;
4718 }
4719 
4720