1 static char rcsid[] = "$Id: gsnap.c 222931 2020-06-29 16:10:56Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #ifdef USE_MPI
7 #include <mpi.h>
8 #include "mpidebug.h"
9 #endif
10
11 #ifdef HAVE_SYS_TYPES_H
12 #include <sys/types.h> /* Needed to define pthread_t on Solaris */
13 #endif
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h> /* For strcpy */
18 #include <strings.h> /* For rindex */
19 #include <ctype.h>
20 #include <math.h> /* For rint */
21 #ifdef HAVE_SSE2
22 #include <emmintrin.h>
23 #endif
24 #ifdef HAVE_SSE4_1
25 #include <smmintrin.h>
26 #endif
27 #if !defined(HAVE_SSE4_2)
28 /* Skip popcnt */
29 #elif defined(HAVE_POPCNT)
30 #include <immintrin.h>
31 #endif
32
33 #if !defined(HAVE_SSE4_2)
34 /* Skip mm_popcnt */
35 #elif defined(HAVE_MM_POPCNT)
36 #include <nmmintrin.h>
37 #endif
38
39
40 #if !defined(HAVE_SSE4_2)
41 /* Skip lzcnt/tzcnt */
42 #elif defined(HAVE_LZCNT) || defined(HAVE_TZCNT)
43 #include <immintrin.h>
44 #endif
45
46 #ifdef HAVE_PTHREAD
47 #include <pthread.h>
48 #endif
49
50 #ifdef HAVE_ZLIB
51 #include <zlib.h>
52 #define GZBUFFER_SIZE 131072
53 #endif
54
55 #ifdef HAVE_BZLIB
56 #include "bzip2.h"
57 #endif
58
59 #include <signal.h>
60
61 #include "assert.h"
62 #include "except.h"
63 #include "mem.h"
64 #include "bool.h"
65 #include "types.h"
66 #include "univcoord.h"
67 #include "fopen.h"
68 #include "filesuffix.h"
69
70 #include "mode.h"
71 #include "shortread.h" /* For Shortread_setup */
72 #include "stopwatch.h"
73 #include "transcriptome.h"
74 #include "transcript.h" /* For Transcript_setup */
75 #include "genome.h"
76 #include "genome128_hr.h" /* For Genome_hr_setup */
77 #include "genome128_consec.h" /* For Genome_consec_setup */
78 #include "genome_sites.h" /* For Genome_sites_setup */
79 #include "maxent_hr.h" /* For Maxent_hr_setup */
80 #include "knownsplicing.h"
81 #include "mapq.h"
82 #include "substring.h"
83 #include "stage3hr.h"
84 #include "concordance.h" /* For Concordance_setup */
85 #include "splice.h" /* For Splice_setup */
86 #include "oligo.h" /* For Oligo_setup */
87 /* #include "oligoindex_hr.h" */ /* For Oligoindex_hr_setup */
88 /* #include "oligoindex_localdb.h" -- For Oligoindex_local_setup */
89 /* #include "pairpool.h" */
90 /* #include "diagpool.h" */
91 /* #include "cellpool.h" */
92 #include "listpool.h"
93 #include "univdiagpool.h"
94 #include "hitlistpool.h"
95
96 /* #include "stage2.h" */ /* For Stage2_setup */
97 #include "path-solve.h"
98 #include "kmer-search.h"
99 #include "extension-search.h"
100 #include "segment-search.h"
101 #include "terminal.h"
102 #include "distant-rna.h"
103 #include "distant-dna.h"
104 #include "indel.h" /* For Indel_setup */
105 #include "stage1hr.h"
106 #include "indexdb.h"
107 #include "regiondb.h"
108 #include "resulthr.h"
109 #include "request.h"
110 #include "intlist.h"
111 #include "list.h"
112 #include "iit-read.h"
113 #include "iit-read-univ.h"
114 #include "datadir.h"
115 #include "samprint.h" /* For SAM_setup */
116 #include "cigar.h" /* For Cigar_setup */
117
118 #include "filestring.h"
119 #include "output.h"
120 #include "inbuffer.h"
121 #include "outbuffer.h"
122 #ifdef USE_MPI
123 #include "master.h"
124 #endif
125
126 #include "simplepair.h"
127 #include "getopt.h"
128
129
130 #define ONE_GB 1000000000
131
132 #define MIN_INDEXDB_SIZE_THRESHOLD 1000
133
134 #define MAX_QUERYLENGTH_FOR_ALLOC 100000
135 #define MAX_GENOMICLENGTH_FOR_ALLOC 1000000
136
137
138 /* MPI Processing */
139 #ifdef DEBUGM
140 #define debugm(x) x
141 #else
142 #define debugm(x)
143 #endif
144
145 /* File open/close. Want to turn on in shortread.c also. */
146 #ifdef DEBUGF
147 #define debugf(x) x
148 #else
149 #define debugf(x)
150 #endif
151
152
153 #ifdef DEBUG
154 #define debug(x) x
155 #else
156 #define debug(x)
157 #endif
158
159
160 /************************************************************************
161 * GMAP parameters
162 ************************************************************************/
163
164 /* GMAP_IMPROVEMENT shouldn't be necessary anymore with localdb */
165 /* Might reinstate GMAP_ENDS if we move localdb to GMAP */
166 #if 0
167 static int gmap_mode = GMAP_IMPROVEMENT; /* GMAP_PAIRSEARCH | GMAP_ENDS | GMAP_IMPROVEMENT; */
168 static int gmap_min_nconsecutive = 20;
169 static int nullgap = 600;
170 static int maxpeelback = 20; /* Now controlled by defect_rate */
171 static int maxpeelback_distalmedial = 24;
172 static int extramaterial_end = 10;
173 static int extramaterial_paired = 8;
174 static int sufflookback = 60;
175 static int nsufflookback = 5;
176 static int extraband_single = 3;
177 static int extraband_end = 3; /* Shouldn't differ from 0, since onesidegapp is true? */
178 static int extraband_paired = 7;
179 static int ngap = 3; /* 0? */
180
181 static int max_gmap_pairsearch = 50; /* Will perform GMAP on up to this many hits5 or hits3 */
182 static int max_gmap_terminal = 50; /* Will perform GMAP on up to this many terminals5 or terminals3 */
183 static int max_gmap_improvement = 5;
184
185 static int trigger_score_for_gmap = 5;
186 static int gmap_allowance = 3;
187 /* static int trigger_score_for_terminals = 5; -- obsolete */
188 #endif
189
190 static int min_intronlength = 9;
191 /* static int max_deletionlength = 50; -- Now depends on querylength */
192
193
194 /************************************************************************
195 * Global parameters
196 ************************************************************************/
197
198 /* Transcriptome */
199 static Transcriptome_T transcriptome = NULL;
200 static Univ_IIT_T transcript_iit = NULL;
201 static Trcoord_T transcriptomelength = 0;
202 static int ntranscripts = 0;
203
204 /* Genome */
205 static Univ_IIT_T chromosome_iit = NULL;
206 static int circular_typeint = -1;
207 static Univcoord_T genomelength;
208 static int nchromosomes = 0;
209 static bool *circularp = NULL;
210 static bool any_circular_p;
211
212 static bool *altlocp = NULL;
213 static Univcoord_T *alias_starts = NULL;
214 static Univcoord_T *alias_ends = NULL;
215
216 static Indexdb_T indexdb = NULL;
217 static Indexdb_T indexdb2 = NULL; /* For cmet or atoi */
218 static Indexdb_T indexdb_tr = NULL;
219
220 static bool user_localdb_p = false;
221 static bool use_localdb_p = false;
222 static Regiondb_T regiondb = NULL;
223 static Regiondb_T regiondb2 = NULL;
224
225 static Genome_T genomecomp = NULL;
226 static Genome_T genomecomp_alt = NULL;
227 static Genome_T genomebits = NULL;
228 static Genome_T genomebits_alt = NULL;
229
230 static Genome_T transcriptomebits = NULL;
231 static Genome_T transcriptomebits_alt = NULL;
232
233 static bool use_only_transcriptome_p = false;
234
235 #if 0
236 static char STANDARD_CHARTABLE[4] = {'A','C','G','T'};
237 static char CMET_FWD_CHARTABLE[4] = {'A','T','G','T'}; /* CT */
238 static char CMET_REV_CHARTABLE[4] = {'A','C','A','T'}; /* GA */
239 static char ATOI_FWD_CHARTABLE[4] = {'G','C','G','T'}; /* AG */
240 static char ATOI_REV_CHARTABLE[4] = {'A','C','G','C'}; /* TC */
241 #endif
242
243
244 static bool fastq_format_p = false;
245 static bool want_random_p = true; /* randomize among equivalent scores */
246 static Stopwatch_T stopwatch = NULL;
247
248 /************************************************************************
249 * Program options
250 ************************************************************************/
251
252 /* Input options */
253 static char *user_transcriptomedir = NULL;
254 static char *transcriptome_dbroot = NULL;
255
256 static char *user_genomedir = NULL;
257 static char *genome_dbroot = NULL;
258
259 static int part_modulus = 0;
260 static int part_interval = 1;
261 static int barcode_length = 0;
262 static int endtrim_length = 0;
263 static bool invert_first_p = false;
264 static bool invert_second_p = true;
265 static int acc_fieldi_start = 0;
266 static int acc_fieldi_end = 0;
267 static bool force_single_end_p = false;
268 static bool filter_chastity_p = false;
269 static bool allow_paired_end_mismatch_p = false;
270 static bool filter_if_both_p = false;
271
272 static char *read_files_command = NULL;
273 static bool gunzip_p = false;
274 static bool bunzip2_p = false;
275 static bool interleavedp = false;
276
277 /* Compute options */
278 static bool chop_primers_p = false;
279 static bool query_unk_mismatch_p = false;
280 static bool genome_unk_mismatch_p = true;
281 static bool novelsplicingp = false;
282 static bool splicingp = false;
283 static bool find_dna_chimeras_p = true; /* On by default, because it improves even RNA-Seq reads */
284
285 static bool user_trim_indel_score_p = false;
286 static int trim_indel_score = -2; /* was -4 */
287
288
289 static bool multiple_sequences_p;
290 static bool sharedp = false;
291 static bool preload_shared_memory_p = false;
292 static bool unload_shared_memory_p = false;
293 static bool expand_offsets_p = false;
294
295 #ifdef HAVE_MMAP
296 /* Level 4 is now default */
297 static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
298 static Access_mode_T positions_access = USE_ALLOCATE;
299 static Access_mode_T regiondb_access = USE_ALLOCATE;
300 static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
301 static Access_mode_T locpositions_access = USE_ALLOCATE;
302
303 static Access_mode_T genome_access = USE_ALLOCATE;
304
305 #else
306 static Access_mode_T offsetsstrm_access = USE_ALLOCATE;
307 static Access_mode_T positions_access = USE_ALLOCATE;
308 static Access_mode_T regiondb_access = USE_ALLOCATE;
309 static Access_mode_T locoffsetsstrm_access = USE_ALLOCATE;
310 static Access_mode_T locpositions_access = USE_ALLOCATE;
311
312 static Access_mode_T genome_access = USE_ALLOCATE;
313
314 #endif
315
316 static int pairmax_transcriptome;
317 static int pairmax_linear;
318 static int pairmax_circular;
319 static int pairmax_dna = 2000;
320 static int pairmax_rna = 200000;
321 static int expected_pairlength = 500;
322 static int pairlength_deviation = 100;
323
324 #ifdef USE_MPI
325 static int nranks, n_slave_ranks, myid, provided;
326 static int exclude_ranks[1];
327 static MPI_Comm workers_comm;
328 static MPI_Group world_group, workers_group;
329 static int nthreads0;
330 static bool master_is_worker_p = false; /* default behavior */
331 #endif
332
333 #ifdef HAVE_PTHREAD
334 static pthread_t output_thread_id, *worker_thread_ids;
335 #ifdef USE_MPI
336 static pthread_t write_stdout_thread_id, parser_thread_id, mpi_interface_thread_id;
337 #endif
338 static pthread_key_t global_request_key;
339 static int nthreads = 1; /* (int) sysconf(_SC_NPROCESSORS_ONLN) */
340 #else
341 static int nthreads = 0; /* (int) sysconf(_SC_NPROCESSORS_ONLN) */
342 #endif
343
344 /* static Masktype_T masktype = MASK_REPETITIVE; */
345 static int subopt_levels = 0;
346
347 /* If negative, then hasn't been specified by user. If between 0 and
348 1, then treated as a fraction of the querylength. Else, treated as
349 an integer */
350 static double user_mismatches_refalt_float = -1.0;
351 static double user_mismatches_ref_float = -1.0;
352 static double user_mincoverage_float = 0.5;
353
354 /* Really have only one indel penalty */
355 static int indel_penalty_middle = 2;
356 static int indel_penalty_end = 2;
357
358 static bool allow_end_indels_p = true;
359 static double max_middle_insertions_float = 0.2;
360 static double max_middle_deletions_float= 0.2;
361 static int max_end_insertions = 3;
362 static int max_end_deletions = 6;
363 static int min_indel_end_matches = 4;
364 static Chrpos_T shortsplicedist = 200000;
365 static Chrpos_T shortsplicedist_known;
366 static Chrpos_T shortsplicedist_novelend = 80000;
367 static int localsplicing_penalty = 0;
368 static int distantsplicing_penalty = 1;
369 static int min_distantsplicing_end_matches = 20; /* Should be about index1part */
370 static double min_distantsplicing_identity = 0.95;
371 static int min_shortend = 2;
372 /* static bool find_novel_doublesplices_p = true; */
373 static int antistranded_penalty = 0; /* Most RNA-Seq is non-stranded */
374
375 static Width_T index1part = 15;
376 static Width_T index1part_tr = 15;
377 static Width_T required_index1part = 0;
378 static Width_T index1interval;
379 static Width_T index1interval_tr;
380 static Width_T required_index1interval = 0;
381
382 static Width_T local1part = 8;
383 static Width_T required_local1part = 0;
384 static Width_T local1interval;
385 static Width_T required_local1interval = 0;
386
387 static Width_T region1part = 6;
388 static Width_T required_region1part = 0;
389 static Width_T region1interval;
390 static Width_T required_region1interval = 0;
391
392 static int indexdb_size_threshold;
393
394 /* Option for GSNAP complete set algorithm. Affects speed significantly. */
395 static int max_anchors = 10;
396
397
398 /* Genes IIT */
399 static char *genes_file = (char *) NULL;
400 static IIT_T genes_iit = NULL; /* For genome alignment */
401 static int *genes_divint_crosstable = NULL;
402 static int *genes_chrnum_crosstable = NULL;
403 static bool favor_multiexon_p = false;
404
405
406 /* Splicing IIT */
407 static bool knownsplicingp = false;
408 static bool distances_observed_p = false;
409 static char *user_splicingdir = (char *) NULL;
410 static char *splicing_file = (char *) NULL;
411 static IIT_T splicing_iit = NULL;
412 static bool amb_clip_p = true;
413
414 static int donor_typeint = -1; /* for splicing_iit */
415 static int acceptor_typeint = -1; /* for splicing_iit */
416
417 static int *splicing_divint_crosstable = NULL;
418 static Genomecomp_T *splicecomp = NULL;
419 static Univcoord_T *splicesites = NULL;
420 static Splicetype_T *splicetypes = NULL;
421 static Chrpos_T *splicedists = NULL; /* maximum observed splice distance for given splice site */
422 static int nsplicesites = 0;
423
424
425 /* Cmet and AtoI */
426 static char *user_modedir = NULL; /* user_cmetdir, user_atoidir */
427 static Mode_T mode = STANDARD;
428
429 /* Masked (e.g., introns) genome */
430 static char *masked_root = (char *) NULL;
431 static bool maskedp = false;
432
433
434 /* SNPs IIT */
435 static char *user_snpsdir = NULL;
436 static char *snps_root = (char *) NULL;
437 static IIT_T snps_iit = NULL;
438 static int *snps_divint_crosstable = NULL;
439
440
441 /* Tally IIT */
442 static char *user_tallydir = NULL;
443 static char *tally_root = (char *) NULL;
444 static IIT_T tally_iit = NULL;
445 static int *tally_divint_crosstable = NULL;
446
447 static char *user_runlengthdir = NULL;
448 static char *runlength_root = (char *) NULL;
449 static IIT_T runlength_iit = NULL;
450 static int *runlength_divint_crosstable = NULL;
451
452
453 /* Output options */
454 static unsigned int output_buffer_size = 1000;
455 static Outputtype_T output_type = STD_OUTPUT;
456
457 /* For Illumina, subtract 64. For Sanger, subtract 33. */
458 /* static int quality_score_adj = 64; -- Stored in mapq.c */
459
460 static bool user_quality_score_adj = false;
461 static bool user_quality_shift = false;
462 static int quality_shift = 0; /* For printing, may want -31 */
463
464 static bool exception_raise_p = true;
465 static bool add_paired_nomappers_p = false;
466 static bool paired_flag_means_concordant_p = false;
467 static bool quiet_if_excessive_p = false;
468 static int maxpaths_search = 1000;
469 static int maxpaths_report = 100;
470 static bool orderedp = false;
471 static bool failsonlyp = false;
472 static bool nofailsp = false;
473
474 static bool print_nsnpdiffs_p = false;
475 static bool print_snplabels_p = false;
476
477 static bool show_refdiff_p = false;
478 static bool clip_overlap_p = false;
479 static bool merge_overlap_p = false;
480 static bool merge_samechr_p = false;
481 static bool method_print_p = false;
482
483 /* SAM */
484 static int sam_headers_batch = -1;
485 static bool sam_headers_p = true;
486 static bool sam_hardclip_use_S_p = false;
487 static bool sam_insert_0M_p = false;
488 static bool sam_cigar_extended_p = false;
489 static bool sam_multiple_primaries_p = false;
490 static bool sam_sparse_secondaries_p = false;
491 static char *sam_read_group_id = NULL;
492 static char *sam_read_group_name = NULL;
493 static char *sam_read_group_library = NULL;
494 static char *sam_read_group_platform = NULL;
495 static bool force_xs_direction_p = false;
496 static bool md_lowercase_variant_p = false;
497 static bool hide_soft_clips_p = false;
498 static Cigar_action_T cigar_action = CIGAR_ACTION_WARNING;
499
500 static bool filter_within_trims_p;
501 static bool specified_filter_within_trims_p = false;
502 static bool user_filter_within_trims_p;
503
504 static bool omit_concordant_uniq_p = false;
505 static bool omit_concordant_mult_p = false;
506
507
508 /* Input/output */
509 static char *split_output_root = NULL;
510 static char *output_file = NULL;
511 static char *failedinput_root = NULL;
512 static bool appendp = false;
513 static Outbuffer_T outbuffer;
514 static Inbuffer_T inbuffer;
515 static unsigned int inbuffer_nspaces = 1000;
516 static bool timingp = false;
517 static bool unloadp = false;
518
519
520 /* Alignment options */
521 static bool uncompressedp = false;
522
523 /* getopt used alphabetically: AaBCcDdEeGgiJjKklMmNnOoQqstVvwYyZz7 */
524
525 static struct option long_options[] = {
526 /* Input options */
527 {"transcriptdir", required_argument, 0, 'C'}, /* user_transcriptomedir */
528 {"transcriptdb", required_argument, 0, 'c'}, /* transcriptome_dbroot */
529 {"dir", required_argument, 0, 'D'}, /* user_genomedir */
530 {"db", required_argument, 0, 'd'}, /* genome_dbroot */
531 {"use-localdb", required_argument, 0, 0}, /* user_localdb_p, use_localdb_p */
532 {"use-transcriptome-only", no_argument, 0, 0}, /* use_only_transcriptome_p */
533 {"kmer", required_argument, 0, 'k'}, /* required_index1part, index1part */
534 {"sampling", required_argument, 0, 0}, /* required_index1interval, index1interval */
535 {"genomefull", no_argument, 0, 'G'}, /* uncompressedp */
536 {"part", required_argument, 0, 'q'}, /* part_modulus, part_interval */
537 {"orientation", required_argument, 0, 0}, /* invert_first_p, invert_second_p */
538 {"input-buffer-size", required_argument, 0, 0}, /* inbuffer_nspaces */
539 {"barcode-length", required_argument, 0, 0}, /* barcode_length */
540 {"endtrim-length", required_argument, 0, 0}, /* endtrim_length */
541 {"fastq-id-start", required_argument, 0, 0}, /* acc_fieldi_start */
542 {"fastq-id-end", required_argument, 0, 0}, /* acc_fieldi_end */
543 {"force-single-end", no_argument, 0, 0}, /* force_single_end_p */
544 {"filter-chastity", required_argument, 0, 0}, /* filter_chastity_p, filter_if_both_p */
545 {"allow-pe-name-mismatch", no_argument, 0, 0}, /* allow_paired_end_mismatch_p */
546
547 {"read-files-command", required_argument, 0, 0}, /* read_files_command */
548 #ifdef HAVE_ZLIB
549 {"gunzip", no_argument, 0, 0}, /* gunzip_p */
550 #endif
551
552 #ifdef HAVE_BZLIB
553 {"bunzip2", no_argument, 0, 0}, /* bunzip2_p */
554 #endif
555
556 {"interleaved", no_argument, 0, 0}, /* interleavedp */
557
558 /* Compute options */
559 {"use-shared-memory", required_argument, 0, 0}, /* sharedp */
560 {"preload-shared-memory", no_argument, 0, 0}, /* preload_shared_memory_p */
561 {"unload-shared-memory", no_argument, 0, 0}, /* unload_shared_memory_p */
562 #ifdef HAVE_MMAP
563 {"batch", required_argument, 0, 'B'}, /* offsetsstrm_access, positions_access, genome_access */
564 #endif
565 {"expand-offsets", required_argument, 0, 0}, /* expand_offsets_p */
566 {"pairmax-dna", required_argument, 0, 0}, /* pairmax_dna */
567 {"pairmax-rna", required_argument, 0, 0}, /* pairmax_rna */
568 {"pairexpect", required_argument, 0, 0}, /* expected_pairlength */
569 {"pairdev", required_argument, 0, 0}, /* pairlength_deviation */
570
571 {"nthreads", required_argument, 0, 't'}, /* nthreads */
572 {"adapter-strip", required_argument, 0, 'a'}, /* chop_primers_p */
573
574 {"query-unk-mismatch", required_argument, 0, 0}, /* query_unk_mismatch_p */
575 {"genome-unk-mismatch", required_argument, 0, 0}, /* genome_unk_mismatch_p */
576
577 {"trim-mismatch-score", required_argument, 0, 0}, /* trim_mismatch_score, user_trim_mismatch_score_p */
578 {"trim-indel-score", required_argument, 0, 0}, /* trim_indel_score, user_trim_mismatch_score_p */
579 {"novelsplicing", required_argument, 0, 'N'}, /* novelsplicingp */
580 {"find-dna-chimeras", required_argument, 0, 0}, /* find_dna_chimeras */
581
582 {"max-mismatches", required_argument, 0, 'm'}, /* user_mismatches_refalt_float */
583 {"max-ref-mismatches", required_argument, 0, 0}, /* user_mismatches_ref_float */
584 {"min-coverage", required_argument, 0, 0}, /* user_mincoverage_float */
585 {"filter-within-trims", required_argument, 0, 0}, /* filter_within_trims_p */
586
587 #if 0
588 {"indel-penalty-middle", required_argument, 0, 'i'}, /* indel_penalty_middle */
589 {"indel-penalty-end", required_argument, 0, 'I'}, /* indel_penalty_end */
590 #else
591 {"indel-penalty", required_argument, 0, 'i'}, /* indel_penalty_middle, indel_penalty_end */
592 #endif
593
594 {"indel-endlength", required_argument, 0, 0}, /* min_indel_end_matches, allow_end_indels_p */
595
596 {"max-middle-insertions", required_argument, 0, 'y'}, /* max_middle_insertions_float */
597 {"max-middle-deletions", required_argument, 0, 'z'}, /* max_middle_deletions_float */
598 {"max-end-insertions", required_argument, 0, 'Y'}, /* max_end_insertions */
599 {"max-end-deletions", required_argument, 0, 'Z'}, /* max_end_deletions */
600 {"suboptimal-levels", required_argument, 0, 'M'}, /* subopt_levels */
601
602 {"localsplicedist", required_argument, 0, 'w'}, /* shortsplicedist */
603 {"novelend-splicedist", required_argument, 0, 0}, /* shortsplicedist_novelend */
604 {"splicingdir", required_argument, 0, 0}, /* user_splicingdir */
605 {"use-splicing", required_argument, 0, 's'}, /* splicing_iit, knownsplicingp, find_dna_chimeras_p */
606 {"ambig-splice-noclip", no_argument, 0, 0}, /* amb_clip_p */
607 {"genes", required_argument, 0, 'g'}, /* genes_iit */
608 {"favor-multiexon", no_argument, 0, 0}, /* favor_multiexon_p */
609 {"end-detail", required_argument, 0, 0}, /* end_detail */
610
611 {"local-splice-penalty", required_argument, 0, 0}, /* localsplicing_penalty */
612 {"distant-splice-penalty", required_argument, 0, 'E'}, /* distantsplicing_penalty */
613 {"distant-splice-endlength", required_argument, 0, 'K'}, /* min_distantsplicing_end_matches */
614 {"shortend-splice-endlength", required_argument, 0, 'l'}, /* min_shortend */
615 {"distant-splice-identity", required_argument, 0, 0}, /* min_distantsplicing_identity */
616 {"antistranded-penalty", required_argument, 0, 0}, /* antistranded_penalty */
617 {"merge-distant-samechr", no_argument, 0, 0}, /* merge_samechr_p */
618
619 {"cmetdir", required_argument, 0, 0}, /* user_modedir */
620 {"atoidir", required_argument, 0, 0}, /* user_modedir */
621 {"mode", required_argument, 0, 0}, /* mode */
622
623 {"use-mask", required_argument, 0, 'e'}, /* masked_root */
624 {"snpsdir", required_argument, 0, 'V'}, /* user_snpsdir */
625 {"use-snps", required_argument, 0, 'v'}, /* snps_root */
626
627 {"tallydir", required_argument, 0, 0}, /* user_tallydir */
628 {"use-tally", required_argument, 0, 0}, /* tally_root */
629
630 {"runlengthdir", required_argument, 0, 0}, /* user_runlengthdir */
631 {"use-runlength", required_argument, 0, 0}, /* runlength_root */
632
633 {"max-anchors", required_argument, 0, 0}, /* max_anchors */
634 {"gmap-mode", required_argument, 0, 0}, /* gmap_mode */
635 {"trigger-score-for-gmap", required_argument, 0, 0}, /* trigger_score_for_gmap */
636 {"gmap-min-match-length", required_argument, 0, 0}, /* gmap_min_nconsecutive */
637 {"gmap-allowance", required_argument, 0, 0}, /* gmap_allowance */
638 {"max-gmap-pairsearch", required_argument, 0, 0}, /* max_gmap_pairsearch */
639 {"max-gmap-terminal", required_argument, 0, 0}, /* max_gmap_terminal */
640 {"max-gmap-improvement", required_argument, 0, 0}, /* max_gmap_improvement */
641
642 /* Output options */
643 {"output-buffer-size", required_argument, 0, 0}, /* output_buffer_size */
644 {"format", required_argument, 0, 'A'}, /* output_type */
645
646 {"quality-protocol", required_argument, 0, 0}, /* quality_score_adj, quality_shift */
647 {"quality-zero-score", required_argument, 0, 'J'}, /* quality_score_adj */
648 {"quality-print-shift", required_argument, 0, 'j'}, /* quality_shift */
649 {"sam-headers-batch", required_argument, 0, 0}, /* sam_headers_batch */
650 {"no-sam-headers", no_argument, 0, 0}, /* sam_headers_p */
651 {"sam-hardclip-use-S", no_argument, 0, 0}, /* sam_hardclip_use_S_p */
652 {"sam-use-0M", no_argument, 0, 0}, /* sam_insert_0M_p */
653 {"sam-extended-cigar", no_argument, 0, 0}, /* sam_cigar_extended_p */
654 {"sam-multiple-primaries", no_argument, 0, 0}, /* sam_multiple_primaries_p */
655 {"sam-sparse-secondaries", no_argument, 0, 0}, /* sam_sparse_secondaries_p */
656 {"read-group-id", required_argument, 0, 0}, /* sam_read_group_id */
657 {"read-group-name", required_argument, 0, 0}, /* sam_read_group_name */
658 {"read-group-library", required_argument, 0, 0}, /* sam_read_group_library */
659 {"read-group-platform", required_argument, 0, 0}, /* sam_read_group_platform */
660 {"force-xs-dir", no_argument, 0, 0}, /* force_xs_direction_p */
661 {"md-lowercase-snp", no_argument, 0, 0}, /* md_lowercase_variant_p */
662 {"extend-soft-clips", no_argument, 0, 0}, /* hide_soft_clips_p */
663 {"action-if-cigar-error", required_argument, 0, 0}, /* cigar_action */
664
665 {"noexceptions", no_argument, 0, '0'}, /* exception_raise_p */
666 #if 0
667 {"maxsearch", required_argument, 0, 0}, /* maxpaths_search */
668 #endif
669 {"npaths", required_argument, 0, 'n'}, /* maxpaths_report */
670 {"add-paired-nomappers", no_argument, 0, 0}, /* add_paired_nomappers_p */
671 {"paired-flag-means-concordant", required_argument, 0, 0}, /* paired_flag_means_concordant_p */
672 {"quiet-if-excessive", no_argument, 0, 'Q'}, /* quiet_if_excessive_p */
673 {"ordered", no_argument, 0, 'O'}, /* orderedp */
674 {"clip-overlap", no_argument, 0, 0}, /* clip_overlap_p */
675 {"merge-overlap", no_argument, 0, 0}, /* merge_overlap_p */
676 {"show-method", no_argument, 0, 0}, /* method_print_p */
677
678 {"show-refdiff", no_argument, 0, 0}, /* show_refdiff_p */
679 {"print-snps", no_argument, 0, 0}, /* print_snplabels_p */
680 {"failsonly", no_argument, 0, 0}, /* failsonlyp */
681 {"nofails", no_argument, 0, 0}, /* nofailsp */
682 {"output-file", required_argument, 0, 'o'}, /* output_file */
683 {"split-output", required_argument, 0, 0}, /* split_output_root */
684 {"failed-input", required_argument, 0, 0}, /* failed_input_root */
685 {"append-output", no_argument, 0, 0}, /* appendp */
686
687 {"order-among-best", required_argument, 0, 0}, /* want_random_p */
688
689 {"omit-concordant-uniq", no_argument, 0, 0}, /* omit_concordant_uniq_p */
690 {"omit-concordant-mult", no_argument, 0, 0}, /* omit_concordant_mult_p */
691
692 /* Diagnostic options */
693 {"time", no_argument, 0, 0}, /* timingp */
694 {"unload", no_argument, 0, 0}, /* unloadp */
695
696 /* Obsolete, but included for backward compatibility */
697 {"use-sarray", required_argument, 0, 0},
698 {"terminal-threshold", required_argument, 0, 0},
699
700 /* Help options */
701 {"check", no_argument, 0, 0}, /* check_compiler_assumptions */
702 {"version", no_argument, 0, 0}, /* print_program_version */
703 {"help", no_argument, 0, 0}, /* print_program_usage */
704 {0, 0, 0, 0}
705 };
706
707
708 static void
print_program_version()709 print_program_version () {
710 char *genomedir;
711
712 fprintf(stdout,"\n");
713 fprintf(stdout,"GSNAP: Genomic Short Nucleotide Alignment Program\n");
714 fprintf(stdout,"Part of GMAP package, version %s\n",PACKAGE_VERSION);
715 fprintf(stdout,"Build target: %s\n",TARGET);
716 fprintf(stdout,"Features: ");
717 #ifdef HAVE_PTHREAD
718 fprintf(stdout,"pthreads enabled, ");
719 #else
720 fprintf(stdout,"no pthreads, ");
721 #endif
722 #ifdef HAVE_ALLOCA
723 fprintf(stdout,"alloca available, ");
724 #else
725 fprintf(stdout,"no alloca, ");
726 #endif
727 #ifdef HAVE_ZLIB
728 fprintf(stdout,"zlib available, ");
729 #else
730 fprintf(stdout,"no zlib, ");
731 #endif
732 #ifdef HAVE_MMAP
733 fprintf(stdout,"mmap available, ");
734 #else
735 fprintf(stdout,"no mmap, ");
736 #endif
737 #ifdef WORDS_BIGENDIAN
738 fprintf(stdout,"bigendian, ");
739 #else
740 fprintf(stdout,"littleendian, ");
741 #endif
742 #ifdef HAVE_SIGACTION
743 fprintf(stdout,"sigaction available, ");
744 #else
745 fprintf(stdout,"no sigaction, ");
746 #endif
747 #ifdef HAVE_64_BIT
748 fprintf(stdout,"64 bits available");
749 #else
750 fprintf(stdout,"64 bits not available");
751 #endif
752 fprintf(stdout,"\n");
753
754 fprintf(stdout,"Popcnt:");
755 #ifdef HAVE_POPCNT
756 fprintf(stdout," popcnt/lzcnt/tzcnt");
757 #endif
758 #ifdef HAVE_MM_POPCNT
759 fprintf(stdout," mm_popcnt");
760 #endif
761 #ifdef HAVE_BUILTIN_POPCOUNT
762 fprintf(stdout," builtin_popcount");
763 #endif
764 fprintf(stdout,"\n");
765
766 fprintf(stdout,"Builtin functions:");
767 #ifdef HAVE_BUILTIN_CLZ
768 fprintf(stdout," builtin_clz");
769 #endif
770 #ifdef HAVE_BUILTIN_CTZ
771 fprintf(stdout," builtin_ctz");
772 #endif
773 #ifdef HAVE_BUILTIN_POPCOUNT
774 fprintf(stdout," builtin_popcount");
775 #endif
776 fprintf(stdout,"\n");
777
778 fprintf(stdout,"SIMD functions compiled:");
779 #ifdef HAVE_ALTIVEC
780 fprintf(stdout," Altivec");
781 #endif
782 #ifdef HAVE_MMX
783 fprintf(stdout," MMX");
784 #endif
785 #ifdef HAVE_SSE
786 fprintf(stdout," SSE");
787 #endif
788 #ifdef HAVE_SSE2
789 fprintf(stdout," SSE2");
790 #endif
791 #ifdef HAVE_SSE3
792 fprintf(stdout," SSE3");
793 #endif
794 #ifdef HAVE_SSSE3
795 fprintf(stdout," SSSE3");
796 #endif
797 #ifdef HAVE_SSE4_1
798 fprintf(stdout," SSE4.1");
799 #endif
800 #ifdef HAVE_SSE4_2
801 fprintf(stdout," SSE4.2");
802 #endif
803 #ifdef HAVE_AVX2
804 fprintf(stdout," AVX2");
805 #endif
806 #ifdef HAVE_AVX512
807 fprintf(stdout," AVX512");
808 #endif
809 #ifdef HAVE_AVX512BW
810 fprintf(stdout," AVX512BW");
811 #endif
812 fprintf(stdout,"\n");
813
814
815 fprintf(stdout,"Sizes: off_t (%d), size_t (%d), unsigned int (%d), long int (%d), long long int (%d)\n",
816 (int) sizeof(off_t),(int) sizeof(size_t),(int) sizeof(unsigned int),(int) sizeof(long int),(int) sizeof(long long int));
817 fprintf(stdout,"Default gmap directory (compiled): %s\n",GMAPDB);
818 genomedir = Datadir_find_genomedir(/*user_genomedir*/NULL);
819 fprintf(stdout,"Default gmap directory (environment): %s\n",genomedir);
820 FREE(genomedir);
821 fprintf(stdout,"Maximum stack read length: %d\n",MAX_STACK_READLENGTH);
822 fprintf(stdout,"Thomas D. Wu, Genentech, Inc.\n");
823 fprintf(stdout,"Contact: twu@gene.com\n");
824 fprintf(stdout,"\n");
825 return;
826 }
827
828 /* This flag is not well-supported, and therefore hidden, but
829 kept for backwards compatibility */
830 /* -R, --rel=STRING Release\n\ */
831
832
833 static void
834 print_program_usage ();
835
836
837 static void
check_compiler_assumptions()838 check_compiler_assumptions () {
839 unsigned int x = rand(), y = rand();
840 #ifdef HAVE_SSE2
841 int z;
842 __m128i a;
843 #ifdef HAVE_SSE4_1
844 char negx, negy;
845 #endif
846 #endif
847
848 #ifdef HAVE_SSE2
849 /* With -mavx, compiler may use assembly instructions for _mm_set1_epi32 that don't work on non-AVX machines */
850 fprintf(stderr,"Checking compiler assumptions for SSE2: ");
851 fprintf(stderr,"%08X %08X",x,y);
852 a = _mm_xor_si128(_mm_set1_epi32(x),_mm_set1_epi32(y));
853 z = _mm_cvtsi128_si32(a);
854 fprintf(stderr," xor=%08X\n",z);
855 #endif
856
857
858 #ifdef HAVE_SSE4_1
859 if ((negx = (char) x) > 0) {
860 negx = -negx;
861 }
862 if ((negy = (char) y) > 0) {
863 negy = -negy;
864 }
865
866 fprintf(stderr,"Checking compiler assumptions for SSE4.1: ");
867 fprintf(stderr,"%d %d",negx,negy);
868 a = _mm_max_epi8(_mm_set1_epi8(negx),_mm_set1_epi8(negy));
869 z = _mm_extract_epi8(a,0);
870 fprintf(stderr," max=%d => ",z);
871 if (negx > negy) {
872 if (z == (int) negx) {
873 fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
874 } else {
875 fprintf(stderr,"compiler zero extends\n");
876 }
877 } else {
878 if (z == (int) negy) {
879 fprintf(stderr,"compiler sign extends\n"); /* technically incorrect, but SIMD procedures behave properly */
880 } else {
881 fprintf(stderr,"compiler zero extends\n");
882 }
883 }
884 #endif
885
886
887 #ifdef HAVE_SSE4_2
888 fprintf(stderr,"Checking compiler assumptions for SSE4.2 options: ");
889 fprintf(stderr,"%08X ",x);
890 #ifdef HAVE_LZCNT
891 fprintf(stderr,"_lzcnt_u32=%d ",_lzcnt_u32(x));
892 #endif
893 #ifdef HAVE_BUILTIN_CLZ
894 fprintf(stderr,"__builtin_clz=%d ",__builtin_clz(x));
895 #endif
896 #ifdef HAVE_TZCNT
897 fprintf(stderr,"_tzcnt_u32=%d ",_tzcnt_u32(x));
898 #endif
899 #ifdef HAVE_BUILTIN_CTZ
900 fprintf(stderr,"__builtin_ctz=%d ",__builtin_ctz(x));
901 #endif
902
903 #ifdef HAVE_POPCNT
904 fprintf(stderr,"_popcnt32=%d ",_popcnt32(x));
905 #endif
906 #if defined(HAVE_MM_POPCNT)
907 fprintf(stderr,"_mm_popcnt_u32=%d ",_mm_popcnt_u32(x));
908 #endif
909 #if defined(HAVE_BUILTIN_POPCOUNT)
910 fprintf(stderr,"__builtin_popcount=%d ",__builtin_popcount(x));
911 #endif
912
913 fprintf(stderr,"\n");
914 #endif
915
916 fprintf(stderr,"Finished checking compiler assumptions\n");
917
918 return;
919 }
920
921
922 /************************************************************************/
923
924
925 static Filestring_T
process_request(Filestring_T * fp_failedinput,Filestring_T * fp_failedinput_1,Filestring_T * fp_failedinput_2,double * worker_runtime,Request_T request,Intlistpool_T intlistpool,Univcoordlistpool_T univcoordlistpool,Listpool_T listpool,Univdiagpool_T univdiagpool,Hitlistpool_T hitlistpool,Stopwatch_T worker_stopwatch)926 process_request (Filestring_T *fp_failedinput, Filestring_T *fp_failedinput_1, Filestring_T *fp_failedinput_2,
927 double *worker_runtime, Request_T request,
928 #if 0
929 Oligoindex_array_T oligoindices_minor,
930 Dynprog_T dynprogL, Dynprog_T dynprogM, Dynprog_T dynprogR,
931 Pairpool_T pairpool, Diagpool_T diagpool, Cellpool_T cellpool,
932 #endif
933 Intlistpool_T intlistpool, Univcoordlistpool_T univcoordlistpool,
934 Listpool_T listpool, Univdiagpool_T univdiagpool, Hitlistpool_T hitlistpool,
935 Stopwatch_T worker_stopwatch) {
936 Filestring_T fp;
937 Result_T result;
938 int jobid;
939 Shortread_T queryseq1, queryseq2;
940 Stage3end_T *stage3array, *stage3array5, *stage3array3;
941 Stage3pair_T *stage3pairarray;
942
943 int npaths_primary, npaths_altloc, npaths5_primary, npaths5_altloc, npaths3_primary, npaths3_altloc, i;
944 int first_absmq, second_absmq, first_absmq5, second_absmq5, first_absmq3, second_absmq3;
945 Pairtype_T final_pairtype;
946
947 jobid = Request_id(request);
948 queryseq1 = Request_queryseq1(request);
949 queryseq2 = Request_queryseq2(request);
950
951 #if 0
952 Pairpool_reset(pairpool);
953 Diagpool_reset(diagpool);
954 Cellpool_reset(cellpool);
955 #endif
956 Intlistpool_reset(intlistpool);
957 Univcoordlistpool_reset(univcoordlistpool);
958 Listpool_reset(listpool);
959 Univdiagpool_reset(univdiagpool);
960 Hitlistpool_reset(hitlistpool);
961
962 /* printf("%s\n",Shortread_accession(queryseq1)); */
963
964 if (worker_stopwatch != NULL) {
965 Stopwatch_start(worker_stopwatch);
966 }
967
968 if (queryseq2 == NULL) {
969 stage3array = Stage1_single_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,
970 queryseq1,intlistpool,univcoordlistpool,listpool,univdiagpool,
971 hitlistpool);
972
973 result = Result_single_read_new(jobid,(void **) stage3array,npaths_primary,npaths_altloc,first_absmq,second_absmq);
974 fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
975 *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
976 Result_free(&result);
977 return fp;
978
979 } else if ((stage3pairarray = Stage1_paired_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,&final_pairtype,
980 &stage3array5,&npaths5_primary,&npaths5_altloc,&first_absmq5,&second_absmq5,
981 &stage3array3,&npaths3_primary,&npaths3_altloc,&first_absmq3,&second_absmq3,
982 queryseq1,queryseq2,(Chrpos_T) pairmax_linear,
983 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool)) != NULL) {
984 /* Paired or concordant hits found */
985 result = Result_paired_read_new(jobid,(void **) stage3pairarray,npaths_primary,npaths_altloc,first_absmq,second_absmq,
986 final_pairtype);
987 fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
988 *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
989 Result_free(&result);
990 return fp;
991
992 } else if (chop_primers_p == false || Shortread_chop_primers(queryseq1,queryseq2) == false) {
993 /* No paired or concordant hits found, and no adapters found */
994 /* Report ends as unpaired */
995 result = Result_paired_as_singles_new(jobid,(void **) stage3array5,npaths5_primary,npaths5_altloc,first_absmq5,second_absmq5,
996 (void **) stage3array3,npaths3_primary,npaths3_altloc,first_absmq3,second_absmq3);
997 fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
998 *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
999 Result_free(&result);
1000 return fp;
1001
1002 } else {
1003 /* Try with potential primers chopped. queryseq1 and queryseq2 altered by Shortread_chop_primers. */
1004 for (i = 0; i < npaths5_primary + npaths5_altloc; i++) {
1005 Stage3end_free(&(stage3array5[i]));
1006 }
1007 FREE_OUT(stage3array5);
1008
1009 for (i = 0; i < npaths3_primary + npaths3_altloc; i++) {
1010 Stage3end_free(&(stage3array3[i]));
1011 }
1012 FREE_OUT(stage3array3);
1013
1014 if ((stage3pairarray = Stage1_paired_read(&npaths_primary,&npaths_altloc,&first_absmq,&second_absmq,&final_pairtype,
1015 &stage3array5,&npaths5_primary,&npaths5_altloc,&first_absmq5,&second_absmq5,
1016 &stage3array3,&npaths3_primary,&npaths3_altloc,&first_absmq3,&second_absmq3,
1017 queryseq1,queryseq2,(Chrpos_T) pairmax_linear,
1018 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool)) != NULL) {
1019 /* Paired or concordant hits found, after chopping adapters */
1020 result = Result_paired_read_new(jobid,(void **) stage3pairarray,npaths_primary,npaths_altloc,first_absmq,second_absmq,
1021 final_pairtype);
1022
1023 } else {
1024 /* No paired or concordant hits found, after chopping adapters */
1025 result = Result_paired_as_singles_new(jobid,(void **) stage3array5,npaths5_primary,npaths5_altloc,first_absmq5,second_absmq5,
1026 (void **) stage3array3,npaths3_primary,npaths3_altloc,first_absmq3,second_absmq3);
1027 }
1028
1029 fp = Output_filestring_fromresult(&(*fp_failedinput),&(*fp_failedinput_1),&(*fp_failedinput_2),result,request);
1030 *worker_runtime = worker_stopwatch == NULL ? 0.00 : Stopwatch_stop(worker_stopwatch);
1031 Result_free(&result);
1032 return fp;
1033 }
1034 }
1035
1036
1037
1038 #ifdef HAVE_SIGACTION
1039 static const Except_T sigfpe_error = {"SIGFPE--arithmetic exception"};
1040 static const Except_T sigsegv_error = {"SIGSEGV--segmentation violation"};
1041 static const Except_T sigtrap_error = {"SIGTRAP--hardware fault"};
1042 static const Except_T misc_signal_error = {"Miscellaneous signal"};
1043
1044 static void
signal_handler(int sig)1045 signal_handler (int sig) {
1046 Request_T request;
1047 Shortread_T queryseq1, queryseq2;
1048
1049 if (sig == SIGPIPE) {
1050 /* Allow pipe */
1051 return;
1052 }
1053
1054 switch (sig) {
1055 case SIGABRT: fprintf(stderr,"Signal received: SIGABRT\n"); break;
1056 case SIGFPE: fprintf(stderr,"Signal received: SIGFPE\n"); break;
1057 case SIGHUP: fprintf(stderr,"Signal received: SIGHUP\n"); break;
1058 case SIGILL:
1059 fprintf(stderr,"Signal received: SIGILL\n");
1060 fprintf(stderr,"An illegal instruction means that this program is being run on a computer\n");
1061 fprintf(stderr," with different features than the computer used to compile the program\n");
1062 fprintf(stderr,"You may need to re-compile the program on the same computer type as the target machine\n");
1063 fprintf(stderr," or re-compile with fewer features by doing something like\n");
1064 fprintf(stderr," ./configure --disable-simd\n");
1065 break;
1066 case SIGINT: fprintf(stderr,"Signal received: SIGINT\n"); break;
1067 case SIGPIPE: fprintf(stderr,"Signal received: SIGPIPE\n"); break;
1068 case SIGQUIT: fprintf(stderr,"Signal received: SIGQUIT\n"); break;
1069 case SIGSEGV: fprintf(stderr,"Signal received: SIGSEGV\n"); break;
1070 case SIGSYS: fprintf(stderr,"Signal received: SIGSYS\n"); break;
1071 case SIGTERM: fprintf(stderr,"Signal received: SIGTERM\n"); break;
1072 case SIGTRAP: fprintf(stderr,"Signal received: SIGTRAP\n"); break;
1073 case SIGXCPU: fprintf(stderr,"Signal received: SIGXCPU\n"); break;
1074 case SIGXFSZ: fprintf(stderr,"Signal received: SIGXFSZ\n"); break;
1075 }
1076
1077 Access_emergency_cleanup();
1078
1079 #if 0
1080 /* Appears to hang */
1081 #ifdef USE_MPI
1082 MPI_Barrier(MPI_COMM_WORLD);
1083 #endif
1084 #endif
1085
1086 #ifdef HAVE_PTHREAD
1087 request = (Request_T) pthread_getspecific(global_request_key);
1088 if (request == NULL) {
1089 /* fprintf(stderr,"Unable to retrieve request for thread\n"); */
1090 } else {
1091 queryseq1 = Request_queryseq1(request);
1092 queryseq2 = Request_queryseq2(request);
1093 if (queryseq1 == NULL) {
1094 fprintf(stderr,"Unable to retrieve queryseq for request\n");
1095 } else {
1096 fprintf(stderr,"Problem sequence: ");
1097 fprintf(stderr,"%s (%d bp)\n",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1098 if (queryseq2 == NULL) {
1099 Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1100 } else {
1101 Shortread_stderr_query_pairedend_fasta(queryseq1,queryseq2,invert_first_p,invert_second_p);
1102 }
1103 }
1104 }
1105 #endif
1106
1107 exit(9);
1108
1109 return;
1110 }
1111 #endif
1112
1113
1114 /* #define POOL_FREE_INTERVAL 200 */
1115 #define POOL_FREE_INTERVAL 1
1116
1117
1118 static void
single_thread()1119 single_thread () {
1120 Request_T request;
1121 Filestring_T fp, fp_failedinput, fp_failedinput_1, fp_failedinput_2;
1122 Shortread_T queryseq1;
1123 Stopwatch_T worker_stopwatch;
1124
1125 /* For GMAP */
1126 /* Oligoindex_array_T oligoindices_major, oligoindices_minor; */
1127 /* Dynprog_T dynprogL, dynprogM, dynprogR; */
1128 #if 0
1129 Pairpool_T pairpool;
1130 Diagpool_T diagpool;
1131 Cellpool_T cellpool;
1132 #endif
1133 Intlistpool_T intlistpool;
1134 Univcoordlistpool_T univcoordlistpool;
1135 Listpool_T listpool;
1136 Univdiagpool_T univdiagpool;
1137 Hitlistpool_T hitlistpool;
1138 int jobid = 0;
1139 double worker_runtime;
1140
1141 #ifdef MEMUSAGE
1142 long int memusage_constant = 0, memusage;
1143 char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
1144 #endif
1145
1146 #if 0
1147 oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1148 oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1149 dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1150 /*doublep*/true);
1151 dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1152 /*doublep*/false);
1153 dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1154 /*doublep*/true);
1155 #endif
1156
1157 #if 0
1158 pairpool = Pairpool_new();
1159 diagpool = Diagpool_new();
1160 cellpool = Cellpool_new();
1161 #endif
1162 intlistpool = Intlistpool_new();
1163 univcoordlistpool = Univcoordlistpool_new();
1164 listpool = Listpool_new();
1165 univdiagpool = Univdiagpool_new();
1166 hitlistpool = Hitlistpool_new();
1167 worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
1168
1169 /* Except_stack_create(); -- requires pthreads */
1170
1171 #ifdef MEMUSAGE
1172 memusage_constant += Mem_usage_report_std_heap();
1173 Genomicpos_commafmt_fill(comma0,memusage_constant);
1174 Mem_usage_reset_heap_baseline(0);
1175 #endif
1176
1177 while ((request = Inbuffer_get_request(inbuffer)) != NULL) {
1178 #ifdef USE_MPI
1179 debug(printf("rank %d, ",myid));
1180 #endif
1181 debug(printf("single_thread got request %d\n",Request_id(request)));
1182
1183 #ifdef MEMUSAGE
1184 queryseq1 = Request_queryseq1(request);
1185 /* fprintf(stderr,"Single thread starting %s\n",Shortread_accession(queryseq1)); */
1186 Mem_usage_reset_stack_max();
1187 Mem_usage_reset_heap_max();
1188 #endif
1189
1190 TRY
1191 fp = process_request(&fp_failedinput,&fp_failedinput_1,&fp_failedinput_2,&worker_runtime,
1192 request,
1193 /*oligoindices_minor,dynprogL,dynprogM,dynprogR,*/
1194 /*pairpool,diagpool,cellpool,*/
1195 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1196 worker_stopwatch);
1197 if (timingp == true) {
1198 queryseq1 = Request_queryseq1(request);
1199 fprintf(stderr,"%s\t%.6f\n",Shortread_accession(queryseq1),worker_runtime);
1200 }
1201
1202 ELSE
1203 queryseq1 = Request_queryseq1(request);
1204 if (queryseq1 == NULL) {
1205 fprintf(stderr,"NULL");
1206 } else if (Shortread_accession(queryseq1) == NULL) {
1207 fprintf(stderr,"unnamed (%d bp)",Shortread_fulllength(queryseq1));
1208 } else {
1209 fprintf(stderr,"Problem sequence: ");
1210 fprintf(stderr,"%s (%d bp)",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1211 }
1212 fprintf(stderr,"\n");
1213 if (Request_queryseq2(request) == NULL) {
1214 Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1215 } else {
1216 Shortread_stderr_query_pairedend_fasta(queryseq1,Request_queryseq2(request),
1217 invert_first_p,invert_second_p);
1218 }
1219 fprintf(stderr,"\n");
1220 fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
1221
1222 fprintf(stderr,"Exiting...\n");
1223 exit(9);
1224 RERAISE;
1225 END_TRY;
1226
1227 Outbuffer_print_filestrings(fp,fp_failedinput,fp_failedinput_1,fp_failedinput_2);
1228
1229 if (jobid % POOL_FREE_INTERVAL == 0) {
1230 #if 0
1231 Pairpool_free_memory(pairpool);
1232 Diagpool_free_memory(diagpool);
1233 Cellpool_free_memory(cellpool);
1234 #endif
1235 Intlistpool_free_memory(intlistpool);
1236 Univcoordlistpool_free_memory(univcoordlistpool);
1237 Listpool_free_memory(listpool);
1238 Univdiagpool_free_memory(univdiagpool);
1239 Hitlistpool_free_memory(hitlistpool);
1240 }
1241
1242 #ifdef MEMUSAGE
1243 /* Copy acc before we free the request */
1244 queryseq1 = Request_queryseq1(request);
1245 strncpy(acc,Shortread_accession(queryseq1),100);
1246 acc[100] = '\0';
1247 #endif
1248
1249 Request_free(&request);
1250
1251 #ifdef MEMUSAGE
1252 Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
1253 Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
1254 Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
1255 Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
1256 Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
1257
1258 #if 0
1259 fprintf(stderr,"Acc %s: constant %s max %s std %s keep %s in %s out %s\n",
1260 acc,comma0,comma1,comma2,comma3,comma4,comma5);
1261 #endif
1262
1263 if ((memusage = Mem_usage_report_std_heap()) != 0) {
1264 fprintf(stderr,"Acc %s: Memory leak in single thread of %ld bytes\n",acc,memusage);
1265 fflush(stdout);
1266 exit(9);
1267 } else if (Mem_usage_report_std_heap_max() > ONE_GB) {
1268 fprintf(stderr,"Acc %s: Excessive memory usage in single thread of %s bytes\n",acc,comma1);
1269 fflush(stdout);
1270 exit(9);
1271 }
1272 #endif
1273 }
1274
1275 #ifdef MEMUSAGE
1276 Mem_usage_std_heap_add(memusage_constant);
1277 #endif
1278
1279 /* Except_stack_destroy(); -- requires pthreads */
1280
1281 if (worker_stopwatch != NULL) {
1282 Stopwatch_free(&worker_stopwatch);
1283 }
1284 Hitlistpool_free(&hitlistpool);
1285 Univdiagpool_free(&univdiagpool);
1286 Listpool_free(&listpool);
1287 Univcoordlistpool_free(&univcoordlistpool);
1288 Intlistpool_free(&intlistpool);
1289 #if 0
1290 Cellpool_free(&cellpool);
1291 Diagpool_free(&diagpool);
1292 Pairpool_free(&pairpool);
1293 Dynprog_free(&dynprogR);
1294 Dynprog_free(&dynprogM);
1295 Dynprog_free(&dynprogL);
1296 Oligoindex_array_free(&oligoindices_minor);
1297 Oligoindex_array_free(&oligoindices_major);
1298 #endif
1299
1300 #ifdef MEMUSAGE
1301 Mem_usage_set_threadname("main");
1302 #endif
1303
1304 return;
1305 }
1306
1307
1308 #ifdef HAVE_PTHREAD
1309 static void *
worker_thread(void * data)1310 worker_thread (void *data) {
1311 Request_T request;
1312 Filestring_T fp, fp_failedinput, fp_failedinput_1, fp_failedinput_2;
1313 Shortread_T queryseq1;
1314 Stopwatch_T worker_stopwatch;
1315
1316 /* For GMAP */
1317 /* Oligoindex_array_T oligoindices_major, oligoindices_minor; */
1318 /* Dynprog_T dynprogL, dynprogM, dynprogR; */
1319 #if 0
1320 Pairpool_T pairpool;
1321 Diagpool_T diagpool;
1322 Cellpool_T cellpool;
1323 #endif
1324 Intlistpool_T intlistpool;
1325 Univcoordlistpool_T univcoordlistpool;
1326 Listpool_T listpool;
1327 Univdiagpool_T univdiagpool;
1328 Hitlistpool_T hitlistpool;
1329 int worker_jobid = 0;
1330 double worker_runtime;
1331 #if defined(DEBUG) || defined(MEMUSAGE)
1332 long int worker_id = (long int) data;
1333 #endif
1334
1335 #ifdef MEMUSAGE
1336 long int memusage_constant = 0, memusage;
1337 char threadname[12];
1338 char acc[100+1], comma0[20], comma1[20], comma2[20], comma3[20], comma4[20], comma5[20];
1339 sprintf(threadname,"thread-%ld",worker_id);
1340 Mem_usage_set_threadname(threadname);
1341 #endif
1342
1343 #ifdef USE_MPI
1344 debug(fprintf(stderr,"rank %d, ",myid));
1345 #endif
1346 debug(fprintf(stderr,"worker_thread %ld starting\n",worker_id));
1347
1348 /* Thread-specific data and storage */
1349 #if 0
1350 oligoindices_major = Oligoindex_array_new_major(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1351 oligoindices_minor = Oligoindex_array_new_minor(MAX_QUERYLENGTH_FOR_ALLOC,MAX_GENOMICLENGTH_FOR_ALLOC);
1352 dynprogL = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1353 /*doublep*/true);
1354 dynprogM = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1355 /*doublep*/false);
1356 dynprogR = Dynprog_new(nullgap,EXTRAQUERYGAP,maxpeelback,extramaterial_end,extramaterial_paired,
1357 /*doublep*/true);
1358 pairpool = Pairpool_new();
1359 diagpool = Diagpool_new();
1360 cellpool = Cellpool_new();
1361 #endif
1362
1363 intlistpool = Intlistpool_new();
1364 univcoordlistpool = Univcoordlistpool_new();
1365 listpool = Listpool_new();
1366 univdiagpool = Univdiagpool_new();
1367 hitlistpool = Hitlistpool_new();
1368 worker_stopwatch = (timingp == true) ? Stopwatch_new() : (Stopwatch_T) NULL;
1369
1370 Except_stack_create();
1371
1372 #ifdef MEMUSAGE
1373 memusage_constant += Mem_usage_report_std_heap();
1374 Genomicpos_commafmt_fill(comma0,memusage_constant);
1375 Mem_usage_reset_heap_baseline(0);
1376 #endif
1377
1378 while ((request = Inbuffer_get_request(inbuffer)) != NULL) {
1379 #ifdef USE_MPI
1380 debug(fprintf(stderr,"rank %d, ",myid));
1381 #endif
1382 debug(fprintf(stderr,"worker_thread %ld got request %d (%s)\n",
1383 worker_id,Request_id(request),Shortread_accession(Request_queryseq1(request))));
1384 pthread_setspecific(global_request_key,(void *) request);
1385
1386 #ifdef MEMUSAGE
1387 queryseq1 = Request_queryseq1(request);
1388 /* fprintf(stderr,"Thread %d starting %s\n",worker_id,Shortread_accession(queryseq1)); */
1389 Mem_usage_reset_stack_max();
1390 Mem_usage_reset_heap_max();
1391 #endif
1392
1393 TRY
1394 fp = process_request(&fp_failedinput,&fp_failedinput_1,&fp_failedinput_2,&worker_runtime,
1395 request,
1396 /*oligoindices_minor,dynprogL,dynprogM,dynprogR,*/
1397 /*pairpool,diagpool,cellpool,*/
1398 intlistpool,univcoordlistpool,listpool,univdiagpool,hitlistpool,
1399 worker_stopwatch);
1400 if (timingp == true) {
1401 queryseq1 = Request_queryseq1(request);
1402 fprintf(stderr,"%s\t%.6f\n",Shortread_accession(queryseq1),worker_runtime);
1403 }
1404
1405 ELSE
1406 queryseq1 = Request_queryseq1(request);
1407 if (queryseq1 == NULL) {
1408 fprintf(stderr,"NULL");
1409 } else if (Shortread_accession(queryseq1) == NULL) {
1410 fprintf(stderr,"unnamed (%d bp)",Shortread_fulllength(queryseq1));
1411 } else {
1412 fprintf(stderr,"Problem sequence: ");
1413 fprintf(stderr,"%s (%d bp)",Shortread_accession(queryseq1),Shortread_fulllength(queryseq1));
1414 }
1415 fprintf(stderr,"\n");
1416 if (Request_queryseq2(request) == NULL) {
1417 Shortread_stderr_query_singleend_fasta(queryseq1,/*headerseq*/queryseq1);
1418 } else {
1419 Shortread_stderr_query_pairedend_fasta(queryseq1,Request_queryseq2(request),
1420 invert_first_p,invert_second_p);
1421 }
1422 fprintf(stderr,"\n");
1423 fprintf(stderr,"To obtain a core dump, re-run program on problem sequence with the -0 [zero] flag\n");
1424
1425 fprintf(stderr,"Exiting...\n");
1426 exit(9);
1427 RERAISE;
1428 END_TRY;
1429
1430 Outbuffer_put_filestrings(outbuffer,fp,fp_failedinput,fp_failedinput_1,fp_failedinput_2);
1431
1432 if (worker_jobid % POOL_FREE_INTERVAL == 0) {
1433 #if 0
1434 Pairpool_free_memory(pairpool);
1435 Diagpool_free_memory(diagpool);
1436 Cellpool_free_memory(cellpool);
1437 #endif
1438 Intlistpool_free_memory(intlistpool);
1439 Univcoordlistpool_free_memory(univcoordlistpool);
1440 Listpool_free_memory(listpool);
1441 Univdiagpool_free_memory(univdiagpool);
1442 Hitlistpool_free_memory(hitlistpool);
1443 }
1444
1445 #ifdef MEMUSAGE
1446 /* Copy acc before we free the request */
1447 queryseq1 = Request_queryseq1(request);
1448 strncpy(acc,Shortread_accession(queryseq1),100);
1449 acc[100] = '\0';
1450 #endif
1451
1452 Request_free(&request);
1453
1454 #ifdef MEMUSAGE
1455 Genomicpos_commafmt_fill(comma1,Mem_usage_report_std_heap_max());
1456 Genomicpos_commafmt_fill(comma2,Mem_usage_report_std_heap());
1457 Genomicpos_commafmt_fill(comma3,Mem_usage_report_keep());
1458 Genomicpos_commafmt_fill(comma4,Mem_usage_report_in());
1459 Genomicpos_commafmt_fill(comma5,Mem_usage_report_out());
1460
1461 #if 0
1462 fprintf(stderr,"Acc %s, thread %d: constant %s max %s std %s keep %s in %s out %s\n",
1463 acc,worker_id,comma0,comma1,comma2,comma3,comma4,comma5);
1464 #endif
1465
1466 if ((memusage = Mem_usage_report_std_heap()) != 0) {
1467 fprintf(stderr,"Acc %s: Memory leak in worker thread %ld of %ld bytes\n",acc,worker_id,memusage);
1468 fflush(stdout);
1469 exit(9);
1470 } else if (Mem_usage_report_std_heap_max() > ONE_GB) {
1471 fprintf(stderr,"Acc %s: Excessive memory usage in worker thread %ld of %s bytes\n",acc,worker_id,comma1);
1472 fflush(stdout);
1473 exit(9);
1474 }
1475 #endif
1476 }
1477
1478 #ifdef MEMUSAGE
1479 Mem_usage_std_heap_add(memusage_constant);
1480 #endif
1481
1482 Except_stack_destroy();
1483
1484 if (worker_stopwatch != NULL) {
1485 Stopwatch_free(&worker_stopwatch);
1486 }
1487 Hitlistpool_free(&hitlistpool);
1488 Univdiagpool_free(&univdiagpool);
1489 Listpool_free(&listpool);
1490 Univcoordlistpool_free(&univcoordlistpool);
1491 Intlistpool_free(&intlistpool);
1492
1493 #if 0
1494 Cellpool_free(&cellpool);
1495 Diagpool_free(&diagpool);
1496 Pairpool_free(&pairpool);
1497 Dynprog_free(&dynprogR);
1498 Dynprog_free(&dynprogM);
1499 Dynprog_free(&dynprogL);
1500 Oligoindex_array_free(&oligoindices_minor);
1501 Oligoindex_array_free(&oligoindices_major);
1502 #endif
1503
1504 #ifdef MEMUSAGE
1505 Mem_usage_set_threadname("main");
1506 #endif
1507
1508 #ifdef USE_MPI
1509 debug(fprintf(stderr,"rank %d, ",myid));
1510 #endif
1511 debug(fprintf(stderr,"worker_thread %ld finished\n",worker_id));
1512
1513 return (void *) NULL;
1514 }
1515 #endif
1516
1517
1518 static void
parse_part(int * part_modulus,int * part_interval,char * string)1519 parse_part (int *part_modulus, int *part_interval, char *string) {
1520 char *p = string;
1521
1522 if (sscanf(p,"%d",&(*part_modulus)) < 1) {
1523 fprintf(stderr,"Cannot parse first integer from %s\n",string);
1524 exit(9);
1525 }
1526
1527 while (*p != '\0' && isdigit(*p)) {
1528 p++;
1529 }
1530 while (*p != '\0' && !isdigit(*p)) {
1531 p++;
1532 }
1533 if (sscanf(p,"%d",&(*part_interval)) < 1) {
1534 fprintf(stderr,"Cannot parse first integer from %s\n",string);
1535 exit(9);
1536 }
1537 if ((*part_modulus) >= (*part_interval)) {
1538 fprintf(stderr,"In %s, batch number %d must be less than the number of batches %d\n",
1539 string,*part_modulus,*part_interval);
1540 exit(9);
1541 }
1542 if (*part_interval == 0) {
1543 fprintf(stderr,"Bad batch specification %s. Batch interval cannot be 0.\n",string);
1544 exit(9);
1545 }
1546
1547 return;
1548 }
1549
1550
1551 #if 0
1552 static int
1553 add_gmap_mode (char *string) {
1554 if (!strcmp(string,"none")) {
1555 gmap_mode = 0;
1556 return 0;
1557 } else if (!strcmp(string,"all")) {
1558 gmap_mode = (GMAP_IMPROVEMENT | GMAP_ENDS | GMAP_PAIRSEARCH);
1559 return 1;
1560 } else {
1561 if (!strcmp(string,"improve")) {
1562 gmap_mode |= GMAP_IMPROVEMENT;
1563 } else if (!strcmp(string,"ends")) {
1564 gmap_mode |= GMAP_ENDS;
1565 } else if (!strcmp(string,"pairsearch")) {
1566 gmap_mode |= GMAP_PAIRSEARCH;
1567 } else {
1568 fprintf(stderr,"Don't recognize gmap-mode type %s\n",string);
1569 fprintf(stderr,"Allowed values are: none, all, improve, ends, pairsearch\n");
1570 exit(9);
1571 }
1572 return 1;
1573 }
1574 }
1575 #endif
1576
1577
1578 static char *
check_valid_int(char * string)1579 check_valid_int (char *string) {
1580 char *p = string;
1581
1582 if (*p == '+' || *p == '-') {
1583 p++;
1584 }
1585
1586 if (!isdigit(*p)) {
1587 fprintf(stderr,"value %s is not a valid int\n",string);
1588 exit(9);
1589 return NULL;
1590 }
1591 while (*p != '\0' && isdigit(*p)) {
1592 p++;
1593 }
1594
1595 if (*p == 'e') {
1596 p++;
1597 if (*p == '+') {
1598 p++;
1599 }
1600 if (!isdigit(*p)) {
1601 return false;
1602 }
1603 while (*p != '\0' && isdigit(*p)) {
1604 p++;
1605 }
1606 }
1607
1608 if (*p == '\0') {
1609 return string;
1610 } else {
1611 fprintf(stderr,"value %s is not a valid int\n",string);
1612 exit(9);
1613 return NULL;
1614 }
1615 }
1616
1617
1618 static double
check_valid_float(char * string,const char * option)1619 check_valid_float (char *string, const char *option) {
1620 double value;
1621 char *p = string;
1622
1623 if (*p == '+' || *p == '-') {
1624 p++;
1625 }
1626
1627 while (*p != '\0' && isdigit(*p)) {
1628 p++;
1629 }
1630 if (*p == '\0') {
1631 if ((value = atof(string)) > 1.0 || value < 0.0) {
1632 fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
1633 exit(9);
1634 } else {
1635 return value;
1636 }
1637 }
1638
1639 if (*p == '.') {
1640 p++;
1641 }
1642
1643 if (!isdigit(*p)) {
1644 fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1645 exit(9);
1646 return 0.0;
1647 }
1648 while (*p != '\0' && isdigit(*p)) {
1649 p++;
1650 }
1651
1652 if (*p == 'e') {
1653 p++;
1654 if (*p == '+' || *p == '-') {
1655 p++;
1656 }
1657 if (!isdigit(*p)) {
1658 fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1659 exit(9);
1660 return 0.0;
1661 }
1662 while (*p != '\0' && isdigit(*p)) {
1663 p++;
1664 }
1665 }
1666
1667 if (*p == '\0') {
1668 if ((value = atof(string)) > 1.0 || value < 0.0) {
1669 fprintf(stderr,"Value for option %s should be between 0.0 and 1.0\n",option);
1670 exit(9);
1671 } else {
1672 return value;
1673 }
1674 } else {
1675 fprintf(stderr,"Value %s for option %s is not a valid float\n",string,option);
1676 exit(9);
1677 return 0.0;
1678 }
1679 }
1680
1681 static char *
check_valid_float_or_int(char * string)1682 check_valid_float_or_int (char *string) {
1683 char *p = string;
1684
1685 if (*p == '+' || *p == '-') {
1686 p++;
1687 }
1688
1689 while (*p != '\0' && isdigit(*p)) {
1690 p++;
1691 }
1692 if (*p == '\0') {
1693 return string;
1694 }
1695
1696 if (*p == '.') {
1697 p++;
1698 }
1699
1700 if (!isdigit(*p)) {
1701 fprintf(stderr,"value %s is not a valid float\n",string);
1702 exit(9);
1703 return NULL;
1704 }
1705 while (*p != '\0' && isdigit(*p)) {
1706 p++;
1707 }
1708
1709 if (*p == 'e') {
1710 p++;
1711 if (*p == '+' || *p == '-') {
1712 p++;
1713 }
1714 if (!isdigit(*p)) {
1715 fprintf(stderr,"value %s is not a valid float\n",string);
1716 exit(9);
1717 return NULL;
1718 }
1719 while (*p != '\0' && isdigit(*p)) {
1720 p++;
1721 }
1722 }
1723
1724 if (*p == '\0') {
1725 return string;
1726 } else {
1727 fprintf(stderr,"value %s is not a valid float\n",string);
1728 exit(9);
1729 return NULL;
1730 }
1731 }
1732
1733
1734 static int
parse_command_line(int argc,char * argv[],int optind)1735 parse_command_line (int argc, char *argv[], int optind) {
1736 int opt, c;
1737 extern char *optarg;
1738 int long_option_index = 0;
1739 const char *long_name;
1740 char **argstart;
1741 /* char *string; */
1742
1743 fprintf(stderr,"GSNAP version %s called with args:",PACKAGE_VERSION);
1744 argstart = &(argv[-optind]);
1745 for (c = 1; c < argc + optind; c++) {
1746 fprintf(stderr," %s",argstart[c]);
1747 }
1748 fprintf(stderr,"\n");
1749
1750 while ((opt = getopt_long(argc,argv,
1751 "C:c:D:d:k:Gq:o:a:N:M:m:i:y:Y:z:Z:w:e:E:J:K:l:g:s:V:v:B:t:A:j:0n:QO",
1752 long_options, &long_option_index)) != -1) {
1753 switch (opt) {
1754 case 0:
1755 long_name = long_options[long_option_index].name;
1756 if (!strcmp(long_name,"version")) {
1757 print_program_version();
1758 return 1;
1759 } else if (!strcmp(long_name,"check")) {
1760 check_compiler_assumptions();
1761 return 1;
1762 } else if (!strcmp(long_name,"help")) {
1763 print_program_usage();
1764 return 1;
1765
1766 } else if (!strcmp(long_name,"use-sarray")) {
1767 fprintf(stderr,"Ignoring the --use-sarray flag, since this version of GSNAP does not use suffix arrays\n");
1768
1769 } else if (!strcmp(long_name,"terminal-threshold")) {
1770 fprintf(stderr,"Ignoring the --terminal-threshold flag, which is obsolete\n");
1771
1772 } else if (!strcmp(long_name,"use-localdb")) {
1773 if (!strcmp(optarg,"1")) {
1774 user_localdb_p = true;
1775 use_localdb_p = true;
1776 } else if (!strcmp(optarg,"0")) {
1777 user_localdb_p = true;
1778 use_localdb_p = false;
1779 } else {
1780 fprintf(stderr,"--use-localdb flag must be 0 or 1\n");
1781 return 9;
1782 }
1783
1784 } else if (!strcmp(long_name,"use-transcriptome-only")) {
1785 use_only_transcriptome_p = true;
1786
1787 } else if (!strcmp(long_name,"use-shared-memory")) {
1788 if (!strcmp(optarg,"1")) {
1789 sharedp = true;
1790 } else if (!strcmp(optarg,"0")) {
1791 sharedp = false;
1792 } else {
1793 fprintf(stderr,"--use-shared-memory flag must be 0 or 1\n");
1794 return 9;
1795 }
1796
1797 } else if (!strcmp(long_name,"preload-shared-memory")) {
1798 preload_shared_memory_p = true;
1799
1800 } else if (!strcmp(long_name,"unload-shared-memory")) {
1801 unload_shared_memory_p = true;
1802
1803 } else if (!strcmp(long_name,"expand-offsets")) {
1804 fprintf(stderr,"Note: --expand-offsets flag is no longer supported. With the latest algorithms, it doesn't improve speed much. Ignoring this flag");
1805
1806 } else if (!strcmp(long_name,"sampling")) {
1807 required_index1interval = atoi(check_valid_int(optarg));
1808
1809 } else if (!strcmp(long_name,"time")) {
1810 timingp = true;
1811
1812 } else if (!strcmp(long_name,"unload")) {
1813 unloadp = true;
1814
1815 #if 0
1816 } else if (!strcmp(long_name,"maxsearch")) {
1817 maxpaths_search = atoi(optarg);
1818 #endif
1819
1820 } else if (!strcmp(long_name,"mode")) {
1821 if (!strcmp(optarg,"standard")) {
1822 mode = STANDARD;
1823 } else if (!strcmp(optarg,"cmet-stranded")) {
1824 mode = CMET_STRANDED;
1825 } else if (!strcmp(optarg,"cmet-nonstranded")) {
1826 mode = CMET_NONSTRANDED;
1827 } else if (!strcmp(optarg,"atoi-stranded")) {
1828 mode = ATOI_STRANDED;
1829 } else if (!strcmp(optarg,"atoi-nonstranded")) {
1830 mode = ATOI_NONSTRANDED;
1831 } else if (!strcmp(optarg,"ttoc-stranded")) {
1832 mode = TTOC_STRANDED;
1833 } else if (!strcmp(optarg,"ttoc-nonstranded")) {
1834 mode = TTOC_NONSTRANDED;
1835 } else {
1836 fprintf(stderr,"--mode must be standard, cmet-stranded, cmet-nonstranded, atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded\n");
1837 return 9;
1838 }
1839
1840 } else if (!strcmp(long_name,"cmetdir")) {
1841 user_modedir = optarg;
1842
1843 } else if (!strcmp(long_name,"atoidir")) {
1844 user_modedir = optarg;
1845
1846 } else if (!strcmp(long_name,"novelend-splicedist")) {
1847 shortsplicedist_novelend = (Chrpos_T) strtoul(optarg,NULL,10);
1848
1849 } else if (!strcmp(long_name,"splicingdir")) {
1850 user_splicingdir = optarg;
1851
1852 } else if (!strcmp(long_name,"ambig-splice-noclip")) {
1853 amb_clip_p = false;
1854
1855 } else if (!strcmp(long_name,"find-dna-chimeras")) {
1856 if (!strcmp(optarg,"1")) {
1857 find_dna_chimeras_p = true;
1858 } else if (!strcmp(optarg,"0")) {
1859 find_dna_chimeras_p = false;
1860 } else {
1861 fprintf(stderr,"--find-dna-chimeras flag must be 0 or 1\n");
1862 exit(9);
1863 }
1864
1865 } else if (!strcmp(long_name,"tallydir")) {
1866 user_tallydir = optarg;
1867
1868 } else if (!strcmp(long_name,"use-tally")) {
1869 tally_root = optarg;
1870
1871 } else if (!strcmp(long_name,"runlengthdir")) {
1872 user_runlengthdir = optarg;
1873
1874 } else if (!strcmp(long_name,"use-runlength")) {
1875 runlength_root = optarg;
1876
1877 } else if (!strcmp(long_name,"max-anchors")) {
1878 max_anchors = atoi(check_valid_int(optarg));
1879
1880 } else if (!strcmp(long_name,"gmap-mode")) {
1881 fprintf(stderr,"Note: --gmap-mode is no longer used in GSNAP starting with 2019 versions. Ignoring.\n");
1882 #if 0
1883 gmap_mode = 0; /* Initialize */
1884 string = strtok(optarg,",");
1885 if (add_gmap_mode(string) != 0) {
1886 while ((string = strtok(NULL,",")) != NULL && add_gmap_mode(string) != 0) {
1887 }
1888 }
1889 #endif
1890
1891 } else if (!strcmp(long_name,"trigger-score-for-gmap")) {
1892 fprintf(stderr,"Note: --trigger-score-for-gmap is now ignored in GSNAP\n");
1893 /* trigger_score_for_gmap = atoi(check_valid_int(optarg)); */
1894
1895 } else if (!strcmp(long_name,"gmap-min-match-length")) {
1896 fprintf(stderr,"Note: --gmap-min-match-length is now ignored in GSNAP\n");
1897 /* gmap_min_nconsecutive = atoi(check_valid_int(optarg)); */
1898
1899 } else if (!strcmp(long_name,"gmap-allowance")) {
1900 fprintf(stderr,"Note: --gmap-allowance is now ignored in GSNAP\n");
1901 /* gmap_allowance = atoi(check_valid_int(optarg)); */
1902
1903 } else if (!strcmp(long_name,"max-gmap-pairsearch")) {
1904 fprintf(stderr,"Note: --max-gmap-pairsearch is now ignored in GSNAP\n");
1905 /* max_gmap_pairsearch = atoi(check_valid_int(optarg)); */
1906
1907 } else if (!strcmp(long_name,"max-gmap-terminal")) {
1908 fprintf(stderr,"Note: --max-gmap-terminal is now ignored in GSNAP\n");
1909 /* max_gmap_terminal = atoi(check_valid_int(optarg)); */
1910
1911 } else if (!strcmp(long_name,"max-gmap-improvement")) {
1912 fprintf(stderr,"Note: --max-gmap-improvement is now ignored in GSNAP\n");
1913 /* max_gmap_improvement = atoi(check_valid_int(optarg)); */
1914
1915 } else if (!strcmp(long_name,"input-buffer-size")) {
1916 inbuffer_nspaces = atoi(check_valid_int(optarg));
1917
1918 } else if (!strcmp(long_name,"output-buffer-size")) {
1919 output_buffer_size = atoi(check_valid_int(optarg));
1920
1921 } else if (!strcmp(long_name,"barcode-length")) {
1922 barcode_length = atoi(check_valid_int(optarg));
1923
1924 } else if (!strcmp(long_name,"endtrim-length")) {
1925 endtrim_length = atoi(check_valid_int(optarg));
1926
1927 } else if (!strcmp(long_name,"fastq-id-start")) {
1928 acc_fieldi_start = atoi(check_valid_int(optarg)) - 1;
1929 if (acc_fieldi_start < 0) {
1930 fprintf(stderr,"Value for fastq-id-start must be 1 or greater\n");
1931 return 9;
1932 }
1933
1934 } else if (!strcmp(long_name,"fastq-id-end")) {
1935 acc_fieldi_end = atoi(check_valid_int(optarg)) - 1;
1936 if (acc_fieldi_end < 0) {
1937 fprintf(stderr,"Value for fastq-id-end must be 1 or greater\n");
1938 return 9;
1939 }
1940
1941 } else if (!strcmp(long_name,"force-single-end")) {
1942 force_single_end_p = true;
1943
1944 } else if (!strcmp(long_name,"filter-chastity")) {
1945 if (!strcmp(optarg,"off")) {
1946 filter_chastity_p = false;
1947 filter_if_both_p = false;
1948 } else if (!strcmp(optarg,"either")) {
1949 filter_chastity_p = true;
1950 filter_if_both_p = false;
1951 } else if (!strcmp(optarg,"both")) {
1952 filter_chastity_p = true;
1953 filter_if_both_p = true;
1954 } else {
1955 fprintf(stderr,"--filter-chastity values allowed: off, either, both\n");
1956 return 9;
1957 }
1958
1959 } else if (!strcmp(long_name,"allow-pe-name-mismatch")) {
1960 allow_paired_end_mismatch_p = true;
1961
1962 } else if (!strcmp(long_name,"read-files-command")) {
1963 read_files_command = optarg;
1964
1965 #ifdef HAVE_ZLIB
1966 } else if (!strcmp(long_name,"gunzip")) {
1967 gunzip_p = true;
1968 #endif
1969
1970 #ifdef HAVE_BZLIB
1971 } else if (!strcmp(long_name,"bunzip2")) {
1972 bunzip2_p = true;
1973 #endif
1974
1975 } else if (!strcmp(long_name,"interleaved")) {
1976 interleavedp = true;
1977
1978 } else if (!strcmp(long_name,"orientation")) {
1979 if (!strcmp(optarg,"FR")) {
1980 invert_first_p = false;
1981 invert_second_p = true;
1982 } else if (!strcmp(optarg,"RF")) {
1983 invert_first_p = true;
1984 invert_second_p = false;
1985 } else if (!strcmp(optarg,"FF")) {
1986 invert_first_p = invert_second_p = false;
1987 } else {
1988 fprintf(stderr,"Currently allowed values for orientation: FR (fwd-rev), RF (rev-fwd) or FF (fwd-fwd)\n");
1989 return 9;
1990 }
1991
1992 } else if (!strcmp(long_name,"split-output")) {
1993 split_output_root = optarg;
1994
1995 } else if (!strcmp(long_name,"failed-input")) {
1996 failedinput_root = optarg;
1997
1998 } else if (!strcmp(long_name,"append-output")) {
1999 appendp = true;
2000
2001 } else if (!strcmp(long_name,"order-among-best")) {
2002 if (!strcmp(optarg,"genomic")) {
2003 want_random_p = false;
2004 } else if (!strcmp(optarg,"random")) {
2005 want_random_p = true;
2006 } else {
2007 fprintf(stderr,"--order-among-best values allowed: genomic, random (default)\n");
2008 return 9;
2009 }
2010
2011 } else if (!strcmp(long_name,"pairmax-dna")) {
2012 pairmax_dna = atoi(check_valid_int(optarg));
2013
2014 } else if (!strcmp(long_name,"pairmax-rna")) {
2015 pairmax_rna = atoi(check_valid_int(optarg));
2016
2017 } else if (!strcmp(long_name,"pairexpect")) {
2018 expected_pairlength = atoi(check_valid_int(optarg));
2019
2020 } else if (!strcmp(long_name,"pairdev")) {
2021 pairlength_deviation = atoi(check_valid_int(optarg));
2022
2023 } else if (!strcmp(long_name,"max-ref-mismatches")) {
2024 user_mismatches_ref_float = atof(check_valid_float_or_int(optarg));
2025 if (user_mismatches_ref_float > 1.0 && user_mismatches_ref_float != rint(user_mismatches_ref_float)) {
2026 fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mismatches_ref_float);
2027 return 9;
2028 }
2029
2030 } else if (!strcmp(long_name,"min-coverage")) {
2031 user_mincoverage_float = atof(check_valid_float_or_int(optarg));
2032 if (user_mincoverage_float > 1.0 && user_mincoverage_float != rint(user_mincoverage_float)) {
2033 fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mincoverage_float);
2034 return 9;
2035 }
2036
2037 } else if (!strcmp(long_name,"indel-endlength")) {
2038 min_indel_end_matches = atoi(check_valid_int(optarg));
2039 if (min_indel_end_matches > 14) {
2040 allow_end_indels_p = false;
2041 }
2042
2043 } else if (!strcmp(long_name,"antistranded-penalty")) {
2044 antistranded_penalty = atoi(check_valid_int(optarg));
2045
2046 } else if (!strcmp(long_name,"favor-multiexon")) {
2047 favor_multiexon_p = true;
2048
2049 } else if (!strcmp(long_name,"end-detail")) {
2050 fprintf(stderr,"--end-detail is now deprecated. GSNAP is now tuned to trim well at the ends\n");
2051
2052 } else if (!strcmp(long_name,"merge-distant-samechr")) {
2053 merge_samechr_p = true;
2054
2055 } else if (!strcmp(long_name,"query-unk-mismatch")) {
2056 if (!strcmp(optarg,"1")) {
2057 query_unk_mismatch_p = true;
2058 } else if (!strcmp(optarg,"0")) {
2059 query_unk_mismatch_p = false;
2060 } else {
2061 fprintf(stderr,"--query-unk-mismatch flag must be 0 or 1\n");
2062 return 9;
2063 }
2064
2065 } else if (!strcmp(long_name,"genome-unk-mismatch")) {
2066 if (!strcmp(optarg,"1")) {
2067 genome_unk_mismatch_p = true;
2068 } else if (!strcmp(optarg,"0")) {
2069 genome_unk_mismatch_p = false;
2070 } else {
2071 fprintf(stderr,"--genome-unk-mismatch flag must be 0 or 1\n");
2072 return 9;
2073 }
2074
2075 } else if (!strcmp(long_name,"trim-mismatch-score")) {
2076 fprintf(stderr,"--trim-mismatch-score is now deprecated. GSNAP is now tuned to trim well at the ends\n");
2077 /* trim_mismatch_score = atoi(check_valid_int(optarg)); */
2078 /* user_trim_mismatch_score_p = true; */
2079
2080 } else if (!strcmp(long_name,"trim-indel-score")) {
2081 trim_indel_score = atoi(check_valid_int(optarg));
2082 user_trim_indel_score_p = true;
2083
2084 } else if (!strcmp(long_name,"local-splice-penalty")) {
2085 localsplicing_penalty = atoi(check_valid_int(optarg));
2086
2087 } else if (!strcmp(long_name,"distant-splice-penalty")) {
2088 distantsplicing_penalty = atoi(check_valid_int(optarg));
2089
2090 } else if (!strcmp(long_name,"distant-splice-identity")) {
2091 min_distantsplicing_identity = check_valid_float(optarg,long_name);
2092
2093 } else if (!strcmp(long_name,"force-xs-dir")) {
2094 force_xs_direction_p = true;
2095
2096 } else if (!strcmp(long_name,"show-refdiff")) {
2097 show_refdiff_p = true;
2098
2099 } else if (!strcmp(long_name,"clip-overlap")) {
2100 clip_overlap_p = true;
2101
2102 } else if (!strcmp(long_name,"merge-overlap")) {
2103 merge_overlap_p = true;
2104
2105 } else if (!strcmp(long_name,"show-method")) {
2106 method_print_p = true;
2107
2108 } else if (!strcmp(long_name,"no-sam-headers")) {
2109 sam_headers_p = false;
2110
2111 } else if (!strcmp(long_name,"add-paired-nomappers")) {
2112 add_paired_nomappers_p = true;
2113
2114 } else if (!strcmp(long_name,"paired-flag-means-concordant")) {
2115 if (!strcmp(optarg,"1")) {
2116 paired_flag_means_concordant_p = true;
2117 } else if (!strcmp(optarg,"0")) {
2118 paired_flag_means_concordant_p = false; /* Default */
2119 } else {
2120 fprintf(stderr,"--paired-flag-means-concordant flag must be 0 or 1\n");
2121 return 9;
2122 }
2123
2124 } else if (!strcmp(long_name,"sam-headers-batch")) {
2125 sam_headers_batch = atoi(check_valid_int(optarg));
2126
2127 } else if (!strcmp(long_name,"sam-hardclip-use-S")) {
2128 sam_hardclip_use_S_p = true;
2129
2130 } else if (!strcmp(long_name,"sam-use-0M")) {
2131 sam_insert_0M_p = true;
2132
2133 } else if (!strcmp(long_name,"sam-extended-cigar")) {
2134 sam_cigar_extended_p = true;
2135
2136 } else if (!strcmp(long_name,"sam-multiple-primaries")) {
2137 sam_multiple_primaries_p = true;
2138
2139 } else if (!strcmp(long_name,"sam-sparse-secondaries")) {
2140 sam_sparse_secondaries_p = true;
2141
2142 } else if (!strcmp(long_name,"quality-protocol")) {
2143 if (user_quality_score_adj == true) {
2144 fprintf(stderr,"Cannot specify both -J (--quality-zero-score) and --quality-protocol\n");
2145 return 9;
2146 } else if (user_quality_shift == true) {
2147 fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
2148 return 9;
2149 } else if (!strcmp(optarg,"illumina")) {
2150 MAPQ_init(/*quality_score_adj*/64);
2151 user_quality_score_adj = true;
2152 quality_shift = -31;
2153 user_quality_shift = true;
2154 } else if (!strcmp(optarg,"sanger")) {
2155 MAPQ_init(/*quality_score_adj*/33);
2156 user_quality_score_adj = true;
2157 quality_shift = 0;
2158 user_quality_shift = true;
2159 } else {
2160 fprintf(stderr,"The only values allowed for --quality-protocol are illumina or sanger\n");
2161 return 9;
2162 }
2163
2164 } else if (!strcmp(long_name,"md-lowercase-snp")) {
2165 md_lowercase_variant_p = true;
2166
2167 } else if (!strcmp(long_name,"extend-soft-clips")) {
2168 hide_soft_clips_p = true;
2169
2170 } else if (!strcmp(long_name,"action-if-cigar-error")) {
2171 if (!strcmp(optarg,"ignore")) {
2172 cigar_action = CIGAR_ACTION_IGNORE;
2173 } else if (!strcmp(optarg,"warning")) {
2174 cigar_action = CIGAR_ACTION_WARNING;
2175 } else if (!strcmp(optarg,"noprint")) {
2176 cigar_action = CIGAR_ACTION_NOPRINT;
2177 } else if (!strcmp(optarg,"abort")) {
2178 cigar_action = CIGAR_ACTION_ABORT;
2179 } else {
2180 fprintf(stderr,"The only values allowed for --action-if-cigar-error are ignore, warning, noprint, abort\n");
2181 return 9;
2182 }
2183
2184 } else if (!strcmp(long_name,"read-group-id")) {
2185 sam_read_group_id = optarg;
2186
2187 } else if (!strcmp(long_name,"read-group-name")) {
2188 sam_read_group_name = optarg;
2189
2190 } else if (!strcmp(long_name,"read-group-library")) {
2191 sam_read_group_library = optarg;
2192
2193 } else if (!strcmp(long_name,"read-group-platform")) {
2194 sam_read_group_platform = optarg;
2195
2196 #ifdef USE_MPI
2197 } else if (!strcmp(long_name,"master-is-worker")) {
2198 if (!strcmp(optarg,"1")) {
2199 master_is_worker_p = true;
2200 } else if (!strcmp(optarg,"0")) {
2201 master_is_worker_p = false; /* Default */
2202 } else {
2203 fprintf(stderr,"--master-is-worker flag must be 0 or 1\n");
2204 return 9;
2205 }
2206 #endif
2207
2208 } else if (!strcmp(long_name,"print-snps")) {
2209 print_snplabels_p = true;
2210
2211 } else if (!strcmp(long_name,"failsonly")) {
2212 if (nofailsp == true) {
2213 fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
2214 return 9;
2215 } else {
2216 failsonlyp = true;
2217 }
2218 } else if (!strcmp(long_name,"nofails")) {
2219 if (failsonlyp == true) {
2220 fprintf(stderr,"Cannot specify both --nofails and --failsonly\n");
2221 return 9;
2222 } else {
2223 nofailsp = true;
2224 }
2225
2226 } else if (!strcmp(long_name,"ignore-trim-in-filtering")) {
2227 if (!strcmp(optarg,"0")) {
2228 user_filter_within_trims_p = false;
2229 specified_filter_within_trims_p = true;
2230 } else if (!strcmp(optarg,"1")) {
2231 user_filter_within_trims_p = true;
2232 specified_filter_within_trims_p = true;
2233 } else {
2234 fprintf(stderr,"The only values allowed for --ignore-trim-in-filtering are 0, 1\n");
2235 return 9;
2236 }
2237
2238 } else if (!strcmp(long_name,"omit-concordant-uniq")) {
2239 omit_concordant_uniq_p = true;
2240 } else if (!strcmp(long_name,"omit-concordant-mult")) {
2241 omit_concordant_mult_p = true;
2242
2243 } else {
2244 /* Shouldn't reach here */
2245 fprintf(stderr,"Don't recognize option %s. For usage, run 'gsnap --help'",long_name);
2246 return 9;
2247 }
2248 break;
2249
2250 case 'C': user_transcriptomedir = optarg; break;
2251 case 'c': transcriptome_dbroot = optarg; break;
2252
2253 case 'D': user_genomedir = optarg; break;
2254 case 'd': genome_dbroot = optarg; break;
2255
2256 case 'k':
2257 required_index1part = atoi(check_valid_int(optarg));
2258 if (required_index1part > MAXIMUM_KMER) {
2259 fprintf(stderr,"The value for k-mer size must be %d or less\n",MAXIMUM_KMER);
2260 return 9;
2261 }
2262 break;
2263
2264 case 'G': uncompressedp = true; break;
2265
2266 case 'q': parse_part(&part_modulus,&part_interval,optarg); break;
2267 case 'o': output_file = optarg; break;
2268
2269 case 'a':
2270 if (!strcmp(optarg,"paired")) {
2271 chop_primers_p = true;
2272 } else if (!strcmp(optarg,"off")) {
2273 chop_primers_p = false;
2274 } else {
2275 fprintf(stderr,"Currently allowed values for adapter stripping (-a): off, paired\n");
2276 return 9;
2277 }
2278 break;
2279
2280 case 'N':
2281 if (!strcmp(optarg,"1")) {
2282 novelsplicingp = true;
2283 } else if (!strcmp(optarg,"0")) {
2284 novelsplicingp = false;
2285 } else {
2286 fprintf(stderr,"Novel splicing (-N flag) must be 0 or 1\n");
2287 return 9;
2288 }
2289 break;
2290
2291 #if 0
2292 case 'R':
2293 if (!strcmp(optarg,"0")) {
2294 masktype = MASK_NONE;
2295 } else if (!strcmp(optarg,"1")) {
2296 masktype = MASK_FREQUENT;
2297 } else if (!strcmp(optarg,"2")) {
2298 masktype = MASK_REPETITIVE;
2299 } else if (!strcmp(optarg,"3")) {
2300 masktype = MASK_GREEDY_FREQUENT;
2301 } else if (!strcmp(optarg,"4")) {
2302 masktype = MASK_GREEDY_REPETITIVE;
2303 } else {
2304 fprintf(stderr,"Masking mode %s not recognized.\n",optarg);
2305 fprintf(stderr,"Mode 0 means no masking, mode 1 masks frequent oligomers;\n");
2306 fprintf(stderr," mode 2 masks frequent and repetitive oligomers;\n");
2307 fprintf(stderr," mode 3 does greedy masking of frequent oligomers,\n");
2308 fprintf(stderr," then no masking if necessary;\n");
2309 fprintf(stderr," mode 4 does greedy masking of frequent and repetitive oligomers,\n");
2310 fprintf(stderr," then no masking if necessary.\n");
2311 return 9;
2312 }
2313 break;
2314 #endif
2315
2316 case 'M': subopt_levels = atoi(check_valid_int(optarg)); break;
2317 case 'm':
2318 user_mismatches_refalt_float = atof(check_valid_float_or_int(optarg));
2319 if (user_mismatches_refalt_float > 1.0 && user_mismatches_refalt_float != rint(user_mismatches_refalt_float)) {
2320 fprintf(stderr,"Cannot specify fractional value %f for --max-mismatches except between 0.0 and 1.0\n",user_mismatches_refalt_float);
2321 return 9;
2322 }
2323 break;
2324
2325 case 'i': indel_penalty_middle = indel_penalty_end = atoi(check_valid_int(optarg)); break;
2326
2327 case 'y':
2328 max_middle_insertions_float = atof(check_valid_float_or_int(optarg));
2329 if (max_middle_insertions_float > 1.0 && max_middle_insertions_float != rint(max_middle_insertions_float)) {
2330 fprintf(stderr,"Cannot specify fractional value %f for --max-middle-insertions except between 0.0 and 1.0\n",max_middle_insertions_float);
2331 return 9;
2332 }
2333 break;
2334
2335 case 'z':
2336 max_middle_deletions_float = atof(check_valid_float_or_int(optarg));
2337 if (max_middle_deletions_float > 1.0 && max_middle_deletions_float != rint(max_middle_deletions_float)) {
2338 fprintf(stderr,"Cannot specify fractional value %f for --max-middle-deletions except between 0.0 and 1.0\n",max_middle_deletions_float);
2339 return 9;
2340 }
2341 break;
2342
2343 case 'Y': max_end_insertions = atoi(check_valid_int(optarg)); break;
2344 case 'Z': max_end_deletions = atoi(check_valid_int(optarg)); break;
2345
2346 case 'w': shortsplicedist = strtoul(optarg,NULL,10); break;
2347
2348 case 'K': min_distantsplicing_end_matches = atoi(check_valid_int(optarg)); break;
2349 case 'l': min_shortend = atoi(check_valid_int(optarg)); break;
2350
2351 case 'g': genes_file = optarg; break;
2352
2353 case 's':
2354 splicing_file = optarg;
2355 knownsplicingp = true;
2356 break;
2357
2358 case 'e':
2359 masked_root = optarg;
2360 genome_unk_mismatch_p = false;
2361 maskedp = true;
2362 print_nsnpdiffs_p = true;
2363 break;
2364
2365 case 'E':
2366 fprintf(stderr,"In the future, please use --distant-splice-penalty instead of -E, which may be reassigned\n");
2367 distantsplicing_penalty = atoi(check_valid_int(optarg));
2368 break;
2369
2370 case 'V': user_snpsdir = optarg; break;
2371 case 'v': snps_root = optarg; break;
2372
2373 case 'B':
2374 if (!strcmp(optarg,"5")) {
2375 #if 0
2376 /* Not true. -B 5 allocates suffix array and suffix aux files */
2377 fprintf(stderr,"Note: Batch mode 5 is now the same as batch mode 4.\n");
2378 fprintf(stderr,"Expansion of offsets is now controlled separately by --expand-offsets (default=0).\n");
2379 #endif
2380 offsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
2381 positions_access = USE_ALLOCATE;
2382 locoffsetsstrm_access = USE_ALLOCATE; /* Doesn't matter */
2383 locpositions_access = USE_ALLOCATE;
2384
2385 genome_access = USE_ALLOCATE;
2386
2387 #ifdef HAVE_MMAP
2388 } else if (!strcmp(optarg,"4")) {
2389 offsetsstrm_access = USE_ALLOCATE;
2390 positions_access = USE_ALLOCATE;
2391 locoffsetsstrm_access = USE_ALLOCATE;
2392 locpositions_access = USE_ALLOCATE;
2393
2394 genome_access = USE_ALLOCATE;
2395
2396 } else if (!strcmp(optarg,"3")) {
2397 offsetsstrm_access = USE_ALLOCATE;
2398 positions_access = USE_ALLOCATE;
2399 locoffsetsstrm_access = USE_ALLOCATE;
2400 locpositions_access = USE_ALLOCATE;
2401
2402 genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
2403
2404 } else if (!strcmp(optarg,"2")) {
2405 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2406 positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2407 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2408 locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2409
2410 genome_access = USE_MMAP_PRELOAD; /* was batch_genome_p = true */
2411
2412 } else if (!strcmp(optarg,"1")) {
2413 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2414 positions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2415 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2416 locpositions_access = USE_MMAP_PRELOAD; /* was batch_positions_p = true */
2417
2418 genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
2419
2420 } else if (!strcmp(optarg,"0")) {
2421 offsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2422 positions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
2423 locoffsetsstrm_access = USE_ALLOCATE; /* was batch_offsets_p = true */
2424 locpositions_access = USE_MMAP_ONLY; /* was batch_positions_p = false */
2425
2426 genome_access = USE_MMAP_ONLY; /* was batch_genome_p = false */
2427
2428 #endif
2429 } else {
2430 #ifdef HAVE_MMAP
2431 fprintf(stderr,"Batch mode %s not recognized. Only allow 0-5. Run 'gsnap --help' for more information.\n",optarg);
2432 #else
2433 fprintf(stderr,"Batch mode %s not recognized. Only allow 4-5, since mmap is disabled. Run 'gsnap --help' for more information.\n",optarg);
2434 #endif
2435 return 9;
2436 }
2437 break;
2438
2439 #if defined(HAVE_PTHREAD)
2440 case 't': nthreads = atoi(check_valid_int(optarg)); break;
2441 #else
2442 case 't': fprintf(stderr,"This version of GSNAP has pthreads disabled, so ignoring the value of %s for -t\n",optarg); break;
2443 #endif
2444
2445 case 'A':
2446 if (!strcmp(optarg,"standard")) {
2447 output_type = STD_OUTPUT;
2448 } else if (!strcmp(optarg,"sam")) {
2449 output_type = SAM_OUTPUT;
2450 } else if (!strcmp(optarg,"m8")) {
2451 output_type = M8_OUTPUT;
2452 } else {
2453 fprintf(stderr,"Output format %s not recognized. Allowed values: standard (default), sam, m8\n",optarg);
2454 return 9;
2455 }
2456 break;
2457
2458 case 'j':
2459 if (user_quality_shift == true) {
2460 fprintf(stderr,"Cannot specify both -j (--quality-print-shift) and --quality-protocol\n");
2461 return 9;
2462 } else {
2463 quality_shift = atoi(check_valid_int(optarg));
2464 user_quality_shift = true;
2465 }
2466 break;
2467
2468 case 'J':
2469 if (user_quality_score_adj == true) {
2470 fprintf(stderr,"Cannot specify both -J (--quality-zero-score) and --quality-protocol\n");
2471 return 9;
2472 } else {
2473 MAPQ_init(/*quality_score_adj*/atoi(check_valid_int(optarg)));
2474 user_quality_score_adj = true;
2475 }
2476 break;
2477
2478 case '0': exception_raise_p = false; break; /* Allows signals to pass through */
2479 case 'n': maxpaths_report = atoi(check_valid_int(optarg)); break;
2480 case 'Q': quiet_if_excessive_p = true; break;
2481
2482 case 'O': orderedp = true; break;
2483
2484 case '?': fprintf(stderr,"For usage, run 'gsnap --help'\n"); return 9;
2485 default: return 9;
2486 }
2487 }
2488
2489 /* Make inferences */
2490 if (genome_dbroot == NULL) {
2491 fprintf(stderr,"Need to specify the -d flag. For usage, run 'gsnap --help'\n");
2492 /* print_program_usage(); */
2493 return 9;
2494 }
2495
2496 if (acc_fieldi_end < acc_fieldi_start) {
2497 fprintf(stderr,"--fastq-id-end must be equal to or greater than --fastq-id-start\n");
2498 return 9;
2499 }
2500
2501 if (clip_overlap_p == true && merge_overlap_p == true) {
2502 fprintf(stderr,"Cannot specify both --clip-overlap and --merge-overlap. Please choose one.\n");
2503 return 9;
2504 }
2505
2506 if (masked_root != NULL && genome_unk_mismatch_p == true) {
2507 fprintf(stderr,"Unexpected combination of --use-mask and --genome-unk-mismatch=true. Are you sure?\n");
2508 }
2509
2510 if (transcriptome_dbroot == NULL && use_only_transcriptome_p == true) {
2511 fprintf(stderr,"Flag --use-transcriptome-only was given, but no transcriptdb was specified. Ignoring this flag\n");
2512 }
2513
2514 if (transcriptome_dbroot != NULL) {
2515 if (novelsplicingp == false) {
2516 fprintf(stderr,"Transcriptome specified => assume reads are RNA-Seq. Turning on novel splicing.\n");
2517 novelsplicingp = true;
2518 } else {
2519 fprintf(stderr,"Transcriptome specified => assume reads are RNA-Seq\n");
2520 }
2521 pairmax_transcriptome = pairmax_dna;
2522 pairmax_linear = pairmax_rna;
2523 pairmax_circular = pairmax_dna;
2524 shortsplicedist_known = shortsplicedist;
2525
2526 if (specified_filter_within_trims_p == true) {
2527 filter_within_trims_p = user_filter_within_trims_p;
2528 } else {
2529 filter_within_trims_p = true; /* Use trims by default for RNA-Seq */
2530 }
2531
2532 if (user_localdb_p == false) {
2533 /* User did not specify whether to use localdb, so using it by default */
2534 use_localdb_p = true;
2535 }
2536
2537 #if 0
2538 } else if (find_dna_chimeras_p == true) {
2539 /* DNA-Seq, but need to trim to find chimeras */
2540 fprintf(stderr,"Neither novel splicing (-N) nor known splicing (-s) turned on => assume reads are DNA-Seq (genomic)\n");
2541 pairmax_transcriptome = 0;
2542 pairmax_linear = pairmax_dna;
2543 pairmax_circular = pairmax_dna;
2544 shortsplicedist = shortsplicedist_known = 0U;
2545 shortsplicedist_novelend = 0U;
2546
2547 if (specified_filter_within_trims_p == true) {
2548 filter_within_trims_p = user_filter_within_trims_p;
2549 } else {
2550 filter_within_trims_p = true; /* Use trims by default for DNA-Seq */
2551 }
2552 #endif
2553
2554 } else if (novelsplicingp == true && knownsplicingp == true) {
2555 fprintf(stderr,"Novel splicing (-N) and known splicing (-s) both turned on => assume reads are RNA-Seq\n");
2556 pairmax_transcriptome = 0;
2557 pairmax_linear = pairmax_rna;
2558 pairmax_circular = pairmax_dna;
2559 shortsplicedist_known = shortsplicedist;
2560
2561 if (specified_filter_within_trims_p == true) {
2562 filter_within_trims_p = user_filter_within_trims_p;
2563 } else {
2564 filter_within_trims_p = true; /* Use trims by default for RNA-Seq */
2565 }
2566
2567 if (user_localdb_p == false) {
2568 /* User did not specify whether to use localdb, so using it by default */
2569 use_localdb_p = true;
2570 }
2571
2572 } else if (knownsplicingp == true) {
2573 fprintf(stderr,"Known splicing (-s) turned on => assume reads are RNA-Seq\n");
2574 pairmax_transcriptome = 0;
2575 pairmax_linear = pairmax_rna;
2576 pairmax_circular = pairmax_dna;
2577 shortsplicedist_known = shortsplicedist;
2578
2579 if (specified_filter_within_trims_p == true) {
2580 filter_within_trims_p = user_filter_within_trims_p;
2581 } else {
2582 filter_within_trims_p = true; /* Use trims by default for RNA-Seq */
2583 }
2584
2585 if (user_localdb_p == false) {
2586 /* User did not specify whether to use localdb, so using it by default */
2587 use_localdb_p = true;
2588 }
2589
2590 } else if (novelsplicingp == true) {
2591 fprintf(stderr,"Novel splicing (-N) turned on => assume reads are RNA-Seq\n");
2592 pairmax_transcriptome = 0;
2593 pairmax_linear = pairmax_rna;
2594 pairmax_circular = pairmax_dna;
2595 shortsplicedist_known = 0;
2596
2597 if (specified_filter_within_trims_p == true) {
2598 filter_within_trims_p = user_filter_within_trims_p;
2599 } else {
2600 filter_within_trims_p = true; /* Use trims by default for RNA-Seq */
2601 }
2602
2603 if (user_localdb_p == false) {
2604 /* User did not specify whether to use localdb, so using it by default */
2605 use_localdb_p = true;
2606 }
2607
2608 } else {
2609 /* Straight DNA-Seq */
2610 fprintf(stderr,"Neither novel splicing (-N) nor known splicing (-s) turned on => assume reads are DNA-Seq (genomic)\n");
2611 pairmax_transcriptome = 0;
2612 pairmax_linear = pairmax_dna;
2613 pairmax_circular = pairmax_dna;
2614 shortsplicedist = shortsplicedist_known = 0U;
2615 shortsplicedist_novelend = 0U;
2616
2617 #if 0
2618 /* This ignores datasets where sequence ends are indeed bad, or have primers. User can set values to 0 if desired */
2619 if (user_trim_mismatch_score_p == false) {
2620 trim_mismatch_score = 0;
2621 }
2622 if (user_trim_indel_score_p == false) {
2623 trim_indel_score = 0;
2624 }
2625 #endif
2626
2627 if (specified_filter_within_trims_p == true) {
2628 filter_within_trims_p = user_filter_within_trims_p;
2629 } else {
2630 filter_within_trims_p = false; /* Ignore trims by default (i.e., extend to ends) for DNA-Seq */
2631 }
2632
2633 if (user_localdb_p == false) {
2634 /* User did not specify whether to use localdb, so not using it by default for DNA-seq */
2635 use_localdb_p = false;
2636 }
2637
2638 #if 0
2639 /* Large soft-clips not expected for DNA-Seq, so require good alignments */
2640 if (user_mismatches_refalt_float < 0.0) {
2641 user_mismatches_refalt_float = 0.10;
2642 }
2643 #endif
2644 }
2645
2646
2647 if (shortsplicedist_novelend > shortsplicedist) {
2648 fprintf(stderr,"The novelend-splicedist %d is greater than the localsplicedist %d. Resetting novelend-splicedist to be %d\n",
2649 shortsplicedist_novelend,shortsplicedist,shortsplicedist);
2650 shortsplicedist_novelend = shortsplicedist;
2651 }
2652
2653 if (distantsplicing_penalty < localsplicing_penalty) {
2654 fprintf(stderr,"The distant splicing penalty %d cannot be less than local splicing penalty %d\n",
2655 distantsplicing_penalty,localsplicing_penalty);
2656 return 9;
2657 }
2658
2659 if (sam_headers_batch >= 0) {
2660 if (part_modulus == sam_headers_batch) {
2661 sam_headers_p = true;
2662 } else {
2663 sam_headers_p = false;
2664 }
2665 }
2666
2667 if (sam_read_group_id == NULL && sam_read_group_name != NULL) {
2668 sam_read_group_id = sam_read_group_name;
2669 } else if (sam_read_group_id != NULL && sam_read_group_name == NULL) {
2670 sam_read_group_name = sam_read_group_id;
2671 }
2672
2673 if (chop_primers_p == true) {
2674 if (invert_first_p == false && invert_second_p == true) {
2675 /* orientation FR */
2676 } else {
2677 fprintf(stderr,"Adapter stripping not currently implemented for given orientation\n");
2678 return 9;
2679 }
2680 }
2681
2682 #ifdef USE_MPI
2683 /* Code does allow for MPI output to stdout, but appears not to work
2684 yet, and may not work if rank 0 is also a worker */
2685 if (split_output_root == NULL && output_file == NULL) {
2686 fprintf(stderr,"For MPI version, need to specify either --split-output or --output-file\n");
2687 return 9;
2688 }
2689 #endif
2690
2691 return 0;
2692 }
2693
2694
2695 static bool
open_input_streams_parser(int * nextchar,int * nchars1,int * nchars2,char *** files,int * nfiles,FILE ** input,FILE ** input2,gzFile * gzipped,gzFile * gzipped2,Bzip2_T * bzipped,Bzip2_T * bzipped2,char * read_files_command,bool gunzip_p,bool bunzip2_p,bool interleavedp,int argc,char ** argv)2696 open_input_streams_parser (int *nextchar, int *nchars1, int *nchars2, char ***files, int *nfiles,
2697 FILE **input, FILE **input2,
2698 #ifdef HAVE_ZLIB
2699 gzFile *gzipped, gzFile *gzipped2,
2700 #endif
2701 #ifdef HAVE_BZLIB
2702 Bzip2_T *bzipped, Bzip2_T *bzipped2,
2703 #endif
2704 char *read_files_command, bool gunzip_p, bool bunzip2_p, bool interleavedp,
2705 int argc, char **argv) {
2706 bool fastq_format_p = false;
2707
2708 *input = *input2 = NULL;
2709 #ifdef HAVE_ZLIB
2710 *gzipped = *gzipped2 = NULL;
2711 #endif
2712 #ifdef HAVE_BZLIB
2713 *bzipped = *bzipped2 = NULL;
2714 #endif
2715
2716 /* Open input stream and peek at first char */
2717 if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
2718 /* Ignore any input */
2719 *input = stdin;
2720 *files = (char **) NULL;
2721 *nfiles = 0;
2722 *nextchar = EOF;
2723 return false;
2724
2725 } else if (argc == 0) {
2726 #ifdef USE_MPI
2727 fprintf(stderr,"For mpi_gsnap, cannot read from stdin\n");
2728 exit(9);
2729 #else
2730 fprintf(stderr,"Reading from stdin\n");
2731 *input = stdin;
2732 *files = (char **) NULL;
2733 *nfiles = 0;
2734 if (interleavedp == true) {
2735 *nextchar = '\0';
2736 } else {
2737 *nextchar = Shortread_input_init(&(*nchars1),*input);
2738 }
2739 #endif
2740
2741 } else {
2742 *files = argv;
2743 *nfiles = argc;
2744
2745 if (gunzip_p == true) {
2746 #ifdef HAVE_ZLIB
2747 if ((*gzipped = gzopen((*files)[0],"rb")) == NULL) {
2748 fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2749 exit(9);
2750 } else {
2751 #ifdef HAVE_ZLIB_GZBUFFER
2752 gzbuffer(*gzipped,GZBUFFER_SIZE);
2753 #endif
2754 if (interleavedp == true) {
2755 *nextchar = '\0';
2756 } else {
2757 *nextchar = Shortread_input_init_gzip(*gzipped);
2758 }
2759 }
2760 #endif
2761
2762 } else if (bunzip2_p == true) {
2763 #ifdef HAVE_BZLIB
2764 if ((*bzipped = Bzip2_new((*files)[0])) == NULL) {
2765 fprintf(stderr,"Cannot open bzipped file %s\n",(*files)[0]);
2766 exit(9);
2767 } else {
2768 if (interleavedp == true) {
2769 *nextchar = '\0';
2770 } else {
2771 *nextchar = Shortread_input_init_bzip2(*bzipped);
2772 }
2773 }
2774 #endif
2775
2776 } else {
2777 if ((*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2778 fprintf(stderr,"Unable to read input\n");
2779 exit(9);
2780 } else {
2781 debugf(fprintf(stderr,"Master opening file %s using fopen for input\n",(*files)[0]));
2782 if (interleavedp == true) {
2783 *nextchar = '\0';
2784 } else {
2785 *nextchar = Shortread_input_init(&(*nchars1),*input);
2786 }
2787 }
2788 }
2789
2790 (*files)++;
2791 (*nfiles)--;
2792 }
2793
2794 if (interleavedp == true) {
2795 /* Read one line at a time */
2796 return /*fastq_format_p*/false;
2797
2798 } else if (*nextchar == EOF) {
2799 fprintf(stderr,"Warning: input is empty\n");
2800 exit(0);
2801
2802 } else if (*nextchar == '@') {
2803 /* Looks like a FASTQ file */
2804 if (*nfiles == 0 || force_single_end_p == true) {
2805 #ifdef HAVE_ZLIB
2806 *gzipped2 = (gzFile) NULL;
2807 #endif
2808 #ifdef HAVE_BZLIB
2809 *bzipped2 = (Bzip2_T) NULL;
2810 #endif
2811 *input2 = (FILE *) NULL;
2812 } else {
2813 if (gunzip_p == true) {
2814 #ifdef HAVE_ZLIB
2815 if ((*gzipped2 = gzopen((*files)[0],"rb")) == NULL) {
2816 fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2817 exit(9);
2818 } else {
2819 #ifdef HAVE_ZLIB_GZBUFFER
2820 gzbuffer(*gzipped2,GZBUFFER_SIZE);
2821 #endif
2822 /* nextchar2 = */ Shortread_input_init_gzip(*gzipped2);
2823 }
2824 #endif
2825
2826 } else if (bunzip2_p == true) {
2827 #ifdef HAVE_BZLIB
2828 if ((*bzipped2 = Bzip2_new((*files)[0])) == NULL) {
2829 fprintf(stderr,"Cannot open bzip2 file %s\n",(*files)[0]);
2830 exit(9);
2831 } else {
2832 /* nextchar2 = */ Shortread_input_init_bzip2(*bzipped2);
2833 }
2834 #endif
2835
2836 } else {
2837 if ((*input2 = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2838 fprintf(stderr,"Unable to read input\n");
2839 exit(9);
2840 } else {
2841 debugf(fprintf(stderr,"Master opening file %s using fopen for input2\n",(*files)[0]));
2842 /* nextchar2 = */ Shortread_input_init(&(*nchars2),*input2);
2843 }
2844 }
2845 (*files)++;
2846 (*nfiles)--;
2847 }
2848 fastq_format_p = true;
2849
2850 } else if (*nextchar == '>') {
2851 /* Looks like a FASTA file */
2852
2853 } else {
2854 fprintf(stderr,"First char is %c. Expecting either '>' for FASTA or '@' for FASTQ format.\n",*nextchar);
2855 exit(9);
2856 }
2857
2858 return fastq_format_p;
2859 }
2860
2861
2862 #ifdef USE_MPI
2863 static void
open_input_streams_worker(char *** files,int * nfiles,MPI_File * input,MPI_File * input2,MPI_Comm workers_comm,gzFile * gzipped,gzFile * gzipped2,Bzip2_T * bzipped,Bzip2_T * bzipped2,char * read_files_command,bool gunzip_p,bool bunzip2_p,bool interleavedp,bool fastq_format_p,int argc,char ** argv)2864 open_input_streams_worker (char ***files, int *nfiles,
2865 #if defined(USE_MPI_FILE_INPUT)
2866 MPI_File *input, MPI_File *input2, MPI_Comm workers_comm,
2867 #else
2868 FILE **input, FILE **input2,
2869 #endif
2870 #ifdef HAVE_ZLIB
2871 gzFile *gzipped, gzFile *gzipped2,
2872 #endif
2873 #ifdef HAVE_BZLIB
2874 Bzip2_T *bzipped, Bzip2_T *bzipped2,
2875 #endif
2876 char *read_files_command, bool gunzip_p, bool bunzip2_p,
2877 bool interleavedp, bool fastq_format_p, int argc, char **argv) {
2878
2879 *input = *input2 = NULL;
2880 #ifdef HAVE_ZLIB
2881 *gzipped = *gzipped2 = NULL;
2882 #endif
2883 #ifdef HAVE_BZLIB
2884 *bzipped = *bzipped2 = NULL;
2885 #endif
2886
2887 /* Open input stream and peek at first char */
2888 if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
2889 /* Ignore input */
2890
2891 } else if (argc == 0) {
2892 fprintf(stderr,"For mpi_gsnap, cannot read from stdin\n");
2893 exit(9);
2894
2895 } else {
2896 *files = argv;
2897 *nfiles = argc;
2898
2899 if (gunzip_p == true) {
2900 #ifdef HAVE_ZLIB
2901 if ((*gzipped = gzopen((*files)[0],"rb")) == NULL) {
2902 fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2903 exit(9);
2904 } else {
2905 #ifdef HAVE_ZLIB_GZBUFFER
2906 gzbuffer(*gzipped,GZBUFFER_SIZE);
2907 #endif
2908 }
2909 #endif
2910
2911 } else if (bunzip2_p == true) {
2912 #ifdef HAVE_BZLIB
2913 if ((*bzipped = Bzip2_new((*files)[0])) == NULL) {
2914 fprintf(stderr,"Cannot open bzipped file %s\n",(*files)[0]);
2915 exit(9);
2916 }
2917 #endif
2918
2919 } else {
2920 #if defined(USE_MPI_FILE_INPUT)
2921 if ((*input = MPI_fopen((*files)[0],workers_comm)) == NULL) {
2922 fprintf(stderr,"Cannot open file %s\n",(*files)[0]);
2923 exit(9);
2924 }
2925 debugf(fprintf(stderr,"Slave opening file %s using MPI_File_open\n",(*files)[0]));
2926 #else
2927 if ((*input = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2928 fprintf(stderr,"Unable to read input\n");
2929 exit(9);
2930 }
2931 debugf(fprintf(stderr,"Slave opening file %s using fopen\n",(*files)[0]));
2932 #endif
2933 }
2934
2935 (*files)++;
2936 (*nfiles)--;
2937 }
2938
2939
2940 if (interleavedp == true || fastq_format_p == true) {
2941 /* Looks like a FASTQ file */
2942 if (*nfiles == 0 || force_single_end_p == true) {
2943 #ifdef HAVE_ZLIB
2944 *gzipped2 = (gzFile) NULL;
2945 #endif
2946 #ifdef HAVE_BZLIB
2947 *bzipped2 = (Bzip2_T) NULL;
2948 #endif
2949 #if defined(USE_MPI_FILE_INPUT)
2950 *input2 = (MPI_File) NULL;
2951 #else
2952 *input2 = (FILE *) NULL;
2953 #endif
2954 } else {
2955 if (gunzip_p == true) {
2956 #ifdef HAVE_ZLIB
2957 if ((*gzipped2 = gzopen((*files)[0],"rb")) == NULL) {
2958 fprintf(stderr,"Cannot open gzipped file %s\n",(*files)[0]);
2959 exit(9);
2960 } else {
2961 #ifdef HAVE_ZLIB_GZBUFFER
2962 gzbuffer(*gzipped2,GZBUFFER_SIZE);
2963 #endif
2964 }
2965 #endif
2966
2967 } else if (bunzip2_p == true) {
2968 #ifdef HAVE_BZLIB
2969 if ((*bzipped2 = Bzip2_new((*files)[0])) == NULL) {
2970 fprintf(stderr,"Cannot open bzip2 file %s\n",(*files)[0]);
2971 exit(9);
2972 }
2973 #endif
2974
2975 } else {
2976 #if defined(USE_MPI_FILE_INPUT)
2977 if ((*input2 = MPI_fopen((*files)[0],workers_comm)) == NULL) {
2978 fprintf(stderr,"Cannot open file %s\n",(*files)[0]);
2979 exit(9);
2980 }
2981 debugf(fprintf(stderr,"Slave opening file %s using MPI_File_open\n",(*files)[0]));
2982 #else
2983 if ((*input2 = Fopen_read_text(read_files_command,(*files)[0])) == NULL) {
2984 fprintf(stderr,"Unable to read input\n");
2985 exit(9);
2986 }
2987 debugf(fprintf(stderr,"Slave opening file %s using fopen\n",(*files)[0]));
2988 #endif
2989 }
2990 (*files)++;
2991 (*nfiles)--;
2992 }
2993 }
2994
2995 return;
2996 }
2997 #endif
2998
2999
3000 static Univ_IIT_T
transcript_iit_setup(Trcoord_T * transcriptomelength,int * ntranscripts,char * transcriptomesubdir,char * fileroot)3001 transcript_iit_setup (Trcoord_T *transcriptomelength, int *ntranscripts,
3002 char *transcriptomesubdir, char *fileroot) {
3003 Univ_IIT_T transcript_iit = NULL;
3004 char *iitfile = NULL;
3005
3006 /* Prepare transcript data */
3007
3008 iitfile = (char *) CALLOC(strlen(transcriptomesubdir)+strlen("/")+
3009 strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
3010 sprintf(iitfile,"%s/%s.chromosome.iit",transcriptomesubdir,fileroot);
3011 if ((transcript_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3012 fprintf(stderr,"IIT file %s is not valid\n",iitfile);
3013 exit(9);
3014 } else {
3015 FREE(iitfile);
3016 *ntranscripts = Univ_IIT_total_nintervals(transcript_iit);
3017 }
3018
3019 *transcriptomelength = (Trcoord_T) Univ_IIT_genomelength(transcript_iit,/*with_circular_alias_p*/false);
3020 return transcript_iit;
3021 }
3022
3023
3024
3025 static Univ_IIT_T
chromosome_iit_setup(Univcoord_T * genomelength,int * nchromosomes,int * circular_typeint,bool * any_circular_p,bool ** circularp,char * genomesubdir,char * fileroot)3026 chromosome_iit_setup (Univcoord_T *genomelength, int *nchromosomes, int *circular_typeint,
3027 bool *any_circular_p, bool **circularp, char *genomesubdir, char *fileroot) {
3028 Univ_IIT_T chromosome_iit = NULL, altscaffold_iit = NULL;
3029 char *iitfile = NULL;
3030
3031
3032 /* Prepare genomic data */
3033
3034 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
3035 strlen(fileroot)+strlen(".chromosome.iit")+1,sizeof(char));
3036 sprintf(iitfile,"%s/%s.chromosome.iit",genomesubdir,fileroot);
3037 if ((chromosome_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3038 fprintf(stderr,"IIT file %s is not valid\n",iitfile);
3039 exit(9);
3040 #ifdef LARGE_GENOMES
3041 } else if (Univ_IIT_coord_values_8p(chromosome_iit) == false) {
3042 fprintf(stderr,"This program gsnapl is designed for large genomes.\n");
3043 fprintf(stderr,"For small genomes of less than 2^32 (4 billion) bp, please run gsnap instead.\n");
3044 exit(9);
3045 #endif
3046 } else {
3047 FREE(iitfile);
3048 *nchromosomes = Univ_IIT_total_nintervals(chromosome_iit);
3049 *circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
3050 *circularp = Univ_IIT_circularp(&(*any_circular_p),chromosome_iit);
3051
3052 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
3053 strlen(fileroot)+strlen(".altscaffold.iit")+1,sizeof(char));
3054 sprintf(iitfile,"%s/%s.altscaffold.iit",genomesubdir,fileroot);
3055 if ((altscaffold_iit = Univ_IIT_read(iitfile,/*readonlyp*/true,/*add_iit_p*/false)) == NULL) {
3056 /* fprintf(stderr,"No altscaffold file found\n"); */
3057 altlocp = (bool *) CALLOC((*nchromosomes)+1,sizeof(bool));
3058 alias_starts = (Univcoord_T *) CALLOC((*nchromosomes)+1,sizeof(Univcoord_T));
3059 alias_ends = (Univcoord_T *) CALLOC((*nchromosomes)+1,sizeof(Univcoord_T));
3060
3061 } else {
3062 fprintf(stderr,"Found altscaffold file found\n");
3063 altlocp = Univ_IIT_altlocp(&alias_starts,&alias_ends,chromosome_iit,altscaffold_iit);
3064 Univ_IIT_free(&altscaffold_iit);
3065 }
3066
3067 FREE(iitfile);
3068 }
3069
3070 *genomelength = Univ_IIT_genomelength(chromosome_iit,/*with_circular_alias_p*/true);
3071 return chromosome_iit;
3072 }
3073
3074
3075 static void
worker_setup(char * transcriptomesubdir,char * transcriptome_fileroot,char * genomesubdir,char * genome_fileroot)3076 worker_setup (char *transcriptomesubdir, char *transcriptome_fileroot,
3077 char *genomesubdir, char *genome_fileroot) {
3078 char *snpsdir = NULL, *modedir = NULL, *mapdir = NULL, *iitfile = NULL;
3079 /* Univcoord_T genome_totallength; */
3080 char *idx_filesuffix1, *idx_filesuffix2;
3081
3082 if (snps_root == NULL && maskedp == false) {
3083 if (transcriptome_fileroot != NULL) {
3084 transcriptomebits = Genome_new(transcriptomesubdir,transcriptome_fileroot,/*snps_root*/NULL,/*genometype*/GENOME_BITS,
3085 uncompressedp,genome_access,sharedp);
3086 }
3087
3088 genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3089 uncompressedp,genome_access,sharedp);
3090 genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3091 uncompressedp,genome_access,sharedp);
3092
3093 } else if (maskedp == true) {
3094 genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3095 uncompressedp,genome_access,sharedp);
3096 genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3097 uncompressedp,genome_access,sharedp);
3098 genomecomp_alt = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/masked_root,/*genometype*/GENOME_OLIGOS,
3099 uncompressedp,genome_access,sharedp);
3100 genomebits_alt = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/masked_root,/*genometype*/GENOME_BITS,
3101 uncompressedp,genome_access,sharedp);
3102 } else {
3103 if (user_snpsdir == NULL) {
3104 snpsdir = genomesubdir;
3105 mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3106 } else {
3107 snpsdir = user_snpsdir;
3108 mapdir = user_snpsdir;
3109 }
3110
3111 /* SNPs */
3112 if (transcriptome_fileroot != NULL) {
3113 transcriptomebits = Genome_new(transcriptomesubdir,transcriptome_fileroot,/*snps_root*/NULL,/*genometype*/GENOME_BITS,
3114 uncompressedp,genome_access,sharedp);
3115 transcriptomebits_alt = Genome_new(snpsdir,transcriptome_fileroot,snps_root,/*genometype*/GENOME_BITS,
3116 uncompressedp,genome_access,sharedp);
3117 }
3118
3119 genomecomp = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_OLIGOS,
3120 uncompressedp,genome_access,sharedp);
3121 genomebits = Genome_new(genomesubdir,genome_fileroot,/*alt_root*/NULL,/*genometype*/GENOME_BITS,
3122 uncompressedp,genome_access,sharedp);
3123 genomecomp_alt = Genome_new(snpsdir,genome_fileroot,snps_root,/*genometype*/GENOME_OLIGOS,
3124 uncompressedp,genome_access,sharedp);
3125 genomebits_alt = Genome_new(snpsdir,genome_fileroot,snps_root,/*genometype*/GENOME_BITS,
3126 uncompressedp,genome_access,sharedp);
3127 }
3128
3129 /* Must be done before Knownsplicing_retrieve_via_splicesites */
3130 Genome_setup(genomecomp,genomecomp_alt,genomelength,mode,circular_typeint);
3131
3132
3133 if (user_modedir == NULL) {
3134 modedir = genomesubdir;
3135 } else {
3136 modedir = user_modedir;
3137 }
3138
3139 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3140 idx_filesuffix1 = "metct";
3141 idx_filesuffix2 = "metga";
3142 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED) {
3143 idx_filesuffix1 = "a2iag";
3144 idx_filesuffix2 = "a2itc";
3145 } else if (mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
3146 idx_filesuffix1 = "a2itc";
3147 idx_filesuffix2 = "a2iag";
3148 } else {
3149 idx_filesuffix1 = IDX_FILESUFFIX; /* "ref" */
3150 idx_filesuffix2 = (char *) NULL;
3151 }
3152
3153 if ((indexdb = Indexdb_new_genome(&index1part,&index1interval,
3154 modedir,genome_fileroot,idx_filesuffix1,snps_root,
3155 required_index1part,required_index1interval,
3156 offsetsstrm_access,positions_access,sharedp,
3157 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3158
3159 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3160 fprintf(stderr,"Cannot find %s index file. Need to run cmetindex first\n",idx_filesuffix1);
3161 } else if (mode == ATOI_STRANDED || mode == ATOI_NONSTRANDED ||
3162 mode == TTOC_STRANDED || mode == TTOC_NONSTRANDED) {
3163 fprintf(stderr,"Cannot find %s index file. Need to run atoiindex first\n",idx_filesuffix1);
3164 } else {
3165 fprintf(stderr,"Cannot find %s index file\n",idx_filesuffix1);
3166 }
3167 exit(9);
3168 }
3169
3170 if (idx_filesuffix2 == NULL) {
3171 indexdb2 = indexdb;
3172 } else if ((indexdb2 = Indexdb_new_genome(&index1part,&index1interval,
3173 modedir,genome_fileroot,idx_filesuffix2,snps_root,
3174 required_index1part,required_index1interval,
3175 offsetsstrm_access,positions_access,sharedp,
3176 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3177 if (mode == CMET_STRANDED || mode == CMET_NONSTRANDED) {
3178 fprintf(stderr,"Cannot find %s index file. Need to run cmetindex first\n",idx_filesuffix2);
3179 } else {
3180 fprintf(stderr,"Cannot find %s index file. Need to run atoiindex first\n",idx_filesuffix2);
3181 }
3182 exit(9);
3183 }
3184
3185 /* Note: regiondb not currently SNP-tolerant */
3186 if (use_localdb_p == false) {
3187 regiondb = (Regiondb_T) NULL;
3188 } else if ((regiondb = Regiondb_new_genome(®ion1part,®ion1interval,
3189 genomesubdir,genome_fileroot,idx_filesuffix1,/*snps_root*/NULL,
3190 required_region1part,required_region1interval,
3191 regiondb_access,sharedp,
3192 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p)) == NULL) {
3193 fprintf(stderr,"Warning: Cannot find regiondb files. GSNAP will run on this older index, but there is a newer index format available\n");
3194 regiondb2 = (Regiondb_T) NULL;
3195 } else if (idx_filesuffix2 == NULL) {
3196 regiondb2 = regiondb;
3197 } else {
3198 regiondb2 = Regiondb_new_genome(&local1part,&local1interval,
3199 genomesubdir,genome_fileroot,idx_filesuffix2,/*snps_root*/NULL,
3200 required_local1part,required_local1interval,
3201 regiondb_access,sharedp,
3202 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
3203 }
3204
3205
3206 if (snps_root != NULL) {
3207 iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3208 strlen(snps_root)+1,sizeof(char));
3209 sprintf(iitfile,"%s/%s",mapdir,snps_root);
3210 fprintf(stderr,"Reading SNPs file %s/%s...",mapdir,snps_root);
3211 if ((snps_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3212 /*divstring*/NULL,/*add_iit_p*/true)) == NULL) {
3213 fprintf(stderr,"SNPs file %s.iit not found in %s.\n",snps_root,mapdir);
3214 if (user_snpsdir == NULL) {
3215 fprintf(stderr,"Available files:\n");
3216 Datadir_list_directory(stderr,genomesubdir);
3217 fprintf(stderr,"Either install file %s.iit or specify a directory for the IIT file\n",snps_root);
3218 fprintf(stderr,"using the -M flag.\n");
3219 exit(9);
3220 }
3221 }
3222
3223 print_nsnpdiffs_p = true;
3224 snps_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,snps_iit);
3225
3226 fprintf(stderr,"done\n");
3227 FREE(iitfile);
3228 if (user_snpsdir == NULL) {
3229 FREE(mapdir);
3230 }
3231 }
3232
3233
3234 if (min_distantsplicing_end_matches < index1part) {
3235 fprintf(stderr,"Minimum value for distant-splice-endlength is the value for -k (kmer size) %d\n",index1part);
3236 exit(9);
3237 }
3238
3239 indexdb_size_threshold = (int) (10*Indexdb_mean_size(indexdb,mode,index1part));
3240 debug(printf("Size threshold is %d\n",indexdb_size_threshold));
3241 if (indexdb_size_threshold < MIN_INDEXDB_SIZE_THRESHOLD) {
3242 indexdb_size_threshold = MIN_INDEXDB_SIZE_THRESHOLD;
3243 }
3244
3245 if (genes_file != NULL) {
3246 if ((genes_iit = IIT_read(genes_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3247 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3248 fprintf(stderr,"Reading genes file %s locally...",genes_file);
3249 } else {
3250 mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3251 iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3252 strlen(genes_file)+1,sizeof(char));
3253 sprintf(iitfile,"%s/%s",mapdir,genes_file);
3254 if ((genes_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3255 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3256 fprintf(stderr,"Reading genes file %s...",iitfile);
3257 FREE(iitfile);
3258 FREE(mapdir);
3259 } else {
3260 fprintf(stderr,"Genes file %s.iit not found locally or in %s. Available files:\n",genes_file,mapdir);
3261 Datadir_list_directory(stderr,mapdir);
3262 fprintf(stderr,"Either install file %s or specify a full directory path\n",genes_file);
3263 exit(9);
3264 }
3265 }
3266 genes_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,genes_iit);
3267 genes_chrnum_crosstable = Univ_IIT_chrnum_crosstable(chromosome_iit,genes_iit);
3268 }
3269
3270
3271 if (splicing_file != NULL) {
3272 if (user_splicingdir == NULL) {
3273 if ((splicing_iit = IIT_read(splicing_file,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3274 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3275 fprintf(stderr,"Reading splicing file %s locally...",splicing_file);
3276 }
3277 } else {
3278 iitfile = (char *) CALLOC(strlen(user_splicingdir)+strlen("/")+strlen(splicing_file)+1,sizeof(char));
3279 sprintf(iitfile,"%s/%s",user_splicingdir,splicing_file);
3280 if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3281 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3282 fprintf(stderr,"Reading splicing file %s...",iitfile);
3283 FREE(iitfile);
3284 }
3285 }
3286
3287 if (splicing_iit == NULL) {
3288 mapdir = Datadir_find_mapdir(/*user_mapdir*/NULL,genomesubdir,genome_fileroot);
3289 iitfile = (char *) CALLOC(strlen(mapdir)+strlen("/")+
3290 strlen(splicing_file)+1,sizeof(char));
3291 sprintf(iitfile,"%s/%s",mapdir,splicing_file);
3292 if ((splicing_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3293 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3294 fprintf(stderr,"Reading splicing file %s...",iitfile);
3295 FREE(iitfile);
3296 FREE(mapdir);
3297 } else {
3298 fprintf(stderr,"Splicing file %s.iit not found locally or in %s. Available files:\n",splicing_file,mapdir);
3299 Datadir_list_directory(stderr,mapdir);
3300 fprintf(stderr,"Either install file %s or specify a full directory path\n",splicing_file);
3301 exit(9);
3302 }
3303 }
3304
3305 splicing_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,splicing_iit);
3306 if ((donor_typeint = IIT_typeint(splicing_iit,"donor")) >= 0 &&
3307 (acceptor_typeint = IIT_typeint(splicing_iit,"acceptor")) >= 0) {
3308 fprintf(stderr,"found donor and acceptor tags, so treating as splicesites file\n");
3309 splicesites = Knownsplicing_retrieve_via_splicesites(&distances_observed_p,&splicecomp,&splicetypes,&splicedists,
3310 &nsplicesites,splicing_iit,splicing_divint_crosstable,
3311 donor_typeint,acceptor_typeint,chromosome_iit,
3312 genomecomp,genomecomp_alt,shortsplicedist);
3313 if (nsplicesites == 0) {
3314 fprintf(stderr,"\nWarning: No splicesites observed for genome %s. Are you sure this splicesite file was built for this genome? Please compare chromosomes below:\n",
3315 genome_dbroot);
3316 fprintf(stderr,"Chromosomes in the genome: ");
3317 Univ_IIT_dump_labels(stderr,chromosome_iit);
3318 fprintf(stderr,"Chromosomes in the splicesites IIT file: ");
3319 IIT_dump_divstrings(stderr,splicing_iit);
3320 exit(9);
3321 }
3322
3323 } else {
3324 fprintf(stderr,"no donor or acceptor tags found, so treating as introns file\n");
3325 fprintf(stderr,"Intron mode for splicing file no longer supported in GSNAP\n");
3326 exit(9);
3327 }
3328
3329 /* For benchmarking purposes. Can spend time/memory to load
3330 splicesites, but then not use them. */
3331 if (unloadp == true) {
3332 fprintf(stderr,"unloading...");
3333
3334 if (nsplicesites > 0) {
3335 FREE(splicedists);
3336 FREE(splicetypes);
3337 FREE(splicesites);
3338 FREE(splicecomp);
3339 nsplicesites = 0;
3340 }
3341
3342 FREE(splicing_divint_crosstable);
3343 IIT_free(&splicing_iit);
3344 splicing_iit = NULL;
3345 knownsplicingp = false;
3346 splicing_file = (char *) NULL;
3347 }
3348
3349 fprintf(stderr,"done\n");
3350 }
3351
3352 if (novelsplicingp == true || knownsplicingp == true) {
3353 /* RNA-seq */
3354 splicingp = true;
3355 } else {
3356 /* DNA-seq */
3357 splicingp = false;
3358 }
3359
3360
3361 if (tally_root != NULL) {
3362 if (user_tallydir == NULL) {
3363 if ((tally_iit = IIT_read(tally_root,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3364 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3365 fprintf(stderr,"Reading tally file %s.iit locally...",tally_root);
3366 }
3367 } else {
3368 iitfile = (char *) CALLOC(strlen(user_tallydir)+strlen("/")+strlen(tally_root)+1,sizeof(char));
3369 sprintf(iitfile,"%s/%s",user_tallydir,tally_root);
3370 if ((tally_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3371 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3372 fprintf(stderr,"Reading tally file %s...",iitfile);
3373 FREE(iitfile);
3374 }
3375 }
3376
3377 if (tally_iit == NULL) {
3378 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(tally_root)+1,sizeof(char));
3379 sprintf(iitfile,"%s/%s",genomesubdir,tally_root);
3380 if ((tally_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3381 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3382 fprintf(stderr,"Reading tally file %s...",iitfile);
3383 FREE(iitfile);
3384 } else {
3385 fprintf(stderr,"Tally file %s.iit not found locally",tally_root);
3386 if (user_tallydir != NULL) {
3387 fprintf(stderr," or in %s",user_tallydir);
3388 }
3389 fprintf(stderr," or in %s\n",genomesubdir);
3390 exit(9);
3391 }
3392 }
3393
3394 tally_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,tally_iit);
3395 fprintf(stderr,"done\n");
3396 }
3397
3398
3399 if (runlength_root != NULL) {
3400 if (user_runlengthdir == NULL) {
3401 if ((runlength_iit = IIT_read(runlength_root,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3402 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3403 fprintf(stderr,"Reading runlength file %s.iit locally...",runlength_root);
3404 }
3405 } else {
3406 iitfile = (char *) CALLOC(strlen(user_runlengthdir)+strlen("/")+strlen(runlength_root)+1,sizeof(char));
3407 sprintf(iitfile,"%s/%s",user_runlengthdir,runlength_root);
3408 if ((runlength_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3409 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3410 fprintf(stderr,"Reading runlength file %s...",iitfile);
3411 FREE(iitfile);
3412 }
3413 }
3414
3415 if (runlength_iit == NULL) {
3416 iitfile = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+strlen(runlength_root)+1,sizeof(char));
3417 sprintf(iitfile,"%s/%s",genomesubdir,runlength_root);
3418 if ((runlength_iit = IIT_read(iitfile,/*name*/NULL,/*readonlyp*/true,/*divread*/READ_ALL,
3419 /*divstring*/NULL,/*add_iit_p*/true)) != NULL) {
3420 fprintf(stderr,"Reading runlength file %s...",iitfile);
3421 FREE(iitfile);
3422 } else {
3423 fprintf(stderr,"Runlength file %s.iit not found locally",runlength_root);
3424 if (user_runlengthdir != NULL) {
3425 fprintf(stderr," or in %s",user_runlengthdir);
3426 }
3427 fprintf(stderr," or in %s\n",genomesubdir);
3428 exit(9);
3429 }
3430 }
3431
3432 runlength_divint_crosstable = Univ_IIT_divint_crosstable(chromosome_iit,runlength_iit);
3433 fprintf(stderr,"done\n");
3434 }
3435
3436
3437 Path_solve_setup(genomelength,circularp,genomebits,genomebits_alt,regiondb,regiondb2,
3438 min_intronlength,novelsplicingp,splicingp,
3439 splicesites,splicetypes,splicedists,nsplicesites,
3440 index1part,index1interval,local1part,region1part);
3441 Kmer_search_setup(mode,index1part_tr,index1part,index1interval,local1part,transcriptomelength,
3442 min_intronlength,chromosome_iit,genomelength,circular_typeint,circularp,
3443 genomebits,genomebits_alt,indexdb,indexdb2,indexdb_tr,
3444 splicingp,splicesites,splicetypes,splicedists,nsplicesites);
3445 Extension_search_setup(mode,chromosome_iit,genomelength,circular_typeint,circularp,
3446 genomebits,genomebits_alt,indexdb,indexdb2,
3447 min_intronlength,max_end_insertions,max_end_deletions,
3448 index1part,index1interval,local1part);
3449 Segment_search_setup(index1part,index1interval,max_anchors,chromosome_iit,nchromosomes,
3450 circular_typeint,mode,splicesites,splicetypes,splicedists,nsplicesites);
3451 Terminal_setup(chromosome_iit,genomelength,circular_typeint,genomebits,genomebits_alt,
3452 splicingp,index1part,index1interval,subopt_levels);
3453 Distant_rna_setup(chromosome_iit,genomelength,circular_typeint,genomebits,genomebits_alt,
3454 index1part,index1interval,subopt_levels,shortsplicedist,
3455 min_distantsplicing_end_matches,min_distantsplicing_identity,
3456 localsplicing_penalty,distantsplicing_penalty);
3457 Distant_dna_setup(genomebits,genomebits_alt,genomelength,shortsplicedist,splicingp);
3458
3459
3460 if (genomebits == NULL) {
3461 fprintf(stderr,"This version of GSNAP requires the genomebits128 file for the genome\n");
3462 exit(9);
3463 } else {
3464 Genome_hr_setup(Genome_blocks(genomebits),/*snp_blocks*/genomebits_alt ? Genome_blocks(genomebits_alt) : NULL,
3465 query_unk_mismatch_p,genome_unk_mismatch_p,mode,maskedp);
3466 Genome_consec_setup(query_unk_mismatch_p,genome_unk_mismatch_p,mode);
3467 }
3468
3469 Genome_sites_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
3470 Maxent_hr_setup(Genome_blocks(genomecomp),/*snp_blocks*/genomecomp_alt ? Genome_blocks(genomecomp_alt) : NULL);
3471
3472 Simplepair_setup(splicingp,transcript_iit,sam_insert_0M_p,
3473 md_lowercase_variant_p,/*snps_p*/snps_iit ? true : false,
3474 sam_cigar_extended_p,cigar_action);
3475
3476 /* Oligoindex_hr_setup(Genome_blocks(genomecomp),mode); */
3477 /* Oligoindex_localdb_setup(chromosome_iit,circular_typeint,localdb,local1part); */
3478
3479 Indexdb_setup(index1part);
3480 Oligo_setup(index1part,mode);
3481 Regiondb_setup(region1part,mode);
3482 /* spansize = Spanningelt_setup(index1part,index1interval); */
3483
3484 Knownsplicing_setup(splicecomp);
3485 Splice_setup(genomebits,genomebits_alt,min_shortend);
3486 Indel_setup(genomebits,genomebits_alt,genomelength,transcriptomelength,
3487 min_indel_end_matches,indel_penalty_middle,maskedp);
3488 Transcript_setup(pairmax_transcriptome,expected_pairlength);
3489
3490 Stage1hr_setup(transcript_iit,transcriptome,transcriptomebits,use_only_transcriptome_p,
3491 indexdb,indexdb2,indexdb_tr,index1part_tr,index1part,index1interval,
3492 user_mismatches_refalt_float,user_mismatches_ref_float,user_mincoverage_float,
3493 max_middle_insertions_float,max_middle_deletions_float,
3494 max_end_insertions,max_end_deletions,
3495 chromosome_iit,nchromosomes,genomecomp,genomebits,genomebits_alt,
3496 mode,maxpaths_search,maxpaths_report,find_dna_chimeras_p,distances_observed_p,
3497 subopt_levels,splicingp,shortsplicedist,shortsplicedist_novelend,
3498 min_intronlength,expected_pairlength,pairlength_deviation,
3499 distantsplicing_penalty,min_distantsplicing_end_matches,min_distantsplicing_identity);
3500 Substring_setup(print_nsnpdiffs_p,print_snplabels_p,
3501 show_refdiff_p,snps_iit,snps_divint_crosstable,
3502 genomebits,genomebits_alt,chromosome_iit,genomelength,nchromosomes,circular_typeint,
3503 novelsplicingp,knownsplicingp,output_type,mode,maskedp);
3504 Stage3hr_setup(/*transcriptomep*/transcriptome != NULL ? true : false,
3505 invert_first_p,invert_second_p,genomecomp,genomebits,genomebits_alt,
3506 chromosome_iit,genomelength,nchromosomes,circular_typeint,
3507 transcriptomebits,transcriptome,transcript_iit,
3508 tally_iit,tally_divint_crosstable,runlength_iit,runlength_divint_crosstable,
3509 distances_observed_p,pairmax_linear,pairmax_circular,
3510 expected_pairlength,pairlength_deviation,
3511 localsplicing_penalty,indel_penalty_middle,antistranded_penalty,
3512 favor_multiexon_p,subopt_levels,circularp,altlocp,alias_starts,alias_ends,
3513 filter_within_trims_p,omit_concordant_uniq_p,omit_concordant_mult_p,
3514 failedinput_root,output_type,merge_samechr_p,method_print_p,want_random_p);
3515
3516 /* Note: For NM_145699_163_, want to increase subopt_levels by 8 for
3517 Concordance_setup, and in pruning based on nmatches in
3518 Stage3pair_optimal_score_final */
3519 Concordance_setup(subopt_levels,pairmax_transcriptome,pairmax_linear,pairmax_circular,
3520 expected_pairlength,pairlength_deviation,circularp,merge_samechr_p);
3521 SAM_setup(add_paired_nomappers_p,paired_flag_means_concordant_p,
3522 omit_concordant_uniq_p,omit_concordant_mult_p,quiet_if_excessive_p,
3523 maxpaths_report,failedinput_root,fastq_format_p,hide_soft_clips_p,method_print_p,
3524 clip_overlap_p,merge_overlap_p,merge_samechr_p,sam_multiple_primaries_p,sam_sparse_secondaries_p,
3525 force_xs_direction_p,snps_iit,find_dna_chimeras_p,
3526 splicingp,donor_typeint,acceptor_typeint,chromosome_iit,transcript_iit);
3527 Cigar_setup(genomecomp,sam_cigar_extended_p,hide_soft_clips_p,merge_samechr_p,md_lowercase_variant_p,
3528 sam_hardclip_use_S_p,sam_insert_0M_p);
3529 Output_setup(chromosome_iit,nofailsp,failsonlyp,quiet_if_excessive_p,maxpaths_report,
3530 failedinput_root,quality_shift,
3531 output_type,invert_first_p,invert_second_p,
3532 sam_read_group_id);
3533
3534 return;
3535 }
3536
3537
3538 static void
worker_cleanup()3539 worker_cleanup () {
3540
3541 Segment_search_cleanup();
3542
3543 if (regiondb2 != regiondb) {
3544 Regiondb_free(®iondb2);
3545 }
3546 if (regiondb != NULL) {
3547 Regiondb_free(®iondb);
3548 }
3549
3550 if (indexdb2 != indexdb) {
3551 Indexdb_free(&indexdb2);
3552 }
3553 if (indexdb != NULL) {
3554 Indexdb_free(&indexdb);
3555 }
3556
3557 if (transcriptomebits_alt != NULL) {
3558 Genome_free(&transcriptomebits_alt);
3559 }
3560 if (transcriptomebits != NULL) {
3561 Genome_free(&transcriptomebits);
3562 }
3563
3564
3565 if (genomecomp_alt != NULL) {
3566 Genome_free(&genomecomp_alt);
3567 Genome_free(&genomebits_alt);
3568 }
3569 if (genomebits != NULL) {
3570 Genome_free(&genomebits);
3571 }
3572 if (genomecomp != NULL) {
3573 Genome_free(&genomecomp);
3574 }
3575
3576 if (nsplicesites > 0) {
3577 FREE(splicedists);
3578 FREE(splicetypes);
3579 FREE(splicecomp);
3580 FREE(splicesites);
3581 }
3582
3583 if (runlength_iit != NULL) {
3584 FREE(runlength_divint_crosstable);
3585 IIT_free(&runlength_iit);
3586 }
3587
3588 if (tally_iit != NULL) {
3589 FREE(tally_divint_crosstable);
3590 IIT_free(&tally_iit);
3591 }
3592
3593 if (splicing_iit != NULL) {
3594 FREE(splicing_divint_crosstable);
3595 IIT_free(&splicing_iit);
3596 }
3597
3598 if (genes_iit != NULL) {
3599 FREE(genes_chrnum_crosstable);
3600 FREE(genes_divint_crosstable);
3601 IIT_free(&genes_iit);
3602 }
3603
3604 if (snps_iit != NULL) {
3605 FREE(snps_divint_crosstable);
3606 IIT_free(&snps_iit);
3607 }
3608
3609 if (altlocp != NULL) {
3610 FREE(alias_ends);
3611 FREE(alias_starts);
3612 FREE(altlocp);
3613 }
3614
3615 if (circularp != NULL) {
3616 FREE(circularp);
3617 }
3618
3619 if (indexdb_tr != NULL) {
3620 Indexdb_free(&indexdb_tr);
3621 }
3622
3623 if (transcriptome != NULL) {
3624 Transcriptome_free(&transcriptome);
3625 }
3626
3627 if (transcript_iit != NULL) {
3628 Univ_IIT_free(&transcript_iit);
3629 }
3630
3631 if (chromosome_iit != NULL) {
3632 Univ_IIT_free(&chromosome_iit);
3633 }
3634
3635 Access_controlled_cleanup();
3636
3637 return;
3638 }
3639
3640
3641 int
main(int argc,char * argv[])3642 main (int argc, char *argv[]) {
3643 int nchars1 = 0, nchars2 = 0;
3644 int cmdline_status;
3645
3646 char *transcriptomesubdir = NULL, *genomesubdir, *transcriptome_fileroot = NULL, *genome_fileroot, *dbversion;
3647 char **files;
3648 int nfiles;
3649 #if defined(USE_MPI) && defined(USE_MPI_FILE_INPUT)
3650 MPI_File mpi_file_input, mpi_file_input_2;
3651 #endif
3652
3653 #ifdef USE_MPI
3654 Master_T master;
3655 char **files_master;
3656 int nfiles_master;
3657 FILE *input_parser, *input2_parser;
3658 #endif
3659 FILE *input, *input2;
3660
3661 #ifdef HAVE_ZLIB
3662 #ifdef USE_MPI
3663 gzFile gzipped_master, gzipped2_master;
3664 #endif
3665 gzFile gzipped, gzipped2;
3666 #endif
3667
3668 #ifdef HAVE_BZLIB
3669 #ifdef USE_MPI
3670 Bzip2_T bzipped_master, bzipped2_master;
3671 #endif
3672 Bzip2_T bzipped, bzipped2;
3673 #endif
3674
3675 long int worker_id;
3676
3677 int nread;
3678 int nextchar = '\0';
3679 double runtime;
3680
3681 #ifdef HAVE_PTHREAD
3682 int ret;
3683 pthread_attr_t thread_attr_join;
3684 #ifdef USE_MPI
3685 pthread_attr_t thread_attr_detach;
3686 #endif
3687 #endif
3688
3689 #ifdef HAVE_SIGACTION
3690 struct sigaction signal_action;
3691 #endif
3692
3693 extern int optind;
3694
3695 #ifdef MEMUSAGE
3696 Mem_usage_init();
3697 Mem_usage_set_threadname("main");
3698 #endif
3699
3700
3701 cmdline_status = parse_command_line(argc,argv,optind);
3702 argc -= optind;
3703 argv += optind;
3704
3705 if (cmdline_status == 0) {
3706 /* okay to continue */
3707 } else if (cmdline_status == 1) {
3708 exit(0);
3709 } else {
3710 exit(cmdline_status);
3711 }
3712
3713 check_compiler_assumptions();
3714
3715 if (exception_raise_p == false) {
3716 fprintf(stderr,"Allowing signals and exceptions to pass through. If using shared memory, need to remove segments manually.\n");
3717 Except_inactivate();
3718 } else {
3719 #ifdef HAVE_SIGACTION
3720 signal_action.sa_handler = signal_handler;
3721 signal_action.sa_flags = 0;
3722 sigfillset(&signal_action.sa_mask); /* After first signal, block all other signals */
3723
3724 /* Note: SIGKILL and SIGSTOP cannot be caught */
3725
3726 sigaction(SIGABRT,&signal_action,NULL); /* abnormal termination (abort) */
3727 sigaction(SIGBUS,&signal_action,NULL); /* bus error */
3728 sigaction(SIGFPE,&signal_action,NULL); /* arithmetic exception */
3729 sigaction(SIGHUP,&signal_action,NULL); /* hangup */
3730 sigaction(SIGILL,&signal_action,NULL); /* illegal hardware instruction */
3731 sigaction(SIGINT,&signal_action,NULL); /* terminal interruption (control-C) */
3732 sigaction(SIGPIPE,&signal_action,NULL); /* write to pipe with no readers */
3733 sigaction(SIGQUIT,&signal_action,NULL); /* terminal quit (control-backslash) */
3734 sigaction(SIGSEGV,&signal_action,NULL); /* invalid memory reference */
3735 sigaction(SIGSYS,&signal_action,NULL); /* invalid system call */
3736 sigaction(SIGTERM,&signal_action,NULL); /* Unix kill command */
3737 sigaction(SIGTRAP,&signal_action,NULL); /* hardware fault */
3738 sigaction(SIGXCPU,&signal_action,NULL); /* CPU limit exceeded */
3739 sigaction(SIGXFSZ,&signal_action,NULL); /* file size limit exceeded */
3740 #endif
3741 }
3742
3743 #ifdef USE_MPI
3744 /* MPI_Init(&argc,&argv); */
3745 MPI_Init_thread(&argc,&argv,/*requested*/MPI_THREAD_MULTIPLE,&provided);
3746 MPI_Comm_rank(MPI_COMM_WORLD,&myid);
3747 MPI_Comm_size(MPI_COMM_WORLD,&nranks);
3748 MPI_Debug_setup(myid);
3749
3750 nthreads0 = nthreads - 1;
3751 if (master_is_worker_p == false) {
3752 /* Default is to exclude master node from working */
3753 exclude_ranks[0] = 0;
3754 MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3755 MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
3756 MPI_Comm_create(MPI_COMM_WORLD,workers_group,&workers_comm);
3757 MPI_Group_free(&workers_group);
3758 MPI_Group_free(&world_group);
3759
3760 } else if (nthreads0 <= 0) {
3761 /* If insufficient threads, then also exclude master node from working */
3762 exclude_ranks[0] = 0;
3763 MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3764 MPI_Group_excl(world_group,1,exclude_ranks,&workers_group);
3765 MPI_Comm_create(MPI_COMM_WORLD,workers_group,&workers_comm);
3766 MPI_Group_free(&workers_group);
3767 MPI_Group_free(&world_group);
3768 master_is_worker_p = false;
3769
3770 } else {
3771 /* Include master rank 0 in workers group */
3772 MPI_Comm_group(MPI_COMM_WORLD,&world_group);
3773 MPI_Comm_create(MPI_COMM_WORLD,world_group,&workers_comm);
3774 MPI_Group_free(&world_group);
3775 /* master_is_worker_p = true; */
3776 }
3777 n_slave_ranks = nranks - 1; /* Don't include master, even if it's a worker */
3778
3779 if (myid == 0) {
3780 nthreads = nthreads0;
3781 fastq_format_p = open_input_streams_parser(&nextchar,&nchars1,&nchars2,
3782 &files_master,&nfiles_master,&input_parser,&input2_parser,
3783 #ifdef HAVE_ZLIB
3784 &gzipped_master,&gzipped2_master,
3785 #endif
3786 #ifdef HAVE_BZLIB
3787 &bzipped_master,&bzipped2_master,
3788 #endif
3789 read_files_command,gunzip_p,bunzip2_p,interleavedp,argc,argv);
3790 master = Master_new(n_slave_ranks,nextchar,nchars1,nchars2,
3791 input_parser,input2_parser,
3792 #ifdef HAVE_ZLIB
3793 gzipped_master,gzipped2_master,
3794 #endif
3795 #ifdef HAVE_BZLIB
3796 bzipped_master,bzipped2_master,
3797 #endif
3798 files_master,nfiles_master,inbuffer_nspaces,part_modulus,part_interval);
3799 }
3800
3801 MPI_Bcast(&fastq_format_p,1,MPI_BOOL_T,/*root*/0,MPI_COMM_WORLD);
3802 MPI_Bcast(&nextchar,1,MPI_CHAR,/*root*/0,MPI_COMM_WORLD);
3803
3804 /* If not using MPI_File, then master already has input and input2,
3805 and does not need mpi_file_input or mpi_file_input_2 (because of the workers_comm) */
3806 if (myid > 0 || master_is_worker_p == true) {
3807 open_input_streams_worker(&files,&nfiles,
3808 #ifdef USE_MPI_FILE_INPUT
3809 &mpi_file_input,&mpi_file_input_2,workers_comm,
3810 #else
3811 &input,&input2,
3812 #endif
3813 #ifdef HAVE_ZLIB
3814 &gzipped,&gzipped2,
3815 #endif
3816 #ifdef HAVE_BZLIB
3817 &bzipped,&bzipped2,
3818 #endif
3819 read_files_command,gunzip_p,bunzip2_p,interleavedp,fastq_format_p,argc,argv);
3820
3821 /* Inbuffer_master_process skips to part_modulus, so workers need it set to 0 */
3822 Inbuffer_setup(filter_if_both_p,
3823 #ifdef USE_MPI_FILE_INPUT
3824 workers_comm,
3825 #endif
3826 /*part_modulus*/0,part_interval);
3827
3828 inbuffer = Inbuffer_new(nextchar,myid,
3829 #ifdef USE_MPI_FILE_INPUT
3830 mpi_file_input,mpi_file_input_2,
3831 #else
3832 input,input2,
3833 #endif
3834 #ifdef HAVE_ZLIB
3835 gzipped,gzipped2,
3836 #endif
3837 #ifdef HAVE_BZLIB
3838 bzipped,bzipped2,
3839 #endif
3840 read_files_command,files,nfiles,inbuffer_nspaces);
3841 }
3842
3843 Shortread_setup(acc_fieldi_start,acc_fieldi_end,force_single_end_p,filter_chastity_p,
3844 allow_paired_end_mismatch_p,fastq_format_p,barcode_length,endtrim_length,
3845 invert_first_p,invert_second_p);
3846 multiple_sequences_p = true;
3847
3848
3849 #else
3850 /* Non-MPI version */
3851 fastq_format_p = open_input_streams_parser(&nextchar,&nchars1,&nchars2,&files,&nfiles,&input,&input2,
3852 #ifdef HAVE_ZLIB
3853 &gzipped,&gzipped2,
3854 #endif
3855 #ifdef HAVE_BZLIB
3856 &bzipped,&bzipped2,
3857 #endif
3858 read_files_command,gunzip_p,bunzip2_p,interleavedp,argc,argv);
3859
3860 Inbuffer_setup(filter_if_both_p,part_modulus,part_interval);
3861
3862 inbuffer = Inbuffer_new(nextchar,input,input2,
3863 #ifdef HAVE_ZLIB
3864 gzipped,gzipped2,
3865 #endif
3866 #ifdef HAVE_BZLIB
3867 bzipped,bzipped2,
3868 #endif
3869 interleavedp,read_files_command,files,nfiles,inbuffer_nspaces);
3870
3871 Shortread_setup(acc_fieldi_start,acc_fieldi_end,force_single_end_p,filter_chastity_p,
3872 allow_paired_end_mismatch_p,fastq_format_p,barcode_length,endtrim_length,
3873 invert_first_p,invert_second_p);
3874
3875 if ((nread = Inbuffer_fill_init(inbuffer)) > 1) {
3876 multiple_sequences_p = true;
3877 } else {
3878 multiple_sequences_p = false;
3879 }
3880 #endif
3881
3882 if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
3883 /* No need for pre-load */
3884
3885 } else if (multiple_sequences_p == true) {
3886 #if 0
3887 if (offsetsstrm_access != USE_ALLOCATE || genome_access != USE_ALLOCATE) {
3888 fprintf(stderr,"Note: >1 sequence detected, so index files are being memory mapped.\n");
3889 fprintf(stderr," GSNAP can run slowly at first while the computer starts to accumulate\n");
3890 fprintf(stderr," pages from the hard disk into its cache. To copy index files into RAM\n");
3891 fprintf(stderr," instead of memory mapping, use -B 3, -B 4, or -B 5, if you have enough RAM.\n");
3892 #ifdef HAVE_PTHREAD
3893 fprintf(stderr," For more speed, also try multiple threads (-t <int>), if you have multiple processors or cores.");
3894 #endif
3895 fprintf(stderr,"\n");
3896 }
3897 #endif
3898
3899 } else {
3900 /* fprintf(stderr,"Note: only 1 sequence detected. Ignoring batch (-B) command\n"); */
3901 expand_offsets_p = false;
3902 #ifdef HAVE_MMAP
3903 offsetsstrm_access = USE_MMAP_ONLY;
3904 positions_access = USE_MMAP_ONLY;
3905 genome_access = USE_MMAP_ONLY;
3906 locoffsetsstrm_access = USE_MMAP_ONLY;
3907 locpositions_access = USE_MMAP_ONLY;
3908 #else
3909 /* No choice, since mmap is not available */
3910 offsetsstrm_access = USE_ALLOCATE;
3911 positions_access = USE_ALLOCATE;
3912 genome_access = USE_ALLOCATE;
3913 locoffsetsstrm_access = USE_ALLOCATE;
3914 locpositions_access = USE_ALLOCATE;
3915 #endif
3916 }
3917
3918 genomesubdir = Datadir_find_genomesubdir(&genome_fileroot,&dbversion,user_genomedir,genome_dbroot);
3919 FREE(dbversion);
3920 chromosome_iit = chromosome_iit_setup(&genomelength,&nchromosomes,&circular_typeint,&any_circular_p,&circularp,
3921 genomesubdir,genome_fileroot);
3922 Outbuffer_setup(argc,argv,optind,chromosome_iit,any_circular_p,
3923 nthreads,orderedp,quiet_if_excessive_p,
3924 output_type,sam_headers_p,sam_read_group_id,sam_read_group_name,
3925 sam_read_group_library,sam_read_group_platform,
3926 appendp,output_file,split_output_root,failedinput_root);
3927
3928
3929 if (transcriptome_dbroot != NULL) {
3930 if (user_transcriptomedir == NULL && user_genomedir != NULL) {
3931 user_transcriptomedir = user_genomedir;
3932 }
3933 transcriptomesubdir = Datadir_find_genomesubdir(&transcriptome_fileroot,&dbversion,user_transcriptomedir,transcriptome_dbroot);
3934 FREE(dbversion);
3935 transcript_iit = transcript_iit_setup(&transcriptomelength,&ntranscripts,transcriptomesubdir,transcriptome_fileroot);
3936 transcriptome = Transcriptome_new(genomesubdir,genome_fileroot,transcriptome_fileroot,sharedp);
3937 indexdb_tr = Indexdb_new_transcriptome(&index1part_tr,&index1interval_tr,transcriptomesubdir,
3938 transcriptome_fileroot,/*idx_filesuffix*/"ref",/*snps_root*/NULL,
3939 /*required_index1part*/0,/*required_index1interval*/0,
3940 offsetsstrm_access,positions_access,sharedp,
3941 multiple_sequences_p,preload_shared_memory_p,unload_shared_memory_p);
3942 }
3943
3944
3945 #if defined(USE_MPI) && defined(HAVE_PTHREAD)
3946 /* Needed for Master_parser and possibly Master_write_stdout, which never terminate */
3947 pthread_attr_init(&thread_attr_detach);
3948 if ((ret = pthread_attr_setdetachstate(&thread_attr_detach,PTHREAD_CREATE_DETACHED)) != 0) {
3949 fprintf(stderr,"ERROR: pthread_attr_setdetachstate returned %d\n",ret);
3950 exit(1);
3951 }
3952 #endif
3953
3954 #ifdef USE_MPI
3955 if (myid == 0 && master_is_worker_p == false) {
3956 FREE(transcriptomesubdir);
3957 FREE(transcriptome_fileroot);
3958 FREE(genomesubdir);
3959 FREE(genome_fileroot);
3960
3961 /* Master rank, which is not a worker */
3962 if (output_file != NULL) {
3963 fprintf(stderr,"Starting alignment. Writing results to %s\n",output_file);
3964 } else if (split_output_root != NULL) {
3965 fprintf(stderr,"Starting alignment. Writing results to %s.*\n",split_output_root);
3966 } else {
3967 fprintf(stderr,"Starting alignment\n");
3968 }
3969
3970 stopwatch = Stopwatch_new();
3971 Stopwatch_start(stopwatch);
3972
3973 if (split_output_root == NULL && output_file == NULL) {
3974 pthread_create(&write_stdout_thread_id,&thread_attr_detach,Master_write_stdout,(void *) NULL);
3975 }
3976 pthread_create(&parser_thread_id,&thread_attr_detach,Master_parser,(void *) master);
3977 Master_mpi_interface((void *) master); /* Can run as a normal procedure, not as a thread */
3978
3979 } else {
3980 worker_setup(transcriptomesubdir,transcriptome_fileroot,genomesubdir,genome_fileroot);
3981 FREE(transcriptomesubdir);
3982 FREE(transcriptome_fileroot);
3983 FREE(genomesubdir);
3984 FREE(genome_fileroot);
3985 MPI_Barrier(workers_comm);
3986
3987 outbuffer = Outbuffer_new(output_buffer_size,/*nread*/0);
3988 Inbuffer_set_outbuffer(inbuffer,outbuffer);
3989 /* MPI worker ranks continue on with creating output_thread and worker_threads below */
3990
3991 if (myid == 0) {
3992 Inbuffer_set_master(inbuffer,master);
3993
3994 if (output_file != NULL) {
3995 fprintf(stderr,"Starting alignment. Writing results to %s\n",output_file);
3996 } else if (split_output_root != NULL) {
3997 fprintf(stderr,"Starting alignment. Writing results to %s.*\n",split_output_root);
3998 } else {
3999 fprintf(stderr,"Starting alignment\n");
4000 }
4001 stopwatch = Stopwatch_new();
4002 Stopwatch_start(stopwatch);
4003 }
4004
4005 #else
4006 Access_setup(preload_shared_memory_p,unload_shared_memory_p);
4007 worker_setup(transcriptomesubdir,transcriptome_fileroot,genomesubdir,genome_fileroot);
4008 FREE(transcriptomesubdir);
4009 FREE(transcriptome_fileroot);
4010 FREE(genomesubdir);
4011 FREE(genome_fileroot);
4012 if (preload_shared_memory_p == true || unload_shared_memory_p == true) {
4013 worker_cleanup();
4014 return 0;
4015 }
4016
4017 outbuffer = Outbuffer_new(output_buffer_size,nread);
4018 Inbuffer_set_outbuffer(inbuffer,outbuffer);
4019
4020 if (output_file != NULL) {
4021 fprintf(stderr,"Starting alignment. Writing results to %s\n",output_file);
4022 } else if (split_output_root != NULL) {
4023 fprintf(stderr,"Starting alignment. Writing results to %s.*\n",split_output_root);
4024 } else {
4025 fprintf(stderr,"Starting alignment\n");
4026 }
4027 stopwatch = Stopwatch_new();
4028 Stopwatch_start(stopwatch);
4029 #endif
4030
4031
4032
4033 #if !defined(HAVE_PTHREAD)
4034 /* Serial version */
4035 single_thread();
4036
4037 #else
4038 /* Pthreads version */
4039 if (nthreads == 0) {
4040 single_thread();
4041
4042 } else if (multiple_sequences_p == false) {
4043 single_thread();
4044
4045 } else {
4046 pthread_attr_init(&thread_attr_join);
4047 if ((ret = pthread_attr_setdetachstate(&thread_attr_join,PTHREAD_CREATE_JOINABLE)) != 0) {
4048 fprintf(stderr,"ERROR: pthread_attr_setdetachstate returned %d\n",ret);
4049 exit(1);
4050 }
4051
4052 #ifdef USE_MPI
4053 /* Master rank that is working or a Slave rank */
4054 if (myid == 0) {
4055 if (split_output_root == NULL && output_file == NULL) {
4056 pthread_create(&write_stdout_thread_id,&thread_attr_detach,Master_write_stdout,(void *) NULL);
4057 }
4058 pthread_create(&parser_thread_id,&thread_attr_detach,Master_parser,(void *) master);
4059 pthread_create(&mpi_interface_thread_id,&thread_attr_join,Master_mpi_interface,(void *) master);
4060 }
4061 #endif
4062
4063 worker_thread_ids = (pthread_t *) CALLOC(nthreads,sizeof(pthread_t));
4064 Except_init_pthread();
4065 pthread_key_create(&global_request_key,NULL);
4066
4067 if (orderedp == true) {
4068 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_ordered,
4069 (void *) outbuffer);
4070 } else {
4071 pthread_create(&output_thread_id,&thread_attr_join,Outbuffer_thread_anyorder,
4072 (void *) outbuffer);
4073 }
4074
4075 for (worker_id = 0; worker_id < nthreads; worker_id++) {
4076 /* Need to have worker threads finish before we call Inbuffer_free() */
4077 pthread_create(&(worker_thread_ids[worker_id]),&thread_attr_join,worker_thread,(void *) worker_id);
4078 }
4079
4080 pthread_join(output_thread_id,NULL);
4081 for (worker_id = 0; worker_id < nthreads; worker_id++) {
4082 pthread_join(worker_thread_ids[worker_id],NULL);
4083 }
4084 #ifdef USE_MPI
4085 if (myid == 0) {
4086 pthread_join(mpi_interface_thread_id,NULL);
4087 }
4088 #endif
4089
4090 pthread_key_delete(global_request_key);
4091 /* Do not delete global_except_key, because worker threads might still need it */
4092 /* Except_term_pthread(); */
4093
4094 FREE(worker_thread_ids);
4095
4096 }
4097 #endif /* HAVE_PTHREAD */
4098
4099 #ifdef USE_MPI
4100 /* MPI worker ranks finished with creating output_thread and worker_threads below */
4101 }
4102 #endif
4103
4104
4105 /* Note: Shortread and Sequence procedures should close their own input files */
4106 #ifdef USE_MPI
4107 if (myid == 0) {
4108 runtime = Stopwatch_stop(stopwatch);
4109 Stopwatch_free(&stopwatch);
4110
4111 nread = Master_ntotal(master);
4112 fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
4113 nread,runtime,(double) nread/runtime);
4114 /* Master_free(&master); -- Master_parser thread still needs this */
4115 }
4116
4117 if (myid > 0 || master_is_worker_p) {
4118 Outbuffer_free(&outbuffer);
4119 Inbuffer_free(&inbuffer);
4120 }
4121
4122 Outbuffer_close_files(); /* All ranks have to close the files */
4123
4124 #else
4125 /* Single CPU or Pthreads version */
4126 runtime = Stopwatch_stop(stopwatch);
4127 Stopwatch_free(&stopwatch);
4128
4129 nread = Outbuffer_nread(outbuffer);
4130 /* nbeyond = Outbuffer_nbeyond(outbuffer); */
4131 fprintf(stderr,"Processed %u queries in %.2f seconds (%.2f queries/sec)\n",
4132 nread,runtime,(double) nread/runtime);
4133
4134 Outbuffer_free(&outbuffer);
4135 Inbuffer_free(&inbuffer);
4136
4137 Outbuffer_close_files();
4138 #endif
4139
4140 Outbuffer_cleanup();
4141
4142 #ifdef USE_MPI
4143 if (myid > 0 || master_is_worker_p == true) {
4144 worker_cleanup();
4145 MPI_Comm_free(&workers_comm);
4146 }
4147 MPI_Barrier(MPI_COMM_WORLD); /* Make sure all processes have cleaned up */
4148 MPI_Finalize();
4149 #else
4150 worker_cleanup();
4151 #endif
4152
4153 return 0;
4154 }
4155
4156
4157 static void
print_program_usage()4158 print_program_usage () {
4159 fprintf(stdout,"\
4160 Usage: gsnap [OPTIONS...] <FASTA file>, or\n\
4161 cat <FASTA file> | gmap [OPTIONS...]\n\
4162 \n\
4163 ");
4164
4165 /* Input options */
4166 fprintf(stdout,"Input options (must include -d)\n");
4167 fprintf(stdout,"\
4168 -D, --dir=directory Genome directory. Default (as specified by --with-gmapdb to the configure program) is\n\
4169 %s\n\
4170 ",GMAPDB);
4171 fprintf(stdout,"\
4172 -d, --db=STRING Genome database\n\
4173 --use-localdb=INT Whether to use the local hash tables, which help with finding extensions to the ends\n\
4174 of alignments in the presence of splicing or indels (0=no, 1=yes if available (default))\n\
4175 \n\
4176 ");
4177
4178 /* Transcriptome-guided options */
4179 fprintf(stdout,"Transcriptome-guided options (optional)\n");
4180 fprintf(stdout,"\
4181 -C, --transcriptdir=directory Transcriptome directory. Default is the value for --dir above\n\
4182 -c, --transcriptdb=STRING Transcriptome database\n\
4183 --use-transcriptome-only Use only the transcriptome index and not the genome index\n\
4184 \n\
4185 ");
4186
4187 /* Computation options */
4188 fprintf(stdout,"Computation options\n");
4189 fprintf(stdout,"\
4190 -k, --kmer=INT kmer size to use in genome database (allowed values: 16 or less)\n\
4191 If not specified, the program will find the highest available\n\
4192 kmer size in the genome database\n\
4193 --sampling=INT Sampling to use in genome database. If not specified, the program\n\
4194 will find the smallest available sampling value in the genome database\n\
4195 within selected k-mer size\n\
4196 -q, --part=INT/INT Process only the i-th out of every n sequences\n\
4197 e.g., 0/100 or 99/100 (useful for distributing jobs\n\
4198 to a computer farm).\n\
4199 ");
4200 fprintf(stdout,"\
4201 --input-buffer-size=INT Size of input buffer (program reads this many sequences\n\
4202 at a time for efficiency) (default %d)\n\
4203 ",inbuffer_nspaces);
4204
4205 fprintf(stdout,"\
4206 --barcode-length=INT Amount of barcode to remove from start of every read before alignment\n\
4207 (default %d)\n\
4208 ",barcode_length);
4209
4210 fprintf(stdout,"\
4211 --endtrim-length=INT Amount of trim to remove from the end of every read before alignment\n\
4212 (default %d)\n\
4213 ",endtrim_length);
4214
4215 fprintf(stdout,"\
4216 --orientation=STRING Orientation of paired-end reads\n\
4217 Allowed values: FR (fwd-rev, or typical Illumina; default),\n\
4218 RF (rev-fwd, for circularized inserts), or FF (fwd-fwd, same strand)\n\
4219 --fastq-id-start=INT Starting position of identifier in FASTQ header, space-delimited (>= 1)\n\
4220 --fastq-id-end=INT Ending position of identifier in FASTQ header, space-delimited (>= 1)\n\
4221 Examples:\n\
4222 @HWUSI-EAS100R:6:73:941:1973#0/1\n\
4223 start=1, end=1 (default) => identifier is HWUSI-EAS100R:6:73:941:1973#0\n\
4224 @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36\n\
4225 start=1, end=1 => identifier is SRR001666.1\n\
4226 start=2, end=2 => identifier is 071112_SLXA-EAS1_s_7:5:1:817:345\n\
4227 start=1, end=2 => identifier is SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345\n\
4228 --force-single-end When multiple FASTQ files are provided on the command line, GSNAP assumes\n\
4229 they are matching paired-end files. This flag treats each file as single-end.\n\
4230 --filter-chastity=STRING Skips reads marked by the Illumina chastity program. Expecting a string\n\
4231 after the accession having a 'Y' after the first colon, like this:\n\
4232 @accession 1:Y:0:CTTGTA\n\
4233 where the 'Y' signifies filtering by chastity.\n\
4234 Values: off (default), either, both. For 'either', a 'Y' on either end\n\
4235 of a paired-end read will be filtered. For 'both', a 'Y' is required\n\
4236 on both ends of a paired-end read (or on the only end of a single-end read).\n\
4237 --allow-pe-name-mismatch Allows accession names of reads to mismatch in paired-end files\n\
4238 --interleaved Input is in interleaved format (one read per line, tab-delimited\n\
4239 ");
4240 #ifdef HAVE_ZLIB
4241 fprintf(stdout,"\
4242 --gunzip Uncompress gzipped input files\n\
4243 ");
4244 #endif
4245 #ifdef HAVE_BZLIB
4246 fprintf(stdout,"\
4247 --bunzip2 Uncompress bzip2-compressed input files\n\
4248 ");
4249 #endif
4250 fprintf(stdout,"\n");
4251
4252 /* Computation options */
4253 fprintf(stdout,"Computation options\n");
4254 #if 0
4255 fprintf(stdout,"\
4256 \n\
4257 Note: GSNAP has an ultrafast algorithm for calculating mismatches up to and including\n\
4258 ((readlength+2)/kmer - 2) (\"ultrafast mismatches\"). The program will run fastest if\n\
4259 max-mismatches (plus suboptimal-levels) is within that value.\n\
4260 Also, indels, especially end indels, take longer to compute, although the algorithm\n\
4261 is still designed to be fast.\n\
4262 \n\
4263 ");
4264 #endif
4265
4266 #ifdef HAVE_MMAP
4267 fprintf(stdout,"\
4268 -B, --batch=INT Batch mode (default = 2)\n\
4269 Mode Hash offsets Hash positions Genome Local hash offsets Local hash positions\n\
4270 0 allocate mmap mmap allocate mmap\n\
4271 1 allocate mmap & preload mmap allocate mmap & preload\n\
4272 2 allocate mmap & preload mmap & preload allocate mmap & preload\n\
4273 3 allocate allocate mmap & preload allocate allocate\n\
4274 (default) 4 allocate allocate allocate allocate allocate\n\
4275 Note: For a single sequence, all data structures use mmap\n\
4276 A batch level of 5 means the same as 4, and is kept only for backward compatibility\n\
4277 ");
4278 #else
4279 fprintf(stdout,"\
4280 -B, --batch=INT Batch mode (default = 4, modes 0-3 disallowed because program configured without mmap)\n\
4281 Mode Hash offsets Hash positions Genome Local hash offsets Local hash positions\n\
4282 (default) 4 allocate allocate allocate allocate allocate\n\
4283 ");
4284 #endif
4285 fprintf(stdout,"\
4286 --use-shared-memory=INT If 1, then allocated memory is shared among all processes on this node\n\
4287 If 0 (default), then each process has private allocated memory\n\
4288 --preload-shared-memory Load files indicated by --batch mode into shared memory for use by other\n\
4289 GMAP/GSNAP processes on this node, and then exit. Ignore any input files.\n\
4290 --unload-shared-memory Unload files indicated by --batch mode into shared memory, or allow them\n\
4291 to be unloaded when existing GMAP/GSNAP processes on this node are finished\n\
4292 with them. Ignore any input files.\n\
4293 ");
4294
4295 fprintf(stdout,"\
4296 -m, --max-mismatches=FLOAT Maximum number of mismatches allowed (if not specified, then\n\
4297 GSNAP tries to find the best possible match in the genome)\n\
4298 If specified between 0.0 and 1.0, then treated as a fraction\n\
4299 of each read length. Otherwise, treated as an integral number\n\
4300 of mismatches (including indel and splicing penalties).\n\
4301 Default is to be unspecified (to find the best possible match)\n\
4302 --max-ref-mismatches=FLOAT If GSNAP is run under SNP-tolerant or masked genome mode, then the\n\
4303 --max-mismatches parameter above is for mismatches against reference and\n\
4304 SNPs, or against the unmasked genome, respectively. The --max-ref-mismatches\n\
4305 parameter is against mismatches against the reference genome or masked genome.\n\
4306 --min-coverage=FLOAT Minimum coverage required for an alignment.\n\
4307 If specified between 0.0 and 1.0, then treated as a fraction\n\
4308 of each read length. Otherwise, treated as an integral number\n\
4309 of base pairs. Default value is 0.5.\n\
4310 --filter-within-trims=INT Whether to count mismatches in trimmed part of alignment (1, yes) or\n\
4311 mismatches to the ends of the read (0, no), when applying the\n\
4312 --max-mismatches and --max-ref-mismatches parameters.\n\
4313 Default for RNA-Seq is 1 (yes) so we can allow for\n\
4314 reads that align past the ends of an exon. Default for DNA-Seq\n\
4315 is 0 (no).\n\
4316 For RNA-Seq, trimmed ends should be ignored, because trimming\n\
4317 is performed at probable splice sites, to allow for reads that\n\
4318 align past the ends of an exon.\n\
4319 --query-unk-mismatch=INT Whether to count unknown (N) characters in the query as a mismatch\n\
4320 (0=no (default), 1=yes)\n\
4321 --genome-unk-mismatch=INT Whether to count unknown (N) characters in the genome as a mismatch\n\
4322 (0=no, 1=yes). If --use-mask is specified, default is no, otherwise yes.\n\
4323 ");
4324
4325 #if 0
4326 fprintf(stdout,"\
4327 --maxsearch=INT Maximum number of alignments to find (default %d).\n\
4328 Must be larger than --npaths, which is the number to report.\n\
4329 Keeping this number large will allow for random selection among multiple alignments.\n\
4330 Reducing this number can speed up the program.\n\
4331 ",maxpaths_search);
4332 #endif
4333
4334 #if 0
4335 fprintf(stdout,"\
4336 -i, --indel-penalty-middle=INT Penalty for an indel in middle of read (default %d).\n\
4337 Counts against mismatches allowed. To find indels, make\n\
4338 indel-penalty less than or equal to max-mismatches\n\
4339 ",indel_penalty_middle);
4340 fprintf(stdout,"\
4341 -I, --indel-penalty-end=INT Penalty for an indel at end of read (default %d).\n\
4342 Counts against mismatches allowed. To find indels, make\n\
4343 indel-penalty less than or equal to max-mismatches\n\
4344 ",indel_penalty_end);
4345 #endif
4346
4347 fprintf(stdout,"\
4348 -i, --indel-penalty=INT Penalty for an indel (default %d).\n\
4349 Counts against mismatches allowed. To find indels, make\n\
4350 indel-penalty less than or equal to max-mismatches.\n\
4351 A value < 2 can lead to false positives at read ends\n\
4352 ",indel_penalty_middle);
4353
4354 #if 0
4355 /* No longer used */
4356 fprintf(stdout,"\
4357 -R, --masking=INT Masking of frequent/repetitive oligomers to avoid spending time\n\
4358 on non-unique or repetitive reads\n\
4359 0 = no masking (will try to find non-unique or repetitive matches)\n\
4360 1 = mask frequent oligomers\n\
4361 2 = mask frequent and repetitive oligomers (fastest) (default)\n\
4362 3 = greedy frequent: mask frequent oligomers first, then\n\
4363 try no masking if alignments not found\n\
4364 4 = greedy repetitive: mask frequent and repetitive oligomers first, then\n\
4365 try no masking if alignments not found\n\
4366 ");
4367 #endif
4368
4369 fprintf(stdout,"\
4370 --indel-endlength=INT Minimum length at end required for indel alignments (default %d)\n\
4371 ",min_indel_end_matches);
4372 fprintf(stdout,"\
4373 -y, --max-middle-insertions=FLOAT Maximum number of middle insertions allowed (default is %.1f)\n\
4374 If specified between 0.0 and 1.0, then treated as a fraction\n\
4375 of each read length. Otherwise, treated as an integral number\n\
4376 of base pairs\n\
4377 ",max_middle_insertions_float);
4378 fprintf(stdout,"\
4379 -z, --max-middle-deletions=FLOAT Maximum number of middle deletions allowed (default %.1f)\n\
4380 If specified between 0.0 and 1.0, then treated as a fraction\n\
4381 of each read length. Otherwise, treated as an integral number\n\
4382 of base pairs\n\
4383 ",max_middle_deletions_float);
4384 fprintf(stdout,"\
4385 -Y, --max-end-insertions=INT Maximum number of end insertions allowed (default %d)\n\
4386 ",max_end_insertions);
4387 fprintf(stdout,"\
4388 -Z, --max-end-deletions=INT Maximum number of end deletions allowed (default %d)\n\
4389 ",max_end_deletions);
4390 fprintf(stdout,"\
4391 -M, --suboptimal-levels=INT Report suboptimal hits beyond best hit (default %d)\n\
4392 All hits with best score plus suboptimal-levels are reported\n\
4393 ",subopt_levels);
4394 fprintf(stdout,"\
4395 -a, --adapter-strip=STRING Method for removing adapters from reads. Currently allowed values: off, paired.\n\
4396 Default is \"off\". To turn on, specify \"paired\", which removes adapters\n\
4397 from paired-end reads if they appear to be present.\n\
4398 ");
4399 fprintf(stdout,"\
4400 --trim-indel-score=INT Score to use for indels when trimming at ends. To turn off trimming,\n\
4401 specify 0. Default is %d for both RNA-Seq and DNA-Seq. Warning:\n\
4402 Turning trimming off in RNA-Seq can give false positive indels\n\
4403 at the ends of reads\n\
4404 ",trim_indel_score);
4405
4406 fprintf(stdout,"\
4407 -e, --use-mask=STRING Use genome containing masks (e.g. for non-exons) for scoring preference\n\
4408 -V, --snpsdir=STRING Directory for SNPs index files (created using snpindex) (default is\n\
4409 location of genome index files specified using -D and -d)\n \
4410 -v, --use-snps=STRING Use database containing known SNPs (in <STRING>.iit, built\n\
4411 previously using snpindex) for tolerance to SNPs\n\
4412 --cmetdir=STRING Directory for methylcytosine index files (created using cmetindex)\n\
4413 (default is location of genome index files specified using -D, -V, and -d)\n\
4414 --atoidir=STRING Directory for A-to-I RNA editing index files (created using atoiindex)\n\
4415 (default is location of genome index files specified using -D, -V, and -d)\n\
4416 --mode=STRING Alignment mode: standard (default), cmet-stranded, cmet-nonstranded,\n\
4417 atoi-stranded, atoi-nonstranded, ttoc-stranded, or ttoc-nonstranded.\n\
4418 Non-standard modes requires you to have previously run the cmetindex\n\
4419 or atoiindex programs (which also cover the ttoc modes) on the genome\n\
4420 ");
4421
4422
4423 #if 0
4424 fprintf(stdout,"\
4425 --tallydir=STRING Directory for tally IIT file to resolve concordant multiple alignments (default is\n\
4426 location of genome index files specified using -D and -d). Note: can\n\
4427 just give full path name to --use-tally instead.\n\
4428 --use-tally=STRING Use this tally IIT file to resolve concordant multiple alignments\n\
4429 --runlengthdir=STRING Directory for runlength IIT file to resolve concordant multiple alignments (default is\n\
4430 location of genome index files specified using -D and -d). Note: can\n\
4431 just give full path name to --use-runlength instead.\n\
4432 --use-runlength=STRING Use this runlength IIT file to resolve concordant multiple alignments\n\
4433 ");
4434 #endif
4435
4436
4437 #ifdef HAVE_PTHREAD
4438 fprintf(stdout,"\
4439 -t, --nthreads=INT Number of worker threads\n\
4440 ");
4441 #else
4442 fprintf(stdout,"\
4443 -t, --nthreads=INT Number of worker threads. Flag is ignored in this version of GSNAP, which has pthreads disabled\n\
4444 ");
4445 #endif
4446
4447 fprintf(stdout,"\
4448 --max-anchors=INT Controls number of candidate segments returned by the complete set algorithm\n\
4449 Default is 10. Can be increased to higher values to solve alignments with\n\
4450 evenly spaced mismatches at close distances. However, higher values will\n\
4451 cause GSNAP to run more slowly. A value of 1000, for example, slows down\n\
4452 the program by a factor of 10 or so. Therefore, change this value only if\n\
4453 absolutely necessary.\n\
4454 ");
4455
4456 #if 0
4457 fprintf(stdout,"\n");
4458 fprintf(stdout,"Options for GMAP alignment within GSNAP\n");
4459 fprintf(stdout,"\
4460 --gmap-mode=STRING Cases to use GMAP for complex alignments containing multiple splices or indels\n\
4461 Allowed values: none, all, pairsearch, ends, improve\n\
4462 (or multiple values, separated by commas).\n\
4463 Default: all, i.e., pairsearch,ends,improve\n\
4464 ");
4465 fprintf(stdout,"\
4466 --trigger-score-for-gmap=INT Try GMAP pairsearch on nearby genomic regions if best score (the total\n\
4467 of both ends if paired-end) exceeds this value (default %d)\n\
4468 ",trigger_score_for_gmap);
4469 fprintf(stdout,"\
4470 --gmap-min-match-length=INT Keep GMAP hit only if it has this many consecutive matches (default %d)\n\
4471 ",gmap_min_nconsecutive);
4472 fprintf(stdout,"\
4473 --gmap-allowance=INT Extra mismatch/indel score allowed for GMAP alignments (default %d)\n\
4474 ",gmap_allowance);
4475 fprintf(stdout,"\
4476 --max-gmap-pairsearch=INT Perform GMAP pairsearch on nearby genomic regions up to this many\n\
4477 many candidate ends (default %d). Requires pairsearch in --gmap-mode\n\
4478 ",max_gmap_pairsearch);
4479 fprintf(stdout,"\
4480 --max-gmap-terminal=INT Perform GMAP terminal on nearby genomic regions up to this many\n\
4481 candidate ends (default %d). Requires terminal in --gmap-mode\n\
4482 ",max_gmap_terminal);
4483 fprintf(stdout,"\
4484 --max-gmap-improvement=INT Perform GMAP improvement on nearby genomic regions up to this many\n\
4485 candidate ends (default %d). Requires improve in --gmap-mode\n\
4486 ",max_gmap_improvement);
4487 fprintf(stdout,"\n");
4488 #endif
4489
4490 #if 0
4491 fprintf(stdout,"Genes options for RNA-Seq\n");
4492 fprintf(stdout,"\
4493 -g, --genes=STRING Look for known genes in <STRING>.iit, to be used for resolving\n\
4494 multiple mapping reads. See README instructions for the correct\n\
4495 formatting of a genes IIT file.\n\
4496 --favor-multiexon In resolving multiple mapping reads, overlaps with known\n\
4497 multi-exon genes are favored over those with known single-exon\n\
4498 genes. This favors spliced genes over psuedogenes.\n\
4499 ");
4500 fprintf(stdout,"\n");
4501 #endif
4502
4503
4504 /* Splicing options */
4505 fprintf(stdout,"Splicing options for DNA-Seq\n");
4506 fprintf(stdout,"\
4507 --find-dna-chimeras=INT Look for distant splicing involving poor splice sites (0=no, 1=yes (default))\n\
4508 Current implementation improves even RNA-Seq alignments at poor splice sites\n\
4509 so it is now on by default.\n\
4510 ");
4511 fprintf(stdout,"\n");
4512
4513 /* Splicing options */
4514 fprintf(stdout,"Splicing options for RNA-Seq\n");
4515 fprintf(stdout,"\
4516 -N, --novelsplicing=INT Look for novel splicing (0=no (default), 1=yes)\n\
4517 --splicingdir=STRING Directory for splicing involving known sites or known introns,\n\
4518 as specified by the -s or --use-splicing flag (default is\n\
4519 directory computed from -D and -d flags). Note: can\n\
4520 just give full pathname to the -s flag instead.\n\
4521 -s, --use-splicing=STRING Look for splicing involving known sites or known introns\n\
4522 (in <STRING>.iit), at short or long distances\n\
4523 See README instructions for the distinction between known sites\n\
4524 and known introns\n\
4525 --ambig-splice-noclip For ambiguous known splicing at ends of the read, do not clip at the\n\
4526 splice site, but extend instead into the intron. This flag makes\n\
4527 sense only if you provide the --use-splicing flag, and you are trying\n\
4528 to eliminate all soft clipping with --trim-mismatch-score=0\n\
4529 ");
4530 fprintf(stdout,"\
4531 -w, --localsplicedist=INT Definition of local novel splicing event (default %d)\n\
4532 ",shortsplicedist);
4533 fprintf(stdout,"\
4534 --novelend-splicedist=INT Distance to look for novel splices at the ends of reads (default %d)\n\
4535 ",shortsplicedist_novelend);
4536 fprintf(stdout,"\
4537 --local-splice-penalty=INT Penalty for a local splice (default %d). Counts against mismatches allowed\n\
4538 ",localsplicing_penalty);
4539 fprintf(stdout,"\
4540 --distant-splice-penalty=INT Penalty for a distant splice (default %d). A distant splice is one where\n\
4541 the intron length exceeds the value of -w, or --localsplicedist, or is an\n\
4542 inversion, scramble, or translocation between two different chromosomes\n\
4543 Counts against mismatches allowed\n\
4544 ",distantsplicing_penalty);
4545 fprintf(stdout,"\
4546 -K, --distant-splice-endlength=INT Minimum length at end required for distant spliced alignments (default %d, min\n\
4547 allowed is the value of -k, or kmer size)\n\
4548 ",min_distantsplicing_end_matches);
4549 fprintf(stdout,"\
4550 -l, --shortend-splice-endlength=INT Minimum length at end required for short-end spliced alignments (default %d,\n\
4551 but unless known splice sites are provided with the -s flag, GSNAP may still\n\
4552 need the end length to be the value of -k, or kmer size to find a given splice\n\
4553 ",min_shortend);
4554 fprintf(stdout,"\
4555 --distant-splice-identity=FLOAT Minimum identity at end required for distant spliced alignments (default %.2f)\n\
4556 ",min_distantsplicing_identity);
4557 fprintf(stdout,"\
4558 --antistranded-penalty=INT (Not currently implemented, since it leads to poor results)\n\
4559 Penalty for antistranded splicing when using stranded RNA-Seq protocols.\n\
4560 A positive value, such as 1, expects antisense on the first read\n\
4561 and sense on the second read. Default is 0, which treats sense and antisense\n\
4562 equally well\n\
4563 --merge-distant-samechr Report distant splices on the same chromosome as a single splice, if possible.\n\
4564 Will produce a single SAM line instead of two SAM lines, which is also done\n\
4565 for translocations, inversions, and scramble events\n\
4566 ");
4567 fprintf(stdout,"\n");
4568
4569
4570 /* Paired-end options */
4571 fprintf(stdout,"Options for paired-end reads\n");
4572 fprintf(stdout,"\
4573 --pairmax-dna=INT Max total genomic length for DNA-Seq paired reads, or other reads\n\
4574 without splicing (default %d). Used if -N or -s is not specified.\n\
4575 This value is also used for circular chromosomes when splicing in\n\
4576 linear chromosomes is allowed\n\
4577 ",pairmax_dna);
4578 fprintf(stdout,"\
4579 --pairmax-rna=INT Max total genomic length for RNA-Seq paired reads, or other reads\n\
4580 that could have a splice (default %d). Used if -N or -s is specified.\n\
4581 Should probably match the value for -w, --localsplicedist.\n\
4582 ",pairmax_rna);
4583 fprintf(stdout,"\
4584 --pairexpect=INT Expected paired-end length, used for calling splices in medial part\n\
4585 of paired-end reads (default %d). Was turned off in previous versions, but reinstated.\n\
4586 ",expected_pairlength);
4587 fprintf(stdout,"\
4588 --pairdev=INT Allowable deviation from expected paired-end length, used for\n\
4589 calling splices in medial part of paired-end reads (default %d).\n\
4590 Was turned off in previous versions, but reinstated.\n\
4591 ",pairlength_deviation);
4592 fprintf(stdout,"\n");
4593
4594
4595 /* Quality score options */
4596 fprintf(stdout,"Options for quality scores\n");
4597 fprintf(stdout,"\
4598 --quality-protocol=STRING Protocol for input quality scores. Allowed values:\n\
4599 illumina (ASCII 64-126) (equivalent to -J 64 -j -31)\n\
4600 sanger (ASCII 33-126) (equivalent to -J 33 -j 0)\n\
4601 Default is sanger (no quality print shift)\n\
4602 SAM output files should have quality scores in sanger protocol\n\
4603 \n\
4604 Or you can customize this behavior with these flags:\n\
4605 -J, --quality-zero-score=INT FASTQ quality scores are zero at this ASCII value\n\
4606 (default is 33 for sanger protocol; for Illumina, select 64)\n\
4607 -j, --quality-print-shift=INT Shift FASTQ quality scores by this amount in output\n\
4608 (default is 0 for sanger protocol; to change Illumina input\n\
4609 to Sanger output, select -31)\n\
4610 ");
4611
4612 /* Output options */
4613 fprintf(stdout,"Output options\n");
4614 fprintf(stdout,"\
4615 -n, --npaths=INT Maximum number of paths to print (default %d).\n\
4616 ",maxpaths_report);
4617 fprintf(stdout,"\
4618 -Q, --quiet-if-excessive If more than maximum number of paths are found,\n\
4619 then nothing is printed.\n\
4620 -O, --ordered Print output in same order as input (relevant\n\
4621 only if there is more than one worker thread)\n\
4622 --show-refdiff For GSNAP output in SNP-tolerant alignment, shows all differences\n\
4623 relative to the reference genome as lower case (otherwise, it shows\n\
4624 all differences relative to both the reference and alternate genome)\n\
4625 --clip-overlap For paired-end reads whose alignments overlap, clip the overlapping region.\n\
4626 --merge-overlap For paired-end reads whose alignments overlap, merge the two ends into a single end (beta implementation)\n\
4627 --print-snps Print detailed information about SNPs in reads (works only if -v also selected)\n\
4628 (not fully implemented yet)\n\
4629 --failsonly Print only failed alignments, those with no results\n\
4630 --nofails Exclude printing of failed alignments\n\
4631 ");
4632
4633 fprintf(stdout,"\
4634 -A, --format=STRING Another format type, other than default.\n\
4635 Currently implemented: sam, m8 (BLAST tabular format)\n\
4636 ");
4637
4638 fprintf(stdout,"\
4639 --split-output=STRING Basename for multiple-file output, separately for nomapping,\n\
4640 halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult,\n\
4641 paired_uniq, paired_mult, concordant_uniq, and concordant_mult results\n\
4642 -o, --output-file=STRING File name for a single stream of output results.\n\
4643 --failed-input=STRING Print completely failed alignments as input FASTA or FASTQ format,\n\
4644 to the given file, appending .1 or .2, for paired-end data.\n\
4645 If the --split-output flag is also given, this file is generated\n\
4646 in addition to the output in the .nomapping file.\n\
4647 --append-output When --split-output or --failed-input is given, this flag will append output\n\
4648 to the existing files. Otherwise, the default is to create new files.\n\
4649 --order-among-best=STRING Among alignments tied with the best score, order those alignments in this order.\n\
4650 Allowed values: genomic, random (default)\n\
4651 ");
4652 fprintf(stdout,"\
4653 --output-buffer-size=INT Buffer size, in queries, for output thread (default %d). When the number\n\
4654 of results to be printed exceeds this size, the worker threads are halted\n\
4655 until the backlog is cleared\n\
4656 ",output_buffer_size);
4657 fprintf(stdout,"\n");
4658
4659 /* SAM options */
4660 fprintf(stdout,"Options for SAM output\n");
4661 fprintf(stdout,"\
4662 --no-sam-headers Do not print headers beginning with '@'\n\
4663 --add-paired-nomappers Add nomapper lines as needed to make all paired-end results alternate\n\
4664 between first end and second end\n\
4665 --paired-flag-means-concordant=INT Whether the paired bit in the SAM flags means concordant only (1)\n\
4666 or paired plus concordant (0, default)\n\
4667 --sam-headers-batch=INT Print headers only for this batch, as specified by -q\n\
4668 --sam-hardclip-use-S Use S instead of H for hardclips\n\
4669 --sam-use-0M Insert 0M in CIGAR between adjacent insertions, deletions, and introns\n\
4670 Picard disallows 0M, other tools may require it\n\
4671 --sam-extended-cigar Use extended CIGAR format (using X and = symbols instead of M,\n\
4672 to indicate matches and mismatches, respectively\n\
4673 --sam-multiple-primaries Allows multiple alignments to be marked as primary if they\n\
4674 have equally good mapping scores\n\
4675 --sam-sparse-secondaries For secondary alignments (in multiple mappings), uses '*' for SEQ\n\
4676 and QUAL fields, to give smaller file sizes. However, the output\n\
4677 will give warnings in Picard to give warnings and may not work\n\
4678 with downstream tools\n\
4679 --force-xs-dir For RNA-Seq alignments, disallows XS:A:? when the sense direction\n\
4680 is unclear, and replaces this value arbitrarily with XS:A:+.\n\
4681 May be useful for some programs, such as Cufflinks, that cannot\n\
4682 handle XS:A:?. However, if you use this flag, the reported value\n\
4683 of XS:A:+ in these cases will not be meaningful.\n\
4684 --md-lowercase-snp In MD string, when known SNPs are given by the -v flag,\n\
4685 prints difference nucleotides as lower-case when they,\n\
4686 differ from reference but match a known alternate allele\n\
4687 --extend-soft-clips Extends alignments through soft clipped regions\n\
4688 --action-if-cigar-error Action to take if there is a disagreement between CIGAR length and sequence length\n\
4689 Allowed values: ignore, warning (default), noprint, abort\n\
4690 Note that the noprint option does not print the CIGAR string at all if there\n\
4691 is an error, so it may break a SAM parser\n\
4692 --read-group-id=STRING Value to put into read-group id (RG-ID) field\n\
4693 --read-group-name=STRING Value to put into read-group name (RG-SM) field\n\
4694 --read-group-library=STRING Value to put into read-group library (RG-LB) field\n\
4695 --read-group-platform=STRING Value to put into read-group library (RG-PL) field\n\
4696 ");
4697 fprintf(stdout,"\n");
4698
4699 #ifdef USE_MPI
4700 fprintf(stdout,"Options for MPI\n");
4701 fprintf(stdout,"\
4702 --master-is-worker=INT Determines whether master node allocates threads for performing computation\n\
4703 in addition to coordinating input and output. Number of worker threads\n\
4704 will be --nthreads minus 2\n\
4705 Values: 0 (no, default), 1 (yes if enough worker threads available)\n\
4706 ");
4707 fprintf(stdout,"\n");
4708 #endif
4709
4710 /* Help options */
4711 fprintf(stdout,"Help options\n");
4712 fprintf(stdout,"\
4713 --check Check compiler assumptions\n\
4714 --version Show version\n\
4715 --help Show this help message\n\
4716 ");
4717 return;
4718 }
4719
4720