1 /*
2 * Copyright 2014, Daehwan Kim <infphilo@gmail.com>
3 *
4 * This file is part of HISAT.
5 *
6 * HISAT is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * HISAT is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with HISAT. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <stdlib.h>
21 #include <iostream>
22 #include <fstream>
23 #include <string>
24 #include <cassert>
25 #include <stdexcept>
26 #include <getopt.h>
27 #include <math.h>
28 #include <utility>
29 #include <limits>
30 #include "alphabet.h"
31 #include "assert_helpers.h"
32 #include "endian_swap.h"
33 #include "bt2_idx.h"
34 #include "bt2_io.h"
35 #include "bt2_util.h"
36 #include "hier_idx.h"
37 #include "formats.h"
38 #include "sequence_io.h"
39 #include "tokenize.h"
40 #include "aln_sink.h"
41 #include "pat.h"
42 #include "threading.h"
43 #include "ds.h"
44 #include "aligner_metrics.h"
45 #include "sam.h"
46 #include "aligner_seed.h"
47 #include "splice_site.h"
48 #include "bp_aligner.h"
49 #include "aligner_seed_policy.h"
50 #include "aligner_driver.h"
51 #include "aligner_sw.h"
52 #include "aligner_sw_driver.h"
53 #include "aligner_cache.h"
54 #include "util.h"
55 #include "pe.h"
56 #include "simple_func.h"
57 #include "presets.h"
58 #include "opts.h"
59 #include "outq.h"
60 #include "aligner_seed2.h"
61
62 using namespace std;
63
64 static EList<string> mates1; // mated reads (first mate)
65 static EList<string> mates2; // mated reads (second mate)
66 static EList<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
67 static string adjIdxBase;
68 bool gColor; // colorspace (not supported)
69 int gVerbose; // be talkative
70 static bool startVerbose; // be talkative at startup
71 int gQuiet; // print nothing but the alignments
72 static int sanityCheck; // enable expensive sanity checks
73 static int format; // default read format is FASTQ
74 static string origString; // reference text, or filename(s)
75 static int seed; // srandom() seed
76 static int timing; // whether to report basic timing data
77 static int metricsIval; // interval between alignment metrics messages (0 = no messages)
78 static string metricsFile;// output file to put alignment metrics in
79 static bool metricsStderr;// output file to put alignment metrics in
80 static bool metricsPerRead; // report a metrics tuple for every read
81 static bool allHits; // for multihits, report just one
82 static bool showVersion; // just print version and quit?
83 static int ipause; // pause before maching?
84 static uint32_t qUpto; // max # of queries to read
85 int gTrim5; // amount to trim from 5' end
86 int gTrim3; // amount to trim from 3' end
87 static int offRate; // keep default offRate
88 static bool solexaQuals; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
89 static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
90 static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
91 static int nthreads; // number of pthreads operating concurrently
92 static int outType; // style of output
93 static bool noRefNames; // true -> print reference indexes; not names
94 static uint32_t khits; // number of hits per read; >1 is much slower
95 static uint32_t mhits; // don't report any hits if there are > mhits
96 static int partitionSz; // output a partitioning key in first field
97 static bool useSpinlock; // false -> don't use of spinlocks even if they're #defines
98 static bool fileParallel; // separate threads read separate input files in parallel
99 static bool useShmem; // use shared memory to hold the index
100 static bool useMm; // use memory-mapped files to hold the index
101 static bool mmSweep; // sweep through memory-mapped files immediately after mapping
102 int gMinInsert; // minimum insert size
103 int gMaxInsert; // maximum insert size
104 bool gMate1fw; // -1 mate aligns in fw orientation on fw strand
105 bool gMate2fw; // -2 mate aligns in rc orientation on fw strand
106 bool gFlippedMatesOK; // allow mates to be in wrong order
107 bool gDovetailMatesOK; // allow one mate to extend off the end of the other
108 bool gContainMatesOK; // allow one mate to contain the other in PE alignment
109 bool gOlapMatesOK; // allow mates to overlap in PE alignment
110 bool gExpandToFrag; // incr max frag length to =larger mate len if necessary
111 bool gReportDiscordant; // find and report discordant paired-end alignments
112 bool gReportMixed; // find and report unpaired alignments for paired reads
113 static uint32_t cacheLimit; // ranges w/ size > limit will be cached
114 static uint32_t cacheSize; // # words per range cache
115 static uint32_t skipReads; // # reads/read pairs to skip
116 bool gNofw; // don't align fw orientation of read
117 bool gNorc; // don't align rc orientation of read
118 static uint32_t fastaContLen;
119 static uint32_t fastaContFreq;
120 static bool hadoopOut; // print Hadoop status and summary messages
121 static bool fuzzy;
122 static bool fullRef;
123 static bool samTruncQname; // whether to truncate QNAME to 255 chars
124 static bool samOmitSecSeqQual; // omit SEQ/QUAL for 2ndary alignments?
125 static bool samNoUnal; // don't print records for unaligned reads
126 static bool samNoHead; // don't print any header lines in SAM output
127 static bool samNoSQ; // don't print @SQ header lines
128 static bool sam_print_as;
129 static bool sam_print_xs; // XS:i
130 static bool sam_print_xss; // Xs:i and Ys:i
131 static bool sam_print_yn; // YN:i and Yn:i
132 static bool sam_print_xn;
133 static bool sam_print_cs;
134 static bool sam_print_cq;
135 static bool sam_print_x0;
136 static bool sam_print_x1;
137 static bool sam_print_xm;
138 static bool sam_print_xo;
139 static bool sam_print_xg;
140 static bool sam_print_nm;
141 static bool sam_print_md;
142 static bool sam_print_yf;
143 static bool sam_print_yi;
144 static bool sam_print_ym;
145 static bool sam_print_yp;
146 static bool sam_print_yt;
147 static bool sam_print_ys;
148 static bool sam_print_zs;
149 static bool sam_print_xr;
150 static bool sam_print_xt;
151 static bool sam_print_xd;
152 static bool sam_print_xu;
153 static bool sam_print_yl;
154 static bool sam_print_ye;
155 static bool sam_print_yu;
156 static bool sam_print_xp;
157 static bool sam_print_yr;
158 static bool sam_print_zb;
159 static bool sam_print_zr;
160 static bool sam_print_zf;
161 static bool sam_print_zm;
162 static bool sam_print_zi;
163 static bool sam_print_zp;
164 static bool sam_print_zu;
165 static bool sam_print_xs_a;
166 static bool bwaSwLike;
167 static float bwaSwLikeC;
168 static float bwaSwLikeT;
169 static bool qcFilter;
170 static bool sortByScore; // prioritize alignments to report by score?
171 bool gReportOverhangs; // false -> filter out alignments that fall off the end of a reference sequence
172 static string rgid; // ID: setting for @RG header line
173 static string rgs; // SAM outputs for @RG header line
174 static string rgs_optflag; // SAM optional flag to add corresponding to @RG ID
175 static bool msample; // whether to report a random alignment when maxed-out via -m/-M
176 int gGapBarrier; // # diags on top/bot only to be entered diagonally
177 static EList<string> qualities;
178 static EList<string> qualities1;
179 static EList<string> qualities2;
180 static string polstr; // temporary holder for policy string
181 static bool msNoCache; // true -> disable local cache
182 static int bonusMatchType; // how to reward matches
183 static int bonusMatch; // constant reward if bonusMatchType=constant
184 static int penMmcType; // how to penalize mismatches
185 static int penMmcMax; // max mm penalty
186 static int penMmcMin; // min mm penalty
187 static int penNType; // how to penalize Ns in the read
188 static int penN; // constant if N pelanty is a constant
189 static bool penNCatPair; // concatenate mates before N filtering?
190 static bool localAlign; // do local alignment in DP steps
191 static bool noisyHpolymer; // set to true if gap penalties should be reduced to be consistent with a sequencer that under- and overcalls homopolymers
192 static int penRdGapConst; // constant cost of extending a gap in the read
193 static int penRfGapConst; // constant cost of extending a gap in the reference
194 static int penRdGapLinear; // coeff of linear term for cost of gap extension in read
195 static int penRfGapLinear; // coeff of linear term for cost of gap extension in ref
196 static SimpleFunc scoreMin; // minimum valid score as function of read len
197 static SimpleFunc nCeil; // max # Ns allowed as function of read len
198 static SimpleFunc msIval; // interval between seeds as function of read len
199 static double descConsExp; // how to adjust score minimum as we descent further into index-assisted alignment
200 static size_t descentLanding; // don't place a search root if it's within this many positions of end
201 static SimpleFunc descentTotSz; // maximum space a DescentDriver can use in bytes
202 static SimpleFunc descentTotFmops; // maximum # FM ops a DescentDriver can perform
203 static int multiseedMms; // mismatches permitted in a multiseed seed
204 static int multiseedLen; // length of multiseed seeds
205 static size_t multiseedOff; // offset to begin extracting seeds
206 static uint32_t seedCacheLocalMB; // # MB to use for non-shared seed alignment cacheing
207 static uint32_t seedCacheCurrentMB; // # MB to use for current-read seed hit cacheing
208 static uint32_t exactCacheCurrentMB; // # MB to use for current-read seed hit cacheing
209 static size_t maxhalf; // max width on one side of DP table
210 static bool seedSumm; // print summary information about seed hits, not alignments
211 static bool doUngapped; // do ungapped alignment
212 static size_t maxIters; // stop after this many extend loop iterations
213 static size_t maxUg; // stop after this many ungap extends
214 static size_t maxDp; // stop after this many DPs
215 static size_t maxItersIncr; // amt to add to maxIters for each -k > 1
216 static size_t maxEeStreak; // stop after this many end-to-end fails in a row
217 static size_t maxUgStreak; // stop after this many ungap fails in a row
218 static size_t maxDpStreak; // stop after this many dp fails in a row
219 static size_t maxStreakIncr; // amt to add to streak for each -k > 1
220 static size_t maxMateStreak; // stop seed range after this many mate-find fails
221 static bool doExtend; // extend seed hits
222 static bool enable8; // use 8-bit SSE where possible?
223 static size_t cminlen; // longer reads use checkpointing
224 static size_t cpow2; // checkpoint interval log2
225 static bool doTri; // do triangular mini-fills?
226 static string defaultPreset; // default preset; applied immediately
227 static bool ignoreQuals; // all mms incur same penalty, regardless of qual
228 static string wrapper; // type of wrapper script, so we can print correct usage
229 static EList<string> queries; // list of query files
230 static string outfile; // write SAM output to this file
231 static int mapqv; // MAPQ calculation version
232 static int tighten; // -M tighten mode (0=none, 1=best, 2=secbest+1)
233 static bool doExactUpFront; // do exact search up front if seeds seem good enough
234 static bool do1mmUpFront; // do 1mm search up front if seeds seem good enough
235 static size_t do1mmMinLen; // length below which we disable 1mm e2e search
236 static int seedBoostThresh; // if average non-zero position has more than this many elements
237 static size_t nSeedRounds; // # seed rounds
238 static bool reorder; // true -> reorder SAM recs in -p mode
239 static float sampleFrac; // only align random fraction of input reads
240 static bool arbitraryRandom; // pseudo-randoms no longer a function of read properties
241 static bool bowtie2p5;
242 static bool useTempSpliceSite;
243 static int penCanSplice;
244 static int penNoncanSplice;
245 static SimpleFunc penIntronLen;
246 static string knownSpliceSiteInfile; //
247 static string novelSpliceSiteInfile; //
248 static string novelSpliceSiteOutfile; //
249 static bool no_spliced_alignment;
250 static int rna_strandness; //
251
252 static string bt2index; // read Bowtie 2 index from files with this prefix
253 static EList<pair<int, string> > extra_opts;
254 static size_t extra_opts_cur;
255
256 static EList<uint64_t> thread_rids;
257 static MUTEX_T thread_rids_mutex;
258 static uint64_t thread_rids_mindist;
259
260 #define DMAX std::numeric_limits<double>::max()
261
resetOptions()262 static void resetOptions() {
263 mates1.clear();
264 mates2.clear();
265 mates12.clear();
266 adjIdxBase = "";
267 gColor = false;
268 gVerbose = 0;
269 startVerbose = 0;
270 gQuiet = false;
271 sanityCheck = 0; // enable expensive sanity checks
272 format = FASTQ; // default read format is FASTQ
273 origString = ""; // reference text, or filename(s)
274 seed = 0; // srandom() seed
275 timing = 0; // whether to report basic timing data
276 metricsIval = 1; // interval between alignment metrics messages (0 = no messages)
277 metricsFile = ""; // output file to put alignment metrics in
278 metricsStderr = false; // print metrics to stderr (in addition to --metrics-file if it's specified
279 metricsPerRead = false; // report a metrics tuple for every read?
280 allHits = false; // for multihits, report just one
281 showVersion = false; // just print version and quit?
282 ipause = 0; // pause before maching?
283 qUpto = 0xffffffff; // max # of queries to read
284 gTrim5 = 0; // amount to trim from 5' end
285 gTrim3 = 0; // amount to trim from 3' end
286 offRate = -1; // keep default offRate
287 solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
288 phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33)
289 integerQuals = false; // quality strings are space-separated strings of integers, not ASCII
290 nthreads = 1; // number of pthreads operating concurrently
291 outType = OUTPUT_SAM; // style of output
292 noRefNames = false; // true -> print reference indexes; not names
293 khits = 5; // number of hits per read; >1 is much slower
294 mhits = 0; // stop after finding this many alignments+1
295 partitionSz = 0; // output a partitioning key in first field
296 useSpinlock = true; // false -> don't use of spinlocks even if they're #defines
297 fileParallel = false; // separate threads read separate input files in parallel
298 useShmem = false; // use shared memory to hold the index
299 useMm = false; // use memory-mapped files to hold the index
300 mmSweep = false; // sweep through memory-mapped files immediately after mapping
301 gMinInsert = 0; // minimum insert size
302 gMaxInsert = 500; // maximum insert size
303 gMate1fw = true; // -1 mate aligns in fw orientation on fw strand
304 gMate2fw = false; // -2 mate aligns in rc orientation on fw strand
305 gFlippedMatesOK = false; // allow mates to be in wrong order
306 gDovetailMatesOK = false; // allow one mate to extend off the end of the other
307 gContainMatesOK = true; // allow one mate to contain the other in PE alignment
308 gOlapMatesOK = true; // allow mates to overlap in PE alignment
309 gExpandToFrag = true; // incr max frag length to =larger mate len if necessary
310 gReportDiscordant = true; // find and report discordant paired-end alignments
311 gReportMixed = true; // find and report unpaired alignments for paired reads
312
313 cacheLimit = 5; // ranges w/ size > limit will be cached
314 cacheSize = 0; // # words per range cache
315 skipReads = 0; // # reads/read pairs to skip
316 gNofw = false; // don't align fw orientation of read
317 gNorc = false; // don't align rc orientation of read
318 fastaContLen = 0;
319 fastaContFreq = 0;
320 hadoopOut = false; // print Hadoop status and summary messages
321 fuzzy = false; // reads will have alternate basecalls w/ qualities
322 fullRef = false; // print entire reference name instead of just up to 1st space
323 samTruncQname = true; // whether to truncate QNAME to 255 chars
324 samOmitSecSeqQual = false; // omit SEQ/QUAL for 2ndary alignments?
325 samNoUnal = false; // omit SAM records for unaligned reads
326 samNoHead = false; // don't print any header lines in SAM output
327 samNoSQ = false; // don't print @SQ header lines
328 sam_print_as = true;
329 sam_print_xs = true;
330 sam_print_xss = false; // Xs:i and Ys:i
331 sam_print_yn = false; // YN:i and Yn:i
332 sam_print_xn = true;
333 sam_print_cs = false;
334 sam_print_cq = false;
335 sam_print_x0 = true;
336 sam_print_x1 = true;
337 sam_print_xm = true;
338 sam_print_xo = true;
339 sam_print_xg = true;
340 sam_print_nm = true;
341 sam_print_md = true;
342 sam_print_yf = true;
343 sam_print_yi = false;
344 sam_print_ym = false;
345 sam_print_yp = false;
346 sam_print_yt = true;
347 sam_print_ys = true;
348 sam_print_zs = false;
349 sam_print_xr = false;
350 sam_print_xt = false;
351 sam_print_xd = false;
352 sam_print_xu = false;
353 sam_print_yl = false;
354 sam_print_ye = false;
355 sam_print_yu = false;
356 sam_print_xp = false;
357 sam_print_yr = false;
358 sam_print_zb = false;
359 sam_print_zr = false;
360 sam_print_zf = false;
361 sam_print_zm = false;
362 sam_print_zi = false;
363 sam_print_zp = false;
364 sam_print_zu = false;
365 sam_print_xs_a = true;
366 bwaSwLike = false;
367 bwaSwLikeC = 5.5f;
368 bwaSwLikeT = 20.0f;
369 qcFilter = false; // don't believe upstream qc by default
370 sortByScore = true; // prioritize alignments to report by score?
371 rgid = ""; // SAM outputs for @RG header line
372 rgs = ""; // SAM outputs for @RG header line
373 rgs_optflag = ""; // SAM optional flag to add corresponding to @RG ID
374 msample = true;
375 gGapBarrier = 4; // disallow gaps within this many chars of either end of alignment
376 qualities.clear();
377 qualities1.clear();
378 qualities2.clear();
379 polstr.clear();
380 msNoCache = true; // true -> disable local cache
381 bonusMatchType = DEFAULT_MATCH_BONUS_TYPE;
382 bonusMatch = DEFAULT_MATCH_BONUS;
383 penMmcType = DEFAULT_MM_PENALTY_TYPE;
384 penMmcMax = DEFAULT_MM_PENALTY_MAX;
385 penMmcMin = DEFAULT_MM_PENALTY_MIN;
386 penNType = DEFAULT_N_PENALTY_TYPE;
387 penN = DEFAULT_N_PENALTY;
388 penNCatPair = DEFAULT_N_CAT_PAIR; // concatenate mates before N filtering?
389 localAlign = false; // do local alignment in DP steps
390 noisyHpolymer = false;
391 penRdGapConst = DEFAULT_READ_GAP_CONST;
392 penRfGapConst = DEFAULT_REF_GAP_CONST;
393 penRdGapLinear = DEFAULT_READ_GAP_LINEAR;
394 penRfGapLinear = DEFAULT_REF_GAP_LINEAR;
395 // scoreMin.init (SIMPLE_FUNC_LINEAR, DEFAULT_MIN_CONST, DEFAULT_MIN_LINEAR);
396 scoreMin.init (SIMPLE_FUNC_CONST, -18, 0);
397 nCeil.init (SIMPLE_FUNC_LINEAR, 0.0f, DMAX, 2.0f, 0.1f);
398 msIval.init (SIMPLE_FUNC_LINEAR, 1.0f, DMAX, DEFAULT_IVAL_B, DEFAULT_IVAL_A);
399 descConsExp = 2.0;
400 descentLanding = 20;
401 descentTotSz.init(SIMPLE_FUNC_LINEAR, 1024.0, DMAX, 0.0, 1024.0);
402 descentTotFmops.init(SIMPLE_FUNC_LINEAR, 100.0, DMAX, 0.0, 10.0);
403 multiseedMms = DEFAULT_SEEDMMS;
404 multiseedLen = DEFAULT_SEEDLEN;
405 multiseedOff = 0;
406 seedCacheLocalMB = 32; // # MB to use for non-shared seed alignment cacheing
407 seedCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
408 exactCacheCurrentMB = 20; // # MB to use for current-read seed hit cacheing
409 maxhalf = 15; // max width on one side of DP table
410 seedSumm = false; // print summary information about seed hits, not alignments
411 doUngapped = true; // do ungapped alignment
412 maxIters = 400; // max iterations of extend loop
413 maxUg = 300; // stop after this many ungap extends
414 maxDp = 300; // stop after this many dp extends
415 maxItersIncr = 20; // amt to add to maxIters for each -k > 1
416 maxEeStreak = 15; // stop after this many end-to-end fails in a row
417 maxUgStreak = 15; // stop after this many ungap fails in a row
418 maxDpStreak = 15; // stop after this many dp fails in a row
419 maxStreakIncr = 10; // amt to add to streak for each -k > 1
420 maxMateStreak = 10; // in PE: abort seed range after N mate-find fails
421 doExtend = true; // do seed extensions
422 enable8 = true; // use 8-bit SSE where possible?
423 cminlen = 2000; // longer reads use checkpointing
424 cpow2 = 4; // checkpoint interval log2
425 doTri = false; // do triangular mini-fills?
426 defaultPreset = "sensitive%LOCAL%"; // default preset; applied immediately
427 extra_opts.clear();
428 extra_opts_cur = 0;
429 bt2index.clear(); // read Bowtie 2 index from files with this prefix
430 ignoreQuals = false; // all mms incur same penalty, regardless of qual
431 wrapper.clear(); // type of wrapper script, so we can print correct usage
432 queries.clear(); // list of query files
433 outfile.clear(); // write SAM output to this file
434 mapqv = 2; // MAPQ calculation version
435 tighten = 3; // -M tightening mode
436 doExactUpFront = true; // do exact search up front if seeds seem good enough
437 do1mmUpFront = true; // do 1mm search up front if seeds seem good enough
438 seedBoostThresh = 300; // if average non-zero position has more than this many elements
439 nSeedRounds = 2; // # rounds of seed searches to do for repetitive reads
440 do1mmMinLen = 60; // length below which we disable 1mm search
441 reorder = false; // reorder SAM records with -p > 1
442 sampleFrac = 1.1f; // align all reads
443 arbitraryRandom = false; // let pseudo-random seeds be a function of read properties
444 bowtie2p5 = false;
445 useTempSpliceSite = true;
446 penCanSplice = 0;
447 penNoncanSplice = 3;
448 penIntronLen.init(SIMPLE_FUNC_LOG, -8, 1);
449 knownSpliceSiteInfile = "";
450 novelSpliceSiteInfile = "";
451 novelSpliceSiteOutfile = "";
452 no_spliced_alignment = false;
453 rna_strandness = RNA_STRANDNESS_UNKNOWN;
454 }
455
456 static const char *short_options = "fF:qbzhcu:rv:s:aP:t3:5:w:p:k:M:1:2:I:X:CQ:N:i:L:U:x:S:g:O:D:R:";
457
458 static struct option long_options[] = {
459 {(char*)"verbose", no_argument, 0, ARG_VERBOSE},
460 {(char*)"startverbose", no_argument, 0, ARG_STARTVERBOSE},
461 {(char*)"quiet", no_argument, 0, ARG_QUIET},
462 {(char*)"sanity", no_argument, 0, ARG_SANITY},
463 {(char*)"pause", no_argument, &ipause, 1},
464 {(char*)"orig", required_argument, 0, ARG_ORIG},
465 {(char*)"all", no_argument, 0, 'a'},
466 {(char*)"solexa-quals", no_argument, 0, ARG_SOLEXA_QUALS},
467 {(char*)"integer-quals",no_argument, 0, ARG_INTEGER_QUALS},
468 {(char*)"int-quals", no_argument, 0, ARG_INTEGER_QUALS},
469 {(char*)"metrics", required_argument, 0, ARG_METRIC_IVAL},
470 {(char*)"metrics-file", required_argument, 0, ARG_METRIC_FILE},
471 {(char*)"metrics-stderr",no_argument, 0, ARG_METRIC_STDERR},
472 {(char*)"metrics-per-read", no_argument, 0, ARG_METRIC_PER_READ},
473 {(char*)"met-read", no_argument, 0, ARG_METRIC_PER_READ},
474 {(char*)"met", required_argument, 0, ARG_METRIC_IVAL},
475 {(char*)"met-file", required_argument, 0, ARG_METRIC_FILE},
476 {(char*)"met-stderr", no_argument, 0, ARG_METRIC_STDERR},
477 {(char*)"time", no_argument, 0, 't'},
478 {(char*)"trim3", required_argument, 0, '3'},
479 {(char*)"trim5", required_argument, 0, '5'},
480 {(char*)"seed", required_argument, 0, ARG_SEED},
481 {(char*)"qupto", required_argument, 0, 'u'},
482 {(char*)"upto", required_argument, 0, 'u'},
483 {(char*)"version", no_argument, 0, ARG_VERSION},
484 {(char*)"filepar", no_argument, 0, ARG_FILEPAR},
485 {(char*)"help", no_argument, 0, 'h'},
486 {(char*)"threads", required_argument, 0, 'p'},
487 {(char*)"khits", required_argument, 0, 'k'},
488 {(char*)"minins", required_argument, 0, 'I'},
489 {(char*)"maxins", required_argument, 0, 'X'},
490 {(char*)"quals", required_argument, 0, 'Q'},
491 {(char*)"Q1", required_argument, 0, ARG_QUALS1},
492 {(char*)"Q2", required_argument, 0, ARG_QUALS2},
493 {(char*)"refidx", no_argument, 0, ARG_REFIDX},
494 {(char*)"partition", required_argument, 0, ARG_PARTITION},
495 {(char*)"ff", no_argument, 0, ARG_FF},
496 {(char*)"fr", no_argument, 0, ARG_FR},
497 {(char*)"rf", no_argument, 0, ARG_RF},
498 {(char*)"cachelim", required_argument, 0, ARG_CACHE_LIM},
499 {(char*)"cachesz", required_argument, 0, ARG_CACHE_SZ},
500 {(char*)"nofw", no_argument, 0, ARG_NO_FW},
501 {(char*)"norc", no_argument, 0, ARG_NO_RC},
502 {(char*)"skip", required_argument, 0, 's'},
503 {(char*)"12", required_argument, 0, ARG_ONETWO},
504 {(char*)"tab5", required_argument, 0, ARG_TAB5},
505 {(char*)"tab6", required_argument, 0, ARG_TAB6},
506 {(char*)"phred33-quals", no_argument, 0, ARG_PHRED33},
507 {(char*)"phred64-quals", no_argument, 0, ARG_PHRED64},
508 {(char*)"phred33", no_argument, 0, ARG_PHRED33},
509 {(char*)"phred64", no_argument, 0, ARG_PHRED64},
510 {(char*)"solexa1.3-quals", no_argument, 0, ARG_PHRED64},
511 {(char*)"mm", no_argument, 0, ARG_MM},
512 {(char*)"shmem", no_argument, 0, ARG_SHMEM},
513 {(char*)"mmsweep", no_argument, 0, ARG_MMSWEEP},
514 {(char*)"hadoopout", no_argument, 0, ARG_HADOOPOUT},
515 {(char*)"fuzzy", no_argument, 0, ARG_FUZZY},
516 {(char*)"fullref", no_argument, 0, ARG_FULLREF},
517 {(char*)"usage", no_argument, 0, ARG_USAGE},
518 {(char*)"sam-no-qname-trunc", no_argument, 0, ARG_SAM_NO_QNAME_TRUNC},
519 {(char*)"sam-omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ},
520 {(char*)"omit-sec-seq", no_argument, 0, ARG_SAM_OMIT_SEC_SEQ},
521 {(char*)"sam-no-head", no_argument, 0, ARG_SAM_NOHEAD},
522 {(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD},
523 {(char*)"sam-noHD", no_argument, 0, ARG_SAM_NOHEAD},
524 {(char*)"sam-no-hd", no_argument, 0, ARG_SAM_NOHEAD},
525 {(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ},
526 {(char*)"sam-no-sq", no_argument, 0, ARG_SAM_NOSQ},
527 {(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ},
528 {(char*)"no-head", no_argument, 0, ARG_SAM_NOHEAD},
529 {(char*)"no-hd", no_argument, 0, ARG_SAM_NOHEAD},
530 {(char*)"no-sq", no_argument, 0, ARG_SAM_NOSQ},
531 {(char*)"no-HD", no_argument, 0, ARG_SAM_NOHEAD},
532 {(char*)"no-SQ", no_argument, 0, ARG_SAM_NOSQ},
533 {(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL},
534 {(char*)"color", no_argument, 0, 'C'},
535 {(char*)"sam-RG", required_argument, 0, ARG_SAM_RG},
536 {(char*)"sam-rg", required_argument, 0, ARG_SAM_RG},
537 {(char*)"sam-rg-id", required_argument, 0, ARG_SAM_RGID},
538 {(char*)"RG", required_argument, 0, ARG_SAM_RG},
539 {(char*)"rg", required_argument, 0, ARG_SAM_RG},
540 {(char*)"rg-id", required_argument, 0, ARG_SAM_RGID},
541 {(char*)"snpphred", required_argument, 0, ARG_SNPPHRED},
542 {(char*)"snpfrac", required_argument, 0, ARG_SNPFRAC},
543 {(char*)"gbar", required_argument, 0, ARG_GAP_BAR},
544 {(char*)"qseq", no_argument, 0, ARG_QSEQ},
545 {(char*)"policy", required_argument, 0, ARG_ALIGN_POLICY},
546 {(char*)"preset", required_argument, 0, 'P'},
547 {(char*)"seed-summ", no_argument, 0, ARG_SEED_SUMM},
548 {(char*)"seed-summary", no_argument, 0, ARG_SEED_SUMM},
549 {(char*)"overhang", no_argument, 0, ARG_OVERHANG},
550 {(char*)"no-cache", no_argument, 0, ARG_NO_CACHE},
551 {(char*)"cache", no_argument, 0, ARG_USE_CACHE},
552 {(char*)"454", no_argument, 0, ARG_NOISY_HPOLY},
553 {(char*)"ion-torrent", no_argument, 0, ARG_NOISY_HPOLY},
554 {(char*)"no-mixed", no_argument, 0, ARG_NO_MIXED},
555 {(char*)"no-discordant",no_argument, 0, ARG_NO_DISCORDANT},
556 {(char*)"local", no_argument, 0, ARG_LOCAL},
557 {(char*)"end-to-end", no_argument, 0, ARG_END_TO_END},
558 {(char*)"ungapped", no_argument, 0, ARG_UNGAPPED},
559 {(char*)"no-ungapped", no_argument, 0, ARG_UNGAPPED_NO},
560 {(char*)"sse8", no_argument, 0, ARG_SSE8},
561 {(char*)"no-sse8", no_argument, 0, ARG_SSE8_NO},
562 {(char*)"scan-narrowed",no_argument, 0, ARG_SCAN_NARROWED},
563 {(char*)"qc-filter", no_argument, 0, ARG_QC_FILTER},
564 {(char*)"bwa-sw-like", no_argument, 0, ARG_BWA_SW_LIKE},
565 {(char*)"multiseed", required_argument, 0, ARG_MULTISEED_IVAL},
566 {(char*)"ma", required_argument, 0, ARG_SCORE_MA},
567 {(char*)"mp", required_argument, 0, ARG_SCORE_MMP},
568 {(char*)"np", required_argument, 0, ARG_SCORE_NP},
569 {(char*)"rdg", required_argument, 0, ARG_SCORE_RDG},
570 {(char*)"rfg", required_argument, 0, ARG_SCORE_RFG},
571 {(char*)"score-min", required_argument, 0, ARG_SCORE_MIN},
572 {(char*)"min-score", required_argument, 0, ARG_SCORE_MIN},
573 {(char*)"n-ceil", required_argument, 0, ARG_N_CEIL},
574 {(char*)"dpad", required_argument, 0, ARG_DPAD},
575 {(char*)"mapq-print-inputs",no_argument, 0, ARG_SAM_PRINT_YI},
576 {(char*)"very-fast", no_argument, 0, ARG_PRESET_VERY_FAST},
577 {(char*)"fast", no_argument, 0, ARG_PRESET_FAST},
578 {(char*)"sensitive", no_argument, 0, ARG_PRESET_SENSITIVE},
579 {(char*)"very-sensitive", no_argument, 0, ARG_PRESET_VERY_SENSITIVE},
580 {(char*)"very-fast-local", no_argument, 0, ARG_PRESET_VERY_FAST_LOCAL},
581 {(char*)"fast-local", no_argument, 0, ARG_PRESET_FAST_LOCAL},
582 {(char*)"sensitive-local", no_argument, 0, ARG_PRESET_SENSITIVE_LOCAL},
583 {(char*)"very-sensitive-local", no_argument, 0, ARG_PRESET_VERY_SENSITIVE_LOCAL},
584 {(char*)"no-score-priority",no_argument, 0, ARG_NO_SCORE_PRIORITY},
585 {(char*)"seedlen", required_argument, 0, 'L'},
586 {(char*)"seedmms", required_argument, 0, 'N'},
587 {(char*)"seedival", required_argument, 0, 'i'},
588 {(char*)"ignore-quals", no_argument, 0, ARG_IGNORE_QUALS},
589 {(char*)"index", required_argument, 0, 'x'},
590 {(char*)"arg-desc", no_argument, 0, ARG_DESC},
591 {(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
592 {(char*)"unpaired", required_argument, 0, 'U'},
593 {(char*)"output", required_argument, 0, 'S'},
594 {(char*)"mapq-v", required_argument, 0, ARG_MAPQ_V},
595 {(char*)"dovetail", no_argument, 0, ARG_DOVETAIL},
596 {(char*)"no-dovetail", no_argument, 0, ARG_NO_DOVETAIL},
597 {(char*)"contain", no_argument, 0, ARG_CONTAIN},
598 {(char*)"no-contain", no_argument, 0, ARG_NO_CONTAIN},
599 {(char*)"overlap", no_argument, 0, ARG_OVERLAP},
600 {(char*)"no-overlap", no_argument, 0, ARG_NO_OVERLAP},
601 {(char*)"tighten", required_argument, 0, ARG_TIGHTEN},
602 {(char*)"exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT},
603 {(char*)"1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT},
604 {(char*)"no-exact-upfront", no_argument, 0, ARG_EXACT_UPFRONT_NO},
605 {(char*)"no-1mm-upfront", no_argument, 0, ARG_1MM_UPFRONT_NO},
606 {(char*)"1mm-minlen", required_argument, 0, ARG_1MM_MINLEN},
607 {(char*)"seed-off", required_argument, 0, 'O'},
608 {(char*)"seed-boost", required_argument, 0, ARG_SEED_BOOST_THRESH},
609 {(char*)"read-times", no_argument, 0, ARG_READ_TIMES},
610 {(char*)"show-rand-seed", no_argument, 0, ARG_SHOW_RAND_SEED},
611 {(char*)"dp-fail-streak", required_argument, 0, ARG_DP_FAIL_STREAK_THRESH},
612 {(char*)"ee-fail-streak", required_argument, 0, ARG_EE_FAIL_STREAK_THRESH},
613 {(char*)"ug-fail-streak", required_argument, 0, ARG_UG_FAIL_STREAK_THRESH},
614 {(char*)"fail-streak", required_argument, 0, 'D'},
615 {(char*)"dp-fails", required_argument, 0, ARG_DP_FAIL_THRESH},
616 {(char*)"ug-fails", required_argument, 0, ARG_UG_FAIL_THRESH},
617 {(char*)"extends", required_argument, 0, ARG_EXTEND_ITERS},
618 {(char*)"no-extend", no_argument, 0, ARG_NO_EXTEND},
619 {(char*)"mapq-extra", no_argument, 0, ARG_MAPQ_EX},
620 {(char*)"seed-rounds", required_argument, 0, 'R'},
621 {(char*)"reorder", no_argument, 0, ARG_REORDER},
622 {(char*)"passthrough", no_argument, 0, ARG_READ_PASSTHRU},
623 {(char*)"sample", required_argument, 0, ARG_SAMPLE},
624 {(char*)"cp-min", required_argument, 0, ARG_CP_MIN},
625 {(char*)"cp-ival", required_argument, 0, ARG_CP_IVAL},
626 {(char*)"tri", no_argument, 0, ARG_TRI},
627 {(char*)"nondeterministic", no_argument, 0, ARG_NON_DETERMINISTIC},
628 {(char*)"non-deterministic", no_argument, 0, ARG_NON_DETERMINISTIC},
629 {(char*)"local-seed-cache-sz", required_argument, 0, ARG_LOCAL_SEED_CACHE_SZ},
630 {(char*)"seed-cache-sz", required_argument, 0, ARG_CURRENT_SEED_CACHE_SZ},
631 {(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL},
632 {(char*)"test-25", no_argument, 0, ARG_TEST_25},
633 // TODO: following should be a function of read length?
634 {(char*)"desc-kb", required_argument, 0, ARG_DESC_KB},
635 {(char*)"desc-landing", required_argument, 0, ARG_DESC_LANDING},
636 {(char*)"desc-exp", required_argument, 0, ARG_DESC_EXP},
637 {(char*)"desc-fmops", required_argument, 0, ARG_DESC_FMOPS},
638 {(char*)"no-temp-splicesite", no_argument, 0, ARG_NO_TEMPSPLICESITE},
639 {(char*)"pen-cansplice", required_argument, 0, ARG_PEN_CANSPLICE},
640 {(char*)"pen-noncansplice", required_argument, 0, ARG_PEN_NONCANSPLICE},
641 {(char*)"pen-intronlen", required_argument, 0, ARG_PEN_INTRONLEN},
642 {(char*)"known-splicesite-infile", required_argument, 0, ARG_KNOWN_SPLICESITE_INFILE},
643 {(char*)"novel-splicesite-infile", required_argument, 0, ARG_NOVEL_SPLICESITE_INFILE},
644 {(char*)"novel-splicesite-outfile", required_argument, 0, ARG_NOVEL_SPLICESITE_OUTFILE},
645 {(char*)"no-spliced-alignment", no_argument, 0, ARG_NO_SPLICED_ALIGNMENT},
646 {(char*)"rna-strandness", required_argument, 0, ARG_RNA_STRANDNESS},
647 {(char*)0, 0, 0, 0} // terminator
648 };
649
650 /**
651 * Print out a concise description of what options are taken and whether they
652 * take an argument.
653 */
printArgDesc(ostream & out)654 static void printArgDesc(ostream& out) {
655 // struct option {
656 // const char *name;
657 // int has_arg;
658 // int *flag;
659 // int val;
660 // };
661 size_t i = 0;
662 while(long_options[i].name != 0) {
663 out << long_options[i].name << "\t"
664 << (long_options[i].has_arg == no_argument ? 0 : 1)
665 << endl;
666 i++;
667 }
668 size_t solen = strlen(short_options);
669 for(i = 0; i < solen; i++) {
670 // Has an option? Does if next char is :
671 if(i == solen-1) {
672 assert_neq(':', short_options[i]);
673 cout << (char)short_options[i] << "\t" << 0 << endl;
674 } else {
675 if(short_options[i+1] == ':') {
676 // Option with argument
677 cout << (char)short_options[i] << "\t" << 1 << endl;
678 i++; // skip the ':'
679 } else {
680 // Option with no argument
681 cout << (char)short_options[i] << "\t" << 0 << endl;
682 }
683 }
684 }
685 }
686
687 /**
688 * Print a summary usage message to the provided output stream.
689 */
printUsage(ostream & out)690 static void printUsage(ostream& out) {
691 out << "HISAT version " << string(HISAT_VERSION).c_str() << " by Daehwan Kim (infphilo@gmail.com, www.ccb.jhu.edu/people/infphilo)" << endl;
692 string tool_name = "hisat2-align";
693 if(wrapper == "basic-0") {
694 tool_name = "hisat";
695 }
696 out << "Usage: " << endl
697 << " " << tool_name.c_str() << " [options]* -x <ht2-idx> {-1 <m1> -2 <m2> | -U <r>} [-S <sam>]" << endl
698 << endl
699 << " <ht2-idx> Index filename prefix (minus trailing .X." << gfm_ext << ")." << endl
700 << " <m1> Files with #1 mates, paired with files in <m2>." << endl;
701 if(wrapper == "basic-0") {
702 out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
703 }
704 out << " <m2> Files with #2 mates, paired with files in <m1>." << endl;
705 if(wrapper == "basic-0") {
706 out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
707 }
708 out << " <r> Files with unpaired reads." << endl;
709 if(wrapper == "basic-0") {
710 out << " Could be gzip'ed (extension: .gz) or bzip2'ed (extension: .bz2)." << endl;
711 }
712 out << " <sam> File for SAM output (default: stdout)" << endl
713 << endl
714 << " <m1>, <m2>, <r> can be comma-separated lists (no whitespace) and can be" << endl
715 << " specified many times. E.g. '-U file1.fq,file2.fq -U file3.fq'." << endl
716 // Wrapper script should write <bam> line next
717 << endl
718 << "Options (defaults in parentheses):" << endl
719 << endl
720 << " Input:" << endl
721 << " -q query input files are FASTQ .fq/.fastq (default)" << endl
722 << " --qseq query input files are in Illumina's qseq format" << endl
723 << " -f query input files are (multi-)FASTA .fa/.mfa" << endl
724 << " -r query input files are raw one-sequence-per-line" << endl
725 << " -c <m1>, <m2>, <r> are sequences themselves, not files" << endl
726 << " -s/--skip <int> skip the first <int> reads/pairs in the input (none)" << endl
727 << " -u/--upto <int> stop after first <int> reads/pairs (no limit)" << endl
728 << " -5/--trim5 <int> trim <int> bases from 5'/left end of reads (0)" << endl
729 << " -3/--trim3 <int> trim <int> bases from 3'/right end of reads (0)" << endl
730 << " --phred33 qualities are Phred+33 (default)" << endl
731 << " --phred64 qualities are Phred+64" << endl
732 << " --int-quals qualities encoded as space-delimited integers" << endl
733 << endl
734 << " Presets: Same as:" << endl
735 << " For --end-to-end:" << endl
736 << " --very-fast -D 5 -R 1 -N 0 -L 22 -i S,0,2.50" << endl
737 << " --fast -D 10 -R 2 -N 0 -L 22 -i S,0,2.50" << endl
738 << " --sensitive -D 15 -R 2 -N 0 -L 22 -i S,1,1.15 (default)" << endl
739 << " --very-sensitive -D 20 -R 3 -N 0 -L 20 -i S,1,0.50" << endl
740 << endl
741 << " For --local:" << endl
742 << " --very-fast-local -D 5 -R 1 -N 0 -L 25 -i S,1,2.00" << endl
743 << " --fast-local -D 10 -R 2 -N 0 -L 22 -i S,1,1.75" << endl
744 << " --sensitive-local -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default)" << endl
745 << " --very-sensitive-local -D 20 -R 3 -N 0 -L 20 -i S,1,0.50" << endl
746 << endl
747 << " Alignment:" << endl
748 << " -N <int> max # mismatches in seed alignment; can be 0 or 1 (0)" << endl
749 << " -L <int> length of seed substrings; must be >3, <32 (22)" << endl
750 << " -i <func> interval between seed substrings w/r/t read len (S,1,1.15)" << endl
751 << " --n-ceil <func> func for max # non-A/C/G/Ts permitted in aln (L,0,0.15)" << endl
752 << " --dpad <int> include <int> extra ref chars on sides of DP table (15)" << endl
753 << " --gbar <int> disallow gaps within <int> nucs of read extremes (4)" << endl
754 << " --ignore-quals treat all quality values as 30 on Phred scale (off)" << endl
755 << " --nofw do not align forward (original) version of read (off)" << endl
756 << " --norc do not align reverse-complement version of read (off)" << endl
757 << endl
758 << " Spliced Alignment:" << endl
759 << " --pen-cansplice <int> penalty for a canonical splice site (0)" << endl
760 << " --pen-noncansplice <int> penalty for a non-canonical splice site (3)" << endl
761 << " --pen-intronlen <func> penalty for long introns (G,-8,1)" << endl
762 << " --known-splicesite-infile <path> provide a list of known splice sites" << endl
763 << " --novel-splicesite-outfile <path> report a list of splice sites" << endl
764 << " --novel-splicesite-infile <path> provide a list of novel splice sites" << endl
765 << " --no-temp-splicesite disable the use of splice sites found" << endl
766 << " --no-spliced-alignment disable spliced alignment" << endl
767 << " --rna-strandness <string> Specify strand-specific information (unstranded)" << endl
768 << endl
769 << " Scoring:" << endl
770 << " --ma <int> match bonus (0 for --end-to-end, 2 for --local) " << endl
771 << " --mp <int> max penalty for mismatch; lower qual = lower penalty (6)" << endl
772 << " --np <int> penalty for non-A/C/G/Ts in read/ref (1)" << endl
773 << " --rdg <int>,<int> read gap open, extend penalties (5,3)" << endl
774 << " --rfg <int>,<int> reference gap open, extend penalties (5,3)" << endl
775 << " --score-min <func> min acceptable alignment score w/r/t read length" << endl
776 << " (G,20,8 for local, L,-0.6,-0.6 for end-to-end)" << endl
777 << endl
778 << " Reporting:" << endl
779 << " (default) look for multiple alignments, report best, with MAPQ" << endl
780 << " OR" << endl
781 << " -k <int> report up to <int> alns per read; MAPQ not meaningful" << endl
782 << " OR" << endl
783 << " -a/--all report all alignments; very slow, MAPQ not meaningful" << endl
784 << endl
785 << " Effort:" << endl
786 << " -D <int> give up extending after <int> failed extends in a row (15)" << endl
787 << " -R <int> for reads w/ repetitive seeds, try <int> sets of seeds (2)" << endl
788 << endl
789 << " Paired-end:" << endl
790 << " -I/--minins <int> minimum fragment length (0)" << endl
791 << " -X/--maxins <int> maximum fragment length (500)" << endl
792 << " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (--fr)" << endl
793 << " --no-mixed suppress unpaired alignments for paired reads" << endl
794 << " --no-discordant suppress discordant alignments for paired reads" << endl
795 << " --no-dovetail not concordant when mates extend past each other" << endl
796 << " --no-contain not concordant when one mate alignment contains other" << endl
797 << " --no-overlap not concordant when mates overlap at all" << endl
798 << endl
799 << " Output:" << endl;
800 //if(wrapper == "basic-0") {
801 // out << " --bam output directly to BAM (by piping through 'samtools view')" << endl;
802 //}
803 out << " -t/--time print wall-clock time taken by search phases" << endl;
804 if(wrapper == "basic-0") {
805 out << " --un <path> write unpaired reads that didn't align to <path>" << endl
806 << " --al <path> write unpaired reads that aligned at least once to <path>" << endl
807 << " --un-conc <path> write pairs that didn't align concordantly to <path>" << endl
808 << " --al-conc <path> write pairs that aligned concordantly at least once to <path>" << endl
809 << " (Note: for --un, --al, --un-conc, or --al-conc, add '-gz' to the option name, e.g." << endl
810 << " --un-gz <path>, to gzip compress output, or add '-bz2' to bzip2 compress output.)" << endl;
811 }
812 out << " --quiet print nothing to stderr except serious errors" << endl
813 // << " --refidx refer to ref. seqs by 0-based index rather than name" << endl
814 << " --met-file <path> send metrics to file at <path> (off)" << endl
815 << " --met-stderr send metrics to stderr (off)" << endl
816 << " --met <int> report internal counters & metrics every <int> secs (1)" << endl
817 // Following is supported in the wrapper instead
818 // << " --no-unal supppress SAM records for unaligned reads" << endl
819 << " --no-head supppress header lines, i.e. lines starting with @" << endl
820 << " --no-sq supppress @SQ header lines" << endl
821 << " --rg-id <text> set read group id, reflected in @RG line and RG:Z: opt field" << endl
822 << " --rg <text> add <text> (\"lab:value\") to @RG line of SAM header." << endl
823 << " Note: @RG line only printed when --rg-id is set." << endl
824 << " --omit-sec-seq put '*' in SEQ and QUAL fields for secondary alignments." << endl
825 << endl
826 << " Performance:" << endl
827 << " -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
828 << " -p/--threads <int> number of alignment threads to launch (1)" << endl
829 << " --reorder force SAM output order to match order of input reads" << endl
830 #ifdef BOWTIE_MM
831 << " --mm use memory-mapped I/O for index; many 'hisat2's can share" << endl
832 #endif
833 #ifdef BOWTIE_SHARED_MEM
834 //<< " --shmem use shared mem for index; many 'hisat2's can share" << endl
835 #endif
836 << endl
837 << " Other:" << endl
838 << " --qc-filter filter out reads that are bad according to QSEQ filter" << endl
839 << " --seed <int> seed for random number generator (0)" << endl
840 << " --non-deterministic seed rand. gen. arbitrarily instead of using read attributes" << endl
841 // << " --verbose verbose output for debugging" << endl
842 << " --version print version information and quit" << endl
843 << " -h/--help print this usage message" << endl
844 ;
845 if(wrapper.empty()) {
846 cerr << endl
847 << "*** Warning ***" << endl
848 << "'hisat2-align' was run directly. It is recommended that you run the wrapper script 'hisat2' instead." << endl
849 << endl;
850 }
851 }
852
853 /**
854 * Parse an int out of optarg and enforce that it be at least 'lower';
855 * if it is less than 'lower', than output the given error message and
856 * exit with an error and a usage message.
857 */
parseInt(int lower,int upper,const char * errmsg,const char * arg)858 static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
859 long l;
860 char *endPtr= NULL;
861 l = strtol(arg, &endPtr, 10);
862 if (endPtr != NULL) {
863 if (l < lower || l > upper) {
864 cerr << errmsg << endl;
865 printUsage(cerr);
866 throw 1;
867 }
868 return (int32_t)l;
869 }
870 cerr << errmsg << endl;
871 printUsage(cerr);
872 throw 1;
873 return -1;
874 }
875
876 /**
877 * Upper is maximum int by default.
878 */
parseInt(int lower,const char * errmsg,const char * arg)879 static int parseInt(int lower, const char *errmsg, const char *arg) {
880 return parseInt(lower, std::numeric_limits<int>::max(), errmsg, arg);
881 }
882
883 /**
884 * Parse a T string 'str'.
885 */
886 template<typename T>
parse(const char * s)887 T parse(const char *s) {
888 T tmp;
889 stringstream ss(s);
890 ss >> tmp;
891 return tmp;
892 }
893
894 /**
895 * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
896 */
897 template<typename T>
parsePair(const char * str,char delim)898 pair<T, T> parsePair(const char *str, char delim) {
899 string s(str);
900 EList<string> ss;
901 tokenize(s, delim, ss);
902 pair<T, T> ret;
903 ret.first = parse<T>(ss[0].c_str());
904 ret.second = parse<T>(ss[1].c_str());
905 return ret;
906 }
907
908 /**
909 * Parse a pair of Ts from a string, 'str', delimited with 'delim'.
910 */
911 template<typename T>
parseTuple(const char * str,char delim,EList<T> & ret)912 void parseTuple(const char *str, char delim, EList<T>& ret) {
913 string s(str);
914 EList<string> ss;
915 tokenize(s, delim, ss);
916 for(size_t i = 0; i < ss.size(); i++) {
917 ret.push_back(parse<T>(ss[i].c_str()));
918 }
919 }
920
applyPreset(const string & sorig,Presets & presets)921 static string applyPreset(const string& sorig, Presets& presets) {
922 string s = sorig;
923 size_t found = s.find("%LOCAL%");
924 if(found != string::npos) {
925 s.replace(found, strlen("%LOCAL%"), localAlign ? "-local" : "");
926 }
927 if(gVerbose) {
928 cerr << "Applying preset: '" << s.c_str() << "' using preset menu '"
929 << presets.name() << "'" << endl;
930 }
931 string pol;
932 presets.apply(s, pol, extra_opts);
933 return pol;
934 }
935
936 static bool saw_M;
937 static bool saw_a;
938 static bool saw_k;
939 static EList<string> presetList;
940
941 /**
942 * TODO: Argument parsing is very, very flawed. The biggest problem is that
943 * there are two separate worlds of arguments, the ones set via polstr, and
944 * the ones set directly in variables. This makes for nasty interactions,
945 * e.g., with the -M option being resolved at an awkward time relative to
946 * the -k and -a options.
947 */
parseOption(int next_option,const char * arg)948 static void parseOption(int next_option, const char *arg) {
949 switch (next_option) {
950 case ARG_TEST_25: bowtie2p5 = true; break;
951 case ARG_DESC_KB: descentTotSz = SimpleFunc::parse(arg, 0.0, 1024.0, 1024.0, DMAX); break;
952 case ARG_DESC_FMOPS: descentTotFmops = SimpleFunc::parse(arg, 0.0, 10.0, 100.0, DMAX); break;
953 case ARG_DESC_LANDING: descentLanding = parse<int>(arg); break;
954 case ARG_DESC_EXP: {
955 descConsExp = parse<double>(arg);
956 if(descConsExp < 0.0) {
957 cerr << "Error: --desc-exp must be greater than or equal to 0" << endl;
958 throw 1;
959 }
960 break;
961 }
962 case '1': tokenize(arg, ",", mates1); break;
963 case '2': tokenize(arg, ",", mates2); break;
964 case ARG_ONETWO: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
965 case ARG_TAB5: tokenize(arg, ",", mates12); format = TAB_MATE5; break;
966 case ARG_TAB6: tokenize(arg, ",", mates12); format = TAB_MATE6; break;
967 case 'f': format = FASTA; break;
968 case 'F': {
969 format = FASTA_CONT;
970 pair<uint32_t, uint32_t> p = parsePair<uint32_t>(arg, ',');
971 fastaContLen = p.first;
972 fastaContFreq = p.second;
973 break;
974 }
975 case ARG_BWA_SW_LIKE: {
976 bwaSwLikeC = 5.5f;
977 bwaSwLikeT = 30;
978 bwaSwLike = true;
979 localAlign = true;
980 // -a INT Score of a match [1]
981 // -b INT Mismatch penalty [3]
982 // -q INT Gap open penalty [5]
983 // -r INT Gap extension penalty. The penalty for a contiguous
984 // gap of size k is q+k*r. [2]
985 polstr += ";MA=1;MMP=C3;RDG=5,2;RFG=5,2";
986 break;
987 }
988 case 'q': format = FASTQ; break;
989 case 'r': format = RAW; break;
990 case 'c': format = CMDLINE; break;
991 case ARG_QSEQ: format = QSEQ; break;
992 case 'C': {
993 cerr << "Error: -C specified but Bowtie 2 does not support colorspace input." << endl;
994 throw 1;
995 break;
996 }
997 case 'I':
998 gMinInsert = parseInt(0, "-I arg must be positive", arg);
999 break;
1000 case 'X':
1001 gMaxInsert = parseInt(1, "-X arg must be at least 1", arg);
1002 break;
1003 case ARG_NO_DISCORDANT: gReportDiscordant = false; break;
1004 case ARG_NO_MIXED: gReportMixed = false; break;
1005 case 's':
1006 skipReads = (uint32_t)parseInt(0, "-s arg must be positive", arg);
1007 break;
1008 case ARG_FF: gMate1fw = true; gMate2fw = true; break;
1009 case ARG_RF: gMate1fw = false; gMate2fw = true; break;
1010 case ARG_FR: gMate1fw = true; gMate2fw = false; break;
1011 case ARG_SHMEM: useShmem = true; break;
1012 case ARG_SEED_SUMM: seedSumm = true; break;
1013 case ARG_MM: {
1014 #ifdef BOWTIE_MM
1015 useMm = true;
1016 break;
1017 #else
1018 cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
1019 << "BOWTIE_MM defined. Memory-mapped I/O is not supported under Windows. If you" << endl
1020 << "would like to use memory-mapped I/O on a platform that supports it, please" << endl
1021 << "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
1022 throw 1;
1023 #endif
1024 }
1025 case ARG_MMSWEEP: mmSweep = true; break;
1026 case ARG_HADOOPOUT: hadoopOut = true; break;
1027 case ARG_SOLEXA_QUALS: solexaQuals = true; break;
1028 case ARG_INTEGER_QUALS: integerQuals = true; break;
1029 case ARG_PHRED64: phred64Quals = true; break;
1030 case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
1031 case ARG_OVERHANG: gReportOverhangs = true; break;
1032 case ARG_NO_CACHE: msNoCache = true; break;
1033 case ARG_USE_CACHE: msNoCache = false; break;
1034 case ARG_LOCAL_SEED_CACHE_SZ:
1035 seedCacheLocalMB = (uint32_t)parseInt(1, "--local-seed-cache-sz arg must be at least 1", arg);
1036 break;
1037 case ARG_CURRENT_SEED_CACHE_SZ:
1038 seedCacheCurrentMB = (uint32_t)parseInt(1, "--seed-cache-sz arg must be at least 1", arg);
1039 break;
1040 case ARG_REFIDX: noRefNames = true; break;
1041 case ARG_FUZZY: fuzzy = true; break;
1042 case ARG_FULLREF: fullRef = true; break;
1043 case ARG_GAP_BAR:
1044 gGapBarrier = parseInt(1, "--gbar must be no less than 1", arg);
1045 break;
1046 case ARG_SEED:
1047 seed = parseInt(0, "--seed arg must be at least 0", arg);
1048 break;
1049 case ARG_NON_DETERMINISTIC:
1050 arbitraryRandom = true;
1051 break;
1052 case 'u':
1053 qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1", arg);
1054 break;
1055 case 'Q':
1056 tokenize(arg, ",", qualities);
1057 integerQuals = true;
1058 break;
1059 case ARG_QUALS1:
1060 tokenize(arg, ",", qualities1);
1061 integerQuals = true;
1062 break;
1063 case ARG_QUALS2:
1064 tokenize(arg, ",", qualities2);
1065 integerQuals = true;
1066 break;
1067 case ARG_CACHE_LIM:
1068 cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1", arg);
1069 break;
1070 case ARG_CACHE_SZ:
1071 cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1", arg);
1072 cacheSize *= (1024 * 1024); // convert from MB to B
1073 break;
1074 case ARG_WRAPPER: wrapper = arg; break;
1075 case 'p':
1076 nthreads = parseInt(1, "-p/--threads arg must be at least 1", arg);
1077 break;
1078 case ARG_FILEPAR:
1079 fileParallel = true;
1080 break;
1081 case '3': gTrim3 = parseInt(0, "-3/--trim3 arg must be at least 0", arg); break;
1082 case '5': gTrim5 = parseInt(0, "-5/--trim5 arg must be at least 0", arg); break;
1083 case 'h': printUsage(cout); throw 0; break;
1084 case ARG_USAGE: printUsage(cout); throw 0; break;
1085 //
1086 // NOTE that unlike in Bowtie 1, -M, -a and -k are mutually
1087 // exclusive here.
1088 //
1089 case 'M': {
1090 msample = true;
1091 mhits = parse<uint32_t>(arg);
1092 if(saw_a || saw_k) {
1093 cerr << "Warning: -M, -k and -a are mutually exclusive. "
1094 << "-M will override" << endl;
1095 khits = 1;
1096 }
1097 assert_eq(1, khits);
1098 saw_M = true;
1099 cerr << "Warning: -M is deprecated. Use -D and -R to adjust " <<
1100 "effort instead." << endl;
1101 break;
1102 }
1103 case ARG_EXTEND_ITERS: {
1104 maxIters = parse<size_t>(arg);
1105 break;
1106 }
1107 case ARG_NO_EXTEND: {
1108 doExtend = false;
1109 break;
1110 }
1111 case 'R': { polstr += ";ROUNDS="; polstr += arg; break; }
1112 case 'D': { polstr += ";DPS="; polstr += arg; break; }
1113 case ARG_DP_MATE_STREAK_THRESH: {
1114 maxMateStreak = parse<size_t>(arg);
1115 break;
1116 }
1117 case ARG_DP_FAIL_STREAK_THRESH: {
1118 maxDpStreak = parse<size_t>(arg);
1119 break;
1120 }
1121 case ARG_EE_FAIL_STREAK_THRESH: {
1122 maxEeStreak = parse<size_t>(arg);
1123 break;
1124 }
1125 case ARG_UG_FAIL_STREAK_THRESH: {
1126 maxUgStreak = parse<size_t>(arg);
1127 break;
1128 }
1129 case ARG_DP_FAIL_THRESH: {
1130 maxDp = parse<size_t>(arg);
1131 break;
1132 }
1133 case ARG_UG_FAIL_THRESH: {
1134 maxUg = parse<size_t>(arg);
1135 break;
1136 }
1137 case ARG_SEED_BOOST_THRESH: {
1138 seedBoostThresh = parse<int>(arg);
1139 break;
1140 }
1141 case 'a': {
1142 msample = false;
1143 allHits = true;
1144 mhits = 0; // disable -M
1145 if(saw_M || saw_k) {
1146 cerr << "Warning: -M, -k and -a are mutually exclusive. "
1147 << "-a will override" << endl;
1148 }
1149 saw_a = true;
1150 break;
1151 }
1152 case 'k': {
1153 msample = false;
1154 khits = (uint32_t)parseInt(1, "-k arg must be at least 1", arg);
1155 mhits = 0; // disable -M
1156 if(saw_M || saw_a) {
1157 cerr << "Warning: -M, -k and -a are mutually exclusive. "
1158 << "-k will override" << endl;
1159 }
1160 saw_k = true;
1161 break;
1162 }
1163 case ARG_VERBOSE: gVerbose = 1; break;
1164 case ARG_STARTVERBOSE: startVerbose = true; break;
1165 case ARG_QUIET: gQuiet = true; break;
1166 case ARG_SANITY: sanityCheck = true; break;
1167 case 't': timing = true; break;
1168 case ARG_METRIC_IVAL: {
1169 metricsIval = parseInt(1, "--metrics arg must be at least 1", arg);
1170 break;
1171 }
1172 case ARG_METRIC_FILE: metricsFile = arg; break;
1173 case ARG_METRIC_STDERR: metricsStderr = true; break;
1174 case ARG_METRIC_PER_READ: metricsPerRead = true; break;
1175 case ARG_NO_FW: gNofw = true; break;
1176 case ARG_NO_RC: gNorc = true; break;
1177 case ARG_SAM_NO_QNAME_TRUNC: samTruncQname = false; break;
1178 case ARG_SAM_OMIT_SEC_SEQ: samOmitSecSeqQual = true; break;
1179 case ARG_SAM_NO_UNAL: samNoUnal = true; break;
1180 case ARG_SAM_NOHEAD: samNoHead = true; break;
1181 case ARG_SAM_NOSQ: samNoSQ = true; break;
1182 case ARG_SAM_PRINT_YI: sam_print_yi = true; break;
1183 case ARG_REORDER: reorder = true; break;
1184 case ARG_MAPQ_EX: {
1185 sam_print_zp = true;
1186 sam_print_zu = true;
1187 sam_print_xp = true;
1188 sam_print_xss = true;
1189 sam_print_yn = true;
1190 break;
1191 }
1192 case ARG_SHOW_RAND_SEED: {
1193 sam_print_zs = true;
1194 break;
1195 }
1196 case ARG_SAMPLE:
1197 sampleFrac = parse<float>(arg);
1198 break;
1199 case ARG_CP_MIN:
1200 cminlen = parse<size_t>(arg);
1201 break;
1202 case ARG_CP_IVAL:
1203 cpow2 = parse<size_t>(arg);
1204 break;
1205 case ARG_TRI:
1206 doTri = true;
1207 break;
1208 case ARG_READ_PASSTHRU: {
1209 sam_print_xr = true;
1210 break;
1211 }
1212 case ARG_READ_TIMES: {
1213 sam_print_xt = true;
1214 sam_print_xd = true;
1215 sam_print_xu = true;
1216 sam_print_yl = true;
1217 sam_print_ye = true;
1218 sam_print_yu = true;
1219 sam_print_yr = true;
1220 sam_print_zb = true;
1221 sam_print_zr = true;
1222 sam_print_zf = true;
1223 sam_print_zm = true;
1224 sam_print_zi = true;
1225 break;
1226 }
1227 case ARG_SAM_RG: {
1228 string argstr = arg;
1229 if(argstr.substr(0, 3) == "ID:") {
1230 rgid = "\t";
1231 rgid += argstr;
1232 rgs_optflag = "RG:Z:" + argstr.substr(3);
1233 } else {
1234 rgs += '\t';
1235 rgs += argstr;
1236 }
1237 break;
1238 }
1239 case ARG_SAM_RGID: {
1240 string argstr = arg;
1241 rgid = "\t";
1242 rgid = "\tID:" + argstr;
1243 rgs_optflag = "RG:Z:" + argstr;
1244 break;
1245 }
1246 case ARG_PARTITION: partitionSz = parse<int>(arg); break;
1247 case ARG_DPAD:
1248 maxhalf = parseInt(0, "--dpad must be no less than 0", arg);
1249 break;
1250 case ARG_ORIG:
1251 if(arg == NULL || strlen(arg) == 0) {
1252 cerr << "--orig arg must be followed by a string" << endl;
1253 printUsage(cerr);
1254 throw 1;
1255 }
1256 origString = arg;
1257 break;
1258 case ARG_LOCAL: localAlign = true; break;
1259 case ARG_END_TO_END: localAlign = false; break;
1260 case ARG_SSE8: enable8 = true; break;
1261 case ARG_SSE8_NO: enable8 = false; break;
1262 case ARG_UNGAPPED: doUngapped = true; break;
1263 case ARG_UNGAPPED_NO: doUngapped = false; break;
1264 case ARG_NO_DOVETAIL: gDovetailMatesOK = false; break;
1265 case ARG_NO_CONTAIN: gContainMatesOK = false; break;
1266 case ARG_NO_OVERLAP: gOlapMatesOK = false; break;
1267 case ARG_DOVETAIL: gDovetailMatesOK = true; break;
1268 case ARG_CONTAIN: gContainMatesOK = true; break;
1269 case ARG_OVERLAP: gOlapMatesOK = true; break;
1270 case ARG_QC_FILTER: qcFilter = true; break;
1271 case ARG_NO_SCORE_PRIORITY: sortByScore = false; break;
1272 case ARG_IGNORE_QUALS: ignoreQuals = true; break;
1273 case ARG_MAPQ_V: mapqv = parse<int>(arg); break;
1274 case ARG_TIGHTEN: tighten = parse<int>(arg); break;
1275 case ARG_EXACT_UPFRONT: doExactUpFront = true; break;
1276 case ARG_1MM_UPFRONT: do1mmUpFront = true; break;
1277 case ARG_EXACT_UPFRONT_NO: doExactUpFront = false; break;
1278 case ARG_1MM_UPFRONT_NO: do1mmUpFront = false; break;
1279 case ARG_1MM_MINLEN: do1mmMinLen = parse<size_t>(arg); break;
1280 case ARG_NOISY_HPOLY: noisyHpolymer = true; break;
1281 case 'x': bt2index = arg; break;
1282 case ARG_PRESET_VERY_FAST_LOCAL: localAlign = true;
1283 case ARG_PRESET_VERY_FAST: {
1284 presetList.push_back("very-fast%LOCAL%"); break;
1285 }
1286 case ARG_PRESET_FAST_LOCAL: localAlign = true;
1287 case ARG_PRESET_FAST: {
1288 presetList.push_back("fast%LOCAL%"); break;
1289 }
1290 case ARG_PRESET_SENSITIVE_LOCAL: localAlign = true;
1291 case ARG_PRESET_SENSITIVE: {
1292 presetList.push_back("sensitive%LOCAL%"); break;
1293 }
1294 case ARG_PRESET_VERY_SENSITIVE_LOCAL: localAlign = true;
1295 case ARG_PRESET_VERY_SENSITIVE: {
1296 presetList.push_back("very-sensitive%LOCAL%"); break;
1297 }
1298 case 'P': { presetList.push_back(arg); break; }
1299 case ARG_ALIGN_POLICY: {
1300 if(strlen(arg) > 0) {
1301 polstr += ";"; polstr += arg;
1302 }
1303 break;
1304 }
1305 case 'N': { polstr += ";SEED="; polstr += arg; break; }
1306 case 'L': {
1307 int64_t len = parse<size_t>(arg);
1308 if(len < 0) {
1309 cerr << "Error: -L argument must be >= 0; was " << arg << endl;
1310 throw 1;
1311 }
1312 if(len > 32) {
1313 cerr << "Error: -L argument must be <= 32; was" << arg << endl;
1314 throw 1;
1315 }
1316 polstr += ";SEEDLEN="; polstr += arg; break;
1317 }
1318 case 'O':
1319 multiseedOff = parse<size_t>(arg);
1320 break;
1321 case 'i': {
1322 EList<string> args;
1323 tokenize(arg, ",", args);
1324 if(args.size() > 3 || args.size() == 0) {
1325 cerr << "Error: expected 3 or fewer comma-separated "
1326 << "arguments to -i option, got "
1327 << args.size() << endl;
1328 throw 1;
1329 }
1330 // Interval-settings arguments
1331 polstr += (";IVAL=" + args[0]); // Function type
1332 if(args.size() > 1) {
1333 polstr += ("," + args[1]); // Constant term
1334 }
1335 if(args.size() > 2) {
1336 polstr += ("," + args[2]); // Coefficient
1337 }
1338 break;
1339 }
1340 case ARG_MULTISEED_IVAL: {
1341 polstr += ";";
1342 // Split argument by comma
1343 EList<string> args;
1344 tokenize(arg, ",", args);
1345 if(args.size() > 5 || args.size() == 0) {
1346 cerr << "Error: expected 5 or fewer comma-separated "
1347 << "arguments to --multiseed option, got "
1348 << args.size() << endl;
1349 throw 1;
1350 }
1351 // Seed mm and length arguments
1352 polstr += "SEED=";
1353 polstr += (args[0]); // # mismatches
1354 if(args.size() > 1) polstr += ("," + args[ 1]); // length
1355 if(args.size() > 2) polstr += (";IVAL=" + args[2]); // Func type
1356 if(args.size() > 3) polstr += ("," + args[ 3]); // Constant term
1357 if(args.size() > 4) polstr += ("," + args[ 4]); // Coefficient
1358 break;
1359 }
1360 case ARG_N_CEIL: {
1361 // Split argument by comma
1362 EList<string> args;
1363 tokenize(arg, ",", args);
1364 if(args.size() > 3) {
1365 cerr << "Error: expected 3 or fewer comma-separated "
1366 << "arguments to --n-ceil option, got "
1367 << args.size() << endl;
1368 throw 1;
1369 }
1370 if(args.size() == 0) {
1371 cerr << "Error: expected at least one argument to --n-ceil option" << endl;
1372 throw 1;
1373 }
1374 polstr += ";NCEIL=";
1375 if(args.size() == 3) {
1376 polstr += (args[0] + "," + args[1] + "," + args[2]);
1377 } else {
1378 if(args.size() == 1) {
1379 polstr += ("C," + args[0]);
1380 } else {
1381 polstr += (args[0] + "," + args[1]);
1382 }
1383 }
1384 break;
1385 }
1386 case ARG_SCORE_MA: polstr += ";MA="; polstr += arg; break;
1387 case ARG_SCORE_MMP: {
1388 EList<string> args;
1389 tokenize(arg, ",", args);
1390 if(args.size() > 2 || args.size() == 0) {
1391 cerr << "Error: expected 1 or 2 comma-separated "
1392 << "arguments to --mmp option, got " << args.size() << endl;
1393 throw 1;
1394 }
1395 if(args.size() >= 1) {
1396 polstr += ";MMP=Q,";
1397 polstr += args[0];
1398 if(args.size() >= 2) {
1399 polstr += ",";
1400 polstr += args[1];
1401 }
1402 }
1403 break;
1404 }
1405 case ARG_SCORE_NP: polstr += ";NP=C"; polstr += arg; break;
1406 case ARG_SCORE_RDG: polstr += ";RDG="; polstr += arg; break;
1407 case ARG_SCORE_RFG: polstr += ";RFG="; polstr += arg; break;
1408 case ARG_SCORE_MIN: {
1409 polstr += ";";
1410 EList<string> args;
1411 tokenize(arg, ",", args);
1412 if(args.size() > 3 && args.size() == 0) {
1413 cerr << "Error: expected 3 or fewer comma-separated "
1414 << "arguments to --n-ceil option, got "
1415 << args.size() << endl;
1416 throw 1;
1417 }
1418 polstr += ("MIN=" + args[0]);
1419 if(args.size() > 1) {
1420 polstr += ("," + args[1]);
1421 }
1422 if(args.size() > 2) {
1423 polstr += ("," + args[2]);
1424 }
1425 break;
1426 }
1427 case ARG_DESC: printArgDesc(cout); throw 0;
1428 case 'S': outfile = arg; break;
1429 case 'U': {
1430 EList<string> args;
1431 tokenize(arg, ",", args);
1432 for(size_t i = 0; i < args.size(); i++) {
1433 queries.push_back(args[i]);
1434 }
1435 break;
1436 }
1437 case ARG_VERSION: showVersion = 1; break;
1438 case ARG_NO_TEMPSPLICESITE: useTempSpliceSite = false; break;
1439 case ARG_PEN_CANSPLICE: {
1440 penCanSplice = parseInt(0, "-k arg must be at least 0", arg);
1441 break;
1442 }
1443 case ARG_PEN_NONCANSPLICE: {
1444 penNoncanSplice = parseInt(0, "-k arg must be at least 0", arg);
1445 break;
1446 }
1447 case ARG_PEN_INTRONLEN: {
1448 polstr += ";";
1449 EList<string> args;
1450 tokenize(arg, ",", args);
1451 if(args.size() > 3 && args.size() == 0) {
1452 cerr << "Error: expected 3 or fewer comma-separated "
1453 << "arguments to --n-ceil option, got "
1454 << args.size() << endl;
1455 throw 1;
1456 }
1457 polstr += ("INTRONLEN=" + args[0]);
1458 if(args.size() > 1) {
1459 polstr += ("," + args[1]);
1460 }
1461 if(args.size() > 2) {
1462 polstr += ("," + args[2]);
1463 }
1464 break;
1465 }
1466 case ARG_KNOWN_SPLICESITE_INFILE: knownSpliceSiteInfile = arg; break;
1467 case ARG_NOVEL_SPLICESITE_INFILE: novelSpliceSiteInfile = arg; break;
1468 case ARG_NOVEL_SPLICESITE_OUTFILE: novelSpliceSiteOutfile = arg; break;
1469 case ARG_NO_SPLICED_ALIGNMENT: no_spliced_alignment = true; break;
1470 case ARG_RNA_STRANDNESS: {
1471 string strandness = arg;
1472 if(strandness == "F") rna_strandness = RNA_STRANDNESS_F;
1473 else if(strandness == "R") rna_strandness = RNA_STRANDNESS_R;
1474 else if(strandness == "FR") rna_strandness = RNA_STRANDNESS_FR;
1475 else if(strandness == "RF") rna_strandness = RNA_STRANDNESS_RF;
1476 else {
1477 // daehwan - throw exception with details
1478 cerr << "Error: should be one of F, R, FR, or RF " << endl;
1479 throw 1;
1480 }
1481 break;
1482 }
1483 default:
1484 printUsage(cerr);
1485 throw 1;
1486 }
1487 }
1488
1489 /**
1490 * Read command-line arguments
1491 */
parseOptions(int argc,const char ** argv)1492 static void parseOptions(int argc, const char **argv) {
1493 int option_index = 0;
1494 int next_option;
1495 saw_M = false;
1496 saw_a = false;
1497 saw_k = true;
1498 presetList.clear();
1499 if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
1500 while(true) {
1501 next_option = getopt_long(
1502 argc, const_cast<char**>(argv),
1503 short_options, long_options, &option_index);
1504 const char * arg = optarg;
1505 if(next_option == EOF) {
1506 if(extra_opts_cur < extra_opts.size()) {
1507 next_option = extra_opts[extra_opts_cur].first;
1508 arg = extra_opts[extra_opts_cur].second.c_str();
1509 extra_opts_cur++;
1510 } else {
1511 break;
1512 }
1513 }
1514 parseOption(next_option, arg);
1515 }
1516 // Now parse all the presets. Might want to pick which presets version to
1517 // use according to other parameters.
1518 auto_ptr<Presets> presets(new PresetsV0());
1519 // Apply default preset
1520 if(!defaultPreset.empty()) {
1521 polstr = applyPreset(defaultPreset, *presets.get()) + polstr;
1522 }
1523 // Apply specified presets
1524 for(size_t i = 0; i < presetList.size(); i++) {
1525 polstr += applyPreset(presetList[i], *presets.get());
1526 }
1527 for(size_t i = 0; i < extra_opts.size(); i++) {
1528 next_option = extra_opts[extra_opts_cur].first;
1529 const char *arg = extra_opts[extra_opts_cur].second.c_str();
1530 parseOption(next_option, arg);
1531 }
1532 // Remove initial semicolons
1533 while(!polstr.empty() && polstr[0] == ';') {
1534 polstr = polstr.substr(1);
1535 }
1536 if(gVerbose) {
1537 cerr << "Final policy string: '" << polstr.c_str() << "'" << endl;
1538 }
1539 size_t failStreakTmp = 0;
1540 SeedAlignmentPolicy::parseString(
1541 polstr,
1542 localAlign,
1543 noisyHpolymer,
1544 ignoreQuals,
1545 bonusMatchType,
1546 bonusMatch,
1547 penMmcType,
1548 penMmcMax,
1549 penMmcMin,
1550 penNType,
1551 penN,
1552 penRdGapConst,
1553 penRfGapConst,
1554 penRdGapLinear,
1555 penRfGapLinear,
1556 scoreMin,
1557 nCeil,
1558 penNCatPair,
1559 multiseedMms,
1560 multiseedLen,
1561 msIval,
1562 failStreakTmp,
1563 nSeedRounds,
1564 &penIntronLen);
1565 if(failStreakTmp > 0) {
1566 maxEeStreak = failStreakTmp;
1567 maxUgStreak = failStreakTmp;
1568 maxDpStreak = failStreakTmp;
1569 }
1570 if(saw_a || saw_k) {
1571 msample = false;
1572 mhits = 0;
1573 } else {
1574 assert_gt(mhits, 0);
1575 msample = true;
1576 }
1577 if(mates1.size() != mates2.size()) {
1578 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
1579 << "mate files/sequences were specified with -2. The same number of mate files/" << endl
1580 << "sequences must be specified with -1 and -2." << endl;
1581 throw 1;
1582 }
1583 if(qualities.size() && format != FASTA) {
1584 cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
1585 << "enabled. -Q works only in combination with -f and -C." << endl;
1586 throw 1;
1587 }
1588 if(qualities1.size() && format != FASTA) {
1589 cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
1590 << "enabled. --Q1 works only in combination with -f and -C." << endl;
1591 throw 1;
1592 }
1593 if(qualities2.size() && format != FASTA) {
1594 cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
1595 << "enabled. --Q2 works only in combination with -f and -C." << endl;
1596 throw 1;
1597 }
1598 if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
1599 cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
1600 << "quality files were specified with --Q1. The same number of mate and quality" << endl
1601 << "files must sequences must be specified with -1 and --Q1." << endl;
1602 throw 1;
1603 }
1604 if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
1605 cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
1606 << "quality files were specified with --Q2. The same number of mate and quality" << endl
1607 << "files must sequences must be specified with -2 and --Q2." << endl;
1608 throw 1;
1609 }
1610 if(!rgs.empty() && rgid.empty()) {
1611 cerr << "Warning: --rg was specified without --rg-id also "
1612 << "being specified. @RG line is not printed unless --rg-id "
1613 << "is specified." << endl;
1614 }
1615 // Check for duplicate mate input files
1616 if(format != CMDLINE) {
1617 for(size_t i = 0; i < mates1.size(); i++) {
1618 for(size_t j = 0; j < mates2.size(); j++) {
1619 if(mates1[i] == mates2[j] && !gQuiet) {
1620 cerr << "Warning: Same mate file \"" << mates1[i].c_str() << "\" appears as argument to both -1 and -2" << endl;
1621 }
1622 }
1623 }
1624 }
1625 // If both -s and -u are used, we need to adjust qUpto accordingly
1626 // since it uses rdid to know if we've reached the -u limit (and
1627 // rdids are all shifted up by skipReads characters)
1628 if(qUpto + skipReads > qUpto) {
1629 qUpto += skipReads;
1630 }
1631 if(useShmem && useMm && !gQuiet) {
1632 cerr << "Warning: --shmem overrides --mm..." << endl;
1633 useMm = false;
1634 }
1635 if(gGapBarrier < 1) {
1636 cerr << "Warning: --gbar was set less than 1 (=" << gGapBarrier
1637 << "); setting to 1 instead" << endl;
1638 gGapBarrier = 1;
1639 }
1640 if(multiseedMms >= multiseedLen) {
1641 assert_gt(multiseedLen, 0);
1642 cerr << "Warning: seed mismatches (" << multiseedMms
1643 << ") is less than seed length (" << multiseedLen
1644 << "); setting mismatches to " << (multiseedMms-1)
1645 << " instead" << endl;
1646 multiseedMms = multiseedLen-1;
1647 }
1648 sam_print_zm = sam_print_zm && bowtie2p5;
1649 #ifndef NDEBUG
1650 if(!gQuiet) {
1651 cerr << "Warning: Running in debug mode. Please use debug mode only "
1652 << "for diagnosing errors, and not for typical use of HISAT."
1653 << endl;
1654 }
1655 #endif
1656 }
1657
1658 static const char *argv0 = NULL;
1659
1660 /// Create a PatternSourcePerThread for the current thread according
1661 /// to the global params and return a pointer to it
1662 static PatternSourcePerThreadFactory*
createPatsrcFactory(PairedPatternSource & _patsrc,int tid)1663 createPatsrcFactory(PairedPatternSource& _patsrc, int tid) {
1664 PatternSourcePerThreadFactory *patsrcFact;
1665 patsrcFact = new WrappedPatternSourcePerThreadFactory(_patsrc);
1666 assert(patsrcFact != NULL);
1667 return patsrcFact;
1668 }
1669
1670 #define PTHREAD_ATTRS (PTHREAD_CREATE_JOINABLE | PTHREAD_CREATE_DETACHED)
1671
1672 typedef TIndexOffU index_t;
1673 typedef uint16_t local_index_t;
1674 static PairedPatternSource* multiseed_patsrc;
1675 static HierEbwt<index_t>* multiseed_ebwtFw;
1676 static HierEbwt<index_t>* multiseed_ebwtBw;
1677 static Scoring* multiseed_sc;
1678 static BitPairReference* multiseed_refs;
1679 static AlignmentCache<index_t>* multiseed_ca; // seed cache
1680 static AlnSink<index_t>* multiseed_msink;
1681 static EList<string> multiseed_refnames;
1682 static OutFileBuf* multiseed_metricsOfb;
1683 static SpliceSiteDB* ssdb;
1684 static MUTEX_T multiseed_mutex;
1685
1686 /**
1687 * Metrics for measuring the work done by the outer read alignment
1688 * loop.
1689 */
1690 struct OuterLoopMetrics {
1691
OuterLoopMetricsOuterLoopMetrics1692 OuterLoopMetrics() {
1693 reset();
1694 }
1695
1696 /**
1697 * Set all counters to 0.
1698 */
resetOuterLoopMetrics1699 void reset() {
1700 reads = bases = srreads = srbases =
1701 freads = fbases = ureads = ubases = 0;
1702 }
1703
1704 /**
1705 * Sum the counters in m in with the conters in this object. This
1706 * is the only safe way to update an OuterLoopMetrics that's shared
1707 * by multiple threads.
1708 */
mergeOuterLoopMetrics1709 void merge(
1710 const OuterLoopMetrics& m,
1711 bool getLock = false)
1712 {
1713 ThreadSafe ts(&mutex_m, getLock);
1714 reads += m.reads;
1715 bases += m.bases;
1716 srreads += m.srreads;
1717 srbases += m.srbases;
1718 freads += m.freads;
1719 fbases += m.fbases;
1720 ureads += m.ureads;
1721 ubases += m.ubases;
1722 }
1723
1724 uint64_t reads; // total reads
1725 uint64_t bases; // total bases
1726 uint64_t srreads; // same-read reads
1727 uint64_t srbases; // same-read bases
1728 uint64_t freads; // filtered reads
1729 uint64_t fbases; // filtered bases
1730 uint64_t ureads; // unfiltered reads
1731 uint64_t ubases; // unfiltered bases
1732 MUTEX_T mutex_m;
1733 };
1734
1735 /**
1736 * Collection of all relevant performance metrics when aligning in
1737 * multiseed mode.
1738 */
1739 struct PerfMetrics {
1740
PerfMetricsPerfMetrics1741 PerfMetrics() : first(true) { reset(); }
1742
1743 /**
1744 * Set all counters to 0.
1745 */
resetPerfMetrics1746 void reset() {
1747 olm.reset();
1748 sdm.reset();
1749 wlm.reset();
1750 swmSeed.reset();
1751 swmMate.reset();
1752 rpm.reset();
1753 dpSse8Seed.reset(); // 8-bit SSE seed extensions
1754 dpSse8Mate.reset(); // 8-bit SSE mate finds
1755 dpSse16Seed.reset(); // 16-bit SSE seed extensions
1756 dpSse16Mate.reset(); // 16-bit SSE mate finds
1757 nbtfiltst = 0;
1758 nbtfiltsc = 0;
1759 nbtfiltdo = 0;
1760
1761 olmu.reset();
1762 sdmu.reset();
1763 wlmu.reset();
1764 swmuSeed.reset();
1765 swmuMate.reset();
1766 rpmu.reset();
1767 dpSse8uSeed.reset(); // 8-bit SSE seed extensions
1768 dpSse8uMate.reset(); // 8-bit SSE mate finds
1769 dpSse16uSeed.reset(); // 16-bit SSE seed extensions
1770 dpSse16uMate.reset(); // 16-bit SSE mate finds
1771 nbtfiltst_u = 0;
1772 nbtfiltsc_u = 0;
1773 nbtfiltdo_u = 0;
1774
1775 him.reset();
1776 }
1777
1778 /**
1779 * Merge a set of specific metrics into this object.
1780 */
mergePerfMetrics1781 void merge(
1782 const OuterLoopMetrics *ol,
1783 const SeedSearchMetrics *sd,
1784 const WalkMetrics *wl,
1785 const SwMetrics *swSeed,
1786 const SwMetrics *swMate,
1787 const ReportingMetrics *rm,
1788 const SSEMetrics *dpSse8Ex,
1789 const SSEMetrics *dpSse8Ma,
1790 const SSEMetrics *dpSse16Ex,
1791 const SSEMetrics *dpSse16Ma,
1792 uint64_t nbtfiltst_,
1793 uint64_t nbtfiltsc_,
1794 uint64_t nbtfiltdo_,
1795 const HIMetrics *hi,
1796 bool getLock)
1797 {
1798 ThreadSafe ts(&mutex_m, getLock);
1799 if(ol != NULL) {
1800 olmu.merge(*ol, false);
1801 }
1802 if(sd != NULL) {
1803 sdmu.merge(*sd, false);
1804 }
1805 if(wl != NULL) {
1806 wlmu.merge(*wl, false);
1807 }
1808 if(swSeed != NULL) {
1809 swmuSeed.merge(*swSeed, false);
1810 }
1811 if(swMate != NULL) {
1812 swmuMate.merge(*swMate, false);
1813 }
1814 if(rm != NULL) {
1815 rpmu.merge(*rm, false);
1816 }
1817 if(dpSse8Ex != NULL) {
1818 dpSse8uSeed.merge(*dpSse8Ex, false);
1819 }
1820 if(dpSse8Ma != NULL) {
1821 dpSse8uMate.merge(*dpSse8Ma, false);
1822 }
1823 if(dpSse16Ex != NULL) {
1824 dpSse16uSeed.merge(*dpSse16Ex, false);
1825 }
1826 if(dpSse16Ma != NULL) {
1827 dpSse16uMate.merge(*dpSse16Ma, false);
1828 }
1829 nbtfiltst_u += nbtfiltst_;
1830 nbtfiltsc_u += nbtfiltsc_;
1831 nbtfiltdo_u += nbtfiltdo_;
1832 if(hi != NULL) {
1833 him.merge(*hi, false);
1834 }
1835 }
1836
1837 /**
1838 * Reports a matrix of results, incl. column labels, to an OutFileBuf.
1839 * Optionally also sends results to stderr (unbuffered). Can optionally
1840 * print a per-read record with the read name at the beginning.
1841 */
reportIntervalPerfMetrics1842 void reportInterval(
1843 OutFileBuf* o, // file to send output to
1844 bool metricsStderr, // additionally output to stderr?
1845 bool total, // true -> report total, otherwise incremental
1846 bool sync, // synchronize output
1847 const BTString *name) // non-NULL name pointer if is per-read record
1848 {
1849 ThreadSafe ts(&mutex_m, sync);
1850 ostringstream stderrSs;
1851 time_t curtime = time(0);
1852 char buf[1024];
1853 if(first) {
1854 const char *str =
1855 /* 1 */ "Time" "\t"
1856 /* 2 */ "Read" "\t"
1857 /* 3 */ "Base" "\t"
1858 /* 4 */ "SameRead" "\t"
1859 /* 5 */ "SameReadBase" "\t"
1860 /* 6 */ "UnfilteredRead" "\t"
1861 /* 7 */ "UnfilteredBase" "\t"
1862
1863 /* 8 */ "Paired" "\t"
1864 /* 9 */ "Unpaired" "\t"
1865 /* 10 */ "AlConUni" "\t"
1866 /* 11 */ "AlConRep" "\t"
1867 /* 12 */ "AlConFail" "\t"
1868 /* 13 */ "AlDis" "\t"
1869 /* 14 */ "AlConFailUni" "\t"
1870 /* 15 */ "AlConFailRep" "\t"
1871 /* 16 */ "AlConFailFail" "\t"
1872 /* 17 */ "AlConRepUni" "\t"
1873 /* 18 */ "AlConRepRep" "\t"
1874 /* 19 */ "AlConRepFail" "\t"
1875 /* 20 */ "AlUnpUni" "\t"
1876 /* 21 */ "AlUnpRep" "\t"
1877 /* 22 */ "AlUnpFail" "\t"
1878
1879 /* 23 */ "SeedSearch" "\t"
1880 /* 24 */ "IntraSCacheHit" "\t"
1881 /* 25 */ "InterSCacheHit" "\t"
1882 /* 26 */ "OutOfMemory" "\t"
1883 /* 27 */ "AlBWOp" "\t"
1884 /* 28 */ "AlBWBranch" "\t"
1885 /* 29 */ "ResBWOp" "\t"
1886 /* 30 */ "ResBWBranch" "\t"
1887 /* 31 */ "ResResolve" "\t"
1888 /* 34 */ "ResReport" "\t"
1889 /* 35 */ "RedundantSHit" "\t"
1890
1891 /* 36 */ "BestMinEdit0" "\t"
1892 /* 37 */ "BestMinEdit1" "\t"
1893 /* 38 */ "BestMinEdit2" "\t"
1894
1895 /* 39 */ "ExactAttempts" "\t"
1896 /* 40 */ "ExactSucc" "\t"
1897 /* 41 */ "ExactRanges" "\t"
1898 /* 42 */ "ExactRows" "\t"
1899 /* 43 */ "ExactOOMs" "\t"
1900
1901 /* 44 */ "1mmAttempts" "\t"
1902 /* 45 */ "1mmSucc" "\t"
1903 /* 46 */ "1mmRanges" "\t"
1904 /* 47 */ "1mmRows" "\t"
1905 /* 48 */ "1mmOOMs" "\t"
1906
1907 /* 49 */ "UngappedSucc" "\t"
1908 /* 50 */ "UngappedFail" "\t"
1909 /* 51 */ "UngappedNoDec" "\t"
1910
1911 /* 52 */ "DPExLt10Gaps" "\t"
1912 /* 53 */ "DPExLt5Gaps" "\t"
1913 /* 54 */ "DPExLt3Gaps" "\t"
1914
1915 /* 55 */ "DPMateLt10Gaps" "\t"
1916 /* 56 */ "DPMateLt5Gaps" "\t"
1917 /* 57 */ "DPMateLt3Gaps" "\t"
1918
1919 /* 58 */ "DP16ExDps" "\t"
1920 /* 59 */ "DP16ExDpSat" "\t"
1921 /* 60 */ "DP16ExDpFail" "\t"
1922 /* 61 */ "DP16ExDpSucc" "\t"
1923 /* 62 */ "DP16ExCol" "\t"
1924 /* 63 */ "DP16ExCell" "\t"
1925 /* 64 */ "DP16ExInner" "\t"
1926 /* 65 */ "DP16ExFixup" "\t"
1927 /* 66 */ "DP16ExGathSol" "\t"
1928 /* 67 */ "DP16ExBt" "\t"
1929 /* 68 */ "DP16ExBtFail" "\t"
1930 /* 69 */ "DP16ExBtSucc" "\t"
1931 /* 70 */ "DP16ExBtCell" "\t"
1932 /* 71 */ "DP16ExCoreRej" "\t"
1933 /* 72 */ "DP16ExNRej" "\t"
1934
1935 /* 73 */ "DP8ExDps" "\t"
1936 /* 74 */ "DP8ExDpSat" "\t"
1937 /* 75 */ "DP8ExDpFail" "\t"
1938 /* 76 */ "DP8ExDpSucc" "\t"
1939 /* 77 */ "DP8ExCol" "\t"
1940 /* 78 */ "DP8ExCell" "\t"
1941 /* 79 */ "DP8ExInner" "\t"
1942 /* 80 */ "DP8ExFixup" "\t"
1943 /* 81 */ "DP8ExGathSol" "\t"
1944 /* 82 */ "DP8ExBt" "\t"
1945 /* 83 */ "DP8ExBtFail" "\t"
1946 /* 84 */ "DP8ExBtSucc" "\t"
1947 /* 85 */ "DP8ExBtCell" "\t"
1948 /* 86 */ "DP8ExCoreRej" "\t"
1949 /* 87 */ "DP8ExNRej" "\t"
1950
1951 /* 88 */ "DP16MateDps" "\t"
1952 /* 89 */ "DP16MateDpSat" "\t"
1953 /* 90 */ "DP16MateDpFail" "\t"
1954 /* 91 */ "DP16MateDpSucc" "\t"
1955 /* 92 */ "DP16MateCol" "\t"
1956 /* 93 */ "DP16MateCell" "\t"
1957 /* 94 */ "DP16MateInner" "\t"
1958 /* 95 */ "DP16MateFixup" "\t"
1959 /* 96 */ "DP16MateGathSol" "\t"
1960 /* 97 */ "DP16MateBt" "\t"
1961 /* 98 */ "DP16MateBtFail" "\t"
1962 /* 99 */ "DP16MateBtSucc" "\t"
1963 /* 100 */ "DP16MateBtCell" "\t"
1964 /* 101 */ "DP16MateCoreRej" "\t"
1965 /* 102 */ "DP16MateNRej" "\t"
1966
1967 /* 103 */ "DP8MateDps" "\t"
1968 /* 104 */ "DP8MateDpSat" "\t"
1969 /* 105 */ "DP8MateDpFail" "\t"
1970 /* 106 */ "DP8MateDpSucc" "\t"
1971 /* 107 */ "DP8MateCol" "\t"
1972 /* 108 */ "DP8MateCell" "\t"
1973 /* 109 */ "DP8MateInner" "\t"
1974 /* 110 */ "DP8MateFixup" "\t"
1975 /* 111 */ "DP8MateGathSol" "\t"
1976 /* 112 */ "DP8MateBt" "\t"
1977 /* 113 */ "DP8MateBtFail" "\t"
1978 /* 114 */ "DP8MateBtSucc" "\t"
1979 /* 115 */ "DP8MateBtCell" "\t"
1980 /* 116 */ "DP8MateCoreRej" "\t"
1981 /* 117 */ "DP8MateNRej" "\t"
1982
1983 /* 118 */ "DPBtFiltStart" "\t"
1984 /* 119 */ "DPBtFiltScore" "\t"
1985 /* 120 */ "DpBtFiltDom" "\t"
1986
1987 /* 121 */ "MemPeak" "\t"
1988 /* 122 */ "UncatMemPeak" "\t" // 0
1989 /* 123 */ "EbwtMemPeak" "\t" // EBWT_CAT
1990 /* 124 */ "CacheMemPeak" "\t" // CA_CAT
1991 /* 125 */ "ResolveMemPeak" "\t" // GW_CAT
1992 /* 126 */ "AlignMemPeak" "\t" // AL_CAT
1993 /* 127 */ "DPMemPeak" "\t" // DP_CAT
1994 /* 128 */ "MiscMemPeak" "\t" // MISC_CAT
1995 /* 129 */ "DebugMemPeak" "\t" // DEBUG_CAT
1996
1997 /* 130 */ "LocalSearch" "\t"
1998 /* 131 */ "AnchorSearch" "\t"
1999 /* 132 */ "LocalIndexSearch" "\t"
2000 /* 133 */ "LocalExtSearch" "\t"
2001 /* 134 */ "LocalSearchRecur" "\t"
2002 /* 135 */ "GlobalGenomeCoords" "\t"
2003 /* 136 */ "LocalGenomeCoords" "\t"
2004
2005
2006 "\n";
2007
2008 if(name != NULL) {
2009 if(o != NULL) o->writeChars("Name\t");
2010 if(metricsStderr) stderrSs << "Name\t";
2011 }
2012
2013 if(o != NULL) o->writeChars(str);
2014 if(metricsStderr) stderrSs << str;
2015 first = false;
2016 }
2017
2018 if(total) mergeIncrementals();
2019
2020 // 0. Read name, if needed
2021 if(name != NULL) {
2022 if(o != NULL) {
2023 o->writeChars(name->toZBuf());
2024 o->write('\t');
2025 }
2026 if(metricsStderr) {
2027 stderrSs << (*name) << '\t';
2028 }
2029 }
2030
2031 // 1. Current time in secs
2032 itoa10<time_t>(curtime, buf);
2033 if(metricsStderr) stderrSs << buf << '\t';
2034 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2035
2036 const OuterLoopMetrics& ol = total ? olm : olmu;
2037
2038 // 2. Reads
2039 itoa10<uint64_t>(ol.reads, buf);
2040 if(metricsStderr) stderrSs << buf << '\t';
2041 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2042 // 3. Bases
2043 itoa10<uint64_t>(ol.bases, buf);
2044 if(metricsStderr) stderrSs << buf << '\t';
2045 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2046 // 4. Same-read reads
2047 itoa10<uint64_t>(ol.srreads, buf);
2048 if(metricsStderr) stderrSs << buf << '\t';
2049 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2050 // 5. Same-read bases
2051 itoa10<uint64_t>(ol.srbases, buf);
2052 if(metricsStderr) stderrSs << buf << '\t';
2053 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2054 // 6. Unfiltered reads
2055 itoa10<uint64_t>(ol.ureads, buf);
2056 if(metricsStderr) stderrSs << buf << '\t';
2057 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2058 // 7. Unfiltered bases
2059 itoa10<uint64_t>(ol.ubases, buf);
2060 if(metricsStderr) stderrSs << buf << '\t';
2061 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2062
2063 const ReportingMetrics& rp = total ? rpm : rpmu;
2064
2065 // 8. Paired reads
2066 itoa10<uint64_t>(rp.npaired, buf);
2067 if(metricsStderr) stderrSs << buf << '\t';
2068 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2069 // 9. Unpaired reads
2070 itoa10<uint64_t>(rp.nunpaired, buf);
2071 if(metricsStderr) stderrSs << buf << '\t';
2072 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2073 // 10. Pairs with unique concordant alignments
2074 itoa10<uint64_t>(rp.nconcord_uni, buf);
2075 if(metricsStderr) stderrSs << buf << '\t';
2076 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2077 // 11. Pairs with repetitive concordant alignments
2078 itoa10<uint64_t>(rp.nconcord_rep, buf);
2079 if(metricsStderr) stderrSs << buf << '\t';
2080 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2081 // 12. Pairs with 0 concordant alignments
2082 itoa10<uint64_t>(rp.nconcord_0, buf);
2083 if(metricsStderr) stderrSs << buf << '\t';
2084 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2085 // 13. Pairs with 1 discordant alignment
2086 itoa10<uint64_t>(rp.ndiscord, buf);
2087 if(metricsStderr) stderrSs << buf << '\t';
2088 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2089 // 14. Mates from unaligned pairs that align uniquely
2090 itoa10<uint64_t>(rp.nunp_0_uni, buf);
2091 if(metricsStderr) stderrSs << buf << '\t';
2092 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2093 // 15. Mates from unaligned pairs that align repetitively
2094 itoa10<uint64_t>(rp.nunp_0_rep, buf);
2095 if(metricsStderr) stderrSs << buf << '\t';
2096 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2097 // 16. Mates from unaligned pairs that fail to align
2098 itoa10<uint64_t>(rp.nunp_0_0, buf);
2099 if(metricsStderr) stderrSs << buf << '\t';
2100 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2101 // 17. Mates from repetitive pairs that align uniquely
2102 itoa10<uint64_t>(rp.nunp_rep_uni, buf);
2103 if(metricsStderr) stderrSs << buf << '\t';
2104 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2105 // 18. Mates from repetitive pairs that align repetitively
2106 itoa10<uint64_t>(rp.nunp_rep_rep, buf);
2107 if(metricsStderr) stderrSs << buf << '\t';
2108 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2109 // 19. Mates from repetitive pairs that fail to align
2110 itoa10<uint64_t>(rp.nunp_rep_0, buf);
2111 if(metricsStderr) stderrSs << buf << '\t';
2112 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2113 // 20. Unpaired reads that align uniquely
2114 itoa10<uint64_t>(rp.nunp_uni, buf);
2115 if(metricsStderr) stderrSs << buf << '\t';
2116 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2117 // 21. Unpaired reads that align repetitively
2118 itoa10<uint64_t>(rp.nunp_rep, buf);
2119 if(metricsStderr) stderrSs << buf << '\t';
2120 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2121 // 22. Unpaired reads that fail to align
2122 itoa10<uint64_t>(rp.nunp_0, buf);
2123 if(metricsStderr) stderrSs << buf << '\t';
2124 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2125
2126 const SeedSearchMetrics& sd = total ? sdm : sdmu;
2127
2128 // 23. Seed searches
2129 itoa10<uint64_t>(sd.seedsearch, buf);
2130 if(metricsStderr) stderrSs << buf << '\t';
2131 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2132 // 24. Hits in 'current' cache
2133 itoa10<uint64_t>(sd.intrahit, buf);
2134 if(metricsStderr) stderrSs << buf << '\t';
2135 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2136 // 25. Hits in 'local' cache
2137 itoa10<uint64_t>(sd.interhit, buf);
2138 if(metricsStderr) stderrSs << buf << '\t';
2139 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2140 // 26. Out of memory
2141 itoa10<uint64_t>(sd.ooms, buf);
2142 if(metricsStderr) stderrSs << buf << '\t';
2143 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2144 // 27. Burrows-Wheeler ops in aligner
2145 itoa10<uint64_t>(sd.bwops, buf);
2146 if(metricsStderr) stderrSs << buf << '\t';
2147 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2148 // 28. Burrows-Wheeler branches (edits) in aligner
2149 itoa10<uint64_t>(sd.bweds, buf);
2150 if(metricsStderr) stderrSs << buf << '\t';
2151 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2152
2153 const WalkMetrics& wl = total ? wlm : wlmu;
2154
2155 // 29. Burrows-Wheeler ops in resolver
2156 itoa10<uint64_t>(wl.bwops, buf);
2157 if(metricsStderr) stderrSs << buf << '\t';
2158 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2159 // 30. Burrows-Wheeler branches in resolver
2160 itoa10<uint64_t>(wl.branches, buf);
2161 if(metricsStderr) stderrSs << buf << '\t';
2162 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2163 // 31. Burrows-Wheeler offset resolutions
2164 itoa10<uint64_t>(wl.resolves, buf);
2165 if(metricsStderr) stderrSs << buf << '\t';
2166 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2167 // 34. Offset reports
2168 itoa10<uint64_t>(wl.reports, buf);
2169 if(metricsStderr) stderrSs << buf << '\t';
2170 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2171
2172 // 35. Redundant seed hit
2173 itoa10<uint64_t>(total ? swmSeed.rshit : swmuSeed.rshit, buf);
2174 if(metricsStderr) stderrSs << buf << '\t';
2175 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2176
2177 // 36. # times the best (out of fw/rc) minimum # edits was 0
2178 itoa10<uint64_t>(total ? sdm.bestmin0 : sdmu.bestmin0, buf);
2179 if(metricsStderr) stderrSs << buf << '\t';
2180 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2181 // 37. # times the best (out of fw/rc) minimum # edits was 1
2182 itoa10<uint64_t>(total ? sdm.bestmin1 : sdmu.bestmin1, buf);
2183 if(metricsStderr) stderrSs << buf << '\t';
2184 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2185 // 38. # times the best (out of fw/rc) minimum # edits was 2
2186 itoa10<uint64_t>(total ? sdm.bestmin2 : sdmu.bestmin2, buf);
2187 if(metricsStderr) stderrSs << buf << '\t';
2188 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2189
2190 // 39. Exact aligner attempts
2191 itoa10<uint64_t>(total ? swmSeed.exatts : swmuSeed.exatts, buf);
2192 if(metricsStderr) stderrSs << buf << '\t';
2193 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2194 // 40. Exact aligner successes
2195 itoa10<uint64_t>(total ? swmSeed.exsucc : swmuSeed.exsucc, buf);
2196 if(metricsStderr) stderrSs << buf << '\t';
2197 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2198 // 41. Exact aligner ranges
2199 itoa10<uint64_t>(total ? swmSeed.exranges : swmuSeed.exranges, buf);
2200 if(metricsStderr) stderrSs << buf << '\t';
2201 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2202 // 42. Exact aligner rows
2203 itoa10<uint64_t>(total ? swmSeed.exrows : swmuSeed.exrows, buf);
2204 if(metricsStderr) stderrSs << buf << '\t';
2205 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2206 // 43. Exact aligner OOMs
2207 itoa10<uint64_t>(total ? swmSeed.exooms : swmuSeed.exooms, buf);
2208 if(metricsStderr) stderrSs << buf << '\t';
2209 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2210
2211 // 44. 1mm aligner attempts
2212 itoa10<uint64_t>(total ? swmSeed.mm1atts : swmuSeed.mm1atts, buf);
2213 if(metricsStderr) stderrSs << buf << '\t';
2214 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2215 // 45. 1mm aligner successes
2216 itoa10<uint64_t>(total ? swmSeed.mm1succ : swmuSeed.mm1succ, buf);
2217 if(metricsStderr) stderrSs << buf << '\t';
2218 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2219 // 46. 1mm aligner ranges
2220 itoa10<uint64_t>(total ? swmSeed.mm1ranges : swmuSeed.mm1ranges, buf);
2221 if(metricsStderr) stderrSs << buf << '\t';
2222 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2223 // 47. 1mm aligner rows
2224 itoa10<uint64_t>(total ? swmSeed.mm1rows : swmuSeed.mm1rows, buf);
2225 if(metricsStderr) stderrSs << buf << '\t';
2226 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2227 // 48. 1mm aligner OOMs
2228 itoa10<uint64_t>(total ? swmSeed.mm1ooms : swmuSeed.mm1ooms, buf);
2229 if(metricsStderr) stderrSs << buf << '\t';
2230 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2231
2232 // 49 Ungapped aligner success
2233 itoa10<uint64_t>(total ? swmSeed.ungapsucc : swmuSeed.ungapsucc, buf);
2234 if(metricsStderr) stderrSs << buf << '\t';
2235 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2236 // 50. Ungapped aligner fail
2237 itoa10<uint64_t>(total ? swmSeed.ungapfail : swmuSeed.ungapfail, buf);
2238 if(metricsStderr) stderrSs << buf << '\t';
2239 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2240 // 51. Ungapped aligner no decision
2241 itoa10<uint64_t>(total ? swmSeed.ungapnodec : swmuSeed.ungapnodec, buf);
2242 if(metricsStderr) stderrSs << buf << '\t';
2243 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2244
2245 // 52. # seed-extend DPs with < 10 gaps
2246 itoa10<uint64_t>(total ? swmSeed.sws10 : swmuSeed.sws10, buf);
2247 if(metricsStderr) stderrSs << buf << '\t';
2248 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2249 // 53. # seed-extend DPs with < 5 gaps
2250 itoa10<uint64_t>(total ? swmSeed.sws5 : swmuSeed.sws5, buf);
2251 if(metricsStderr) stderrSs << buf << '\t';
2252 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2253 // 54. # seed-extend DPs with < 3 gaps
2254 itoa10<uint64_t>(total ? swmSeed.sws3 : swmuSeed.sws3, buf);
2255 if(metricsStderr) stderrSs << buf << '\t';
2256 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2257
2258 // 55. # seed-extend DPs with < 10 gaps
2259 itoa10<uint64_t>(total ? swmMate.sws10 : swmuMate.sws10, buf);
2260 if(metricsStderr) stderrSs << buf << '\t';
2261 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2262 // 56. # seed-extend DPs with < 5 gaps
2263 itoa10<uint64_t>(total ? swmMate.sws5 : swmuMate.sws5, buf);
2264 if(metricsStderr) stderrSs << buf << '\t';
2265 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2266 // 57. # seed-extend DPs with < 3 gaps
2267 itoa10<uint64_t>(total ? swmMate.sws3 : swmuMate.sws3, buf);
2268 if(metricsStderr) stderrSs << buf << '\t';
2269 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2270
2271 const SSEMetrics& dpSse16s = total ? dpSse16Seed : dpSse16uSeed;
2272
2273 // 58. 16-bit SSE seed-extend DPs tried
2274 itoa10<uint64_t>(dpSse16s.dp, buf);
2275 if(metricsStderr) stderrSs << buf << '\t';
2276 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2277 // 59. 16-bit SSE seed-extend DPs saturated
2278 itoa10<uint64_t>(dpSse16s.dpsat, buf);
2279 if(metricsStderr) stderrSs << buf << '\t';
2280 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2281 // 60. 16-bit SSE seed-extend DPs failed
2282 itoa10<uint64_t>(dpSse16s.dpfail, buf);
2283 if(metricsStderr) stderrSs << buf << '\t';
2284 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2285 // 61. 16-bit SSE seed-extend DPs succeeded
2286 itoa10<uint64_t>(dpSse16s.dpsucc, buf);
2287 if(metricsStderr) stderrSs << buf << '\t';
2288 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2289 // 62. 16-bit SSE seed-extend DP columns completed
2290 itoa10<uint64_t>(dpSse16s.col, buf);
2291 if(metricsStderr) stderrSs << buf << '\t';
2292 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2293 // 63. 16-bit SSE seed-extend DP cells completed
2294 itoa10<uint64_t>(dpSse16s.cell, buf);
2295 if(metricsStderr) stderrSs << buf << '\t';
2296 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2297 // 64. 16-bit SSE seed-extend DP inner loop iters completed
2298 itoa10<uint64_t>(dpSse16s.inner, buf);
2299 if(metricsStderr) stderrSs << buf << '\t';
2300 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2301 // 65. 16-bit SSE seed-extend DP fixup loop iters completed
2302 itoa10<uint64_t>(dpSse16s.fixup, buf);
2303 if(metricsStderr) stderrSs << buf << '\t';
2304 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2305 // 66. 16-bit SSE seed-extend DP gather, cells with potential solutions
2306 itoa10<uint64_t>(dpSse16s.gathsol, buf);
2307 if(metricsStderr) stderrSs << buf << '\t';
2308 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2309 // 67. 16-bit SSE seed-extend DP backtrace attempts
2310 itoa10<uint64_t>(dpSse16s.bt, buf);
2311 if(metricsStderr) stderrSs << buf << '\t';
2312 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2313 // 68. 16-bit SSE seed-extend DP failed backtrace attempts
2314 itoa10<uint64_t>(dpSse16s.btfail, buf);
2315 if(metricsStderr) stderrSs << buf << '\t';
2316 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2317 // 69. 16-bit SSE seed-extend DP succesful backtrace attempts
2318 itoa10<uint64_t>(dpSse16s.btsucc, buf);
2319 if(metricsStderr) stderrSs << buf << '\t';
2320 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2321 // 70. 16-bit SSE seed-extend DP backtrace cells
2322 itoa10<uint64_t>(dpSse16s.btcell, buf);
2323 if(metricsStderr) stderrSs << buf << '\t';
2324 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2325 // 71. 16-bit SSE seed-extend DP core-diag rejections
2326 itoa10<uint64_t>(dpSse16s.corerej, buf);
2327 if(metricsStderr) stderrSs << buf << '\t';
2328 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2329 // 72. 16-bit SSE seed-extend DP N rejections
2330 itoa10<uint64_t>(dpSse16s.nrej, buf);
2331 if(metricsStderr) stderrSs << buf << '\t';
2332 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2333
2334 const SSEMetrics& dpSse8s = total ? dpSse8Seed : dpSse8uSeed;
2335
2336 // 73. 8-bit SSE seed-extend DPs tried
2337 itoa10<uint64_t>(dpSse8s.dp, buf);
2338 if(metricsStderr) stderrSs << buf << '\t';
2339 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2340 // 74. 8-bit SSE seed-extend DPs saturated
2341 itoa10<uint64_t>(dpSse8s.dpsat, buf);
2342 if(metricsStderr) stderrSs << buf << '\t';
2343 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2344 // 75. 8-bit SSE seed-extend DPs failed
2345 itoa10<uint64_t>(dpSse8s.dpfail, buf);
2346 if(metricsStderr) stderrSs << buf << '\t';
2347 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2348 // 76. 8-bit SSE seed-extend DPs succeeded
2349 itoa10<uint64_t>(dpSse8s.dpsucc, buf);
2350 if(metricsStderr) stderrSs << buf << '\t';
2351 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2352 // 77. 8-bit SSE seed-extend DP columns completed
2353 itoa10<uint64_t>(dpSse8s.col, buf);
2354 if(metricsStderr) stderrSs << buf << '\t';
2355 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2356 // 78. 8-bit SSE seed-extend DP cells completed
2357 itoa10<uint64_t>(dpSse8s.cell, buf);
2358 if(metricsStderr) stderrSs << buf << '\t';
2359 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2360 // 79. 8-bit SSE seed-extend DP inner loop iters completed
2361 itoa10<uint64_t>(dpSse8s.inner, buf);
2362 if(metricsStderr) stderrSs << buf << '\t';
2363 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2364 // 80. 8-bit SSE seed-extend DP fixup loop iters completed
2365 itoa10<uint64_t>(dpSse8s.fixup, buf);
2366 if(metricsStderr) stderrSs << buf << '\t';
2367 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2368 // 81. 16-bit SSE seed-extend DP gather, cells with potential solutions
2369 itoa10<uint64_t>(dpSse8s.gathsol, buf);
2370 if(metricsStderr) stderrSs << buf << '\t';
2371 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2372 // 82. 16-bit SSE seed-extend DP backtrace attempts
2373 itoa10<uint64_t>(dpSse8s.bt, buf);
2374 if(metricsStderr) stderrSs << buf << '\t';
2375 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2376 // 83. 16-bit SSE seed-extend DP failed backtrace attempts
2377 itoa10<uint64_t>(dpSse8s.btfail, buf);
2378 if(metricsStderr) stderrSs << buf << '\t';
2379 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2380 // 84. 16-bit SSE seed-extend DP succesful backtrace attempts
2381 itoa10<uint64_t>(dpSse8s.btsucc, buf);
2382 if(metricsStderr) stderrSs << buf << '\t';
2383 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2384 // 85. 16-bit SSE seed-extend DP backtrace cells
2385 itoa10<uint64_t>(dpSse8s.btcell, buf);
2386 if(metricsStderr) stderrSs << buf << '\t';
2387 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2388 // 86. 16-bit SSE seed-extend DP core-diag rejections
2389 itoa10<uint64_t>(dpSse8s.corerej, buf);
2390 if(metricsStderr) stderrSs << buf << '\t';
2391 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2392 // 87. 16-bit SSE seed-extend DP N rejections
2393 itoa10<uint64_t>(dpSse8s.nrej, buf);
2394 if(metricsStderr) stderrSs << buf << '\t';
2395 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2396
2397 const SSEMetrics& dpSse16m = total ? dpSse16Mate : dpSse16uMate;
2398
2399 // 88. 16-bit SSE mate-finding DPs tried
2400 itoa10<uint64_t>(dpSse16m.dp, buf);
2401 if(metricsStderr) stderrSs << buf << '\t';
2402 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2403 // 89. 16-bit SSE mate-finding DPs saturated
2404 itoa10<uint64_t>(dpSse16m.dpsat, buf);
2405 if(metricsStderr) stderrSs << buf << '\t';
2406 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2407 // 90. 16-bit SSE mate-finding DPs failed
2408 itoa10<uint64_t>(dpSse16m.dpfail, buf);
2409 if(metricsStderr) stderrSs << buf << '\t';
2410 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2411 // 91. 16-bit SSE mate-finding DPs succeeded
2412 itoa10<uint64_t>(dpSse16m.dpsucc, buf);
2413 if(metricsStderr) stderrSs << buf << '\t';
2414 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2415 // 92. 16-bit SSE mate-finding DP columns completed
2416 itoa10<uint64_t>(dpSse16m.col, buf);
2417 if(metricsStderr) stderrSs << buf << '\t';
2418 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2419 // 93. 16-bit SSE mate-finding DP cells completed
2420 itoa10<uint64_t>(dpSse16m.cell, buf);
2421 if(metricsStderr) stderrSs << buf << '\t';
2422 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2423 // 94. 16-bit SSE mate-finding DP inner loop iters completed
2424 itoa10<uint64_t>(dpSse16m.inner, buf);
2425 if(metricsStderr) stderrSs << buf << '\t';
2426 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2427 // 95. 16-bit SSE mate-finding DP fixup loop iters completed
2428 itoa10<uint64_t>(dpSse16m.fixup, buf);
2429 if(metricsStderr) stderrSs << buf << '\t';
2430 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2431 // 96. 16-bit SSE mate-finding DP gather, cells with potential solutions
2432 itoa10<uint64_t>(dpSse16m.gathsol, buf);
2433 if(metricsStderr) stderrSs << buf << '\t';
2434 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2435 // 97. 16-bit SSE mate-finding DP backtrace attempts
2436 itoa10<uint64_t>(dpSse16m.bt, buf);
2437 if(metricsStderr) stderrSs << buf << '\t';
2438 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2439 // 98. 16-bit SSE mate-finding DP failed backtrace attempts
2440 itoa10<uint64_t>(dpSse16m.btfail, buf);
2441 if(metricsStderr) stderrSs << buf << '\t';
2442 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2443 // 99. 16-bit SSE mate-finding DP succesful backtrace attempts
2444 itoa10<uint64_t>(dpSse16m.btsucc, buf);
2445 if(metricsStderr) stderrSs << buf << '\t';
2446 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2447 // 100. 16-bit SSE mate-finding DP backtrace cells
2448 itoa10<uint64_t>(dpSse16m.btcell, buf);
2449 if(metricsStderr) stderrSs << buf << '\t';
2450 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2451 // 101. 16-bit SSE mate-finding DP core-diag rejections
2452 itoa10<uint64_t>(dpSse16m.corerej, buf);
2453 if(metricsStderr) stderrSs << buf << '\t';
2454 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2455 // 102. 16-bit SSE mate-finding DP N rejections
2456 itoa10<uint64_t>(dpSse16m.nrej, buf);
2457 if(metricsStderr) stderrSs << buf << '\t';
2458 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2459
2460 const SSEMetrics& dpSse8m = total ? dpSse8Mate : dpSse8uMate;
2461
2462 // 103. 8-bit SSE mate-finding DPs tried
2463 itoa10<uint64_t>(dpSse8m.dp, buf);
2464 if(metricsStderr) stderrSs << buf << '\t';
2465 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2466 // 104. 8-bit SSE mate-finding DPs saturated
2467 itoa10<uint64_t>(dpSse8m.dpsat, buf);
2468 if(metricsStderr) stderrSs << buf << '\t';
2469 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2470 // 105. 8-bit SSE mate-finding DPs failed
2471 itoa10<uint64_t>(dpSse8m.dpfail, buf);
2472 if(metricsStderr) stderrSs << buf << '\t';
2473 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2474 // 106. 8-bit SSE mate-finding DPs succeeded
2475 itoa10<uint64_t>(dpSse8m.dpsucc, buf);
2476 if(metricsStderr) stderrSs << buf << '\t';
2477 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2478 // 107. 8-bit SSE mate-finding DP columns completed
2479 itoa10<uint64_t>(dpSse8m.col, buf);
2480 if(metricsStderr) stderrSs << buf << '\t';
2481 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2482 // 108. 8-bit SSE mate-finding DP cells completed
2483 itoa10<uint64_t>(dpSse8m.cell, buf);
2484 if(metricsStderr) stderrSs << buf << '\t';
2485 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2486 // 109. 8-bit SSE mate-finding DP inner loop iters completed
2487 itoa10<uint64_t>(dpSse8m.inner, buf);
2488 if(metricsStderr) stderrSs << buf << '\t';
2489 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2490 // 110. 8-bit SSE mate-finding DP fixup loop iters completed
2491 itoa10<uint64_t>(dpSse8m.fixup, buf);
2492 if(metricsStderr) stderrSs << buf << '\t';
2493 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2494 // 111. 16-bit SSE mate-finding DP gather, cells with potential solutions
2495 itoa10<uint64_t>(dpSse8m.gathsol, buf);
2496 if(metricsStderr) stderrSs << buf << '\t';
2497 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2498 // 112. 16-bit SSE mate-finding DP backtrace attempts
2499 itoa10<uint64_t>(dpSse8m.bt, buf);
2500 if(metricsStderr) stderrSs << buf << '\t';
2501 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2502 // 113. 16-bit SSE mate-finding DP failed backtrace attempts
2503 itoa10<uint64_t>(dpSse8m.btfail, buf);
2504 if(metricsStderr) stderrSs << buf << '\t';
2505 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2506 // 114. 16-bit SSE mate-finding DP succesful backtrace attempts
2507 itoa10<uint64_t>(dpSse8m.btsucc, buf);
2508 if(metricsStderr) stderrSs << buf << '\t';
2509 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2510 // 115. 16-bit SSE mate-finding DP backtrace cells
2511 itoa10<uint64_t>(dpSse8m.btcell, buf);
2512 if(metricsStderr) stderrSs << buf << '\t';
2513 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2514 // 116. 16-bit SSE mate-finding DP core rejections
2515 itoa10<uint64_t>(dpSse8m.corerej, buf);
2516 if(metricsStderr) stderrSs << buf << '\t';
2517 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2518 // 117. 16-bit SSE mate-finding N rejections
2519 itoa10<uint64_t>(dpSse8m.nrej, buf);
2520 if(metricsStderr) stderrSs << buf << '\t';
2521 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2522
2523 // 118. Backtrace candidates filtered due to starting cell
2524 itoa10<uint64_t>(total ? nbtfiltst : nbtfiltst_u, buf);
2525 if(metricsStderr) stderrSs << buf << '\t';
2526 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2527 // 119. Backtrace candidates filtered due to low score
2528 itoa10<uint64_t>(total ? nbtfiltsc : nbtfiltsc_u, buf);
2529 if(metricsStderr) stderrSs << buf << '\t';
2530 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2531 // 120. Backtrace candidates filtered due to domination
2532 itoa10<uint64_t>(total ? nbtfiltdo : nbtfiltdo_u, buf);
2533 if(metricsStderr) stderrSs << buf << '\t';
2534 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2535
2536 // 121. Overall memory peak
2537 itoa10<size_t>(gMemTally.peak() >> 20, buf);
2538 if(metricsStderr) stderrSs << buf << '\t';
2539 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2540 // 122. Uncategorized memory peak
2541 itoa10<size_t>(gMemTally.peak(0) >> 20, buf);
2542 if(metricsStderr) stderrSs << buf << '\t';
2543 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2544 // 123. Ebwt memory peak
2545 itoa10<size_t>(gMemTally.peak(EBWT_CAT) >> 20, buf);
2546 if(metricsStderr) stderrSs << buf << '\t';
2547 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2548 // 124. Cache memory peak
2549 itoa10<size_t>(gMemTally.peak(CA_CAT) >> 20, buf);
2550 if(metricsStderr) stderrSs << buf << '\t';
2551 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2552 // 125. Resolver memory peak
2553 itoa10<size_t>(gMemTally.peak(GW_CAT) >> 20, buf);
2554 if(metricsStderr) stderrSs << buf << '\t';
2555 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2556 // 126. Seed aligner memory peak
2557 itoa10<size_t>(gMemTally.peak(AL_CAT) >> 20, buf);
2558 if(metricsStderr) stderrSs << buf << '\t';
2559 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2560 // 127. Dynamic programming aligner memory peak
2561 itoa10<size_t>(gMemTally.peak(DP_CAT) >> 20, buf);
2562 if(metricsStderr) stderrSs << buf << '\t';
2563 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2564 // 128. Miscellaneous memory peak
2565 itoa10<size_t>(gMemTally.peak(MISC_CAT) >> 20, buf);
2566 if(metricsStderr) stderrSs << buf << '\t';
2567 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2568 // 129. Debug memory peak
2569 itoa10<size_t>(gMemTally.peak(DEBUG_CAT) >> 20, buf);
2570 if(metricsStderr) stderrSs << buf << '\t';
2571 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2572
2573 // 130
2574 itoa10<size_t>(him.localatts, buf);
2575 if(metricsStderr) stderrSs << buf << '\t';
2576 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2577 // 131
2578 itoa10<size_t>(him.anchoratts, buf);
2579 if(metricsStderr) stderrSs << buf << '\t';
2580 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2581 // 132
2582 itoa10<size_t>(him.localindexatts, buf);
2583 if(metricsStderr) stderrSs << buf << '\t';
2584 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2585 // 133
2586 itoa10<size_t>(him.localextatts, buf);
2587 if(metricsStderr) stderrSs << buf << '\t';
2588 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2589 // 134
2590 itoa10<size_t>(him.localsearchrecur, buf);
2591 if(metricsStderr) stderrSs << buf << '\t';
2592 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2593 // 135
2594 itoa10<size_t>(him.globalgenomecoords, buf);
2595 if(metricsStderr) stderrSs << buf << '\t';
2596 if(o != NULL) { o->writeChars(buf); o->write('\t'); }
2597 // 136
2598 itoa10<size_t>(him.localgenomecoords, buf);
2599 if(metricsStderr) stderrSs << buf;
2600 if(o != NULL) { o->writeChars(buf); }
2601
2602 if(o != NULL) { o->write('\n'); }
2603 if(metricsStderr) cerr << stderrSs.str().c_str() << endl;
2604 if(!total) mergeIncrementals();
2605 }
2606
mergeIncrementalsPerfMetrics2607 void mergeIncrementals() {
2608 olm.merge(olmu, false);
2609 sdm.merge(sdmu, false);
2610 wlm.merge(wlmu, false);
2611 swmSeed.merge(swmuSeed, false);
2612 swmMate.merge(swmuMate, false);
2613 dpSse8Seed.merge(dpSse8uSeed, false);
2614 dpSse8Mate.merge(dpSse8uMate, false);
2615 dpSse16Seed.merge(dpSse16uSeed, false);
2616 dpSse16Mate.merge(dpSse16uMate, false);
2617 nbtfiltst_u += nbtfiltst;
2618 nbtfiltsc_u += nbtfiltsc;
2619 nbtfiltdo_u += nbtfiltdo;
2620
2621 olmu.reset();
2622 sdmu.reset();
2623 wlmu.reset();
2624 swmuSeed.reset();
2625 swmuMate.reset();
2626 rpmu.reset();
2627 dpSse8uSeed.reset();
2628 dpSse8uMate.reset();
2629 dpSse16uSeed.reset();
2630 dpSse16uMate.reset();
2631 nbtfiltst_u = 0;
2632 nbtfiltsc_u = 0;
2633 nbtfiltdo_u = 0;
2634 }
2635
2636 // Total over the whole job
2637 OuterLoopMetrics olm; // overall metrics
2638 SeedSearchMetrics sdm; // metrics related to seed alignment
2639 WalkMetrics wlm; // metrics related to walking left (i.e. resolving reference offsets)
2640 SwMetrics swmSeed; // metrics related to DP seed-extend alignment
2641 SwMetrics swmMate; // metrics related to DP mate-finding alignment
2642 ReportingMetrics rpm; // metrics related to reporting
2643 SSEMetrics dpSse8Seed; // 8-bit SSE seed extensions
2644 SSEMetrics dpSse8Mate; // 8-bit SSE mate finds
2645 SSEMetrics dpSse16Seed; // 16-bit SSE seed extensions
2646 SSEMetrics dpSse16Mate; // 16-bit SSE mate finds
2647 uint64_t nbtfiltst;
2648 uint64_t nbtfiltsc;
2649 uint64_t nbtfiltdo;
2650
2651 // Just since the last update
2652 OuterLoopMetrics olmu; // overall metrics
2653 SeedSearchMetrics sdmu; // metrics related to seed alignment
2654 WalkMetrics wlmu; // metrics related to walking left (i.e. resolving reference offsets)
2655 SwMetrics swmuSeed; // metrics related to DP seed-extend alignment
2656 SwMetrics swmuMate; // metrics related to DP mate-finding alignment
2657 ReportingMetrics rpmu; // metrics related to reporting
2658 SSEMetrics dpSse8uSeed; // 8-bit SSE seed extensions
2659 SSEMetrics dpSse8uMate; // 8-bit SSE mate finds
2660 SSEMetrics dpSse16uSeed; // 16-bit SSE seed extensions
2661 SSEMetrics dpSse16uMate; // 16-bit SSE mate finds
2662 uint64_t nbtfiltst_u;
2663 uint64_t nbtfiltsc_u;
2664 uint64_t nbtfiltdo_u;
2665
2666 //
2667 HIMetrics him;
2668
2669 MUTEX_T mutex_m; // lock for when one ob
2670 bool first; // yet to print first line?
2671 time_t lastElapsed; // used in reportInterval to measure time since last call
2672 };
2673
2674 static PerfMetrics metrics;
2675
2676 // Cyclic rotations
2677 #define ROTL(n, x) (((x) << (n)) | ((x) >> (32-n)))
2678 #define ROTR(n, x) (((x) >> (n)) | ((x) << (32-n)))
2679
printMmsSkipMsg(const PatternSourcePerThread & ps,bool paired,bool mate1,int seedmms)2680 static inline void printMmsSkipMsg(
2681 const PatternSourcePerThread& ps,
2682 bool paired,
2683 bool mate1,
2684 int seedmms)
2685 {
2686 ostringstream os;
2687 if(paired) {
2688 os << "Warning: skipping mate #" << (mate1 ? '1' : '2')
2689 << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2690 << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
2691 << ") <= # seed mismatches (" << seedmms << ")" << endl;
2692 } else {
2693 os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2694 << "' because length (" << (mate1 ? ps.bufa().patFw.length() : ps.bufb().patFw.length())
2695 << ") <= # seed mismatches (" << seedmms << ")" << endl;
2696 }
2697 cerr << os.str().c_str();
2698 }
2699
printLenSkipMsg(const PatternSourcePerThread & ps,bool paired,bool mate1)2700 static inline void printLenSkipMsg(
2701 const PatternSourcePerThread& ps,
2702 bool paired,
2703 bool mate1)
2704 {
2705 ostringstream os;
2706 if(paired) {
2707 os << "Warning: skipping mate #" << (mate1 ? '1' : '2')
2708 << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2709 << "' because it was < 2 characters long" << endl;
2710 } else {
2711 os << "Warning: skipping read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2712 << "' because it was < 2 characters long" << endl;
2713 }
2714 cerr << os.str().c_str();
2715 }
2716
printLocalScoreMsg(const PatternSourcePerThread & ps,bool paired,bool mate1)2717 static inline void printLocalScoreMsg(
2718 const PatternSourcePerThread& ps,
2719 bool paired,
2720 bool mate1)
2721 {
2722 ostringstream os;
2723 if(paired) {
2724 os << "Warning: minimum score function gave negative number in "
2725 << "--local mode for mate #" << (mate1 ? '1' : '2')
2726 << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2727 << "; setting to 0 instead" << endl;
2728 } else {
2729 os << "Warning: minimum score function gave negative number in "
2730 << "--local mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2731 << "; setting to 0 instead" << endl;
2732 }
2733 cerr << os.str().c_str();
2734 }
2735
printEEScoreMsg(const PatternSourcePerThread & ps,bool paired,bool mate1)2736 static inline void printEEScoreMsg(
2737 const PatternSourcePerThread& ps,
2738 bool paired,
2739 bool mate1)
2740 {
2741 ostringstream os;
2742 if(paired) {
2743 os << "Warning: minimum score function gave positive number in "
2744 << "--end-to-end mode for mate #" << (mate1 ? '1' : '2')
2745 << " of read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2746 << "; setting to 0 instead" << endl;
2747 } else {
2748 os << "Warning: minimum score function gave positive number in "
2749 << "--end-to-end mode for read '" << (mate1 ? ps.bufa().name : ps.bufb().name)
2750 << "; setting to 0 instead" << endl;
2751 }
2752 cerr << os.str().c_str();
2753 }
2754
2755
2756 #define MERGE_METRICS(met, sync) { \
2757 msink.mergeMetrics(rpm); \
2758 met.merge( \
2759 &olm, \
2760 &sdm, \
2761 &wlm, \
2762 &swmSeed, \
2763 &swmMate, \
2764 &rpm, \
2765 &sseU8ExtendMet, \
2766 &sseU8MateMet, \
2767 &sseI16ExtendMet, \
2768 &sseI16MateMet, \
2769 nbtfiltst, \
2770 nbtfiltsc, \
2771 nbtfiltdo, \
2772 &him, \
2773 sync); \
2774 olm.reset(); \
2775 sdm.reset(); \
2776 wlm.reset(); \
2777 swmSeed.reset(); \
2778 swmMate.reset(); \
2779 rpm.reset(); \
2780 sseU8ExtendMet.reset(); \
2781 sseU8MateMet.reset(); \
2782 sseI16ExtendMet.reset(); \
2783 sseI16MateMet.reset(); \
2784 him.reset(); \
2785 }
2786
2787 #define MERGE_SW(x) { \
2788 x.merge( \
2789 sseU8ExtendMet, \
2790 sseU8MateMet, \
2791 sseI16ExtendMet, \
2792 sseI16MateMet, \
2793 nbtfiltst, \
2794 nbtfiltsc, \
2795 nbtfiltdo); \
2796 x.resetCounters(); \
2797 }
2798
2799 /**
2800 * Called once per thread. Sets up per-thread pointers to the shared global
2801 * data structures, creates per-thread structures, then enters the alignment
2802 * loop. The general flow of the alignment loop is:
2803 *
2804 * - If it's been a while and we're the master thread, report some alignment
2805 * metrics
2806 * - Get the next read/pair
2807 * - Check if this read/pair is identical to the previous
2808 * + If identical, check whether we can skip any or all alignment stages. If
2809 * we can skip all stages, report the result immediately and move to next
2810 * read/pair
2811 * + If not identical, continue
2812 * -
2813 */
multiseedSearchWorker_hisat_bp(void * vp)2814 static void multiseedSearchWorker_hisat_bp(void *vp) {
2815 int tid = *((int*)vp);
2816 assert(multiseed_ebwtFw != NULL);
2817 assert(multiseedMms == 0 || multiseed_ebwtBw != NULL);
2818 PairedPatternSource& patsrc = *multiseed_patsrc;
2819 const HierEbwt<index_t>& ebwtFw = *multiseed_ebwtFw;
2820 const HierEbwt<index_t>& ebwtBw = *multiseed_ebwtBw;
2821 const Scoring& sc = *multiseed_sc;
2822 const BitPairReference& ref = *multiseed_refs;
2823 AlignmentCache<index_t>& scShared = *multiseed_ca;
2824 AlnSink<index_t>& msink = *multiseed_msink;
2825 OutFileBuf* metricsOfb = multiseed_metricsOfb;
2826
2827 // Sinks: these are so that we can print tables encoding counts for
2828 // events of interest on a per-read, per-seed, per-join, or per-SW
2829 // level. These in turn can be used to diagnose performance
2830 // problems, or generally characterize performance.
2831
2832 //const BitPairReference& refs = *multiseed_refs;
2833 auto_ptr<PatternSourcePerThreadFactory> patsrcFact(createPatsrcFactory(patsrc, tid));
2834 auto_ptr<PatternSourcePerThread> ps(patsrcFact->create());
2835
2836 // Thread-local cache for seed alignments
2837 PtrWrap<AlignmentCache<index_t> > scLocal;
2838 if(!msNoCache) {
2839 scLocal.init(new AlignmentCache<index_t>(seedCacheLocalMB * 1024 * 1024, false));
2840 }
2841 AlignmentCache<index_t> scCurrent(seedCacheCurrentMB * 1024 * 1024, false);
2842 // Thread-local cache for current seed alignments
2843
2844 // Interfaces for alignment and seed caches
2845 AlignmentCacheIface<index_t> ca(
2846 &scCurrent,
2847 scLocal.get(),
2848 msNoCache ? NULL : &scShared);
2849
2850 // Instantiate an object for holding reporting-related parameters.
2851 ReportingParams rp(
2852 (allHits ? std::numeric_limits<THitInt>::max() : khits), // -k
2853 mhits, // -m/-M
2854 0, // penalty gap (not used now)
2855 msample, // true -> -M was specified, otherwise assume -m
2856 gReportDiscordant, // report discordang paired-end alignments?
2857 gReportMixed); // report unpaired alignments for paired reads?
2858
2859 // Instantiate a mapping quality calculator
2860 auto_ptr<Mapq> bmapq(new_mapq(mapqv, scoreMin, sc));
2861
2862 // Make a per-thread wrapper for the global MHitSink object.
2863 AlnSinkWrap<index_t> msinkwrap(
2864 msink, // global sink
2865 rp, // reporting parameters
2866 *bmapq.get(), // MAPQ calculator
2867 (size_t)tid); // thread id
2868
2869 BP_Aligner<index_t, local_index_t> bp_Aligner(
2870 ebwtFw,
2871 multiseed_refnames,
2872 &multiseed_mutex,
2873 thread_rids_mindist,
2874 no_spliced_alignment);
2875 SwAligner sw;
2876 OuterLoopMetrics olm;
2877 SeedSearchMetrics sdm;
2878 WalkMetrics wlm;
2879 SwMetrics swmSeed, swmMate;
2880 ReportingMetrics rpm;
2881 RandomSource rnd, rndArb;
2882 SSEMetrics sseU8ExtendMet;
2883 SSEMetrics sseU8MateMet;
2884 SSEMetrics sseI16ExtendMet;
2885 SSEMetrics sseI16MateMet;
2886 DescentMetrics descm;
2887 uint64_t nbtfiltst = 0; // TODO: find a new home for these
2888 uint64_t nbtfiltsc = 0; // TODO: find a new home for these
2889 uint64_t nbtfiltdo = 0; // TODO: find a new home for these
2890 HIMetrics him;
2891
2892 ASSERT_ONLY(BTDnaString tmp);
2893
2894 int pepolFlag;
2895 if(gMate1fw && gMate2fw) {
2896 pepolFlag = PE_POLICY_FF;
2897 } else if(gMate1fw && !gMate2fw) {
2898 pepolFlag = PE_POLICY_FR;
2899 } else if(!gMate1fw && gMate2fw) {
2900 pepolFlag = PE_POLICY_RF;
2901 } else {
2902 pepolFlag = PE_POLICY_RR;
2903 }
2904 assert_geq(gMaxInsert, gMinInsert);
2905 assert_geq(gMinInsert, 0);
2906 PairedEndPolicy pepol(
2907 pepolFlag,
2908 gMaxInsert,
2909 gMinInsert,
2910 localAlign,
2911 gFlippedMatesOK,
2912 gDovetailMatesOK,
2913 gContainMatesOK,
2914 gOlapMatesOK,
2915 gExpandToFrag);
2916
2917 PerfMetrics metricsPt; // per-thread metrics object; for read-level metrics
2918 BTString nametmp;
2919
2920 PerReadMetrics prm;
2921
2922 // Used by thread with threadid == 1 to measure time elapsed
2923 time_t iTime = time(0);
2924
2925 // Keep track of whether last search was exhaustive for mates 1 and 2
2926 bool exhaustive[2] = { false, false };
2927 // Keep track of whether mates 1/2 were filtered out last time through
2928 bool filt[2] = { true, true };
2929 // Keep track of whether mates 1/2 were filtered out due Ns last time
2930 bool nfilt[2] = { true, true };
2931 // Keep track of whether mates 1/2 were filtered out due to not having
2932 // enough characters to rise about the score threshold.
2933 bool scfilt[2] = { true, true };
2934 // Keep track of whether mates 1/2 were filtered out due to not having
2935 // more characters than the number of mismatches permitted in a seed.
2936 bool lenfilt[2] = { true, true };
2937 // Keep track of whether mates 1/2 were filtered out by upstream qc
2938 bool qcfilt[2] = { true, true };
2939
2940 rndArb.init((uint32_t)time(0));
2941 int mergei = 0;
2942 int mergeival = 16;
2943 while(true) {
2944 bool success = false, done = false, paired = false;
2945 ps->nextReadPair(success, done, paired, outType != OUTPUT_SAM);
2946 if(!success && done) {
2947 break;
2948 } else if(!success) {
2949 continue;
2950 }
2951 TReadId rdid = ps->rdid();
2952
2953 if(nthreads > 1 && useTempSpliceSite) {
2954 while(true) {
2955 uint64_t min_rdid = 0;
2956 {
2957 // ThreadSafe t(&thread_rids_mutex, nthreads > 1);
2958 assert_gt(thread_rids.size(), 0);
2959 min_rdid = thread_rids[0];
2960 for(size_t i = 1; i < thread_rids.size(); i++) {
2961 if(thread_rids[i] < min_rdid) {
2962 min_rdid = thread_rids[i];
2963 }
2964 }
2965 }
2966
2967 if(min_rdid + thread_rids_mindist < rdid) {
2968 #if defined(_TTHREAD_WIN32_)
2969 Sleep(0);
2970 #elif defined(_TTHREAD_POSIX_)
2971 sched_yield();
2972 #endif
2973 } else break;
2974 }
2975 }
2976
2977 bool sample = true;
2978 if(arbitraryRandom) {
2979 ps->bufa().seed = rndArb.nextU32();
2980 ps->bufb().seed = rndArb.nextU32();
2981 }
2982 if(sampleFrac < 1.0f) {
2983 rnd.init(ROTL(ps->bufa().seed, 2));
2984 sample = rnd.nextFloat() < sampleFrac;
2985 }
2986 if(rdid >= skipReads && rdid < qUpto && sample) {
2987 // Align this read/pair
2988 bool retry = true;
2989 //
2990 // Check if there is metrics reporting for us to do.
2991 //
2992 if(metricsIval > 0 &&
2993 (metricsOfb != NULL || metricsStderr) &&
2994 !metricsPerRead &&
2995 ++mergei == mergeival)
2996 {
2997 // Do a periodic merge. Update global metrics, in a
2998 // synchronized manner if needed.
2999 MERGE_METRICS(metrics, nthreads > 1);
3000 mergei = 0;
3001 // Check if a progress message should be printed
3002 if(tid == 0) {
3003 // Only thread 1 prints progress messages
3004 time_t curTime = time(0);
3005 if(curTime - iTime >= metricsIval) {
3006 metrics.reportInterval(metricsOfb, metricsStderr, false, true, NULL);
3007 iTime = curTime;
3008 }
3009 }
3010 }
3011 prm.reset(); // per-read metrics
3012 prm.doFmString = false;
3013 if(sam_print_xt) {
3014 gettimeofday(&prm.tv_beg, &prm.tz_beg);
3015 }
3016 // Try to align this read
3017 while(retry) {
3018 retry = false;
3019 assert_eq(ps->bufa().color, false);
3020 ca.nextRead(); // clear the cache
3021 olm.reads++;
3022 assert(!ca.aligning());
3023 bool pair = paired;
3024 const size_t rdlen1 = ps->bufa().length();
3025 const size_t rdlen2 = pair ? ps->bufb().length() : 0;
3026 olm.bases += (rdlen1 + rdlen2);
3027 #if 0
3028 msinkwrap.nextRead(
3029 &ps->bufa(),
3030 pair ? &ps->bufb() : NULL,
3031 rdid,
3032 sc.qualitiesMatter());
3033 assert(msinkwrap.inited());
3034 #endif
3035 size_t rdlens[2] = { rdlen1, rdlen2 };
3036 // Calculate the minimum valid score threshold for the read
3037 TAlScore minsc[2], maxpen[2];
3038 maxpen[0] = maxpen[1] = 0;
3039 minsc[0] = minsc[1] = std::numeric_limits<TAlScore>::max();
3040 if(bwaSwLike) {
3041 // From BWA-SW manual: "Given an l-long query, the
3042 // threshold for a hit to be retained is
3043 // a*max{T,c*log(l)}." We try to recreate that here.
3044 float a = (float)sc.match(30);
3045 float T = bwaSwLikeT, c = bwaSwLikeC;
3046 minsc[0] = (TAlScore)max<float>(a*T, a*c*log(rdlens[0]));
3047 if(paired) {
3048 minsc[1] = (TAlScore)max<float>(a*T, a*c*log(rdlens[1]));
3049 }
3050 } else {
3051 minsc[0] = scoreMin.f<TAlScore>(rdlens[0]);
3052 if(paired) minsc[1] = scoreMin.f<TAlScore>(rdlens[1]);
3053 if(localAlign) {
3054 if(minsc[0] < 0) {
3055 if(!gQuiet) printLocalScoreMsg(*ps, paired, true);
3056 minsc[0] = 0;
3057 }
3058 if(paired && minsc[1] < 0) {
3059 if(!gQuiet) printLocalScoreMsg(*ps, paired, false);
3060 minsc[1] = 0;
3061 }
3062 } else {
3063 if(minsc[0] > 0) {
3064 if(!gQuiet) printEEScoreMsg(*ps, paired, true);
3065 minsc[0] = 0;
3066 }
3067 if(paired && minsc[1] > 0) {
3068 if(!gQuiet) printEEScoreMsg(*ps, paired, false);
3069 minsc[1] = 0;
3070 }
3071 }
3072 }
3073
3074 // N filter; does the read have too many Ns?
3075 size_t readns[2] = {0, 0};
3076 sc.nFilterPair(
3077 &ps->bufa().patFw,
3078 pair ? &ps->bufb().patFw : NULL,
3079 readns[0],
3080 readns[1],
3081 nfilt[0],
3082 nfilt[1]);
3083 // Score filter; does the read enough character to rise above
3084 // the score threshold?
3085 scfilt[0] = sc.scoreFilter(minsc[0], rdlens[0]);
3086 scfilt[1] = sc.scoreFilter(minsc[1], rdlens[1]);
3087 lenfilt[0] = lenfilt[1] = true;
3088 if(rdlens[0] <= (size_t)multiseedMms || rdlens[0] < 2) {
3089 if(!gQuiet) printMmsSkipMsg(*ps, paired, true, multiseedMms);
3090 lenfilt[0] = false;
3091 }
3092 if((rdlens[1] <= (size_t)multiseedMms || rdlens[1] < 2) && paired) {
3093 if(!gQuiet) printMmsSkipMsg(*ps, paired, false, multiseedMms);
3094 lenfilt[1] = false;
3095 }
3096 if(rdlens[0] < 2) {
3097 if(!gQuiet) printLenSkipMsg(*ps, paired, true);
3098 lenfilt[0] = false;
3099 }
3100 if(rdlens[1] < 2 && paired) {
3101 if(!gQuiet) printLenSkipMsg(*ps, paired, false);
3102 lenfilt[1] = false;
3103 }
3104 qcfilt[0] = qcfilt[1] = true;
3105 if(qcFilter) {
3106 qcfilt[0] = (ps->bufa().filter != '0');
3107 qcfilt[1] = (ps->bufb().filter != '0');
3108 }
3109 filt[0] = (nfilt[0] && scfilt[0] && lenfilt[0] && qcfilt[0]);
3110 filt[1] = (nfilt[1] && scfilt[1] && lenfilt[1] && qcfilt[1]);
3111 prm.nFilt += (filt[0] ? 0 : 1) + (filt[1] ? 0 : 1);
3112 Read* rds[2] = { &ps->bufa(), &ps->bufb() };
3113 // For each mate...
3114 assert(msinkwrap.empty());
3115 //size_t minedfw[2] = { 0, 0 };
3116 //size_t minedrc[2] = { 0, 0 };
3117 // Calcualte nofw / no rc
3118 bool nofw[2] = { false, false };
3119 bool norc[2] = { false, false };
3120 nofw[0] = paired ? (gMate1fw ? gNofw : gNorc) : gNofw;
3121 norc[0] = paired ? (gMate1fw ? gNorc : gNofw) : gNorc;
3122 nofw[1] = paired ? (gMate2fw ? gNofw : gNorc) : gNofw;
3123 norc[1] = paired ? (gMate2fw ? gNorc : gNofw) : gNorc;
3124 // Calculate nceil
3125 int nceil[2] = { 0, 0 };
3126 nceil[0] = nCeil.f<int>((double)rdlens[0]);
3127 nceil[0] = min(nceil[0], (int)rdlens[0]);
3128 if(paired) {
3129 nceil[1] = nCeil.f<int>((double)rdlens[1]);
3130 nceil[1] = min(nceil[1], (int)rdlens[1]);
3131 }
3132 exhaustive[0] = exhaustive[1] = false;
3133 //size_t matemap[2] = { 0, 1 };
3134 bool pairPostFilt = filt[0] && filt[1];
3135 if(pairPostFilt) {
3136 rnd.init(ps->bufa().seed ^ ps->bufb().seed);
3137 } else {
3138 rnd.init(ps->bufa().seed);
3139 }
3140 // Calculate interval length for both mates
3141 int interval[2] = { 0, 0 };
3142 for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
3143 interval[mate] = msIval.f<int>((double)rdlens[mate]);
3144 if(filt[0] && filt[1]) {
3145 // Boost interval length by 20% for paired-end reads
3146 interval[mate] = (int)(interval[mate] * 1.2 + 0.5);
3147 }
3148 interval[mate] = max(interval[mate], 1);
3149 }
3150 // Calculate streak length
3151 size_t streak[2] = { maxDpStreak, maxDpStreak };
3152 size_t mtStreak[2] = { maxMateStreak, maxMateStreak };
3153 size_t mxDp[2] = { maxDp, maxDp };
3154 size_t mxUg[2] = { maxUg, maxUg };
3155 size_t mxIter[2] = { maxIters, maxIters };
3156 if(allHits) {
3157 streak[0] = streak[1] = std::numeric_limits<size_t>::max();
3158 mtStreak[0] = mtStreak[1] = std::numeric_limits<size_t>::max();
3159 mxDp[0] = mxDp[1] = std::numeric_limits<size_t>::max();
3160 mxUg[0] = mxUg[1] = std::numeric_limits<size_t>::max();
3161 mxIter[0] = mxIter[1] = std::numeric_limits<size_t>::max();
3162 } else if(khits > 1) {
3163 for(size_t mate = 0; mate < 2; mate++) {
3164 streak[mate] += (khits-1) * maxStreakIncr;
3165 mtStreak[mate] += (khits-1) * maxStreakIncr;
3166 mxDp[mate] += (khits-1) * maxItersIncr;
3167 mxUg[mate] += (khits-1) * maxItersIncr;
3168 mxIter[mate] += (khits-1) * maxItersIncr;
3169 }
3170 }
3171 if(filt[0] && filt[1]) {
3172 streak[0] = (size_t)ceil((double)streak[0] / 2.0);
3173 streak[1] = (size_t)ceil((double)streak[1] / 2.0);
3174 assert_gt(streak[1], 0);
3175 }
3176 assert_gt(streak[0], 0);
3177 // Calculate # seed rounds for each mate
3178 size_t nrounds[2] = { nSeedRounds, nSeedRounds };
3179 if(filt[0] && filt[1]) {
3180 nrounds[0] = (size_t)ceil((double)nrounds[0] / 2.0);
3181 nrounds[1] = (size_t)ceil((double)nrounds[1] / 2.0);
3182 assert_gt(nrounds[1], 0);
3183 }
3184 assert_gt(nrounds[0], 0);
3185 // Increment counters according to what got filtered
3186 for(size_t mate = 0; mate < (pair ? 2:1); mate++) {
3187 if(!filt[mate]) {
3188 // Mate was rejected by N filter
3189 olm.freads++; // reads filtered out
3190 olm.fbases += rdlens[mate]; // bases filtered out
3191 } else {
3192 //shs[mate].clear();
3193 //shs[mate].nextRead(mate == 0 ? ps->bufa() : ps->bufb());
3194 //assert(shs[mate].empty());
3195 olm.ureads++; // reads passing filter
3196 olm.ubases += rdlens[mate]; // bases passing filter
3197 }
3198 }
3199 //size_t eePeEeltLimit = std::numeric_limits<size_t>::max();
3200 // Whether we're done with mate1 / mate2
3201 bool done[2] = { !filt[0], !filt[1] };
3202 // size_t nelt[2] = {0, 0};
3203 if(filt[0] && filt[1]) {
3204 bp_Aligner.initReads(rds, nofw, norc, minsc, maxpen);
3205 } else if(filt[0] || filt[1]) {
3206 bp_Aligner.initRead(rds[0], nofw[0], norc[0], minsc[0], maxpen[0], filt[1]);
3207 }
3208 if(filt[0] || filt[1]) {
3209 int ret = bp_Aligner.go(sc, ebwtFw, ebwtBw, ref, sw, *ssdb, wlm, prm, swmSeed, him, rnd, msinkwrap);
3210 MERGE_SW(sw);
3211 // daehwan
3212 size_t mate = 0;
3213
3214 assert_gt(ret, 0);
3215 // Clear out the exact hits so that we don't try to
3216 // extend them again later!
3217 if(ret == EXTEND_EXHAUSTED_CANDIDATES) {
3218 // Not done yet
3219 } else if(ret == EXTEND_POLICY_FULFILLED) {
3220 // Policy is satisfied for this mate at least
3221 if(msinkwrap.state().doneWithMate(mate == 0)) {
3222 done[mate] = true;
3223 }
3224 if(msinkwrap.state().doneWithMate(mate == 1)) {
3225 done[mate^1] = true;
3226 }
3227 } else if(ret == EXTEND_PERFECT_SCORE) {
3228 // We exhausted this mode at least
3229 done[mate] = true;
3230 } else if(ret == EXTEND_EXCEEDED_HARD_LIMIT) {
3231 // We exceeded a per-read limit
3232 done[mate] = true;
3233 } else if(ret == EXTEND_EXCEEDED_SOFT_LIMIT) {
3234 // Not done yet
3235 } else {
3236 //
3237 cerr << "Bad return value: " << ret << endl;
3238 throw 1;
3239 }
3240 if(!done[mate]) {
3241 TAlScore perfectScore = sc.perfectScore(rdlens[mate]);
3242 if(!done[mate] && minsc[mate] == perfectScore) {
3243 done[mate] = true;
3244 }
3245 }
3246 }
3247
3248 for(size_t i = 0; i < 2; i++) {
3249 assert_leq(prm.nExIters, mxIter[i]);
3250 assert_leq(prm.nExDps, mxDp[i]);
3251 assert_leq(prm.nMateDps, mxDp[i]);
3252 assert_leq(prm.nExUgs, mxUg[i]);
3253 assert_leq(prm.nMateUgs, mxUg[i]);
3254 assert_leq(prm.nDpFail, streak[i]);
3255 assert_leq(prm.nUgFail, streak[i]);
3256 assert_leq(prm.nEeFail, streak[i]);
3257 }
3258
3259 #if 0
3260 // Commit and report paired-end/unpaired alignments
3261 msinkwrap.finishRead(
3262 NULL,
3263 NULL,
3264 exhaustive[0], // exhausted seed hits for mate 1?
3265 exhaustive[1], // exhausted seed hits for mate 2?
3266 nfilt[0],
3267 nfilt[1],
3268 scfilt[0],
3269 scfilt[1],
3270 lenfilt[0],
3271 lenfilt[1],
3272 qcfilt[0],
3273 qcfilt[1],
3274 sortByScore, // prioritize by alignment score
3275 rnd, // pseudo-random generator
3276 rpm, // reporting metrics
3277 prm, // per-read metrics
3278 sc, // scoring scheme
3279 !seedSumm, // suppress seed summaries?
3280 seedSumm); // suppress alignments?
3281 assert(!retry || msinkwrap.empty());
3282 #endif
3283
3284 if(nthreads > 1 && useTempSpliceSite) {
3285 // ThreadSafe t(&thread_rids_mutex, nthreads > 1);
3286 assert_gt(tid, 0);
3287 assert_leq(tid, thread_rids.size());
3288 assert(thread_rids[tid - 1] == 0 || rdid > thread_rids[tid - 1]);
3289 thread_rids[tid - 1] = rdid;
3290 }
3291 } // while(retry)
3292 } // if(rdid >= skipReads && rdid < qUpto)
3293 else if(rdid >= qUpto) {
3294 break;
3295 }
3296 if(metricsPerRead) {
3297 MERGE_METRICS(metricsPt, nthreads > 1);
3298 nametmp = ps->bufa().name;
3299 metricsPt.reportInterval(
3300 metricsOfb, metricsStderr, true, true, &nametmp);
3301 metricsPt.reset();
3302 }
3303 } // while(true)
3304
3305 // One last metrics merge
3306 MERGE_METRICS(metrics, nthreads > 1);
3307
3308 return;
3309 }
3310
3311 /**
3312 * Called once per alignment job. Sets up global pointers to the
3313 * shared global data structures, creates per-thread structures, then
3314 * enters the search loop.
3315 */
multiseedSearch(Scoring & sc,PairedPatternSource & patsrc,AlnSink<index_t> & msink,HierEbwt<index_t> & ebwtFw,HierEbwt<index_t> & ebwtBw,BitPairReference * refs,const EList<string> & refnames,OutFileBuf * metricsOfb)3316 static void multiseedSearch(
3317 Scoring& sc,
3318 PairedPatternSource& patsrc, // pattern source
3319 AlnSink<index_t>& msink, // hit sink
3320 HierEbwt<index_t>& ebwtFw, // index of original text
3321 HierEbwt<index_t>& ebwtBw, // index of mirror text
3322 BitPairReference* refs,
3323 const EList<string>& refnames,
3324 OutFileBuf *metricsOfb)
3325 {
3326 multiseed_patsrc = &patsrc;
3327 multiseed_msink = &msink;
3328 multiseed_ebwtFw = &ebwtFw;
3329 multiseed_ebwtBw = &ebwtBw;
3330 multiseed_sc = ≻
3331 multiseed_refnames = refnames;
3332 multiseed_metricsOfb = metricsOfb;
3333 multiseed_refs = refs;
3334 AutoArray<tthread::thread*> threads(nthreads);
3335 AutoArray<int> tids(nthreads);
3336 {
3337 // Load the other half of the index into memory
3338 assert(!ebwtFw.isInMemory());
3339 Timer _t(cerr, "Time loading forward index: ", timing);
3340 ebwtFw.loadIntoMemory(
3341 0, // colorspace?
3342 -1, // not the reverse index
3343 true, // load SA samp? (yes, need forward index's SA samp)
3344 true, // load ftab (in forward index)
3345 true, // load rstarts (in forward index)
3346 !noRefNames, // load names?
3347 startVerbose);
3348 }
3349 #if 0
3350 if(multiseedMms > 0 || do1mmUpFront) {
3351 // Load the other half of the index into memory
3352 assert(!ebwtBw.isInMemory());
3353 Timer _t(cerr, "Time loading mirror index: ", timing);
3354 ebwtBw.loadIntoMemory(
3355 0, // colorspace?
3356 // It's bidirectional search, so we need the reverse to be
3357 // constructed as the reverse of the concatenated strings.
3358 1,
3359 true, // load SA samp in reverse index
3360 true, // yes, need ftab in reverse index
3361 true, // load rstarts in reverse index
3362 !noRefNames, // load names?
3363 startVerbose);
3364 }
3365 #endif
3366 // Start the metrics thread
3367 {
3368 Timer _t(cerr, "Multiseed full-index search: ", timing);
3369
3370 thread_rids.resize(nthreads);
3371 thread_rids.fill(0);
3372 thread_rids_mindist = (nthreads == 1 || !useTempSpliceSite ? 0 : 1000 * nthreads);
3373
3374 for(int i = 0; i < nthreads; i++) {
3375 // Thread IDs start at 1
3376 tids[i] = i+1;
3377 threads[i] = new tthread::thread(multiseedSearchWorker_hisat_bp, (void*)&tids[i]);
3378 }
3379
3380 for (int i = 0; i < nthreads; i++)
3381 threads[i]->join();
3382
3383 }
3384 if(!metricsPerRead && (metricsOfb != NULL || metricsStderr)) {
3385 metrics.reportInterval(metricsOfb, metricsStderr, true, false, NULL);
3386 }
3387 }
3388
3389 static string argstr;
3390
3391 extern void initializeCntLut();
3392 extern void initializeCntBit();
3393
3394 template<typename TStr>
driver(const char * type,const string & bt2indexBase,const string & outfile)3395 static void driver(
3396 const char * type,
3397 const string& bt2indexBase,
3398 const string& outfile)
3399 {
3400 if(gVerbose || startVerbose) {
3401 cerr << "Entered driver(): "; logTime(cerr, true);
3402 }
3403
3404 initializeCntLut();
3405 initializeCntBit();
3406
3407 // Vector of the reference sequences; used for sanity-checking
3408 EList<SString<char> > names, os;
3409 EList<size_t> nameLens, seqLens;
3410 // Read reference sequences from the command-line or from a FASTA file
3411 if(!origString.empty()) {
3412 // Read fasta file(s)
3413 EList<string> origFiles;
3414 tokenize(origString, ",", origFiles);
3415 parseFastas(origFiles, names, nameLens, os, seqLens);
3416 }
3417 PatternParams pp(
3418 format, // file format
3419 fileParallel, // true -> wrap files with separate PairedPatternSources
3420 seed, // pseudo-random seed
3421 useSpinlock, // use spin locks instead of pthreads
3422 solexaQuals, // true -> qualities are on solexa64 scale
3423 phred64Quals, // true -> qualities are on phred64 scale
3424 integerQuals, // true -> qualities are space-separated numbers
3425 fuzzy, // true -> try to parse fuzzy fastq
3426 fastaContLen, // length of sampled reads for FastaContinuous...
3427 fastaContFreq, // frequency of sampled reads for FastaContinuous...
3428 skipReads // skip the first 'skip' patterns
3429 );
3430 if(gVerbose || startVerbose) {
3431 cerr << "Creating PatternSource: "; logTime(cerr, true);
3432 }
3433 PairedPatternSource *patsrc = PairedPatternSource::setupPatternSources(
3434 queries, // singles, from argv
3435 mates1, // mate1's, from -1 arg
3436 mates2, // mate2's, from -2 arg
3437 mates12, // both mates on each line, from --12 arg
3438 qualities, // qualities associated with singles
3439 qualities1, // qualities associated with m1
3440 qualities2, // qualities associated with m2
3441 pp, // read read-in parameters
3442 gVerbose || startVerbose); // be talkative
3443 // Open hit output file
3444 if(gVerbose || startVerbose) {
3445 cerr << "Opening hit output file: "; logTime(cerr, true);
3446 }
3447 OutFileBuf *fout;
3448 if(!outfile.empty()) {
3449 fout = new OutFileBuf(outfile.c_str(), false);
3450 } else {
3451 fout = new OutFileBuf();
3452 }
3453 // Initialize Ebwt object and read in header
3454 if(gVerbose || startVerbose) {
3455 cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
3456 }
3457 adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose);
3458 HierEbwt<index_t, local_index_t> ebwt(
3459 adjIdxBase,
3460 0, // index is colorspace
3461 -1, // fw index
3462 true, // index is for the forward direction
3463 /* overriding: */ offRate,
3464 0, // amount to add to index offrate or <= 0 to do nothing
3465 useMm, // whether to use memory-mapped files
3466 useShmem, // whether to use shared memory
3467 mmSweep, // sweep memory-mapped files
3468 !noRefNames, // load names?
3469 true, // load SA sample?
3470 true, // load ftab?
3471 true, // load rstarts?
3472 gVerbose, // whether to be talkative
3473 startVerbose, // talkative during initialization
3474 false /*passMemExc*/,
3475 sanityCheck);
3476 HierEbwt<index_t, local_index_t>* ebwtBw = NULL;
3477 #if 0
3478 // We need the mirror index if mismatches are allowed
3479 if(multiseedMms > 0 || do1mmUpFront) {
3480 if(gVerbose || startVerbose) {
3481 cerr << "About to initialize rev Ebwt: "; logTime(cerr, true);
3482 }
3483 ebwtBw = new HierEbwt<index_t, local_index_t>(
3484 adjIdxBase + ".rev",
3485 0, // index is colorspace
3486 1, // TODO: maybe not
3487 false, // index is for the reverse direction
3488 /* overriding: */ offRate,
3489 0, // amount to add to index offrate or <= 0 to do nothing
3490 useMm, // whether to use memory-mapped files
3491 useShmem, // whether to use shared memory
3492 mmSweep, // sweep memory-mapped files
3493 !noRefNames, // load names?
3494 true, // load SA sample?
3495 true, // load ftab?
3496 true, // load rstarts?
3497 gVerbose, // whether to be talkative
3498 startVerbose, // talkative during initialization
3499 false /*passMemExc*/,
3500 sanityCheck);
3501 }
3502 #endif
3503 if(sanityCheck && !os.empty()) {
3504 // Sanity check number of patterns and pattern lengths in Ebwt
3505 // against original strings
3506 assert_eq(os.size(), ebwt.nPat());
3507 for(size_t i = 0; i < os.size(); i++) {
3508 assert_eq(os[i].length(), ebwt.plen()[i]);
3509 }
3510 }
3511 // Sanity-check the restored version of the Ebwt
3512 if(sanityCheck && !os.empty()) {
3513 ebwt.loadIntoMemory(
3514 0,
3515 -1, // fw index
3516 true, // load SA sample
3517 true, // load ftab
3518 true, // load rstarts
3519 !noRefNames,
3520 startVerbose);
3521 ebwt.checkOrigs(os, false, false);
3522 ebwt.evictFromMemory();
3523 }
3524 OutputQueue oq(
3525 *fout, // out file buffer
3526 reorder && nthreads > 1, // whether to reorder when there's >1 thread
3527 nthreads, // # threads
3528 nthreads > 1, // whether to be thread-safe
3529 skipReads); // first read will have this rdid
3530 {
3531 Timer _t(cerr, "Time searching: ", timing);
3532 // Set up penalities
3533 if(bonusMatch > 0 && !localAlign) {
3534 cerr << "Warning: Match bonus always = 0 in --end-to-end mode; ignoring user setting" << endl;
3535 bonusMatch = 0;
3536 }
3537 Scoring sc(
3538 bonusMatch, // constant reward for match
3539 penMmcType, // how to penalize mismatches
3540 penMmcMax, // max mm pelanty
3541 penMmcMin, // min mm pelanty
3542 scoreMin, // min score as function of read len
3543 nCeil, // max # Ns as function of read len
3544 penNType, // how to penalize Ns in the read
3545 penN, // constant if N pelanty is a constant
3546 penNCatPair, // whether to concat mates before N filtering
3547 penRdGapConst, // constant coeff for read gap cost
3548 penRfGapConst, // constant coeff for ref gap cost
3549 penRdGapLinear, // linear coeff for read gap cost
3550 penRfGapLinear, // linear coeff for ref gap cost
3551 gGapBarrier, // # rows at top/bot only entered diagonally
3552 penCanSplice, // canonical splicing penalty
3553 penNoncanSplice,// non-canonical splicing penalty
3554 &penIntronLen); // penalty as to intron length
3555 EList<size_t> reflens;
3556 for(size_t i = 0; i < ebwt.nPat(); i++) {
3557 reflens.push_back(ebwt.plen()[i]);
3558 }
3559 EList<string> refnames;
3560 readEbwtRefnames<index_t>(adjIdxBase, refnames);
3561 SamConfig samc(
3562 refnames, // reference sequence names
3563 reflens, // reference sequence lengths
3564 samTruncQname, // whether to truncate QNAME to 255 chars
3565 samOmitSecSeqQual, // omit SEQ/QUAL for 2ndary alignments?
3566 samNoUnal, // omit unaligned-read records?
3567 string("hisat"), // program id
3568 string("hisat"), // program name
3569 string(HISAT_VERSION), // program version
3570 argstr, // command-line
3571 rgs_optflag, // read-group string
3572 rna_strandness,
3573 sam_print_as,
3574 sam_print_xs,
3575 sam_print_xss,
3576 sam_print_yn,
3577 sam_print_xn,
3578 sam_print_cs,
3579 sam_print_cq,
3580 sam_print_x0,
3581 sam_print_x1,
3582 sam_print_xm,
3583 sam_print_xo,
3584 sam_print_xg,
3585 sam_print_nm,
3586 sam_print_md,
3587 sam_print_yf,
3588 sam_print_yi,
3589 sam_print_ym,
3590 sam_print_yp,
3591 sam_print_yt,
3592 sam_print_ys,
3593 sam_print_zs,
3594 sam_print_xr,
3595 sam_print_xt,
3596 sam_print_xd,
3597 sam_print_xu,
3598 sam_print_yl,
3599 sam_print_ye,
3600 sam_print_yu,
3601 sam_print_xp,
3602 sam_print_yr,
3603 sam_print_zb,
3604 sam_print_zr,
3605 sam_print_zf,
3606 sam_print_zm,
3607 sam_print_zi,
3608 sam_print_zp,
3609 sam_print_zu,
3610 sam_print_xs_a);
3611 // Set up hit sink; if sanityCheck && !os.empty() is true,
3612 // then instruct the sink to "retain" hits in a vector in
3613 // memory so that we can easily sanity check them later on
3614 AlnSink<index_t> *mssink = NULL;
3615 Timer *_tRef = new Timer(cerr, "Time loading reference: ", timing);
3616 auto_ptr<BitPairReference> refs(
3617 new BitPairReference(
3618 adjIdxBase,
3619 false,
3620 sanityCheck,
3621 NULL,
3622 NULL,
3623 false,
3624 useMm,
3625 useShmem,
3626 mmSweep,
3627 gVerbose,
3628 startVerbose)
3629 );
3630 delete _tRef;
3631 if(!refs->loaded()) throw 1;
3632
3633 init_junction_prob();
3634 bool write = novelSpliceSiteOutfile != "" || useTempSpliceSite;
3635 bool read = knownSpliceSiteInfile != "" || novelSpliceSiteInfile != "" || useTempSpliceSite;
3636 ssdb = new SpliceSiteDB(
3637 *(refs.get()),
3638 refnames,
3639 nthreads > 1, // thread-safe
3640 write, // write?
3641 read); // read?
3642 if(ssdb != NULL) {
3643 if(knownSpliceSiteInfile != "") {
3644 ifstream ssdb_file(knownSpliceSiteInfile.c_str(), ios::in);
3645 if(ssdb_file.is_open()) {
3646 ssdb->read(ssdb_file, false); // known splice sites
3647 ssdb_file.close();
3648 }
3649 }
3650 if(novelSpliceSiteInfile != "") {
3651 ifstream ssdb_file(novelSpliceSiteInfile.c_str(), ios::in);
3652 if(ssdb_file.is_open()) {
3653 ssdb->read(ssdb_file, true); // novel splice sites
3654 ssdb_file.close();
3655 }
3656 }
3657 }
3658 switch(outType) {
3659 case OUTPUT_SAM: {
3660 mssink = new AlnSinkSam<index_t>(
3661 oq, // output queue
3662 samc, // settings & routines for SAM output
3663 refnames, // reference names
3664 gQuiet, // don't print alignment summary at end
3665 ssdb);
3666 #if 0
3667 if(!samNoHead) {
3668 bool printHd = true, printSq = true;
3669 BTString buf;
3670 samc.printHeader(buf, rgid, rgs, printHd, !samNoSQ, printSq);
3671 fout->writeString(buf);
3672 }
3673 #endif
3674 break;
3675 }
3676 default:
3677 cerr << "Invalid output type: " << outType << endl;
3678 throw 1;
3679 }
3680 if(gVerbose || startVerbose) {
3681 cerr << "Dispatching to search driver: "; logTime(cerr, true);
3682 }
3683 // Set up global constraint
3684 OutFileBuf *metricsOfb = NULL;
3685 if(!metricsFile.empty() && metricsIval > 0) {
3686 metricsOfb = new OutFileBuf(metricsFile);
3687 }
3688 // Do the search for all input reads
3689 assert(patsrc != NULL);
3690 assert(mssink != NULL);
3691 multiseedSearch(
3692 sc, // scoring scheme
3693 *patsrc, // pattern source
3694 *mssink, // hit sink
3695 ebwt, // BWT
3696 *ebwtBw, // BWT'
3697 refs.get(),
3698 refnames,
3699 metricsOfb);
3700 // Evict any loaded indexes from memory
3701 if(ebwt.isInMemory()) {
3702 ebwt.evictFromMemory();
3703 }
3704 if(ebwtBw != NULL) {
3705 delete ebwtBw;
3706 }
3707 if(!gQuiet && !seedSumm) {
3708 size_t repThresh = mhits;
3709 if(repThresh == 0) {
3710 repThresh = std::numeric_limits<size_t>::max();
3711 }
3712 #if 0
3713 mssink->finish(
3714 repThresh,
3715 gReportDiscordant,
3716 gReportMixed,
3717 hadoopOut);
3718 #endif
3719 }
3720 if(ssdb != NULL) {
3721 if(novelSpliceSiteOutfile != "") {
3722 ofstream ssdb_file(novelSpliceSiteOutfile.c_str(), ios::out);
3723 if(ssdb_file.is_open()) {
3724 ssdb->print(ssdb_file);
3725 ssdb_file.close();
3726 }
3727 }
3728 }
3729 oq.flush(true);
3730 assert_eq(oq.numStarted(), oq.numFinished());
3731 assert_eq(oq.numStarted(), oq.numFlushed());
3732 delete patsrc;
3733 delete mssink;
3734 delete ssdb;
3735 delete metricsOfb;
3736 if(fout != NULL) {
3737 delete fout;
3738 }
3739 }
3740 }
3741
3742 // C++ name mangling is disabled for the bowtie() function to make it
3743 // easier to use Bowtie as a library.
3744 extern "C" {
3745
3746 /**
3747 * Main bowtie entry function. Parses argc/argv style command-line
3748 * options, sets global configuration variables, and calls the driver()
3749 * function.
3750 */
hisat(int argc,const char ** argv)3751 int hisat(int argc, const char **argv) {
3752 try {
3753 // Reset all global state, including getopt state
3754 opterr = optind = 1;
3755 resetOptions();
3756 for(int i = 0; i < argc; i++) {
3757 argstr += argv[i];
3758 if(i < argc-1) argstr += " ";
3759 }
3760 if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
3761 parseOptions(argc, argv);
3762 argv0 = argv[0];
3763 if(showVersion) {
3764 cout << argv0 << " version " << HISAT_VERSION << endl;
3765 if(sizeof(void*) == 4) {
3766 cout << "32-bit" << endl;
3767 } else if(sizeof(void*) == 8) {
3768 cout << "64-bit" << endl;
3769 } else {
3770 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
3771 }
3772 cout << "Built on " << BUILD_HOST << endl;
3773 cout << BUILD_TIME << endl;
3774 cout << "Compiler: " << COMPILER_VERSION << endl;
3775 cout << "Options: " << COMPILER_OPTIONS << endl;
3776 cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
3777 << sizeof(int)
3778 << ", " << sizeof(long) << ", " << sizeof(long long)
3779 << ", " << sizeof(void *) << ", " << sizeof(size_t)
3780 << ", " << sizeof(off_t) << "}" << endl;
3781 return 0;
3782 }
3783 {
3784 Timer _t(cerr, "Overall time: ", timing);
3785 if(startVerbose) {
3786 cerr << "Parsing index and read arguments: "; logTime(cerr, true);
3787 }
3788
3789 // Get index basename (but only if it wasn't specified via --index)
3790 if(bt2index.empty()) {
3791 if(optind >= argc) {
3792 cerr << "No index, query, or output file specified!" << endl;
3793 printUsage(cerr);
3794 return 1;
3795 }
3796 bt2index = argv[optind++];
3797 }
3798
3799 // Get query filename
3800 bool got_reads = !queries.empty() || !mates1.empty() || !mates12.empty();
3801 if(optind >= argc) {
3802 if(!got_reads) {
3803 printUsage(cerr);
3804 cerr << "***" << endl
3805 << "Error: Must specify at least one read input with -U/-1/-2" << endl;
3806 return 1;
3807 }
3808 } else if(!got_reads) {
3809 // Tokenize the list of query files
3810 tokenize(argv[optind++], ",", queries);
3811 if(queries.empty()) {
3812 cerr << "Tokenized query file list was empty!" << endl;
3813 printUsage(cerr);
3814 return 1;
3815 }
3816 }
3817
3818 // Get output filename
3819 if(optind < argc && outfile.empty()) {
3820 outfile = argv[optind++];
3821 cerr << "Warning: Output file '" << outfile.c_str()
3822 << "' was specified without -S. This will not work in "
3823 << "future HISAT 2 versions. Please use -S instead."
3824 << endl;
3825 }
3826
3827 // Extra parametesr?
3828 if(optind < argc) {
3829 cerr << "Extra parameter(s) specified: ";
3830 for(int i = optind; i < argc; i++) {
3831 cerr << "\"" << argv[i] << "\"";
3832 if(i < argc-1) cerr << ", ";
3833 }
3834 cerr << endl;
3835 if(mates1.size() > 0) {
3836 cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
3837 << "also be specified. Please run HISAT2 separately for mates and singles." << endl;
3838 }
3839 throw 1;
3840 }
3841
3842 // Optionally summarize
3843 if(gVerbose) {
3844 cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
3845 cout << "Query inputs (DNA, " << file_format_names[format].c_str() << "):" << endl;
3846 for(size_t i = 0; i < queries.size(); i++) {
3847 cout << " " << queries[i].c_str() << endl;
3848 }
3849 cout << "Quality inputs:" << endl;
3850 for(size_t i = 0; i < qualities.size(); i++) {
3851 cout << " " << qualities[i].c_str() << endl;
3852 }
3853 cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
3854 cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
3855 cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
3856 #ifdef NDEBUG
3857 cout << "Assertions: disabled" << endl;
3858 #else
3859 cout << "Assertions: enabled" << endl;
3860 #endif
3861 }
3862 if(ipause) {
3863 cout << "Press key to continue..." << endl;
3864 getchar();
3865 }
3866 driver<SString<char> >("DNA", bt2index, outfile);
3867 }
3868 return 0;
3869 } catch(std::exception& e) {
3870 cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
3871 cerr << "Command: ";
3872 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
3873 cerr << endl;
3874 return 1;
3875 } catch(int e) {
3876 if(e != 0) {
3877 cerr << "Error: Encountered internal HISAT2 exception (#" << e << ")" << endl;
3878 cerr << "Command: ";
3879 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
3880 cerr << endl;
3881 }
3882 return e;
3883 }
3884 } // bowtie()
3885 } // extern "C"
3886