1 /*==========================================================================
2 RazerS - Fast Read Mapping with Controlled Loss Rate
3 http://www.seqan.de/projects/razers.html
4
5 ============================================================================
6 Copyright (C) 2008 by David Weese
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 3 of the License, or (at your options) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ==========================================================================*/
21
22 #define SEQAN_PROFILE // enable time measuring
23 //#define SEQAN_DEBUG_SWIFT // test SWIFT correctness and print bucket parameters
24 //#define RAZERS_DEBUG // print verification regions
25 #define RAZERS_PRUNE_QGRAM_INDEX // ignore highly abundant k-mers
26 //#define RAZERS_MEMOPT // optimize memory consumption
27 #define RAZERS_MASK_READS // remove matches with max-hits optimal hits on-the-fly
28 //#define NO_PARAM_CHOOSER // disable loss-rate parameter choosing
29 #define RAZERS_ISLAND_CRITERION // island match criterion
30 #define RAZERS_NOOUTERREADGAPS // enforce the alignment of the first and last base (determines the lakes)
31
32 #define RAZERS_OPENADDRESSING // enables open addressing for the k-mer index as well as the possibility to set the load factor (-lf)
33 #define RAZERS_BANDED_MYERS // uses a banded version of Myers bitvector algorithm (analogous to H. Hyyr\"o, 2001)
34 //#define SEQAN_OPENADDRESSING_COMPACT // saves some memory for the openaddressing index / faster hash table access (if undefined)s
35 //#define RAZERS_DEBUG_MATEPAIRS
36 #define RAZERS_DEFER_COMPACTION // mask duplicates on the fly and defer compaction
37 #define RAZERS_EXTERNAL_MATCHES // use external memory algorithms for managing matches
38
39 //#define RAZERS_PROFILE // Extensive profiling information.
40 //#define RAZERS_TIMER // output information on how fast filtration and verification as well as waiting times
41 //#define RAZERS_WINDOW // use the findWindownext function on the "normal" index
42
43 #define RAZERS_MATEPAIRS // enable paired-end matching
44 //#define SEQAN_USE_SSE2_WORDS // use SSE2 128-bit integers for MyersBitVector
45
46 // Warn the user about missing OpenMP. This can be suppressed by setting the
47 // CXX flag "SEQAN_IGNORE_MISSING_OPENMP=1".
48
49 #ifdef _OPENMP
50 #include <omp.h>
51 #else
52 #if !defined(SEQAN_IGNORE_MISSING_OPENMP) || (SEQAN_IGNORE_MISSING_OPENMP == 0)
53 #pragma message("OpenMP not found! Shared-memory parallelization will be disabled in RazerS3.")
54 #endif // #if !defined(SEQAN_IGNORE_MISSING_OPENMP) || (SEQAN_IGNORE_MISSING_OPENMP == 0)
55 #endif
56
57 #include <iostream>
58 #include <sstream>
59
60 #include <seqan/basic.h>
61 #include <seqan/sequence.h>
62 #include <seqan/file.h>
63 #include <seqan/store.h>
64 #include <seqan/arg_parse.h>
65 #include <seqan/parallel.h>
66
67 #ifdef STDLIB_VS
68 #include <process.h>
69 #endif
70
71 #include <memory>
72 #include <unordered_map>
73
74 #include "razers.h"
75 #include "outputFormat.h"
76 #include "paramChooser.h"
77
78 #ifdef RAZERS_WINDOW
79 #include "razers_window.h"
80 #endif
81
82 #ifdef RAZERS_MATEPAIRS
83 #include "razers_matepairs.h"
84 #ifdef _OPENMP
85 #include "razers_matepairs_parallel.h"
86 #endif
87 #endif
88
89 #ifdef _OPENMP
90 #include "razers_parallel.h"
91 #endif
92
93 #ifdef RAZERS_PROFILE
94 #include "profile_timeline.h"
95 #endif // #ifdef RAZERS_PROFILE
96
97 using namespace std;
98 using namespace seqan;
99
100 struct MyFragStoreConfig :
101 public FragmentStoreConfig<>
102 {
103 typedef String<Dna5> TContigSeq;
104 };
105
106 //////////////////////////////////////////////////////////////////////////////
107 // Main read mapper function
108 template <typename TSpec>
mapReads(StringSet<CharString> & genomeFileNames,StringSet<CharString> & readFileNames,RazerSOptions<TSpec> & options)109 int mapReads(
110 StringSet<CharString> & genomeFileNames,
111 StringSet<CharString> & readFileNames, // NULL terminated list of one/two read files (single/mate-pairs)
112 RazerSOptions<TSpec> & options)
113 {
114 FragmentStore<MyFragStoreConfig> store; // stores all of the tables
115 String<String<unsigned short> > stats; // needed for mapping quality calculation
116
117 // dump configuration in verbose mode
118 if (options._debugLevel >= 1)
119 {
120 CharString bitmap;
121 Shape<Dna, GenericShape> shape;
122 stringToShape(shape, options.shape);
123 shapeToString(bitmap, shape);
124
125 cerr << "___SETTINGS____________" << endl;
126 cerr << "Genome file: \t" << genomeFileNames[0] << endl;
127 if (length(readFileNames) < 2)
128 cerr << "Read file: \t" << readFileNames[0] << endl;
129 else
130 {
131 cerr << "Read files: \t" << readFileNames[0] << endl;
132 for (unsigned i = 1; i < length(readFileNames); ++i)
133 cerr << " \t" << readFileNames[i] << endl;
134 }
135 cerr << "Compute forward matches: \t";
136 if (options.forward)
137 cerr << "YES" << endl;
138 else
139 cerr << "NO" << endl;
140 cerr << "Compute reverse matches: \t";
141 if (options.reverse)
142 cerr << "YES" << endl;
143 else
144 cerr << "NO" << endl;
145 cerr << "Allow Indels: \t";
146 if (options.gapMode == RAZERS_GAPPED)
147 cerr << "YES" << endl;
148 else
149 cerr << "NO" << endl;
150 cerr << "Error rate: \t" << options.errorRate << endl;
151 if (options.threshold > 0)
152 cerr << "Minimal threshold: \t" << options.threshold << endl;
153 else
154 cerr << "Pigeonhole mode with overlap: \t" << options.overlap << endl;
155 cerr << "Shape: \t" << bitmap << endl;
156 cerr << "Repeat threshold: \t" << options.repeatLength << endl;
157 cerr << "Overabundance threshold: \t" << options.abundanceCut << endl;
158 if (options.threshold > 0)
159 cerr << "Taboo length: \t" << options.tabooLength << endl;
160 if (options._debugLevel >= 1)
161 {
162 #ifdef STDLIB_VS
163 int pid = _getpid();
164 #else // #ifdef STDLIB_VS
165 int pid = getpid();
166 #endif // #ifdef STDLIB_VS
167 cerr << "Program PID: \t" << pid << endl;
168 }
169 cerr << endl;
170 }
171
172 // circumvent numerical obstacles
173 options.errorRate += 0.0000001;
174
175 #ifdef RAZERS_PROFILE
176 timelineBeginTask(TASK_LOAD);
177 #endif // #ifdef RAZERS_PROFILE
178 //////////////////////////////////////////////////////////////////////////////
179 // Step 1: Load reads
180 SEQAN_PROTIMESTART(load_time);
181
182 #ifdef RAZERS_MATEPAIRS
183 if (length(readFileNames) == 2)
184 {
185 if (!loadReads(store, options.readFile, toCString(readFileNames[1]), options))
186 {
187 //if (!loadReads(readSet, readQualities, readNames, readFileNames[0], readFileNames[1], options)) {
188 cerr << "Failed to load reads" << endl;
189 return RAZERS_READS_FAILED;
190 }
191 }
192 else
193 #endif
194 {
195 if (!loadReads(store, options.readFile, options))
196 {
197 cerr << "Failed to load reads" << endl;
198 return RAZERS_READS_FAILED;
199 }
200 }
201
202 if (options._debugLevel >= 1)
203 cerr << lengthSum(store.readSeqStore) << " bps of " << length(store.readSeqStore) << " reads loaded." << endl;
204 options.timeLoadFiles = SEQAN_PROTIMEDIFF(load_time);
205
206 if (options._debugLevel >= 1)
207 cerr << "Loading reads took \t" << options.timeLoadFiles << " seconds" << endl;
208
209 #ifdef RAZERS_MEMOPT
210 if (length(store.readSeqStore) > 16777216)
211 {
212 cerr << "more than 2^24 reads. Switch of RAZERS_MEMOPT in razers.cpp or use less." << std::endl;
213 return 1;
214 }
215 #endif
216
217 //////////////////////////////////////////////////////////////////////////////
218 // Step 2: Load genomes
219 if (length(genomeFileNames) == 1)
220 {
221 int result = getGenomeFileNameList(genomeFileNames[0], genomeFileNames, options);
222 if (result == RAZERS_GENOME_FAILED)
223 {
224 cerr << "Failed to open genome file " << genomeFileNames[0] << endl;
225 return result;
226 }
227 }
228 #ifdef RAZERS_PROFILE
229 timelineEndTask(TASK_LOAD);
230 #endif // #ifdef RAZERS_PROFILE
231
232 //////////////////////////////////////////////////////////////////////////////
233 // Step 3: Find matches using SWIFT
234 loadContigs(store, genomeFileNames, false); // add filenames to the contig store (they are loaded on-demand)
235 int error = _mapReads(store, stats, (RazerSCoreOptions<TSpec>&)options);
236 if (error != 0)
237 {
238 switch (error)
239 {
240 case RAZERS_GENOME_FAILED:
241 cerr << "Failed to load genomes" << endl;
242 break;
243
244 case RAZERS_INVALID_SHAPE:
245 cerr << "Invalid Shape" << endl;
246 break;
247 }
248 return error;
249 }
250
251 //////////////////////////////////////////////////////////////////////////////
252 // Step 4: Remove duplicates and output matches
253 #ifdef RAZERS_PROFILE
254 timelineBeginTask(TASK_DUMP_MATCHES);
255 #endif // #ifdef RAZERS_PROFILE
256 if (!options.spec.DONT_DUMP_RESULTS)
257 dumpMatches(store, stats, readFileNames[0], options);
258 #ifdef RAZERS_PROFILE
259 timelineEndTask(TASK_DUMP_MATCHES);
260 #endif // #ifdef RAZERS_PROFILE
261
262 return 0;
263 }
264
whichMacros()265 inline void whichMacros()
266 {
267 #ifdef RAZERS_OPENADDRESSING
268 std::cerr << "Index: Open addressing" << std::endl;
269 #else
270 std::cerr << "Index: Normal" << std::endl;
271 #endif
272
273 #ifdef RAZERS_TIMER
274 std::cerr << "Timer: ON" << std::endl;
275 #else
276 std::cerr << "Timer: OFF" << std::endl;
277 #endif
278
279 #ifdef _OPENMP
280 std::cerr << "OpenMP: ON" << std::endl;
281 #else
282 std::cerr << "OpenMP: OFF" << std::endl;
283 #endif
284
285 #ifdef RAZERS_BANDED_MYERS
286 std::cerr << "Myers: Banded" << std::endl;
287 #else
288 std::cerr << "Myers: Unbanded" << std::endl;
289 #endif
290
291 #ifdef RAZERS_PROFILE
292 std::cerr << "Timeline: ON" << std::endl;
293 #else
294 std::cerr << "Timeline: OFF" << std::endl;
295 #endif
296 std::cerr << std::endl;
297 }
298
setUpArgumentParser(ArgumentParser & parser,RazerSOptions<> & options,ParamChooserOptions const & pm_options)299 void setUpArgumentParser(ArgumentParser & parser, RazerSOptions<> & options, ParamChooserOptions const & pm_options)
300 {
301 setAppName(parser, "razers3");
302 setShortDescription(parser, "Faster, fully sensitive read mapping");
303 setCategory(parser, "Read Mapping");
304 options.version = "3.3";
305 setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
306 setDate(parser, SEQAN_DATE);
307
308 // Need genome and reads (hg18.fa reads.fq)
309 addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE));
310 setValidValues(parser, 0, seqan::SeqFileIn::getFileExtensions());
311 setHelpText(parser, 0, "A reference genome file.");
312 addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "READS", true));
313 setValidValues(parser, 1, seqan::SeqFileIn::getFileExtensions());
314 setHelpText(parser, 1, "Either one (single-end) or two (paired-end) read files.");
315
316 addUsageLine(parser, "[\\fIOPTIONS\\fP] <\\fIGENOME FILE\\fP> <\\fIREADS FILE\\fP>");
317 #ifdef RAZERS_MATEPAIRS
318 addUsageLine(parser, "[\\fIOPTIONS\\fP] <\\fIGENOME FILE\\fP> <\\fIPE-READS FILE1\\fP> <\\fIPE-READS FILE2\\fP>");
319 #endif
320
321 addDescription(parser, "RazerS 3 is a versatile full-sensitive read mapper based on k-mer counting and seeding filters. "
322 "It supports single and paired-end mapping, shared-memory parallelism, and optimally parametrizes "
323 "the filter based on a user-defined minimal sensitivity. "
324 "See \\fIhttp://www.seqan.de/projects/razers\\fP for more information.");
325
326 addDescription(parser, "Input to RazerS 3 is a reference genome file and either one file with single-end reads "
327 "or two files containing left or right mates of paired-end reads. Use - to read single-end "
328 "reads from stdin.");
329
330 addDescription(parser, "(c) Copyright 2009-2014 by David Weese.");
331
332 addSection(parser, "Main Options");
333 addOption(parser, ArgParseOption("i", "percent-identity", "Percent identity threshold.", ArgParseOption::DOUBLE));
334 setMinValue(parser, "percent-identity", "50");
335 setMaxValue(parser, "percent-identity", "100");
336 setDefaultValue(parser, "percent-identity", 100 - (100.0 * options.errorRate));
337 addOption(parser, ArgParseOption("rr", "recognition-rate", "Percent recognition rate.", ArgParseOption::DOUBLE));
338 setMinValue(parser, "recognition-rate", "80");
339 setMaxValue(parser, "recognition-rate", "100");
340 setDefaultValue(parser, "recognition-rate", 100 - (100.0 * pm_options.optionLossRate));
341 addOption(parser, ArgParseOption("ng", "no-gaps", "Allow only mismatches, no indels. Default: allow both."));
342 addOption(parser, ArgParseOption("f", "forward", "Map reads only to forward strands."));
343 addOption(parser, ArgParseOption("r", "reverse", "Map reads only to reverse strands."));
344 addOption(parser, ArgParseOption("m", "max-hits", "Output only <\\fINUM\\fP> of the best hits.", ArgParseOption::INTEGER));
345 setMinValue(parser, "max-hits", "1");
346 setDefaultValue(parser, "max-hits", options.maxHits);
347 addOption(parser, ArgParseOption("", "unique", "Output only unique best matches (-m 1 -dr 0 -pa)."));
348 addOption(parser, ArgParseOption("tr", "trim-reads", "Trim reads to given length. Default: off.", ArgParseOption::INTEGER));
349 setMinValue(parser, "trim-reads", "14");
350 addOption(parser, ArgParseOption("o", "output", "Mapping result filename (use - to dump to stdout in razers format). Default: <\\fIREADS FILE\\fP>.razers.", ArgParseOption::OUTPUT_FILE));
351 setValidValues(parser, "output", ".razers .eland .fa .fasta .gff .sam .bam .afg");
352 addOption(parser, ArgParseOption("v", "verbose", "Verbose mode."));
353 addOption(parser, ArgParseOption("vv", "vverbose", "Very verbose mode."));
354
355 #ifdef RAZERS_MATEPAIRS
356 addSection(parser, "Paired-end Options");
357 addOption(parser, ArgParseOption("ll", "library-length", "Paired-end library length.", ArgParseOption::INTEGER));
358 setMinValue(parser, "library-length", "1");
359 setDefaultValue(parser, "library-length", options.libraryLength);
360 addOption(parser, ArgParseOption("le", "library-error", "Paired-end library length tolerance.", ArgParseOption::INTEGER));
361 setMinValue(parser, "library-error", "0");
362 setDefaultValue(parser, "library-error", options.libraryError);
363 #endif
364
365 addSection(parser, "Output Format Options");
366 addOption(parser, ArgParseOption("a", "alignment", "Dump the alignment for each match (only \\fIrazer\\fP or \\fIfasta\\fP format)."));
367 addOption(parser, ArgParseOption("pa", "purge-ambiguous", "Purge reads with more than <\\fImax-hits\\fP> best matches."));
368 addOption(parser, ArgParseOption("dr", "distance-range", "Only consider matches with at most NUM more errors compared to the best. Default: output all.", ArgParseOption::INTEGER));
369 addOption(parser, ArgParseOption("gn", "genome-naming", "Select how genomes are named (see Naming section below).", ArgParseOption::INTEGER));
370 setMinValue(parser, "genome-naming", "0");
371 setMaxValue(parser, "genome-naming", "1");
372 setDefaultValue(parser, "genome-naming", options.genomeNaming);
373 // addHelpLine(parser, "0 = use Fasta id");
374 // addHelpLine(parser, "1 = enumerate beginning with 1");
375 addOption(parser, ArgParseOption("rn", "read-naming", "Select how reads are named (see Naming section below).", ArgParseOption::INTEGER));
376 setDefaultValue(parser, "read-naming", options.readNaming);
377 setMinValue(parser, "read-naming", "0");
378 setMaxValue(parser, "read-naming", "3");
379
380 // addHelpLine(parser, "0 = use Fasta id");
381 // addHelpLine(parser, "1 = enumerate beginning with 1");
382 // addHelpLine(parser, "2 = use the read sequence (only for short reads!)");
383 // addHelpLine(parser, "3 = use the Fasta id, do NOT append '/L' or '/R' for mate pairs");
384 addOption(parser, ArgParseOption("", "full-readid", "Use the whole read id (don't clip after whitespace)."));
385 addOption(parser, ArgParseOption("so", "sort-order", "Select how matches are sorted (see Sorting section below).", ArgParseOption::INTEGER));
386 setDefaultValue(parser, "sort-order", options.sortOrder);
387 setMinValue(parser, "sort-order", "0");
388 setMaxValue(parser, "sort-order", "1");
389 // addHelpLine(parser, "0 = 1. read number, 2. genome position");
390 // addHelpLine(parser, "1 = 1. genome position, 2. read number");
391 addOption(parser, ArgParseOption("pf", "position-format", "Select begin/end position numbering (see Coordinate section below).", ArgParseOption::INTEGER));
392 setMinValue(parser, "position-format", "0");
393 setMaxValue(parser, "position-format", "1");
394 setDefaultValue(parser, "position-format", options.sortOrder);
395 // addHelpLine(parser, "0 = gap space");
396 // addHelpLine(parser, "1 = position space");
397 addOption(parser, ArgParseOption("ds", "dont-shrink-alignments", "Disable alignment shrinking in SAM. This is required for generating a gold mapping for Rabema."));
398 addOption(parser, ArgParseOption("ga", "global-alignment", "Compute global alignment (in SAM output)."));
399 hideOption(parser, "global-alignment");
400
401 addSection(parser, "Filtration Options");
402 addOption(parser, ArgParseOption("fl", "filter", "Select k-mer filter.", ArgParseOption::STRING));
403 setValidValues(parser, "filter", "pigeonhole swift");
404 setDefaultValue(parser, "filter", "pigeonhole");
405 addOption(parser, ArgParseOption("mr", "mutation-rate", "Set the percent mutation rate (\\fIpigeonhole\\fP).", ArgParseOption::DOUBLE));
406 setMinValue(parser, "mutation-rate", "0");
407 setMaxValue(parser, "mutation-rate", "20");
408 setDefaultValue(parser, "mutation-rate", 100.0 * options.mutationRate);
409 addOption(parser, ArgParseOption("ol", "overlap-length", "Manually set the overlap length of adjacent k-mers (\\fIpigeonhole\\fP).", ArgParseOption::INTEGER));
410 setMinValue(parser, "overlap-length", "0");
411 #ifndef NO_PARAM_CHOOSER
412 addOption(parser, ArgParseOption("pd", "param-dir", "Read user-computed parameter files in the directory <\\fIDIR\\fP> (\\fIswift\\fP).", ArgParseOption::STRING, "DIR"));
413 #endif
414 addOption(parser, ArgParseOption("t", "threshold", "Manually set minimum k-mer count threshold (\\fIswift\\fP).", ArgParseOption::INTEGER));
415 setMinValue(parser, "threshold", "1");
416 addOption(parser, ArgParseOption("tl", "taboo-length", "Set taboo length (\\fIswift\\fP).", ArgParseOption::INTEGER));
417 setMinValue(parser, "taboo-length", "1");
418 setDefaultValue(parser, "taboo-length", options.tabooLength);
419 addOption(parser, ArgParseOption("s", "shape", "Manually set k-mer shape.", ArgParseOption::STRING, "BITSTRING"));
420 // setDefaultValue(parser, "shape", options.shape); // <-- doesn't work with KNIME which always sets default values
421 addOption(parser, ArgParseOption("oc", "overabundance-cut", "Set k-mer overabundance cut ratio.", ArgParseOption::INTEGER));
422 setMinValue(parser, "overabundance-cut", "0");
423 setMaxValue(parser, "overabundance-cut", "1");
424 setDefaultValue(parser, "overabundance-cut", options.abundanceCut);
425 addOption(parser, ArgParseOption("rl", "repeat-length", "Skip simple-repeats of length <\\fINUM\\fP>.", ArgParseOption::INTEGER));
426 setMinValue(parser, "repeat-length", "1");
427 setDefaultValue(parser, "repeat-length", options.repeatLength);
428 #ifdef RAZERS_OPENADDRESSING
429 addOption(parser, ArgParseOption("lf", "load-factor", "Set the load factor for the open addressing k-mer index.", ArgParseOption::DOUBLE));
430 setMinValue(parser, "load-factor", "1");
431 setDefaultValue(parser, "load-factor", options.loadFactor);
432 #endif
433
434 addSection(parser, "Verification Options");
435 addOption(parser, ArgParseOption("mN", "match-N", "N matches all other characters. Default: N matches nothing."));
436 addOption(parser, ArgParseOption("ed", "error-distr", "Write error distribution to \\fIFILE\\fP.", ArgParseOption::STRING, "FILE"));
437 addOption(parser, ArgParseOption("mf", "mismatch-file", "Write mismatch patterns to \\fIFILE\\fP.", ArgParseOption::STRING, "FILE"));
438
439 addSection(parser, "Misc Options");
440 addOption(parser, ArgParseOption("cm", "compact-mult", "Multiply compaction threshold by this value after reaching and compacting.", ArgParseOption::DOUBLE));
441 setMinValue(parser, "compact-mult", "0");
442 setDefaultValue(parser, "compact-mult", options.compactMult);
443 addOption(parser, ArgParseOption("ncf", "no-compact-frac", "Don't compact if in this last fraction of genome.", ArgParseOption::DOUBLE));
444 setMinValue(parser, "no-compact-frac", "0");
445 setMaxValue(parser, "no-compact-frac", "1");
446 setDefaultValue(parser, "no-compact-frac", options.noCompactFrac);
447
448 #ifdef _OPENMP
449 addSection(parser, "Parallelism Options");
450 #endif // #ifdef _OPENMP
451 addOption(parser, ArgParseOption("tc", "thread-count", "Set the number of threads to use (0 to force sequential mode).", ArgParseOption::INTEGER));
452 setMinValue(parser, "thread-count", "0");
453 #ifndef _OPENMP
454 hideOption(parser, "tc");
455 #endif // #ifndef _OPENMP
456 #ifdef _OPENMP
457 setDefaultValue(parser, "thread-count", options.threadCount);
458 addOption(parser, ArgParseOption("pws", "parallel-window-size", "Collect candidates in windows of this length.", ArgParseOption::INTEGER));
459 setMinValue(parser, "parallel-window-size", "1");
460 setDefaultValue(parser, "parallel-window-size", options.windowSize);
461 addOption(parser, ArgParseOption("pvs", "parallel-verification-size", "Verify candidates in packages of this size.", ArgParseOption::INTEGER));
462 setMinValue(parser, "parallel-verification-size", "1");
463 setDefaultValue(parser, "parallel-verification-size", options.verificationPackageSize);
464 addOption(parser, ArgParseOption("pvmpc", "parallel-verification-max-package-count", "Largest number of packages to create for verification per thread-1.", ArgParseOption::INTEGER));
465 setMinValue(parser, "parallel-verification-max-package-count", "1");
466 setDefaultValue(parser, "parallel-verification-max-package-count", options.maxVerificationPackageCount);
467 // addHelpLine(parser, "Go over package size if this limit is reached.");
468 addOption(parser, ArgParseOption("amms", "available-matches-memory-size", "Bytes of main memory available for storing matches.", ArgParseOption::INTEGER));
469 setMinValue(parser, "available-matches-memory-size", "-1");
470 setDefaultValue(parser, "available-matches-memory-size", options.availableMatchesMemorySize);
471 // addHelpLine(parser, "Used to switch to external sorting.");
472 // addHelpLine(parser, "-1 = always external");
473 // addHelpLine(parser, " 0 = never");
474 // addHelpLine(parser, " x = use other value x as threshold.");
475 addOption(parser, ArgParseOption("mhst", "match-histo-start-threshold", "When to start histogram.", ArgParseOption::INTEGER));
476 setMinValue(parser, "match-histo-start-threshold", "1");
477 setDefaultValue(parser, "match-histo-start-threshold", options.matchHistoStartThreshold);
478 #endif
479
480 addTextSection(parser, "Formats, Naming, Sorting, and Coordinate Schemes");
481
482 addText(parser, "RazerS 3 supports various output formats. The output format is detected automatically from the file name suffix.");
483 addListItem(parser, ".razers", "Razer format");
484 addListItem(parser, ".fa, .fasta", "Enhanced Fasta format");
485 addListItem(parser, ".eland", "Eland format");
486 addListItem(parser, ".gff", "GFF format");
487 addListItem(parser, ".sam", "SAM format");
488 addListItem(parser, ".bam", "BAM format");
489 addListItem(parser, ".afg", "Amos AFG format");
490
491 addText(parser, "By default, reads and contigs are referred by their Fasta ids given in the input files. "
492 "With the \\fB-gn\\fP and \\fB-rn\\fP options this behaviour can be changed:");
493 addListItem(parser, "0", "Use Fasta id.");
494 addListItem(parser, "1", "Enumerate beginning with 1.");
495 addListItem(parser, "2", "Use the read sequence (only for short reads!).");
496 addListItem(parser, "3", "Use the Fasta id, do NOT append /L or /R for mate pairs.");
497
498 addText(parser, "");
499 addText(parser, "The way matches are sorted in the output file can be changed with the \\fB-so\\fP option for the following formats: "
500 "\\fBrazers\\fP, \\fBfasta\\fP, \\fBsam\\fP, and \\fBafg\\fP. Primary and secondary sort keys are:");
501 addListItem(parser, "0", "1. read number, 2. genome position");
502 addListItem(parser, "1", "1. genome position, 2. read number");
503
504 addText(parser, "");
505 addText(parser, "The coordinate space used for begin and end positions can be changed with the "
506 "\\fB-pf\\fP option for the \\fBrazer\\fP and \\fBfasta\\fP formats:");
507 addListItem(parser, "0", "Gap space. Gaps between characters are counted from 0.");
508 addListItem(parser, "1", "Position space. Characters are counted from 1.");
509
510
511 addTextSection(parser, "Examples");
512 addListItem(parser,
513 "\\fBrazers3\\fP \\fB-i\\fP \\fB96\\fP \\fB-tc\\fP \\fB12\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads.fq\\fP",
514 "Map single-end reads with 4% error rate using 12 threads.");
515 addListItem(parser,
516 "\\fBrazers3\\fP \\fB-i\\fP \\fB95\\fP \\fB-no-gaps\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads.fq.gz\\fP",
517 "Map single-end gzipped reads with 5% error rate and no indels.");
518 addListItem(parser,
519 "\\fBrazers3\\fP \\fB-i\\fP \\fB94\\fP \\fB-rr\\fP \\fB95\\fP \\fB-tc\\fP \\fB12\\fP \\fB-ll\\fP \\fB280\\fP \\fB--le\\fP \\fB80\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads_1.fq\\fP \\fBreads_2.fq\\fP",
520 "Map paired-end reads with up to 6% errors, 95% sensitivity, 12 threads, and only output aligned pairs with an outer distance of 200-360bp.");
521 }
522
523 ArgumentParser::ParseResult
extractOptions(StringSet<CharString> & genomeFileNames,StringSet<CharString> & readFileNames,RazerSOptions<> & options,ParamChooserOptions & pm_options,ArgumentParser const & parser)524 extractOptions(
525 StringSet<CharString> & genomeFileNames, StringSet<CharString> & readFileNames,
526 RazerSOptions<> & options, ParamChooserOptions & pm_options,
527 ArgumentParser const & parser)
528 {
529 //////////////////////////////////////////////////////////////////////////////
530 // Extract options
531
532 bool stop = false;
533 getOptionValue(options.forward, parser, "forward");
534 getOptionValue(options.reverse, parser, "reverse");
535 getOptionValue(options.errorRate, parser, "percent-identity");
536 getOptionValue(options.mutationRate, parser, "mutation-rate");
537 #ifndef NO_PARAM_CHOOSER
538 getOptionValue(pm_options.optionLossRate, parser, "recognition-rate");
539 getOptionValue(pm_options.paramFolder, parser, "param-dir");
540 // append slash/backslash
541 if (!empty(pm_options.paramFolder))
542 {
543 if (back(pm_options.paramFolder) != '/' && back(pm_options.paramFolder) != '\\')
544 {
545 #ifdef STDLIB_VS
546 appendValue(pm_options.paramFolder, '\\');
547 #else
548 appendValue(pm_options.paramFolder, '/');
549 #endif
550 }
551 }
552 #endif
553 options.gapMode = (isSet(parser, "no-gaps")) ? RAZERS_UNGAPPED : RAZERS_GAPPED;
554 #ifdef RAZERS_MATEPAIRS
555 getOptionValue(options.libraryLength, parser, "library-length");
556 getOptionValue(options.libraryError, parser, "library-error");
557 #endif
558 getOptionValue(options.maxHits, parser, "max-hits");
559 getOptionValue(options.purgeAmbiguous, parser, "purge-ambiguous");
560 getOptionValue(options.scoreDistanceRange, parser, "distance-range");
561 if (isSet(parser, "distance-range"))
562 options.scoreDistanceRange++;
563 getOptionValue(options.dumpAlignment, parser, "alignment");
564
565 getOptionValue(options.sortOrder, parser, "sort-order");
566 getOptionValue(options.genomeNaming, parser, "genome-naming");
567 getOptionValue(options.readNaming, parser, "read-naming");
568 getOptionValue(options.fullFastaId, parser, "full-readid");
569 getOptionValue(options.positionFormat, parser, "position-format");
570 getOptionValue(options.compactMult, parser, "compact-mult");
571 getOptionValue(options.noCompactFrac, parser, "no-compact-frac");
572 getOptionValue(options.dontShrinkAlignments, parser, "dont-shrink-alignments");
573 getOptionValue(options.computeGlobal, parser, "global-alignment");
574 getOptionValue(options.shape, parser, "shape");
575 getOptionValue(options.overlap, parser, "overlap-length");
576 getOptionValue(options.abundanceCut, parser, "overabundance-cut");
577 getOptionValue(options.repeatLength, parser, "repeat-length");
578
579 #ifdef _OPENMP
580 getOptionValue(options.threadCount, parser, "thread-count");
581 getOptionValue(options.windowSize, parser, "parallel-window-size");
582 getOptionValue(options.verificationPackageSize, parser, "parallel-verification-size");
583 getOptionValue(options.maxVerificationPackageCount, parser, "parallel-verification-max-package-count");
584 getOptionValue(options.availableMatchesMemorySize, parser, "available-matches-memory-size");
585 getOptionValue(options.matchHistoStartThreshold, parser, "match-histo-start-threshold");
586 #else
587 options.threadCount = 0;
588 #endif
589
590 #ifdef RAZERS_OPENADDRESSING
591 getOptionValue(options.loadFactor, parser, "load-factor");
592 #endif
593 getOptionValue(options.trimLength, parser, "trim-reads");
594 getOptionValue(options.tabooLength, parser, "taboo-length");
595 getOptionValue(options.matchN, parser, "match-N");
596 getOptionValue(options.errorPrbFileName, parser, "error-distr");
597 getOptionValue(options.mismatchFilename, parser, "mismatch-file");
598
599 if (isSet(parser, "verbose"))
600 options._debugLevel = max(options._debugLevel, 1);
601 if (isSet(parser, "vverbose"))
602 options._debugLevel = max(options._debugLevel, 3);
603 if (isSet(parser, "unique"))
604 {
605 options.maxHits = 1;
606 options.scoreDistanceRange = 1;
607 options.purgeAmbiguous = true;
608 }
609 if (!options.forward && !options.reverse) // enable both per default
610 {
611 options.forward = true;
612 options.reverse = true;
613 }
614
615 #ifdef RAZERS_MATEPAIRS
616 unsigned maxReadFiles = 2;
617 #else
618 unsigned maxReadFiles = 1;
619 #endif
620 resize(genomeFileNames, length(genomeFileNames) + 1);
621 getArgumentValue(back(genomeFileNames), parser, 0);
622 resize(readFileNames, _min(getArgumentValueCount(parser, 1), maxReadFiles), Exact());
623 for (unsigned i = 0; i < length(readFileNames); ++i)
624 getArgumentValue(readFileNames[i], parser, 1, i);
625
626 // Get output file name from command line if set. Otherwise, autogenerate from input file name.
627 if (isSet(parser, "output"))
628 {
629 getOptionValue(options.output, parser, "output");
630 }
631 else
632 {
633 options.output = readFileNames[0];
634 append(options.output, ".razers");
635 }
636
637 // Get lower case of the output file name. File endings are accepted in both upper and lower case.
638 CharString tmp = options.output;
639 toLower(tmp);
640
641 if (endsWith(tmp, ".fa") || endsWith(tmp, ".fasta"))
642 options.outputFormat = 1;
643 else if (endsWith(tmp, ".eland"))
644 options.outputFormat = 2;
645 else if (endsWith(tmp, ".gff"))
646 options.outputFormat = 3;
647 else if (endsWith(tmp, ".sam") || endsWith(tmp, ".bam"))
648 options.outputFormat = 4;
649 else if (endsWith(tmp, ".afg"))
650 options.outputFormat = 5;
651 else
652 options.outputFormat = 0; // default is ".razers"
653
654 // don't append /L/R in SAM mode
655 if (!isSet(parser, "read-naming") && options.outputFormat == 4)
656 options.readNaming = 3;
657
658 CharString filter;
659 getOptionValue(filter, parser, "filter");
660 if (filter == "pigeonhole")
661 {
662 if (isSet(parser, "threshold") && (stop = true))
663 cerr << "k-mer threshold can only be set for the swift filter (-fl swift)" << endl;
664 options.threshold = 0;
665 }
666 else
667 {
668 getOptionValue(options.threshold, parser, "threshold");
669 if (isSet(parser, "overlap-length") && (stop = true))
670 cerr << "k-mer overlap length can only be set for the pigeonhole filter (-fl pigeonhole)" << endl;
671 }
672
673 if (isSet(parser, "shape"))
674 {
675 unsigned ones = 0;
676 unsigned zeros = 0;
677 for (unsigned i = 0; i < length(options.shape); ++i)
678 switch (options.shape[i])
679 {
680 case '0':
681 ++zeros;
682 break;
683
684 case '1':
685 ++ones;
686 break;
687
688 default:
689 cerr << "Shape must be a binary string" << endl;
690 stop = true;
691 i = length(options.shape);
692 }
693 if ((ones == 0 || ones > 31) && !stop)
694 {
695 cerr << "Invalid Shape" << endl;
696 stop = true;
697 }
698
699 unsigned maxOnes = 14;
700 #ifdef RAZERS_OPENADDRESSING
701 maxOnes = 31;
702 #endif
703 if ((ones < 7 || ones > maxOnes) && !stop)
704 cerr << "Warning: Shape should contain at least 7 and at most " << maxOnes << " '1's" << endl;
705 options.delta = ones + zeros;
706 }
707 if (getArgumentValueCount(parser, 1) == 1)
708 options.libraryLength = -1; // only 1 readset -> disable mate-pair mapping
709 if ((getArgumentValueCount(parser, 1) > maxReadFiles) && (stop = true))
710 cerr << "More than " << maxReadFiles << " read files specified." << endl;
711 if ((getArgumentValueCount(parser, 1) == 0) && (stop = true))
712 cerr << "No read files specified." << endl;
713
714 options.compMask[4] = (options.matchN) ? 15 : 0;
715 options.errorRate = (100.0 - options.errorRate) / 100.0;
716 options.mutationRate = options.mutationRate / 100.0;
717 options.lossRate = pm_options.optionLossRate = (100.0 - pm_options.optionLossRate) / 100.0;
718
719 return (stop) ? ArgumentParser::PARSE_ERROR : ArgumentParser::PARSE_OK;
720 }
721
722 //////////////////////////////////////////////////////////////////////////////
723 // Command line parsing and parameter choosing
main(int argc,const char * argv[])724 int main(int argc, const char * argv[])
725 {
726 //whichMacros();
727
728 #ifdef RAZERS_PROFILE
729 initTimeline();
730 unsigned x = timelineAddTaskType("ON_CONTIG", "Work on contig.");
731 (void)x; // Disable warning if assertions are disable.
732 SEQAN_ASSERT_EQ(x, 1u); // The following will be OK, too.
733 timelineAddTaskType("INIT", "Initialization.");
734 timelineAddTaskType("REVCOMP", "Reverse-complementing contig.");
735 timelineAddTaskType("FILTER", "Filtration using SWIFT.");
736 timelineAddTaskType("VERIFY", "Verification of SWIFT hits.");
737 timelineAddTaskType("WRITEBACK", "Write back to block-local store.");
738 timelineAddTaskType("COMPACT", "Compaction");
739 timelineAddTaskType("DUMP_MATCHES", "Dump matches.");
740 timelineAddTaskType("LOAD", "Load input.");
741 timelineAddTaskType("SORT", "Sorting.");
742 timelineAddTaskType("COPY_FINDER", "Copy SWIFT Finder.");
743 #endif // #ifndef RAZERS_PROFILE
744
745 RazerSOptions<> options;
746 ParamChooserOptions pm_options;
747 StringSet<CharString> genomeFileNames;
748 StringSet<CharString> readFileNames;
749
750 // Change defaults
751 options.forward = false;
752 options.reverse = false;
753
754 // Set up command line parser.
755 ArgumentParser argParser;
756 setUpArgumentParser(argParser, options, pm_options);
757
758 // Parse command line.
759 ArgumentParser::ParseResult res = parse(argParser, argc, argv);
760 if (res != ArgumentParser::PARSE_OK)
761 {
762 if (res == ArgumentParser::PARSE_ERROR)
763 cerr << "Exiting ..." << endl;
764 return (res == ArgumentParser::PARSE_ERROR) ? RAZERS_INVALID_OPTIONS : 0;
765 }
766 // Extract options.
767 res = extractOptions(genomeFileNames, readFileNames, options, pm_options, argParser);
768 if (res != ArgumentParser::PARSE_OK)
769 {
770 cerr << "Exiting ..." << endl;
771 return RAZERS_INVALID_OPTIONS;
772 }
773
774 #ifdef _OPENMP
775 // Set maximal number of threads.
776 int oldMaxThreads = omp_get_max_threads();
777 omp_set_num_threads(options.threadCount == 0 ? 1 : options.threadCount);
778 #endif // #ifdef _OPENMP
779
780 //////////////////////////////////////////////////////////////////////////////
781 // open left reads file
782
783 bool success;
784 if (!isEqual(readFileNames[0], "-"))
785 success = open(options.readFile, toCString(readFileNames[0]));
786 else
787 success = open(options.readFile, std::cin);
788
789 if (!success)
790 return RAZERS_READS_FAILED;
791
792 //////////////////////////////////////////////////////////////////////////////
793 // get read length
794 int readLength = estimateReadLength(options.readFile);
795 if (readLength == RAZERS_READS_FAILED)
796 {
797 cerr << "Failed to open reads file " << readFileNames[0] << endl;
798 cerr << "Exiting ..." << endl;
799 return RAZERS_READS_FAILED;
800 }
801 if (readLength == 0)
802 {
803 cerr << "Failed to read the first read sequence." << endl;
804 cerr << "Exiting ..." << endl;
805 return RAZERS_READS_FAILED;
806 }
807
808 if (options.trimLength > readLength)
809 options.trimLength = readLength;
810
811 #ifndef NO_PARAM_CHOOSER
812 if (options.threshold != 0 && !(isSet(argParser, "shape") || isSet(argParser, "threshold")))
813 {
814 if (options.lowMemory)
815 pm_options.maxWeight = 13;
816 pm_options.verbose = (options._debugLevel >= 1);
817 pm_options.optionErrorRate = options.errorRate;
818 if (options.gapMode == RAZERS_UNGAPPED)
819 {
820 pm_options.optionProbINSERT = (ParamChooserOptions::TFloat)0.0;
821 pm_options.optionProbDELETE = (ParamChooserOptions::TFloat)0.0;
822 }
823 else
824 {
825 pm_options.optionProbINSERT = (ParamChooserOptions::TFloat)0.01; //this number is basically without meaning, any value > 0 will lead to
826 pm_options.optionProbDELETE = (ParamChooserOptions::TFloat)0.01; //edit distance parameter choosing
827 }
828
829 if (options.trimLength > 0)
830 readLength = options.trimLength;
831 if (readLength > 0)
832 {
833 /* if(options.maqMapping && readLength != options.artSeedLength)
834 pm_options.totalN = options.artSeedLength;
835 else*/
836 pm_options.totalN = readLength;
837 if (options._debugLevel >= 1)
838 cerr << "___PARAMETER_CHOOSING__" << endl;
839 if (!chooseParams(options, pm_options))
840 {
841 if (pm_options.verbose)
842 cerr << "Couldn't find preprocessed parameter files. Please configure manually (options --shape and --threshold)." << endl;
843 if (options._debugLevel >= 1)
844 cerr << "Using default configurations (shape = " << options.shape << " and k-mer lemma)." << endl;
845 }
846 if (options._debugLevel >= 1)
847 cerr << endl;
848 }
849 else
850 {
851 cerr << "Failed to load reads" << endl;
852 cerr << "Exiting ..." << endl;
853 return RAZERS_READS_FAILED;
854 }
855 }
856 #endif
857
858 if (argc > 1)
859 options.commandLine = argv[1];
860 for (int i = 2; i < argc; ++i)
861 {
862 appendValue(options.commandLine, ' ');
863 append(options.commandLine, argv[i]);
864 }
865
866 int result = mapReads(genomeFileNames, readFileNames, options);
867 if (result != 0)
868 cerr << "Exiting ..." << endl;
869
870 #ifdef _OPENMP
871 // Restoring number of threads for side-effect freeness.
872 omp_set_num_threads(oldMaxThreads);
873 #endif // #ifdef _OPENMP
874
875 #ifdef RAZERS_PROFILE
876 dumpTimeline("razers.profile.txt", true);
877 #endif // #ifndef RAZERS_PROFILE
878
879 return result;
880 }
881