1 /*==========================================================================
2              RazerS - Fast Read Mapping with Controlled Loss Rate
3                    http://www.seqan.de/projects/razers.html
4 
5  ============================================================================
6   Copyright (C) 2008 by David Weese
7 
8   This program is free software; you can redistribute it and/or
9   modify it under the terms of the GNU Lesser General Public
10   License as published by the Free Software Foundation; either
11   version 3 of the License, or (at your options) any later version.
12 
13   This program is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17 
18   You should have received a copy of the GNU General Public License
19   along with this program.  If not, see <http://www.gnu.org/licenses/>.
20  ==========================================================================*/
21 
22 #define SEQAN_PROFILE                   // enable time measuring
23 //#define SEQAN_DEBUG_SWIFT				// test SWIFT correctness and print bucket parameters
24 //#define RAZERS_DEBUG					// print verification regions
25 #define RAZERS_PRUNE_QGRAM_INDEX        // ignore highly abundant k-mers
26 //#define RAZERS_MEMOPT					// optimize memory consumption
27 #define RAZERS_MASK_READS               // remove matches with max-hits optimal hits on-the-fly
28 //#define NO_PARAM_CHOOSER				// disable loss-rate parameter choosing
29 #define RAZERS_ISLAND_CRITERION         // island match criterion
30 #define RAZERS_NOOUTERREADGAPS          // enforce the alignment of the first and last base (determines the lakes)
31 
32 #define RAZERS_OPENADDRESSING           // enables open addressing for the k-mer index as well as the possibility to set the load factor (-lf)
33 #define RAZERS_BANDED_MYERS             // uses a banded version of Myers bitvector algorithm (analogous to H. Hyyr\"o, 2001)
34 //#define SEQAN_OPENADDRESSING_COMPACT	// saves some memory for the openaddressing index / faster hash table access (if undefined)s
35 //#define RAZERS_DEBUG_MATEPAIRS
36 #define RAZERS_DEFER_COMPACTION         // mask duplicates on the fly and defer compaction
37 #define RAZERS_EXTERNAL_MATCHES         // use external memory algorithms for managing matches
38 
39 //#define RAZERS_PROFILE                // Extensive profiling information.
40 //#define RAZERS_TIMER					// output information on how fast filtration and verification as well as waiting times
41 //#define RAZERS_WINDOW					// use the findWindownext function on the "normal" index
42 
43 #define RAZERS_MATEPAIRS                // enable paired-end matching
44 //#define SEQAN_USE_SSE2_WORDS			// use SSE2 128-bit integers for MyersBitVector
45 
46 // Warn the user about missing OpenMP.  This can be suppressed by setting the
47 // CXX flag "SEQAN_IGNORE_MISSING_OPENMP=1".
48 
49 #ifdef _OPENMP
50 #include <omp.h>
51 #else
52 #if !defined(SEQAN_IGNORE_MISSING_OPENMP) || (SEQAN_IGNORE_MISSING_OPENMP == 0)
53 #pragma message("OpenMP not found! Shared-memory parallelization will be disabled in RazerS3.")
54 #endif  // #if !defined(SEQAN_IGNORE_MISSING_OPENMP) || (SEQAN_IGNORE_MISSING_OPENMP == 0)
55 #endif
56 
57 #include <iostream>
58 #include <sstream>
59 
60 #include <seqan/basic.h>
61 #include <seqan/sequence.h>
62 #include <seqan/file.h>
63 #include <seqan/store.h>
64 #include <seqan/arg_parse.h>
65 #include <seqan/parallel.h>
66 
67 #ifdef STDLIB_VS
68 #include <process.h>
69 #endif
70 
71 #include <memory>
72 #include <unordered_map>
73 
74 #include "razers.h"
75 #include "outputFormat.h"
76 #include "paramChooser.h"
77 
78 #ifdef RAZERS_WINDOW
79 #include "razers_window.h"
80 #endif
81 
82 #ifdef RAZERS_MATEPAIRS
83 #include "razers_matepairs.h"
84 #ifdef _OPENMP
85 #include "razers_matepairs_parallel.h"
86 #endif
87 #endif
88 
89 #ifdef _OPENMP
90 #include "razers_parallel.h"
91 #endif
92 
93 #ifdef RAZERS_PROFILE
94 #include "profile_timeline.h"
95 #endif  // #ifdef RAZERS_PROFILE
96 
97 using namespace std;
98 using namespace seqan;
99 
100 struct MyFragStoreConfig :
101     public FragmentStoreConfig<>
102 {
103     typedef String<Dna5> TContigSeq;
104 };
105 
106 //////////////////////////////////////////////////////////////////////////////
107 // Main read mapper function
108 template <typename TSpec>
mapReads(StringSet<CharString> & genomeFileNames,StringSet<CharString> & readFileNames,RazerSOptions<TSpec> & options)109 int mapReads(
110     StringSet<CharString> & genomeFileNames,
111     StringSet<CharString> & readFileNames,  // NULL terminated list of one/two read files (single/mate-pairs)
112     RazerSOptions<TSpec> & options)
113 {
114     FragmentStore<MyFragStoreConfig>    store;          // stores all of the tables
115     String<String<unsigned short> >     stats;      // needed for mapping quality calculation
116 
117     // dump configuration in verbose mode
118     if (options._debugLevel >= 1)
119     {
120         CharString bitmap;
121         Shape<Dna, GenericShape> shape;
122         stringToShape(shape, options.shape);
123         shapeToString(bitmap, shape);
124 
125         cerr << "___SETTINGS____________" << endl;
126         cerr << "Genome file:                     \t" << genomeFileNames[0] << endl;
127         if (length(readFileNames) < 2)
128             cerr << "Read file:                       \t" << readFileNames[0] << endl;
129         else
130         {
131             cerr << "Read files:                      \t" << readFileNames[0] << endl;
132             for (unsigned i = 1; i < length(readFileNames); ++i)
133                 cerr << "                                 \t" << readFileNames[i] << endl;
134         }
135         cerr << "Compute forward matches:         \t";
136         if (options.forward)
137             cerr << "YES" << endl;
138         else
139             cerr << "NO" << endl;
140         cerr << "Compute reverse matches:         \t";
141         if (options.reverse)
142             cerr << "YES" << endl;
143         else
144             cerr << "NO" << endl;
145         cerr << "Allow Indels:                    \t";
146         if (options.gapMode == RAZERS_GAPPED)
147             cerr << "YES" << endl;
148         else
149             cerr << "NO" << endl;
150         cerr << "Error rate:                      \t" << options.errorRate << endl;
151         if (options.threshold > 0)
152             cerr << "Minimal threshold:               \t" << options.threshold << endl;
153         else
154             cerr << "Pigeonhole mode with overlap:    \t" << options.overlap << endl;
155         cerr << "Shape:                           \t" << bitmap << endl;
156         cerr << "Repeat threshold:                \t" << options.repeatLength << endl;
157         cerr << "Overabundance threshold:         \t" << options.abundanceCut << endl;
158         if (options.threshold > 0)
159             cerr << "Taboo length:                    \t" << options.tabooLength << endl;
160         if (options._debugLevel >= 1)
161         {
162 #ifdef STDLIB_VS
163             int pid = _getpid();
164 #else // #ifdef STDLIB_VS
165             int pid = getpid();
166 #endif // #ifdef STDLIB_VS
167             cerr << "Program PID:                     \t" << pid << endl;
168         }
169         cerr << endl;
170     }
171 
172     // circumvent numerical obstacles
173     options.errorRate += 0.0000001;
174 
175 #ifdef RAZERS_PROFILE
176     timelineBeginTask(TASK_LOAD);
177 #endif  // #ifdef RAZERS_PROFILE
178     //////////////////////////////////////////////////////////////////////////////
179     // Step 1: Load reads
180     SEQAN_PROTIMESTART(load_time);
181 
182 #ifdef RAZERS_MATEPAIRS
183     if (length(readFileNames) == 2)
184     {
185         if (!loadReads(store, options.readFile, toCString(readFileNames[1]), options))
186         {
187             //if (!loadReads(readSet, readQualities, readNames, readFileNames[0], readFileNames[1], options)) {
188             cerr << "Failed to load reads" << endl;
189             return RAZERS_READS_FAILED;
190         }
191     }
192     else
193 #endif
194     {
195         if (!loadReads(store, options.readFile, options))
196         {
197             cerr << "Failed to load reads" << endl;
198             return RAZERS_READS_FAILED;
199         }
200     }
201 
202     if (options._debugLevel >= 1)
203         cerr << lengthSum(store.readSeqStore) << " bps of " << length(store.readSeqStore) << " reads loaded." << endl;
204     options.timeLoadFiles = SEQAN_PROTIMEDIFF(load_time);
205 
206     if (options._debugLevel >= 1)
207         cerr << "Loading reads took               \t" << options.timeLoadFiles << " seconds" << endl;
208 
209     #ifdef RAZERS_MEMOPT
210     if (length(store.readSeqStore) > 16777216)
211     {
212         cerr << "more than 2^24 reads. Switch of RAZERS_MEMOPT in razers.cpp or use less." << std::endl;
213         return 1;
214     }
215     #endif
216 
217     //////////////////////////////////////////////////////////////////////////////
218     // Step 2: Load genomes
219     if (length(genomeFileNames) == 1)
220     {
221         int result = getGenomeFileNameList(genomeFileNames[0], genomeFileNames, options);
222         if (result == RAZERS_GENOME_FAILED)
223         {
224             cerr << "Failed to open genome file " << genomeFileNames[0] << endl;
225             return result;
226         }
227     }
228 #ifdef RAZERS_PROFILE
229     timelineEndTask(TASK_LOAD);
230 #endif  // #ifdef RAZERS_PROFILE
231 
232     //////////////////////////////////////////////////////////////////////////////
233     // Step 3: Find matches using SWIFT
234     loadContigs(store, genomeFileNames, false); // add filenames to the contig store (they are loaded on-demand)
235     int error = _mapReads(store, stats, (RazerSCoreOptions<TSpec>&)options);
236     if (error != 0)
237     {
238         switch (error)
239         {
240         case RAZERS_GENOME_FAILED:
241             cerr << "Failed to load genomes" << endl;
242             break;
243 
244         case RAZERS_INVALID_SHAPE:
245             cerr << "Invalid Shape" << endl;
246             break;
247         }
248         return error;
249     }
250 
251     //////////////////////////////////////////////////////////////////////////////
252     // Step 4: Remove duplicates and output matches
253 #ifdef RAZERS_PROFILE
254     timelineBeginTask(TASK_DUMP_MATCHES);
255 #endif  // #ifdef RAZERS_PROFILE
256     if (!options.spec.DONT_DUMP_RESULTS)
257         dumpMatches(store, stats, readFileNames[0], options);
258 #ifdef RAZERS_PROFILE
259     timelineEndTask(TASK_DUMP_MATCHES);
260 #endif  // #ifdef RAZERS_PROFILE
261 
262     return 0;
263 }
264 
whichMacros()265 inline void whichMacros()
266 {
267 #ifdef RAZERS_OPENADDRESSING
268     std::cerr << "Index:    Open addressing" << std::endl;
269 #else
270     std::cerr << "Index:    Normal" << std::endl;
271 #endif
272 
273 #ifdef RAZERS_TIMER
274     std::cerr << "Timer:    ON" << std::endl;
275 #else
276     std::cerr << "Timer:    OFF" << std::endl;
277 #endif
278 
279 #ifdef _OPENMP
280     std::cerr << "OpenMP:   ON" << std::endl;
281 #else
282     std::cerr << "OpenMP:   OFF" << std::endl;
283 #endif
284 
285 #ifdef RAZERS_BANDED_MYERS
286     std::cerr << "Myers:    Banded" << std::endl;
287 #else
288     std::cerr << "Myers:    Unbanded" << std::endl;
289 #endif
290 
291 #ifdef RAZERS_PROFILE
292     std::cerr << "Timeline: ON" << std::endl;
293 #else
294     std::cerr << "Timeline: OFF" << std::endl;
295 #endif
296     std::cerr << std::endl;
297 }
298 
setUpArgumentParser(ArgumentParser & parser,RazerSOptions<> & options,ParamChooserOptions const & pm_options)299 void setUpArgumentParser(ArgumentParser & parser, RazerSOptions<> & options, ParamChooserOptions const & pm_options)
300 {
301     setAppName(parser, "razers3");
302     setShortDescription(parser, "Faster, fully sensitive read mapping");
303     setCategory(parser, "Read Mapping");
304     options.version = "3.3";
305 	setVersion(parser, SEQAN_APP_VERSION " [" SEQAN_REVISION "]");
306     setDate(parser, SEQAN_DATE);
307 
308     // Need genome and reads (hg18.fa reads.fq)
309     addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE));
310     setValidValues(parser, 0, seqan::SeqFileIn::getFileExtensions());
311     setHelpText(parser, 0, "A reference genome file.");
312     addArgument(parser, ArgParseArgument(ArgParseArgument::INPUT_FILE, "READS", true));
313     setValidValues(parser, 1, seqan::SeqFileIn::getFileExtensions());
314     setHelpText(parser, 1, "Either one (single-end) or two (paired-end) read files.");
315 
316     addUsageLine(parser, "[\\fIOPTIONS\\fP] <\\fIGENOME FILE\\fP> <\\fIREADS FILE\\fP>");
317 #ifdef RAZERS_MATEPAIRS
318     addUsageLine(parser, "[\\fIOPTIONS\\fP] <\\fIGENOME FILE\\fP> <\\fIPE-READS FILE1\\fP> <\\fIPE-READS FILE2\\fP>");
319 #endif
320 
321     addDescription(parser, "RazerS 3 is a versatile full-sensitive read mapper based on k-mer counting and seeding filters. "
322                            "It supports single and paired-end mapping, shared-memory parallelism, and optimally parametrizes "
323                            "the filter based on a user-defined minimal sensitivity. "
324                            "See \\fIhttp://www.seqan.de/projects/razers\\fP for more information.");
325 
326     addDescription(parser, "Input to RazerS 3 is a reference genome file and either one file with single-end reads "
327                            "or two files containing left or right mates of paired-end reads. Use - to read single-end "
328                            "reads from stdin.");
329 
330     addDescription(parser, "(c) Copyright 2009-2014 by David Weese.");
331 
332     addSection(parser, "Main Options");
333     addOption(parser, ArgParseOption("i", "percent-identity", "Percent identity threshold.", ArgParseOption::DOUBLE));
334     setMinValue(parser, "percent-identity", "50");
335     setMaxValue(parser, "percent-identity", "100");
336     setDefaultValue(parser, "percent-identity", 100 - (100.0 * options.errorRate));
337     addOption(parser, ArgParseOption("rr", "recognition-rate", "Percent recognition rate.", ArgParseOption::DOUBLE));
338     setMinValue(parser, "recognition-rate", "80");
339     setMaxValue(parser, "recognition-rate", "100");
340     setDefaultValue(parser, "recognition-rate", 100 - (100.0 * pm_options.optionLossRate));
341     addOption(parser, ArgParseOption("ng", "no-gaps", "Allow only mismatches, no indels. Default: allow both."));
342     addOption(parser, ArgParseOption("f", "forward", "Map reads only to forward strands."));
343     addOption(parser, ArgParseOption("r", "reverse", "Map reads only to reverse strands."));
344     addOption(parser, ArgParseOption("m", "max-hits", "Output only <\\fINUM\\fP> of the best hits.", ArgParseOption::INTEGER));
345     setMinValue(parser, "max-hits", "1");
346     setDefaultValue(parser, "max-hits", options.maxHits);
347     addOption(parser, ArgParseOption("", "unique", "Output only unique best matches (-m 1 -dr 0 -pa)."));
348     addOption(parser, ArgParseOption("tr", "trim-reads", "Trim reads to given length. Default: off.", ArgParseOption::INTEGER));
349     setMinValue(parser, "trim-reads", "14");
350     addOption(parser, ArgParseOption("o", "output", "Mapping result filename (use - to dump to stdout in razers format). Default: <\\fIREADS FILE\\fP>.razers.", ArgParseOption::OUTPUT_FILE));
351     setValidValues(parser, "output", ".razers .eland .fa .fasta .gff .sam .bam .afg");
352     addOption(parser, ArgParseOption("v", "verbose", "Verbose mode."));
353     addOption(parser, ArgParseOption("vv", "vverbose", "Very verbose mode."));
354 
355 #ifdef RAZERS_MATEPAIRS
356     addSection(parser, "Paired-end Options");
357     addOption(parser, ArgParseOption("ll", "library-length", "Paired-end library length.", ArgParseOption::INTEGER));
358     setMinValue(parser, "library-length", "1");
359     setDefaultValue(parser, "library-length", options.libraryLength);
360     addOption(parser, ArgParseOption("le", "library-error", "Paired-end library length tolerance.", ArgParseOption::INTEGER));
361     setMinValue(parser, "library-error", "0");
362     setDefaultValue(parser, "library-error", options.libraryError);
363 #endif
364 
365     addSection(parser, "Output Format Options");
366     addOption(parser, ArgParseOption("a", "alignment", "Dump the alignment for each match (only \\fIrazer\\fP or \\fIfasta\\fP format)."));
367     addOption(parser, ArgParseOption("pa", "purge-ambiguous", "Purge reads with more than <\\fImax-hits\\fP> best matches."));
368     addOption(parser, ArgParseOption("dr", "distance-range", "Only consider matches with at most NUM more errors compared to the best. Default: output all.", ArgParseOption::INTEGER));
369     addOption(parser, ArgParseOption("gn", "genome-naming", "Select how genomes are named (see Naming section below).", ArgParseOption::INTEGER));
370     setMinValue(parser, "genome-naming", "0");
371     setMaxValue(parser, "genome-naming", "1");
372     setDefaultValue(parser, "genome-naming", options.genomeNaming);
373 //	addHelpLine(parser, "0 = use Fasta id");
374 //	addHelpLine(parser, "1 = enumerate beginning with 1");
375     addOption(parser, ArgParseOption("rn", "read-naming", "Select how reads are named (see Naming section below).", ArgParseOption::INTEGER));
376     setDefaultValue(parser, "read-naming", options.readNaming);
377     setMinValue(parser, "read-naming", "0");
378     setMaxValue(parser, "read-naming", "3");
379 
380 //	addHelpLine(parser, "0 = use Fasta id");
381 //	addHelpLine(parser, "1 = enumerate beginning with 1");
382 //	addHelpLine(parser, "2 = use the read sequence (only for short reads!)");
383 //	addHelpLine(parser, "3 = use the Fasta id, do NOT append '/L' or '/R' for mate pairs");
384     addOption(parser, ArgParseOption("", "full-readid", "Use the whole read id (don't clip after whitespace)."));
385     addOption(parser, ArgParseOption("so", "sort-order", "Select how matches are sorted (see Sorting section below).", ArgParseOption::INTEGER));
386     setDefaultValue(parser, "sort-order", options.sortOrder);
387     setMinValue(parser, "sort-order", "0");
388     setMaxValue(parser, "sort-order", "1");
389 //	addHelpLine(parser, "0 = 1. read number, 2. genome position");
390 //	addHelpLine(parser, "1 = 1. genome position, 2. read number");
391     addOption(parser, ArgParseOption("pf", "position-format", "Select begin/end position numbering (see Coordinate section below).", ArgParseOption::INTEGER));
392     setMinValue(parser, "position-format", "0");
393     setMaxValue(parser, "position-format", "1");
394     setDefaultValue(parser, "position-format", options.sortOrder);
395 //	addHelpLine(parser, "0 = gap space");
396 //	addHelpLine(parser, "1 = position space");
397     addOption(parser, ArgParseOption("ds", "dont-shrink-alignments", "Disable alignment shrinking in SAM.  This is required for generating a gold mapping for Rabema."));
398     addOption(parser, ArgParseOption("ga", "global-alignment", "Compute global alignment (in SAM output)."));
399     hideOption(parser, "global-alignment");
400 
401     addSection(parser, "Filtration Options");
402     addOption(parser, ArgParseOption("fl", "filter", "Select k-mer filter.", ArgParseOption::STRING));
403     setValidValues(parser, "filter", "pigeonhole swift");
404     setDefaultValue(parser, "filter", "pigeonhole");
405     addOption(parser, ArgParseOption("mr", "mutation-rate", "Set the percent mutation rate (\\fIpigeonhole\\fP).", ArgParseOption::DOUBLE));
406     setMinValue(parser, "mutation-rate", "0");
407     setMaxValue(parser, "mutation-rate", "20");
408     setDefaultValue(parser, "mutation-rate", 100.0 * options.mutationRate);
409     addOption(parser, ArgParseOption("ol", "overlap-length", "Manually set the overlap length of adjacent k-mers (\\fIpigeonhole\\fP).", ArgParseOption::INTEGER));
410     setMinValue(parser, "overlap-length", "0");
411 #ifndef NO_PARAM_CHOOSER
412     addOption(parser, ArgParseOption("pd", "param-dir", "Read user-computed parameter files in the directory <\\fIDIR\\fP> (\\fIswift\\fP).", ArgParseOption::STRING, "DIR"));
413 #endif
414     addOption(parser, ArgParseOption("t", "threshold", "Manually set minimum k-mer count threshold (\\fIswift\\fP).", ArgParseOption::INTEGER));
415     setMinValue(parser, "threshold", "1");
416     addOption(parser, ArgParseOption("tl", "taboo-length", "Set taboo length (\\fIswift\\fP).", ArgParseOption::INTEGER));
417     setMinValue(parser, "taboo-length", "1");
418     setDefaultValue(parser, "taboo-length", options.tabooLength);
419     addOption(parser, ArgParseOption("s", "shape", "Manually set k-mer shape.", ArgParseOption::STRING, "BITSTRING"));
420 //    setDefaultValue(parser, "shape", options.shape);  // <-- doesn't work with KNIME which always sets default values
421     addOption(parser, ArgParseOption("oc", "overabundance-cut", "Set k-mer overabundance cut ratio.", ArgParseOption::INTEGER));
422     setMinValue(parser, "overabundance-cut", "0");
423     setMaxValue(parser, "overabundance-cut", "1");
424     setDefaultValue(parser, "overabundance-cut", options.abundanceCut);
425     addOption(parser, ArgParseOption("rl", "repeat-length", "Skip simple-repeats of length <\\fINUM\\fP>.", ArgParseOption::INTEGER));
426     setMinValue(parser, "repeat-length", "1");
427     setDefaultValue(parser, "repeat-length", options.repeatLength);
428 #ifdef RAZERS_OPENADDRESSING
429     addOption(parser, ArgParseOption("lf", "load-factor", "Set the load factor for the open addressing k-mer index.", ArgParseOption::DOUBLE));
430     setMinValue(parser, "load-factor", "1");
431     setDefaultValue(parser, "load-factor", options.loadFactor);
432 #endif
433 
434     addSection(parser, "Verification Options");
435     addOption(parser, ArgParseOption("mN", "match-N", "N matches all other characters. Default: N matches nothing."));
436     addOption(parser, ArgParseOption("ed", "error-distr", "Write error distribution to \\fIFILE\\fP.", ArgParseOption::STRING, "FILE"));
437     addOption(parser, ArgParseOption("mf", "mismatch-file", "Write mismatch patterns to \\fIFILE\\fP.", ArgParseOption::STRING, "FILE"));
438 
439     addSection(parser, "Misc Options");
440     addOption(parser, ArgParseOption("cm", "compact-mult", "Multiply compaction threshold by this value after reaching and compacting.", ArgParseOption::DOUBLE));
441     setMinValue(parser, "compact-mult", "0");
442     setDefaultValue(parser, "compact-mult", options.compactMult);
443     addOption(parser, ArgParseOption("ncf", "no-compact-frac", "Don't compact if in this last fraction of genome.", ArgParseOption::DOUBLE));
444     setMinValue(parser, "no-compact-frac", "0");
445     setMaxValue(parser, "no-compact-frac", "1");
446     setDefaultValue(parser, "no-compact-frac", options.noCompactFrac);
447 
448 #ifdef _OPENMP
449     addSection(parser, "Parallelism Options");
450 #endif  // #ifdef _OPENMP
451     addOption(parser, ArgParseOption("tc", "thread-count", "Set the number of threads to use (0 to force sequential mode).", ArgParseOption::INTEGER));
452     setMinValue(parser, "thread-count", "0");
453 #ifndef _OPENMP
454     hideOption(parser, "tc");
455 #endif  // #ifndef _OPENMP
456 #ifdef _OPENMP
457     setDefaultValue(parser, "thread-count", options.threadCount);
458     addOption(parser, ArgParseOption("pws", "parallel-window-size", "Collect candidates in windows of this length.", ArgParseOption::INTEGER));
459     setMinValue(parser, "parallel-window-size", "1");
460     setDefaultValue(parser, "parallel-window-size", options.windowSize);
461     addOption(parser, ArgParseOption("pvs", "parallel-verification-size", "Verify candidates in packages of this size.", ArgParseOption::INTEGER));
462     setMinValue(parser, "parallel-verification-size", "1");
463     setDefaultValue(parser, "parallel-verification-size", options.verificationPackageSize);
464     addOption(parser, ArgParseOption("pvmpc", "parallel-verification-max-package-count", "Largest number of packages to create for verification per thread-1.", ArgParseOption::INTEGER));
465     setMinValue(parser, "parallel-verification-max-package-count", "1");
466     setDefaultValue(parser, "parallel-verification-max-package-count", options.maxVerificationPackageCount);
467 //    addHelpLine(parser, "Go over package size if this limit is reached.");
468     addOption(parser, ArgParseOption("amms", "available-matches-memory-size", "Bytes of main memory available for storing matches.", ArgParseOption::INTEGER));
469     setMinValue(parser, "available-matches-memory-size", "-1");
470     setDefaultValue(parser, "available-matches-memory-size", options.availableMatchesMemorySize);
471 //    addHelpLine(parser, "Used to switch to external sorting.");
472 //    addHelpLine(parser, "-1 = always external");
473 //    addHelpLine(parser, " 0 = never");
474 //    addHelpLine(parser, " x = use other value x as threshold.");
475     addOption(parser, ArgParseOption("mhst", "match-histo-start-threshold", "When to start histogram.", ArgParseOption::INTEGER));
476     setMinValue(parser, "match-histo-start-threshold", "1");
477     setDefaultValue(parser, "match-histo-start-threshold", options.matchHistoStartThreshold);
478 #endif
479 
480     addTextSection(parser, "Formats, Naming, Sorting, and Coordinate Schemes");
481 
482     addText(parser, "RazerS 3 supports various output formats. The output format is detected automatically from the file name suffix.");
483 	addListItem(parser, ".razers", "Razer format");
484 	addListItem(parser, ".fa, .fasta", "Enhanced Fasta format");
485 	addListItem(parser, ".eland", "Eland format");
486 	addListItem(parser, ".gff", "GFF format");
487 	addListItem(parser, ".sam", "SAM format");
488 	addListItem(parser, ".bam", "BAM format");
489 	addListItem(parser, ".afg", "Amos AFG format");
490 
491     addText(parser, "By default, reads and contigs are referred by their Fasta ids given in the input files. "
492                     "With the \\fB-gn\\fP and \\fB-rn\\fP options this behaviour can be changed:");
493     addListItem(parser, "0", "Use Fasta id.");
494     addListItem(parser, "1", "Enumerate beginning with 1.");
495     addListItem(parser, "2", "Use the read sequence (only for short reads!).");
496     addListItem(parser, "3", "Use the Fasta id, do NOT append /L or /R for mate pairs.");
497 
498     addText(parser, "");
499     addText(parser, "The way matches are sorted in the output file can be changed with the \\fB-so\\fP option for the following formats: "
500                     "\\fBrazers\\fP, \\fBfasta\\fP, \\fBsam\\fP, and \\fBafg\\fP. Primary and secondary sort keys are:");
501     addListItem(parser, "0", "1. read number, 2. genome position");
502     addListItem(parser, "1", "1. genome position, 2. read number");
503 
504     addText(parser, "");
505     addText(parser, "The coordinate space used for begin and end positions can be changed with the "
506                     "\\fB-pf\\fP option for the \\fBrazer\\fP and \\fBfasta\\fP formats:");
507     addListItem(parser, "0", "Gap space. Gaps between characters are counted from 0.");
508     addListItem(parser, "1", "Position space. Characters are counted from 1.");
509 
510 
511     addTextSection(parser, "Examples");
512     addListItem(parser,
513                 "\\fBrazers3\\fP \\fB-i\\fP \\fB96\\fP \\fB-tc\\fP \\fB12\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads.fq\\fP",
514                 "Map single-end reads with 4% error rate using 12 threads.");
515     addListItem(parser,
516                 "\\fBrazers3\\fP \\fB-i\\fP \\fB95\\fP \\fB-no-gaps\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads.fq.gz\\fP",
517                 "Map single-end gzipped reads with 5% error rate and no indels.");
518     addListItem(parser,
519                 "\\fBrazers3\\fP \\fB-i\\fP \\fB94\\fP \\fB-rr\\fP \\fB95\\fP \\fB-tc\\fP \\fB12\\fP \\fB-ll\\fP \\fB280\\fP \\fB--le\\fP \\fB80\\fP \\fB-o\\fP \\fBmapped.razers\\fP \\fBhg18.fa\\fP \\fBreads_1.fq\\fP \\fBreads_2.fq\\fP",
520                 "Map paired-end reads with up to 6% errors, 95% sensitivity, 12 threads, and only output aligned pairs with an outer distance of 200-360bp.");
521 }
522 
523 ArgumentParser::ParseResult
extractOptions(StringSet<CharString> & genomeFileNames,StringSet<CharString> & readFileNames,RazerSOptions<> & options,ParamChooserOptions & pm_options,ArgumentParser const & parser)524 extractOptions(
525     StringSet<CharString> & genomeFileNames, StringSet<CharString> & readFileNames,
526     RazerSOptions<> & options, ParamChooserOptions & pm_options,
527     ArgumentParser const & parser)
528 {
529     //////////////////////////////////////////////////////////////////////////////
530     // Extract options
531 
532     bool stop = false;
533     getOptionValue(options.forward, parser, "forward");
534     getOptionValue(options.reverse, parser, "reverse");
535     getOptionValue(options.errorRate, parser, "percent-identity");
536     getOptionValue(options.mutationRate, parser, "mutation-rate");
537 #ifndef NO_PARAM_CHOOSER
538     getOptionValue(pm_options.optionLossRate, parser, "recognition-rate");
539     getOptionValue(pm_options.paramFolder, parser, "param-dir");
540     // append slash/backslash
541     if (!empty(pm_options.paramFolder))
542     {
543         if (back(pm_options.paramFolder) != '/' && back(pm_options.paramFolder) != '\\')
544         {
545 #ifdef STDLIB_VS
546             appendValue(pm_options.paramFolder, '\\');
547 #else
548             appendValue(pm_options.paramFolder, '/');
549 #endif
550         }
551     }
552 #endif
553     options.gapMode = (isSet(parser, "no-gaps")) ? RAZERS_UNGAPPED : RAZERS_GAPPED;
554 #ifdef RAZERS_MATEPAIRS
555     getOptionValue(options.libraryLength, parser, "library-length");
556     getOptionValue(options.libraryError, parser, "library-error");
557 #endif
558     getOptionValue(options.maxHits, parser, "max-hits");
559     getOptionValue(options.purgeAmbiguous, parser, "purge-ambiguous");
560     getOptionValue(options.scoreDistanceRange, parser, "distance-range");
561     if (isSet(parser, "distance-range"))
562         options.scoreDistanceRange++;
563     getOptionValue(options.dumpAlignment, parser, "alignment");
564 
565     getOptionValue(options.sortOrder, parser, "sort-order");
566     getOptionValue(options.genomeNaming, parser, "genome-naming");
567     getOptionValue(options.readNaming, parser, "read-naming");
568     getOptionValue(options.fullFastaId, parser, "full-readid");
569     getOptionValue(options.positionFormat, parser, "position-format");
570     getOptionValue(options.compactMult, parser, "compact-mult");
571     getOptionValue(options.noCompactFrac, parser, "no-compact-frac");
572     getOptionValue(options.dontShrinkAlignments, parser, "dont-shrink-alignments");
573     getOptionValue(options.computeGlobal, parser, "global-alignment");
574     getOptionValue(options.shape, parser, "shape");
575     getOptionValue(options.overlap, parser, "overlap-length");
576     getOptionValue(options.abundanceCut, parser, "overabundance-cut");
577     getOptionValue(options.repeatLength, parser, "repeat-length");
578 
579 #ifdef _OPENMP
580     getOptionValue(options.threadCount, parser, "thread-count");
581     getOptionValue(options.windowSize, parser, "parallel-window-size");
582     getOptionValue(options.verificationPackageSize, parser, "parallel-verification-size");
583     getOptionValue(options.maxVerificationPackageCount, parser, "parallel-verification-max-package-count");
584     getOptionValue(options.availableMatchesMemorySize, parser, "available-matches-memory-size");
585     getOptionValue(options.matchHistoStartThreshold, parser, "match-histo-start-threshold");
586 #else
587     options.threadCount = 0;
588 #endif
589 
590 #ifdef RAZERS_OPENADDRESSING
591     getOptionValue(options.loadFactor, parser, "load-factor");
592 #endif
593     getOptionValue(options.trimLength, parser, "trim-reads");
594     getOptionValue(options.tabooLength, parser, "taboo-length");
595     getOptionValue(options.matchN, parser, "match-N");
596     getOptionValue(options.errorPrbFileName, parser, "error-distr");
597     getOptionValue(options.mismatchFilename, parser, "mismatch-file");
598 
599     if (isSet(parser, "verbose"))
600         options._debugLevel = max(options._debugLevel, 1);
601     if (isSet(parser, "vverbose"))
602         options._debugLevel = max(options._debugLevel, 3);
603     if (isSet(parser, "unique"))
604     {
605         options.maxHits = 1;
606         options.scoreDistanceRange = 1;
607         options.purgeAmbiguous = true;
608     }
609     if (!options.forward && !options.reverse)  // enable both per default
610     {
611         options.forward = true;
612         options.reverse = true;
613     }
614 
615 #ifdef RAZERS_MATEPAIRS
616     unsigned maxReadFiles = 2;
617 #else
618     unsigned maxReadFiles = 1;
619 #endif
620     resize(genomeFileNames, length(genomeFileNames) + 1);
621     getArgumentValue(back(genomeFileNames), parser, 0);
622     resize(readFileNames, _min(getArgumentValueCount(parser, 1), maxReadFiles), Exact());
623     for (unsigned i = 0; i < length(readFileNames); ++i)
624         getArgumentValue(readFileNames[i], parser, 1, i);
625 
626     // Get output file name from command line if set.  Otherwise, autogenerate from input file name.
627     if (isSet(parser, "output"))
628     {
629         getOptionValue(options.output, parser, "output");
630     }
631     else
632     {
633         options.output = readFileNames[0];
634         append(options.output, ".razers");
635     }
636 
637     // Get lower case of the output file name.  File endings are accepted in both upper and lower case.
638     CharString tmp = options.output;
639     toLower(tmp);
640 
641     if (endsWith(tmp, ".fa") || endsWith(tmp, ".fasta"))
642         options.outputFormat = 1;
643     else if (endsWith(tmp, ".eland"))
644         options.outputFormat = 2;
645     else if (endsWith(tmp, ".gff"))
646         options.outputFormat = 3;
647     else if (endsWith(tmp, ".sam") || endsWith(tmp, ".bam"))
648         options.outputFormat = 4;
649     else if (endsWith(tmp, ".afg"))
650         options.outputFormat = 5;
651     else
652         options.outputFormat = 0;   // default is ".razers"
653 
654     // don't append /L/R in SAM mode
655     if (!isSet(parser, "read-naming") && options.outputFormat == 4)
656         options.readNaming = 3;
657 
658     CharString filter;
659     getOptionValue(filter, parser, "filter");
660     if (filter == "pigeonhole")
661     {
662         if (isSet(parser, "threshold") && (stop = true))
663             cerr << "k-mer threshold can only be set for the swift filter (-fl swift)" << endl;
664         options.threshold = 0;
665     }
666     else
667     {
668         getOptionValue(options.threshold, parser, "threshold");
669         if (isSet(parser, "overlap-length") && (stop = true))
670             cerr << "k-mer overlap length can only be set for the pigeonhole filter (-fl pigeonhole)" << endl;
671     }
672 
673     if (isSet(parser, "shape"))
674     {
675         unsigned ones = 0;
676         unsigned zeros = 0;
677         for (unsigned i = 0; i < length(options.shape); ++i)
678             switch (options.shape[i])
679             {
680             case '0':
681                 ++zeros;
682                 break;
683 
684             case '1':
685                 ++ones;
686                 break;
687 
688             default:
689                 cerr << "Shape must be a binary string" << endl;
690                 stop = true;
691                 i = length(options.shape);
692             }
693         if ((ones == 0 || ones > 31) && !stop)
694         {
695             cerr << "Invalid Shape" << endl;
696             stop = true;
697         }
698 
699         unsigned maxOnes = 14;
700 #ifdef RAZERS_OPENADDRESSING
701         maxOnes = 31;
702 #endif
703         if ((ones < 7 || ones > maxOnes) && !stop)
704             cerr << "Warning: Shape should contain at least 7 and at most " << maxOnes << " '1's" << endl;
705         options.delta = ones + zeros;
706     }
707     if (getArgumentValueCount(parser, 1) == 1)
708         options.libraryLength = -1;     // only 1 readset -> disable mate-pair mapping
709     if ((getArgumentValueCount(parser, 1) > maxReadFiles) && (stop = true))
710         cerr << "More than " << maxReadFiles << " read files specified." << endl;
711     if ((getArgumentValueCount(parser, 1) == 0) && (stop = true))
712         cerr << "No read files specified." << endl;
713 
714     options.compMask[4] = (options.matchN) ? 15 : 0;
715     options.errorRate = (100.0 - options.errorRate) / 100.0;
716     options.mutationRate = options.mutationRate / 100.0;
717     options.lossRate = pm_options.optionLossRate = (100.0 - pm_options.optionLossRate) / 100.0;
718 
719     return (stop) ? ArgumentParser::PARSE_ERROR : ArgumentParser::PARSE_OK;
720 }
721 
722 //////////////////////////////////////////////////////////////////////////////
723 // Command line parsing and parameter choosing
main(int argc,const char * argv[])724 int main(int argc, const char * argv[])
725 {
726     //whichMacros();
727 
728 #ifdef RAZERS_PROFILE
729     initTimeline();
730     unsigned x = timelineAddTaskType("ON_CONTIG", "Work on contig.");
731     (void)x;  // Disable warning if assertions are disable.
732     SEQAN_ASSERT_EQ(x, 1u);  // The following will be OK, too.
733     timelineAddTaskType("INIT", "Initialization.");
734     timelineAddTaskType("REVCOMP", "Reverse-complementing contig.");
735     timelineAddTaskType("FILTER", "Filtration using SWIFT.");
736     timelineAddTaskType("VERIFY", "Verification of SWIFT hits.");
737     timelineAddTaskType("WRITEBACK", "Write back to block-local store.");
738     timelineAddTaskType("COMPACT", "Compaction");
739     timelineAddTaskType("DUMP_MATCHES", "Dump matches.");
740     timelineAddTaskType("LOAD", "Load input.");
741     timelineAddTaskType("SORT", "Sorting.");
742     timelineAddTaskType("COPY_FINDER", "Copy SWIFT Finder.");
743 #endif  // #ifndef RAZERS_PROFILE
744 
745     RazerSOptions<>         options;
746     ParamChooserOptions     pm_options;
747     StringSet<CharString>   genomeFileNames;
748     StringSet<CharString>   readFileNames;
749 
750     // Change defaults
751     options.forward = false;
752     options.reverse = false;
753 
754     // Set up command line parser.
755     ArgumentParser argParser;
756     setUpArgumentParser(argParser, options, pm_options);
757 
758     // Parse command line.
759     ArgumentParser::ParseResult res = parse(argParser, argc, argv);
760     if (res != ArgumentParser::PARSE_OK)
761     {
762         if (res == ArgumentParser::PARSE_ERROR)
763             cerr << "Exiting ..." << endl;
764         return (res == ArgumentParser::PARSE_ERROR) ? RAZERS_INVALID_OPTIONS : 0;
765     }
766     // Extract options.
767     res = extractOptions(genomeFileNames, readFileNames, options, pm_options, argParser);
768     if (res != ArgumentParser::PARSE_OK)
769     {
770         cerr << "Exiting ..." << endl;
771         return RAZERS_INVALID_OPTIONS;
772     }
773 
774 #ifdef _OPENMP
775     // Set maximal number of threads.
776     int oldMaxThreads = omp_get_max_threads();
777     omp_set_num_threads(options.threadCount == 0 ? 1 : options.threadCount);
778 #endif  // #ifdef _OPENMP
779 
780 	//////////////////////////////////////////////////////////////////////////////
781 	// open left reads file
782 
783     bool success;
784     if (!isEqual(readFileNames[0], "-"))
785         success = open(options.readFile, toCString(readFileNames[0]));
786     else
787         success = open(options.readFile, std::cin);
788 
789     if (!success)
790         return RAZERS_READS_FAILED;
791 
792     //////////////////////////////////////////////////////////////////////////////
793     // get read length
794     int readLength = estimateReadLength(options.readFile);
795     if (readLength == RAZERS_READS_FAILED)
796     {
797         cerr << "Failed to open reads file " << readFileNames[0] << endl;
798         cerr << "Exiting ..." << endl;
799         return RAZERS_READS_FAILED;
800     }
801     if (readLength == 0)
802     {
803         cerr << "Failed to read the first read sequence." << endl;
804         cerr << "Exiting ..." << endl;
805         return RAZERS_READS_FAILED;
806     }
807 
808     if (options.trimLength > readLength)
809         options.trimLength = readLength;
810 
811 #ifndef NO_PARAM_CHOOSER
812     if (options.threshold != 0 && !(isSet(argParser, "shape") || isSet(argParser, "threshold")))
813     {
814         if (options.lowMemory)
815             pm_options.maxWeight = 13;
816         pm_options.verbose = (options._debugLevel >= 1);
817         pm_options.optionErrorRate = options.errorRate;
818         if (options.gapMode == RAZERS_UNGAPPED)
819         {
820             pm_options.optionProbINSERT = (ParamChooserOptions::TFloat)0.0;
821             pm_options.optionProbDELETE = (ParamChooserOptions::TFloat)0.0;
822         }
823         else
824         {
825             pm_options.optionProbINSERT = (ParamChooserOptions::TFloat)0.01;    //this number is basically without meaning, any value > 0 will lead to
826             pm_options.optionProbDELETE = (ParamChooserOptions::TFloat)0.01;    //edit distance parameter choosing
827         }
828 
829         if (options.trimLength > 0)
830             readLength = options.trimLength;
831         if (readLength > 0)
832         {
833 /*			if(options.maqMapping && readLength != options.artSeedLength)
834                 pm_options.totalN = options.artSeedLength;
835             else*/
836             pm_options.totalN = readLength;
837             if (options._debugLevel >= 1)
838                 cerr << "___PARAMETER_CHOOSING__" << endl;
839             if (!chooseParams(options, pm_options))
840             {
841                 if (pm_options.verbose)
842                     cerr << "Couldn't find preprocessed parameter files. Please configure manually (options --shape and --threshold)." << endl;
843                 if (options._debugLevel >= 1)
844                     cerr << "Using default configurations (shape = " << options.shape << " and k-mer lemma)." << endl;
845             }
846             if (options._debugLevel >= 1)
847                 cerr << endl;
848         }
849         else
850         {
851             cerr << "Failed to load reads" << endl;
852             cerr << "Exiting ..." << endl;
853             return RAZERS_READS_FAILED;
854         }
855     }
856 #endif
857 
858     if (argc > 1)
859         options.commandLine = argv[1];
860     for (int i = 2; i < argc; ++i)
861     {
862         appendValue(options.commandLine, ' ');
863         append(options.commandLine, argv[i]);
864     }
865 
866     int result = mapReads(genomeFileNames, readFileNames, options);
867     if (result != 0)
868         cerr << "Exiting ..." << endl;
869 
870 #ifdef _OPENMP
871     // Restoring number of threads for side-effect freeness.
872     omp_set_num_threads(oldMaxThreads);
873 #endif  // #ifdef _OPENMP
874 
875 #ifdef RAZERS_PROFILE
876     dumpTimeline("razers.profile.txt", true);
877 #endif  // #ifndef RAZERS_PROFILE
878 
879     return result;
880 }
881