1 /**
2 * Close intra-scaffold gaps
3 * Copyright 2014 Canada's Michael Smith Genome Science Centre
4 */
5
6 #include "config.h"
7
8 #include "Konnector/konnector.h"
9 #include "Konnector/DBGBloom.h"
10 #include "Konnector/DBGBloomAlgorithms.h"
11 #include "Bloom/CascadingBloomFilter.h"
12
13 #include "Align/alignGlobal.h"
14 #include "Common/IOUtil.h"
15 #include "Common/Options.h"
16 #include "Common/StringUtil.h"
17 #include "DataLayer/FastaConcat.h"
18 #include "DataLayer/Options.h"
19 #include "Graph/DotIO.h"
20 #include "Graph/Options.h"
21 #include "Graph/GraphUtil.h"
22
23 #include <cassert>
24 #include <getopt.h>
25 #include <iostream>
26 #include <cstring>
27
28 #include <string>
29 #include <algorithm>
30 #include <fstream>
31 #include <sstream>
32 #include <map>
33
34
35 #if _OPENMP
36 # include <omp.h>
37 # include "Bloom/ConcurrentBloomFilter.h"
38 #endif
39
40 #undef USESEQAN
41
42 #if USESEQAN
43 #include <seqan/align.h>
44 #include <seqan/sequence.h>
45 #include <seqan/align_split.h>
46 #endif
47
48 using namespace std;
49 using Konnector::BloomFilter;
50 #if USESEQAN
51 using namespace seqan;
52 #endif
53
54 #define PROGRAM "abyss-sealer"
55
56 static const char VERSION_MESSAGE[] =
57 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
58 "Written by Shaun Jackman, Hamid Mohamadi, Anthony Raymond,\n"
59 "Ben Vandervalk and Daniel Paulino\n"
60 "\n"
61 "Copyright 2014 Canada's Michael Smith Genome Science Centre\n";
62
63 static const char USAGE_MESSAGE[] =
64 "Usage: " PROGRAM "-b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
65 "i.e. abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
66 "Close gaps by using left and right flanking sequences of gaps as 'reads' for Konnector\n"
67 "and performing multiple runs with each of the supplied K values.\n"
68 "\n"
69 " Options:\n"
70 "\n"
71 " --print-flanks outputs flank files\n"
72 " -S, --input-scaffold=FILE load scaffold from FILE\n"
73 " -L, --flank-length=N length of flanks to be used as pseudoreads [100]\n"
74 " -G, --max-gap-length=N max gap size to fill in bp [800]; runtime increases\n"
75 " exponentially with respect to this parameter\n"
76 " -j, --threads=N use N parallel threads [1]\n"
77 " -k, --kmer=N the size of a k-mer\n"
78 " -b, --bloom-size=N size of Bloom filter (e.g. '40G'). Required\n"
79 " when not using pre-built Bloom filter(s)\n"
80 " (-i option)\n"
81 " -d, --dot-file=FILE write graph traversals to a DOT file\n"
82 " -e, --fix-errors find and fix single-base errors when reads\n"
83 " have no kmers in bloom filter [disabled]\n"
84 " -C, --max-cost=N max edges to traverse during each graph search [100000]\n"
85 " -i, --input-bloom=FILE load bloom filter from FILE\n"
86 " --mask mask new and changed bases as lower case\n"
87 " --no-mask do not mask bases [default]\n"
88 " --chastity discard unchaste reads [default]\n"
89 " --no-chastity do not discard unchaste reads\n"
90 " --trim-masked trim masked bases from the ends of reads\n"
91 " --no-trim-masked do not trim masked bases from the ends\n"
92 " of reads [default]\n"
93 " -m, --flank-mismatches=N max mismatches between paths and flanks; use\n"
94 " 'nolimit' for no limit [nolimit]\n"
95 " -M, --max-mismatches=N max mismatches between all alternate paths;\n"
96 " use 'nolimit' for no limit [nolimit]\n"
97 " -n --no-limits disable all limits; equivalent to\n"
98 " '-B nolimit -m nolimit -M nolimit -P nolimit'\n"
99 " -o, --output-prefix=FILE prefix of output FASTA files [required]\n"
100 " -P, --max-paths=N merge at most N alternate paths; use 'nolimit'\n"
101 " for no limit [2]\n"
102 " -q, --trim-quality=N trim bases from the ends of reads whose\n"
103 " quality is less than the threshold\n"
104 " --standard-quality zero quality is `!' (33)\n"
105 " default for FASTQ and SAM files\n"
106 " --illumina-quality zero quality is `@' (64)\n"
107 " default for qseq and export files\n"
108 " -r, --read-name=STR only process reads with names that contain STR\n"
109 " -s, --search-mem=N mem limit for graph searches; multiply by the\n"
110 " number of threads (-j) to get the total mem used\n"
111 " for graph traversal [500M]\n"
112 " -g, --gap-file=FILE write sealed gaps to FILE\n"
113 " -t, --trace-file=FILE write graph search stats to FILE\n"
114 " -v, --verbose display verbose output\n"
115 " --help display this help and exit\n"
116 " --version output version information and exit\n"
117 "\n"
118 " Deprecated Options:\n"
119 "\n"
120 " -B, --max-branches=N max branches in de Bruijn graph traversal;\n"
121 " use 'nolimit' for no limit [nolimit]\n"
122 " -f, --min-frag=N min fragment size in base pairs\n"
123 " -F, --max-frag=N max fragment size in base pairs\n"
124 "\n"
125 " Note 1: --max-branches was not effective for truncating expensive searches,\n"
126 " and has been superseded by the --max-cost option.\n"
127 "\n"
128 " Note 2: --max-frag was formerly used to determine the maximum gap\n"
129 " size that abyss-sealer would attempt to close, according to the formula\n"
130 " max_gap_size = max_frag - 2 * flank_length, where flank_length is\n"
131 " determined by the -L option. --max-frag is superseded by the more\n"
132 " intuitive -G (--max-gap-length) option. The related option --min-frag\n"
133 " does not seem to have any practical use.\n"
134 "\n"
135 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
136
137 namespace opt {
138
139 /** Length of flank. */
140 int flankLength = 100;
141
142 /** Max gap size to fill */
143 unsigned maxGapLength = 800;
144
145 /** scaffold file input. */
146 static string inputScaffold;
147
148 /** The number of parallel threads. */
149 static unsigned threads = 1;
150
151 /** The size of the bloom filter in bytes. */
152 size_t bloomSize = 0;
153
154 /** The maximum count value of the BLoom filter. */
155 unsigned max_count = 2;
156
157 /** Input read files are interleaved? */
158 bool interleaved = false;
159
160 /** Max active branches during de Bruijn graph traversal */
161 unsigned maxBranches = NO_LIMIT;
162
163 /**
164 * Max cost for a connecting path search. Searches that
165 * exceed this cost limit will be aborted and the gap
166 * will remain unfilled.
167 */
168 unsigned maxCost = 100000;
169
170 /** multi-graph DOT file containing graph traversals */
171 static string dotPath;
172
173 /**
174 * Find and fix single base errors when a read has no
175 * kmers in the bloom filter.
176 */
177 bool fixErrors = false;
178
179 /** The size of a k-mer. */
180 unsigned k;
181
182 /** Vector of kmers. */
183 vector<unsigned> kvector;
184
185 /** Vector of Bloom filter paths corresponding to kmer values */
186 vector<string> bloomFilterPaths;
187
188 /** The minimum fragment size */
189 unsigned minFrag = 0;
190
191 /** The maximum fragment size */
192 unsigned maxFrag = 0;
193
194 /** Bloom filter input file */
195 static string inputBloomPath;
196
197 /** Max paths between left and right flanking sequences */
198 unsigned maxPaths = 2;
199
200 /** Prefix for output files */
201 static string outputPrefix;
202
203 /** Max mismatches allowed when building consensus seqs */
204 unsigned maxMismatches = NO_LIMIT;
205
206 /** Only process flanks that contain this substring. */
207 static string readName;
208
209 /** Max mem used per thread during graph traversal */
210 static size_t searchMem = 500 * 1024 * 1024;
211
212 /** Output file for graph search stats */
213 static string tracefilePath;
214
215 /** Output file for sealed gaps */
216 static string gapfilePath;
217
218 /** Mask bases not in flanks */
219 static int mask = 0;
220
221 /** Max mismatches between consensus and flanks */
222 static unsigned maxFlankMismatches = NO_LIMIT;
223
224 /** Output flanks files */
225 static int printFlanks = 0;
226
227 /** Output detailed stats */
228 static int detailedStats = 0;
229 }
230
231 /** Counters */
232
233 struct Counters {
234 size_t noStartOrGoalKmer;
235 size_t noPath;
236 size_t uniquePath;
237 size_t multiplePaths;
238 size_t tooManyPaths;
239 size_t tooManyBranches;
240 size_t tooManyMismatches;
241 size_t tooManyReadMismatches;
242 size_t containsCycle;
243 size_t maxCostExceeded;
244 size_t exceededMemLimit;
245 size_t traversalMemExceeded;
246 size_t readPairsProcessed;
247 size_t readPairsMerged;
248 size_t skipped;
249 };
250
251 static const char shortopts[] = "S:L:b:B:C:d:ef:F:G:g:i:Ij:k:lm:M:no:P:q:r:s:t:v";
252
253 enum { OPT_HELP = 1, OPT_VERSION };
254
255 static const struct option longopts[] = {
256 { "detailed-stats", no_argument, &opt::detailedStats, 1},
257 { "print-flanks", no_argument, &opt::printFlanks, 1},
258 { "input-scaffold", required_argument, NULL, 'S' },
259 { "flank-length", required_argument, NULL, 'L' },
260 { "max-gap-length", required_argument, NULL, 'G' },
261 { "bloom-size", required_argument, NULL, 'b' },
262 { "max-branches", required_argument, NULL, 'B' },
263 { "max-cost", required_argument, NULL, 'C' },
264 { "dot-file", required_argument, NULL, 'd' },
265 { "fix-errors", no_argument, NULL, 'e' },
266 { "min-frag", required_argument, NULL, 'f' },
267 { "max-frag", required_argument, NULL, 'F' },
268 { "input-bloom", required_argument, NULL, 'i' },
269 { "interleaved", no_argument, NULL, 'I' },
270 { "threads", required_argument, NULL, 'j' },
271 { "kmer", required_argument, NULL, 'k' },
272 { "chastity", no_argument, &opt::chastityFilter, 1 },
273 { "no-chastity", no_argument, &opt::chastityFilter, 0 },
274 { "mask", no_argument, &opt::mask, 1 },
275 { "no-mask", no_argument, &opt::mask, 0 },
276 { "no-limits", no_argument, NULL, 'n' },
277 { "trim-masked", no_argument, &opt::trimMasked, 1 },
278 { "no-trim-masked", no_argument, &opt::trimMasked, 0 },
279 { "output-prefix", required_argument, NULL, 'o' },
280 { "flank-mismatches", required_argument, NULL, 'm' },
281 { "max-mismatches", required_argument, NULL, 'M' },
282 { "max-paths", required_argument, NULL, 'P' },
283 { "trim-quality", required_argument, NULL, 'q' },
284 { "standard-quality", no_argument, &opt::qualityOffset, 33 },
285 { "illumina-quality", no_argument, &opt::qualityOffset, 64 },
286 { "read-name", required_argument, NULL, 'r' },
287 { "search-mem", required_argument, NULL, 's' },
288 { "trace-file", required_argument, NULL, 't' },
289 { "gap-file", required_argument, NULL, 'g' },
290 { "verbose", no_argument, NULL, 'v' },
291 { "help", no_argument, NULL, OPT_HELP },
292 { "version", no_argument, NULL, OPT_VERSION },
293 { NULL, 0, NULL, 0 }
294 };
295
296 /** The coordinates of a feature. */
297 struct Coord
298 {
299 int start;
300 int end;
301
CoordCoord302 Coord() { }
CoordCoord303 Coord(int start, int end) : start(start), end(end) { }
304
305 /** Return the size of this feature. */
sizeCoord306 int size() const { return end - start; }
307 };
308
309 /** The coordinates of a gap and its flanking scaftigs. */
310 struct Gap
311 {
312 /** The left flanking scaftig. */
313 Coord left;
314
315 /** The right flanking scaftig. */
316 Coord right;
317
GapGap318 Gap() { }
319
GapGap320 Gap(int leftStart, int leftEnd, int rightStart, int rightEnd)
321 : left(leftStart, leftEnd), right(rightStart, rightEnd) { }
322
323 /** The start of the gap. */
gapStartGap324 int gapStart() const { return left.end; }
325
326 /** The end of the gap. */
gapEndGap327 int gapEnd() const { return right.start; }
328
329 /** The total size of the flanks plus the gap. */
totalSizeGap330 int totalSize() const { return right.end - left.start; }
331
332 /** The size of the original gap. */
gapSizeGap333 int gapSize() const { return gapEnd() - gapStart(); }
334
335 /** The size of the flanks. */
flankSizeGap336 int flankSize() const { return left.size() + right.size(); }
337 };
338
339 /** The coordinates of a gap, its flanking scaftigs, and the sequence that
340 * fills it.
341 */
342 struct ClosedGap : Gap
343 {
344 /** The sequence that fills the gap. */
345 std::string seq;
346
ClosedGapClosedGap347 ClosedGap() { }
348
ClosedGapClosedGap349 ClosedGap(Gap gap, std::string seq) : Gap(gap), seq(seq) { }
350 };
351
352 #if USESEQAN
353 const string r1 =
354 "AGAATCAACCAACCGTTCAATGATATAATCAAGAGCGATATTGTAATCTTTGTTTCT";
355 const string r2 =
356 "CGACGTCCACCAATTCGTCCCTGTGCACGAGCAGTTTCCAGTCCAGCTTTTGTTCGT";
357 const string ins =
358 "AGAATCAACCAACCGTTCAATGATATAATCAAGAGCGATATTGTAATCTTTGTTTCTGTCACCCGGCCCCCACGACTCAAGGATTAGACCATAAACACCATCCTCTTCACCTATCGAACACTCAGCTTTCAGTTCAATTCCATTATTATCAAAAACATGCATAATATTAATCTTTAATCAATTTTTCACGACAATACTACTTTTATTGATAAAATTGCAACAAGTTGCTGTTGTTTTACTTTCTTTTGTACACAAAGTGTCTTTAACTTTATTTATCCCCTGCAGGAAACCTCTTATACAAAGTTGACACACCAACATCATAGATAATCGCCACCTTCTGGCGAGGAGTTCCTGCTGCAATTAATCGTCCAGCTTGTGCCCATTGTTCTGGTGTAAGTTTGGGACGACGTCCACCAATTCGTCCCTGTGCACGAGCAGTTTCCAGTCCAGCTTTTGTTCGT";
359
seqanTests()360 static void seqanTests()
361 {
362 typedef String<Dna> DS;
363 typedef Align<DS> Alignment;
364
365 //DS seq1 = "TTGT";
366 //DS seq2 = "TTAGT";
367 DS ref = ins;
368 DS seq1 = r1;
369 DS seq2 = r2;
370
371 Alignment align1;
372 resize(rows(align1), 2);
373 assignSource(row(align1, 0), ref);
374 assignSource(row(align1, 1), seq1);
375 Alignment align2;
376 resize(rows(align2), 2);
377 assignSource(row(align2, 0), ref);
378 assignSource(row(align2, 1), seq2);
379
380 Score<int> scoring(2, -2, -50, -100);
381
382 cout << splitAlignment(align1, align2, scoring) << endl;
383 cout << align1 << endl;
384 cout << align2 << endl;
385
386 cout << localAlignment(align1, scoring) << endl;
387 cout << align1 << endl;
388
389 cout << localAlignment(align2, scoring) << endl;
390 cout << align2 << endl;
391 }
392 #endif
393
394 /**
395 * Set the value for a commandline option, using "nolimit"
396 * to represent NO_LIMIT.
397 */
setMaxOption(unsigned & arg,istream & in)398 static inline void setMaxOption(unsigned& arg, istream& in)
399 {
400 string str;
401 getline(in, str);
402 if (in && str == "nolimit") {
403 arg = NO_LIMIT;
404 } else {
405 istringstream ss(str);
406 ss >> arg;
407 // copy state bits (fail, bad, eof) to
408 // original stream
409 in.clear(ss.rdstate());
410 }
411 }
412
sizetToString(size_t a)413 string sizetToString (size_t a)
414 {
415 ostringstream temp;
416 temp << a;
417 return temp.str();
418 }
419
IntToString(int a)420 string IntToString (int a)
421 {
422 ostringstream temp;
423 temp<<a;
424 return temp.str();
425 }
426
427 // returns merged sequence resulting from Konnector
428 template <typename Graph>
merge(const Graph & g,unsigned k,const Gap & gap,const FastaRecord & read1,const FastaRecord & read2,const ConnectPairsParams & params,Counters & g_count,ofstream & traceStream)429 string merge(const Graph& g,
430 unsigned k,
431 const Gap& gap,
432 const FastaRecord &read1,
433 const FastaRecord &read2,
434 const ConnectPairsParams& params,
435 Counters& g_count,
436 ofstream& traceStream)
437 {
438 ConnectPairsResult result = connectPairs(k, read1, read2, g, params);
439 ostringstream ss;
440 ss << result.readNamePrefix << '_' << gap.gapStart() << '_' << gap.gapSize();
441 result.readNamePrefix = ss.str();
442
443 if (!opt::tracefilePath.empty())
444 #pragma omp critical(tracefile)
445 {
446 traceStream << result;
447 assert_good(traceStream, opt::tracefilePath);
448 }
449
450 vector<FastaRecord>& paths = result.mergedSeqs;
451 switch (result.pathResult) {
452
453 case NO_PATH:
454 assert(paths.empty());
455 if (result.foundStartKmer && result.foundGoalKmer)
456 #pragma omp atomic
457 ++g_count.noPath;
458 else {
459 #pragma omp atomic
460 ++g_count.noStartOrGoalKmer;
461 }
462 break;
463
464 case FOUND_PATH:
465 assert(!paths.empty());
466 if (result.pathMismatches > params.maxPathMismatches ||
467 result.readMismatches > params.maxReadMismatches) {
468 if (result.pathMismatches > params.maxPathMismatches)
469 #pragma omp atomic
470 ++g_count.tooManyMismatches;
471 else
472 #pragma omp atomic
473 ++g_count.tooManyReadMismatches;
474 }
475 else if (paths.size() > 1) {
476 #pragma omp atomic
477 ++g_count.multiplePaths;
478 }
479 else {
480 #pragma omp atomic
481 ++g_count.uniquePath;
482 }
483 break;
484
485 case TOO_MANY_PATHS:
486 #pragma omp atomic
487 ++g_count.tooManyPaths;
488 break;
489
490 case TOO_MANY_BRANCHES:
491 #pragma omp atomic
492 ++g_count.tooManyBranches;
493 break;
494
495 case PATH_CONTAINS_CYCLE:
496 #pragma omp atomic
497 ++g_count.containsCycle;
498 break;
499
500 case MAX_COST_EXCEEDED:
501 #pragma omp atomic
502 ++g_count.maxCostExceeded;
503 break;
504
505 case EXCEEDED_MEM_LIMIT:
506 #pragma omp atomic
507 ++g_count.exceededMemLimit;
508 break;
509 }
510
511 if (result.pathResult == FOUND_PATH) {
512 if (result.pathMismatches > params.maxPathMismatches)
513 return "";
514 else if (result.mergedSeqs.size() > 1)
515 return result.consensusSeq;
516 else
517 return result.mergedSeqs.front();
518 }
519 else
520 return "";
521 }
522
printLog(ofstream & logStream,const string & output)523 void printLog(ofstream &logStream, const string &output) {
524 #pragma omp critical(logStream)
525 logStream << output;
526 if (opt::verbose > 0)
527 #pragma omp critical(cerr)
528 cerr << output;
529 }
530
insertIntoScaffold(ofstream & scaffoldStream,ofstream & mergedStream,FastaRecord & record,map<string,map<int,ClosedGap>> & allmerged,unsigned & gapsclosedfinal)531 void insertIntoScaffold(ofstream &scaffoldStream,
532 ofstream &mergedStream,
533 FastaRecord &record,
534 map<string, map<int, ClosedGap> > &allmerged,
535 unsigned &gapsclosedfinal)
536 {
537 map<string, map<int, ClosedGap> >::iterator scaf_it;
538 map<int, ClosedGap>::reverse_iterator pos_it;
539
540 scaf_it = allmerged.find(record.id);
541 scaffoldStream << ">" << record.id << " " << record.comment << endl;
542 string modifiedSeq = record.seq;
543 if (scaf_it != allmerged.end()) {
544 for (pos_it = allmerged[record.id].rbegin();
545 pos_it != allmerged[record.id].rend();
546 pos_it++)
547 {
548 const ClosedGap& closedGap = pos_it->second;
549 int newseqsize = pos_it->second.seq.length() - closedGap.flankSize();
550 modifiedSeq.replace(
551 closedGap.left.start,
552 closedGap.totalSize(),
553 closedGap.seq
554 );
555 mergedStream << ">" << record.id << "_" << pos_it->first << "_" << newseqsize << endl;
556 mergedStream << pos_it->second.seq << endl;
557 gapsclosedfinal++;
558 }
559 scaffoldStream << modifiedSeq << endl;
560 }
561 else {
562 scaffoldStream << record.seq << endl;
563 }
564
565 }
566
567 std::pair<FastaRecord, FastaRecord>
makePseudoReads(std::string seq,FastaRecord record,const Gap & gap)568 makePseudoReads(std::string seq, FastaRecord record, const Gap& gap)
569 {
570 std::pair<FastaRecord, FastaRecord> scaftigs;
571 // extract left flank
572 std::string leftflank = seq.substr(gap.left.start, gap.left.size());
573 std::transform(leftflank.begin(), leftflank.end(), leftflank.begin(), ::toupper);
574 scaftigs.first.seq = leftflank;
575 scaftigs.first.id = record.id + "/1";
576
577 // extract right flank
578 scaftigs.second.id = record.id + "/2";
579 std::string rightflank = seq.substr(gap.right.start, gap.right.size());
580 std::transform(rightflank.begin(), rightflank.end(), rightflank.begin(), ::toupper);
581 scaftigs.second.seq = reverseComplement(rightflank);
582
583 return scaftigs;
584 }
585
586 template <typename Graph>
kRun(const ConnectPairsParams & params,unsigned k,const Graph & g,map<string,map<int,ClosedGap>> & allmerged,map<FastaRecord,map<FastaRecord,Gap>> & flanks,unsigned & gapsclosed,ofstream & logStream,ofstream & traceStream,ofstream & gapStream)587 void kRun(const ConnectPairsParams& params,
588 unsigned k,
589 const Graph& g,
590 map<string, map<int, ClosedGap> > &allmerged,
591 map<FastaRecord, map<FastaRecord, Gap> > &flanks,
592 unsigned &gapsclosed,
593 ofstream &logStream,
594 ofstream &traceStream,
595 ofstream &gapStream)
596 {
597 map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
598 map<FastaRecord, Gap>::iterator read2_it;
599 unsigned uniqueGapsClosed = 0;
600
601 Counters g_count;
602 g_count.noStartOrGoalKmer = 0;
603 g_count.noPath = 0;
604 g_count.uniquePath = 0;
605 g_count.multiplePaths = 0;
606 g_count.tooManyPaths = 0;
607 g_count.tooManyBranches = 0;
608 g_count.tooManyMismatches = 0;
609 g_count.tooManyReadMismatches = 0;
610 g_count.containsCycle = 0;
611 g_count.maxCostExceeded = 0;
612 g_count.exceededMemLimit = 0;
613 g_count.traversalMemExceeded = 0;
614 g_count.readPairsProcessed = 0;
615 g_count.readPairsMerged = 0;
616 g_count.skipped = 0;
617
618 printLog(logStream, "Flanks inserted into k run = " + IntToString(flanks.size()) + "\n");
619
620 int counter = 0;
621 vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator> flanks_closed;
622 #pragma omp parallel private(read1_it, read2_it) firstprivate(counter)
623 for (read1_it = flanks.begin(); read1_it != flanks.end(); ++read1_it) {
624 FastaRecord read1 = read1_it->first;
625 bool success = false;
626 for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); ++read2_it, ++counter) {
627 #if _OPENMP
628 if (counter % omp_get_num_threads() != omp_get_thread_num())
629 continue;
630 #endif
631 FastaRecord read2 = read2_it->first;
632
633 int startposition = read2_it->second.gapStart();
634 string tempSeq = merge(g, k, read2_it->second, read1, read2, params, g_count, traceStream);
635 if (!tempSeq.empty()) {
636 success = true;
637 #pragma omp critical (allmerged)
638 allmerged[read1.id.substr(0,read1.id.length()-2)][startposition]
639 = ClosedGap(read2_it->second, tempSeq);
640 #pragma omp atomic
641 ++uniqueGapsClosed;
642 #pragma omp critical (gapsclosed)
643 if (++gapsclosed % 100 == 0)
644 printLog(logStream, IntToString(gapsclosed) + " gaps closed so far\n");
645
646 if (!opt::gapfilePath.empty())
647 #pragma omp critical (gapStream)
648 gapStream << ">" << read1.id.substr(0,read1.id.length()-2)
649 << "_" << read2_it->second.gapStart() << "-" << read2_it->second.gapEnd()
650 << " LN:i:" << tempSeq.length() << '\n'
651 << tempSeq << '\n';
652 }
653 }
654 if (success) {
655 #pragma omp critical (flanks_closed)
656 flanks_closed.push_back(read1_it);
657 }
658 }
659
660 for (vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator>::iterator it = flanks_closed.begin();
661 it != flanks_closed.end(); ++it) {
662 if (flanks.count((*it)->first) > 0)
663 flanks.erase(*it);
664 }
665
666 printLog(logStream, IntToString(uniqueGapsClosed) + " unique gaps closed for k" + IntToString(k) + "\n");
667
668 printLog(logStream, "No start/goal kmer: "
669 + sizetToString(g_count.noStartOrGoalKmer) + "\n");
670 printLog(logStream, "No path: "
671 + sizetToString(g_count.noPath) + "\n");
672 printLog(logStream, "Unique path: "
673 + sizetToString(g_count.uniquePath) + "\n");
674 printLog(logStream, "Multiple paths: "
675 + sizetToString(g_count.multiplePaths) + "\n");
676 printLog(logStream, "Too many paths: "
677 + sizetToString(g_count.tooManyPaths) + "\n");
678 printLog(logStream, "Too many branches: "
679 + sizetToString(g_count.tooManyBranches) + "\n");
680 printLog(logStream, "Too many path/path mismatches: "
681 + sizetToString(g_count.tooManyMismatches) + "\n");
682 printLog(logStream, "Too many path/read mismatches: "
683 + sizetToString(g_count.tooManyReadMismatches) + "\n");
684 printLog(logStream, "Contains cycle: "
685 + sizetToString(g_count.containsCycle) + "\n");
686 printLog(logStream, "Max cost exceeded: "
687 + sizetToString(g_count.maxCostExceeded) + "\n");
688 printLog(logStream, "Exceeded mem limit: "
689 + sizetToString(g_count.exceededMemLimit) + "\n");
690 printLog(logStream, "Skipped: "
691 + sizetToString(g_count.skipped) + "\n");
692
693 printLog(logStream, IntToString(flanks.size()) + " flanks left\n");
694
695 }
696
operator <(const FastaRecord & a,const FastaRecord & b)697 bool operator<(const FastaRecord& a, const FastaRecord& b)
698 {
699 if (a.id == b.id)
700 return a.seq < b.seq;
701 else
702 return a.id < b.id;
703 }
704
findFlanks(FastaRecord & record,int flanklength,unsigned & gapnumber,map<FastaRecord,map<FastaRecord,Gap>> & flanks)705 void findFlanks(FastaRecord &record,
706 int flanklength,
707 unsigned &gapnumber,
708 map<FastaRecord, map<FastaRecord, Gap> > &flanks)
709 {
710 const string& seq = record.seq;
711
712 // Iterate over the gaps.
713 for (size_t offset = 0;
714 seq.string::find_first_of("Nn", offset) != string::npos;) {
715 #pragma omp atomic
716 gapnumber++;
717 size_t startposition = seq.string::find_first_of("Nn", offset);
718
719 size_t endposition = seq.string::find_first_not_of("Nn", startposition);
720 if (endposition == string::npos) {
721 std::cerr << PROGRAM ": Warning: sequence ends with an N: " << record.id << "\n";
722 break;
723 }
724
725 size_t right_end = seq.string::find_first_of("Nn", endposition);
726 if (right_end == string::npos)
727 right_end = seq.length();
728
729 Gap gap(
730 max((ssize_t)offset, (ssize_t)startposition - (ssize_t)flanklength),
731 startposition,
732 endposition,
733 min(right_end, endposition + flanklength));
734
735 std::pair<FastaRecord, FastaRecord> scaftigs =
736 makePseudoReads(seq, record, gap);
737 #pragma omp critical (flanks)
738 flanks[scaftigs.first][scaftigs.second] = gap;
739
740 offset = endposition;
741 }
742 }
743
744 /**
745 * Connect pairs using a Bloom filter de Bruijn graph
746 */
main(int argc,char ** argv)747 int main(int argc, char** argv)
748 {
749 bool die = false;
750
751 for (int c; (c = getopt_long(argc, argv,
752 shortopts, longopts, NULL)) != -1;) {
753 istringstream arg(optarg != NULL ? optarg : "");
754 switch (c) {
755 case '?':
756 die = true; break;
757 case 'S':
758 arg >> opt::inputScaffold; break;
759 case 'L':
760 arg >> opt::flankLength; break;
761 case 'G':
762 arg >> opt::maxGapLength; break;
763 case 'b':
764 opt::bloomSize = SIToBytes(arg); break;
765 case 'B':
766 setMaxOption(opt::maxBranches, arg); break;
767 case 'C':
768 setMaxOption(opt::maxCost, arg); break;
769 case 'd':
770 arg >> opt::dotPath; break;
771 case 'e':
772 opt::fixErrors = true; break;
773 case 'f':
774 arg >> opt::minFrag; break;
775 case 'F':
776 arg >> opt::maxFrag; break;
777 case 'i': {
778 string tempPath;
779 arg >> tempPath;
780 opt::bloomFilterPaths.push_back(tempPath);
781 opt::inputBloomPath = tempPath;
782 break;
783 }
784 case 'I':
785 opt::interleaved = true; break;
786 case 'j':
787 arg >> opt::threads; break;
788 case 'k': {
789 unsigned tempK;
790 arg >> tempK;
791 opt::kvector.push_back(tempK);
792 opt::k = tempK;
793 break;
794 }
795 case 'm':
796 setMaxOption(opt::maxFlankMismatches, arg); break;
797 case 'n':
798 opt::maxBranches = NO_LIMIT;
799 opt::maxFlankMismatches = NO_LIMIT;
800 opt::maxMismatches = NO_LIMIT;
801 opt::maxPaths = NO_LIMIT;
802 break;
803 case 'M':
804 setMaxOption(opt::maxMismatches, arg); break;
805 case 'o':
806 arg >> opt::outputPrefix; break;
807 case 'P':
808 setMaxOption(opt::maxPaths, arg); break;
809 case 'q':
810 arg >> opt::qualityThreshold; break;
811 case 'r':
812 arg >> opt::readName; break;
813 case 's':
814 opt::searchMem = SIToBytes(arg); break;
815 case 't':
816 arg >> opt::tracefilePath; break;
817 case 'g':
818 arg >> opt::gapfilePath; break;
819 case 'v':
820 opt::verbose++; break;
821 case OPT_HELP:
822 cout << USAGE_MESSAGE;
823 exit(EXIT_SUCCESS);
824 case OPT_VERSION:
825 cout << VERSION_MESSAGE;
826 exit(EXIT_SUCCESS);
827 }
828 if (optarg != NULL && (!arg.eof() || arg.fail())) {
829 cerr << PROGRAM ": invalid option: `-"
830 << (char)c << optarg << "'\n";
831 exit(EXIT_FAILURE);
832 }
833 }
834
835 /* translate --max-frag to --max-gap-length for backwards compatibility */
836 if (opt::maxFrag > 0) {
837 if ((int)opt::maxFrag < 2 * opt::flankLength)
838 opt::maxGapLength = 0;
839 else
840 opt::maxGapLength = opt::maxFrag - 2 * opt::flankLength;
841 }
842
843 if (opt::inputScaffold.empty()) {
844 cerr << PROGRAM ": missing mandatory option `-S'\n";
845 die = true;
846 }
847
848 if (opt::k == 0) {
849 cerr << PROGRAM ": missing mandatory option `-k'\n";
850 die = true;
851 }
852
853 if (opt::bloomFilterPaths.size() < opt::kvector.size()
854 && opt::bloomSize == 0)
855 {
856 cerr << PROGRAM ": missing mandatory option `-b' (Bloom filter size)\n"
857 << "Here are some guidelines for sizing the Bloom filter:\n"
858 << " * E. coli (~5 Mbp genome), 615X coverage: -b500M\n"
859 << " * S. cerevisiae (~12 Mbp genome), 25X coverage: -b500M\n"
860 << " * C. elegans (~100 Mbp genome), 89X coverage: -b1200M\n"
861 << " * H. sapiens (~3 Gbp genome), 71X coverage: -b40G\n";
862 die = true;
863 }
864
865 if (opt::outputPrefix.empty()) {
866 cerr << PROGRAM ": missing mandatory option `-o'\n";
867 die = true;
868 }
869
870 if (opt::bloomFilterPaths.size() > opt::kvector.size()) {
871 cerr << PROGRAM ": you must specify a k-mer size (-k) for each Bloom "
872 " filter file (-i)\n";
873 die = true;
874 } else if (opt::bloomFilterPaths.size() < opt::kvector.size()
875 && argc - optind < 1) {
876 cerr << PROGRAM ": missing input file arguments\n";
877 die = true;
878 } else if (opt::bloomFilterPaths.size() == opt::kvector.size()
879 && argc - optind > 0) {
880 cerr << PROGRAM ": input FASTA/FASTQ args should be omitted when using "
881 "pre-built Bloom filters (-i) for all k-mer sizes\n";
882 die = true;
883 }
884
885 if (die) {
886 cerr << "Try `" << PROGRAM
887 << " --help' for more information.\n";
888 exit(EXIT_FAILURE);
889 }
890
891 #if _OPENMP
892 if (opt::threads > 0)
893 omp_set_num_threads(opt::threads);
894 #endif
895
896 Kmer::setLength(opt::k);
897
898 #if USESEQAN
899 seqanTests();
900 #endif
901
902 ofstream dotStream;
903 if (!opt::dotPath.empty()) {
904 if (opt::verbose)
905 cerr << "Writing graph traversals to "
906 "dot file `" << opt::dotPath << "'\n";
907 dotStream.open(opt::dotPath.c_str());
908 assert_good(dotStream, opt::dotPath);
909 }
910
911 ofstream traceStream;
912 if (!opt::tracefilePath.empty()) {
913 if (opt::verbose)
914 cerr << "Writing graph search stats to `"
915 << opt::tracefilePath << "'\n";
916 traceStream.open(opt::tracefilePath.c_str());
917 assert(traceStream.is_open());
918 ConnectPairsResult::printHeaders(traceStream);
919 assert_good(traceStream, opt::tracefilePath);
920 }
921
922 ofstream gapStream;
923 if (!opt::gapfilePath.empty()) {
924 gapStream.open(opt::gapfilePath.c_str());
925 assert(gapStream.is_open());
926 assert_good(gapStream, opt::gapfilePath);
927 }
928
929 string logOutputPath(opt::outputPrefix);
930 logOutputPath.append("_log.txt");
931 ofstream logStream(logOutputPath.c_str());
932 assert_good(logStream, logOutputPath);
933
934 string scaffoldOutputPath(opt::outputPrefix);
935 scaffoldOutputPath.append("_scaffold.fa");
936 ofstream scaffoldStream(scaffoldOutputPath.c_str());
937 assert_good(scaffoldStream, scaffoldOutputPath);
938
939 string mergedOutputPath(opt::outputPrefix);
940 mergedOutputPath.append("_merged.fa");
941 ofstream mergedStream(mergedOutputPath.c_str());
942 assert_good(mergedStream, mergedOutputPath);
943
944 printLog(logStream, "Finding flanks\n");
945
946 ConnectPairsParams params;
947
948 params.minMergedSeqLen = opt::minFrag;
949 params.maxMergedSeqLen = opt::maxGapLength + 2 * opt::flankLength;
950 params.maxPaths = opt::maxPaths;
951 params.maxBranches = opt::maxBranches;
952 params.maxCost = opt::maxCost;
953 params.maxPathMismatches = opt::maxMismatches;
954 params.maxReadMismatches = opt::maxFlankMismatches;
955 /*
956 * search from the end of the scaffold flank until
957 * we find this many matches in a row.
958 */
959 params.kmerMatchesThreshold = 1;
960 params.fixErrors = opt::fixErrors;
961 params.maskBases = opt::mask;
962 params.memLimit = opt::searchMem;
963 params.dotPath = opt::dotPath;
964 params.dotStream = opt::dotPath.empty() ? NULL : &dotStream;
965
966 map<FastaRecord, map<FastaRecord, Gap> > flanks;
967 const char* scaffoldInputPath = opt::inputScaffold.c_str();
968 FastaReader reader1(scaffoldInputPath, FastaReader::FOLD_CASE);
969 unsigned gapsfound = 0;
970 string temp;
971
972 #pragma omp parallel
973 for (FastaRecord record;;) {
974 bool good;
975 #pragma omp critical(reader1)
976 good = reader1 >> record;
977 if (good) {
978 findFlanks(record, opt::flankLength, gapsfound, flanks);
979 }
980 else {
981 break;
982 }
983 }
984
985 temp = IntToString(gapsfound) + " gaps found\n";
986 printLog(logStream, temp);
987 temp = IntToString((int)flanks.size()) + " flanks extracted\n\n";
988 printLog(logStream, temp);
989
990 if (opt::printFlanks > 0) {
991 map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
992 map<FastaRecord, Gap>::iterator read2_it;
993
994 string read1OutputPath(opt::outputPrefix);
995 read1OutputPath.append("_flanks_1.fa");
996 ofstream read1Stream(read1OutputPath.c_str());
997 assert_good(read1Stream, read1OutputPath);
998
999 string read2OutputPath(opt::outputPrefix);
1000 read2OutputPath.append("_flanks_2.fa");
1001 ofstream read2Stream(read2OutputPath.c_str());
1002 assert_good(read2Stream, read2OutputPath);
1003
1004 for (read1_it = flanks.begin(); read1_it != flanks.end(); read1_it++) {
1005 FastaRecord read1 = read1_it->first;
1006 for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); read2_it++) {
1007 FastaRecord read2 = read2_it->first;
1008 const Gap& gap = read2_it->second;
1009
1010 read1Stream << ">" << read1.id.substr(0,read2.id.length()-2)
1011 << "_" << gap.gapStart() << "_" << gap.gapSize() << "/1\n";
1012 read1Stream << read1.seq << endl;
1013
1014 read2Stream << ">" << read2.id.substr(0,read2.id.length()-2)
1015 << "_" << gap.gapStart() << "_" << gap.gapSize() << "/2\n";
1016 read2Stream << read2.seq << endl;
1017 }
1018 }
1019 assert_good(read1Stream, read1OutputPath.c_str());
1020 read1Stream.close();
1021 assert_good(read2Stream, read2OutputPath.c_str());
1022 read2Stream.close();
1023 }
1024
1025 /** map for merged sequence resutls */
1026 map<string, map<int, ClosedGap> > allmerged;
1027 unsigned gapsclosed=0;
1028
1029 for (unsigned i = 0; i<opt::kvector.size(); i++) {
1030 opt::k = opt::kvector.at(i);
1031 Kmer::setLength(opt::k);
1032
1033 BloomFilter* bloom;
1034 CascadingBloomFilter* cascadingBloom = NULL;
1035
1036 if (!opt::bloomFilterPaths.empty() && i < opt::bloomFilterPaths.size()) {
1037
1038 temp = "Loading bloom filter from `" + opt::bloomFilterPaths.at(i) + "'...\n";
1039 printLog(logStream, temp);
1040
1041 bloom = new BloomFilter();
1042
1043 const char* inputPath = opt::bloomFilterPaths.at(i).c_str();
1044 ifstream inputBloom(inputPath, ios_base::in | ios_base::binary);
1045 assert_good(inputBloom, inputPath);
1046 inputBloom >> *bloom;
1047 assert_good(inputBloom, inputPath);
1048 inputBloom.close();
1049 } else {
1050 printLog(logStream, "Building bloom filter\n");
1051
1052 size_t bits = opt::bloomSize * 8 / 2;
1053 cascadingBloom = new CascadingBloomFilter(bits, opt::max_count);
1054 #ifdef _OPENMP
1055 ConcurrentBloomFilter<CascadingBloomFilter> cbf(*cascadingBloom, 1000);
1056 for (int i = optind; i < argc; i++)
1057 Bloom::loadFile(cbf, opt::k, argv[i], opt::verbose >= 2);
1058 #else
1059 for (int i = optind; i < argc; i++)
1060 Bloom::loadFile(*cascadingBloom, opt::k, argv[i], opt::verbose >= 2);
1061 #endif
1062 bloom = &cascadingBloom->getBloomFilter(opt::max_count - 1);
1063 }
1064
1065 assert(bloom != NULL);
1066
1067 if (opt::verbose)
1068 cerr << "Bloom filter FPR: " << setprecision(3)
1069 << 100 * bloom->FPR() << "%\n";
1070
1071 DBGBloom<BloomFilter> g(*bloom);
1072
1073 temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
1074 printLog(logStream, temp);
1075
1076 kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream, gapStream);
1077
1078 temp = "k" + IntToString(opt::k) + " run complete\n"
1079 + "Total gaps closed so far = " + IntToString(gapsclosed) + "\n\n";
1080 printLog(logStream, temp);
1081
1082 if (!opt::inputBloomPath.empty())
1083 delete bloom;
1084 else
1085 delete cascadingBloom;
1086 }
1087
1088 printLog(logStream, "K sweep complete\nCreating new scaffold with gaps closed...\n");
1089
1090 map<string, map<int, map<string, string> > >::iterator scaf_it;
1091 map<int, map<string, string> >::reverse_iterator pos_it;
1092 FastaReader reader2(scaffoldInputPath, FastaReader::FOLD_CASE);
1093 unsigned gapsclosedfinal = 0;
1094
1095 /** creating new scaffold with gaps closed */
1096 for (FastaRecord record;;) {
1097 bool good;
1098 good = reader2 >> record;
1099 if (good) {
1100 insertIntoScaffold(scaffoldStream, mergedStream, record, allmerged, gapsclosedfinal);
1101 }
1102 else
1103 break;
1104 }
1105 printLog(logStream, "New scaffold complete\n");
1106 printLog(logStream, "Gaps closed = " + IntToString(gapsclosed) + "\n");
1107 logStream << (float)100 * gapsclosed / gapsfound << "%\n\n";
1108 if (opt::verbose > 0) {
1109 cerr << (float)100 * gapsclosed / gapsfound << "%\n\n";
1110 }
1111
1112 assert_good(scaffoldStream, scaffoldOutputPath.c_str());
1113 scaffoldStream.close();
1114 assert_good(mergedStream, mergedOutputPath.c_str());
1115 mergedStream.close();
1116 assert_good(logStream, logOutputPath.c_str());
1117 logStream.close();
1118
1119 if (!opt::dotPath.empty()) {
1120 assert_good(dotStream, opt::dotPath);
1121 dotStream.close();
1122 }
1123
1124 if (!opt::tracefilePath.empty()) {
1125 assert_good(traceStream, opt::tracefilePath);
1126 traceStream.close();
1127 }
1128
1129 if (!opt::gapfilePath.empty()) {
1130 assert_good(gapStream, opt::gapfilePath);
1131 gapStream.close();
1132 }
1133
1134 return 0;
1135 }
1136