1 /**
2  * Close intra-scaffold gaps
3  * Copyright 2014 Canada's Michael Smith Genome Science Centre
4  */
5 
6 #include "config.h"
7 
8 #include "Konnector/konnector.h"
9 #include "Konnector/DBGBloom.h"
10 #include "Konnector/DBGBloomAlgorithms.h"
11 #include "Bloom/CascadingBloomFilter.h"
12 
13 #include "Align/alignGlobal.h"
14 #include "Common/IOUtil.h"
15 #include "Common/Options.h"
16 #include "Common/StringUtil.h"
17 #include "DataLayer/FastaConcat.h"
18 #include "DataLayer/Options.h"
19 #include "Graph/DotIO.h"
20 #include "Graph/Options.h"
21 #include "Graph/GraphUtil.h"
22 
23 #include <cassert>
24 #include <getopt.h>
25 #include <iostream>
26 #include <cstring>
27 
28 #include <string>
29 #include <algorithm>
30 #include <fstream>
31 #include <sstream>
32 #include <map>
33 
34 
35 #if _OPENMP
36 # include <omp.h>
37 # include "Bloom/ConcurrentBloomFilter.h"
38 #endif
39 
40 #undef USESEQAN
41 
42 #if USESEQAN
43 #include <seqan/align.h>
44 #include <seqan/sequence.h>
45 #include <seqan/align_split.h>
46 #endif
47 
48 using namespace std;
49 using Konnector::BloomFilter;
50 #if USESEQAN
51 using namespace seqan;
52 #endif
53 
54 #define PROGRAM "abyss-sealer"
55 
56 static const char VERSION_MESSAGE[] =
57 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
58 "Written by Shaun Jackman, Hamid Mohamadi, Anthony Raymond,\n"
59 "Ben Vandervalk and Daniel Paulino\n"
60 "\n"
61 "Copyright 2014 Canada's Michael Smith Genome Science Centre\n";
62 
63 static const char USAGE_MESSAGE[] =
64 "Usage: " PROGRAM "-b <Bloom filter size> -k <kmer size> -k <kmer size>... -o <output_prefix> -S <path to scaffold file> [options]... <reads1> [reads2]...\n"
65 "i.e. abyss-sealer -b20G -k90 -k80 -k70 -k60 -k50 -k40 -k30 -o test -S scaffold.fa read1.fa read2.fa\n\n"
66 "Close gaps by using left and right flanking sequences of gaps as 'reads' for Konnector\n"
67 "and performing multiple runs with each of the supplied K values.\n"
68 "\n"
69 " Options:\n"
70 "\n"
71 "      --print-flanks           outputs flank files\n"
72 "  -S, --input-scaffold=FILE    load scaffold from FILE\n"
73 "  -L, --flank-length=N         length of flanks to be used as pseudoreads [100]\n"
74 "  -G, --max-gap-length=N       max gap size to fill in bp [800]; runtime increases\n"
75 "                               exponentially with respect to this parameter\n"
76 "  -j, --threads=N              use N parallel threads [1]\n"
77 "  -k, --kmer=N                 the size of a k-mer\n"
78 "  -b, --bloom-size=N           size of Bloom filter (e.g. '40G'). Required\n"
79 "                               when not using pre-built Bloom filter(s)\n"
80 "                               (-i option)\n"
81 "  -d, --dot-file=FILE          write graph traversals to a DOT file\n"
82 "  -e, --fix-errors             find and fix single-base errors when reads\n"
83 "                               have no kmers in bloom filter [disabled]\n"
84 "  -C, --max-cost=N             max edges to traverse during each graph search [100000]\n"
85 "  -i, --input-bloom=FILE       load bloom filter from FILE\n"
86 "      --mask                   mask new and changed bases as lower case\n"
87 "      --no-mask                do not mask bases [default]\n"
88 "      --chastity               discard unchaste reads [default]\n"
89 "      --no-chastity            do not discard unchaste reads\n"
90 "      --trim-masked            trim masked bases from the ends of reads\n"
91 "      --no-trim-masked         do not trim masked bases from the ends\n"
92 "                               of reads [default]\n"
93 "  -m, --flank-mismatches=N     max mismatches between paths and flanks; use\n"
94 "                               'nolimit' for no limit [nolimit]\n"
95 "  -M, --max-mismatches=N       max mismatches between all alternate paths;\n"
96 "                               use 'nolimit' for no limit [nolimit]\n"
97 "  -n  --no-limits              disable all limits; equivalent to\n"
98 "                               '-B nolimit -m nolimit -M nolimit -P nolimit'\n"
99 "  -o, --output-prefix=FILE     prefix of output FASTA files [required]\n"
100 "  -P, --max-paths=N            merge at most N alternate paths; use 'nolimit'\n"
101 "                               for no limit [2]\n"
102 "  -q, --trim-quality=N         trim bases from the ends of reads whose\n"
103 "                               quality is less than the threshold\n"
104 "      --standard-quality       zero quality is `!' (33)\n"
105 "                               default for FASTQ and SAM files\n"
106 "      --illumina-quality       zero quality is `@' (64)\n"
107 "                               default for qseq and export files\n"
108 "  -r, --read-name=STR          only process reads with names that contain STR\n"
109 "  -s, --search-mem=N           mem limit for graph searches; multiply by the\n"
110 "                               number of threads (-j) to get the total mem used\n"
111 "                               for graph traversal [500M]\n"
112 "  -g, --gap-file=FILE          write sealed gaps to FILE\n"
113 "  -t, --trace-file=FILE        write graph search stats to FILE\n"
114 "  -v, --verbose                display verbose output\n"
115 "      --help                   display this help and exit\n"
116 "      --version                output version information and exit\n"
117 "\n"
118 " Deprecated Options:\n"
119 "\n"
120 "  -B, --max-branches=N         max branches in de Bruijn graph traversal;\n"
121 "                               use 'nolimit' for no limit [nolimit]\n"
122 "  -f, --min-frag=N             min fragment size in base pairs\n"
123 "  -F, --max-frag=N             max fragment size in base pairs\n"
124 "\n"
125 "  Note 1: --max-branches was not effective for truncating expensive searches,\n"
126 "  and has been superseded by the --max-cost option.\n"
127 "\n"
128 "  Note 2: --max-frag was formerly used to determine the maximum gap\n"
129 "  size that abyss-sealer would attempt to close, according to the formula\n"
130 "  max_gap_size = max_frag - 2 * flank_length, where flank_length is\n"
131 "  determined by the -L option.  --max-frag is superseded by the more\n"
132 "  intuitive -G (--max-gap-length) option. The related option --min-frag\n"
133 "  does not seem to have any practical use.\n"
134 "\n"
135 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
136 
137 namespace opt {
138 
139 	/** Length of flank. */
140 	int flankLength = 100;
141 
142 	/** Max gap size to fill */
143 	unsigned maxGapLength = 800;
144 
145 	/** scaffold file input. */
146 	static string inputScaffold;
147 
148 	/** The number of parallel threads. */
149 	static unsigned threads = 1;
150 
151 	/** The size of the bloom filter in bytes. */
152 	size_t bloomSize = 0;
153 
154 	/** The maximum count value of the BLoom filter. */
155 	unsigned max_count = 2;
156 
157 	/** Input read files are interleaved? */
158 	bool interleaved = false;
159 
160 	/** Max active branches during de Bruijn graph traversal */
161 	unsigned maxBranches = NO_LIMIT;
162 
163 	/**
164 	 * Max cost for a connecting path search. Searches that
165 	 * exceed this cost limit will be aborted and the gap
166 	 * will remain unfilled.
167 	 */
168 	unsigned maxCost = 100000;
169 
170 	/** multi-graph DOT file containing graph traversals */
171 	static string dotPath;
172 
173 	/**
174 	 * Find and fix single base errors when a read has no
175 	 * kmers in the bloom filter.
176 	 */
177 	bool fixErrors = false;
178 
179 	/** The size of a k-mer. */
180 	unsigned k;
181 
182 	/** Vector of kmers. */
183 	vector<unsigned> kvector;
184 
185 	/** Vector of Bloom filter paths corresponding to kmer values */
186 	vector<string> bloomFilterPaths;
187 
188 	/** The minimum fragment size */
189 	unsigned minFrag = 0;
190 
191 	/** The maximum fragment size */
192 	unsigned maxFrag = 0;
193 
194 	/** Bloom filter input file */
195 	static string inputBloomPath;
196 
197 	/** Max paths between left and right flanking sequences */
198 	unsigned maxPaths = 2;
199 
200 	/** Prefix for output files */
201 	static string outputPrefix;
202 
203 	/** Max mismatches allowed when building consensus seqs */
204 	unsigned maxMismatches = NO_LIMIT;
205 
206 	/** Only process flanks that contain this substring. */
207 	static string readName;
208 
209 	/** Max mem used per thread during graph traversal */
210 	static size_t searchMem = 500 * 1024 * 1024;
211 
212 	/** Output file for graph search stats */
213 	static string tracefilePath;
214 
215 	/** Output file for sealed gaps */
216 	static string gapfilePath;
217 
218 	/** Mask bases not in flanks */
219 	static int mask = 0;
220 
221 	/** Max mismatches between consensus and flanks */
222 	static unsigned maxFlankMismatches = NO_LIMIT;
223 
224 	/** Output flanks files */
225 	static int printFlanks = 0;
226 
227 	/** Output detailed stats */
228 	static int detailedStats = 0;
229 }
230 
231 /** Counters */
232 
233 struct Counters {
234 	size_t noStartOrGoalKmer;
235 	size_t noPath;
236 	size_t uniquePath;
237 	size_t multiplePaths;
238 	size_t tooManyPaths;
239 	size_t tooManyBranches;
240 	size_t tooManyMismatches;
241 	size_t tooManyReadMismatches;
242 	size_t containsCycle;
243 	size_t maxCostExceeded;
244 	size_t exceededMemLimit;
245 	size_t traversalMemExceeded;
246 	size_t readPairsProcessed;
247 	size_t readPairsMerged;
248 	size_t skipped;
249 };
250 
251 static const char shortopts[] = "S:L:b:B:C:d:ef:F:G:g:i:Ij:k:lm:M:no:P:q:r:s:t:v";
252 
253 enum { OPT_HELP = 1, OPT_VERSION };
254 
255 static const struct option longopts[] = {
256 	{ "detailed-stats",   no_argument, &opt::detailedStats, 1},
257 	{ "print-flanks",     no_argument, &opt::printFlanks, 1},
258 	{ "input-scaffold",   required_argument, NULL, 'S' },
259 	{ "flank-length",     required_argument, NULL, 'L' },
260 	{ "max-gap-length",   required_argument, NULL, 'G' },
261 	{ "bloom-size",       required_argument, NULL, 'b' },
262 	{ "max-branches",     required_argument, NULL, 'B' },
263 	{ "max-cost",         required_argument, NULL, 'C' },
264 	{ "dot-file",         required_argument, NULL, 'd' },
265 	{ "fix-errors",       no_argument, NULL, 'e' },
266 	{ "min-frag",         required_argument, NULL, 'f' },
267 	{ "max-frag",         required_argument, NULL, 'F' },
268 	{ "input-bloom",      required_argument, NULL, 'i' },
269 	{ "interleaved",      no_argument, NULL, 'I' },
270 	{ "threads",          required_argument, NULL, 'j' },
271 	{ "kmer",             required_argument, NULL, 'k' },
272 	{ "chastity",         no_argument, &opt::chastityFilter, 1 },
273 	{ "no-chastity",      no_argument, &opt::chastityFilter, 0 },
274 	{ "mask",             no_argument, &opt::mask, 1 },
275 	{ "no-mask",          no_argument, &opt::mask, 0 },
276 	{ "no-limits",        no_argument, NULL, 'n' },
277 	{ "trim-masked",      no_argument, &opt::trimMasked, 1 },
278 	{ "no-trim-masked",   no_argument, &opt::trimMasked, 0 },
279 	{ "output-prefix",    required_argument, NULL, 'o' },
280 	{ "flank-mismatches", required_argument, NULL, 'm' },
281 	{ "max-mismatches",   required_argument, NULL, 'M' },
282 	{ "max-paths",        required_argument, NULL, 'P' },
283 	{ "trim-quality",     required_argument, NULL, 'q' },
284 	{ "standard-quality", no_argument, &opt::qualityOffset, 33 },
285 	{ "illumina-quality", no_argument, &opt::qualityOffset, 64 },
286 	{ "read-name",        required_argument, NULL, 'r' },
287 	{ "search-mem",       required_argument, NULL, 's' },
288 	{ "trace-file",       required_argument, NULL, 't' },
289 	{ "gap-file",         required_argument, NULL, 'g' },
290 	{ "verbose",          no_argument, NULL, 'v' },
291 	{ "help",             no_argument, NULL, OPT_HELP },
292 	{ "version",          no_argument, NULL, OPT_VERSION },
293 	{ NULL, 0, NULL, 0 }
294 };
295 
296 /** The coordinates of a feature. */
297 struct Coord
298 {
299 	int start;
300 	int end;
301 
CoordCoord302 	Coord() { }
CoordCoord303 	Coord(int start, int end) : start(start), end(end) { }
304 
305 	/** Return the size of this feature. */
sizeCoord306 	int size() const { return end - start; }
307 };
308 
309 /** The coordinates of a gap and its flanking scaftigs. */
310 struct Gap
311 {
312 	/** The left flanking scaftig. */
313 	Coord left;
314 
315 	/** The right flanking scaftig. */
316 	Coord right;
317 
GapGap318 	Gap() { }
319 
GapGap320 	Gap(int leftStart, int leftEnd, int rightStart, int rightEnd)
321 		: left(leftStart, leftEnd), right(rightStart, rightEnd) { }
322 
323 	/** The start of the gap. */
gapStartGap324 	int gapStart() const { return left.end; }
325 
326 	/** The end of the gap. */
gapEndGap327 	int gapEnd() const { return right.start; }
328 
329 	/** The total size of the flanks plus the gap. */
totalSizeGap330 	int totalSize() const { return right.end - left.start; }
331 
332 	/** The size of the original gap. */
gapSizeGap333 	int gapSize() const { return gapEnd() - gapStart(); }
334 
335 	/** The size of the flanks. */
flankSizeGap336 	int flankSize() const { return left.size() + right.size(); }
337 };
338 
339 /** The coordinates of a gap, its flanking scaftigs, and the sequence that
340  * fills it.
341  */
342 struct ClosedGap : Gap
343 {
344 	/** The sequence that fills the gap. */
345 	std::string seq;
346 
ClosedGapClosedGap347 	ClosedGap() { }
348 
ClosedGapClosedGap349 	ClosedGap(Gap gap, std::string seq) : Gap(gap), seq(seq) { }
350 };
351 
352 #if USESEQAN
353 const string r1 =
354 "AGAATCAACCAACCGTTCAATGATATAATCAAGAGCGATATTGTAATCTTTGTTTCT";
355 const string r2 =
356 "CGACGTCCACCAATTCGTCCCTGTGCACGAGCAGTTTCCAGTCCAGCTTTTGTTCGT";
357 const string ins =
358 "AGAATCAACCAACCGTTCAATGATATAATCAAGAGCGATATTGTAATCTTTGTTTCTGTCACCCGGCCCCCACGACTCAAGGATTAGACCATAAACACCATCCTCTTCACCTATCGAACACTCAGCTTTCAGTTCAATTCCATTATTATCAAAAACATGCATAATATTAATCTTTAATCAATTTTTCACGACAATACTACTTTTATTGATAAAATTGCAACAAGTTGCTGTTGTTTTACTTTCTTTTGTACACAAAGTGTCTTTAACTTTATTTATCCCCTGCAGGAAACCTCTTATACAAAGTTGACACACCAACATCATAGATAATCGCCACCTTCTGGCGAGGAGTTCCTGCTGCAATTAATCGTCCAGCTTGTGCCCATTGTTCTGGTGTAAGTTTGGGACGACGTCCACCAATTCGTCCCTGTGCACGAGCAGTTTCCAGTCCAGCTTTTGTTCGT";
359 
seqanTests()360 static void seqanTests()
361 {
362 	typedef String<Dna> DS;
363 	typedef Align<DS> Alignment;
364 
365 	//DS seq1 = "TTGT";
366 	//DS seq2 = "TTAGT";
367 	DS ref = ins;
368 	DS seq1 = r1;
369 	DS seq2 = r2;
370 
371 	Alignment align1;
372 	resize(rows(align1), 2);
373 	assignSource(row(align1, 0), ref);
374 	assignSource(row(align1, 1), seq1);
375 	Alignment align2;
376 	resize(rows(align2), 2);
377 	assignSource(row(align2, 0), ref);
378 	assignSource(row(align2, 1), seq2);
379 
380 	Score<int> scoring(2, -2, -50, -100);
381 
382 	cout << splitAlignment(align1, align2, scoring) << endl;
383 	cout << align1 << endl;
384 	cout << align2 << endl;
385 
386 	cout << localAlignment(align1, scoring) << endl;
387 	cout << align1 << endl;
388 
389 	cout << localAlignment(align2, scoring) << endl;
390 	cout << align2 << endl;
391 }
392 #endif
393 
394 /**
395  * Set the value for a commandline option, using "nolimit"
396  * to represent NO_LIMIT.
397  */
setMaxOption(unsigned & arg,istream & in)398 static inline void setMaxOption(unsigned& arg, istream& in)
399 {
400 	string str;
401 	getline(in, str);
402 	if (in && str == "nolimit") {
403 		arg = NO_LIMIT;
404 	} else {
405 		istringstream ss(str);
406 		ss >> arg;
407 		// copy state bits (fail, bad, eof) to
408 		// original stream
409 		in.clear(ss.rdstate());
410 	}
411 }
412 
sizetToString(size_t a)413 string sizetToString (size_t a)
414 {
415 	ostringstream temp;
416 	temp << a;
417 	return temp.str();
418 }
419 
IntToString(int a)420 string IntToString (int a)
421 {
422 	ostringstream temp;
423 	temp<<a;
424 	return temp.str();
425 }
426 
427 // returns merged sequence resulting from Konnector
428 template <typename Graph>
merge(const Graph & g,unsigned k,const Gap & gap,const FastaRecord & read1,const FastaRecord & read2,const ConnectPairsParams & params,Counters & g_count,ofstream & traceStream)429 string merge(const Graph& g,
430 	unsigned k,
431 	const Gap& gap,
432 	const FastaRecord &read1,
433 	const FastaRecord &read2,
434 	const ConnectPairsParams& params,
435 	Counters& g_count,
436 	ofstream& traceStream)
437 {
438 	ConnectPairsResult result = connectPairs(k, read1, read2, g, params);
439 	ostringstream ss;
440 	ss << result.readNamePrefix << '_' << gap.gapStart() << '_' << gap.gapSize();
441 	result.readNamePrefix = ss.str();
442 
443 	if (!opt::tracefilePath.empty())
444 #pragma omp critical(tracefile)
445 	{
446 			traceStream << result;
447 			assert_good(traceStream, opt::tracefilePath);
448 	}
449 
450 	vector<FastaRecord>& paths = result.mergedSeqs;
451 	switch (result.pathResult) {
452 
453 		case NO_PATH:
454 			assert(paths.empty());
455 			if (result.foundStartKmer && result.foundGoalKmer)
456 #pragma omp atomic
457 				++g_count.noPath;
458 			else {
459 #pragma omp atomic
460 				++g_count.noStartOrGoalKmer;
461 			}
462 			break;
463 
464 		case FOUND_PATH:
465 			assert(!paths.empty());
466 			if (result.pathMismatches > params.maxPathMismatches ||
467 					result.readMismatches > params.maxReadMismatches) {
468 				if (result.pathMismatches > params.maxPathMismatches)
469 #pragma omp atomic
470 					++g_count.tooManyMismatches;
471 				else
472 #pragma omp atomic
473 					++g_count.tooManyReadMismatches;
474 			}
475 			else if (paths.size() > 1) {
476 #pragma omp atomic
477 				++g_count.multiplePaths;
478 			}
479 			else {
480 #pragma omp atomic
481 				++g_count.uniquePath;
482 			}
483 			break;
484 
485 		case TOO_MANY_PATHS:
486 #pragma omp atomic
487 			++g_count.tooManyPaths;
488 			break;
489 
490 		case TOO_MANY_BRANCHES:
491 #pragma omp atomic
492 			++g_count.tooManyBranches;
493 			break;
494 
495 		case PATH_CONTAINS_CYCLE:
496 #pragma omp atomic
497 			++g_count.containsCycle;
498 			break;
499 
500 		case MAX_COST_EXCEEDED:
501 #pragma omp atomic
502 			++g_count.maxCostExceeded;
503 			break;
504 
505 		case EXCEEDED_MEM_LIMIT:
506 #pragma omp atomic
507 			++g_count.exceededMemLimit;
508 			break;
509 	}
510 
511 	if (result.pathResult == FOUND_PATH) {
512 		if  (result.pathMismatches > params.maxPathMismatches)
513 			return "";
514 		else if (result.mergedSeqs.size() > 1)
515 			return result.consensusSeq;
516 		else
517 			return result.mergedSeqs.front();
518 	}
519 	else
520 		return "";
521 }
522 
printLog(ofstream & logStream,const string & output)523 void printLog(ofstream &logStream, const string &output) {
524 #pragma omp critical(logStream)
525 	logStream << output;
526 	if (opt::verbose > 0)
527 #pragma omp critical(cerr)
528 		cerr << output;
529 }
530 
insertIntoScaffold(ofstream & scaffoldStream,ofstream & mergedStream,FastaRecord & record,map<string,map<int,ClosedGap>> & allmerged,unsigned & gapsclosedfinal)531 void insertIntoScaffold(ofstream &scaffoldStream,
532 	ofstream &mergedStream,
533 	FastaRecord &record,
534 	map<string, map<int, ClosedGap> > &allmerged,
535 	unsigned &gapsclosedfinal)
536 {
537 	map<string, map<int, ClosedGap> >::iterator scaf_it;
538 	map<int, ClosedGap>::reverse_iterator pos_it;
539 
540 	scaf_it = allmerged.find(record.id);
541 	scaffoldStream << ">" << record.id << " " << record.comment << endl;
542 	string modifiedSeq = record.seq;
543 	if (scaf_it != allmerged.end()) {
544 		for (pos_it = allmerged[record.id].rbegin();
545 			pos_it != allmerged[record.id].rend();
546 			pos_it++)
547 		{
548 			const ClosedGap& closedGap = pos_it->second;
549 			int newseqsize = pos_it->second.seq.length() - closedGap.flankSize();
550 			modifiedSeq.replace(
551 				closedGap.left.start,
552 				closedGap.totalSize(),
553 				closedGap.seq
554 			);
555 			mergedStream << ">" << record.id << "_" << pos_it->first << "_" << newseqsize <<  endl;
556 			mergedStream << pos_it->second.seq << endl;
557 			gapsclosedfinal++;
558 		}
559 		scaffoldStream << modifiedSeq << endl;
560 	}
561 	else {
562 		scaffoldStream << record.seq  << endl;
563 	}
564 
565 }
566 
567 std::pair<FastaRecord, FastaRecord>
makePseudoReads(std::string seq,FastaRecord record,const Gap & gap)568 makePseudoReads(std::string seq, FastaRecord record, const Gap& gap)
569 {
570 	std::pair<FastaRecord, FastaRecord> scaftigs;
571 	// extract left flank
572 	std::string leftflank = seq.substr(gap.left.start, gap.left.size());
573 	std::transform(leftflank.begin(), leftflank.end(), leftflank.begin(), ::toupper);
574 	scaftigs.first.seq = leftflank;
575 	scaftigs.first.id = record.id + "/1";
576 
577 	// extract right flank
578 	scaftigs.second.id = record.id + "/2";
579 	std::string rightflank = seq.substr(gap.right.start, gap.right.size());
580 	std::transform(rightflank.begin(), rightflank.end(), rightflank.begin(), ::toupper);
581 	scaftigs.second.seq = reverseComplement(rightflank);
582 
583 	return scaftigs;
584 }
585 
586 template <typename Graph>
kRun(const ConnectPairsParams & params,unsigned k,const Graph & g,map<string,map<int,ClosedGap>> & allmerged,map<FastaRecord,map<FastaRecord,Gap>> & flanks,unsigned & gapsclosed,ofstream & logStream,ofstream & traceStream,ofstream & gapStream)587 void kRun(const ConnectPairsParams& params,
588 	unsigned k,
589 	const Graph& g,
590 	map<string, map<int, ClosedGap> > &allmerged,
591 	map<FastaRecord, map<FastaRecord, Gap> > &flanks,
592 	unsigned &gapsclosed,
593 	ofstream &logStream,
594 	ofstream &traceStream,
595 	ofstream &gapStream)
596 {
597 	map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
598 	map<FastaRecord, Gap>::iterator read2_it;
599 	unsigned uniqueGapsClosed = 0;
600 
601 	Counters g_count;
602 	g_count.noStartOrGoalKmer = 0;
603 	g_count.noPath = 0;
604 	g_count.uniquePath = 0;
605 	g_count.multiplePaths = 0;
606 	g_count.tooManyPaths = 0;
607 	g_count.tooManyBranches = 0;
608 	g_count.tooManyMismatches = 0;
609 	g_count.tooManyReadMismatches = 0;
610 	g_count.containsCycle = 0;
611 	g_count.maxCostExceeded = 0;
612 	g_count.exceededMemLimit = 0;
613 	g_count.traversalMemExceeded = 0;
614 	g_count.readPairsProcessed = 0;
615 	g_count.readPairsMerged = 0;
616 	g_count.skipped = 0;
617 
618 	printLog(logStream, "Flanks inserted into k run = " + IntToString(flanks.size()) + "\n");
619 
620 	int counter = 0;
621 	vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator> flanks_closed;
622 #pragma omp parallel private(read1_it, read2_it) firstprivate(counter)
623 	for (read1_it = flanks.begin(); read1_it != flanks.end(); ++read1_it) {
624 		FastaRecord read1 = read1_it->first;
625 		bool success = false;
626 		for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); ++read2_it, ++counter) {
627 #if _OPENMP
628 			if (counter % omp_get_num_threads() != omp_get_thread_num())
629 				continue;
630 #endif
631 			FastaRecord read2 = read2_it->first;
632 
633 			int startposition = read2_it->second.gapStart();
634 			string tempSeq = merge(g, k, read2_it->second, read1, read2, params, g_count, traceStream);
635 			if (!tempSeq.empty()) {
636 				success = true;
637 #pragma omp critical (allmerged)
638 				allmerged[read1.id.substr(0,read1.id.length()-2)][startposition]
639 					= ClosedGap(read2_it->second, tempSeq);
640 #pragma omp atomic
641 				++uniqueGapsClosed;
642 #pragma omp critical (gapsclosed)
643 				if (++gapsclosed % 100 == 0)
644 					printLog(logStream, IntToString(gapsclosed) + " gaps closed so far\n");
645 
646 				if (!opt::gapfilePath.empty())
647 #pragma omp critical (gapStream)
648 					gapStream << ">" << read1.id.substr(0,read1.id.length()-2)
649 						  << "_" << read2_it->second.gapStart() << "-" << read2_it->second.gapEnd()
650 						  << " LN:i:" << tempSeq.length() << '\n'
651 						  << tempSeq << '\n';
652 			}
653 		}
654 		if (success) {
655 #pragma omp critical (flanks_closed)
656 			flanks_closed.push_back(read1_it);
657 		}
658 	}
659 
660 	for (vector<map<FastaRecord, map<FastaRecord, Gap> >::iterator>::iterator it = flanks_closed.begin();
661 			it != flanks_closed.end(); ++it) {
662 		if (flanks.count((*it)->first) > 0)
663 		    flanks.erase(*it);
664 	}
665 
666 	printLog(logStream, IntToString(uniqueGapsClosed) + " unique gaps closed for k" + IntToString(k) + "\n");
667 
668 	printLog(logStream, "No start/goal kmer: "
669 		+ sizetToString(g_count.noStartOrGoalKmer) + "\n");
670 	printLog(logStream, "No path: "
671 		+ sizetToString(g_count.noPath) + "\n");
672 	printLog(logStream, "Unique path: "
673 		+ sizetToString(g_count.uniquePath) + "\n");
674 	printLog(logStream, "Multiple paths: "
675 		+ sizetToString(g_count.multiplePaths) + "\n");
676 	printLog(logStream, "Too many paths: "
677 		+ sizetToString(g_count.tooManyPaths) + "\n");
678 	printLog(logStream, "Too many branches: "
679 		+ sizetToString(g_count.tooManyBranches) + "\n");
680 	printLog(logStream, "Too many path/path mismatches: "
681 		+ sizetToString(g_count.tooManyMismatches) + "\n");
682 	printLog(logStream, "Too many path/read mismatches: "
683 		+ sizetToString(g_count.tooManyReadMismatches) + "\n");
684 	printLog(logStream, "Contains cycle: "
685 		+ sizetToString(g_count.containsCycle) + "\n");
686 	printLog(logStream, "Max cost exceeded: "
687 		+ sizetToString(g_count.maxCostExceeded) + "\n");
688 	printLog(logStream, "Exceeded mem limit: "
689 		+ sizetToString(g_count.exceededMemLimit) + "\n");
690 	printLog(logStream, "Skipped: "
691 		+ sizetToString(g_count.skipped) + "\n");
692 
693 	printLog(logStream, IntToString(flanks.size()) + " flanks left\n");
694 
695 }
696 
operator <(const FastaRecord & a,const FastaRecord & b)697 bool operator<(const FastaRecord& a, const FastaRecord& b)
698 {
699 	if (a.id == b.id)
700 		return a.seq < b.seq;
701 	else
702 		return a.id < b.id;
703 }
704 
findFlanks(FastaRecord & record,int flanklength,unsigned & gapnumber,map<FastaRecord,map<FastaRecord,Gap>> & flanks)705 void findFlanks(FastaRecord &record,
706 	int flanklength,
707 	unsigned &gapnumber,
708 	map<FastaRecord, map<FastaRecord, Gap> > &flanks)
709 {
710 	const string& seq = record.seq;
711 
712 	// Iterate over the gaps.
713 	for (size_t offset = 0;
714 			seq.string::find_first_of("Nn", offset) != string::npos;) {
715 #pragma omp atomic
716 		gapnumber++;
717 		size_t startposition = seq.string::find_first_of("Nn", offset);
718 
719 		size_t endposition = seq.string::find_first_not_of("Nn", startposition);
720 		if (endposition == string::npos) {
721 			std::cerr << PROGRAM ": Warning: sequence ends with an N: " << record.id << "\n";
722 			break;
723 		}
724 
725 		size_t right_end = seq.string::find_first_of("Nn", endposition);
726 		if (right_end == string::npos)
727 			right_end = seq.length();
728 
729 		Gap gap(
730 			max((ssize_t)offset, (ssize_t)startposition - (ssize_t)flanklength),
731 			startposition,
732 			endposition,
733 			min(right_end, endposition + flanklength));
734 
735 		std::pair<FastaRecord, FastaRecord> scaftigs =
736 			makePseudoReads(seq, record, gap);
737 #pragma omp critical (flanks)
738 		flanks[scaftigs.first][scaftigs.second] = gap;
739 
740 		offset = endposition;
741 	}
742 }
743 
744 /**
745  * Connect pairs using a Bloom filter de Bruijn graph
746  */
main(int argc,char ** argv)747 int main(int argc, char** argv)
748 {
749 	bool die = false;
750 
751 	for (int c; (c = getopt_long(argc, argv,
752 					shortopts, longopts, NULL)) != -1;) {
753 		istringstream arg(optarg != NULL ? optarg : "");
754 		switch (c) {
755 		  case '?':
756 			die = true; break;
757 		  case 'S':
758 			arg >> opt::inputScaffold; break;
759 		  case 'L':
760 			arg >> opt::flankLength; break;
761 		  case 'G':
762 			arg >> opt::maxGapLength; break;
763 		  case 'b':
764 			opt::bloomSize = SIToBytes(arg); break;
765 		  case 'B':
766 			setMaxOption(opt::maxBranches, arg); break;
767 		  case 'C':
768 			setMaxOption(opt::maxCost, arg); break;
769 		  case 'd':
770 			arg >> opt::dotPath; break;
771 		  case 'e':
772 			opt::fixErrors = true; break;
773 		  case 'f':
774 			arg >> opt::minFrag; break;
775 		  case 'F':
776 			arg >> opt::maxFrag; break;
777 		  case 'i': {
778 			string tempPath;
779 			arg >> tempPath;
780 			opt::bloomFilterPaths.push_back(tempPath);
781 			opt::inputBloomPath = tempPath;
782 			break;
783 			}
784 		  case 'I':
785 			opt::interleaved = true; break;
786 		  case 'j':
787 			arg >> opt::threads; break;
788 		  case 'k': {
789 			unsigned tempK;
790 			arg >> tempK;
791 			opt::kvector.push_back(tempK);
792 			opt::k = tempK;
793 			break;
794 			}
795 		  case 'm':
796 			setMaxOption(opt::maxFlankMismatches, arg); break;
797 		  case 'n':
798 			opt::maxBranches = NO_LIMIT;
799 			opt::maxFlankMismatches = NO_LIMIT;
800 			opt::maxMismatches = NO_LIMIT;
801 			opt::maxPaths = NO_LIMIT;
802 			break;
803 		  case 'M':
804 			setMaxOption(opt::maxMismatches, arg); break;
805 		  case 'o':
806 			arg >> opt::outputPrefix; break;
807 		  case 'P':
808 			setMaxOption(opt::maxPaths, arg); break;
809 		  case 'q':
810 			arg >> opt::qualityThreshold; break;
811 		  case 'r':
812 			arg >> opt::readName; break;
813 		  case 's':
814 			opt::searchMem = SIToBytes(arg); break;
815 		  case 't':
816 			arg >> opt::tracefilePath; break;
817 		  case 'g':
818 		    arg >> opt::gapfilePath; break;
819 		  case 'v':
820 			opt::verbose++; break;
821 		  case OPT_HELP:
822 			cout << USAGE_MESSAGE;
823 			exit(EXIT_SUCCESS);
824 		  case OPT_VERSION:
825 			cout << VERSION_MESSAGE;
826 			exit(EXIT_SUCCESS);
827 		}
828 		if (optarg != NULL && (!arg.eof() || arg.fail())) {
829 			cerr << PROGRAM ": invalid option: `-"
830 				<< (char)c << optarg << "'\n";
831 			exit(EXIT_FAILURE);
832 		}
833 	}
834 
835 	/* translate --max-frag to --max-gap-length for backwards compatibility */
836 	if (opt::maxFrag > 0) {
837 		if ((int)opt::maxFrag < 2 * opt::flankLength)
838 			opt::maxGapLength = 0;
839 		else
840 			opt::maxGapLength = opt::maxFrag - 2 * opt::flankLength;
841 	}
842 
843 	if (opt::inputScaffold.empty()) {
844 		cerr << PROGRAM ": missing mandatory option `-S'\n";
845 		die = true;
846 	}
847 
848 	if (opt::k == 0) {
849 		cerr << PROGRAM ": missing mandatory option `-k'\n";
850 		die = true;
851 	}
852 
853 	if (opt::bloomFilterPaths.size() < opt::kvector.size()
854 		&& opt::bloomSize == 0)
855 	{
856 		cerr << PROGRAM ": missing mandatory option `-b' (Bloom filter size)\n"
857 			<< "Here are some guidelines for sizing the Bloom filter:\n"
858 			<< "  * E. coli (~5 Mbp genome), 615X coverage: -b500M\n"
859 			<< "  * S. cerevisiae (~12 Mbp genome), 25X coverage: -b500M\n"
860 			<< "  * C. elegans (~100 Mbp genome), 89X coverage: -b1200M\n"
861 			<< "  * H. sapiens (~3 Gbp genome), 71X coverage: -b40G\n";
862 		die = true;
863 	}
864 
865 	if (opt::outputPrefix.empty()) {
866 		cerr << PROGRAM ": missing mandatory option `-o'\n";
867 		die = true;
868 	}
869 
870 	if (opt::bloomFilterPaths.size() > opt::kvector.size()) {
871 		cerr << PROGRAM ": you must specify a k-mer size (-k) for each Bloom "
872 			" filter file (-i)\n";
873 		die = true;
874 	} else if (opt::bloomFilterPaths.size() < opt::kvector.size()
875 		&& argc - optind < 1) {
876 		cerr << PROGRAM ": missing input file arguments\n";
877 		die = true;
878 	} else if (opt::bloomFilterPaths.size() == opt::kvector.size()
879 		&& argc - optind > 0) {
880 		cerr << PROGRAM ": input FASTA/FASTQ args should be omitted when using "
881 			"pre-built Bloom filters (-i) for all k-mer sizes\n";
882 		die = true;
883 	}
884 
885 	if (die) {
886 		cerr << "Try `" << PROGRAM
887 			<< " --help' for more information.\n";
888 		exit(EXIT_FAILURE);
889 	}
890 
891 #if _OPENMP
892 	if (opt::threads > 0)
893 		omp_set_num_threads(opt::threads);
894 #endif
895 
896 	Kmer::setLength(opt::k);
897 
898 #if USESEQAN
899 	seqanTests();
900 #endif
901 
902 	ofstream dotStream;
903 	if (!opt::dotPath.empty()) {
904 		if (opt::verbose)
905 			cerr << "Writing graph traversals to "
906 				"dot file `" << opt::dotPath << "'\n";
907 		dotStream.open(opt::dotPath.c_str());
908 		assert_good(dotStream, opt::dotPath);
909 	}
910 
911 	ofstream traceStream;
912 	if (!opt::tracefilePath.empty()) {
913 		if (opt::verbose)
914 			cerr << "Writing graph search stats to `"
915 				<< opt::tracefilePath << "'\n";
916 		traceStream.open(opt::tracefilePath.c_str());
917 		assert(traceStream.is_open());
918 		ConnectPairsResult::printHeaders(traceStream);
919 		assert_good(traceStream, opt::tracefilePath);
920 	}
921 
922 	ofstream gapStream;
923 	if (!opt::gapfilePath.empty()) {
924 		gapStream.open(opt::gapfilePath.c_str());
925 		assert(gapStream.is_open());
926 		assert_good(gapStream, opt::gapfilePath);
927 	}
928 
929 	string logOutputPath(opt::outputPrefix);
930 	logOutputPath.append("_log.txt");
931 	ofstream logStream(logOutputPath.c_str());
932 	assert_good(logStream, logOutputPath);
933 
934 	string scaffoldOutputPath(opt::outputPrefix);
935 	scaffoldOutputPath.append("_scaffold.fa");
936 	ofstream scaffoldStream(scaffoldOutputPath.c_str());
937 	assert_good(scaffoldStream, scaffoldOutputPath);
938 
939 	string mergedOutputPath(opt::outputPrefix);
940 	mergedOutputPath.append("_merged.fa");
941 	ofstream mergedStream(mergedOutputPath.c_str());
942 	assert_good(mergedStream, mergedOutputPath);
943 
944 	printLog(logStream, "Finding flanks\n");
945 
946 	ConnectPairsParams params;
947 
948 	params.minMergedSeqLen = opt::minFrag;
949 	params.maxMergedSeqLen = opt::maxGapLength + 2 * opt::flankLength;
950 	params.maxPaths = opt::maxPaths;
951 	params.maxBranches = opt::maxBranches;
952 	params.maxCost = opt::maxCost;
953 	params.maxPathMismatches = opt::maxMismatches;
954 	params.maxReadMismatches = opt::maxFlankMismatches;
955 	/*
956 	 * search from the end of the scaffold flank until
957 	 * we find this many matches in a row.
958 	 */
959 	params.kmerMatchesThreshold = 1;
960 	params.fixErrors = opt::fixErrors;
961 	params.maskBases = opt::mask;
962 	params.memLimit = opt::searchMem;
963 	params.dotPath = opt::dotPath;
964 	params.dotStream = opt::dotPath.empty() ? NULL : &dotStream;
965 
966 	map<FastaRecord, map<FastaRecord, Gap> > flanks;
967 	const char* scaffoldInputPath = opt::inputScaffold.c_str();
968 	FastaReader reader1(scaffoldInputPath, FastaReader::FOLD_CASE);
969 	unsigned gapsfound = 0;
970 	string temp;
971 
972 #pragma omp parallel
973 	for (FastaRecord record;;) {
974 		bool good;
975 #pragma omp critical(reader1)
976 		good = reader1 >> record;
977 		if (good) {
978 			findFlanks(record, opt::flankLength, gapsfound, flanks);
979 		}
980 		else {
981 			break;
982 		}
983 	}
984 
985 	temp = IntToString(gapsfound) + " gaps found\n";
986 	printLog(logStream, temp);
987 	temp = IntToString((int)flanks.size()) + " flanks extracted\n\n";
988 	printLog(logStream, temp);
989 
990 	if (opt::printFlanks > 0) {
991 		map<FastaRecord, map<FastaRecord, Gap> >::iterator read1_it;
992 		map<FastaRecord, Gap>::iterator read2_it;
993 
994 		string read1OutputPath(opt::outputPrefix);
995 		read1OutputPath.append("_flanks_1.fa");
996 		ofstream read1Stream(read1OutputPath.c_str());
997 		assert_good(read1Stream, read1OutputPath);
998 
999 		string read2OutputPath(opt::outputPrefix);
1000 		read2OutputPath.append("_flanks_2.fa");
1001 		ofstream read2Stream(read2OutputPath.c_str());
1002 		assert_good(read2Stream, read2OutputPath);
1003 
1004 		for (read1_it = flanks.begin(); read1_it != flanks.end(); read1_it++) {
1005 			FastaRecord read1 = read1_it->first;
1006 			for (read2_it = flanks[read1].begin(); read2_it != flanks[read1].end(); read2_it++) {
1007 				FastaRecord read2 = read2_it->first;
1008 				const Gap& gap = read2_it->second;
1009 
1010 				read1Stream << ">" << read1.id.substr(0,read2.id.length()-2)
1011 					<< "_" << gap.gapStart() << "_" << gap.gapSize() << "/1\n";
1012 				read1Stream << read1.seq << endl;
1013 
1014 				read2Stream << ">" << read2.id.substr(0,read2.id.length()-2)
1015 					<< "_" << gap.gapStart() << "_" << gap.gapSize() << "/2\n";
1016 				read2Stream << read2.seq << endl;
1017 			}
1018 		}
1019 		assert_good(read1Stream, read1OutputPath.c_str());
1020 		read1Stream.close();
1021 		assert_good(read2Stream, read2OutputPath.c_str());
1022 		read2Stream.close();
1023 	}
1024 
1025 	/** map for merged sequence resutls */
1026 	map<string, map<int, ClosedGap> > allmerged;
1027 	unsigned gapsclosed=0;
1028 
1029 	for (unsigned i = 0; i<opt::kvector.size(); i++) {
1030 		opt::k = opt::kvector.at(i);
1031 		Kmer::setLength(opt::k);
1032 
1033 		BloomFilter* bloom;
1034 		CascadingBloomFilter* cascadingBloom = NULL;
1035 
1036 		if (!opt::bloomFilterPaths.empty() && i < opt::bloomFilterPaths.size()) {
1037 
1038 			temp = "Loading bloom filter from `" + opt::bloomFilterPaths.at(i) + "'...\n";
1039 			printLog(logStream, temp);
1040 
1041 			bloom = new BloomFilter();
1042 
1043 			const char* inputPath = opt::bloomFilterPaths.at(i).c_str();
1044 			ifstream inputBloom(inputPath, ios_base::in | ios_base::binary);
1045 			assert_good(inputBloom, inputPath);
1046 			inputBloom >> *bloom;
1047 			assert_good(inputBloom, inputPath);
1048 			inputBloom.close();
1049 		} else {
1050 			printLog(logStream, "Building bloom filter\n");
1051 
1052 			size_t bits = opt::bloomSize * 8 / 2;
1053 			cascadingBloom = new CascadingBloomFilter(bits, opt::max_count);
1054 #ifdef _OPENMP
1055 			ConcurrentBloomFilter<CascadingBloomFilter> cbf(*cascadingBloom, 1000);
1056 			for (int i = optind; i < argc; i++)
1057 				Bloom::loadFile(cbf, opt::k, argv[i], opt::verbose >= 2);
1058 #else
1059 			for (int i = optind; i < argc; i++)
1060 				Bloom::loadFile(*cascadingBloom, opt::k, argv[i], opt::verbose >= 2);
1061 #endif
1062 			bloom = &cascadingBloom->getBloomFilter(opt::max_count - 1);
1063 		}
1064 
1065 		assert(bloom != NULL);
1066 
1067 		if (opt::verbose)
1068 			cerr << "Bloom filter FPR: " << setprecision(3)
1069 				<< 100 * bloom->FPR() << "%\n";
1070 
1071 		DBGBloom<BloomFilter> g(*bloom);
1072 
1073 		temp = "Starting K run with k = " + IntToString(opt::k) + "\n";
1074 		printLog(logStream, temp);
1075 
1076 		kRun(params, opt::k, g, allmerged, flanks, gapsclosed, logStream, traceStream, gapStream);
1077 
1078 		temp = "k" + IntToString(opt::k) + " run complete\n"
1079 				+ "Total gaps closed so far = " + IntToString(gapsclosed) + "\n\n";
1080 		printLog(logStream, temp);
1081 
1082 		if (!opt::inputBloomPath.empty())
1083 			delete bloom;
1084 		else
1085 			delete cascadingBloom;
1086 	}
1087 
1088 	printLog(logStream, "K sweep complete\nCreating new scaffold with gaps closed...\n");
1089 
1090 	map<string, map<int, map<string, string> > >::iterator scaf_it;
1091 	map<int, map<string, string> >::reverse_iterator pos_it;
1092 	FastaReader reader2(scaffoldInputPath, FastaReader::FOLD_CASE);
1093 	unsigned gapsclosedfinal = 0;
1094 
1095 	/** creating new scaffold with gaps closed */
1096 	for (FastaRecord record;;) {
1097 		bool good;
1098 		good = reader2 >> record;
1099 		if (good) {
1100 			insertIntoScaffold(scaffoldStream, mergedStream, record, allmerged, gapsclosedfinal);
1101 		}
1102 		else
1103 			break;
1104 	}
1105 	printLog(logStream, "New scaffold complete\n");
1106 	printLog(logStream, "Gaps closed = " + IntToString(gapsclosed) + "\n");
1107 	logStream << (float)100 * gapsclosed / gapsfound << "%\n\n";
1108 	if (opt::verbose > 0) {
1109 		cerr << (float)100 * gapsclosed / gapsfound << "%\n\n";
1110 	}
1111 
1112 	assert_good(scaffoldStream, scaffoldOutputPath.c_str());
1113 	scaffoldStream.close();
1114 	assert_good(mergedStream, mergedOutputPath.c_str());
1115 	mergedStream.close();
1116 	assert_good(logStream, logOutputPath.c_str());
1117 	logStream.close();
1118 
1119 	if (!opt::dotPath.empty()) {
1120 		assert_good(dotStream, opt::dotPath);
1121 		dotStream.close();
1122 	}
1123 
1124 	if (!opt::tracefilePath.empty()) {
1125 		assert_good(traceStream, opt::tracefilePath);
1126 		traceStream.close();
1127 	}
1128 
1129 	if (!opt::gapfilePath.empty()) {
1130 		assert_good(gapStream, opt::gapfilePath);
1131 		gapStream.close();
1132 	}
1133 
1134 	return 0;
1135 }
1136