1 #ifndef BLOOM_DBG_H
2 #define BLOOM_DBG_H 1
3 
4 #include "config.h"
5 
6 #include "BloomDBG/AssemblyCounters.h"
7 #include "BloomDBG/AssemblyParams.h"
8 #include "BloomDBG/BloomIO.h"
9 #include "BloomDBG/Checkpoint.h"
10 #include "BloomDBG/MaskedKmer.h"
11 #include "BloomDBG/RollingBloomDBG.h"
12 #include "BloomDBG/RollingHash.h"
13 #include "BloomDBG/RollingHashIterator.h"
14 #include "Common/Hash.h"
15 #include "Common/IOUtil.h"
16 #include "Common/Sequence.h"
17 #include "Common/Uncompress.h"
18 #include "Common/UnorderedSet.h"
19 #include "DataLayer/FastaConcat.h"
20 #include "DataLayer/FastaReader.h"
21 #include "Graph/BreadthFirstSearch.h"
22 #include "Graph/ExtendPath.h"
23 #include "Graph/Path.h"
24 #include "vendor/btl_bloomfilter/BloomFilter.hpp"
25 
26 #include <cmath>
27 #include <iomanip>
28 #include <iostream>
29 #include <limits>
30 #include <sstream>
31 #include <string>
32 
33 #if _OPENMP
34 #include <omp.h>
35 #endif
36 
37 #if HAVE_GOOGLE_SPARSE_HASH_MAP
38 
39 #include <google/sparse_hash_set>
40 typedef google::sparse_hash_set<RollingBloomDBGVertex, hash<RollingBloomDBGVertex>> KmerHash;
41 
42 #else
43 
44 #include "Common/UnorderedSet.h"
45 typedef unordered_set<RollingBloomDBGVertex, hash<RollingBloomDBGVertex>> KmerHash;
46 
47 #endif
48 
49 namespace BloomDBG {
50 
51 /**
52  * Type for a vertex in the de Bruijn graph.
53  */
54 typedef RollingBloomDBGVertex Vertex;
55 
56 /**
57  * Return true if all of the k-mers in `seq` are contained in `bloom`
58  * and false otherwise.
59  */
60 template<typename BloomT>
61 inline static bool
allKmersInBloom(const Sequence & seq,const BloomT & bloom)62 allKmersInBloom(const Sequence& seq, const BloomT& bloom)
63 {
64 	const unsigned k = bloom.getKmerSize();
65 	const unsigned numHashes = bloom.getHashNum();
66 	assert(seq.length() >= k);
67 	unsigned validKmers = 0;
68 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end();
69 	     ++it, ++validKmers) {
70 		if (!bloom.contains(*it))
71 			return false;
72 	}
73 	/* if we skipped over k-mers containing non-ACGT chars */
74 	if (validKmers < seq.length() - k + 1)
75 		return false;
76 	return true;
77 }
78 
79 /**
80  * Add all k-mers of a DNA sequence to a Bloom filter.
81  */
82 template<typename BloomT>
83 inline static void
addKmersToBloom(const Sequence & seq,BloomT & bloom)84 addKmersToBloom(const Sequence& seq, BloomT& bloom)
85 {
86 	const unsigned k = bloom.getKmerSize();
87 	const unsigned numHashes = bloom.getHashNum();
88 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
89 		bloom.insert(*it);
90 	}
91 }
92 
93 /**
94  * Returns the sum of all kmer multiplicities in `seq` by querying `bloom`
95  */
96 template<typename CountingBloomT>
97 inline static unsigned
getSeqAbsoluteKmerCoverage(const Sequence & seq,const CountingBloomT & bloom)98 getSeqAbsoluteKmerCoverage(const Sequence& seq, const CountingBloomT& bloom)
99 {
100 	const unsigned k = bloom.getKmerSize();
101 	const unsigned numHashes = bloom.getHashNum();
102 	assert(seq.length() >= k);
103 	unsigned coverage = 0;
104 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end();
105 	     ++it) {
106 		coverage += bloom.minCount(*it);
107 	}
108 	return coverage;
109 }
110 
111 /**
112  * Translate a DNA sequence to an equivalent path in the
113  * de Bruijn graph.
114  */
115 inline static Path<Vertex>
seqToPath(const Sequence & seq,unsigned k,unsigned numHashes)116 seqToPath(const Sequence& seq, unsigned k, unsigned numHashes)
117 {
118 	Path<Vertex> path;
119 	assert(seq.length() >= k);
120 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
121 		path.push_back(Vertex(it.kmer().c_str(), it.rollingHash()));
122 	}
123 	return path;
124 }
125 
126 /**
127  * Translate a path in the de Bruijn graph to an equivalent
128  * DNA sequence.
129  */
130 inline static Sequence
pathToSeq(const Path<Vertex> & path,unsigned k)131 pathToSeq(const Path<Vertex>& path, unsigned k)
132 {
133 	assert(path.size() > 0);
134 	assert(k > 0);
135 
136 	const std::string& spacedSeed = MaskedKmer::mask();
137 	assert(spacedSeed.empty() || spacedSeed.length() == k);
138 	Sequence seq;
139 	seq.resize(path.size() + k - 1, 'N');
140 
141 	for (size_t i = 0; i < path.size(); ++i) {
142 		std::string kmer(path.at(i).kmer().c_str());
143 		for (size_t j = 0; j < k; ++j) {
144 			if (spacedSeed.empty() || spacedSeed.at(j) == '1') {
145 				if (seq.at(i + j) != 'N' && seq.at(i + j) != kmer.at(j)) {
146 					std::cerr << "warning: inconsistent DBG path detected "
147 					             "at position "
148 					          << i + j << ": " << seq.substr(0, i + j) << " (orig base: '"
149 					          << seq.at(i + j) << "'"
150 					          << ", new base: '" << kmer.at(j) << "')" << std::endl;
151 				}
152 				seq.at(i + j) = kmer.at(j);
153 			}
154 		}
155 	}
156 
157 	return seq;
158 }
159 
160 enum SeedType
161 {
162 	ST_BRANCH_KMER = 0,
163 	ST_READ
164 };
165 
166 static inline std::string
seedTypeStr(const SeedType & type)167 seedTypeStr(const SeedType& type)
168 {
169 	switch (type) {
170 	case ST_BRANCH_KMER:
171 		return "BRANCH_KMER";
172 	case ST_READ:
173 		return "READ";
174 	default:
175 		break;
176 	}
177 	assert(false);
178 }
179 
180 /**
181  * Results for the extension of a read segment.
182  * Each instance represents a row in the trace file generated
183  * by the '-T' option for abyss-bloom-dbg.
184  */
185 struct ContigRecord
186 {
187 	/** output FASTA ID for this contig */
188 	size_t contigID;
189 	/** length of contig (bp) */
190 	unsigned length;
191 	/** coverage of contig */
192 	unsigned coverage;
193 	/** FASTA ID of seeding read */
194 	std::string readID;
195 	/** Type of sequence used to seed contig (branch k-mer or full read) */
196 	SeedType seedType;
197 	/** Starting sequence prior to extensions */
198 	Sequence seed;
199 	/** Result code for attempted left sequence extension (e.g. DEAD END) */
200 	PathExtensionResult leftExtensionResult;
201 	/** Result code for attempted left sequence extension (e.g. DEAD END) */
202 	PathExtensionResult rightExtensionResult;
203 
204 	/**
205 	 * True if the extended sequence was excluded from the output contigs
206 	 * because it was redundant. (An identical sequence was generated
207 	 * when extending a previous read.)
208 	 */
209 	bool redundant;
210 
ContigRecordContigRecord211 	ContigRecord()
212 	  : contigID(std::numeric_limits<size_t>::max())
213 	  , readID()
214 	  , leftExtensionResult(std::make_pair(0, ER_DEAD_END))
215 	  , rightExtensionResult(std::make_pair(0, ER_DEAD_END))
216 	  , redundant(false)
217 	{}
218 
printHeadersContigRecord219 	static std::ostream& printHeaders(std::ostream& out)
220 	{
221 		out << "contig_id" << '\t' << "length" << '\t' << "redundant" << '\t' << "read_id" << '\t'
222 		    << "left_result" << '\t' << "left_extension" << '\t' << "right_result" << '\t'
223 		    << "right_extension" << '\t' << "seed_type" << '\t' << "seed_length" << '\t' << "seed"
224 		    << '\n';
225 		return out;
226 	}
227 
228 	friend std::ostream& operator<<(std::ostream& out, const ContigRecord& o)
229 	{
230 		if (o.redundant)
231 			out << "NA" << '\t';
232 		else
233 			out << o.contigID << '\t';
234 
235 		out << o.length << '\t' << o.redundant << '\t' << o.readID << '\t';
236 
237 		if (o.leftExtensionResult.first > 0)
238 			out << pathExtensionResultStr(o.leftExtensionResult.second) << '\t'
239 			    << o.leftExtensionResult.first << '\t';
240 		else
241 			out << "NA" << '\t' << "NA" << '\t';
242 
243 		if (o.rightExtensionResult.first > 0)
244 			out << pathExtensionResultStr(o.rightExtensionResult.second) << '\t'
245 			    << o.rightExtensionResult.first << '\t';
246 		else
247 			out << "NA" << '\t' << "NA" << '\t';
248 
249 		out << seedTypeStr(o.seedType) << '\t' << o.seed.length() << '\t' << o.seed << '\n';
250 
251 		return out;
252 	}
253 };
254 
255 enum ReadResult
256 {
257 	RR_UNINITIALIZED = 0,
258 	RR_SHORTER_THAN_K,
259 	RR_NON_ACGT,
260 	RR_BLUNT_END,
261 	RR_NOT_SOLID,
262 	RR_ALL_KMERS_VISITED,
263 	RR_ALL_BRANCH_KMERS_VISITED,
264 	RR_GENERATED_CONTIGS
265 };
266 
267 static inline std::string
readResultStr(const ReadResult & result)268 readResultStr(const ReadResult& result)
269 {
270 	switch (result) {
271 	case RR_UNINITIALIZED:
272 		return "NA";
273 	case RR_SHORTER_THAN_K:
274 		return "SHORTER_THAN_K";
275 	case RR_NON_ACGT:
276 		return "NON_ACGT";
277 	case RR_BLUNT_END:
278 		return "BLUNT_END";
279 	case RR_NOT_SOLID:
280 		return "NOT_SOLID";
281 	case RR_ALL_KMERS_VISITED:
282 		return "ALL_KMERS_VISITED";
283 	case RR_ALL_BRANCH_KMERS_VISITED:
284 		return "ALL_BRANCH_KMERS_VISITED";
285 	case RR_GENERATED_CONTIGS:
286 		return "GENERATED_CONTIGS";
287 	default:
288 		break;
289 	}
290 	assert(false);
291 }
292 
293 /**
294  * Records results of processing a sequencing read.
295  * Each instance represents a row in the read log file generated
296  * by the `--read-log` option of `abyss-bloom-dbg`.
297  */
298 struct ReadRecord
299 {
300 	/** FASTA ID for this read */
301 	std::string readID;
302 	/** outcome of processing the read */
303 	ReadResult result;
304 
ReadRecordReadRecord305 	ReadRecord()
306 	  : readID()
307 	  , result(RR_UNINITIALIZED)
308 	{}
309 
ReadRecordReadRecord310 	ReadRecord(const std::string& readID)
311 	  : readID(readID)
312 	  , result(RR_UNINITIALIZED)
313 	{}
314 
ReadRecordReadRecord315 	ReadRecord(const std::string& readID, ReadResult result)
316 	  : readID(readID)
317 	  , result(result)
318 	{}
319 
printHeadersReadRecord320 	static std::ostream& printHeaders(std::ostream& out)
321 	{
322 		out << "read_id" << '\t' << "result" << '\n';
323 		return out;
324 	}
325 
326 	friend std::ostream& operator<<(std::ostream& out, const ReadRecord& o)
327 	{
328 		out << o.readID << '\t' << readResultStr(o.result) << '\n';
329 
330 		return out;
331 	}
332 };
333 
334 /** Print an intermediate progress message during assembly */
335 void
readsProgressMessage(AssemblyCounters counters)336 readsProgressMessage(AssemblyCounters counters)
337 {
338 	std::cerr << "Processed " << counters.readsProcessed << " reads"
339 	          << ", solid reads: " << counters.solidReads << " (" << std::setprecision(3)
340 	          << (float)100 * counters.solidReads / counters.readsProcessed << "%)"
341 	          << ", visited reads: " << counters.visitedReads << " (" << std::setprecision(3)
342 	          << (float)100 * counters.visitedReads / counters.readsProcessed << "%)" << std::endl;
343 }
344 
345 /** Print an intermediate progress message during assembly */
346 void
basesProgressMessage(AssemblyCounters counters)347 basesProgressMessage(AssemblyCounters counters)
348 {
349 	std::cerr << "Assembled " << counters.basesAssembled << " bp in " << counters.contigID + 1
350 	          << " contigs" << std::endl;
351 }
352 
353 /**
354  * Split a path at branching k-mers (degree > 2).
355  */
356 template<typename GraphT>
357 inline static std::vector<Path<typename boost::graph_traits<GraphT>::vertex_descriptor>>
splitPath(const Path<typename boost::graph_traits<GraphT>::vertex_descriptor> & path,const GraphT & dbg,unsigned minBranchLen)358 splitPath(
359     const Path<typename boost::graph_traits<GraphT>::vertex_descriptor>& path,
360     const GraphT& dbg,
361     unsigned minBranchLen)
362 {
363 	assert(path.size() > 0);
364 
365 	typedef typename boost::graph_traits<GraphT>::vertex_descriptor V;
366 	typedef typename Path<V>::const_iterator PathIt;
367 
368 	std::vector<Path<V>> splitPaths;
369 	Path<V> currentPath;
370 	for (PathIt it = path.begin(); it != path.end(); ++it) {
371 		currentPath.push_back(*it);
372 		unsigned inDegree = trueBranches(*it, REVERSE, dbg, minBranchLen).size();
373 		unsigned outDegree = trueBranches(*it, FORWARD, dbg, minBranchLen).size();
374 		if (inDegree > 1 || outDegree > 1) {
375 			/* we've hit a branching point -- end the current
376 			 * path and start a new one */
377 			splitPaths.push_back(currentPath);
378 			currentPath.clear();
379 			currentPath.push_back(*it);
380 		}
381 	}
382 	if (currentPath.size() > 1 || splitPaths.empty())
383 		splitPaths.push_back(currentPath);
384 
385 	assert(splitPaths.size() >= 1);
386 	return splitPaths;
387 }
388 
389 /**
390  * Trim a sequence down to the longest contiguous subsequence
391  * of "good" k-mers.  If the sequence has length < k or contains
392  * no good k-mers, the trimmed sequence will be the empty string.
393  *
394  * @param seq the DNA sequence to be trimmed
395  * @param goodKmerSet Bloom filter containing "good" k-mers
396  */
397 template<typename BloomT>
398 static inline void
trimSeq(Sequence & seq,const BloomT & goodKmerSet)399 trimSeq(Sequence& seq, const BloomT& goodKmerSet)
400 {
401 	const unsigned k = goodKmerSet.getKmerSize();
402 	const unsigned numHashes = goodKmerSet.getHashNum();
403 
404 	if (seq.length() < k) {
405 		seq.clear();
406 		return;
407 	}
408 
409 	const unsigned UNSET = UINT_MAX;
410 	unsigned prevPos = UNSET;
411 	unsigned matchStart = UNSET;
412 	unsigned matchLen = 0;
413 	unsigned maxMatchStart = UNSET;
414 	unsigned maxMatchLen = 0;
415 
416 	/* note: RollingHashIterator skips over k-mer
417 	 * positions with non-ACGT chars */
418 	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end();
419 	     prevPos = it.pos(), ++it) {
420 		if (!goodKmerSet.contains(*it) || (prevPos != UNSET && it.pos() - prevPos > 1)) {
421 			/* end any previous match */
422 			if (matchStart != UNSET && matchLen > maxMatchLen) {
423 				maxMatchLen = matchLen;
424 				maxMatchStart = matchStart;
425 			}
426 			matchStart = UNSET;
427 			matchLen = 0;
428 		}
429 		if (goodKmerSet.contains(*it)) {
430 			/* initiate or extend match */
431 			if (matchStart == UNSET)
432 				matchStart = it.pos();
433 			matchLen++;
434 		}
435 	}
436 	/* handles case last match extends to end of seq */
437 	if (matchStart != UNSET && matchLen > maxMatchLen) {
438 		maxMatchLen = matchLen;
439 		maxMatchStart = matchStart;
440 	}
441 	/* if there were no matching k-mers */
442 	if (maxMatchLen == 0) {
443 		seq.clear();
444 		return;
445 	}
446 	/* trim read down to longest matching subseq */
447 	seq = seq.substr(maxMatchStart, maxMatchLen + k - 1);
448 }
449 
450 /**
451  * Append a contig to the output FASTA stream.
452  */
453 inline static void
printContig(const Sequence & seq,unsigned length,unsigned coverage,size_t contigID,const std::string & readID,unsigned k,std::ostream & out)454 printContig(
455     const Sequence& seq,
456     unsigned length,
457     unsigned coverage,
458     size_t contigID,
459     const std::string& readID,
460     unsigned k,
461     std::ostream& out)
462 {
463 	assert(seq.length() >= k);
464 
465 	FastaRecord contig;
466 
467 	/* set FASTA id */
468 	std::ostringstream id;
469 	id << contigID;
470 
471 	/* add FASTA comment indicating extended read id */
472 	std::ostringstream comment;
473 	comment << length << ' ' << coverage << ' ';
474 	comment << "read:" << readID;
475 	assert(id.good());
476 	contig.id = id.str();
477 	contig.comment = comment.str();
478 
479 	/* set seq */
480 	contig.seq = seq;
481 
482 	/* output FASTQ record */
483 	out << contig;
484 	assert(out);
485 }
486 
487 /**
488  * Return true if the left end of the given sequence is a blunt end
489  * in the Bloom filterde Bruijn graph. PRECONDITION: `seq` does not contain
490  * any non-ACGT chars.
491  */
492 template<typename GraphT>
493 inline static unsigned
leftIsBluntEnd(const Sequence & seq,const GraphT & graph,const AssemblyParams & params)494 leftIsBluntEnd(const Sequence& seq, const GraphT& graph, const AssemblyParams& params)
495 {
496 	unsigned k = params.k;
497 	unsigned numHashes = params.numHashes;
498 	unsigned fpLookAhead = 5;
499 
500 	if (seq.length() < k)
501 		return false;
502 
503 	Sequence firstKmerStr(seq, 0, k);
504 	Path<Vertex> path = seqToPath(firstKmerStr, k, numHashes);
505 	Vertex& firstKmer = path.front();
506 
507 	return !lookAhead(firstKmer, REVERSE, fpLookAhead, graph);
508 }
509 
510 /**
511  * Return true if the left and/or right end of the sequence is a blunt
512  * end in the de Bruijn graph.  Such reads likely have read errors
513  * and thus we should not try to extend these into contings.
514  * When testing for a blunt end, we account for the possibility of
515  * false positive extensions by using a small amount of look-ahead
516  * (as dictated by params.fpLookAhead).
517  */
518 template<typename GraphT>
519 inline static bool
hasBluntEnd(const Sequence & seq,const GraphT & graph,const AssemblyParams & params)520 hasBluntEnd(const Sequence& seq, const GraphT& graph, const AssemblyParams& params)
521 {
522 	if (leftIsBluntEnd(seq, graph, params))
523 		return true;
524 
525 	Sequence rc = reverseComplement(seq);
526 	if (leftIsBluntEnd(rc, graph, params))
527 		return true;
528 
529 	return false;
530 }
531 
532 /**
533  * Output a contig sequence if it is not redundant, i.e. it has not already
534  * been generated from a different read / thread of execution.
535  */
536 template<typename SolidKmerSetT, typename AssembledKmerSetT, typename AssemblyStreamsT>
537 inline static void
outputContig(const Path<Vertex> & contigPath,ContigRecord & rec,SolidKmerSetT & solidKmerSet,AssembledKmerSetT & assembledKmerSet,KmerHash & contigEndKmers,const AssemblyParams & params,AssemblyCounters & counters,AssemblyStreamsT & streams)538 outputContig(
539     const Path<Vertex>& contigPath,
540     ContigRecord& rec,
541     SolidKmerSetT& solidKmerSet,
542     AssembledKmerSetT& assembledKmerSet,
543     KmerHash& contigEndKmers,
544     const AssemblyParams& params,
545     AssemblyCounters& counters,
546     AssemblyStreamsT& streams)
547 {
548 	const unsigned fpLookAhead = 5;
549 
550 	Sequence seq = pathToSeq(contigPath, params.k);
551 
552 	/* vertices representing start/end k-mers of contig */
553 
554 	Sequence kmer1 = seq.substr(0, params.k);
555 	canonicalize(kmer1);
556 	RollingHash hash1(kmer1.c_str(), params.numHashes, params.k);
557 	Vertex v1(kmer1.c_str(), hash1);
558 
559 	Sequence kmer2 = seq.substr(seq.length() - params.k);
560 	canonicalize(kmer2);
561 	RollingHash hash2(kmer2.c_str(), params.numHashes, params.k);
562 	Vertex v2(kmer2.c_str(), hash2);
563 
564 	bool redundant = false;
565 #pragma omp critical(redundancyCheck)
566 	{
567 		/*
568 		 * If we use `assembledKmerSet` to check very short contigs,
569 		 * we may get full-length matches purely due to Bloom filter
570 		 * positives.  For such contigs, we additionally track
571 		 * the start and end in a separate hash table called
572 		 * `contigEndKmers`.
573 		 */
574 		if (seq.length() < params.k + fpLookAhead - 1) {
575 
576 			if (contigEndKmers.find(v1) != contigEndKmers.end() &&
577 			    contigEndKmers.find(v2) != contigEndKmers.end()) {
578 				redundant = true;
579 			} else {
580 				contigEndKmers.insert(v1);
581 				contigEndKmers.insert(v2);
582 			}
583 
584 		} else if (allKmersInBloom(seq, assembledKmerSet)) {
585 			redundant = true;
586 		}
587 
588 		if (!redundant) {
589 
590 			/* mark remaining k-mers as assembled */
591 			addKmersToBloom(seq, assembledKmerSet);
592 		}
593 	}
594 	rec.redundant = redundant;
595 
596 	if (!redundant) {
597 #pragma omp critical(fasta)
598 		{
599 			rec.length = seq.length();
600 			rec.coverage = getSeqAbsoluteKmerCoverage(seq, solidKmerSet);
601 
602 			/* add contig to output FASTA */
603 			printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.out);
604 
605 			/* add contig to checkpoint FASTA file */
606 			if (params.checkpointsEnabled())
607 				printContig(seq, rec.length, rec.coverage, counters.contigID, rec.readID, params.k, streams.checkpointOut);
608 
609 			rec.contigID = counters.contigID;
610 
611 			counters.contigID++;
612 			counters.basesAssembled += seq.length();
613 		}
614 	}
615 
616 #pragma omp critical(trace)
617 	streams.traceOut << rec;
618 }
619 
620 enum ContigType
621 {
622 	CT_LINEAR,
623 	CT_CIRCULAR,
624 	CT_HAIRPIN
625 };
626 
627 template<typename GraphT>
628 static inline ContigType
getContigType(const Path<Vertex> & contigPath,const GraphT & dbg)629 getContigType(const Path<Vertex>& contigPath, const GraphT& dbg)
630 {
631 	if (edge(contigPath.back(), contigPath.front(), dbg).second) {
632 
633 		Vertex v = contigPath.front().clone();
634 		v.shift(ANTISENSE, contigPath.back().kmer().getBase(0));
635 
636 		if (v.kmer() == contigPath.back().kmer())
637 			return CT_CIRCULAR;
638 		else
639 			return CT_HAIRPIN;
640 	}
641 
642 	return CT_LINEAR;
643 }
644 
645 /** Special case behaviour for circular/hairpin contigs */
646 template<typename GraphT>
647 static inline void
preprocessCircularContig(Path<Vertex> & contigPath,const GraphT & dbg,unsigned trim)648 preprocessCircularContig(Path<Vertex>& contigPath, const GraphT& dbg, unsigned trim)
649 {
650 	assert(!contigPath.empty());
651 
652 	ContigType contigType = getContigType(contigPath, dbg);
653 	assert(contigType != CT_LINEAR);
654 
655 	if (contigPath.size() <= 2)
656 		return;
657 
658 	/* longest branch of Bloom filter positives */
659 	const unsigned fpTrim = 5;
660 
661 	/*
662 	 * Note: A circular/hairpin contig contains at most two branch k-mers.
663 	 * And if it does contain branch k-mers, they will be the first/last
664 	 * k-mers.
665 	 *
666 	 * If only one end of the circular/hairpin contig is a branch k-mer,
667 	 * add that k-mer to the other end of the contig as well. This
668 	 * enables the usual branch k-mer trimming logic for linear contigs to
669 	 * produce the correct results downstream.
670 	 */
671 
672 	bool branchStart = ambiguous(contigPath.front(), FORWARD, dbg, trim, fpTrim) ||
673 	                   ambiguous(contigPath.front(), REVERSE, dbg, trim, fpTrim);
674 
675 	bool branchEnd = ambiguous(contigPath.back(), FORWARD, dbg, trim, fpTrim) ||
676 	                 ambiguous(contigPath.back(), REVERSE, dbg, trim, fpTrim);
677 
678 	if (branchStart && !branchEnd) {
679 
680 		if (contigType == CT_CIRCULAR) {
681 			contigPath.push_back(contigPath.front());
682 		} else {
683 			assert(contigType == CT_HAIRPIN);
684 			Vertex rc = contigPath.front().clone();
685 			rc.reverseComplement();
686 			contigPath.push_back(rc);
687 		}
688 
689 	} else if (!branchStart && branchEnd) {
690 
691 		if (contigType == CT_CIRCULAR) {
692 			contigPath.push_front(contigPath.back());
693 		} else {
694 			assert(contigType == CT_HAIRPIN);
695 			Vertex rc = contigPath.back().clone();
696 			rc.reverseComplement();
697 			contigPath.push_front(rc);
698 		}
699 	}
700 }
701 
702 /**
703  * Ensure that branching k-mers are not repeated in the output
704  * contigs by selectively trimming contig ends.
705  *
706  * The idea is to keep a branch k-mer at the end of contig only
707  * if the edge leading up to it is unambigous. For example, in the
708  * diagram below the contig generated from the right side would
709  * include the branching k-mer k5, whereas the two contigs entering
710  * on the left would discard it:
711  *
712  * ...-k1-k2
713  *          \
714  *           k5-k6-...
715  *          /
716  * ...-k3-k4
717  *
718  * If a branch k-mer has both indegree > 1 and outdegree > 1, it is
719  * output separately as its own contig of length k.
720  */
721 template<typename GraphT>
722 inline static void
trimBranchKmers(Path<Vertex> & contigPath,const GraphT & dbg,unsigned trim)723 trimBranchKmers(Path<Vertex>& contigPath, const GraphT& dbg, unsigned trim)
724 {
725 	assert(!contigPath.empty());
726 
727 	if (contigPath.size() == 1)
728 		return;
729 
730 	/* special case: circular/hairpin contigs */
731 
732 	ContigType contigType = getContigType(contigPath, dbg);
733 	if (contigType == CT_CIRCULAR || contigType == CT_HAIRPIN)
734 		preprocessCircularContig(contigPath, dbg, trim);
735 
736 	unsigned l = contigPath.size();
737 
738 	/* longest branch of Bloom filter false positives */
739 	const unsigned fpTrim = 5;
740 
741 	bool ambiguous1 = ambiguous(contigPath.at(0), contigPath.at(1), FORWARD, dbg, trim, fpTrim);
742 
743 	bool ambiguous2 =
744 	    ambiguous(contigPath.at(l - 1), contigPath.at(l - 2), REVERSE, dbg, trim, fpTrim);
745 
746 	if (ambiguous1)
747 		contigPath.pop_front();
748 
749 	assert(!contigPath.empty());
750 
751 	if (ambiguous2)
752 		contigPath.pop_back();
753 
754 	assert(!contigPath.empty());
755 }
756 
757 bool
isTip(unsigned length,PathExtensionResultCode leftResult,PathExtensionResultCode rightResult,unsigned trim)758 isTip(
759     unsigned length,
760     PathExtensionResultCode leftResult,
761     PathExtensionResultCode rightResult,
762     unsigned trim)
763 {
764 	if (length > trim)
765 		return false;
766 
767 	if (leftResult == ER_DEAD_END && (rightResult == ER_DEAD_END || rightResult == ER_AMBI_IN))
768 		return true;
769 
770 	if (rightResult == ER_DEAD_END && (leftResult == ER_DEAD_END || leftResult == ER_AMBI_IN))
771 		return true;
772 
773 	return false;
774 }
775 
776 /**
777  * Decide if a read should be extended and if so extend it into a contig.
778  */
779 template<typename SolidKmerSetT, typename AssembledKmerSetT, typename AssemblyStreamsT>
780 static inline ReadRecord
processRead(const FastaRecord & rec,const SolidKmerSetT & solidKmerSet,AssembledKmerSetT & assembledKmerSet,KmerHash & contigEndKmers,KmerHash & visitedBranchKmers,const AssemblyParams & params,AssemblyCounters & counters,AssemblyStreamsT & streams)781 processRead(
782     const FastaRecord& rec,
783     const SolidKmerSetT& solidKmerSet,
784     AssembledKmerSetT& assembledKmerSet,
785     KmerHash& contigEndKmers,
786     KmerHash& visitedBranchKmers,
787     const AssemblyParams& params,
788     AssemblyCounters& counters,
789     AssemblyStreamsT& streams)
790 {
791 	(void)visitedBranchKmers;
792 
793 	typedef typename Path<Vertex>::iterator PathIt;
794 
795 	/* Boost graph API for Bloom filter */
796 	RollingBloomDBG<SolidKmerSetT> dbg(solidKmerSet);
797 
798 	unsigned k = params.k;
799 	const Sequence& seq = rec.seq;
800 
801 	/* we can't extend reads shorter than k */
802 	if (seq.length() < k)
803 		return ReadRecord(rec.id, RR_SHORTER_THAN_K);
804 
805 	/* skip reads with non-ACGT chars */
806 	if (!allACGT(seq))
807 		return ReadRecord(rec.id, RR_NON_ACGT);
808 
809 	/* don't extend reads that are tips */
810 	if (hasBluntEnd(seq, dbg, params))
811 		return ReadRecord(rec.id, RR_BLUNT_END);
812 
813 	/* only extend "solid" reads */
814 	if (!allKmersInBloom(seq, solidKmerSet))
815 		return ReadRecord(rec.id, RR_NOT_SOLID);
816 
817 #pragma omp atomic
818 	counters.solidReads++;
819 
820 	/* skip reads in previously assembled regions */
821 	if (allKmersInBloom(seq, assembledKmerSet)) {
822 #pragma omp atomic
823 		counters.visitedReads++;
824 		return ReadRecord(rec.id, RR_ALL_KMERS_VISITED);
825 	}
826 
827 	/*
828 	 * We use `assembledKmers` to track read k-mers
829 	 * that have already been included in the output contigs, and
830 	 * therefore do not need to be processed (extended) again.
831 	 * Note that we do not use `assembledKmerSet` (a Bloom filter)
832 	 * for this purpose because Bloom filter false positives
833 	 * may cause us to omit short contigs (e.g. contigs with length k).
834 	 */
835 	unordered_set<Vertex> assembledKmers;
836 
837 	Path<Vertex> path = seqToPath(rec.seq, params.k, params.numHashes);
838 	for (PathIt it = path.begin(); it != path.end(); ++it) {
839 
840 		if (assembledKmers.find(*it) != assembledKmers.end())
841 			continue;
842 
843 		ExtendPathParams extendParams;
844 		extendParams.trimLen = params.trim;
845 		extendParams.fpTrim = 5;
846 		extendParams.maxLen = NO_LIMIT;
847 		extendParams.lookBehind = true;
848 		extendParams.lookBehindStartVertex = false;
849 
850 		ContigRecord contigRec;
851 		contigRec.readID = rec.id;
852 		contigRec.seedType = ST_READ;
853 		contigRec.seed = it->kmer().c_str();
854 
855 		Path<Vertex> contigPath;
856 		contigPath.push_back(*it);
857 
858 		contigRec.leftExtensionResult = extendPath(contigPath, REVERSE, dbg, extendParams);
859 
860 		contigRec.rightExtensionResult = extendPath(contigPath, FORWARD, dbg, extendParams);
861 
862 		PathExtensionResultCode leftResult = contigRec.leftExtensionResult.second;
863 		PathExtensionResultCode rightResult = contigRec.rightExtensionResult.second;
864 
865 		if (!isTip(contigPath.size(), leftResult, rightResult, params.trim)) {
866 			/* selectively trim branch k-mers from contig ends */
867 			trimBranchKmers(contigPath, dbg, params.trim);
868 
869 			/* output contig to FASTA file */
870 			outputContig(
871 			    contigPath, contigRec, solidKmerSet, assembledKmerSet, contigEndKmers, params, counters, streams);
872 		}
873 
874 		/* mark contig k-mers as visited */
875 		for (PathIt it2 = contigPath.begin(); it2 != contigPath.end(); ++it2)
876 			assembledKmers.insert(*it2);
877 	}
878 
879 	return ReadRecord(rec.id, RR_GENERATED_CONTIGS);
880 }
881 
882 /**
883  * Perform a Bloom-filter-based de Bruijn graph assembly.
884  * Contigs are generated by extending reads left/right within
885  * the de Bruijn graph, up to the next branching point or dead end.
886  * Short branches due to Bloom filter false positives are
887  * ignored.
888  *
889  * @param argc number of input FASTA files
890  * @param argv array of input FASTA filenames
891  * @param genomeSize approx genome size
892  * @param goodKmerSet Bloom filter containing k-mers that
893  * occur more than once in the input data
894  * @param out output stream for contigs (FASTA)
895  * @param verbose set to true to print progress messages to
896  * STDERR
897  */
898 template<typename SolidKmerSetT>
899 inline static void
assemble(int argc,char ** argv,SolidKmerSetT & solidKmerSet,const AssemblyParams & params,std::ostream & out)900 assemble(
901     int argc,
902     char** argv,
903     SolidKmerSetT& solidKmerSet,
904     const AssemblyParams& params,
905     std::ostream& out)
906 {
907 	/* k-mers in previously assembled contigs */
908 	BloomFilter assembledKmerSet(
909 	    solidKmerSet.size(), solidKmerSet.getHashNum(), solidKmerSet.getKmerSize());
910 
911 	/* counters for progress messages */
912 	AssemblyCounters counters;
913 
914 	/* input reads */
915 	FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
916 
917 	/* duplicate FASTA output for checkpoints */
918 	std::ofstream checkpointOut;
919 	if (params.checkpointsEnabled()) {
920 		assert(!params.checkpointPathPrefix.empty());
921 		std::string path = params.checkpointPathPrefix + CHECKPOINT_FASTA_EXT + CHECKPOINT_TMP_EXT;
922 		checkpointOut.open(path.c_str());
923 		assert_good(checkpointOut, path);
924 	}
925 
926 	/* trace file output ('-T' option) */
927 	std::ofstream traceOut;
928 	if (!params.tracePath.empty()) {
929 		traceOut.open(params.tracePath.c_str());
930 		assert_good(traceOut, params.tracePath);
931 		ContigRecord::printHeaders(traceOut);
932 		assert_good(traceOut, params.tracePath);
933 	}
934 
935 	/* logs outcome of processing of each read (`--read-log`) */
936 	std::ofstream readLogOut;
937 	if (!params.readLogPath.empty()) {
938 		readLogOut.open(params.readLogPath.c_str());
939 		assert_good(readLogOut, params.readLogPath);
940 		ReadRecord::printHeaders(readLogOut);
941 		assert_good(readLogOut, params.readLogPath);
942 	}
943 
944 	/* bundle output streams */
945 	AssemblyStreams<FastaConcat> streams(in, out, checkpointOut, traceOut, readLogOut);
946 
947 	/* run the assembly */
948 	assemble(solidKmerSet, assembledKmerSet, counters, params, streams);
949 }
950 
951 /**
952  * Perform a Bloom-filter-based de Bruijn graph assembly.
953  * Contigs are generated by extending reads left/right within
954  * the de Bruijn graph, up to the next branching point or dead end.
955  * Short branches due to Bloom filter false positives are
956  * ignored.
957  *
958  * @param argc number of input FASTA files
959  * @param argv array of input FASTA filenames
960  * @param genomeSize approx genome size
961  * @param goodKmerSet Bloom filter containing k-mers that
962  * occur more than once in the input data
963  * @param assembledKmerSet Bloom filter containing k-mers that have
964  * already been included in contigs
965  * @param in input stream for sequencing reads (FASTA)
966  * @param out output stream for contigs (FASTA)
967  * @param verbose set to true to print progress messages to
968  * STDERR
969  */
970 template<typename SolidKmerSetT, typename AssembledKmerSetT, typename InputReadStreamT>
971 inline static void
assemble(const SolidKmerSetT & goodKmerSet,AssembledKmerSetT & assembledKmerSet,AssemblyCounters & counters,const AssemblyParams & params,AssemblyStreams<InputReadStreamT> & streams)972 assemble(
973     const SolidKmerSetT& goodKmerSet,
974     AssembledKmerSetT& assembledKmerSet,
975     AssemblyCounters& counters,
976     const AssemblyParams& params,
977     AssemblyStreams<InputReadStreamT>& streams)
978 {
979 	assert(params.initialized());
980 
981 	/* per-thread I/O buffer (size is in bases) */
982 	const size_t SEQ_BUFFER_SIZE = 1000000;
983 
984 	if (params.verbose)
985 		std::cerr << "Trimming branches " << params.trim << " k-mers or shorter" << std::endl;
986 
987 	InputReadStreamT& in = streams.in;
988 	std::ostream& checkpointOut = streams.checkpointOut;
989 
990 	KmerHash contigEndKmers;
991 	contigEndKmers.rehash((size_t)pow(2, 28));
992 
993 	KmerHash visitedBranchKmers;
994 
995 	/*
996 	 * Print a progress message whenever `READS_PROGRESS_STEP`
997 	 * reads have been processed or `BASES_PROGRESS_STEP`
998 	 * bases have been assembled. The purpose of using
999 	 * two measures of progess is to show steady updates
1000 	 * throughout the assembly process.
1001 	 */
1002 
1003 	const size_t READS_PROGRESS_STEP = 100000;
1004 	const size_t BASES_PROGRESS_STEP = 1000000;
1005 	size_t basesProgressLine = BASES_PROGRESS_STEP;
1006 
1007 	while (true) {
1008 		size_t readsUntilCheckpoint = params.readsPerCheckpoint;
1009 
1010 #pragma omp parallel
1011 		for (std::vector<FastaRecord> buffer;;) {
1012 			/* read sequences in batches to reduce I/O contention */
1013 			buffer.clear();
1014 			size_t bufferSize;
1015 			bool good = true;
1016 #pragma omp critical(in)
1017 			for (bufferSize = 0; bufferSize < SEQ_BUFFER_SIZE && readsUntilCheckpoint > 0;) {
1018 				FastaRecord rec;
1019 				good = in >> rec;
1020 				if (!good)
1021 					break;
1022 #pragma omp atomic
1023 				readsUntilCheckpoint--;
1024 				buffer.push_back(rec);
1025 				bufferSize += rec.seq.length();
1026 			}
1027 			if (buffer.size() == 0)
1028 				break;
1029 
1030 			for (std::vector<FastaRecord>::iterator it = buffer.begin(); it != buffer.end(); ++it) {
1031 				ReadRecord result = processRead(
1032 				    *it,
1033 				    goodKmerSet,
1034 				    assembledKmerSet,
1035 				    contigEndKmers,
1036 				    visitedBranchKmers,
1037 				    params,
1038 				    counters,
1039 				    streams);
1040 
1041 #pragma omp critical(readsProgress)
1042 				{
1043 					++counters.readsProcessed;
1044 					if (params.verbose && counters.readsProcessed % READS_PROGRESS_STEP == 0)
1045 						readsProgressMessage(counters);
1046 				}
1047 
1048 				if (params.verbose)
1049 #pragma omp critical(basesProgress)
1050 				{
1051 					if (counters.basesAssembled >= basesProgressLine) {
1052 						basesProgressMessage(counters);
1053 						while (counters.basesAssembled >= basesProgressLine)
1054 							basesProgressLine += BASES_PROGRESS_STEP;
1055 					}
1056 				}
1057 
1058 				if (!params.readLogPath.empty()) {
1059 #pragma omp critical(readLog)
1060 					streams.readLogOut << result;
1061 				}
1062 			}
1063 
1064 		} /* for batch of reads between I/O operations */
1065 
1066 		if (readsUntilCheckpoint > 0) {
1067 			assert(in.eof());
1068 			break;
1069 		}
1070 
1071 		/* save assembly state to checkpoint files */
1072 		checkpointOut.flush();
1073 		createCheckpoint(goodKmerSet, assembledKmerSet, counters, params);
1074 
1075 	} /* for each batch of reads between checkpoints */
1076 
1077 	assert(in.eof());
1078 
1079 	if (params.verbose) {
1080 		readsProgressMessage(counters);
1081 		basesProgressMessage(counters);
1082 		std::cerr << "Assembly complete" << std::endl;
1083 	}
1084 
1085 	if (params.checkpointsEnabled() && !params.keepCheckpoint)
1086 		removeCheckpointData(params);
1087 }
1088 
1089 /**
1090  * Visitor class that outputs visited nodes/edges in GraphViz format during
1091  * a breadth first traversal. An instance of this class may be passed
1092  * as an argument to the `breadthFirstSearch` function.
1093  */
1094 template<typename GraphT>
1095 class GraphvizBFSVisitor : public DefaultBFSVisitor<GraphT>
1096 {
1097 	typedef typename boost::graph_traits<GraphT>::vertex_descriptor VertexT;
1098 	typedef typename boost::graph_traits<GraphT>::edge_descriptor EdgeT;
1099 
1100   public:
1101 	/** Constructor */
GraphvizBFSVisitor(std::ostream & out)1102 	GraphvizBFSVisitor(std::ostream& out)
1103 	  : m_out(out)
1104 	  , m_nodesVisited(0)
1105 	  , m_edgesVisited(0)
1106 	{
1107 		/* start directed graph (GraphViz) */
1108 		m_out << "digraph g {\n";
1109 	}
1110 
1111 	/** Destructor */
~GraphvizBFSVisitor()1112 	~GraphvizBFSVisitor()
1113 	{
1114 		/* end directed graph (GraphViz) */
1115 		m_out << "}\n";
1116 	}
1117 
1118 	/** Invoked when a vertex is visited for the first time */
discover_vertex(const VertexT & v,const GraphT &)1119 	BFSVisitorResult discover_vertex(const VertexT& v, const GraphT&)
1120 	{
1121 		++m_nodesVisited;
1122 		/* declare vertex (GraphViz) */
1123 		m_out << '\t' << v.kmer().c_str() << ";\n";
1124 
1125 		return BFS_SUCCESS;
1126 	}
1127 
1128 	/**
1129 	 * Invoked when an edge is traversed. (Each edge
1130 	 * in the graph is traversed exactly once.)
1131 	 */
examine_edge(const EdgeT & e,const GraphT & g)1132 	BFSVisitorResult examine_edge(const EdgeT& e, const GraphT& g)
1133 	{
1134 		++m_edgesVisited;
1135 		const VertexT& u = source(e, g);
1136 		const VertexT& v = target(e, g);
1137 
1138 		/* declare edge (GraphViz) */
1139 		m_out << '\t' << u.kmer().c_str() << " -> " << v.kmer().c_str() << ";\n";
1140 
1141 		return BFS_SUCCESS;
1142 	}
1143 
1144 	/** Return number of distinct nodes visited */
getNumNodesVisited()1145 	size_t getNumNodesVisited() const { return m_nodesVisited; }
1146 
1147 	/** Get number of distinct edges visited */
getNumEdgesVisited()1148 	size_t getNumEdgesVisited() const { return m_edgesVisited; }
1149 
1150   protected:
1151 	/** output stream for GraphViz serialization */
1152 	std::ostream& m_out;
1153 	/** number of nodes visited so far */
1154 	size_t m_nodesVisited;
1155 	/** number of edges visited so far */
1156 	size_t m_edgesVisited;
1157 };
1158 
1159 /**
1160  * Output a GraphViz serialization of the de Bruijn graph
1161  * using FASTA files and a Bloom filter as input.
1162  *
1163  * @param argc number of input FASTA files
1164  * @param argv array of input FASTA filenames
1165  * @param kmerSet Bloom filter containing valid k-mers
1166  * @param out output stream for GraphViz serialization
1167  * @param verbose prints progress messages to STDERR if true
1168  */
1169 template<typename BloomT>
1170 static inline void
outputGraph(int argc,char ** argv,const BloomT & kmerSet,const AssemblyParams & params,std::ostream & out)1171 outputGraph(
1172     int argc,
1173     char** argv,
1174     const BloomT& kmerSet,
1175     const AssemblyParams& params,
1176     std::ostream& out)
1177 {
1178 	assert(params.initialized());
1179 
1180 	typedef RollingBloomDBG<BloomT> GraphT;
1181 
1182 	/* interval for progress messages */
1183 	const unsigned progressStep = 1000;
1184 	const unsigned k = kmerSet.getKmerSize();
1185 	const unsigned numHashes = kmerSet.getHashNum();
1186 
1187 	/* counter for progress messages */
1188 	size_t readsProcessed = 0;
1189 
1190 	/* Boost graph API over rolling hash Bloom filter */
1191 	GraphT dbg(kmerSet);
1192 
1193 	/* Marks visited nodes in breadth-first traversal */
1194 	DefaultColorMap<GraphT> colorMap;
1195 
1196 	/* BFS Visitor -- generates GraphViz output as nodes
1197 	 * and edges are traversed. */
1198 	GraphvizBFSVisitor<GraphT> visitor(out);
1199 
1200 	if (params.verbose)
1201 		std::cerr << "Generating GraphViz output..." << std::endl;
1202 
1203 	FastaConcat in(argv, argv + argc, FastaReader::FOLD_CASE);
1204 	for (FastaRecord rec;;) {
1205 		bool good;
1206 		good = in >> rec;
1207 		if (!good)
1208 			break;
1209 		Sequence& seq = rec.seq;
1210 
1211 		/* Trim down to longest subsequence of "good" k-mers */
1212 		trimSeq(seq, kmerSet);
1213 		if (seq.length() > 0) {
1214 
1215 			/* BFS traversal in forward dir */
1216 			std::string startKmer = seq.substr(0, k);
1217 			Vertex start(startKmer.c_str(), RollingHash(startKmer, numHashes, k));
1218 			breadthFirstSearch(start, dbg, colorMap, visitor);
1219 
1220 			/* BFS traversal in reverse dir */
1221 			Sequence rcSeq = reverseComplement(seq);
1222 			std::string rcStartKmer = rcSeq.substr(0, k);
1223 			Vertex rcStart(rcStartKmer.c_str(), RollingHash(rcStartKmer, numHashes, k));
1224 			breadthFirstSearch(rcStart, dbg, colorMap, visitor);
1225 		}
1226 
1227 		if (++readsProcessed % progressStep == 0 && params.verbose) {
1228 			std::cerr << "processed " << readsProcessed
1229 			          << " (k-mers visited: " << visitor.getNumNodesVisited()
1230 			          << ", edges visited: " << visitor.getNumEdgesVisited() << ")" << std::endl;
1231 		}
1232 	}
1233 	assert(in.eof());
1234 	if (params.verbose) {
1235 		std::cerr << "processed " << readsProcessed
1236 		          << " reads (k-mers visited: " << visitor.getNumNodesVisited()
1237 		          << ", edges visited: " << visitor.getNumEdgesVisited() << ")" << std::endl;
1238 		std::cerr << "GraphViz generation complete" << std::endl;
1239 	}
1240 }
1241 
1242 /**
1243  * Write a single block of a 'variableStep' WIG file.
1244  *
1245  * @param chr chromosome name
1246  * @param start start coordinate of block
1247  * @param length length of block
1248  * @param val value of block
1249  * @param out output stream for WIG file
1250  * @param outPath path for output WIG file
1251  */
1252 static inline void
outputWigBlock(const std::string & chr,size_t start,size_t length,unsigned val,ostream & out,const std::string & outPath)1253 outputWigBlock(
1254     const std::string& chr,
1255     size_t start,
1256     size_t length,
1257     unsigned val,
1258     ostream& out,
1259     const std::string& outPath)
1260 {
1261 	assert(length > 0);
1262 	out << "variableStep chrom=" << chr << " span=" << length << "\n";
1263 	out << start << ' ' << val << '\n';
1264 	assert_good(out, outPath);
1265 }
1266 
1267 /**
1268  * Write a WIG file for a reference genome, using the values 0 and 1
1269  * to indicate whether or not a given k-mer had sufficient coverage
1270  * in the reads to exceed the minimum coverage threshold.
1271  *
1272  * @param goodKmerSet Bloom filter of k-mers that exceed the
1273  * minimum coverage threshold
1274  * @param params encapsulates all command line options for the
1275  * assembly, including the reference genome and the output path
1276  * for the WIG file.
1277  */
1278 template<class BloomT>
1279 static inline void
writeCovTrack(const BloomT & goodKmerSet,const AssemblyParams & params)1280 writeCovTrack(const BloomT& goodKmerSet, const AssemblyParams& params)
1281 {
1282 	assert(!params.covTrackPath.empty());
1283 	assert(!params.refPath.empty());
1284 
1285 	const unsigned k = goodKmerSet.getKmerSize();
1286 	const unsigned numHashes = goodKmerSet.getHashNum();
1287 
1288 	std::ofstream covTrack(params.covTrackPath.c_str());
1289 	assert_good(covTrack, params.covTrackPath);
1290 
1291 	if (params.verbose)
1292 		std::cerr << "Writing 0/1 k-mer coverage track for `" << params.refPath << "` to `"
1293 		          << params.covTrackPath << "`" << std::endl;
1294 
1295 	FastaReader ref(params.refPath.c_str(), FastaReader::FOLD_CASE);
1296 	for (FastaRecord rec; ref >> rec;) {
1297 		std::string chr = rec.id;
1298 		bool firstVal = true;
1299 		size_t blockStart = 1;
1300 		size_t blockLength = 0;
1301 		uint8_t blockVal = 0;
1302 		for (RollingHashIterator it(rec.seq, numHashes, k); it != RollingHashIterator::end();
1303 		     ++it) {
1304 			uint8_t val = goodKmerSet.contains(*it) ? 1 : 0;
1305 			if (firstVal) {
1306 				firstVal = false;
1307 				/* WIG standard uses 1-based coords */
1308 				blockStart = it.pos() + 1;
1309 				blockLength = 1;
1310 				blockVal = val;
1311 			} else if (val != blockVal) {
1312 				assert(firstVal == false);
1313 				outputWigBlock(
1314 				    chr, blockStart, blockLength, blockVal, covTrack, params.covTrackPath);
1315 				/* WIG standard uses 1-based coords */
1316 				blockStart = it.pos() + 1;
1317 				blockLength = 1;
1318 				blockVal = val;
1319 			} else {
1320 				blockLength++;
1321 			}
1322 		}
1323 		/* output last block */
1324 		if (blockLength > 0) {
1325 			outputWigBlock(chr, blockStart, blockLength, blockVal, covTrack, params.covTrackPath);
1326 		}
1327 	}
1328 	assert(ref.eof());
1329 
1330 	assert_good(covTrack, params.covTrackPath);
1331 	covTrack.close();
1332 }
1333 
1334 } /* BloomDBG namespace */
1335 
1336 #endif
1337