1 #include "Common/UnorderedMap.h"
2 #include "ContigNode.h"
3 #include "ContigPath.h"
4 #include "ContigProperties.h"
5 #include "DataBase/DB.h"
6 #include "DataBase/Options.h"
7 #include "Estimate.h"
8 #include "Graph/Assemble.h"
9 #include "Graph/ContigGraph.h"
10 #include "Graph/ContigGraphAlgorithms.h"
11 #include "Graph/DirectedGraph.h"
12 #include "Graph/GraphAlgorithms.h"
13 #include "Graph/GraphIO.h"
14 #include "Graph/GraphUtil.h"
15 #include "Graph/PopBubbles.h"
16 #include "IOUtil.h"
17 #include "Iterator.h"
18 #include "Uncompress.h"
19 #include "config.h"
20 #include <cassert>
21 #include <climits>
22 #include <cmath>
23 #include <cstdlib>
24 #include <fstream>
25 #include <functional>
26 #include <getopt.h>
27 #include <iostream>
28 #include <utility>
29 
30 using namespace std;
31 using namespace std::rel_ops;
32 using boost::edge_bundle_type;
33 using boost::tie;
34 
35 #define PROGRAM "abyss-scaffold"
36 
37 DB db;
38 
39 static const char VERSION_MESSAGE[] =
40     PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
41             "Written by Shaun Jackman.\n"
42             "\n"
43             "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
44 
45 static const char USAGE_MESSAGE[] =
46     "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
47     "Scaffold contigs using the distance estimate graph.\n"
48     "\n"
49     " Arguments:\n"
50     "\n"
51     "  FASTA    contigs in FASTA format\n"
52     "  OVERLAP  the contig overlap graph\n"
53     "  DIST     estimates of the distance between contigs\n"
54     "\n"
55     " Options:\n"
56     "\n"
57     "  -n, --npairs=N        minimum number of pairs [0]\n"
58     "      or -n A-B:S       Find the value of n in [A,B] with step size S\n"
59     "                        that maximizes the scaffold N50.\n"
60     "                        Default value for the step size is 1, if unspecified.\n"
61     "  -s, --seed-length=N   minimum contig length [1000]\n"
62     "      or -s A-B         Find the value of s in [A,B]\n"
63     "                        that maximizes the scaffold N50.\n"
64     "      --grid            optimize using a grid search [default]\n"
65     "      --line            optimize using a line search\n"
66     "  -k, --kmer=N          length of a k-mer\n"
67     "  -G, --genome-size=N   expected genome size. Used to calculate NG50\n"
68     "                        and associated stats [disabled]\n"
69     "      --min-gap=N       minimum scaffold gap length to output [50]\n"
70     "      --max-gap=N       maximum scaffold gap length to output [inf]\n"
71     "      --complex         remove complex transitive edges\n"
72     "      --no-complex      don't remove complex transitive edges [default]\n"
73     "      --SS              expect contigs to be oriented correctly\n"
74     "      --no-SS           no assumption about contig orientation [default]\n"
75     "  -o, --out=FILE        write the paths to FILE\n"
76     "  -g, --graph=FILE      write the graph to FILE\n"
77     "  -v, --verbose         display verbose output\n"
78     "      --help            display this help and exit\n"
79     "      --version         output version information and exit\n"
80     "      --db=FILE         specify path of database repository in FILE\n"
81     "      --library=NAME    specify library NAME for sqlite\n"
82     "      --strain=NAME     specify strain NAME for sqlite\n"
83     "      --species=NAME    specify species NAME for sqlite\n"
84     "\n"
85     "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
86 
87 namespace opt {
88 string db;
89 dbVars metaVars;
90 
91 unsigned k; // used by ContigProperties
92 
93 /** Optimization search strategy. */
94 static int searchStrategy;
95 
96 /** Minimum number of pairs. */
97 static unsigned minEdgeWeight;
98 static unsigned minEdgeWeightEnd;
99 static unsigned minEdgeWeightStep;
100 
101 /** Minimum contig length. */
102 static unsigned minContigLength = 1000;
103 static unsigned minContigLengthEnd = 1000;
104 
105 /** Genome size. Used to calculate NG50. */
106 static long long unsigned genomeSize;
107 
108 /** Minimum scaffold gap length to output. */
109 static int minGap = 50;
110 
111 /** Maximum scaffold gap length to output.
112  * -ve value means no maximum. */
113 static int maxGap = -1;
114 
115 /** Write the paths to this file. */
116 static string out;
117 
118 /** Write the graph to this file. */
119 static string graphPath;
120 
121 /** Run a strand-specific RNA-Seq assembly. */
122 static int ss;
123 
124 /** Verbose output. */
125 int verbose; // used by PopBubbles
126 
127 /** Output format */
128 int format = DOT; // used by DistanceEst
129 
130 /** Remove complex transitive edges */
131 static int comp_trans;
132 }
133 
134 static const char shortopts[] = "G:g:k:n:o:s:v";
135 
136 enum
137 {
138 	OPT_HELP = 1,
139 	OPT_VERSION,
140 	OPT_MIN_GAP,
141 	OPT_MAX_GAP,
142 	OPT_COMP,
143 	OPT_DB,
144 	OPT_LIBRARY,
145 	OPT_STRAIN,
146 	OPT_SPECIES
147 };
148 
149 /** Optimization search strategy. */
150 enum
151 {
152 	GRID_SEARCH,
153 	LINE_SEARCH
154 };
155 
156 static const struct option longopts[] = {
157 	{ "graph", no_argument, NULL, 'g' },
158 	{ "kmer", required_argument, NULL, 'k' },
159 	{ "genome-size", required_argument, NULL, 'G' },
160 	{ "min-gap", required_argument, NULL, OPT_MIN_GAP },
161 	{ "max-gap", required_argument, NULL, OPT_MAX_GAP },
162 	{ "npairs", required_argument, NULL, 'n' },
163 	{ "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
164 	{ "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
165 	{ "out", required_argument, NULL, 'o' },
166 	{ "seed-length", required_argument, NULL, 's' },
167 	{ "complex", no_argument, &opt::comp_trans, 1 },
168 	{ "no-complex", no_argument, &opt::comp_trans, 0 },
169 	{ "SS", no_argument, &opt::ss, 1 },
170 	{ "no-SS", no_argument, &opt::ss, 0 },
171 	{ "verbose", no_argument, NULL, 'v' },
172 	{ "help", no_argument, NULL, OPT_HELP },
173 	{ "version", no_argument, NULL, OPT_VERSION },
174 	{ "db", required_argument, NULL, OPT_DB },
175 	{ "library", required_argument, NULL, OPT_LIBRARY },
176 	{ "strain", required_argument, NULL, OPT_STRAIN },
177 	{ "species", required_argument, NULL, OPT_SPECIES },
178 	{ NULL, 0, NULL, 0 }
179 };
180 
181 /** A distance estimate graph. */
182 typedef DirectedGraph<Length, DistanceEst> DG;
183 typedef ContigGraph<DG> Graph;
184 
185 /** Return whether this edge is invalid.
186  * An edge is invalid when the overlap is larger than the length of
187  * either of its incident sequences.
188  */
189 struct InvalidEdge
190 {
InvalidEdgeInvalidEdge191 	InvalidEdge(Graph& g)
192 	  : m_g(g)
193 	{}
operator ()InvalidEdge194 	bool operator()(graph_traits<Graph>::edge_descriptor e) const
195 	{
196 		int d = m_g[e].distance;
197 		int ulength = m_g[source(e, m_g)].length;
198 		int vlength = m_g[target(e, m_g)].length;
199 		return d + ulength <= 0 || d + vlength <= 0;
200 	}
201 	const Graph& m_g;
202 };
203 
204 /** Return whether the specified edges has sufficient support. */
205 struct PoorSupport
206 {
PoorSupportPoorSupport207 	PoorSupport(Graph& g, unsigned minEdgeWeight)
208 	  : m_g(g)
209 	  , m_minEdgeWeight(minEdgeWeight)
210 	{}
operator ()PoorSupport211 	bool operator()(graph_traits<Graph>::edge_descriptor e) const
212 	{
213 		return m_g[e].numPairs < m_minEdgeWeight;
214 	}
215 	const Graph& m_g;
216 	unsigned m_minEdgeWeight;
217 };
218 
219 /** Remove short vertices and unsupported edges from the graph. */
220 static void
filterGraph(Graph & g,unsigned minEdgeWeight,unsigned minContigLength)221 filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
222 {
223 	typedef graph_traits<Graph> GTraits;
224 	typedef GTraits::vertex_descriptor V;
225 	typedef GTraits::vertex_iterator Vit;
226 
227 	// Remove short contigs.
228 	unsigned numRemovedV = 0;
229 	std::pair<Vit, Vit> urange = vertices(g);
230 	for (Vit uit = urange.first; uit != urange.second; ++uit) {
231 		V u = *uit;
232 		if (g[u].length < minContigLength)
233 			clear_vertex(u, g);
234 		if (out_degree(u, g) == 0 && in_degree(u, g) == 0) {
235 			remove_vertex(u, g);
236 			numRemovedV++;
237 		}
238 	}
239 	if (opt::verbose > 0)
240 		cerr << "Removed " << numRemovedV << " vertices.\n";
241 
242 	// Remove poorly-supported edges.
243 	unsigned numBefore = num_edges(g);
244 	remove_edge_if(PoorSupport(g, minEdgeWeight), static_cast<DG&>(g));
245 	unsigned numRemovedE = numBefore - num_edges(g);
246 	if (opt::verbose > 0)
247 		cerr << "Removed " << numRemovedE << " edges.\n";
248 	if (!opt::db.empty()) {
249 		addToDb(db, "V_removed", numRemovedV);
250 		addToDb(db, "E_removed", numRemovedE);
251 	}
252 }
253 
254 /** Return true if the specified edge is a cycle. */
255 static bool
isCycle(Graph & g,graph_traits<Graph>::edge_descriptor e)256 isCycle(Graph& g, graph_traits<Graph>::edge_descriptor e)
257 {
258 	return edge(target(e, g), source(e, g), g).second;
259 }
260 
261 /** Remove simple cycles of length two from the graph. */
262 static void
removeCycles(Graph & g)263 removeCycles(Graph& g)
264 {
265 	typedef graph_traits<Graph>::edge_descriptor E;
266 	typedef graph_traits<Graph>::edge_iterator Eit;
267 
268 	// Identify the cycles.
269 	vector<E> cycles;
270 	Eit eit, elast;
271 	for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
272 		E e = *eit;
273 		if (isCycle(g, e))
274 			cycles.push_back(e);
275 	}
276 
277 	/** Remove the cycles. */
278 	remove_edges(g, cycles.begin(), cycles.end());
279 	if (opt::verbose > 0) {
280 		cerr << "Removed " << cycles.size() << " cyclic edges.\n";
281 		printGraphStats(cerr, g);
282 	}
283 
284 	if (!opt::db.empty())
285 		addToDb(db, "E_removed_cyclic", cycles.size());
286 }
287 
288 /** Find edges in g0 that resolve forks in g.
289  * For a pair of edges (u,v1) and (u,v2) in g, if exactly one of the
290  * edges (v1,v2) or (v2,v1) exists in g0, add that edge to g.
291  */
292 static void
resolveForks(Graph & g,const Graph & g0)293 resolveForks(Graph& g, const Graph& g0)
294 {
295 	typedef graph_traits<Graph>::adjacency_iterator Vit;
296 	typedef graph_traits<Graph>::edge_descriptor E;
297 	typedef graph_traits<Graph>::vertex_iterator Uit;
298 	typedef graph_traits<Graph>::vertex_descriptor V;
299 
300 	unsigned numEdges = 0;
301 	pair<Uit, Uit> urange = vertices(g);
302 	for (Uit uit = urange.first; uit != urange.second; ++uit) {
303 		V u = *uit;
304 		if (out_degree(u, g) < 2)
305 			continue;
306 		pair<Vit, Vit> vrange = adjacent_vertices(u, g);
307 		for (Vit vit1 = vrange.first; vit1 != vrange.second;) {
308 			V v1 = *vit1;
309 			++vit1;
310 			assert(v1 != u);
311 			for (Vit vit2 = vit1; vit2 != vrange.second; ++vit2) {
312 				V v2 = *vit2;
313 				assert(v2 != u);
314 				assert(v1 != v2);
315 				if (edge(v1, v2, g).second || edge(v2, v1, g).second)
316 					continue;
317 				pair<E, bool> e12 = edge(v1, v2, g0);
318 				pair<E, bool> e21 = edge(v2, v1, g0);
319 				if (e12.second && e21.second) {
320 					if (opt::verbose > 1)
321 						cerr << "cycle: " << get(vertex_name, g, v1) << ' '
322 						     << get(vertex_name, g, v2) << '\n';
323 				} else if (e12.second || e21.second) {
324 					E e = e12.second ? e12.first : e21.first;
325 					V v = source(e, g0), w = target(e, g0);
326 					add_edge(v, w, g0[e], g);
327 					numEdges++;
328 					if (opt::verbose > 1)
329 						cerr << get(vertex_name, g, u) << " -> " << get(vertex_name, g, v) << " -> "
330 						     << get(vertex_name, g, w) << " [" << g0[e] << "]\n";
331 				}
332 			}
333 		}
334 	}
335 	if (opt::verbose > 0)
336 		cerr << "Added " << numEdges << " edges to ambiguous vertices.\n";
337 	if (!opt::db.empty())
338 		addToDb(db, "E_added_ambig", numEdges);
339 }
340 
341 /** Remove tips.
342  * For an edge (u,v), remove the vertex v if deg+(u) > 1
343  * and deg-(v) = 1 and deg+(v) = 0.
344  */
345 static void
pruneTips(Graph & g)346 pruneTips(Graph& g)
347 {
348 	/** Identify the tips. */
349 	size_t n = 0;
350 	pruneTips(g, CountingOutputIterator(n));
351 
352 	if (opt::verbose > 0) {
353 		cerr << "Removed " << n << " tips.\n";
354 		printGraphStats(cerr, g);
355 	}
356 
357 	if (!opt::db.empty())
358 		addToDb(db, "Tips_removed", n);
359 }
360 
361 /** Remove repetitive vertices from this graph.
362  * input: digraph g { t1->v1 t2->v2 t1->u t2->u u->v1 u->v2 }
363  * operation: remove vertex u
364  * output: digraph g { t1->v1 t2->v2 }
365  */
366 static void
removeRepeats(Graph & g)367 removeRepeats(Graph& g)
368 {
369 	typedef graph_traits<Graph>::adjacency_iterator Ait;
370 	typedef graph_traits<Graph>::edge_descriptor E;
371 	typedef graph_traits<Graph>::vertex_descriptor V;
372 
373 	vector<V> repeats;
374 	vector<E> transitive;
375 	find_transitive_edges(g, back_inserter(transitive));
376 	for (vector<E>::const_iterator it = transitive.begin(); it != transitive.end(); ++it) {
377 		// Iterate through the transitive edges, u->w1.
378 		V u = source(*it, g), w1 = target(*it, g);
379 		Ait vit, vlast;
380 		for (tie(vit, vlast) = adjacent_vertices(u, g); vit != vlast; ++vit) {
381 			V v = *vit;
382 			assert(u != v); // no self loops
383 			if (!edge(v, w1, g).second)
384 				continue;
385 			// u->w1 is a transitive edge spanning u->v->w1.
386 			Ait wit, wlast;
387 			for (tie(wit, wlast) = adjacent_vertices(v, g); wit != wlast; ++wit) {
388 				// For each edge v->w2, check that an edge
389 				// w1->w2 or w2->w1 exists. If not, v is a repeat.
390 				V w2 = *wit;
391 				assert(v != w2); // no self loops
392 				if (w1 != w2 && !edge(w1, w2, g).second && !edge(w2, w1, g).second) {
393 					repeats.push_back(v);
394 					break;
395 				}
396 			}
397 		}
398 	}
399 
400 	sort(repeats.begin(), repeats.end());
401 	repeats.erase(unique(repeats.begin(), repeats.end()), repeats.end());
402 	if (opt::verbose > 1) {
403 		cerr << "Ambiguous:";
404 		for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
405 			cerr << ' ' << get(vertex_name, g, *it);
406 		cerr << '\n';
407 	}
408 
409 	// Remove the repetitive vertices.
410 	unsigned numRemoved = 0;
411 	for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) {
412 		V u = *it;
413 		V uc = get(vertex_complement, g, u);
414 		clear_out_edges(u, g);
415 		if (it != repeats.begin() && it[-1] == uc) {
416 			remove_vertex(u, g);
417 			numRemoved++;
418 		}
419 	}
420 
421 	if (opt::verbose > 0) {
422 		cerr << "Cleared " << repeats.size() << " ambiguous vertices.\n"
423 		     << "Removed " << numRemoved << " ambiguous vertices.\n";
424 		printGraphStats(cerr, g);
425 	}
426 	if (!opt::db.empty()) {
427 		addToDb(db, "V_cleared_ambg", repeats.size());
428 		addToDb(db, "V_removed_ambg", numRemoved);
429 	}
430 }
431 
432 /** Remove weak edges from this graph.
433  * input: digraph g { u1->v2 u1->v1 u2->v2 }
434  *        (u1,v2).n < (u1,v1).n and (u1,v2).n < (u2,v2).n
435  * operation: remove edge u1->v2
436  * output: digraph g {u1->v1 u2->v2 }
437  */
438 static void
removeWeakEdges(Graph & g)439 removeWeakEdges(Graph& g)
440 {
441 	typedef graph_traits<Graph>::edge_descriptor E;
442 	typedef graph_traits<Graph>::edge_iterator Eit;
443 	typedef graph_traits<Graph>::in_edge_iterator Iit;
444 	typedef graph_traits<Graph>::out_edge_iterator Oit;
445 	typedef graph_traits<Graph>::vertex_descriptor V;
446 
447 	vector<E> weak;
448 	Eit eit, elast;
449 	for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
450 		E u1v2 = *eit;
451 		V u1 = source(u1v2, g), v2 = target(u1v2, g);
452 		if (out_degree(u1, g) != 2 || in_degree(v2, g) != 2)
453 			continue;
454 
455 		Oit oit, olast;
456 		tie(oit, olast) = out_edges(u1, g);
457 		E u1v1;
458 		if (target(*oit, g) == v2) {
459 			++oit;
460 			u1v1 = *oit;
461 		} else {
462 			u1v1 = *oit;
463 			++oit;
464 		}
465 		assert(++oit == olast);
466 		V v1 = target(u1v1, g);
467 		assert(v1 != v2);
468 		if (in_degree(v1, g) != 1)
469 			continue;
470 
471 		Iit iit, ilast;
472 		tie(iit, ilast) = in_edges(v2, g);
473 		E u2v2;
474 		if (source(*iit, g) == u1) {
475 			++iit;
476 			assert(iit != ilast);
477 			u2v2 = *iit;
478 		} else {
479 			assert(iit != ilast);
480 			u2v2 = *iit;
481 			++iit;
482 		}
483 		assert(++iit == ilast);
484 		V u2 = source(u2v2, g);
485 		assert(u1 != u2);
486 		if (out_degree(u2, g) != 1)
487 			continue;
488 
489 		unsigned n = g[u1v2].numPairs;
490 		if (n < g[u1v1].numPairs && n < g[u2v2].numPairs)
491 			weak.push_back(u1v2);
492 	}
493 
494 	if (opt::verbose > 1) {
495 		cerr << "Weak edges:\n";
496 		for (vector<E>::const_iterator it = weak.begin(); it != weak.end(); ++it) {
497 			E e = *it;
498 			cerr << '\t' << get(edge_name, g, e) << " [" << g[e] << "]\n";
499 		}
500 	}
501 
502 	/** Remove the weak edges. */
503 	remove_edges(g, weak.begin(), weak.end());
504 	if (opt::verbose > 0) {
505 		cerr << "Removed " << weak.size() << " weak edges.\n";
506 		printGraphStats(cerr, g);
507 	}
508 	if (!opt::db.empty())
509 		addToDb(db, "E_removed_weak", weak.size());
510 }
511 
512 static void
removeLongEdges(Graph & g)513 removeLongEdges(Graph& g)
514 {
515 	typedef graph_traits<Graph>::edge_descriptor E;
516 	typedef graph_traits<Graph>::edge_iterator Eit;
517 
518 	vector<E> long_e;
519 	Eit eit, elast;
520 	for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
521 		E e = *eit;
522 		if (g[e].distance > opt::maxGap)
523 			long_e.push_back(e);
524 	}
525 	remove_edges(g, long_e.begin(), long_e.end());
526 }
527 
528 /** Return whether the specified distance estimate is an exact
529  * overlap.
530  */
531 static bool
isOverlap(const DistanceEst & d)532 isOverlap(const DistanceEst& d)
533 {
534 	if (d.stdDev == 0) {
535 		assert(d.distance < 0);
536 		return true;
537 	} else
538 		return false;
539 }
540 
541 /** Add distance estimates to a path.
542  * @param g0 the original graph
543  * @param g1 the transformed graph
544  */
545 static ContigPath
addDistEst(const Graph & g0,const Graph & g1,const ContigPath & path)546 addDistEst(const Graph& g0, const Graph& g1, const ContigPath& path)
547 {
548 	typedef graph_traits<Graph>::edge_descriptor E;
549 	typedef edge_bundle_type<Graph>::type EP;
550 
551 	ContigPath out;
552 	out.reserve(2 * path.size());
553 	ContigNode u = path.front();
554 	out.push_back(u);
555 	for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) {
556 		ContigNode v = *it;
557 		assert(!v.ambiguous());
558 		pair<E, bool> e0 = edge(u, v, g0);
559 		pair<E, bool> e1 = edge(u, v, g1);
560 		if (!e0.second && !e1.second)
561 			std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> "
562 			          << get(vertex_name, g0, v) << '\n';
563 		assert(e0.second || e1.second);
564 		const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
565 		if (!isOverlap(ep)) {
566 			int distance = max(ep.distance, (int)opt::minGap);
567 			int numN = distance + opt::k - 1; // by convention
568 			assert(numN >= 0);
569 			numN = max(numN, 1);
570 			out.push_back(ContigNode(numN, 'N'));
571 		}
572 		out.push_back(v);
573 		u = v;
574 	}
575 	return out;
576 }
577 
578 /** Read a graph from the specified file. */
579 static void
readGraph(const string & path,Graph & g)580 readGraph(const string& path, Graph& g)
581 {
582 	if (opt::verbose > 0)
583 		cerr << "Reading `" << path << "'...\n";
584 	ifstream fin(path.c_str());
585 	istream& in = path == "-" ? cin : fin;
586 	assert_good(in, path);
587 	read_graph(in, g, BetterDistanceEst());
588 	assert(in.eof());
589 	if (opt::verbose > 0)
590 		printGraphStats(cerr, g);
591 
592 	vector<int> vals = passGraphStatsVal(g);
593 	vector<string> keys = make_vector<string>() << "V_readGraph"
594 	                                            << "E_readGraph"
595 	                                            << "degree0_readGraph"
596 	                                            << "degree1_readGraph"
597 	                                            << "degree234_readGraph"
598 	                                            << "degree5_readGraph"
599 	                                            << "max_readGraph";
600 
601 	if (!opt::db.empty()) {
602 		for (unsigned i = 0; i < vals.size(); i++)
603 			addToDb(db, keys[i], vals[i]);
604 	}
605 	g_contigNames.lock();
606 }
607 
608 /** Return the scaffold length of [first, last), not counting gaps. */
609 template<typename It>
610 unsigned
addLength(const Graph & g,It first,It last)611 addLength(const Graph& g, It first, It last)
612 {
613 	typedef typename graph_traits<Graph>::vertex_descriptor V;
614 	assert(first != last);
615 	unsigned length = g[*first].length;
616 	for (It it = first + 1; it != last; ++it) {
617 		V u = *(it - 1);
618 		V v = *it;
619 		length += min(0, get(edge_bundle, g, u, v).distance);
620 		length += g[v].length;
621 	}
622 	return length;
623 }
624 
625 /** A container of contig paths. */
626 typedef vector<ContigPath> ContigPaths;
627 
628 /**
629  * Build the scaffold length histogram.
630  * @param g The graph g is destroyed.
631  */
632 static Histogram
buildScaffoldLengthHistogram(Graph & g,const ContigPaths & paths)633 buildScaffoldLengthHistogram(Graph& g, const ContigPaths& paths)
634 {
635 	Histogram h;
636 
637 	// Clear the removed flag.
638 	typedef graph_traits<Graph>::vertex_iterator Vit;
639 	Vit uit, ulast;
640 	for (tie(uit, ulast) = vertices(g); uit != ulast; ++uit)
641 		put(vertex_removed, g, *uit, false);
642 
643 	// Remove the vertices that are used in paths
644 	// and add the lengths of the scaffolds.
645 	for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
646 		h.insert(addLength(g, it->begin(), it->end()));
647 		remove_vertex_if(
648 		    g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); });
649 	}
650 
651 	// Add the contigs that were not used in paths.
652 	for (tie(uit, ulast) = vertices(g); uit != ulast; ++++uit) {
653 		typedef graph_traits<Graph>::vertex_descriptor V;
654 		V u = *uit;
655 		if (!get(vertex_removed, g, u))
656 			h.insert(g[u].length);
657 	}
658 
659 	return h;
660 }
661 
662 /** Add contiguity stats to database */
663 static void
addCntgStatsToDb(const Histogram h,const unsigned min)664 addCntgStatsToDb(const Histogram h, const unsigned min)
665 {
666 	vector<int> vals = passContiguityStatsVal(h, min);
667 	vector<string> keys = make_vector<string>() << "n"
668 	                                            << "n200"
669 	                                            << "nN50"
670 	                                            << "min"
671 	                                            << "N75"
672 	                                            << "N50"
673 	                                            << "N25"
674 	                                            << "Esize"
675 	                                            << "max"
676 	                                            << "sum"
677 	                                            << "nNG50"
678 	                                            << "NG50";
679 	if (!opt::db.empty()) {
680 		for (unsigned i = 0; i < vals.size(); i++)
681 			addToDb(db, keys[i], vals[i]);
682 	}
683 }
684 
685 /** Parameters of scaffolding. */
686 struct ScaffoldParam
687 {
ScaffoldParamScaffoldParam688 	ScaffoldParam(unsigned n, unsigned s)
689 	  : n(n)
690 	  , s(s)
691 	{}
operator ==ScaffoldParam692 	bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
693 	unsigned n;
694 	unsigned s;
695 };
696 
697 NAMESPACE_STD_HASH_BEGIN
698 template<>
699 struct hash<ScaffoldParam>
700 {
operator ()hash701 	size_t operator()(const ScaffoldParam& param) const
702 	{
703 		return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
704 	}
705 };
706 NAMESPACE_STD_HASH_END
707 
708 /** Result of scaffolding. */
709 struct ScaffoldResult : ScaffoldParam
710 {
ScaffoldResultScaffoldResult711 	ScaffoldResult()
712 	  : ScaffoldParam(0, 0)
713 	  , n50(0)
714 	{}
ScaffoldResultScaffoldResult715 	ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
716 	  : ScaffoldParam(n, s)
717 	  , n50(n50)
718 	  , metrics(metrics)
719 	{}
720 	unsigned n50;
721 	std::string metrics;
722 };
723 
724 /** Build scaffold paths.
725  * @param output write the results
726  * @return the scaffold N50
727  */
728 ScaffoldResult
scaffold(const Graph & g0,unsigned minEdgeWeight,unsigned minContigLength,bool output)729 scaffold(const Graph& g0, unsigned minEdgeWeight, unsigned minContigLength, bool output)
730 {
731 	Graph g(g0);
732 
733 	// Filter the graph.
734 	filterGraph(g, minEdgeWeight, minContigLength);
735 	if (opt::verbose > 0)
736 		printGraphStats(cerr, g);
737 
738 	// Remove cycles.
739 	removeCycles(g);
740 
741 	// Resolve forks.
742 	resolveForks(g, g0);
743 
744 	// Prune tips.
745 	pruneTips(g);
746 
747 	// Remove repeats.
748 	removeRepeats(g);
749 
750 	// Remove transitive edges.
751 	unsigned numTransitive;
752 	if (opt::comp_trans)
753 		numTransitive = remove_complex_transitive_edges(g);
754 	else
755 		numTransitive = remove_transitive_edges(g);
756 
757 	if (opt::verbose > 0) {
758 		cerr << "Removed " << numTransitive << " transitive edges.\n";
759 		printGraphStats(cerr, g);
760 	}
761 
762 	if (!opt::db.empty())
763 		addToDb(db, "Edges_transitive", numTransitive);
764 
765 	// Prune tips.
766 	pruneTips(g);
767 
768 	// Pop bubbles.
769 	typedef graph_traits<Graph>::vertex_descriptor V;
770 	vector<V> popped = popBubbles(g);
771 	if (opt::verbose > 0) {
772 		cerr << "Removed " << popped.size() << " vertices in bubbles.\n";
773 		printGraphStats(cerr, g);
774 	}
775 
776 	if (!opt::db.empty())
777 		addToDb(db, "Vertices_bubblePopped", popped.size());
778 
779 	if (opt::verbose > 1) {
780 		cerr << "Popped:";
781 		for (vector<V>::const_iterator it = popped.begin(); it != popped.end(); ++it)
782 			cerr << ' ' << get(vertex_name, g, *it);
783 		cerr << '\n';
784 	}
785 
786 	// Remove weak edges.
787 	removeWeakEdges(g);
788 
789 	// Remove any edges longer than opt::maxGap.
790 	if (opt::maxGap >= 0)
791 		removeLongEdges(g);
792 
793 	// Assemble the paths.
794 	ContigPaths paths;
795 	assembleDFS(g, back_inserter(paths), opt::ss);
796 	sort(paths.begin(), paths.end());
797 	unsigned n = 0;
798 	if (opt::verbose > 0) {
799 		for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
800 			n += it->size();
801 		cerr << "Assembled " << n << " contigs in " << paths.size() << " scaffolds.\n";
802 		printGraphStats(cerr, g);
803 	}
804 
805 	if (!opt::db.empty()) {
806 		addToDb(db, "contigs_assembled", n);
807 		addToDb(db, "scaffolds_assembled", paths.size());
808 	}
809 
810 	if (output) {
811 		// Output the paths.
812 		ofstream fout(opt::out.c_str());
813 		ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
814 		assert_good(out, opt::out);
815 		g_contigNames.unlock();
816 		for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
817 			out << createContigName() << '\t' << addDistEst(g0, g, *it) << '\n';
818 		assert_good(out, opt::out);
819 
820 		// Output the graph.
821 		if (!opt::graphPath.empty()) {
822 			ofstream out(opt::graphPath.c_str());
823 			assert_good(out, opt::graphPath);
824 			write_dot(out, g);
825 			assert_good(out, opt::graphPath);
826 		}
827 	}
828 
829 	// Compute contiguity metrics.
830 	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
831 	std::ostringstream ss;
832 	Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
833 	printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, false, "\t", opt::genomeSize)
834 	    << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
835 	std::string metrics = ss.str();
836 	addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
837 
838 	return ScaffoldResult(
839 	    minEdgeWeight,
840 	    minContigLength,
841 	    scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
842 	    metrics);
843 }
844 
845 /** Memoize the optimization results so far. */
846 typedef unordered_map<ScaffoldParam, ScaffoldResult> ScaffoldMemo;
847 
848 /** Build scaffold paths, memoized. */
849 ScaffoldResult
scaffold_memoized(const Graph & g,unsigned n,unsigned s,ScaffoldMemo & memo)850 scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo)
851 {
852 	ScaffoldParam param(n, s);
853 	ScaffoldMemo::const_iterator it = memo.find(param);
854 	if (it != memo.end()) {
855 		// Clear the metrics string, so that this result is not listed
856 		// multiple times in the final table of metrics.
857 		ScaffoldResult result(it->second);
858 		result.metrics.clear();
859 		return result;
860 	}
861 
862 	if (opt::verbose > 0)
863 		std::cerr << "\nScaffolding with n=" << n << " s=" << s << "\n\n";
864 	ScaffoldResult result = scaffold(g, n, s, false);
865 	memo[param] = result;
866 
867 	// Print assembly metrics.
868 	if (opt::verbose > 0) {
869 		std::cerr << '\n';
870 		const unsigned STATS_MIN_LENGTH = opt::minContigLength;
871 		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
872 	}
873 	std::cerr << result.metrics;
874 	if (opt::verbose > 0)
875 		cerr << '\n';
876 	return result;
877 }
878 
879 /** Find the value of n that maximizes the scaffold N50. */
880 static ScaffoldResult
optimize_n(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,unsigned minContigLength,ScaffoldMemo & memo)881 optimize_n(
882     const Graph& g,
883     std::pair<unsigned, unsigned> minEdgeWeight,
884     unsigned minContigLength,
885     ScaffoldMemo& memo)
886 {
887 	std::string metrics_table;
888 	unsigned bestn = 0, bestN50 = 0;
889 	for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
890 		ScaffoldResult result = scaffold_memoized(g, n, minContigLength, memo);
891 		metrics_table += result.metrics;
892 		if (result.n50 > bestN50) {
893 			bestN50 = result.n50;
894 			bestn = n;
895 		}
896 	}
897 
898 	return ScaffoldResult(bestn, minContigLength, bestN50, metrics_table);
899 }
900 
901 /** Find the value of s that maximizes the scaffold N50. */
902 static ScaffoldResult
optimize_s(const Graph & g,unsigned minEdgeWeight,std::pair<unsigned,unsigned> minContigLength,ScaffoldMemo & memo)903 optimize_s(
904     const Graph& g,
905     unsigned minEdgeWeight,
906     std::pair<unsigned, unsigned> minContigLength,
907     ScaffoldMemo& memo)
908 {
909 	std::string metrics_table;
910 	unsigned bests = 0, bestN50 = 0;
911 	const double STEP = cbrt(10); // Three steps per decade.
912 	unsigned ilast = (unsigned)round(log(minContigLength.second) / log(STEP));
913 	for (unsigned i = (unsigned)round(log(minContigLength.first) / log(STEP)); i <= ilast; ++i) {
914 		unsigned s = (unsigned)pow(STEP, (int)i);
915 
916 		// Round to 1 figure.
917 		double nearestDecade = pow(10, floor(log10(s)));
918 		s = unsigned(round(s / nearestDecade) * nearestDecade);
919 
920 		ScaffoldResult result = scaffold_memoized(g, minEdgeWeight, s, memo);
921 		metrics_table += result.metrics;
922 		if (result.n50 > bestN50) {
923 			bestN50 = result.n50;
924 			bests = s;
925 		}
926 	}
927 
928 	return ScaffoldResult(minEdgeWeight, bests, bestN50, metrics_table);
929 }
930 
931 /** Find the values of n and s that maximizes the scaffold N50. */
932 static ScaffoldResult
optimize_grid_search(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,std::pair<unsigned,unsigned> minContigLength)933 optimize_grid_search(
934     const Graph& g,
935     std::pair<unsigned, unsigned> minEdgeWeight,
936     std::pair<unsigned, unsigned> minContigLength)
937 {
938 	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
939 	if (opt::verbose == 0)
940 		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
941 
942 	ScaffoldMemo memo;
943 	std::string metrics_table;
944 	ScaffoldResult best(0, 0, 0, "");
945 	for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
946 		ScaffoldResult result = optimize_s(g, n, minContigLength, memo);
947 		metrics_table += result.metrics;
948 		if (result.n50 > best.n50)
949 			best = result;
950 	}
951 
952 	best.metrics = metrics_table;
953 	return best;
954 }
955 
956 /** Find the values of n and s that maximizes the scaffold N50. */
957 static ScaffoldResult
optimize_line_search(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,std::pair<unsigned,unsigned> minContigLength)958 optimize_line_search(
959     const Graph& g,
960     std::pair<unsigned, unsigned> minEdgeWeight,
961     std::pair<unsigned, unsigned> minContigLength)
962 {
963 	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
964 	if (opt::verbose == 0)
965 		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
966 
967 	ScaffoldMemo memo;
968 	std::string metrics_table;
969 	ScaffoldResult best(
970 	    (minEdgeWeight.first + minEdgeWeight.second) / 2, minContigLength.second, 0, "");
971 	// An upper limit on the number of iterations.
972 	const unsigned MAX_ITERATIONS =
973 	    1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
974 	for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
975 		// Optimize s.
976 		if (opt::verbose > 0) {
977 			std::cerr << "\nOptimizing s for n=" << best.n << "\n\n";
978 			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
979 		}
980 		unsigned previous_s = best.s;
981 		best = optimize_s(g, best.n, minContigLength, memo);
982 		metrics_table += best.metrics;
983 		if (best.s == previous_s)
984 			break;
985 
986 		// Optimize n.
987 		if (opt::verbose > 0) {
988 			std::cerr << "\nOptimizing n for s=" << best.s << "\n\n";
989 			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
990 		}
991 		unsigned previous_n = best.n;
992 		best = optimize_n(g, minEdgeWeight, best.s, memo);
993 		metrics_table += best.metrics;
994 		if (best.n == previous_n)
995 			break;
996 	}
997 
998 	std::cerr << "\nLine search converged in " << memo.size() << " iterations.\n";
999 
1000 	best.metrics = metrics_table;
1001 	return best;
1002 }
1003 
1004 /** Run abyss-scaffold. */
1005 int
main(int argc,char ** argv)1006 main(int argc, char** argv)
1007 {
1008 	if (!opt::db.empty())
1009 		opt::metaVars.resize(3);
1010 
1011 	bool die = false;
1012 	for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
1013 		istringstream arg(optarg != NULL ? optarg : "");
1014 		switch (c) {
1015 		case '?':
1016 			die = true;
1017 			break;
1018 		case 'k':
1019 			arg >> opt::k;
1020 			break;
1021 		case 'G': {
1022 			double x;
1023 			arg >> x;
1024 			opt::genomeSize = x;
1025 			break;
1026 		}
1027 		case 'g':
1028 			arg >> opt::graphPath;
1029 			break;
1030 		case 'n':
1031 			arg >> opt::minEdgeWeight;
1032 			if (arg.peek() == '-') {
1033 				arg >> expect("-") >> opt::minEdgeWeightEnd;
1034 				assert(opt::minEdgeWeight <= opt::minEdgeWeightEnd);
1035 			} else
1036 				opt::minEdgeWeightEnd = opt::minEdgeWeight;
1037 			if (arg.peek() == ':')
1038 				arg >> expect(":") >> opt::minEdgeWeightStep;
1039 			else
1040 				opt::minEdgeWeightStep = 1;
1041 			break;
1042 		case 'o':
1043 			arg >> opt::out;
1044 			break;
1045 		case 's':
1046 			arg >> opt::minContigLength;
1047 			if (arg.peek() == '-') {
1048 				opt::minContigLengthEnd = 100 * opt::minContigLength;
1049 				arg >> expect("-") >> opt::minContigLengthEnd;
1050 				assert(opt::minContigLength <= opt::minContigLengthEnd);
1051 			} else
1052 				opt::minContigLengthEnd = opt::minContigLength;
1053 			break;
1054 		case 'v':
1055 			opt::verbose++;
1056 			break;
1057 		case OPT_MIN_GAP:
1058 			arg >> opt::minGap;
1059 			break;
1060 		case OPT_MAX_GAP:
1061 			arg >> opt::maxGap;
1062 			break;
1063 		case OPT_HELP:
1064 			cout << USAGE_MESSAGE;
1065 			exit(EXIT_SUCCESS);
1066 		case OPT_VERSION:
1067 			cout << VERSION_MESSAGE;
1068 			exit(EXIT_SUCCESS);
1069 		case OPT_DB:
1070 			arg >> opt::db;
1071 			break;
1072 		case OPT_LIBRARY:
1073 			arg >> opt::metaVars[0];
1074 			break;
1075 		case OPT_STRAIN:
1076 			arg >> opt::metaVars[1];
1077 			break;
1078 		case OPT_SPECIES:
1079 			arg >> opt::metaVars[2];
1080 			break;
1081 		}
1082 		if (optarg != NULL && !arg.eof()) {
1083 			cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
1084 			exit(EXIT_FAILURE);
1085 		}
1086 	}
1087 
1088 	if (opt::k <= 0) {
1089 		cerr << PROGRAM ": "
1090 		     << "missing -k,--kmer option\n";
1091 		die = true;
1092 	}
1093 
1094 	if (argc - optind < 0) {
1095 		cerr << PROGRAM ": missing arguments\n";
1096 		die = true;
1097 	}
1098 
1099 	if (die) {
1100 		cerr << "Try `" << PROGRAM << " --help' for more information.\n";
1101 		exit(EXIT_FAILURE);
1102 	}
1103 	if (!opt::db.empty()) {
1104 		init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
1105 		addToDb(db, "K", opt::k);
1106 	}
1107 
1108 	Graph g;
1109 	if (optind < argc) {
1110 		for (; optind < argc; optind++)
1111 			readGraph(argv[optind], g);
1112 	} else
1113 		readGraph("-", g);
1114 
1115 	// Add any missing complementary edges.
1116 	size_t numAdded = addComplementaryEdges(g);
1117 	if (opt::verbose > 0) {
1118 		cerr << "Added " << numAdded << " complementary edges.\n";
1119 		printGraphStats(cerr, g);
1120 	}
1121 
1122 	if (!opt::db.empty())
1123 		addToDb(db, "add_complement_edges", numAdded);
1124 
1125 	// Remove invalid edges.
1126 	unsigned numBefore = num_edges(g);
1127 	remove_edge_if(InvalidEdge(g), static_cast<DG&>(g));
1128 	unsigned numRemoved = numBefore - num_edges(g);
1129 	if (numRemoved > 0)
1130 		cerr << "warning: Removed " << numRemoved << " invalid edges.\n";
1131 
1132 	if (!opt::db.empty())
1133 		addToDb(db, "Edges_invalid", numRemoved);
1134 
1135 	const unsigned STATS_MIN_LENGTH = opt::minContigLength;
1136 	if (opt::minEdgeWeight == opt::minEdgeWeightEnd &&
1137 	    opt::minContigLength == opt::minContigLengthEnd) {
1138 		ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
1139 		// Print assembly contiguity statistics.
1140 		if (opt::verbose > 0)
1141 			std::cerr << '\n';
1142 		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1143 		std::cerr << result.metrics;
1144 	} else {
1145 		ScaffoldResult best(0, 0, 0, "");
1146 		switch (opt::searchStrategy) {
1147 		case GRID_SEARCH:
1148 			best = optimize_grid_search(
1149 			    g,
1150 			    std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1151 			    std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1152 			break;
1153 		case LINE_SEARCH:
1154 			best = optimize_line_search(
1155 			    g,
1156 			    std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1157 			    std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1158 			break;
1159 		default:
1160 			abort();
1161 			break;
1162 		}
1163 
1164 		if (opt::verbose > 0)
1165 			std::cerr << "\nScaffolding with n=" << best.n << " s=" << best.s << "\n\n";
1166 		ScaffoldResult result = scaffold(g, best.n, best.s, true);
1167 
1168 		// Print the table of all the parameter values attempted and their metrics.
1169 		if (opt::verbose > 0) {
1170 			std::cerr << '\n';
1171 			printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1172 			std::cerr << best.metrics;
1173 		}
1174 
1175 		std::cerr << '\n'
1176 		          << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s
1177 		          << ".\n";
1178 
1179 		// Print assembly contiguity statistics.
1180 		std::cerr << '\n';
1181 		printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1182 		std::cerr << result.metrics;
1183 	}
1184 
1185 	return 0;
1186 }
1187