1 #include "Common/UnorderedMap.h"
2 #include "ContigNode.h"
3 #include "ContigPath.h"
4 #include "ContigProperties.h"
5 #include "DataBase/DB.h"
6 #include "DataBase/Options.h"
7 #include "Estimate.h"
8 #include "Graph/Assemble.h"
9 #include "Graph/ContigGraph.h"
10 #include "Graph/ContigGraphAlgorithms.h"
11 #include "Graph/DirectedGraph.h"
12 #include "Graph/GraphAlgorithms.h"
13 #include "Graph/GraphIO.h"
14 #include "Graph/GraphUtil.h"
15 #include "Graph/PopBubbles.h"
16 #include "IOUtil.h"
17 #include "Iterator.h"
18 #include "Uncompress.h"
19 #include "config.h"
20 #include <cassert>
21 #include <climits>
22 #include <cmath>
23 #include <cstdlib>
24 #include <fstream>
25 #include <functional>
26 #include <getopt.h>
27 #include <iostream>
28 #include <utility>
29
30 using namespace std;
31 using namespace std::rel_ops;
32 using boost::edge_bundle_type;
33 using boost::tie;
34
35 #define PROGRAM "abyss-scaffold"
36
37 DB db;
38
39 static const char VERSION_MESSAGE[] =
40 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
41 "Written by Shaun Jackman.\n"
42 "\n"
43 "Copyright 2018 Canada's Michael Smith Genome Sciences Centre\n";
44
45 static const char USAGE_MESSAGE[] =
46 "Usage: " PROGRAM " -k<kmer> [OPTION]... FASTA|OVERLAP DIST...\n"
47 "Scaffold contigs using the distance estimate graph.\n"
48 "\n"
49 " Arguments:\n"
50 "\n"
51 " FASTA contigs in FASTA format\n"
52 " OVERLAP the contig overlap graph\n"
53 " DIST estimates of the distance between contigs\n"
54 "\n"
55 " Options:\n"
56 "\n"
57 " -n, --npairs=N minimum number of pairs [0]\n"
58 " or -n A-B:S Find the value of n in [A,B] with step size S\n"
59 " that maximizes the scaffold N50.\n"
60 " Default value for the step size is 1, if unspecified.\n"
61 " -s, --seed-length=N minimum contig length [1000]\n"
62 " or -s A-B Find the value of s in [A,B]\n"
63 " that maximizes the scaffold N50.\n"
64 " --grid optimize using a grid search [default]\n"
65 " --line optimize using a line search\n"
66 " -k, --kmer=N length of a k-mer\n"
67 " -G, --genome-size=N expected genome size. Used to calculate NG50\n"
68 " and associated stats [disabled]\n"
69 " --min-gap=N minimum scaffold gap length to output [50]\n"
70 " --max-gap=N maximum scaffold gap length to output [inf]\n"
71 " --complex remove complex transitive edges\n"
72 " --no-complex don't remove complex transitive edges [default]\n"
73 " --SS expect contigs to be oriented correctly\n"
74 " --no-SS no assumption about contig orientation [default]\n"
75 " -o, --out=FILE write the paths to FILE\n"
76 " -g, --graph=FILE write the graph to FILE\n"
77 " -v, --verbose display verbose output\n"
78 " --help display this help and exit\n"
79 " --version output version information and exit\n"
80 " --db=FILE specify path of database repository in FILE\n"
81 " --library=NAME specify library NAME for sqlite\n"
82 " --strain=NAME specify strain NAME for sqlite\n"
83 " --species=NAME specify species NAME for sqlite\n"
84 "\n"
85 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
86
87 namespace opt {
88 string db;
89 dbVars metaVars;
90
91 unsigned k; // used by ContigProperties
92
93 /** Optimization search strategy. */
94 static int searchStrategy;
95
96 /** Minimum number of pairs. */
97 static unsigned minEdgeWeight;
98 static unsigned minEdgeWeightEnd;
99 static unsigned minEdgeWeightStep;
100
101 /** Minimum contig length. */
102 static unsigned minContigLength = 1000;
103 static unsigned minContigLengthEnd = 1000;
104
105 /** Genome size. Used to calculate NG50. */
106 static long long unsigned genomeSize;
107
108 /** Minimum scaffold gap length to output. */
109 static int minGap = 50;
110
111 /** Maximum scaffold gap length to output.
112 * -ve value means no maximum. */
113 static int maxGap = -1;
114
115 /** Write the paths to this file. */
116 static string out;
117
118 /** Write the graph to this file. */
119 static string graphPath;
120
121 /** Run a strand-specific RNA-Seq assembly. */
122 static int ss;
123
124 /** Verbose output. */
125 int verbose; // used by PopBubbles
126
127 /** Output format */
128 int format = DOT; // used by DistanceEst
129
130 /** Remove complex transitive edges */
131 static int comp_trans;
132 }
133
134 static const char shortopts[] = "G:g:k:n:o:s:v";
135
136 enum
137 {
138 OPT_HELP = 1,
139 OPT_VERSION,
140 OPT_MIN_GAP,
141 OPT_MAX_GAP,
142 OPT_COMP,
143 OPT_DB,
144 OPT_LIBRARY,
145 OPT_STRAIN,
146 OPT_SPECIES
147 };
148
149 /** Optimization search strategy. */
150 enum
151 {
152 GRID_SEARCH,
153 LINE_SEARCH
154 };
155
156 static const struct option longopts[] = {
157 { "graph", no_argument, NULL, 'g' },
158 { "kmer", required_argument, NULL, 'k' },
159 { "genome-size", required_argument, NULL, 'G' },
160 { "min-gap", required_argument, NULL, OPT_MIN_GAP },
161 { "max-gap", required_argument, NULL, OPT_MAX_GAP },
162 { "npairs", required_argument, NULL, 'n' },
163 { "grid", no_argument, &opt::searchStrategy, GRID_SEARCH },
164 { "line", no_argument, &opt::searchStrategy, LINE_SEARCH },
165 { "out", required_argument, NULL, 'o' },
166 { "seed-length", required_argument, NULL, 's' },
167 { "complex", no_argument, &opt::comp_trans, 1 },
168 { "no-complex", no_argument, &opt::comp_trans, 0 },
169 { "SS", no_argument, &opt::ss, 1 },
170 { "no-SS", no_argument, &opt::ss, 0 },
171 { "verbose", no_argument, NULL, 'v' },
172 { "help", no_argument, NULL, OPT_HELP },
173 { "version", no_argument, NULL, OPT_VERSION },
174 { "db", required_argument, NULL, OPT_DB },
175 { "library", required_argument, NULL, OPT_LIBRARY },
176 { "strain", required_argument, NULL, OPT_STRAIN },
177 { "species", required_argument, NULL, OPT_SPECIES },
178 { NULL, 0, NULL, 0 }
179 };
180
181 /** A distance estimate graph. */
182 typedef DirectedGraph<Length, DistanceEst> DG;
183 typedef ContigGraph<DG> Graph;
184
185 /** Return whether this edge is invalid.
186 * An edge is invalid when the overlap is larger than the length of
187 * either of its incident sequences.
188 */
189 struct InvalidEdge
190 {
InvalidEdgeInvalidEdge191 InvalidEdge(Graph& g)
192 : m_g(g)
193 {}
operator ()InvalidEdge194 bool operator()(graph_traits<Graph>::edge_descriptor e) const
195 {
196 int d = m_g[e].distance;
197 int ulength = m_g[source(e, m_g)].length;
198 int vlength = m_g[target(e, m_g)].length;
199 return d + ulength <= 0 || d + vlength <= 0;
200 }
201 const Graph& m_g;
202 };
203
204 /** Return whether the specified edges has sufficient support. */
205 struct PoorSupport
206 {
PoorSupportPoorSupport207 PoorSupport(Graph& g, unsigned minEdgeWeight)
208 : m_g(g)
209 , m_minEdgeWeight(minEdgeWeight)
210 {}
operator ()PoorSupport211 bool operator()(graph_traits<Graph>::edge_descriptor e) const
212 {
213 return m_g[e].numPairs < m_minEdgeWeight;
214 }
215 const Graph& m_g;
216 unsigned m_minEdgeWeight;
217 };
218
219 /** Remove short vertices and unsupported edges from the graph. */
220 static void
filterGraph(Graph & g,unsigned minEdgeWeight,unsigned minContigLength)221 filterGraph(Graph& g, unsigned minEdgeWeight, unsigned minContigLength)
222 {
223 typedef graph_traits<Graph> GTraits;
224 typedef GTraits::vertex_descriptor V;
225 typedef GTraits::vertex_iterator Vit;
226
227 // Remove short contigs.
228 unsigned numRemovedV = 0;
229 std::pair<Vit, Vit> urange = vertices(g);
230 for (Vit uit = urange.first; uit != urange.second; ++uit) {
231 V u = *uit;
232 if (g[u].length < minContigLength)
233 clear_vertex(u, g);
234 if (out_degree(u, g) == 0 && in_degree(u, g) == 0) {
235 remove_vertex(u, g);
236 numRemovedV++;
237 }
238 }
239 if (opt::verbose > 0)
240 cerr << "Removed " << numRemovedV << " vertices.\n";
241
242 // Remove poorly-supported edges.
243 unsigned numBefore = num_edges(g);
244 remove_edge_if(PoorSupport(g, minEdgeWeight), static_cast<DG&>(g));
245 unsigned numRemovedE = numBefore - num_edges(g);
246 if (opt::verbose > 0)
247 cerr << "Removed " << numRemovedE << " edges.\n";
248 if (!opt::db.empty()) {
249 addToDb(db, "V_removed", numRemovedV);
250 addToDb(db, "E_removed", numRemovedE);
251 }
252 }
253
254 /** Return true if the specified edge is a cycle. */
255 static bool
isCycle(Graph & g,graph_traits<Graph>::edge_descriptor e)256 isCycle(Graph& g, graph_traits<Graph>::edge_descriptor e)
257 {
258 return edge(target(e, g), source(e, g), g).second;
259 }
260
261 /** Remove simple cycles of length two from the graph. */
262 static void
removeCycles(Graph & g)263 removeCycles(Graph& g)
264 {
265 typedef graph_traits<Graph>::edge_descriptor E;
266 typedef graph_traits<Graph>::edge_iterator Eit;
267
268 // Identify the cycles.
269 vector<E> cycles;
270 Eit eit, elast;
271 for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
272 E e = *eit;
273 if (isCycle(g, e))
274 cycles.push_back(e);
275 }
276
277 /** Remove the cycles. */
278 remove_edges(g, cycles.begin(), cycles.end());
279 if (opt::verbose > 0) {
280 cerr << "Removed " << cycles.size() << " cyclic edges.\n";
281 printGraphStats(cerr, g);
282 }
283
284 if (!opt::db.empty())
285 addToDb(db, "E_removed_cyclic", cycles.size());
286 }
287
288 /** Find edges in g0 that resolve forks in g.
289 * For a pair of edges (u,v1) and (u,v2) in g, if exactly one of the
290 * edges (v1,v2) or (v2,v1) exists in g0, add that edge to g.
291 */
292 static void
resolveForks(Graph & g,const Graph & g0)293 resolveForks(Graph& g, const Graph& g0)
294 {
295 typedef graph_traits<Graph>::adjacency_iterator Vit;
296 typedef graph_traits<Graph>::edge_descriptor E;
297 typedef graph_traits<Graph>::vertex_iterator Uit;
298 typedef graph_traits<Graph>::vertex_descriptor V;
299
300 unsigned numEdges = 0;
301 pair<Uit, Uit> urange = vertices(g);
302 for (Uit uit = urange.first; uit != urange.second; ++uit) {
303 V u = *uit;
304 if (out_degree(u, g) < 2)
305 continue;
306 pair<Vit, Vit> vrange = adjacent_vertices(u, g);
307 for (Vit vit1 = vrange.first; vit1 != vrange.second;) {
308 V v1 = *vit1;
309 ++vit1;
310 assert(v1 != u);
311 for (Vit vit2 = vit1; vit2 != vrange.second; ++vit2) {
312 V v2 = *vit2;
313 assert(v2 != u);
314 assert(v1 != v2);
315 if (edge(v1, v2, g).second || edge(v2, v1, g).second)
316 continue;
317 pair<E, bool> e12 = edge(v1, v2, g0);
318 pair<E, bool> e21 = edge(v2, v1, g0);
319 if (e12.second && e21.second) {
320 if (opt::verbose > 1)
321 cerr << "cycle: " << get(vertex_name, g, v1) << ' '
322 << get(vertex_name, g, v2) << '\n';
323 } else if (e12.second || e21.second) {
324 E e = e12.second ? e12.first : e21.first;
325 V v = source(e, g0), w = target(e, g0);
326 add_edge(v, w, g0[e], g);
327 numEdges++;
328 if (opt::verbose > 1)
329 cerr << get(vertex_name, g, u) << " -> " << get(vertex_name, g, v) << " -> "
330 << get(vertex_name, g, w) << " [" << g0[e] << "]\n";
331 }
332 }
333 }
334 }
335 if (opt::verbose > 0)
336 cerr << "Added " << numEdges << " edges to ambiguous vertices.\n";
337 if (!opt::db.empty())
338 addToDb(db, "E_added_ambig", numEdges);
339 }
340
341 /** Remove tips.
342 * For an edge (u,v), remove the vertex v if deg+(u) > 1
343 * and deg-(v) = 1 and deg+(v) = 0.
344 */
345 static void
pruneTips(Graph & g)346 pruneTips(Graph& g)
347 {
348 /** Identify the tips. */
349 size_t n = 0;
350 pruneTips(g, CountingOutputIterator(n));
351
352 if (opt::verbose > 0) {
353 cerr << "Removed " << n << " tips.\n";
354 printGraphStats(cerr, g);
355 }
356
357 if (!opt::db.empty())
358 addToDb(db, "Tips_removed", n);
359 }
360
361 /** Remove repetitive vertices from this graph.
362 * input: digraph g { t1->v1 t2->v2 t1->u t2->u u->v1 u->v2 }
363 * operation: remove vertex u
364 * output: digraph g { t1->v1 t2->v2 }
365 */
366 static void
removeRepeats(Graph & g)367 removeRepeats(Graph& g)
368 {
369 typedef graph_traits<Graph>::adjacency_iterator Ait;
370 typedef graph_traits<Graph>::edge_descriptor E;
371 typedef graph_traits<Graph>::vertex_descriptor V;
372
373 vector<V> repeats;
374 vector<E> transitive;
375 find_transitive_edges(g, back_inserter(transitive));
376 for (vector<E>::const_iterator it = transitive.begin(); it != transitive.end(); ++it) {
377 // Iterate through the transitive edges, u->w1.
378 V u = source(*it, g), w1 = target(*it, g);
379 Ait vit, vlast;
380 for (tie(vit, vlast) = adjacent_vertices(u, g); vit != vlast; ++vit) {
381 V v = *vit;
382 assert(u != v); // no self loops
383 if (!edge(v, w1, g).second)
384 continue;
385 // u->w1 is a transitive edge spanning u->v->w1.
386 Ait wit, wlast;
387 for (tie(wit, wlast) = adjacent_vertices(v, g); wit != wlast; ++wit) {
388 // For each edge v->w2, check that an edge
389 // w1->w2 or w2->w1 exists. If not, v is a repeat.
390 V w2 = *wit;
391 assert(v != w2); // no self loops
392 if (w1 != w2 && !edge(w1, w2, g).second && !edge(w2, w1, g).second) {
393 repeats.push_back(v);
394 break;
395 }
396 }
397 }
398 }
399
400 sort(repeats.begin(), repeats.end());
401 repeats.erase(unique(repeats.begin(), repeats.end()), repeats.end());
402 if (opt::verbose > 1) {
403 cerr << "Ambiguous:";
404 for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it)
405 cerr << ' ' << get(vertex_name, g, *it);
406 cerr << '\n';
407 }
408
409 // Remove the repetitive vertices.
410 unsigned numRemoved = 0;
411 for (vector<V>::const_iterator it = repeats.begin(); it != repeats.end(); ++it) {
412 V u = *it;
413 V uc = get(vertex_complement, g, u);
414 clear_out_edges(u, g);
415 if (it != repeats.begin() && it[-1] == uc) {
416 remove_vertex(u, g);
417 numRemoved++;
418 }
419 }
420
421 if (opt::verbose > 0) {
422 cerr << "Cleared " << repeats.size() << " ambiguous vertices.\n"
423 << "Removed " << numRemoved << " ambiguous vertices.\n";
424 printGraphStats(cerr, g);
425 }
426 if (!opt::db.empty()) {
427 addToDb(db, "V_cleared_ambg", repeats.size());
428 addToDb(db, "V_removed_ambg", numRemoved);
429 }
430 }
431
432 /** Remove weak edges from this graph.
433 * input: digraph g { u1->v2 u1->v1 u2->v2 }
434 * (u1,v2).n < (u1,v1).n and (u1,v2).n < (u2,v2).n
435 * operation: remove edge u1->v2
436 * output: digraph g {u1->v1 u2->v2 }
437 */
438 static void
removeWeakEdges(Graph & g)439 removeWeakEdges(Graph& g)
440 {
441 typedef graph_traits<Graph>::edge_descriptor E;
442 typedef graph_traits<Graph>::edge_iterator Eit;
443 typedef graph_traits<Graph>::in_edge_iterator Iit;
444 typedef graph_traits<Graph>::out_edge_iterator Oit;
445 typedef graph_traits<Graph>::vertex_descriptor V;
446
447 vector<E> weak;
448 Eit eit, elast;
449 for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
450 E u1v2 = *eit;
451 V u1 = source(u1v2, g), v2 = target(u1v2, g);
452 if (out_degree(u1, g) != 2 || in_degree(v2, g) != 2)
453 continue;
454
455 Oit oit, olast;
456 tie(oit, olast) = out_edges(u1, g);
457 E u1v1;
458 if (target(*oit, g) == v2) {
459 ++oit;
460 u1v1 = *oit;
461 } else {
462 u1v1 = *oit;
463 ++oit;
464 }
465 assert(++oit == olast);
466 V v1 = target(u1v1, g);
467 assert(v1 != v2);
468 if (in_degree(v1, g) != 1)
469 continue;
470
471 Iit iit, ilast;
472 tie(iit, ilast) = in_edges(v2, g);
473 E u2v2;
474 if (source(*iit, g) == u1) {
475 ++iit;
476 assert(iit != ilast);
477 u2v2 = *iit;
478 } else {
479 assert(iit != ilast);
480 u2v2 = *iit;
481 ++iit;
482 }
483 assert(++iit == ilast);
484 V u2 = source(u2v2, g);
485 assert(u1 != u2);
486 if (out_degree(u2, g) != 1)
487 continue;
488
489 unsigned n = g[u1v2].numPairs;
490 if (n < g[u1v1].numPairs && n < g[u2v2].numPairs)
491 weak.push_back(u1v2);
492 }
493
494 if (opt::verbose > 1) {
495 cerr << "Weak edges:\n";
496 for (vector<E>::const_iterator it = weak.begin(); it != weak.end(); ++it) {
497 E e = *it;
498 cerr << '\t' << get(edge_name, g, e) << " [" << g[e] << "]\n";
499 }
500 }
501
502 /** Remove the weak edges. */
503 remove_edges(g, weak.begin(), weak.end());
504 if (opt::verbose > 0) {
505 cerr << "Removed " << weak.size() << " weak edges.\n";
506 printGraphStats(cerr, g);
507 }
508 if (!opt::db.empty())
509 addToDb(db, "E_removed_weak", weak.size());
510 }
511
512 static void
removeLongEdges(Graph & g)513 removeLongEdges(Graph& g)
514 {
515 typedef graph_traits<Graph>::edge_descriptor E;
516 typedef graph_traits<Graph>::edge_iterator Eit;
517
518 vector<E> long_e;
519 Eit eit, elast;
520 for (tie(eit, elast) = edges(g); eit != elast; ++eit) {
521 E e = *eit;
522 if (g[e].distance > opt::maxGap)
523 long_e.push_back(e);
524 }
525 remove_edges(g, long_e.begin(), long_e.end());
526 }
527
528 /** Return whether the specified distance estimate is an exact
529 * overlap.
530 */
531 static bool
isOverlap(const DistanceEst & d)532 isOverlap(const DistanceEst& d)
533 {
534 if (d.stdDev == 0) {
535 assert(d.distance < 0);
536 return true;
537 } else
538 return false;
539 }
540
541 /** Add distance estimates to a path.
542 * @param g0 the original graph
543 * @param g1 the transformed graph
544 */
545 static ContigPath
addDistEst(const Graph & g0,const Graph & g1,const ContigPath & path)546 addDistEst(const Graph& g0, const Graph& g1, const ContigPath& path)
547 {
548 typedef graph_traits<Graph>::edge_descriptor E;
549 typedef edge_bundle_type<Graph>::type EP;
550
551 ContigPath out;
552 out.reserve(2 * path.size());
553 ContigNode u = path.front();
554 out.push_back(u);
555 for (ContigPath::const_iterator it = path.begin() + 1; it != path.end(); ++it) {
556 ContigNode v = *it;
557 assert(!v.ambiguous());
558 pair<E, bool> e0 = edge(u, v, g0);
559 pair<E, bool> e1 = edge(u, v, g1);
560 if (!e0.second && !e1.second)
561 std::cerr << "error: missing edge: " << get(vertex_name, g0, u) << " -> "
562 << get(vertex_name, g0, v) << '\n';
563 assert(e0.second || e1.second);
564 const EP& ep = e0.second ? g0[e0.first] : g1[e1.first];
565 if (!isOverlap(ep)) {
566 int distance = max(ep.distance, (int)opt::minGap);
567 int numN = distance + opt::k - 1; // by convention
568 assert(numN >= 0);
569 numN = max(numN, 1);
570 out.push_back(ContigNode(numN, 'N'));
571 }
572 out.push_back(v);
573 u = v;
574 }
575 return out;
576 }
577
578 /** Read a graph from the specified file. */
579 static void
readGraph(const string & path,Graph & g)580 readGraph(const string& path, Graph& g)
581 {
582 if (opt::verbose > 0)
583 cerr << "Reading `" << path << "'...\n";
584 ifstream fin(path.c_str());
585 istream& in = path == "-" ? cin : fin;
586 assert_good(in, path);
587 read_graph(in, g, BetterDistanceEst());
588 assert(in.eof());
589 if (opt::verbose > 0)
590 printGraphStats(cerr, g);
591
592 vector<int> vals = passGraphStatsVal(g);
593 vector<string> keys = make_vector<string>() << "V_readGraph"
594 << "E_readGraph"
595 << "degree0_readGraph"
596 << "degree1_readGraph"
597 << "degree234_readGraph"
598 << "degree5_readGraph"
599 << "max_readGraph";
600
601 if (!opt::db.empty()) {
602 for (unsigned i = 0; i < vals.size(); i++)
603 addToDb(db, keys[i], vals[i]);
604 }
605 g_contigNames.lock();
606 }
607
608 /** Return the scaffold length of [first, last), not counting gaps. */
609 template<typename It>
610 unsigned
addLength(const Graph & g,It first,It last)611 addLength(const Graph& g, It first, It last)
612 {
613 typedef typename graph_traits<Graph>::vertex_descriptor V;
614 assert(first != last);
615 unsigned length = g[*first].length;
616 for (It it = first + 1; it != last; ++it) {
617 V u = *(it - 1);
618 V v = *it;
619 length += min(0, get(edge_bundle, g, u, v).distance);
620 length += g[v].length;
621 }
622 return length;
623 }
624
625 /** A container of contig paths. */
626 typedef vector<ContigPath> ContigPaths;
627
628 /**
629 * Build the scaffold length histogram.
630 * @param g The graph g is destroyed.
631 */
632 static Histogram
buildScaffoldLengthHistogram(Graph & g,const ContigPaths & paths)633 buildScaffoldLengthHistogram(Graph& g, const ContigPaths& paths)
634 {
635 Histogram h;
636
637 // Clear the removed flag.
638 typedef graph_traits<Graph>::vertex_iterator Vit;
639 Vit uit, ulast;
640 for (tie(uit, ulast) = vertices(g); uit != ulast; ++uit)
641 put(vertex_removed, g, *uit, false);
642
643 // Remove the vertices that are used in paths
644 // and add the lengths of the scaffolds.
645 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
646 h.insert(addLength(g, it->begin(), it->end()));
647 remove_vertex_if(
648 g, it->begin(), it->end(), [](const ContigNode& c) { return !c.ambiguous(); });
649 }
650
651 // Add the contigs that were not used in paths.
652 for (tie(uit, ulast) = vertices(g); uit != ulast; ++++uit) {
653 typedef graph_traits<Graph>::vertex_descriptor V;
654 V u = *uit;
655 if (!get(vertex_removed, g, u))
656 h.insert(g[u].length);
657 }
658
659 return h;
660 }
661
662 /** Add contiguity stats to database */
663 static void
addCntgStatsToDb(const Histogram h,const unsigned min)664 addCntgStatsToDb(const Histogram h, const unsigned min)
665 {
666 vector<int> vals = passContiguityStatsVal(h, min);
667 vector<string> keys = make_vector<string>() << "n"
668 << "n200"
669 << "nN50"
670 << "min"
671 << "N75"
672 << "N50"
673 << "N25"
674 << "Esize"
675 << "max"
676 << "sum"
677 << "nNG50"
678 << "NG50";
679 if (!opt::db.empty()) {
680 for (unsigned i = 0; i < vals.size(); i++)
681 addToDb(db, keys[i], vals[i]);
682 }
683 }
684
685 /** Parameters of scaffolding. */
686 struct ScaffoldParam
687 {
ScaffoldParamScaffoldParam688 ScaffoldParam(unsigned n, unsigned s)
689 : n(n)
690 , s(s)
691 {}
operator ==ScaffoldParam692 bool operator==(const ScaffoldParam& o) const { return n == o.n && s == o.s; }
693 unsigned n;
694 unsigned s;
695 };
696
697 NAMESPACE_STD_HASH_BEGIN
698 template<>
699 struct hash<ScaffoldParam>
700 {
operator ()hash701 size_t operator()(const ScaffoldParam& param) const
702 {
703 return hash<unsigned>()(param.n) ^ hash<unsigned>()(param.s);
704 }
705 };
706 NAMESPACE_STD_HASH_END
707
708 /** Result of scaffolding. */
709 struct ScaffoldResult : ScaffoldParam
710 {
ScaffoldResultScaffoldResult711 ScaffoldResult()
712 : ScaffoldParam(0, 0)
713 , n50(0)
714 {}
ScaffoldResultScaffoldResult715 ScaffoldResult(unsigned n, unsigned s, unsigned n50, std::string metrics)
716 : ScaffoldParam(n, s)
717 , n50(n50)
718 , metrics(metrics)
719 {}
720 unsigned n50;
721 std::string metrics;
722 };
723
724 /** Build scaffold paths.
725 * @param output write the results
726 * @return the scaffold N50
727 */
728 ScaffoldResult
scaffold(const Graph & g0,unsigned minEdgeWeight,unsigned minContigLength,bool output)729 scaffold(const Graph& g0, unsigned minEdgeWeight, unsigned minContigLength, bool output)
730 {
731 Graph g(g0);
732
733 // Filter the graph.
734 filterGraph(g, minEdgeWeight, minContigLength);
735 if (opt::verbose > 0)
736 printGraphStats(cerr, g);
737
738 // Remove cycles.
739 removeCycles(g);
740
741 // Resolve forks.
742 resolveForks(g, g0);
743
744 // Prune tips.
745 pruneTips(g);
746
747 // Remove repeats.
748 removeRepeats(g);
749
750 // Remove transitive edges.
751 unsigned numTransitive;
752 if (opt::comp_trans)
753 numTransitive = remove_complex_transitive_edges(g);
754 else
755 numTransitive = remove_transitive_edges(g);
756
757 if (opt::verbose > 0) {
758 cerr << "Removed " << numTransitive << " transitive edges.\n";
759 printGraphStats(cerr, g);
760 }
761
762 if (!opt::db.empty())
763 addToDb(db, "Edges_transitive", numTransitive);
764
765 // Prune tips.
766 pruneTips(g);
767
768 // Pop bubbles.
769 typedef graph_traits<Graph>::vertex_descriptor V;
770 vector<V> popped = popBubbles(g);
771 if (opt::verbose > 0) {
772 cerr << "Removed " << popped.size() << " vertices in bubbles.\n";
773 printGraphStats(cerr, g);
774 }
775
776 if (!opt::db.empty())
777 addToDb(db, "Vertices_bubblePopped", popped.size());
778
779 if (opt::verbose > 1) {
780 cerr << "Popped:";
781 for (vector<V>::const_iterator it = popped.begin(); it != popped.end(); ++it)
782 cerr << ' ' << get(vertex_name, g, *it);
783 cerr << '\n';
784 }
785
786 // Remove weak edges.
787 removeWeakEdges(g);
788
789 // Remove any edges longer than opt::maxGap.
790 if (opt::maxGap >= 0)
791 removeLongEdges(g);
792
793 // Assemble the paths.
794 ContigPaths paths;
795 assembleDFS(g, back_inserter(paths), opt::ss);
796 sort(paths.begin(), paths.end());
797 unsigned n = 0;
798 if (opt::verbose > 0) {
799 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
800 n += it->size();
801 cerr << "Assembled " << n << " contigs in " << paths.size() << " scaffolds.\n";
802 printGraphStats(cerr, g);
803 }
804
805 if (!opt::db.empty()) {
806 addToDb(db, "contigs_assembled", n);
807 addToDb(db, "scaffolds_assembled", paths.size());
808 }
809
810 if (output) {
811 // Output the paths.
812 ofstream fout(opt::out.c_str());
813 ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
814 assert_good(out, opt::out);
815 g_contigNames.unlock();
816 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
817 out << createContigName() << '\t' << addDistEst(g0, g, *it) << '\n';
818 assert_good(out, opt::out);
819
820 // Output the graph.
821 if (!opt::graphPath.empty()) {
822 ofstream out(opt::graphPath.c_str());
823 assert_good(out, opt::graphPath);
824 write_dot(out, g);
825 assert_good(out, opt::graphPath);
826 }
827 }
828
829 // Compute contiguity metrics.
830 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
831 std::ostringstream ss;
832 Histogram scaffold_histogram = buildScaffoldLengthHistogram(g, paths);
833 printContiguityStats(ss, scaffold_histogram, STATS_MIN_LENGTH, false, "\t", opt::genomeSize)
834 << "\tn=" << minEdgeWeight << " s=" << minContigLength << '\n';
835 std::string metrics = ss.str();
836 addCntgStatsToDb(scaffold_histogram, STATS_MIN_LENGTH);
837
838 return ScaffoldResult(
839 minEdgeWeight,
840 minContigLength,
841 scaffold_histogram.trimLow(STATS_MIN_LENGTH).n50(),
842 metrics);
843 }
844
845 /** Memoize the optimization results so far. */
846 typedef unordered_map<ScaffoldParam, ScaffoldResult> ScaffoldMemo;
847
848 /** Build scaffold paths, memoized. */
849 ScaffoldResult
scaffold_memoized(const Graph & g,unsigned n,unsigned s,ScaffoldMemo & memo)850 scaffold_memoized(const Graph& g, unsigned n, unsigned s, ScaffoldMemo& memo)
851 {
852 ScaffoldParam param(n, s);
853 ScaffoldMemo::const_iterator it = memo.find(param);
854 if (it != memo.end()) {
855 // Clear the metrics string, so that this result is not listed
856 // multiple times in the final table of metrics.
857 ScaffoldResult result(it->second);
858 result.metrics.clear();
859 return result;
860 }
861
862 if (opt::verbose > 0)
863 std::cerr << "\nScaffolding with n=" << n << " s=" << s << "\n\n";
864 ScaffoldResult result = scaffold(g, n, s, false);
865 memo[param] = result;
866
867 // Print assembly metrics.
868 if (opt::verbose > 0) {
869 std::cerr << '\n';
870 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
871 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
872 }
873 std::cerr << result.metrics;
874 if (opt::verbose > 0)
875 cerr << '\n';
876 return result;
877 }
878
879 /** Find the value of n that maximizes the scaffold N50. */
880 static ScaffoldResult
optimize_n(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,unsigned minContigLength,ScaffoldMemo & memo)881 optimize_n(
882 const Graph& g,
883 std::pair<unsigned, unsigned> minEdgeWeight,
884 unsigned minContigLength,
885 ScaffoldMemo& memo)
886 {
887 std::string metrics_table;
888 unsigned bestn = 0, bestN50 = 0;
889 for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
890 ScaffoldResult result = scaffold_memoized(g, n, minContigLength, memo);
891 metrics_table += result.metrics;
892 if (result.n50 > bestN50) {
893 bestN50 = result.n50;
894 bestn = n;
895 }
896 }
897
898 return ScaffoldResult(bestn, minContigLength, bestN50, metrics_table);
899 }
900
901 /** Find the value of s that maximizes the scaffold N50. */
902 static ScaffoldResult
optimize_s(const Graph & g,unsigned minEdgeWeight,std::pair<unsigned,unsigned> minContigLength,ScaffoldMemo & memo)903 optimize_s(
904 const Graph& g,
905 unsigned minEdgeWeight,
906 std::pair<unsigned, unsigned> minContigLength,
907 ScaffoldMemo& memo)
908 {
909 std::string metrics_table;
910 unsigned bests = 0, bestN50 = 0;
911 const double STEP = cbrt(10); // Three steps per decade.
912 unsigned ilast = (unsigned)round(log(minContigLength.second) / log(STEP));
913 for (unsigned i = (unsigned)round(log(minContigLength.first) / log(STEP)); i <= ilast; ++i) {
914 unsigned s = (unsigned)pow(STEP, (int)i);
915
916 // Round to 1 figure.
917 double nearestDecade = pow(10, floor(log10(s)));
918 s = unsigned(round(s / nearestDecade) * nearestDecade);
919
920 ScaffoldResult result = scaffold_memoized(g, minEdgeWeight, s, memo);
921 metrics_table += result.metrics;
922 if (result.n50 > bestN50) {
923 bestN50 = result.n50;
924 bests = s;
925 }
926 }
927
928 return ScaffoldResult(minEdgeWeight, bests, bestN50, metrics_table);
929 }
930
931 /** Find the values of n and s that maximizes the scaffold N50. */
932 static ScaffoldResult
optimize_grid_search(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,std::pair<unsigned,unsigned> minContigLength)933 optimize_grid_search(
934 const Graph& g,
935 std::pair<unsigned, unsigned> minEdgeWeight,
936 std::pair<unsigned, unsigned> minContigLength)
937 {
938 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
939 if (opt::verbose == 0)
940 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
941
942 ScaffoldMemo memo;
943 std::string metrics_table;
944 ScaffoldResult best(0, 0, 0, "");
945 for (unsigned n = minEdgeWeight.first; n <= minEdgeWeight.second; n += opt::minEdgeWeightStep) {
946 ScaffoldResult result = optimize_s(g, n, minContigLength, memo);
947 metrics_table += result.metrics;
948 if (result.n50 > best.n50)
949 best = result;
950 }
951
952 best.metrics = metrics_table;
953 return best;
954 }
955
956 /** Find the values of n and s that maximizes the scaffold N50. */
957 static ScaffoldResult
optimize_line_search(const Graph & g,std::pair<unsigned,unsigned> minEdgeWeight,std::pair<unsigned,unsigned> minContigLength)958 optimize_line_search(
959 const Graph& g,
960 std::pair<unsigned, unsigned> minEdgeWeight,
961 std::pair<unsigned, unsigned> minContigLength)
962 {
963 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
964 if (opt::verbose == 0)
965 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
966
967 ScaffoldMemo memo;
968 std::string metrics_table;
969 ScaffoldResult best(
970 (minEdgeWeight.first + minEdgeWeight.second) / 2, minContigLength.second, 0, "");
971 // An upper limit on the number of iterations.
972 const unsigned MAX_ITERATIONS =
973 1 + (minEdgeWeight.second - minEdgeWeight.first) / opt::minEdgeWeightStep;
974 for (unsigned i = 0; i < MAX_ITERATIONS; ++i) {
975 // Optimize s.
976 if (opt::verbose > 0) {
977 std::cerr << "\nOptimizing s for n=" << best.n << "\n\n";
978 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
979 }
980 unsigned previous_s = best.s;
981 best = optimize_s(g, best.n, minContigLength, memo);
982 metrics_table += best.metrics;
983 if (best.s == previous_s)
984 break;
985
986 // Optimize n.
987 if (opt::verbose > 0) {
988 std::cerr << "\nOptimizing n for s=" << best.s << "\n\n";
989 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
990 }
991 unsigned previous_n = best.n;
992 best = optimize_n(g, minEdgeWeight, best.s, memo);
993 metrics_table += best.metrics;
994 if (best.n == previous_n)
995 break;
996 }
997
998 std::cerr << "\nLine search converged in " << memo.size() << " iterations.\n";
999
1000 best.metrics = metrics_table;
1001 return best;
1002 }
1003
1004 /** Run abyss-scaffold. */
1005 int
main(int argc,char ** argv)1006 main(int argc, char** argv)
1007 {
1008 if (!opt::db.empty())
1009 opt::metaVars.resize(3);
1010
1011 bool die = false;
1012 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
1013 istringstream arg(optarg != NULL ? optarg : "");
1014 switch (c) {
1015 case '?':
1016 die = true;
1017 break;
1018 case 'k':
1019 arg >> opt::k;
1020 break;
1021 case 'G': {
1022 double x;
1023 arg >> x;
1024 opt::genomeSize = x;
1025 break;
1026 }
1027 case 'g':
1028 arg >> opt::graphPath;
1029 break;
1030 case 'n':
1031 arg >> opt::minEdgeWeight;
1032 if (arg.peek() == '-') {
1033 arg >> expect("-") >> opt::minEdgeWeightEnd;
1034 assert(opt::minEdgeWeight <= opt::minEdgeWeightEnd);
1035 } else
1036 opt::minEdgeWeightEnd = opt::minEdgeWeight;
1037 if (arg.peek() == ':')
1038 arg >> expect(":") >> opt::minEdgeWeightStep;
1039 else
1040 opt::minEdgeWeightStep = 1;
1041 break;
1042 case 'o':
1043 arg >> opt::out;
1044 break;
1045 case 's':
1046 arg >> opt::minContigLength;
1047 if (arg.peek() == '-') {
1048 opt::minContigLengthEnd = 100 * opt::minContigLength;
1049 arg >> expect("-") >> opt::minContigLengthEnd;
1050 assert(opt::minContigLength <= opt::minContigLengthEnd);
1051 } else
1052 opt::minContigLengthEnd = opt::minContigLength;
1053 break;
1054 case 'v':
1055 opt::verbose++;
1056 break;
1057 case OPT_MIN_GAP:
1058 arg >> opt::minGap;
1059 break;
1060 case OPT_MAX_GAP:
1061 arg >> opt::maxGap;
1062 break;
1063 case OPT_HELP:
1064 cout << USAGE_MESSAGE;
1065 exit(EXIT_SUCCESS);
1066 case OPT_VERSION:
1067 cout << VERSION_MESSAGE;
1068 exit(EXIT_SUCCESS);
1069 case OPT_DB:
1070 arg >> opt::db;
1071 break;
1072 case OPT_LIBRARY:
1073 arg >> opt::metaVars[0];
1074 break;
1075 case OPT_STRAIN:
1076 arg >> opt::metaVars[1];
1077 break;
1078 case OPT_SPECIES:
1079 arg >> opt::metaVars[2];
1080 break;
1081 }
1082 if (optarg != NULL && !arg.eof()) {
1083 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
1084 exit(EXIT_FAILURE);
1085 }
1086 }
1087
1088 if (opt::k <= 0) {
1089 cerr << PROGRAM ": "
1090 << "missing -k,--kmer option\n";
1091 die = true;
1092 }
1093
1094 if (argc - optind < 0) {
1095 cerr << PROGRAM ": missing arguments\n";
1096 die = true;
1097 }
1098
1099 if (die) {
1100 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
1101 exit(EXIT_FAILURE);
1102 }
1103 if (!opt::db.empty()) {
1104 init(db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
1105 addToDb(db, "K", opt::k);
1106 }
1107
1108 Graph g;
1109 if (optind < argc) {
1110 for (; optind < argc; optind++)
1111 readGraph(argv[optind], g);
1112 } else
1113 readGraph("-", g);
1114
1115 // Add any missing complementary edges.
1116 size_t numAdded = addComplementaryEdges(g);
1117 if (opt::verbose > 0) {
1118 cerr << "Added " << numAdded << " complementary edges.\n";
1119 printGraphStats(cerr, g);
1120 }
1121
1122 if (!opt::db.empty())
1123 addToDb(db, "add_complement_edges", numAdded);
1124
1125 // Remove invalid edges.
1126 unsigned numBefore = num_edges(g);
1127 remove_edge_if(InvalidEdge(g), static_cast<DG&>(g));
1128 unsigned numRemoved = numBefore - num_edges(g);
1129 if (numRemoved > 0)
1130 cerr << "warning: Removed " << numRemoved << " invalid edges.\n";
1131
1132 if (!opt::db.empty())
1133 addToDb(db, "Edges_invalid", numRemoved);
1134
1135 const unsigned STATS_MIN_LENGTH = opt::minContigLength;
1136 if (opt::minEdgeWeight == opt::minEdgeWeightEnd &&
1137 opt::minContigLength == opt::minContigLengthEnd) {
1138 ScaffoldResult result = scaffold(g, opt::minEdgeWeight, opt::minContigLength, true);
1139 // Print assembly contiguity statistics.
1140 if (opt::verbose > 0)
1141 std::cerr << '\n';
1142 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1143 std::cerr << result.metrics;
1144 } else {
1145 ScaffoldResult best(0, 0, 0, "");
1146 switch (opt::searchStrategy) {
1147 case GRID_SEARCH:
1148 best = optimize_grid_search(
1149 g,
1150 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1151 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1152 break;
1153 case LINE_SEARCH:
1154 best = optimize_line_search(
1155 g,
1156 std::make_pair(opt::minEdgeWeight, opt::minEdgeWeightEnd),
1157 std::make_pair(opt::minContigLength, opt::minContigLengthEnd));
1158 break;
1159 default:
1160 abort();
1161 break;
1162 }
1163
1164 if (opt::verbose > 0)
1165 std::cerr << "\nScaffolding with n=" << best.n << " s=" << best.s << "\n\n";
1166 ScaffoldResult result = scaffold(g, best.n, best.s, true);
1167
1168 // Print the table of all the parameter values attempted and their metrics.
1169 if (opt::verbose > 0) {
1170 std::cerr << '\n';
1171 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1172 std::cerr << best.metrics;
1173 }
1174
1175 std::cerr << '\n'
1176 << "Best scaffold N50 is " << best.n50 << " at n=" << best.n << " s=" << best.s
1177 << ".\n";
1178
1179 // Print assembly contiguity statistics.
1180 std::cerr << '\n';
1181 printContiguityStatsHeader(std::cerr, STATS_MIN_LENGTH, "\t", opt::genomeSize);
1182 std::cerr << result.metrics;
1183 }
1184
1185 return 0;
1186 }
1187