1 /**
2 * Remove short contigs that do not contribute any relevant
3 * information to the assembly.
4 * Written by Tony Raymond <traymond@bcgsc.ca>
5 */
6
7 #include "Common/Options.h"
8 #include "ContigID.h"
9 #include "ContigPath.h"
10 #include "ContigProperties.h"
11 #include "FastaReader.h"
12 #include "Graph/ContigGraph.h"
13 #include "Graph/ContigGraphAlgorithms.h"
14 #include "Graph/DirectedGraph.h"
15 #include "Graph/GraphIO.h"
16 #include "Graph/GraphUtil.h"
17 #include "IOUtil.h"
18 #include "Uncompress.h"
19 #include <algorithm>
20 #include <boost/lambda/bind.hpp>
21 #include <boost/lambda/lambda.hpp>
22 #include <fstream>
23 #include <functional>
24 #include <getopt.h>
25 #include <iostream>
26 #include <iterator>
27 #include <sstream>
28 #include <string>
29 #include <utility>
30 #include <vector>
31
32 using namespace std;
33 using namespace rel_ops;
34 using namespace boost::lambda;
35 using boost::tie;
36
37 #define PROGRAM "abyss-filtergraph"
38
39 static const char VERSION_MESSAGE[] =
40 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
41 "Written by Tony Raymond.\n"
42 "\n"
43 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
44
45 static const char USAGE_MESSAGE[] =
46 "Usage: " PROGRAM " -k<kmer> [OPTION]... ADJ [FASTA]\n"
47 "Remove short contigs that do not contribute any relevant\n"
48 "information to the assembly.\n"
49 "\n"
50 " Arguments:\n"
51 "\n"
52 " ADJ contig adjacency graph\n"
53 " FASTA contigs to check consistency of ADJ edges\n"
54 "\n"
55 " Options:\n"
56 "\n"
57 " -k, --kmer=N k-mer size\n"
58 " --SS expect contigs to be oriented correctly\n"
59 " --no-SS no assumption about contig orientation\n"
60 " -T, --island=N remove islands shorter than N [0]\n"
61 " -t, --tip=N remove tips shorter than N [0]\n"
62 " -l, --length=N remove contigs shorter than N [0]\n"
63 " -L, --max-length=N remove contigs longer than N [0]\n"
64 " -c, --coverage=FLOAT remove contigs with mean k-mer coverage less than FLOAT [0]\n"
65 " -C, --max-coverage=FLOAT remove contigs with mean k-mer coverage at least FLOAT [0]\n"
66 " --shim remove filler contigs that only contribute\n"
67 " to adjacency [default]\n"
68 " --no-shim disable filler contigs removal\n"
69 " --shim-max-degree=N only remove shims where the smaller of \n"
70 " in/out degree is smaller than N [1]\n"
71 " -m, --min-overlap=N require a minimum overlap of N bases [10]\n"
72 " --assemble assemble unambiguous paths\n"
73 " --no-assemble disable assembling of paths [default]\n"
74 " -g, --graph=FILE write the contig adjacency graph to FILE\n"
75 " -i, --ignore=FILE ignore contigs seen in FILE\n"
76 " -r, --remove=FILE remove contigs seen in FILE\n"
77 " --adj output the graph in ADJ format [default]\n"
78 " --asqg output the graph in ASQG format\n"
79 " --dot output the graph in GraphViz format\n"
80 " --gfa output the graph in GFA1 format\n"
81 " --gfa1 output the graph in GFA1 format\n"
82 " --gfa2 output the graph in GFA2 format\n"
83 " --gv output the graph in GraphViz format\n"
84 " --sam output the graph in SAM format\n"
85 " -v, --verbose display verbose output\n"
86 " --help display this help and exit\n"
87 " --version output version information and exit\n"
88 "\n"
89 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
90
91 namespace opt {
92 unsigned k; // used by ContigProperties
93
94 /** Run a strand-specific RNA-Seq assembly. */
95 static int ss;
96
97 /** Remove island contigs less than this length. */
98 static unsigned minIslandLen = 0;
99
100 /** Remove tips less than this length. */
101 static unsigned minTipLen = 0;
102
103 /** Remove all contigs less than this length. */
104 static unsigned minLen = 0;
105
106 /** Remove all contigs more than this length. */
107 static unsigned maxLen = 0;
108
109 /** Remove contigs with mean k-mer coverage less than this threshold. */
110 static float minCoverage = 0;
111
112 /** Remove contigs with mean k-mer coverage at least this threshold. */
113 static float maxCoverage = 0;
114
115 /** Remove short contigs that don't contribute any sequence. */
116 static int shim = 1;
117
118 /** Only remove shims where the smaller of in/out degree is small
119 * enough. */
120 static unsigned shimMaxDegree = 1;
121
122 /** Assemble unambiguous paths. */
123 static int assemble = 0;
124
125 /** Write the contig adjacency graph to this file. */
126 static string graphPath;
127
128 /** Contigs to ignore. */
129 static string ignorePath;
130
131 /** Contigs to remove. */
132 static string removePath;
133
134 /** The minimum overlap allowed between two contigs. */
135 static int minOverlap = 10;
136
137 /** Output graph format. */
138 int format = ADJ; // used by ContigProperties
139 }
140
141 static const char shortopts[] = "c:C:g:i:r:k:l:L:m:t:T:v";
142
143 enum
144 {
145 OPT_HELP = 1,
146 OPT_VERSION,
147 OPT_SHIM_MAX_DEG
148 };
149
150 static const struct option longopts[] = {
151 { "adj", no_argument, &opt::format, ADJ },
152 { "asqg", no_argument, &opt::format, ASQG },
153 { "dot", no_argument, &opt::format, DOT },
154 { "gfa", no_argument, &opt::format, GFA1 },
155 { "gfa1", no_argument, &opt::format, GFA1 },
156 { "gfa2", no_argument, &opt::format, GFA2 },
157 { "gv", no_argument, &opt::format, DOT },
158 { "sam", no_argument, &opt::format, SAM },
159 { "graph", required_argument, NULL, 'g' },
160 { "ignore", required_argument, NULL, 'i' },
161 { "remove", required_argument, NULL, 'r' },
162 { "SS", no_argument, &opt::ss, 1 },
163 { "no-SS", no_argument, &opt::ss, 0 },
164 { "kmer", required_argument, NULL, 'k' },
165 { "island", required_argument, NULL, 'T' },
166 { "tip", required_argument, NULL, 't' },
167 { "length", required_argument, NULL, 'l' },
168 { "max-length", required_argument, NULL, 'L' },
169 { "coverage", required_argument, NULL, 'c' },
170 { "max-coverage", required_argument, NULL, 'C' },
171 { "shim", no_argument, &opt::shim, 1 },
172 { "no-shim", no_argument, &opt::shim, 0 },
173 { "shim-max-degree", required_argument, NULL, OPT_SHIM_MAX_DEG },
174 { "assemble", no_argument, &opt::assemble, 1 },
175 { "no-assemble", no_argument, &opt::assemble, 0 },
176 { "min-overlap", required_argument, NULL, 'm' },
177 { "verbose", no_argument, NULL, 'v' },
178 { "help", no_argument, NULL, OPT_HELP },
179 { "version", no_argument, NULL, OPT_VERSION },
180 { NULL, 0, NULL, 0 }
181 };
182
183 static vector<ContigID> g_removed;
184
185 /** Contig adjacency graph. */
186 typedef ContigGraph<DirectedGraph<ContigProperties, Distance>> Graph;
187 typedef Graph::vertex_descriptor vertex_descriptor;
188 typedef Graph::edge_descriptor edge_descriptor;
189
190 /** Data for verbose output. */
191 static struct
192 {
193 unsigned removed;
194 unsigned tails;
195 unsigned too_long;
196 unsigned too_complex;
197 unsigned self_adj;
198 unsigned checked;
199 unsigned parallel_edge;
200 } g_count;
201
202 /** Returns if the contig can be removed from the graph. */
203 static bool
removable(const Graph * pg,vertex_descriptor v)204 removable(const Graph* pg, vertex_descriptor v)
205 {
206 typedef graph_traits<Graph> GTraits;
207 typedef GTraits::out_edge_iterator OEit;
208 typedef GTraits::in_edge_iterator IEit;
209 typedef GTraits::vertex_descriptor V;
210
211 const Graph& g = *pg;
212
213 // Check if previously removed
214 if (get(vertex_removed, g, v)) {
215 g_count.removed++;
216 return false;
217 }
218
219 unsigned min_degree = min(out_degree(v, g), in_degree(v, g));
220
221 // Check for tails
222 if (min_degree == 0) {
223 g_count.tails++;
224 return false;
225 }
226
227 // Check that the result will be less complex that the original
228 if (min_degree > opt::shimMaxDegree) {
229 g_count.too_complex++;
230 return false;
231 }
232
233 // Check if self adjacent
234 OEit oei0, oei1;
235 tie(oei0, oei1) = out_edges(v, g);
236 for (OEit vw = oei0; vw != oei1; ++vw) {
237 V w = target(*vw, g);
238 V vc = get(vertex_complement, g, v);
239 if (v == w || vc == w) {
240 g_count.self_adj++;
241 return false;
242 }
243 }
244
245 // Check that removing the contig will result in adjacent contigs
246 // overlapping by at least opt::minOverlap.
247 IEit iei0, iei1;
248 tie(iei0, iei1) = in_edges(v, g);
249 IEit maxuv = iei0;
250 for (IEit uv = iei0; uv != iei1; ++uv)
251 if (g[*maxuv].distance < g[*uv].distance)
252 maxuv = uv;
253 OEit maxvw = oei0;
254 for (OEit vw = oei0; vw != oei1; ++vw)
255 if (g[*maxvw].distance < g[*vw].distance)
256 maxvw = vw;
257
258 if (g[*maxuv].distance + (int)g[v].length + g[*maxvw].distance > -opt::minOverlap) {
259 g_count.too_long++;
260 return false;
261 }
262 return true;
263 }
264
265 /** Data to store information of an edge. */
266 struct EdgeInfo
267 {
268 vertex_descriptor u;
269 vertex_descriptor w;
270 edge_bundle_type<Graph>::type ep;
271
EdgeInfoEdgeInfo272 EdgeInfo(vertex_descriptor u, vertex_descriptor w, int ep)
273 : u(u)
274 , w(w)
275 , ep(ep)
276 {}
EdgeInfoEdgeInfo277 EdgeInfo()
278 : u()
279 , w()
280 , ep()
281 {}
282 };
283
284 /** Returns a list of edges that may be added when the vertex v is
285 * removed. */
286 static bool
findNewEdges(const Graph & g,vertex_descriptor v,vector<EdgeInfo> & eds,vector<bool> & markedContigs)287 findNewEdges(
288 const Graph& g,
289 vertex_descriptor v,
290 vector<EdgeInfo>& eds,
291 vector<bool>& markedContigs)
292 {
293 typedef graph_traits<Graph> GTraits;
294 typedef GTraits::vertex_descriptor V;
295 typedef GTraits::out_edge_iterator OEit;
296 typedef GTraits::in_edge_iterator IEit;
297
298 IEit iei0, iei1;
299 tie(iei0, iei1) = in_edges(v, g);
300 OEit oei0, oei1;
301 tie(oei0, oei1) = out_edges(v, g);
302
303 vector<V> marked;
304
305 // if not marked and longest link LE contig length.
306 // for every edge from u->v and v->w we must add an edge u->w
307 for (IEit uv = iei0; uv != iei1; ++uv) {
308 for (OEit vw = oei0; vw != oei1; ++vw) {
309 int x = g[*uv].distance + (int)g[v].length + g[*vw].distance;
310 assert(x <= 0);
311 EdgeInfo ed(source(*uv, g), target(*vw, g), x);
312 eds.push_back(ed);
313 if (out_degree(v, g) > 1)
314 marked.push_back(ed.u);
315 if (in_degree(v, g) > 1)
316 marked.push_back(ed.w);
317 }
318 }
319 for (vector<V>::const_iterator it = marked.begin(); it != marked.end(); it++)
320 markedContigs[get(vertex_index, g, *it)] = true;
321 return true;
322 }
323
324 /** Adds all edges described in the vector eds. */
325 static void
addNewEdges(Graph & g,const vector<EdgeInfo> & eds)326 addNewEdges(Graph& g, const vector<EdgeInfo>& eds)
327 {
328 for (vector<EdgeInfo>::const_iterator edsit = eds.begin(); edsit != eds.end(); ++edsit) {
329 // Don't add parallel edges! This can happen when removing a palindrome.
330 if (edge(edsit->u, edsit->w, g).second) {
331 g_count.parallel_edge++;
332 continue;
333 }
334 assert(edsit->ep.distance <= -opt::minOverlap);
335 add_edge(edsit->u, edsit->w, edsit->ep, g);
336 }
337 }
338
339 static void
removeContig(vertex_descriptor v,Graph & g)340 removeContig(vertex_descriptor v, Graph& g)
341 {
342 clear_vertex(v, g);
343 remove_vertex(v, g);
344 g_removed.push_back(get(vertex_contig_index, g, v));
345 g_count.removed++;
346 }
347
348 /** Remove the specified contig from the adjacency graph. */
349 static void
removeContigs(Graph & g,vector<vertex_descriptor> & sc)350 removeContigs(Graph& g, vector<vertex_descriptor>& sc)
351 {
352 typedef graph_traits<Graph> GTraits;
353 typedef GTraits::vertex_descriptor V;
354
355 vector<vertex_descriptor> out;
356 out.reserve(sc.size());
357
358 vector<bool> markedContigs(g.num_vertices());
359 for (vector<vertex_descriptor>::iterator it = sc.begin(); it != sc.end(); ++it) {
360 V v = *it;
361 if (opt::verbose > 0 && ++g_count.checked % 10000000 == 0)
362 cerr << "Removed " << g_count.removed << "/" << g_count.checked
363 << " vertices that have been checked.\n";
364
365 if (markedContigs[get(vertex_index, g, v)]) {
366 out.push_back(v);
367 continue;
368 }
369
370 if (!removable(&g, v))
371 continue;
372
373 vector<EdgeInfo> eds;
374 if (findNewEdges(g, v, eds, markedContigs))
375 addNewEdges(g, eds);
376 else
377 continue;
378
379 removeContig(v, g);
380 }
381 sc.swap(out);
382 }
383
384 /** Return the value of the bit at the specified index. */
385 struct Marked : unary_function<vertex_descriptor, bool>
386 {
387 typedef vector<bool> Data;
MarkedMarked388 Marked(const Graph& g, const Data& data)
389 : m_g(g)
390 , m_data(data)
391 {}
operator ()Marked392 bool operator()(vertex_descriptor u) const { return m_data[get(vertex_contig_index, m_g, u)]; }
393
394 private:
395 const Graph& m_g;
396 const Data& m_data;
397 };
398
399 /** Finds all potentially removable contigs in the graph. */
400 static void
findShortContigs(const Graph & g,const vector<bool> & seen,vector<vertex_descriptor> & sc)401 findShortContigs(const Graph& g, const vector<bool>& seen, vector<vertex_descriptor>& sc)
402 {
403 typedef graph_traits<Graph> GTraits;
404 typedef GTraits::vertex_iterator Vit;
405 Vit first, second;
406 tie(first, second) = vertices(g);
407 ::copy_if(
408 first,
409 second,
410 back_inserter(sc),
411 !boost::lambda::bind(Marked(g, seen), _1) && boost::lambda::bind(removable, &g, _1));
412 }
413
414 /** Functor used for sorting contigs based on degree, then size,
415 * and then ID. */
416 struct sortContigs
417 {
418 const Graph& g;
419
sortContigssortContigs420 sortContigs(const Graph& g)
421 : g(g)
422 {}
423
424 template<typename V>
operator ()sortContigs425 bool operator()(V a, V b)
426 {
427 const ContigProperties& ap = g[a];
428 const ContigProperties& bp = g[b];
429
430 unsigned dega = out_degree(a, g) * in_degree(a, g);
431 unsigned degb = out_degree(b, g) * in_degree(b, g);
432
433 return dega != degb ? dega < degb : ap.length != bp.length ? ap.length < bp.length : a < b;
434 }
435 };
436
437 struct ShorterThanX : unary_function<vertex_descriptor, bool>
438 {
439 const Graph& g;
440 const vector<bool>& seen;
441 size_t x;
442
ShorterThanXShorterThanX443 ShorterThanX(const Graph& g, const vector<bool>& seen, size_t x)
444 : g(g)
445 , seen(seen)
446 , x(x)
447 {}
448
operator ()ShorterThanX449 bool operator()(vertex_descriptor y) const
450 {
451 return g[y].length < x && !get(vertex_removed, g, y) &&
452 !seen[get(vertex_contig_index, g, y)];
453 }
454 };
455
456 struct LongerThanX : unary_function<vertex_descriptor, bool>
457 {
458 const Graph& g;
459 const vector<bool>& seen;
460 size_t x;
461
LongerThanXLongerThanX462 LongerThanX(const Graph& g, const vector<bool>& seen, size_t x)
463 : g(g)
464 , seen(seen)
465 , x(x)
466 {}
467
operator ()LongerThanX468 bool operator()(vertex_descriptor y) const
469 {
470 return g[y].length > x && !get(vertex_removed, g, y) &&
471 !seen[get(vertex_contig_index, g, y)];
472 }
473 };
474
475 struct CoverageLessThan : unary_function<vertex_descriptor, bool>
476 {
477 const Graph& g;
478 const vector<bool>& seen;
479 float minCov;
480
CoverageLessThanCoverageLessThan481 CoverageLessThan(const Graph& g, const vector<bool>& seen, float minCov)
482 : g(g)
483 , seen(seen)
484 , minCov(minCov)
485 {}
486
operator ()CoverageLessThan487 bool operator()(vertex_descriptor u) const
488 {
489 assert(opt::k > 0);
490 float meanCoverage = (float)g[u].coverage / (g[u].length - opt::k + 1);
491 return meanCoverage < minCov && !get(vertex_removed, g, u) &&
492 !seen[get(vertex_contig_index, g, u)];
493 }
494 };
495
496 static void
removeShims(Graph & g,const vector<bool> & seen)497 removeShims(Graph& g, const vector<bool>& seen)
498 {
499 if (opt::verbose > 0)
500 cerr << "Removing shim contigs from the graph...\n";
501 vector<vertex_descriptor> shortContigs;
502 findShortContigs(g, seen, shortContigs);
503 for (unsigned i = 0; !shortContigs.empty(); ++i) {
504 if (opt::verbose > 0)
505 cerr << "Pass " << i + 1 << ": Checking " << shortContigs.size() << " contigs.\n";
506 sort(shortContigs.begin(), shortContigs.end(), sortContigs(g));
507 removeContigs(g, shortContigs);
508 }
509 if (opt::verbose > 0) {
510 cerr << "Shim removal stats:\n";
511 cerr << "Removed: " << g_count.removed / 2 << " Too Complex: " << g_count.too_complex / 2
512 << " Tails: " << g_count.tails / 2 << " Too Long: " << g_count.too_long / 2
513 << " Self Adjacent: " << g_count.self_adj / 2
514 << " Parallel Edges: " << g_count.parallel_edge / 2 << '\n';
515 }
516 }
517
518 template<typename pred>
519 static void
removeContigs_if(Graph & g,pred p)520 removeContigs_if(Graph& g, pred p)
521 {
522 typedef graph_traits<Graph> GTraits;
523 typedef GTraits::vertex_iterator Vit;
524 typedef GTraits::vertex_descriptor V;
525 Vit first, second;
526 tie(first, second) = vertices(g);
527 vector<V> sc;
528 ::copy_if(first, second, back_inserter(sc), p);
529 remove_vertex_if(g, sc.begin(), sc.end(), True<V>());
530 transform(sc.begin(), sc.end(), back_inserter(g_removed), [](const ContigNode& c) {
531 return c.contigIndex();
532 });
533 if (opt::verbose > 0)
534 cerr << "Removed " << sc.size() / 2 << " contigs.\n";
535 }
536
537 /** Contig sequences. */
538 typedef vector<const_string> Contigs;
539 static Contigs g_contigs;
540
541 /** Return the sequence of vertex u. */
542 static string
getSequence(const Graph & g,vertex_descriptor u)543 getSequence(const Graph& g, vertex_descriptor u)
544 {
545 size_t i = get(vertex_contig_index, g, u);
546 assert(i < g_contigs.size());
547 string seq(g_contigs[i]);
548 return get(vertex_sense, g, u) ? reverseComplement(seq) : seq;
549 }
550
551 /** Return whether the specified edge is inconsistent. */
552 struct is_edge_inconsistent : unary_function<edge_descriptor, bool>
553 {
554 const Graph& g;
555
is_edge_inconsistentis_edge_inconsistent556 is_edge_inconsistent(const Graph& g)
557 : g(g)
558 {}
559
operator ()is_edge_inconsistent560 bool operator()(edge_descriptor e) const
561 {
562 vertex_descriptor u = source(e, g);
563 vertex_descriptor v = target(e, g);
564
565 int overlap = g[e].distance;
566 assert(overlap < 0);
567
568 string su = getSequence(g, u);
569 string sv = getSequence(g, v);
570 const unsigned u_start = su.length() + overlap;
571
572 for (unsigned i = 0; i < (unsigned)-overlap; i++)
573 if (!(ambiguityToBitmask(su[u_start + i]) & ambiguityToBitmask(sv[i])))
574 return true;
575 return false;
576 }
577 };
578
579 template<typename It>
580 static void
remove_edge(Graph & g,It first,It last)581 remove_edge(Graph& g, It first, It last)
582 {
583 for (; first != last; first++)
584 remove_edge(*first, g);
585 }
586
587 template<typename pred>
588 static void
removeEdges_if(Graph & g,pred p)589 removeEdges_if(Graph& g, pred p)
590 {
591 typedef graph_traits<Graph> GTraits;
592 typedef GTraits::edge_iterator Eit;
593 typedef GTraits::edge_descriptor E;
594 Eit first, second;
595 tie(first, second) = edges(g);
596 vector<E> sc;
597 ::copy_if(first, second, back_inserter(sc), p);
598 remove_edge(g, sc.begin(), sc.end());
599 if (opt::verbose > 0) {
600 cerr << "Edge removal stats:\n";
601 cerr << "Removed: " << sc.size() << '\n';
602 }
603 }
604
605 int
main(int argc,char ** argv)606 main(int argc, char** argv)
607 {
608 string commandLine;
609 {
610 ostringstream ss;
611 char** last = argv + argc - 1;
612 copy(argv, last, ostream_iterator<const char*>(ss, " "));
613 ss << *last;
614 commandLine = ss.str();
615 }
616
617 bool die = false;
618 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
619 istringstream arg(optarg != NULL ? optarg : "");
620 switch (c) {
621 case '?':
622 die = true;
623 break;
624 case 'c':
625 arg >> opt::minCoverage;
626 break;
627 case 'C':
628 arg >> opt::maxCoverage;
629 break;
630 case 'l':
631 arg >> opt::minLen;
632 break;
633 case 'L':
634 arg >> opt::maxLen;
635 break;
636 case 'm':
637 arg >> opt::minOverlap;
638 break;
639 case 'g':
640 arg >> opt::graphPath;
641 break;
642 case 'i':
643 arg >> opt::ignorePath;
644 break;
645 case 'r':
646 arg >> opt::removePath;
647 break;
648 case 'k':
649 arg >> opt::k;
650 break;
651 case 'T':
652 arg >> opt::minIslandLen;
653 break;
654 case 't':
655 arg >> opt::minTipLen;
656 break;
657 case 'v':
658 opt::verbose++;
659 break;
660 case OPT_SHIM_MAX_DEG:
661 arg >> opt::shimMaxDegree;
662 break;
663 case OPT_HELP:
664 cout << USAGE_MESSAGE;
665 exit(EXIT_SUCCESS);
666 case OPT_VERSION:
667 cout << VERSION_MESSAGE;
668 exit(EXIT_SUCCESS);
669 }
670 if (optarg != NULL && !arg.eof()) {
671 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
672 exit(EXIT_FAILURE);
673 }
674 }
675
676 if (opt::minOverlap < 0) {
677 cerr << PROGRAM ": "
678 << "--min-overlap must be a positive integer.\n";
679 die = true;
680 }
681
682 if (opt::k <= 0) {
683 cerr << PROGRAM ": "
684 << "missing -k,--kmer option\n";
685 die = true;
686 }
687
688 if (argc - optind < 1) {
689 cerr << PROGRAM ": missing arguments\n";
690 die = true;
691 }
692
693 if (argc - optind > 2) {
694 cerr << PROGRAM ": too many arguments\n";
695 die = true;
696 }
697
698 if (die) {
699 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
700 exit(EXIT_FAILURE);
701 }
702
703 Graph g;
704 // Read the contig adjacency graph.
705 {
706 string adjPath(argv[optind++]);
707 if (opt::verbose > 0)
708 cerr << "Loading graph from file: " << adjPath << '\n';
709 ifstream fin(adjPath.c_str());
710 assert_good(fin, adjPath);
711 fin >> g;
712 assert(fin.eof());
713 }
714
715 // Read the set of contigs to ignore.
716 vector<bool> seen(num_vertices(g) / 2);
717 if (!opt::ignorePath.empty()) {
718 ifstream in(opt::ignorePath.c_str());
719 assert_good(in, opt::ignorePath);
720 markSeenInPath(in, seen);
721 }
722
723 if (opt::verbose > 0) {
724 cerr << "Graph stats before:\n";
725 printGraphStats(cerr, g);
726 }
727
728 // Remove list of contigs
729 if (!opt::removePath.empty()) {
730 ifstream in(opt::removePath.c_str());
731 assert(in.good());
732 string s;
733 size_t b = g_removed.size();
734 while (in >> s) {
735 size_t i = get(g_contigNames, s);
736 removeContig(ContigNode(i, 0), g);
737 }
738 assert(in.eof());
739 if (opt::verbose)
740 cerr << "Removed " << g_removed.size() - b << " contigs.\n";
741 }
742
743 // Remove shims.
744 if (opt::shim)
745 removeShims(g, seen);
746
747 // Remove islands.
748 if (opt::minIslandLen > 0) {
749 size_t s = g_removed.size();
750 removeIslands_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minIslandLen));
751 if (opt::verbose)
752 cerr << "Removed " << g_removed.size() - s << " islands.\n";
753 }
754
755 // Remove tips.
756 if (opt::minTipLen > 0) {
757 size_t s, prev;
758 s = g_removed.size();
759 do {
760 prev = g_removed.size();
761 pruneTips_if(g, back_inserter(g_removed), ShorterThanX(g, seen, opt::minTipLen));
762 } while (prev < g_removed.size());
763 if (opt::verbose)
764 cerr << "Removed " << g_removed.size() - s << " tips.\n";
765 }
766
767 // Remove short contigs.
768 if (opt::minLen > 0)
769 removeContigs_if(g, ShorterThanX(g, seen, opt::minLen));
770
771 // Remove long contigs.
772 if (opt::maxLen > 0)
773 removeContigs_if(g, LongerThanX(g, seen, opt::maxLen));
774
775 // Remove contigs with low mean k-mer coverage.
776 if (opt::minCoverage > 0)
777 removeContigs_if(g, CoverageLessThan(g, seen, opt::minCoverage));
778
779 // Remove contigs with high mean k-mer coverage.
780 if (opt::maxCoverage > 0)
781 removeContigs_if(g, std::not1(CoverageLessThan(g, seen, opt::maxCoverage)));
782
783 // Remove inconsistent edges of spaceseeds
784 if (argc - optind == 1) {
785 const char* contigsPath(argv[optind++]);
786 Contigs& contigs = g_contigs;
787 if (opt::verbose > 0)
788 cerr << "Reading `" << contigsPath << "'...\n";
789 FastaReader in(contigsPath, FastaReader::NO_FOLD_CASE);
790 for (FastaRecord rec; in >> rec;) {
791 if (g_contigNames.count(rec.id) == 0)
792 continue;
793 assert(contigs.size() == get(g_contigNames, rec.id));
794 contigs.push_back(rec.seq);
795 }
796 assert(in.eof());
797
798 removeEdges_if(g, is_edge_inconsistent(g));
799 }
800
801 if (opt::verbose > 0) {
802 cerr << "Graph stats after:\n";
803 printGraphStats(cerr, g);
804 }
805
806 // Output the updated adjacency graph.
807 if (!opt::graphPath.empty()) {
808 ofstream fout(opt::graphPath.c_str());
809 assert_good(fout, opt::graphPath);
810 write_graph(fout, g, PROGRAM, commandLine);
811 assert_good(fout, opt::graphPath);
812 }
813
814 // Assemble unambiguous paths. These need to be assembled by
815 // MergeContigs before being processed by other applications.
816 if (opt::assemble) {
817 size_t numContigs = num_vertices(g) / 2;
818 typedef vector<ContigPath> ContigPaths;
819 ContigPaths paths;
820 if (opt::ss)
821 assemble_stranded(g, back_inserter(paths));
822 else
823 assemble(g, back_inserter(paths));
824 g_contigNames.unlock();
825 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it) {
826 ContigNode u(numContigs + it - paths.begin(), false);
827 string name = createContigName();
828 put(vertex_name, g, u, name);
829 cout << name << '\t' << *it << '\n';
830 }
831 g_contigNames.lock();
832 }
833
834 return 0;
835 }
836