1 #include "ContigPath.h"
2 #include "ContigProperties.h"
3 #include "Graph/Assemble.h"
4 #include "Graph/ContigGraph.h"
5 #include "Graph/ContigGraphAlgorithms.h"
6 #include "Graph/DirectedGraph.h"
7 #include "Graph/GraphAlgorithms.h"
8 #include "Graph/GraphIO.h"
9 #include "Graph/GraphUtil.h"
10 #include "Uncompress.h"
11 #include "config.h"
12 #include <cassert>
13 #include <cstdlib>
14 #include <fstream>
15 #include <getopt.h>
16 #include <iostream>
17 #include <sstream>
18 #include <string>
19 #include <utility>
20
21 using namespace std;
22 using boost::tie;
23
24 #define PROGRAM "abyss-layout"
25
26 static const char VERSION_MESSAGE[] = PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
27 "Written by Shaun Jackman.\n"
28 "\n"
29 "Copyright 2012 Shaun Jackman\n";
30
31 static const char USAGE_MESSAGE[] =
32 "Usage: " PROGRAM " [OPTION]... OVERLAP\n"
33 "Layout contigs using the sequence overlap graph.\n"
34 "Output sequence paths.\n"
35 "\n"
36 " Arguments:\n"
37 "\n"
38 " OVERLAP the sequence overlap graph\n"
39 "\n"
40 " Options:\n"
41 "\n"
42 " -s, --min-length=N minimum sequence length [0]\n"
43 " -m, --min-overlap=N minimum overlap [0]\n"
44 " -k, --kmer=N length of a k-mer\n"
45 " -o, --out=FILE write the paths to FILE\n"
46 " -g, --graph=FILE write the graph to FILE\n"
47 " --tred remove transitive edges\n"
48 " --no-tred do not remove transitive edges [default]\n"
49 " --SS expect contigs to be oriented correctly\n"
50 " --no-SS no assumption about contig orientation [default]\n"
51 " -v, --verbose display verbose output\n"
52 " --help display this help and exit\n"
53 " --version output version information and exit\n"
54 "\n"
55 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
56
57 namespace opt {
58 unsigned k; // used by ContigProperties
59
60 /** Minimum sequence length. */
61 static unsigned minLength;
62
63 /** Minimum overlap. */
64 static unsigned minOverlap;
65
66 /** Write the paths to this file. */
67 static string out;
68
69 /** Write the graph to this file. */
70 static string graphPath;
71
72 /** Remove transitive edges. */
73 static int tred;
74
75 /** Run a strand-specific RNA-Seq assembly. */
76 static int ss;
77
78 /** Verbose output. */
79 int verbose; // used by PopBubbles
80
81 /** Output format */
82 int format = DOT;
83 }
84
85 static const char shortopts[] = "g:k:m:o:s:v";
86
87 enum
88 {
89 OPT_HELP = 1,
90 OPT_VERSION
91 };
92
93 static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' },
94 { "kmer", required_argument, NULL, 'k' },
95 { "min-overlap", required_argument, NULL, 'm' },
96 { "out", required_argument, NULL, 'o' },
97 { "min-length", required_argument, NULL, 's' },
98 { "tred", no_argument, &opt::tred, true },
99 { "no-tred", no_argument, &opt::tred, false },
100 { "SS", no_argument, &opt::ss, 1 },
101 { "no-SS", no_argument, &opt::ss, 0 },
102 { "verbose", no_argument, NULL, 'v' },
103 { "help", no_argument, NULL, OPT_HELP },
104 { "version", no_argument, NULL, OPT_VERSION },
105 { NULL, 0, NULL, 0 } };
106
107 /** An overlap graph. */
108 typedef DirectedGraph<Length, Distance> DG;
109 typedef ContigGraph<DG> Graph;
110
111 /** Remove short vertices. */
112 static void
filterVertices(Graph & g,unsigned minLength)113 filterVertices(Graph& g, unsigned minLength)
114 {
115 typedef graph_traits<Graph> GTraits;
116 typedef GTraits::vertex_descriptor V;
117 typedef GTraits::vertex_iterator Vit;
118
119 if (minLength == 0)
120 return;
121
122 // Remove short sequences.
123 unsigned numRemoved = 0;
124 std::pair<Vit, Vit> urange = vertices(g);
125 for (Vit uit = urange.first; uit != urange.second; ++uit) {
126 V u = *uit;
127 if (g[u].length < minLength)
128 clear_vertex(u, g);
129 if (out_degree(u, g) == 0 && in_degree(u, g) == 0) {
130 remove_vertex(u, g);
131 numRemoved++;
132 }
133 }
134
135 if (opt::verbose > 0) {
136 cerr << "Ignored " << numRemoved << " sequences shorter than " << minLength << " bp.\n";
137 printGraphStats(cerr, g);
138 }
139 }
140
141 /** Return true if the edge is a small overlap. */
142 struct IsSmallOverlap
143 {
IsSmallOverlapIsSmallOverlap144 IsSmallOverlap(Graph& g)
145 : m_g(g)
146 {}
operator ()IsSmallOverlap147 bool operator()(graph_traits<Graph>::edge_descriptor e) const
148 {
149 int maxDistance = -opt::minOverlap;
150 return m_g[e].distance > maxDistance;
151 }
152 const Graph& m_g;
153 };
154
155 /** Remove small overlaps. */
156 static void
filterEdges(Graph & g,unsigned minOverlap)157 filterEdges(Graph& g, unsigned minOverlap)
158 {
159 if (minOverlap == 0)
160 return;
161 unsigned numBefore = num_edges(g);
162 remove_edge_if(IsSmallOverlap(g), static_cast<DG&>(g));
163 unsigned numRemoved = numBefore - num_edges(g);
164 if (opt::verbose > 0) {
165 cerr << "Removed " << numRemoved << " small overlaps.\n";
166 printGraphStats(cerr, g);
167 }
168 }
169
170 /** Read a graph from the specified file. */
171 static void
readGraph(const string & path,Graph & g)172 readGraph(const string& path, Graph& g)
173 {
174 if (opt::verbose > 0)
175 cerr << "Reading `" << path << "'...\n";
176 ifstream fin(path.c_str());
177 istream& in = path == "-" ? cin : fin;
178 assert_good(in, path);
179 in >> g;
180 assert(in.eof());
181 if (opt::verbose > 0)
182 printGraphStats(cerr, g);
183 g_contigNames.lock();
184 }
185
186 /** Return the length histogram. */
187 static Histogram
buildLengthHistogram(const Graph & g)188 buildLengthHistogram(const Graph& g)
189 {
190 typedef graph_traits<Graph>::vertex_descriptor V;
191 typedef graph_traits<Graph>::vertex_iterator Vit;
192 Histogram h;
193 Vit uit, ulast;
194 for (tie(uit, ulast) = vertices(g); uit != ulast; ++++uit) {
195 V u = *uit;
196 if (!get(vertex_removed, g, u))
197 h.insert(g[u].length);
198 }
199 return h;
200 }
201
202 /** Run abyss-layout. */
203 int
main(int argc,char ** argv)204 main(int argc, char** argv)
205 {
206 bool die = false;
207 for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
208 istringstream arg(optarg != NULL ? optarg : "");
209 switch (c) {
210 case '?':
211 die = true;
212 break;
213 case 'k':
214 arg >> opt::k;
215 break;
216 case 'g':
217 arg >> opt::graphPath;
218 break;
219 case 'm':
220 arg >> opt::minOverlap;
221 break;
222 case 'o':
223 arg >> opt::out;
224 break;
225 case 's':
226 arg >> opt::minLength;
227 break;
228 case 'v':
229 opt::verbose++;
230 break;
231 case OPT_HELP:
232 cout << USAGE_MESSAGE;
233 exit(EXIT_SUCCESS);
234 case OPT_VERSION:
235 cout << VERSION_MESSAGE;
236 exit(EXIT_SUCCESS);
237 }
238 if (optarg != NULL && !arg.eof()) {
239 cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
240 exit(EXIT_FAILURE);
241 }
242 }
243
244 if (argc - optind < 1) {
245 cerr << PROGRAM ": missing arguments\n";
246 die = true;
247 }
248
249 if (die) {
250 cerr << "Try `" << PROGRAM << " --help' for more information.\n";
251 exit(EXIT_FAILURE);
252 }
253
254 Graph g;
255 if (optind < argc) {
256 for (; optind < argc; optind++)
257 readGraph(argv[optind], g);
258 } else
259 readGraph("-", g);
260
261 // Remove short sequences.
262 filterVertices(g, opt::minLength);
263
264 // Remove small overlaps.
265 filterEdges(g, opt::minOverlap);
266
267 // Remove transitive edges.
268 if (opt::tred) {
269 unsigned numTransitive = remove_transitive_edges(g);
270 if (opt::verbose > 0) {
271 cerr << "Removed " << numTransitive << " transitive edges.\n";
272 printGraphStats(cerr, g);
273 }
274 }
275
276 /** A container of contig paths. */
277 typedef vector<ContigPath> ContigPaths;
278
279 // Assemble the paths.
280 ContigPaths paths;
281 assembleDFS(g, back_inserter(paths), opt::ss);
282 sort(paths.begin(), paths.end());
283 if (opt::verbose > 0) {
284 unsigned n = 0;
285 for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
286 n += it->size();
287 cerr << "Assembled " << n << " sequences in " << paths.size() << " contigs.\n";
288 printGraphStats(cerr, g);
289 }
290
291 // Output the paths.
292 ofstream fout(opt::out.c_str());
293 ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
294 assert_good(out, opt::out);
295 g_contigNames.unlock();
296 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
297 out << createContigName() << '\t' << *it << '\n';
298 assert_good(out, opt::out);
299
300 // Create the new vertices.
301 for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it) {
302 const ContigPath& path = *it;
303 merge(g, path.begin(), path.end());
304 remove_vertex_if(
305 g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); });
306 }
307 if (opt::verbose > 0)
308 printGraphStats(cerr, g);
309
310 // Output the graph.
311 if (!opt::graphPath.empty()) {
312 ofstream out(opt::graphPath.c_str());
313 assert_good(out, opt::graphPath);
314 write_dot(out, g);
315 assert_good(out, opt::graphPath);
316 }
317
318 // Print assembly contiguity statistics.
319 if (opt::verbose > 0) {
320 Histogram h = buildLengthHistogram(g);
321 const unsigned STATS_MIN_LENGTH = 200; // bp
322 printContiguityStats(cerr, h, STATS_MIN_LENGTH) << '\n';
323 }
324
325 return 0;
326 }
327