1 #include "ContigPath.h"
2 #include "ContigProperties.h"
3 #include "Graph/Assemble.h"
4 #include "Graph/ContigGraph.h"
5 #include "Graph/ContigGraphAlgorithms.h"
6 #include "Graph/DirectedGraph.h"
7 #include "Graph/GraphAlgorithms.h"
8 #include "Graph/GraphIO.h"
9 #include "Graph/GraphUtil.h"
10 #include "Uncompress.h"
11 #include "config.h"
12 #include <cassert>
13 #include <cstdlib>
14 #include <fstream>
15 #include <getopt.h>
16 #include <iostream>
17 #include <sstream>
18 #include <string>
19 #include <utility>
20 
21 using namespace std;
22 using boost::tie;
23 
24 #define PROGRAM "abyss-layout"
25 
26 static const char VERSION_MESSAGE[] = PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
27                                               "Written by Shaun Jackman.\n"
28                                               "\n"
29                                               "Copyright 2012 Shaun Jackman\n";
30 
31 static const char USAGE_MESSAGE[] =
32     "Usage: " PROGRAM " [OPTION]... OVERLAP\n"
33     "Layout contigs using the sequence overlap graph.\n"
34     "Output sequence paths.\n"
35     "\n"
36     " Arguments:\n"
37     "\n"
38     "  OVERLAP  the sequence overlap graph\n"
39     "\n"
40     " Options:\n"
41     "\n"
42     "  -s, --min-length=N    minimum sequence length [0]\n"
43     "  -m, --min-overlap=N   minimum overlap [0]\n"
44     "  -k, --kmer=N          length of a k-mer\n"
45     "  -o, --out=FILE        write the paths to FILE\n"
46     "  -g, --graph=FILE      write the graph to FILE\n"
47     "      --tred            remove transitive edges\n"
48     "      --no-tred         do not remove transitive edges [default]\n"
49     "      --SS              expect contigs to be oriented correctly\n"
50     "      --no-SS           no assumption about contig orientation [default]\n"
51     "  -v, --verbose         display verbose output\n"
52     "      --help            display this help and exit\n"
53     "      --version         output version information and exit\n"
54     "\n"
55     "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
56 
57 namespace opt {
58 unsigned k; // used by ContigProperties
59 
60 /** Minimum sequence length. */
61 static unsigned minLength;
62 
63 /** Minimum overlap. */
64 static unsigned minOverlap;
65 
66 /** Write the paths to this file. */
67 static string out;
68 
69 /** Write the graph to this file. */
70 static string graphPath;
71 
72 /** Remove transitive edges. */
73 static int tred;
74 
75 /** Run a strand-specific RNA-Seq assembly. */
76 static int ss;
77 
78 /** Verbose output. */
79 int verbose; // used by PopBubbles
80 
81 /** Output format */
82 int format = DOT;
83 }
84 
85 static const char shortopts[] = "g:k:m:o:s:v";
86 
87 enum
88 {
89 	OPT_HELP = 1,
90 	OPT_VERSION
91 };
92 
93 static const struct option longopts[] = { { "graph", required_argument, NULL, 'g' },
94 	                                      { "kmer", required_argument, NULL, 'k' },
95 	                                      { "min-overlap", required_argument, NULL, 'm' },
96 	                                      { "out", required_argument, NULL, 'o' },
97 	                                      { "min-length", required_argument, NULL, 's' },
98 	                                      { "tred", no_argument, &opt::tred, true },
99 	                                      { "no-tred", no_argument, &opt::tred, false },
100 	                                      { "SS", no_argument, &opt::ss, 1 },
101 	                                      { "no-SS", no_argument, &opt::ss, 0 },
102 	                                      { "verbose", no_argument, NULL, 'v' },
103 	                                      { "help", no_argument, NULL, OPT_HELP },
104 	                                      { "version", no_argument, NULL, OPT_VERSION },
105 	                                      { NULL, 0, NULL, 0 } };
106 
107 /** An overlap graph. */
108 typedef DirectedGraph<Length, Distance> DG;
109 typedef ContigGraph<DG> Graph;
110 
111 /** Remove short vertices. */
112 static void
filterVertices(Graph & g,unsigned minLength)113 filterVertices(Graph& g, unsigned minLength)
114 {
115 	typedef graph_traits<Graph> GTraits;
116 	typedef GTraits::vertex_descriptor V;
117 	typedef GTraits::vertex_iterator Vit;
118 
119 	if (minLength == 0)
120 		return;
121 
122 	// Remove short sequences.
123 	unsigned numRemoved = 0;
124 	std::pair<Vit, Vit> urange = vertices(g);
125 	for (Vit uit = urange.first; uit != urange.second; ++uit) {
126 		V u = *uit;
127 		if (g[u].length < minLength)
128 			clear_vertex(u, g);
129 		if (out_degree(u, g) == 0 && in_degree(u, g) == 0) {
130 			remove_vertex(u, g);
131 			numRemoved++;
132 		}
133 	}
134 
135 	if (opt::verbose > 0) {
136 		cerr << "Ignored " << numRemoved << " sequences shorter than " << minLength << " bp.\n";
137 		printGraphStats(cerr, g);
138 	}
139 }
140 
141 /** Return true if the edge is a small overlap. */
142 struct IsSmallOverlap
143 {
IsSmallOverlapIsSmallOverlap144 	IsSmallOverlap(Graph& g)
145 	  : m_g(g)
146 	{}
operator ()IsSmallOverlap147 	bool operator()(graph_traits<Graph>::edge_descriptor e) const
148 	{
149 		int maxDistance = -opt::minOverlap;
150 		return m_g[e].distance > maxDistance;
151 	}
152 	const Graph& m_g;
153 };
154 
155 /** Remove small overlaps. */
156 static void
filterEdges(Graph & g,unsigned minOverlap)157 filterEdges(Graph& g, unsigned minOverlap)
158 {
159 	if (minOverlap == 0)
160 		return;
161 	unsigned numBefore = num_edges(g);
162 	remove_edge_if(IsSmallOverlap(g), static_cast<DG&>(g));
163 	unsigned numRemoved = numBefore - num_edges(g);
164 	if (opt::verbose > 0) {
165 		cerr << "Removed " << numRemoved << " small overlaps.\n";
166 		printGraphStats(cerr, g);
167 	}
168 }
169 
170 /** Read a graph from the specified file. */
171 static void
readGraph(const string & path,Graph & g)172 readGraph(const string& path, Graph& g)
173 {
174 	if (opt::verbose > 0)
175 		cerr << "Reading `" << path << "'...\n";
176 	ifstream fin(path.c_str());
177 	istream& in = path == "-" ? cin : fin;
178 	assert_good(in, path);
179 	in >> g;
180 	assert(in.eof());
181 	if (opt::verbose > 0)
182 		printGraphStats(cerr, g);
183 	g_contigNames.lock();
184 }
185 
186 /** Return the length histogram. */
187 static Histogram
buildLengthHistogram(const Graph & g)188 buildLengthHistogram(const Graph& g)
189 {
190 	typedef graph_traits<Graph>::vertex_descriptor V;
191 	typedef graph_traits<Graph>::vertex_iterator Vit;
192 	Histogram h;
193 	Vit uit, ulast;
194 	for (tie(uit, ulast) = vertices(g); uit != ulast; ++++uit) {
195 		V u = *uit;
196 		if (!get(vertex_removed, g, u))
197 			h.insert(g[u].length);
198 	}
199 	return h;
200 }
201 
202 /** Run abyss-layout. */
203 int
main(int argc,char ** argv)204 main(int argc, char** argv)
205 {
206 	bool die = false;
207 	for (int c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
208 		istringstream arg(optarg != NULL ? optarg : "");
209 		switch (c) {
210 		case '?':
211 			die = true;
212 			break;
213 		case 'k':
214 			arg >> opt::k;
215 			break;
216 		case 'g':
217 			arg >> opt::graphPath;
218 			break;
219 		case 'm':
220 			arg >> opt::minOverlap;
221 			break;
222 		case 'o':
223 			arg >> opt::out;
224 			break;
225 		case 's':
226 			arg >> opt::minLength;
227 			break;
228 		case 'v':
229 			opt::verbose++;
230 			break;
231 		case OPT_HELP:
232 			cout << USAGE_MESSAGE;
233 			exit(EXIT_SUCCESS);
234 		case OPT_VERSION:
235 			cout << VERSION_MESSAGE;
236 			exit(EXIT_SUCCESS);
237 		}
238 		if (optarg != NULL && !arg.eof()) {
239 			cerr << PROGRAM ": invalid option: `-" << (char)c << optarg << "'\n";
240 			exit(EXIT_FAILURE);
241 		}
242 	}
243 
244 	if (argc - optind < 1) {
245 		cerr << PROGRAM ": missing arguments\n";
246 		die = true;
247 	}
248 
249 	if (die) {
250 		cerr << "Try `" << PROGRAM << " --help' for more information.\n";
251 		exit(EXIT_FAILURE);
252 	}
253 
254 	Graph g;
255 	if (optind < argc) {
256 		for (; optind < argc; optind++)
257 			readGraph(argv[optind], g);
258 	} else
259 		readGraph("-", g);
260 
261 	// Remove short sequences.
262 	filterVertices(g, opt::minLength);
263 
264 	// Remove small overlaps.
265 	filterEdges(g, opt::minOverlap);
266 
267 	// Remove transitive edges.
268 	if (opt::tred) {
269 		unsigned numTransitive = remove_transitive_edges(g);
270 		if (opt::verbose > 0) {
271 			cerr << "Removed " << numTransitive << " transitive edges.\n";
272 			printGraphStats(cerr, g);
273 		}
274 	}
275 
276 	/** A container of contig paths. */
277 	typedef vector<ContigPath> ContigPaths;
278 
279 	// Assemble the paths.
280 	ContigPaths paths;
281 	assembleDFS(g, back_inserter(paths), opt::ss);
282 	sort(paths.begin(), paths.end());
283 	if (opt::verbose > 0) {
284 		unsigned n = 0;
285 		for (ContigPaths::const_iterator it = paths.begin(); it != paths.end(); ++it)
286 			n += it->size();
287 		cerr << "Assembled " << n << " sequences in " << paths.size() << " contigs.\n";
288 		printGraphStats(cerr, g);
289 	}
290 
291 	// Output the paths.
292 	ofstream fout(opt::out.c_str());
293 	ostream& out = opt::out.empty() || opt::out == "-" ? cout : fout;
294 	assert_good(out, opt::out);
295 	g_contigNames.unlock();
296 	for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it)
297 		out << createContigName() << '\t' << *it << '\n';
298 	assert_good(out, opt::out);
299 
300 	// Create the new vertices.
301 	for (vector<ContigPath>::const_iterator it = paths.begin(); it != paths.end(); ++it) {
302 		const ContigPath& path = *it;
303 		merge(g, path.begin(), path.end());
304 		remove_vertex_if(
305 		    g, path.begin(), path.end(), [](const ContigNode& c) { return !c.ambiguous(); });
306 	}
307 	if (opt::verbose > 0)
308 		printGraphStats(cerr, g);
309 
310 	// Output the graph.
311 	if (!opt::graphPath.empty()) {
312 		ofstream out(opt::graphPath.c_str());
313 		assert_good(out, opt::graphPath);
314 		write_dot(out, g);
315 		assert_good(out, opt::graphPath);
316 	}
317 
318 	// Print assembly contiguity statistics.
319 	if (opt::verbose > 0) {
320 		Histogram h = buildLengthHistogram(g);
321 		const unsigned STATS_MIN_LENGTH = 200; // bp
322 		printContiguityStats(cerr, h, STATS_MIN_LENGTH) << '\n';
323 	}
324 
325 	return 0;
326 }
327