1 #include "Common/Options.h"
2 #include "Common/Sequence.h"
3 #include "DataLayer/Options.h"
4 #include "ContigNode.h"
5 #include "ContigProperties.h"
6 #include "FastaReader.h"
7 #include "Iterator.h"
8 #include "Kmer.h"
9 #include "StringUtil.h"
10 #include "SuffixArray.h"
11 #include "Uncompress.h"
12 #include "UnorderedMap.h"
13 #include "Graph/ContigGraph.h"
14 #include "Graph/DirectedGraph.h"
15 #include "Graph/GraphIO.h"
16 #include "Graph/GraphUtil.h"
17 #include <cassert>
18 #include <cctype>
19 #include <cstdlib>
20 #include <getopt.h>
21 #include <iostream>
22 #include <set>
23 #include <vector>
24 
25 #include "DataBase/Options.h"
26 #include "DataBase/DB.h"
27 
28 #if PAIRED_DBG
29 #include "PairedDBG/KmerPair.h"
30 #endif
31 
32 using namespace std;
33 
34 #define PROGRAM "AdjList"
35 
36 DB db;
37 static const char VERSION_MESSAGE[] =
38 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
39 "Written by Shaun Jackman.\n"
40 "\n"
41 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
42 
43 static const char USAGE_MESSAGE[] =
44 "Usage: " PROGRAM " -k<kmer> [OPTION]... [FILE]...\n"
45 "Find overlaps of [m,k) bases. Contigs may be read from FILE(s)\n"
46 "or standard input. Output is written to standard output.\n"
47 "Overlaps of exactly k-1 bases are found using a hash table.\n"
48 "Overlaps of fewer than k-1 bases are found using a suffix array.\n"
49 "\n"
50 " Options:\n"
51 "\n"
52 "  -k, --kmer=N          the length of a k-mer (when -K is not set)\n"
53 "                        or the span of a k-mer pair (when -K is set)\n"
54 "  -K, --single-kmer=N   the length of a single k-mer in a k-mer pair\n"
55 "  -m, --min-overlap=M   require a minimum overlap of M bases [50]\n"
56 "      --adj             output the graph in ADJ format [default]\n"
57 "      --asqg            output the graph in ASQG format\n"
58 "      --dot             output the graph in GraphViz format\n"
59 "      --gfa             output the graph in GFA1 format\n"
60 "      --gfa1            output the graph in GFA1 format\n"
61 "      --gfa2            output the graph in GFA2 format\n"
62 "      --gv              output the graph in GraphViz format\n"
63 "      --sam             output the graph in SAM format\n"
64 "      --SS              expect contigs to be oriented correctly\n"
65 "      --no-SS           no assumption about contig orientation\n"
66 "  -v, --verbose         display verbose output\n"
67 "      --help            display this help and exit\n"
68 "      --version         output version information and exit\n"
69 "      --db=FILE         specify path of database repository in FILE\n"
70 "      --library=NAME    specify library NAME for database\n"
71 "      --strain=NAME     specify strain NAME for database\n"
72 "      --species=NAME    specify species NAME for database\n"
73 "\n"
74 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
75 
76 namespace opt {
77 	string db;
78 	dbVars metaVars;
79 	unsigned k; // used by GraphIO
80 
81 	/** Length of a single-kmer in a kmer pair */
82 	unsigned singleKmerSize = 0;
83 
84 	int format; // used by GraphIO
85 
86 	/** Run a strand-specific RNA-Seq assembly. */
87 	static int ss;
88 
89 	/** The minimum required amount of overlap. */
90 	static unsigned minOverlap = 50;
91 }
92 
93 static const char shortopts[] = "k:K:m:v";
94 
95 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
96 //enum { OPT_HELP = 1, OPT_VERSION };
97 
98 static const struct option longopts[] = {
99 	{ "kmer",    required_argument, NULL, 'k' },
100 	{ "single-kmer", required_argument, NULL, 'K' },
101 	{ "min-overlap", required_argument, NULL, 'm' },
102 	{ "adj",     no_argument,       &opt::format, ADJ },
103 	{ "asqg",    no_argument,       &opt::format, ASQG },
104 	{ "dot",     no_argument,       &opt::format, DOT },
105 	{ "gfa",     no_argument,       &opt::format, GFA1 },
106 	{ "gfa1",    no_argument,       &opt::format, GFA1 },
107 	{ "gfa2",    no_argument,       &opt::format, GFA2 },
108 	{ "gv",      no_argument,       &opt::format, DOT },
109 	{ "sam",     no_argument,       &opt::format, SAM },
110 	{ "SS",      no_argument,       &opt::ss, 1 },
111 	{ "no-SS",   no_argument,       &opt::ss, 0 },
112 	{ "verbose", no_argument,       NULL, 'v' },
113 	{ "help",    no_argument,       NULL, OPT_HELP },
114 	{ "version", no_argument,       NULL, OPT_VERSION },
115 	{ "db",      required_argument, NULL, OPT_DB },
116 	{ "library", required_argument, NULL, OPT_LIBRARY },
117 	{ "strain",  required_argument, NULL, OPT_STRAIN },
118 	{ "species", required_argument, NULL, OPT_SPECIES },
119 	{ NULL, 0, NULL, 0 }
120 };
121 
122 /** A contig adjacency graph. */
123 typedef DirectedGraph<ContigProperties, Distance> DG;
124 typedef ContigGraph<DG> Graph;
125 
126 /** Parse and return the coverage from the specified FASTA comment. */
getCoverage(const string & comment)127 static unsigned getCoverage(const string& comment)
128 {
129 	istringstream ss(comment);
130 	unsigned length, coverage = 0;
131 	ss >> length >> coverage;
132 	return coverage;
133 }
134 
135 /** Add the overlaps of vseq to the graph. */
addOverlapsSA(Graph & g,const SuffixArray & sa,ContigNode v,const string & vseq)136 static void addOverlapsSA(Graph& g, const SuffixArray& sa,
137 		ContigNode v, const string& vseq)
138 {
139 	assert(!vseq.empty());
140 	set<ContigNode> seen;
141 	typedef SuffixArray::const_iterator It;
142 	for (string q(vseq, 0, vseq.size() - 1);
143 			q.size() >= opt::minOverlap; chop(q)) {
144 		pair<It, It> range = sa.equal_range(q);
145 		for (It it = range.first; it != range.second; ++it) {
146 			ContigNode u(it->second);
147 			if (opt::ss && u.sense() != v.sense())
148 				continue;
149 			if (seen.insert(u).second) {
150 				// Add the longest overlap between two vertices.
151 				unsigned overlap = it->first.size();
152 				add_edge(u, v, -overlap, static_cast<DG&>(g));
153 			}
154 		}
155 	}
156 }
157 
158 /** Add overlaps of fewer than k-1 bp to the graph. */
addOverlapsSA(Graph & g,const vector<Kmer> & prefixes)159 static void addOverlapsSA(Graph& g, const vector<Kmer>& prefixes)
160 {
161 	// Construct a suffix array of the blunt contigs.
162 	typedef pair<string, ContigNode> Suffix;
163 	typedef vector<Suffix> Suffixes;
164 	Suffixes suffixes;
165 
166 	typedef graph_traits<Graph>::vertex_descriptor V;
167 	typedef graph_traits<Graph>::vertex_iterator Vit;
168 	pair<Vit, Vit> vertices = g.vertices();
169 	for (Vit it = vertices.first; it != vertices.second; ++it) {
170 		ContigNode u(*it);
171 		if (out_degree(u, g) > 0)
172 			continue;
173 		size_t uci = get(vertex_index, g,
174 				get(vertex_complement, g, u));
175 		assert(uci < prefixes.size());
176 		string suffix(reverseComplement(prefixes[uci]).str());
177 		suffixes.push_back(Suffix(suffix, u));
178 	}
179 
180 	SuffixArray sa(opt::minOverlap);
181 	for (Suffixes::const_iterator it = suffixes.begin();
182 			it != suffixes.end(); ++it)
183 		sa.insert(*it);
184 	sa.construct();
185 
186 	for (Suffixes::const_iterator it = suffixes.begin();
187 			it != suffixes.end(); ++it) {
188 		V uc = get(vertex_complement, g, it->second);
189 		addOverlapsSA(g, sa, uc, reverseComplement(it->first));
190 	}
191 }
192 
193 /** Read contigs. Add contig properties to the graph. Add prefixes to
194  * the collection and add suffixes to their index.
195  */
196 template <class KmerType>
readContigs(const string & path,Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap)197 static void readContigs(const string& path,
198 		Graph& g, vector<KmerType>& prefixes,
199 		unordered_map<KmerType, vector<ContigNode> >& suffixMap)
200 {
201 	if (opt::verbose > 0)
202 		cerr << "Reading `" << path << "'...\n";
203 
204 	unsigned count = 0;
205 	FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
206 	for (FastaRecord rec; in >> rec;) {
207 		Sequence& seq = rec.seq;
208 		if (count++ == 0) {
209 			// Detect colour-space contigs.
210 			opt::colourSpace = isdigit(seq[0]);
211 		} else {
212 			if (opt::colourSpace)
213 				assert(isdigit(seq[0]));
214 			else
215 				assert(isalpha(seq[0]));
216 		}
217 
218 		flattenAmbiguityCodes(seq);
219 
220 		// Add the prefix to the collection.
221 		unsigned overlap = opt::k - 1;
222 		assert(seq.length() > overlap);
223 		KmerType prefix(seq.substr(0, overlap));
224 		KmerType suffix(seq.substr(seq.length() - overlap));
225 		prefixes.push_back(prefix);
226 		prefixes.push_back(reverseComplement(suffix));
227 
228 		// Add the suffix to the index.
229 		ContigProperties vp(seq.length(), getCoverage(rec.comment));
230 		ContigNode u = add_vertex(vp, g);
231 		put(vertex_name, g, u, rec.id);
232 		suffixMap[suffix].push_back(u);
233 		suffixMap[reverseComplement(prefix)].push_back(
234 				get(vertex_complement, g, u));
235 	}
236 	assert(in.eof());
237 }
238 
239 /** Build contig overlap graph. */
240 template <class KmerType>
buildOverlapGraph(Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap)241 static void buildOverlapGraph(Graph& g, vector<KmerType>& prefixes,
242 	unordered_map<KmerType, vector<ContigNode> >& suffixMap)
243 {
244 	// Add the overlap edges of exactly k-1 bp.
245 
246 	typedef graph_traits<Graph>::vertex_descriptor V;
247 	typedef unordered_map<KmerType, vector<ContigNode> > SuffixMap;
248 	typedef typename vector<KmerType>::const_iterator PrefixIterator;
249 	typedef const typename SuffixMap::mapped_type Edges;
250 	typedef typename SuffixMap::mapped_type::const_iterator SuffixIterator;
251 
252 	if (opt::verbose > 0)
253 		cerr << "Finding overlaps of exactly k-1 bp...\n";
254 	for (PrefixIterator it = prefixes.begin(); it != prefixes.end(); ++it) {
255 		ContigNode v(it - prefixes.begin());
256 		Edges& edges = suffixMap[*it];
257 		for (SuffixIterator itu = edges.begin(); itu != edges.end(); ++itu) {
258 			V uc = get(vertex_complement, g, *itu);
259 			V vc = get(vertex_complement, g, v);
260 			if (opt::ss && uc.sense() != vc.sense())
261 				continue;
262 			add_edge(vc, uc, -(int)opt::k + 1, static_cast<DG&>(g));
263 		}
264 	}
265 	SuffixMap().swap(suffixMap);
266 
267 	if (opt::verbose > 0)
268 		printGraphStats(cerr, g);
269 }
270 
271 template <class KmerType>
loadDataStructures(Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap,int argc,char ** argv)272 void loadDataStructures(Graph& g, vector<KmerType>& prefixes,
273 	unordered_map<KmerType, vector<ContigNode> >& suffixMap,
274 	int argc, char** argv)
275 {
276 	if (optind < argc) {
277 		for (; optind < argc; optind++)
278 			readContigs(argv[optind], g, prefixes, suffixMap);
279 	} else
280 		readContigs("-", g, prefixes, suffixMap);
281 	g_contigNames.lock();
282 }
283 
284 /** Build contig overlap graph for standard de Bruijn graph */
buildOverlapGraph(Graph & g,int argc,char ** argv)285 void buildOverlapGraph(Graph& g, int argc, char** argv)
286 {
287 	Kmer::setLength(opt::k - 1);
288 
289 	vector<Kmer> prefixes;
290 	unordered_map<Kmer, vector<ContigNode> >
291 		suffixMap(prefixes.size());
292 
293 	loadDataStructures(g, prefixes, suffixMap, argc, argv);
294 	buildOverlapGraph(g, prefixes, suffixMap);
295 
296 	if (opt::minOverlap < opt::k - 1) {
297 		// Add the overlap edges of fewer than k-1 bp.
298 		if (opt::verbose > 0)
299 			cerr << "Finding overlaps of fewer than k-1 bp...\n";
300 		addOverlapsSA(g, prefixes);
301 		if (opt::verbose > 0)
302 			printGraphStats(cerr, g);
303 	}
304 }
305 
306 #if PAIRED_DBG
307 /** Build contig overlap graph for paired de Bruijn graph */
buildPairedOverlapGraph(Graph & g,int argc,char ** argv)308 void buildPairedOverlapGraph(Graph& g, int argc, char** argv)
309 {
310 	Kmer::setLength(opt::singleKmerSize - 1);
311 	KmerPair::setLength(opt::k - 1);
312 
313 	vector<KmerPair> prefixes;
314 	unordered_map<KmerPair, vector<ContigNode> >
315 		suffixMap(prefixes.size());
316 
317 	loadDataStructures(g, prefixes, suffixMap, argc, argv);
318 	buildOverlapGraph(g, prefixes, suffixMap);
319 }
320 #endif
321 
main(int argc,char ** argv)322 int main(int argc, char** argv)
323 {
324 	string commandLine;
325 	{
326 		ostringstream ss;
327 		char** last = argv + argc - 1;
328 		copy(argv, last, ostream_iterator<const char *>(ss, " "));
329 		ss << *last;
330 		commandLine = ss.str();
331 	}
332 
333 	if (!opt::db.empty())
334 		opt::metaVars.resize(3);
335 
336 	bool die = false;
337 	for (int c; (c = getopt_long(argc, argv,
338 					shortopts, longopts, NULL)) != -1;) {
339 		istringstream arg(optarg != NULL ? optarg : "");
340 		switch (c) {
341 			case '?': die = true; break;
342 			case 'k': arg >> opt::k; break;
343 			case 'K': arg >> opt::singleKmerSize; break;
344 			case 'm': arg >> opt::minOverlap; break;
345 			case 'v': opt::verbose++; break;
346 			case OPT_HELP:
347 				cout << USAGE_MESSAGE;
348 				exit(EXIT_SUCCESS);
349 			case OPT_VERSION:
350 				cout << VERSION_MESSAGE;
351 				exit(EXIT_SUCCESS);
352 			case OPT_DB:
353 				arg >> opt::db; break;
354 			case OPT_LIBRARY:
355 				arg >> opt::metaVars[0]; break;
356 			case OPT_STRAIN:
357 				arg >> opt::metaVars[1]; break;
358 			case OPT_SPECIES:
359 				arg >> opt::metaVars[2]; break;
360 		}
361 		if (optarg != NULL && !arg.eof()) {
362 			cerr << PROGRAM ": invalid option: `-"
363 				<< (char)c << optarg << "'\n";
364 			exit(EXIT_FAILURE);
365 		}
366 	}
367 
368 	if (opt::k <= 0) {
369 		cerr << PROGRAM ": " << "missing -k,--kmer option\n";
370 		die = true;
371 	}
372 
373 	if (die) {
374 		cerr << "Try `" << PROGRAM
375 			<< " --help' for more information.\n";
376 		exit(EXIT_FAILURE);
377 	}
378 
379 	if (opt::minOverlap == 0)
380 		opt::minOverlap = opt::k - 1;
381 	opt::minOverlap = min(opt::minOverlap, opt::k - 1);
382 
383 	if (!opt::db.empty())
384 		init (db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
385 	opt::trimMasked = false;
386 
387 	// contig overlap graph
388 	Graph g;
389 
390 #if PAIRED_DBG
391 	if (opt::singleKmerSize > 0)
392 		buildPairedOverlapGraph(g, argc, argv);
393 	else
394 		buildOverlapGraph(g, argc, argv);
395 #else
396 	buildOverlapGraph(g, argc, argv);
397 #endif
398 
399 	// Output the graph.
400 	write_graph(cout, g, PROGRAM, commandLine);
401 	assert(cout.good());
402 	vector<int> vals = make_vector<int>()
403 		<< opt::ss
404 		<< opt::k;
405 	vector<int> new_vals = passGraphStatsVal(g);
406 	vals.insert(vals.end(), new_vals.begin(), new_vals.end());
407 
408 	vector<string> keys = make_vector<string>()
409 		<< "SS"
410 		<< "K"
411 		<< "V"
412 		<< "E"
413 		<< "degree0pctg"
414 		<< "degree1pctg"
415 		<< "degree234pctg"
416 		<< "degree5pctg"
417 		<< "degree_max";
418 	if (!opt::db.empty()) {
419 		for (unsigned i=0; i<vals.size(); i++)
420 			addToDb(db, keys[i], vals[i]);
421 	}
422 
423 	return 0;
424 }
425