1 #include "Common/Options.h"
2 #include "Common/Sequence.h"
3 #include "DataLayer/Options.h"
4 #include "ContigNode.h"
5 #include "ContigProperties.h"
6 #include "FastaReader.h"
7 #include "Iterator.h"
8 #include "Kmer.h"
9 #include "StringUtil.h"
10 #include "SuffixArray.h"
11 #include "Uncompress.h"
12 #include "UnorderedMap.h"
13 #include "Graph/ContigGraph.h"
14 #include "Graph/DirectedGraph.h"
15 #include "Graph/GraphIO.h"
16 #include "Graph/GraphUtil.h"
17 #include <cassert>
18 #include <cctype>
19 #include <cstdlib>
20 #include <getopt.h>
21 #include <iostream>
22 #include <set>
23 #include <vector>
24
25 #include "DataBase/Options.h"
26 #include "DataBase/DB.h"
27
28 #if PAIRED_DBG
29 #include "PairedDBG/KmerPair.h"
30 #endif
31
32 using namespace std;
33
34 #define PROGRAM "AdjList"
35
36 DB db;
37 static const char VERSION_MESSAGE[] =
38 PROGRAM " (" PACKAGE_NAME ") " VERSION "\n"
39 "Written by Shaun Jackman.\n"
40 "\n"
41 "Copyright 2014 Canada's Michael Smith Genome Sciences Centre\n";
42
43 static const char USAGE_MESSAGE[] =
44 "Usage: " PROGRAM " -k<kmer> [OPTION]... [FILE]...\n"
45 "Find overlaps of [m,k) bases. Contigs may be read from FILE(s)\n"
46 "or standard input. Output is written to standard output.\n"
47 "Overlaps of exactly k-1 bases are found using a hash table.\n"
48 "Overlaps of fewer than k-1 bases are found using a suffix array.\n"
49 "\n"
50 " Options:\n"
51 "\n"
52 " -k, --kmer=N the length of a k-mer (when -K is not set)\n"
53 " or the span of a k-mer pair (when -K is set)\n"
54 " -K, --single-kmer=N the length of a single k-mer in a k-mer pair\n"
55 " -m, --min-overlap=M require a minimum overlap of M bases [50]\n"
56 " --adj output the graph in ADJ format [default]\n"
57 " --asqg output the graph in ASQG format\n"
58 " --dot output the graph in GraphViz format\n"
59 " --gfa output the graph in GFA1 format\n"
60 " --gfa1 output the graph in GFA1 format\n"
61 " --gfa2 output the graph in GFA2 format\n"
62 " --gv output the graph in GraphViz format\n"
63 " --sam output the graph in SAM format\n"
64 " --SS expect contigs to be oriented correctly\n"
65 " --no-SS no assumption about contig orientation\n"
66 " -v, --verbose display verbose output\n"
67 " --help display this help and exit\n"
68 " --version output version information and exit\n"
69 " --db=FILE specify path of database repository in FILE\n"
70 " --library=NAME specify library NAME for database\n"
71 " --strain=NAME specify strain NAME for database\n"
72 " --species=NAME specify species NAME for database\n"
73 "\n"
74 "Report bugs to <" PACKAGE_BUGREPORT ">.\n";
75
76 namespace opt {
77 string db;
78 dbVars metaVars;
79 unsigned k; // used by GraphIO
80
81 /** Length of a single-kmer in a kmer pair */
82 unsigned singleKmerSize = 0;
83
84 int format; // used by GraphIO
85
86 /** Run a strand-specific RNA-Seq assembly. */
87 static int ss;
88
89 /** The minimum required amount of overlap. */
90 static unsigned minOverlap = 50;
91 }
92
93 static const char shortopts[] = "k:K:m:v";
94
95 enum { OPT_HELP = 1, OPT_VERSION, OPT_DB, OPT_LIBRARY, OPT_STRAIN, OPT_SPECIES };
96 //enum { OPT_HELP = 1, OPT_VERSION };
97
98 static const struct option longopts[] = {
99 { "kmer", required_argument, NULL, 'k' },
100 { "single-kmer", required_argument, NULL, 'K' },
101 { "min-overlap", required_argument, NULL, 'm' },
102 { "adj", no_argument, &opt::format, ADJ },
103 { "asqg", no_argument, &opt::format, ASQG },
104 { "dot", no_argument, &opt::format, DOT },
105 { "gfa", no_argument, &opt::format, GFA1 },
106 { "gfa1", no_argument, &opt::format, GFA1 },
107 { "gfa2", no_argument, &opt::format, GFA2 },
108 { "gv", no_argument, &opt::format, DOT },
109 { "sam", no_argument, &opt::format, SAM },
110 { "SS", no_argument, &opt::ss, 1 },
111 { "no-SS", no_argument, &opt::ss, 0 },
112 { "verbose", no_argument, NULL, 'v' },
113 { "help", no_argument, NULL, OPT_HELP },
114 { "version", no_argument, NULL, OPT_VERSION },
115 { "db", required_argument, NULL, OPT_DB },
116 { "library", required_argument, NULL, OPT_LIBRARY },
117 { "strain", required_argument, NULL, OPT_STRAIN },
118 { "species", required_argument, NULL, OPT_SPECIES },
119 { NULL, 0, NULL, 0 }
120 };
121
122 /** A contig adjacency graph. */
123 typedef DirectedGraph<ContigProperties, Distance> DG;
124 typedef ContigGraph<DG> Graph;
125
126 /** Parse and return the coverage from the specified FASTA comment. */
getCoverage(const string & comment)127 static unsigned getCoverage(const string& comment)
128 {
129 istringstream ss(comment);
130 unsigned length, coverage = 0;
131 ss >> length >> coverage;
132 return coverage;
133 }
134
135 /** Add the overlaps of vseq to the graph. */
addOverlapsSA(Graph & g,const SuffixArray & sa,ContigNode v,const string & vseq)136 static void addOverlapsSA(Graph& g, const SuffixArray& sa,
137 ContigNode v, const string& vseq)
138 {
139 assert(!vseq.empty());
140 set<ContigNode> seen;
141 typedef SuffixArray::const_iterator It;
142 for (string q(vseq, 0, vseq.size() - 1);
143 q.size() >= opt::minOverlap; chop(q)) {
144 pair<It, It> range = sa.equal_range(q);
145 for (It it = range.first; it != range.second; ++it) {
146 ContigNode u(it->second);
147 if (opt::ss && u.sense() != v.sense())
148 continue;
149 if (seen.insert(u).second) {
150 // Add the longest overlap between two vertices.
151 unsigned overlap = it->first.size();
152 add_edge(u, v, -overlap, static_cast<DG&>(g));
153 }
154 }
155 }
156 }
157
158 /** Add overlaps of fewer than k-1 bp to the graph. */
addOverlapsSA(Graph & g,const vector<Kmer> & prefixes)159 static void addOverlapsSA(Graph& g, const vector<Kmer>& prefixes)
160 {
161 // Construct a suffix array of the blunt contigs.
162 typedef pair<string, ContigNode> Suffix;
163 typedef vector<Suffix> Suffixes;
164 Suffixes suffixes;
165
166 typedef graph_traits<Graph>::vertex_descriptor V;
167 typedef graph_traits<Graph>::vertex_iterator Vit;
168 pair<Vit, Vit> vertices = g.vertices();
169 for (Vit it = vertices.first; it != vertices.second; ++it) {
170 ContigNode u(*it);
171 if (out_degree(u, g) > 0)
172 continue;
173 size_t uci = get(vertex_index, g,
174 get(vertex_complement, g, u));
175 assert(uci < prefixes.size());
176 string suffix(reverseComplement(prefixes[uci]).str());
177 suffixes.push_back(Suffix(suffix, u));
178 }
179
180 SuffixArray sa(opt::minOverlap);
181 for (Suffixes::const_iterator it = suffixes.begin();
182 it != suffixes.end(); ++it)
183 sa.insert(*it);
184 sa.construct();
185
186 for (Suffixes::const_iterator it = suffixes.begin();
187 it != suffixes.end(); ++it) {
188 V uc = get(vertex_complement, g, it->second);
189 addOverlapsSA(g, sa, uc, reverseComplement(it->first));
190 }
191 }
192
193 /** Read contigs. Add contig properties to the graph. Add prefixes to
194 * the collection and add suffixes to their index.
195 */
196 template <class KmerType>
readContigs(const string & path,Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap)197 static void readContigs(const string& path,
198 Graph& g, vector<KmerType>& prefixes,
199 unordered_map<KmerType, vector<ContigNode> >& suffixMap)
200 {
201 if (opt::verbose > 0)
202 cerr << "Reading `" << path << "'...\n";
203
204 unsigned count = 0;
205 FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
206 for (FastaRecord rec; in >> rec;) {
207 Sequence& seq = rec.seq;
208 if (count++ == 0) {
209 // Detect colour-space contigs.
210 opt::colourSpace = isdigit(seq[0]);
211 } else {
212 if (opt::colourSpace)
213 assert(isdigit(seq[0]));
214 else
215 assert(isalpha(seq[0]));
216 }
217
218 flattenAmbiguityCodes(seq);
219
220 // Add the prefix to the collection.
221 unsigned overlap = opt::k - 1;
222 assert(seq.length() > overlap);
223 KmerType prefix(seq.substr(0, overlap));
224 KmerType suffix(seq.substr(seq.length() - overlap));
225 prefixes.push_back(prefix);
226 prefixes.push_back(reverseComplement(suffix));
227
228 // Add the suffix to the index.
229 ContigProperties vp(seq.length(), getCoverage(rec.comment));
230 ContigNode u = add_vertex(vp, g);
231 put(vertex_name, g, u, rec.id);
232 suffixMap[suffix].push_back(u);
233 suffixMap[reverseComplement(prefix)].push_back(
234 get(vertex_complement, g, u));
235 }
236 assert(in.eof());
237 }
238
239 /** Build contig overlap graph. */
240 template <class KmerType>
buildOverlapGraph(Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap)241 static void buildOverlapGraph(Graph& g, vector<KmerType>& prefixes,
242 unordered_map<KmerType, vector<ContigNode> >& suffixMap)
243 {
244 // Add the overlap edges of exactly k-1 bp.
245
246 typedef graph_traits<Graph>::vertex_descriptor V;
247 typedef unordered_map<KmerType, vector<ContigNode> > SuffixMap;
248 typedef typename vector<KmerType>::const_iterator PrefixIterator;
249 typedef const typename SuffixMap::mapped_type Edges;
250 typedef typename SuffixMap::mapped_type::const_iterator SuffixIterator;
251
252 if (opt::verbose > 0)
253 cerr << "Finding overlaps of exactly k-1 bp...\n";
254 for (PrefixIterator it = prefixes.begin(); it != prefixes.end(); ++it) {
255 ContigNode v(it - prefixes.begin());
256 Edges& edges = suffixMap[*it];
257 for (SuffixIterator itu = edges.begin(); itu != edges.end(); ++itu) {
258 V uc = get(vertex_complement, g, *itu);
259 V vc = get(vertex_complement, g, v);
260 if (opt::ss && uc.sense() != vc.sense())
261 continue;
262 add_edge(vc, uc, -(int)opt::k + 1, static_cast<DG&>(g));
263 }
264 }
265 SuffixMap().swap(suffixMap);
266
267 if (opt::verbose > 0)
268 printGraphStats(cerr, g);
269 }
270
271 template <class KmerType>
loadDataStructures(Graph & g,vector<KmerType> & prefixes,unordered_map<KmerType,vector<ContigNode>> & suffixMap,int argc,char ** argv)272 void loadDataStructures(Graph& g, vector<KmerType>& prefixes,
273 unordered_map<KmerType, vector<ContigNode> >& suffixMap,
274 int argc, char** argv)
275 {
276 if (optind < argc) {
277 for (; optind < argc; optind++)
278 readContigs(argv[optind], g, prefixes, suffixMap);
279 } else
280 readContigs("-", g, prefixes, suffixMap);
281 g_contigNames.lock();
282 }
283
284 /** Build contig overlap graph for standard de Bruijn graph */
buildOverlapGraph(Graph & g,int argc,char ** argv)285 void buildOverlapGraph(Graph& g, int argc, char** argv)
286 {
287 Kmer::setLength(opt::k - 1);
288
289 vector<Kmer> prefixes;
290 unordered_map<Kmer, vector<ContigNode> >
291 suffixMap(prefixes.size());
292
293 loadDataStructures(g, prefixes, suffixMap, argc, argv);
294 buildOverlapGraph(g, prefixes, suffixMap);
295
296 if (opt::minOverlap < opt::k - 1) {
297 // Add the overlap edges of fewer than k-1 bp.
298 if (opt::verbose > 0)
299 cerr << "Finding overlaps of fewer than k-1 bp...\n";
300 addOverlapsSA(g, prefixes);
301 if (opt::verbose > 0)
302 printGraphStats(cerr, g);
303 }
304 }
305
306 #if PAIRED_DBG
307 /** Build contig overlap graph for paired de Bruijn graph */
buildPairedOverlapGraph(Graph & g,int argc,char ** argv)308 void buildPairedOverlapGraph(Graph& g, int argc, char** argv)
309 {
310 Kmer::setLength(opt::singleKmerSize - 1);
311 KmerPair::setLength(opt::k - 1);
312
313 vector<KmerPair> prefixes;
314 unordered_map<KmerPair, vector<ContigNode> >
315 suffixMap(prefixes.size());
316
317 loadDataStructures(g, prefixes, suffixMap, argc, argv);
318 buildOverlapGraph(g, prefixes, suffixMap);
319 }
320 #endif
321
main(int argc,char ** argv)322 int main(int argc, char** argv)
323 {
324 string commandLine;
325 {
326 ostringstream ss;
327 char** last = argv + argc - 1;
328 copy(argv, last, ostream_iterator<const char *>(ss, " "));
329 ss << *last;
330 commandLine = ss.str();
331 }
332
333 if (!opt::db.empty())
334 opt::metaVars.resize(3);
335
336 bool die = false;
337 for (int c; (c = getopt_long(argc, argv,
338 shortopts, longopts, NULL)) != -1;) {
339 istringstream arg(optarg != NULL ? optarg : "");
340 switch (c) {
341 case '?': die = true; break;
342 case 'k': arg >> opt::k; break;
343 case 'K': arg >> opt::singleKmerSize; break;
344 case 'm': arg >> opt::minOverlap; break;
345 case 'v': opt::verbose++; break;
346 case OPT_HELP:
347 cout << USAGE_MESSAGE;
348 exit(EXIT_SUCCESS);
349 case OPT_VERSION:
350 cout << VERSION_MESSAGE;
351 exit(EXIT_SUCCESS);
352 case OPT_DB:
353 arg >> opt::db; break;
354 case OPT_LIBRARY:
355 arg >> opt::metaVars[0]; break;
356 case OPT_STRAIN:
357 arg >> opt::metaVars[1]; break;
358 case OPT_SPECIES:
359 arg >> opt::metaVars[2]; break;
360 }
361 if (optarg != NULL && !arg.eof()) {
362 cerr << PROGRAM ": invalid option: `-"
363 << (char)c << optarg << "'\n";
364 exit(EXIT_FAILURE);
365 }
366 }
367
368 if (opt::k <= 0) {
369 cerr << PROGRAM ": " << "missing -k,--kmer option\n";
370 die = true;
371 }
372
373 if (die) {
374 cerr << "Try `" << PROGRAM
375 << " --help' for more information.\n";
376 exit(EXIT_FAILURE);
377 }
378
379 if (opt::minOverlap == 0)
380 opt::minOverlap = opt::k - 1;
381 opt::minOverlap = min(opt::minOverlap, opt::k - 1);
382
383 if (!opt::db.empty())
384 init (db, opt::db, opt::verbose, PROGRAM, opt::getCommand(argc, argv), opt::metaVars);
385 opt::trimMasked = false;
386
387 // contig overlap graph
388 Graph g;
389
390 #if PAIRED_DBG
391 if (opt::singleKmerSize > 0)
392 buildPairedOverlapGraph(g, argc, argv);
393 else
394 buildOverlapGraph(g, argc, argv);
395 #else
396 buildOverlapGraph(g, argc, argv);
397 #endif
398
399 // Output the graph.
400 write_graph(cout, g, PROGRAM, commandLine);
401 assert(cout.good());
402 vector<int> vals = make_vector<int>()
403 << opt::ss
404 << opt::k;
405 vector<int> new_vals = passGraphStatsVal(g);
406 vals.insert(vals.end(), new_vals.begin(), new_vals.end());
407
408 vector<string> keys = make_vector<string>()
409 << "SS"
410 << "K"
411 << "V"
412 << "E"
413 << "degree0pctg"
414 << "degree1pctg"
415 << "degree234pctg"
416 << "degree5pctg"
417 << "degree_max";
418 if (!opt::db.empty()) {
419 for (unsigned i=0; i<vals.size(); i++)
420 addToDb(db, keys[i], vals[i]);
421 }
422
423 return 0;
424 }
425