1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifdef BOWTIE_INSPECT_MAIN
21
22 #include <string>
23 #include <iostream>
24 #include <getopt.h>
25 #include <stdexcept>
26
27 #include "assert_helpers.h"
28 #include "endian_swap.h"
29 #include "bt2_idx.h"
30 #include "reference.h"
31 #include "ds.h"
32
33 using namespace std;
34
35 static bool showVersion = false; // just print version and quit?
36 int verbose = 0; // be talkative
37 static int names_only = 0; // just print the sequence names in the index
38 static int summarize_only = 0; // just print summary of index and quit
39 static int across = 60; // number of characters across in FASTA output
40 static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
41 static string wrapper;
42 static const char *short_options = "vhnsea:";
43
44 enum {
45 ARG_VERSION = 256,
46 ARG_WRAPPER,
47 ARG_USAGE,
48 };
49
50 static struct option long_options[] = {
51 {(char*)"verbose", no_argument, 0, 'v'},
52 {(char*)"version", no_argument, 0, ARG_VERSION},
53 {(char*)"usage", no_argument, 0, ARG_USAGE},
54 {(char*)"names", no_argument, 0, 'n'},
55 {(char*)"summary", no_argument, 0, 's'},
56 {(char*)"help", no_argument, 0, 'h'},
57 {(char*)"across", required_argument, 0, 'a'},
58 {(char*)"ebwt-ref", no_argument, 0, 'e'},
59 {(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
60 {(char*)0, 0, 0, 0} // terminator
61 };
62
63 /**
64 * Print a summary usage message to the provided output stream.
65 */
printUsage(ostream & out)66 static void printUsage(ostream& out) {
67 out << "Bowtie 2 version " << string(BOWTIE2_VERSION).c_str() << " by Ben Langmead (langmea@cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
68 out
69 << "Usage: bowtie2-inspect [options]* <bt2_base>" << endl
70 << " <bt2_base> bt2 filename minus trailing .1." + gEbwt_ext + "/.2." + gEbwt_ext << endl
71 << endl
72 << " By default, prints FASTA records of the indexed nucleotide sequences to" << endl
73 << " standard out. With -n, just prints names. With -s, just prints a summary of" << endl
74 << " the index parameters and sequences." << endl
75 << endl
76 << "Options:" << endl;
77 if(wrapper == "basic-0") {
78 out << " --large-index force inspection of the 'large' index, even if a" << endl
79 << " 'small' one is present." << endl
80 << " --debug use the debug binary; slower, assertions enabled" << endl
81 << " --sanitized use sanitized binary; slower, uses ASan and/or UBSan" << endl
82 << " --verbose log the issued command" << endl;
83 }
84 out << " -a/--across <int> Number of characters across in FASTA output (default: 60)" << endl
85 << " -n/--names Print reference sequence names only" << endl
86 << " -s/--summary Print summary incl. ref names, lengths, index properties" << endl
87 << " -v/--verbose Verbose output (for debugging)" << endl
88 << " -h/--help print detailed description of tool and its options" << endl
89 ;
90 if(wrapper.empty()) {
91 cerr << endl
92 << "*** Warning ***" << endl
93 << "'boowtie2-inspect' was run directly. It is recommended "
94 << "to use the wrapper script instead."
95 << endl << endl;
96 }
97 }
98
99 /**
100 * Parse an int out of optarg and enforce that it be at least 'lower';
101 * if it is less than 'lower', than output the given error message and
102 * exit with an error and a usage message.
103 */
parseInt(int lower,const char * errmsg)104 static int parseInt(int lower, const char *errmsg) {
105 long l;
106 char *endPtr= NULL;
107 l = strtol(optarg, &endPtr, 10);
108 if (endPtr != NULL) {
109 if (l < lower) {
110 cerr << errmsg << endl;
111 printUsage(cerr);
112 throw 1;
113 }
114 return (int32_t)l;
115 }
116 cerr << errmsg << endl;
117 printUsage(cerr);
118 throw 1;
119 return -1;
120 }
121
122 /**
123 * Read command-line arguments
124 */
parseOptions(int argc,char ** argv)125 static void parseOptions(int argc, char **argv) {
126 int option_index = 0;
127 int next_option;
128 do {
129 next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
130 switch (next_option) {
131 case ARG_WRAPPER:
132 wrapper = optarg;
133 break;
134 case ARG_USAGE:
135 case 'h':
136 printUsage(cout);
137 throw 0;
138 break;
139 case 'v': verbose = true; break;
140 case ARG_VERSION: showVersion = true; break;
141 case 'e': refFromEbwt = true; break;
142 case 'n': names_only = true; break;
143 case 's': summarize_only = true; break;
144 case 'a': across = parseInt(-1, "-a/--across arg must be at least 1"); break;
145 case -1: break; /* Done with options. */
146 case 0:
147 if (long_options[option_index].flag != 0)
148 break;
149 default:
150 printUsage(cerr);
151 throw 1;
152 }
153 } while(next_option != -1);
154 }
155
print_fasta_record(ostream & fout,const string & defline,const string & seq)156 static void print_fasta_record(
157 ostream& fout,
158 const string& defline,
159 const string& seq)
160 {
161 fout << ">";
162 fout << defline.c_str() << endl;
163
164 if(across > 0) {
165 size_t i = 0;
166 while (i + across < seq.length())
167 {
168 fout << seq.substr(i, across).c_str() << endl;
169 i += across;
170 }
171 if (i < seq.length())
172 fout << seq.substr(i).c_str() << endl;
173 } else {
174 fout << seq.c_str() << endl;
175 }
176 }
177
178 /**
179 * Given output stream, BitPairReference, reference index, name and
180 * length, print the whole nucleotide reference with the appropriate
181 * number of columns.
182 */
print_ref_sequence(ostream & fout,BitPairReference & ref,const string & name,size_t refi,size_t len)183 static void print_ref_sequence(
184 ostream& fout,
185 BitPairReference& ref,
186 const string& name,
187 size_t refi,
188 size_t len)
189 {
190 bool newlines = across > 0;
191 int myacross = across > 0 ? across : 60;
192 size_t incr = myacross * 1000;
193 uint32_t *buf = new uint32_t[(incr + 128)/4];
194 fout << ">" << name.c_str() << "\n";
195 ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
196 for(size_t i = 0; i < len; i += incr) {
197 size_t amt = min(incr, len-i);
198 assert_leq(amt, incr);
199 int off = ref.getStretch(buf, refi, i, amt ASSERT_ONLY(, destU32));
200 uint8_t *cb = ((uint8_t*)buf) + off;
201 for(size_t j = 0; j < amt; j++) {
202 if(newlines && j > 0 && (j % myacross) == 0) fout << "\n";
203 assert_range(0, 4, (int)cb[j]);
204 fout << "ACGTN"[(int)cb[j]];
205 }
206 fout << "\n";
207 }
208 delete [] buf;
209 }
210
211 /**
212 * Create a BitPairReference encapsulating the reference portion of the
213 * index at the given basename. Iterate through the reference
214 * sequences, sending each one to print_ref_sequence to print.
215 */
print_ref_sequences(ostream & fout,bool color,const EList<string> & refnames,const TIndexOffU * plen,const string & adjustedEbwtFileBase)216 static void print_ref_sequences(
217 ostream& fout,
218 bool color,
219 const EList<string>& refnames,
220 const TIndexOffU* plen,
221 const string& adjustedEbwtFileBase)
222 {
223 BitPairReference ref(
224 adjustedEbwtFileBase, // input basename
225 color, // true -> expect colorspace reference
226 false, // sanity-check reference
227 NULL, // infiles
228 NULL, // originals
229 false, // infiles are sequences
230 false, // memory-map
231 false, // use shared memory
232 false, // sweep mm-mapped ref
233 verbose, // be talkative
234 verbose); // be talkative at startup
235 assert_eq(ref.numRefs(), refnames.size());
236 for(size_t i = 0; i < ref.numRefs(); i++) {
237 print_ref_sequence(
238 fout,
239 ref,
240 refnames[i],
241 i,
242 plen[i] + (color ? 1 : 0));
243 }
244 }
245
246 /**
247 * Given an index, reconstruct the reference by LF mapping through the
248 * entire thing.
249 */
250 template<typename TStr>
print_index_sequences(ostream & fout,Ebwt & ebwt)251 static void print_index_sequences(ostream& fout, Ebwt& ebwt)
252 {
253 EList<string>* refnames = &(ebwt.refnames());
254
255 TStr cat_ref;
256 ebwt.restore(cat_ref);
257
258 TIndexOffU curr_ref = OFF_MASK;
259 string curr_ref_seq = "";
260 TIndexOffU curr_ref_len = OFF_MASK;
261 TIndexOffU last_text_off = 0;
262 size_t orig_len = cat_ref.length();
263 TIndexOffU tlen = OFF_MASK;
264 bool first = true;
265 for(size_t i = 0; i < orig_len; i++) {
266 TIndexOffU tidx = OFF_MASK;
267 TIndexOffU textoff = OFF_MASK;
268 tlen = OFF_MASK;
269 bool straddled = false;
270 ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);
271
272 if (tidx != OFF_MASK && textoff < tlen)
273 {
274 if (curr_ref != tidx)
275 {
276 if (curr_ref != OFF_MASK)
277 {
278 // Add trailing gaps, if any exist
279 if(curr_ref_seq.length() < curr_ref_len) {
280 curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
281 }
282 print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
283 }
284 curr_ref = tidx;
285 curr_ref_seq = "";
286 curr_ref_len = tlen;
287 last_text_off = 0;
288 first = true;
289 }
290
291 TIndexOffU textoff_adj = textoff;
292 if(first && textoff > 0) textoff_adj++;
293 if (textoff_adj - last_text_off > 1)
294 curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
295
296 curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]);
297 last_text_off = textoff;
298 first = false;
299 }
300 }
301 if (curr_ref < refnames->size())
302 {
303 // Add trailing gaps, if any exist
304 if(curr_ref_seq.length() < curr_ref_len) {
305 curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
306 }
307 print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
308 }
309
310 }
311
312 static char *argv0 = NULL;
313
print_index_sequence_names(const string & fname,ostream & fout)314 static void print_index_sequence_names(const string& fname, ostream& fout)
315 {
316 EList<string> p_refnames;
317 readEbwtRefnames(fname, p_refnames);
318 for(size_t i = 0; i < p_refnames.size(); i++) {
319 cout << p_refnames[i].c_str() << endl;
320 }
321 }
322
323 /**
324 * Print a short summary of what's in the index and its flags.
325 */
print_index_summary(const string & fname,ostream & fout)326 static void print_index_summary(
327 const string& fname,
328 ostream& fout)
329 {
330 int32_t flags = Ebwt::readFlags(fname);
331 int32_t flagsr = Ebwt::readFlags(fname + ".rev");
332 bool color = readEbwtColor(fname);
333 bool entireReverse = readEntireReverse(fname + ".rev");
334 Ebwt ebwt(
335 fname,
336 color, // index is colorspace
337 -1, // don't require entire reverse
338 true, // index is for the forward direction
339 -1, // offrate (-1 = index default)
340 0, // offrate-plus (0 = index default)
341 false, // use memory-mapped IO
342 false, // use shared memory
343 false, // sweep memory-mapped memory
344 true, // load names?
345 false, // load SA sample?
346 false, // load ftab?
347 false, // load rstarts?
348 verbose, // be talkative?
349 verbose, // be talkative at startup?
350 false, // pass up memory exceptions?
351 false); // sanity check?
352 EList<string> p_refnames;
353 readEbwtRefnames(fname, p_refnames);
354 cout << "Flags" << '\t' << (-flags) << endl;
355 cout << "Reverse flags" << '\t' << (-flagsr) << endl;
356 cout << "2.0-compatible" << '\t' << (entireReverse ? "1" : "0") << endl;
357 cout << "SA-Sample" << "\t1 in " << (1 << ebwt.eh().offRate()) << endl;
358 cout << "FTab-Chars" << '\t' << ebwt.eh().ftabChars() << endl;
359 assert_eq(ebwt.nPat(), p_refnames.size());
360 for(size_t i = 0; i < p_refnames.size(); i++) {
361 cout << "Sequence-" << (i+1)
362 << '\t' << p_refnames[i].c_str()
363 << '\t' << (ebwt.plen()[i] + (color ? 1 : 0))
364 << endl;
365 }
366 }
367
driver(const string & ebwtFileBase,const string & query)368 static void driver(
369 const string& ebwtFileBase,
370 const string& query)
371 {
372 // Adjust
373 string adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
374
375 if (names_only) {
376 print_index_sequence_names(adjustedEbwtFileBase, cout);
377 } else if(summarize_only) {
378 print_index_summary(adjustedEbwtFileBase, cout);
379 } else {
380 // Initialize Ebwt object
381 bool color = readEbwtColor(adjustedEbwtFileBase);
382 Ebwt ebwt(
383 adjustedEbwtFileBase,
384 color, // index is colorspace
385 -1, // don't care about entire-reverse
386 true, // index is for the forward direction
387 -1, // offrate (-1 = index default)
388 0, // offrate-plus (0 = index default)
389 false, // use memory-mapped IO
390 false, // use shared memory
391 false, // sweep memory-mapped memory
392 true, // load names?
393 true, // load SA sample?
394 true, // load ftab?
395 true, // load rstarts?
396 verbose, // be talkative?
397 verbose, // be talkative at startup?
398 false, // pass up memory exceptions?
399 false); // sanity check?
400 // Load whole index into memory
401 if(refFromEbwt) {
402 ebwt.loadIntoMemory(
403 -1, // color
404 -1, // need entire reverse
405 true, // load SA sample
406 true, // load ftab
407 true, // load rstarts
408 true, // load names
409 false); // verbose
410 print_index_sequences<SString<char> >(cout, ebwt);
411 } else {
412 EList<string> refnames;
413 readEbwtRefnames(adjustedEbwtFileBase, refnames);
414 print_ref_sequences(
415 cout,
416 readEbwtColor(ebwtFileBase),
417 refnames,
418 ebwt.plen(),
419 adjustedEbwtFileBase);
420 }
421 // Evict any loaded indexes from memory
422 if(ebwt.isInMemory()) {
423 ebwt.evictFromMemory();
424 }
425 }
426 }
427
428 /**
429 * main function. Parses command-line arguments.
430 */
main(int argc,char ** argv)431 int main(int argc, char **argv) {
432 try {
433 string ebwtFile; // read serialized Ebwt from this file
434 string query; // read query string(s) from this file
435 EList<string> queries;
436 string outfile; // write query results to this file
437 argv0 = argv[0];
438 parseOptions(argc, argv);
439 if(showVersion) {
440 cout << argv0 << " version " << BOWTIE2_VERSION << endl;
441 if(sizeof(void*) == 4) {
442 cout << "32-bit" << endl;
443 } else if(sizeof(void*) == 8) {
444 cout << "64-bit" << endl;
445 } else {
446 cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
447 }
448 cout << "Built on " << BUILD_HOST << endl;
449 cout << BUILD_TIME << endl;
450 cout << "Compiler: " << COMPILER_VERSION << endl;
451 cout << "Options: " << COMPILER_OPTIONS << endl;
452 cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
453 << sizeof(int)
454 << ", " << sizeof(long) << ", " << sizeof(long long)
455 << ", " << sizeof(void *) << ", " << sizeof(size_t)
456 << ", " << sizeof(off_t) << "}" << endl;
457 return 0;
458 }
459
460 // Get input filename
461 if(optind >= argc) {
462 cerr << "No index name given!" << endl;
463 printUsage(cerr);
464 return 1;
465 }
466 ebwtFile = argv[optind++];
467
468 // Optionally summarize
469 if(verbose) {
470 cout << "Input ebwt file: \"" << ebwtFile.c_str() << "\"" << endl;
471 cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
472 cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
473 #ifdef NDEBUG
474 cout << "Assertions: disabled" << endl;
475 #else
476 cout << "Assertions: enabled" << endl;
477 #endif
478 }
479 driver(ebwtFile, query);
480 return 0;
481 } catch(std::exception& e) {
482 cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
483 cerr << "Command: ";
484 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
485 cerr << endl;
486 return 1;
487 } catch(int e) {
488 if(e != 0) {
489 cerr << "Error: Encountered internal Bowtie 2 exception (#" << e << ")" << endl;
490 cerr << "Command: ";
491 for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
492 cerr << endl;
493 }
494 return e;
495 }
496 }
497
498 #endif /*def BOWTIE_INSPECT_MAIN*/
499