1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifdef BOWTIE_INSPECT_MAIN
21 
22 #include <string>
23 #include <iostream>
24 #include <getopt.h>
25 #include <stdexcept>
26 
27 #include "assert_helpers.h"
28 #include "endian_swap.h"
29 #include "bt2_idx.h"
30 #include "reference.h"
31 #include "ds.h"
32 
33 using namespace std;
34 
35 static bool showVersion = false; // just print version and quit?
36 int verbose             = 0;  // be talkative
37 static int names_only   = 0;  // just print the sequence names in the index
38 static int summarize_only = 0; // just print summary of index and quit
39 static int across       = 60; // number of characters across in FASTA output
40 static bool refFromEbwt = false; // true -> when printing reference, decode it from Ebwt instead of reading it from BitPairReference
41 static string wrapper;
42 static const char *short_options = "vhnsea:";
43 
44 enum {
45 	ARG_VERSION = 256,
46 	ARG_WRAPPER,
47 	ARG_USAGE,
48 };
49 
50 static struct option long_options[] = {
51 	{(char*)"verbose",  no_argument,        0, 'v'},
52 	{(char*)"version",  no_argument,        0, ARG_VERSION},
53 	{(char*)"usage",    no_argument,        0, ARG_USAGE},
54 	{(char*)"names",    no_argument,        0, 'n'},
55 	{(char*)"summary",  no_argument,        0, 's'},
56 	{(char*)"help",     no_argument,        0, 'h'},
57 	{(char*)"across",   required_argument,  0, 'a'},
58 	{(char*)"ebwt-ref", no_argument,        0, 'e'},
59 	{(char*)"wrapper",  required_argument,  0, ARG_WRAPPER},
60 	{(char*)0, 0, 0, 0} // terminator
61 };
62 
63 /**
64  * Print a summary usage message to the provided output stream.
65  */
printUsage(ostream & out)66 static void printUsage(ostream& out) {
67 	out << "Bowtie 2 version " << string(BOWTIE2_VERSION).c_str() << " by Ben Langmead (langmea@cs.jhu.edu, www.cs.jhu.edu/~langmea)" << endl;
68 	out
69 	<< "Usage: bowtie2-inspect [options]* <bt2_base>" << endl
70 	<< "  <bt2_base>         bt2 filename minus trailing .1." + gEbwt_ext + "/.2." + gEbwt_ext << endl
71 	<< endl
72 	<< "  By default, prints FASTA records of the indexed nucleotide sequences to" << endl
73 	<< "  standard out.  With -n, just prints names.  With -s, just prints a summary of" << endl
74 	<< "  the index parameters and sequences." << endl
75 	<< endl
76 	<< "Options:" << endl;
77 	if(wrapper == "basic-0") {
78 		out << "  --large-index      force inspection of the 'large' index, even if a" << endl
79 			<< "                     'small' one is present." << endl
80 			<< "  --debug            use the debug binary; slower, assertions enabled" << endl
81 			<< "  --sanitized        use sanitized binary; slower, uses ASan and/or UBSan" << endl
82 			<< "  --verbose          log the issued command" << endl;
83 	}
84 	out << "  -a/--across <int>  Number of characters across in FASTA output (default: 60)" << endl
85 	<< "  -n/--names         Print reference sequence names only" << endl
86 	<< "  -s/--summary       Print summary incl. ref names, lengths, index properties" << endl
87 	<< "  -v/--verbose       Verbose output (for debugging)" << endl
88 	<< "  -h/--help          print detailed description of tool and its options" << endl
89 	;
90 	if(wrapper.empty()) {
91 		cerr << endl
92 		     << "*** Warning ***" << endl
93 			 << "'boowtie2-inspect' was run directly.  It is recommended "
94 			 << "to use the wrapper script instead."
95 			 << endl << endl;
96 	}
97 }
98 
99 /**
100  * Parse an int out of optarg and enforce that it be at least 'lower';
101  * if it is less than 'lower', than output the given error message and
102  * exit with an error and a usage message.
103  */
parseInt(int lower,const char * errmsg)104 static int parseInt(int lower, const char *errmsg) {
105 	long l;
106 	char *endPtr= NULL;
107 	l = strtol(optarg, &endPtr, 10);
108 	if (endPtr != NULL) {
109 		if (l < lower) {
110 			cerr << errmsg << endl;
111 			printUsage(cerr);
112 			throw 1;
113 		}
114 		return (int32_t)l;
115 	}
116 	cerr << errmsg << endl;
117 	printUsage(cerr);
118 	throw 1;
119 	return -1;
120 }
121 
122 /**
123  * Read command-line arguments
124  */
parseOptions(int argc,char ** argv)125 static void parseOptions(int argc, char **argv) {
126 	int option_index = 0;
127 	int next_option;
128 	do {
129 		next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
130 		switch (next_option) {
131 			case ARG_WRAPPER:
132 				wrapper = optarg;
133 				break;
134 			case ARG_USAGE:
135 			case 'h':
136 				printUsage(cout);
137 				throw 0;
138 				break;
139 			case 'v': verbose = true; break;
140 			case ARG_VERSION: showVersion = true; break;
141 			case 'e': refFromEbwt = true; break;
142 			case 'n': names_only = true; break;
143 			case 's': summarize_only = true; break;
144 			case 'a': across = parseInt(-1, "-a/--across arg must be at least 1"); break;
145 			case -1: break; /* Done with options. */
146 			case 0:
147 				if (long_options[option_index].flag != 0)
148 					break;
149 			default:
150 				printUsage(cerr);
151 				throw 1;
152 		}
153 	} while(next_option != -1);
154 }
155 
print_fasta_record(ostream & fout,const string & defline,const string & seq)156 static void print_fasta_record(
157 	ostream& fout,
158 	const string& defline,
159 	const string& seq)
160 {
161 	fout << ">";
162 	fout << defline.c_str() << endl;
163 
164 	if(across > 0) {
165 		size_t i = 0;
166 		while (i + across < seq.length())
167 		{
168 			fout << seq.substr(i, across).c_str() << endl;
169 			i += across;
170 		}
171 		if (i < seq.length())
172 			fout << seq.substr(i).c_str() << endl;
173 	} else {
174 		fout << seq.c_str() << endl;
175 	}
176 }
177 
178 /**
179  * Given output stream, BitPairReference, reference index, name and
180  * length, print the whole nucleotide reference with the appropriate
181  * number of columns.
182  */
print_ref_sequence(ostream & fout,BitPairReference & ref,const string & name,size_t refi,size_t len)183 static void print_ref_sequence(
184 	ostream& fout,
185 	BitPairReference& ref,
186 	const string& name,
187 	size_t refi,
188 	size_t len)
189 {
190 	bool newlines = across > 0;
191 	int myacross = across > 0 ? across : 60;
192 	size_t incr = myacross * 1000;
193 	uint32_t *buf = new uint32_t[(incr + 128)/4];
194 	fout << ">" << name.c_str() << "\n";
195 	ASSERT_ONLY(SStringExpandable<uint32_t> destU32);
196 	for(size_t i = 0; i < len; i += incr) {
197 		size_t amt = min(incr, len-i);
198 		assert_leq(amt, incr);
199 		int off = ref.getStretch(buf, refi, i, amt ASSERT_ONLY(, destU32));
200 		uint8_t *cb = ((uint8_t*)buf) + off;
201 		for(size_t j = 0; j < amt; j++) {
202 			if(newlines && j > 0 && (j % myacross) == 0) fout << "\n";
203 			assert_range(0, 4, (int)cb[j]);
204 			fout << "ACGTN"[(int)cb[j]];
205 		}
206 		fout << "\n";
207 	}
208 	delete [] buf;
209 }
210 
211 /**
212  * Create a BitPairReference encapsulating the reference portion of the
213  * index at the given basename.  Iterate through the reference
214  * sequences, sending each one to print_ref_sequence to print.
215  */
print_ref_sequences(ostream & fout,bool color,const EList<string> & refnames,const TIndexOffU * plen,const string & adjustedEbwtFileBase)216 static void print_ref_sequences(
217 	ostream& fout,
218 	bool color,
219 	const EList<string>& refnames,
220 	const TIndexOffU* plen,
221 	const string& adjustedEbwtFileBase)
222 {
223 	BitPairReference ref(
224 		adjustedEbwtFileBase, // input basename
225 		color,                // true -> expect colorspace reference
226 		false,                // sanity-check reference
227 		NULL,                 // infiles
228 		NULL,                 // originals
229 		false,                // infiles are sequences
230 		false,                // memory-map
231 		false,                // use shared memory
232 		false,                // sweep mm-mapped ref
233 		verbose,              // be talkative
234 		verbose);             // be talkative at startup
235 	assert_eq(ref.numRefs(), refnames.size());
236 	for(size_t i = 0; i < ref.numRefs(); i++) {
237 		print_ref_sequence(
238 			fout,
239 			ref,
240 			refnames[i],
241 			i,
242 			plen[i] + (color ? 1 : 0));
243 	}
244 }
245 
246 /**
247  * Given an index, reconstruct the reference by LF mapping through the
248  * entire thing.
249  */
250 template<typename TStr>
print_index_sequences(ostream & fout,Ebwt & ebwt)251 static void print_index_sequences(ostream& fout, Ebwt& ebwt)
252 {
253 	EList<string>* refnames = &(ebwt.refnames());
254 
255 	TStr cat_ref;
256 	ebwt.restore(cat_ref);
257 
258 	TIndexOffU curr_ref = OFF_MASK;
259 	string curr_ref_seq = "";
260 	TIndexOffU curr_ref_len = OFF_MASK;
261 	TIndexOffU last_text_off = 0;
262 	size_t orig_len = cat_ref.length();
263 	TIndexOffU tlen = OFF_MASK;
264 	bool first = true;
265 	for(size_t i = 0; i < orig_len; i++) {
266 		TIndexOffU tidx = OFF_MASK;
267 		TIndexOffU textoff = OFF_MASK;
268 		tlen = OFF_MASK;
269 		bool straddled = false;
270 		ebwt.joinedToTextOff(1 /* qlen */, (TIndexOffU)i, tidx, textoff, tlen, true, straddled);
271 
272 		if (tidx != OFF_MASK && textoff < tlen)
273 		{
274 			if (curr_ref != tidx)
275 			{
276 				if (curr_ref != OFF_MASK)
277 				{
278 					// Add trailing gaps, if any exist
279 					if(curr_ref_seq.length() < curr_ref_len) {
280 						curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
281 					}
282 					print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
283 				}
284 				curr_ref = tidx;
285 				curr_ref_seq = "";
286 				curr_ref_len = tlen;
287 				last_text_off = 0;
288 				first = true;
289 			}
290 
291 			TIndexOffU textoff_adj = textoff;
292 			if(first && textoff > 0) textoff_adj++;
293 			if (textoff_adj - last_text_off > 1)
294 				curr_ref_seq += string(textoff_adj - last_text_off - 1, 'N');
295 
296 			curr_ref_seq.push_back("ACGT"[int(cat_ref[i])]);
297 			last_text_off = textoff;
298 			first = false;
299 		}
300 	}
301 	if (curr_ref < refnames->size())
302 	{
303 		// Add trailing gaps, if any exist
304 		if(curr_ref_seq.length() < curr_ref_len) {
305 			curr_ref_seq += string(curr_ref_len - curr_ref_seq.length(), 'N');
306 		}
307 		print_fasta_record(fout, (*refnames)[curr_ref], curr_ref_seq);
308 	}
309 
310 }
311 
312 static char *argv0 = NULL;
313 
print_index_sequence_names(const string & fname,ostream & fout)314 static void print_index_sequence_names(const string& fname, ostream& fout)
315 {
316 	EList<string> p_refnames;
317 	readEbwtRefnames(fname, p_refnames);
318 	for(size_t i = 0; i < p_refnames.size(); i++) {
319 		cout << p_refnames[i].c_str() << endl;
320 	}
321 }
322 
323 /**
324  * Print a short summary of what's in the index and its flags.
325  */
print_index_summary(const string & fname,ostream & fout)326 static void print_index_summary(
327 	const string& fname,
328 	ostream& fout)
329 {
330 	int32_t flags = Ebwt::readFlags(fname);
331 	int32_t flagsr = Ebwt::readFlags(fname + ".rev");
332 	bool color = readEbwtColor(fname);
333 	bool entireReverse = readEntireReverse(fname + ".rev");
334 	Ebwt ebwt(
335 		fname,
336 		color,                // index is colorspace
337 		-1,                   // don't require entire reverse
338 		true,                 // index is for the forward direction
339 		-1,                   // offrate (-1 = index default)
340 		0,                    // offrate-plus (0 = index default)
341 		false,                // use memory-mapped IO
342 		false,                // use shared memory
343 		false,                // sweep memory-mapped memory
344 		true,                 // load names?
345 		false,                // load SA sample?
346 		false,                // load ftab?
347 		false,                // load rstarts?
348 		verbose,              // be talkative?
349 		verbose,              // be talkative at startup?
350 		false,                // pass up memory exceptions?
351 		false);               // sanity check?
352 	EList<string> p_refnames;
353 	readEbwtRefnames(fname, p_refnames);
354 	cout << "Flags" << '\t' << (-flags) << endl;
355 	cout << "Reverse flags" << '\t' << (-flagsr) << endl;
356 	cout << "2.0-compatible" << '\t' << (entireReverse ? "1" : "0") << endl;
357 	cout << "SA-Sample" << "\t1 in " << (1 << ebwt.eh().offRate()) << endl;
358 	cout << "FTab-Chars" << '\t' << ebwt.eh().ftabChars() << endl;
359 	assert_eq(ebwt.nPat(), p_refnames.size());
360 	for(size_t i = 0; i < p_refnames.size(); i++) {
361 		cout << "Sequence-" << (i+1)
362 		     << '\t' << p_refnames[i].c_str()
363 		     << '\t' << (ebwt.plen()[i] + (color ? 1 : 0))
364 		     << endl;
365 	}
366 }
367 
driver(const string & ebwtFileBase,const string & query)368 static void driver(
369 	const string& ebwtFileBase,
370 	const string& query)
371 {
372 	// Adjust
373 	string adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
374 
375 	if (names_only) {
376 		print_index_sequence_names(adjustedEbwtFileBase, cout);
377 	} else if(summarize_only) {
378 		print_index_summary(adjustedEbwtFileBase, cout);
379 	} else {
380 		// Initialize Ebwt object
381 		bool color = readEbwtColor(adjustedEbwtFileBase);
382 		Ebwt ebwt(
383 			adjustedEbwtFileBase,
384 			color,                // index is colorspace
385 			-1,                   // don't care about entire-reverse
386 			true,                 // index is for the forward direction
387 			-1,                   // offrate (-1 = index default)
388 			0,                    // offrate-plus (0 = index default)
389 			false,                // use memory-mapped IO
390 			false,                // use shared memory
391 			false,                // sweep memory-mapped memory
392 			true,                 // load names?
393 			true,                 // load SA sample?
394 			true,                 // load ftab?
395 			true,                 // load rstarts?
396 			verbose,              // be talkative?
397 			verbose,              // be talkative at startup?
398 			false,                // pass up memory exceptions?
399 			false);               // sanity check?
400 		// Load whole index into memory
401 		if(refFromEbwt) {
402 			ebwt.loadIntoMemory(
403 				-1,     // color
404 				-1,     // need entire reverse
405 				true,   // load SA sample
406 				true,   // load ftab
407 				true,   // load rstarts
408 				true,   // load names
409 				false); // verbose
410 			print_index_sequences<SString<char> >(cout, ebwt);
411 		} else {
412 			EList<string> refnames;
413 			readEbwtRefnames(adjustedEbwtFileBase, refnames);
414 			print_ref_sequences(
415 				cout,
416 				readEbwtColor(ebwtFileBase),
417 				refnames,
418 				ebwt.plen(),
419 				adjustedEbwtFileBase);
420 		}
421 		// Evict any loaded indexes from memory
422 		if(ebwt.isInMemory()) {
423 			ebwt.evictFromMemory();
424 		}
425 	}
426 }
427 
428 /**
429  * main function.  Parses command-line arguments.
430  */
main(int argc,char ** argv)431 int main(int argc, char **argv) {
432 	try {
433 		string ebwtFile;  // read serialized Ebwt from this file
434 		string query;   // read query string(s) from this file
435 		EList<string> queries;
436 		string outfile; // write query results to this file
437 		argv0 = argv[0];
438 		parseOptions(argc, argv);
439 		if(showVersion) {
440 			cout << argv0 << " version " << BOWTIE2_VERSION << endl;
441 			if(sizeof(void*) == 4) {
442 				cout << "32-bit" << endl;
443 			} else if(sizeof(void*) == 8) {
444 				cout << "64-bit" << endl;
445 			} else {
446 				cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
447 			}
448 			cout << "Built on " << BUILD_HOST << endl;
449 			cout << BUILD_TIME << endl;
450 			cout << "Compiler: " << COMPILER_VERSION << endl;
451 			cout << "Options: " << COMPILER_OPTIONS << endl;
452 			cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
453 				 << sizeof(int)
454 				 << ", " << sizeof(long) << ", " << sizeof(long long)
455 				 << ", " << sizeof(void *) << ", " << sizeof(size_t)
456 				 << ", " << sizeof(off_t) << "}" << endl;
457 			return 0;
458 		}
459 
460 		// Get input filename
461 		if(optind >= argc) {
462 			cerr << "No index name given!" << endl;
463 			printUsage(cerr);
464 			return 1;
465 		}
466 		ebwtFile = argv[optind++];
467 
468 		// Optionally summarize
469 		if(verbose) {
470 			cout << "Input ebwt file: \"" << ebwtFile.c_str() << "\"" << endl;
471 			cout << "Output file: \"" << outfile.c_str() << "\"" << endl;
472 			cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
473 #ifdef NDEBUG
474 			cout << "Assertions: disabled" << endl;
475 #else
476 			cout << "Assertions: enabled" << endl;
477 #endif
478 		}
479 		driver(ebwtFile, query);
480 		return 0;
481 	} catch(std::exception& e) {
482 		cerr << "Error: Encountered exception: '" << e.what() << "'" << endl;
483 		cerr << "Command: ";
484 		for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
485 		cerr << endl;
486 		return 1;
487 	} catch(int e) {
488 		if(e != 0) {
489 			cerr << "Error: Encountered internal Bowtie 2 exception (#" << e << ")" << endl;
490 			cerr << "Command: ";
491 			for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
492 			cerr << endl;
493 		}
494 		return e;
495 	}
496 }
497 
498 #endif /*def BOWTIE_INSPECT_MAIN*/
499