1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef EBWT_H_
21 #define EBWT_H_
22 
23 #include <stdint.h>
24 #include <string.h>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <memory>
29 #include <fcntl.h>
30 #include <math.h>
31 #include <errno.h>
32 #include <stdexcept>
33 #include <sys/stat.h>
34 #ifdef BOWTIE_MM
35 #include <sys/mman.h>
36 #include <sys/shm.h>
37 #endif
38 #include "shmem.h"
39 #include "alphabet.h"
40 #include "assert_helpers.h"
41 #include "bitpack.h"
42 #include "blockwise_sa.h"
43 #include "endian_swap.h"
44 #include "word_io.h"
45 #include "random_source.h"
46 #include "ref_read.h"
47 #include "threading.h"
48 #include "str_util.h"
49 #include "mm.h"
50 #include "timer.h"
51 #include "reference.h"
52 #include "search_globals.h"
53 #include "ds.h"
54 #include "random_source.h"
55 #include "mem_ids.h"
56 #include "btypes.h"
57 
58 #ifdef POPCNT_CAPABILITY
59 #include "processor_support.h"
60 #endif
61 
62 #if __cplusplus <= 199711L
63 #define unique_ptr auto_ptr
64 #endif
65 
66 using namespace std;
67 
68 // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
69 extern uint8_t cCntLUT_4[4][4][256];
70 
71 static const uint64_t c_table[4] = {
72 	0xffffffffffffffff,
73 	0xaaaaaaaaaaaaaaaa,
74 	0x5555555555555555,
75 	0x0000000000000000
76 };
77 
78 #ifndef VMSG_NL
79 #define VMSG_NL(...)				\
80 	if(this->verbose()) {			\
81 		stringstream tmp;		\
82 		tmp << __VA_ARGS__ << endl;	\
83 		this->verbose(tmp.str());	\
84 	}
85 #endif
86 
87 #ifndef VMSG
88 #define VMSG(...)				\
89 	if(this->verbose()) {			\
90 		stringstream tmp;		\
91 		tmp << __VA_ARGS__;		\
92 		this->verbose(tmp.str());	\
93 	}
94 #endif
95 
96 /**
97  * Flags describing type of Ebwt.
98  */
99 enum EBWT_FLAGS {
100 	EBWT_COLOR = 2,     // true -> Ebwt is colorspace
101 	EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
102 	                    // concatenated string reversed, rather than
103 	// each stretch reversed
104 };
105 
106 /**
107  * Extended Burrows-Wheeler transform header.  This together with the
108  * actual data arrays and other text-specific parameters defined in
109  * class Ebwt constitute the entire Ebwt.
110  */
111 class EbwtParams {
112 
113 public:
EbwtParams()114 	EbwtParams() { }
115 
EbwtParams(TIndexOffU len,int32_t lineRate,int32_t offRate,int32_t ftabChars,bool color,bool entireReverse)116 	EbwtParams(
117 		TIndexOffU len,
118 		int32_t lineRate,
119 		int32_t offRate,
120 		int32_t ftabChars,
121 		bool color,
122 		bool entireReverse)
123 		{
124 			init(len, lineRate, offRate, ftabChars, color, entireReverse);
125 		}
126 
EbwtParams(const EbwtParams & eh)127 	EbwtParams(const EbwtParams& eh) {
128 		init(eh._len, eh._lineRate, eh._offRate,
129 		     eh._ftabChars, eh._color, eh._entireReverse);
130 	}
131 
init(TIndexOffU len,int32_t lineRate,int32_t offRate,int32_t ftabChars,bool color,bool entireReverse)132 	void init(
133 		TIndexOffU len,
134 		int32_t lineRate,
135 		int32_t offRate,
136 		int32_t ftabChars,
137 		bool color,
138 		bool entireReverse)
139 		{
140 			_color = color;
141 			_entireReverse = entireReverse;
142 			_len = len;
143 			_bwtLen = _len + 1;
144 			_sz = (len+3)/4;
145 			_bwtSz = (len/4 + 1);
146 			_lineRate = lineRate;
147 			_origOffRate = offRate;
148 			_offRate = offRate;
149 			_offMask = OFF_MASK << _offRate;
150 			_ftabChars = ftabChars;
151 			_eftabLen = _ftabChars*2;
152 			_eftabSz = _eftabLen*OFF_SIZE;
153 			_ftabLen = (1 << (_ftabChars*2))+1;
154 			_ftabSz = _ftabLen*OFF_SIZE;
155 			_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
156 			_offsSz = (uint64_t)_offsLen*OFF_SIZE;
157 			_lineSz = 1 << _lineRate;
158 			_sideSz = _lineSz * 1 /* lines per side */;
159 			_sideBwtSz = _sideSz - OFF_SIZE*4;
160 			_sideBwtLen = _sideBwtSz*4;
161 			_numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
162 			_numLines = _numSides * 1 /* lines per side */;
163 			_ebwtTotLen = _numSides * _sideSz;
164 			_ebwtTotSz = _ebwtTotLen;
165 			assert(repOk());
166 		}
167 
len()168 	TIndexOffU len() const           { return _len; }
lenNucs()169 	TIndexOffU lenNucs() const       { return _len + (_color ? 1 : 0); }
bwtLen()170 	TIndexOffU bwtLen() const        { return _bwtLen; }
sz()171 	TIndexOffU sz() const            { return _sz; }
bwtSz()172 	TIndexOffU bwtSz() const         { return _bwtSz; }
lineRate()173 	int32_t   lineRate() const      { return _lineRate; }
origOffRate()174 	int32_t   origOffRate() const   { return _origOffRate; }
offRate()175 	int32_t   offRate() const       { return _offRate; }
offMask()176 	TIndexOffU offMask() const       { return _offMask; }
ftabChars()177 	int32_t   ftabChars() const     { return _ftabChars; }
eftabLen()178 	int32_t eftabLen() const      { return _eftabLen; }
eftabSz()179 	int32_t eftabSz() const       { return _eftabSz; }
ftabLen()180 	TIndexOffU ftabLen() const       { return _ftabLen; }
ftabSz()181 	TIndexOffU ftabSz() const        { return _ftabSz; }
offsLen()182 	TIndexOffU offsLen() const       { return _offsLen; }
offsSz()183 	uint64_t offsSz() const        { return _offsSz; }
lineSz()184 	int32_t lineSz() const        { return _lineSz; }
sideSz()185 	int32_t sideSz() const        { return _sideSz; }
sideBwtSz()186 	int32_t sideBwtSz() const     { return _sideBwtSz; }
sideBwtLen()187 	int32_t sideBwtLen() const    { return _sideBwtLen; }
numSides()188 	TIndexOffU numSides() const      { return _numSides; }
numLines()189 	TIndexOffU numLines() const      { return _numLines; }
ebwtTotLen()190 	TIndexOffU ebwtTotLen() const    { return _ebwtTotLen; }
ebwtTotSz()191 	TIndexOffU ebwtTotSz() const     { return _ebwtTotSz; }
color()192 	bool color() const             { return _color; }
entireReverse()193 	bool entireReverse() const     { return _entireReverse; }
194 
195 	/**
196 	 * Set a new suffix-array sampling rate, which involves updating
197 	 * rate, mask, sample length, and sample size.
198 	 */
setOffRate(int __offRate)199 	void setOffRate(int __offRate) {
200 		_offRate = __offRate;
201 		_offMask = OFF_MASK << _offRate;
202 		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
203 		_offsSz = (uint64_t)_offsLen * OFF_SIZE;
204 	}
205 
206 #ifndef NDEBUG
207 	/// Check that this EbwtParams is internally consistent
repOk()208 	bool repOk() const {
209 		assert_gt(_len, 0);
210 		assert_gt(_lineRate, 3);
211 		assert_geq(_offRate, 0);
212 		assert_leq(_ftabChars, 16);
213 		assert_geq(_ftabChars, 1);
214 		assert_lt(_lineRate, 32);
215 		assert_lt(_ftabChars, 32);
216 		assert_eq(0, _ebwtTotSz % _lineSz);
217 		return true;
218 	}
219 #endif
220 
221 	/**
222 	 * Pretty-print the header contents to the given output stream.
223 	 */
print(ostream & out)224 	void print(ostream& out) const {
225 		out << "Headers:" << endl
226 		    << "    len: "          << _len << endl
227 		    << "    bwtLen: "       << _bwtLen << endl
228 		    << "    sz: "           << _sz << endl
229 		    << "    bwtSz: "        << _bwtSz << endl
230 		    << "    lineRate: "     << _lineRate << endl
231 		    << "    offRate: "      << _offRate << endl
232 		    << "    offMask: 0x"    << hex << _offMask << dec << endl
233 		    << "    ftabChars: "    << _ftabChars << endl
234 		    << "    eftabLen: "     << _eftabLen << endl
235 		    << "    eftabSz: "      << _eftabSz << endl
236 		    << "    ftabLen: "      << _ftabLen << endl
237 		    << "    ftabSz: "       << _ftabSz << endl
238 		    << "    offsLen: "      << _offsLen << endl
239 		    << "    offsSz: "       << _offsSz << endl
240 		    << "    lineSz: "       << _lineSz << endl
241 		    << "    sideSz: "       << _sideSz << endl
242 		    << "    sideBwtSz: "    << _sideBwtSz << endl
243 		    << "    sideBwtLen: "   << _sideBwtLen << endl
244 		    << "    numSides: "     << _numSides << endl
245 		    << "    numLines: "     << _numLines << endl
246 		    << "    ebwtTotLen: "   << _ebwtTotLen << endl
247 		    << "    ebwtTotSz: "    << _ebwtTotSz << endl
248 		    << "    color: "        << _color << endl
249 		    << "    reverse: "      << _entireReverse << endl;
250 	}
251 
252 	TIndexOffU _len;
253 	TIndexOffU _bwtLen;
254 	TIndexOffU _sz;
255 	TIndexOffU _bwtSz;
256 	int32_t  _lineRate;
257 	int32_t  _origOffRate;
258 	int32_t  _offRate;
259 	TIndexOffU _offMask;
260 	int32_t  _ftabChars;
261 	uint32_t _eftabLen;
262 	uint32_t _eftabSz;
263 	TIndexOffU _ftabLen;
264 	TIndexOffU _ftabSz;
265 	TIndexOffU _offsLen;
266 	uint64_t _offsSz;
267 	uint32_t _lineSz;
268 	uint32_t _sideSz;
269 	uint32_t _sideBwtSz;
270 	uint32_t _sideBwtLen;
271 	TIndexOffU _numSides;
272 	TIndexOffU _numLines;
273 	TIndexOffU _ebwtTotLen;
274 	TIndexOffU _ebwtTotSz;
275 	bool     _color;
276 	bool     _entireReverse;
277 };
278 
279 /**
280  * Exception to throw when a file-realted error occurs.
281  */
282 class EbwtFileOpenException : public std::runtime_error {
283 public:
284 	EbwtFileOpenException(const std::string& msg = "") :
runtime_error(msg)285 		std::runtime_error(msg) { }
286 };
287 
288 /**
289  * Calculate size of file with given name.
290  */
fileSize(const char * name)291 static inline int64_t fileSize(const char* name) {
292 	std::ifstream f;
293 	f.open(name, std::ios_base::binary | std::ios_base::in);
294 	if (!f.good() || f.eof() || !f.is_open()) { return 0; }
295 	f.seekg(0, std::ios_base::beg);
296 	std::ifstream::pos_type begin_pos = f.tellg();
297 	f.seekg(0, std::ios_base::end);
298 	return static_cast<int64_t>(f.tellg() - begin_pos);
299 }
300 
301 /**
302  * Encapsulates a location in the bwt text in terms of the side it
303  * occurs in and its offset within the side.
304  */
305 struct SideLocus {
SideLocusSideLocus306 	SideLocus() :
307 		_sideByteOff(0),
308 		_sideNum(0),
309 		_charOff(0),
310 		_by(-1),
311 		_bp(-1) { }
312 
313 	/**
314 	 * Construct from row and other relevant information about the Ebwt.
315 	 */
SideLocusSideLocus316 	SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
317 		initFromRow(row, ep, ebwt);
318 	}
319 
320 	/**
321 	 * Init two SideLocus objects from a top/bot pair, using the result
322 	 * from one call to initFromRow to possibly avoid a second call.
323 	 */
initFromTopBotSideLocus324 	static void initFromTopBot(
325 		TIndexOffU top,
326 		TIndexOffU bot,
327 		const EbwtParams& ep,
328 		const uint8_t* ebwt,
329 		SideLocus& ltop,
330 		SideLocus& lbot)
331 		{
332 			const TIndexOffU sideBwtLen = ep._sideBwtLen;
333 			assert_gt(bot, top);
334 			ltop.initFromRow(top, ep, ebwt);
335 			TIndexOffU spread = bot - top;
336 			// Many cache misses on the following lines
337 			if(ltop._charOff + spread < sideBwtLen) {
338 				lbot._charOff = (uint32_t)(ltop._charOff + spread);
339 				lbot._sideNum = ltop._sideNum;
340 				lbot._sideByteOff = ltop._sideByteOff;
341 				lbot._by = lbot._charOff >> 2;
342 				assert_lt(lbot._by, (int)ep._sideBwtSz);
343 				lbot._bp = lbot._charOff & 3;
344 			} else {
345 				lbot.initFromRow(bot, ep, ebwt);
346 			}
347 		}
348 
349 	/**
350 	 * Calculate SideLocus based on a row and other relevant
351 	 * information about the shape of the Ebwt.
352 	 */
initFromRowSideLocus353 	void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
354 		const int32_t sideSz     = ep._sideSz;
355 		// Side length is hard-coded for now; this allows the compiler
356 		// to do clever things to accelerate / and %.
357 		_sideNum                  = row / (48*OFF_SIZE);
358 		assert_lt(_sideNum, ep._numSides);
359 		_charOff                  = row % (48*OFF_SIZE);
360 		_sideByteOff              = _sideNum * sideSz;
361 		assert_leq(row, ep._len);
362 		assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
363 		// Tons of cache misses on the next line
364 		_by = _charOff >> 2; // byte within side
365 		assert_lt(_by, (int)ep._sideBwtSz);
366 		_bp = _charOff & 3;  // bit-pair within byte
367 	}
368 
369 	/**
370 	 * Transform this SideLocus to refer to the next side (i.e. the one
371 	 * corresponding to the next side downstream).  Set all cursors to
372 	 * point to the beginning of the side.
373 	 */
nextSideSideLocus374 	void nextSide(const EbwtParams& ep) {
375 		assert(valid());
376 		_sideByteOff += ep.sideSz();
377 		_sideNum++;
378 		_by = _bp = _charOff = 0;
379 		assert(valid());
380 	}
381 
382 	/**
383 	 * Return true iff this is an initialized SideLocus
384 	 */
validSideLocus385 	bool valid() const {
386 		if(_bp != -1) {
387 			return true;
388 		}
389 		return false;
390 	}
391 
392 	/**
393 	 * Convert locus to BW row it corresponds to.
394 	 */
toBWRowSideLocus395 	TIndexOffU toBWRow() const {
396 		return _sideNum * 48*OFF_SIZE + _charOff;
397 	}
398 
399 #ifndef NDEBUG
400 	/**
401 	 * Check that SideLocus is internally consistent and consistent
402 	 * with the (provided) EbwtParams.
403 	 */
repOkSideLocus404 	bool repOk(const EbwtParams& ep) const {
405 		ASSERT_ONLY(TIndexOffU row = _sideNum * 48*OFF_SIZE + _charOff);
406 		assert_leq(row, ep._len);
407 		assert_range(-1, 3, _bp);
408 		assert_range(0, (int)ep._sideBwtSz, _by);
409 		return true;
410 	}
411 #endif
412 
413 	/// Make this look like an invalid SideLocus
invalidateSideLocus414 	void invalidate() {
415 		_bp = -1;
416 	}
417 
418 	/**
419 	 * Return a read-only pointer to the beginning of the top side.
420 	 */
sideSideLocus421 	const uint8_t *side(const uint8_t* ebwt) const {
422 		return ebwt + _sideByteOff;
423 	}
424 
425 	TIndexOffU _sideByteOff; // offset of top side within ebwt[]
426 	TIndexOffU _sideNum;     // index of side
427 	uint32_t _charOff;      // character offset within side
428 	int32_t _by;            // byte within side (not adjusted for bw sides)
429 	int32_t _bp;            // bitpair within byte (not adjusted for bw sides)
430 };
431 #ifdef POPCNT_CAPABILITY   // wrapping of "struct"
432 struct USE_POPCNT_GENERIC {
433 #endif
434 // Use this standard bit-bashing population count
pop64USE_POPCNT_GENERIC435 	inline static int pop64(uint64_t x) {
436 		// Lots of cache misses on following lines (>10K)
437 		x = x - ((x >> 1) & 0x5555555555555555llu);
438 		x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
439 		x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
440 		x = x + (x >> 8);
441 		x = x + (x >> 16);
442 		x = x + (x >> 32);
443 		return (int)(x & 0x3Fllu);
444 	}
445 #ifdef POPCNT_CAPABILITY  // wrapping a "struct"
446 };
447 #endif
448 
449 #ifdef POPCNT_CAPABILITY
450 struct USE_POPCNT_INSTRUCTION {
pop64USE_POPCNT_INSTRUCTION451 	inline static int pop64(uint64_t x) {
452 		return __builtin_popcountll(x);
453 	}
454 };
455 #endif
456 
457 /**
458  * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
459  * within a 64-bit argument.
460  */
461 #ifdef POPCNT_CAPABILITY
462 template<typename Operation>
463 #endif
countInU64(int c,uint64_t dw)464 inline static int countInU64(int c, uint64_t dw) {
465 	uint64_t c0 = c_table[c];
466 	uint64_t x0 = dw ^ c0;
467 	uint64_t x1 = (x0 >> 1);
468 	uint64_t x2 = x1 & (0x5555555555555555);
469 	uint64_t x3 = x0 & x2;
470 #ifdef POPCNT_CAPABILITY
471 	uint64_t tmp = Operation().pop64(x3);
472 #else
473 	uint64_t tmp = pop64(x3);
474 #endif
475 	return (int) tmp;
476 }
477 
478 // Forward declarations for Ebwt class
479 class EbwtSearchParams;
480 
481 /**
482  * Extended Burrows-Wheeler transform data.
483  *
484  * An Ebwt may be transferred to and from RAM with calls to
485  * evictFromMemory() and loadIntoMemory().  By default, a newly-created
486  * Ebwt is not loaded into memory; if the user would like to use a
487  * newly-created Ebwt to answer queries, they must first call
488  * loadIntoMemory().
489  */
490 class Ebwt {
491 public:
492 #define Ebwt_INITS					\
493 	_switchEndian(false),				\
494 		_overrideOffRate(overrideOffRate),	\
495 		_verbose(verbose),			\
496 		_passMemExc(passMemExc),		\
497 		_sanity(sanityCheck),			\
498 		fw_(fw),				\
499 		_in1(NULL),				\
500 		_in2(NULL),				\
501 		_zOff(OFF_MASK),			\
502 		_zEbwtByteOff(OFF_MASK),		\
503 		_zEbwtBpOff(-1),			\
504 		_nPat(0),				\
505 		_nFrag(0),				\
506 		_plen(EBWT_CAT),			\
507 		_rstarts(EBWT_CAT),			\
508 		_fchr(EBWT_CAT),			\
509 		_ftab(EBWT_CAT),			\
510 		_eftab(EBWT_CAT),			\
511 		_offs(EBWT_CAT),			\
512 		_ebwt(EBWT_CAT),			\
513 		_useMm(false),				\
514 		useShmem_(false),			\
515 		_refnames(EBWT_CAT),			\
516 		mmFile1_(NULL),				\
517 		mmFile2_(NULL)
518 
519 	/// Construct an Ebwt from the given input file
Ebwt(const string & in,int color,int needEntireReverse,bool fw,int32_t overrideOffRate,int32_t offRatePlus,bool useMm,bool useShmem,bool mmSweep,bool loadNames,bool loadSASamp,bool loadFtab,bool loadRstarts,bool verbose,bool startVerbose,bool passMemExc,bool sanityCheck)520 	Ebwt(const string& in,
521 	     int color,
522 	     int needEntireReverse,
523 	     bool fw,
524 	     int32_t overrideOffRate, // = -1,
525 	     int32_t offRatePlus, // = -1,
526 	     bool useMm, // = false,
527 	     bool useShmem, // = false,
528 	     bool mmSweep, // = false,
529 	     bool loadNames, // = false,
530 	     bool loadSASamp, // = true,
531 	     bool loadFtab, // = true,
532 	     bool loadRstarts, // = true,
533 	     bool verbose, // = false,
534 	     bool startVerbose, // = false,
535 	     bool passMemExc, // = false,
536 	     bool sanityCheck) : // = false) :
537 		Ebwt_INITS
538 	{
539 		assert(!useMm || !useShmem);
540 #ifdef POPCNT_CAPABILITY
541 		ProcessorSupport ps;
542 		_usePOPCNTinstruction = ps.POPCNTenabled();
543 #endif
544 		packed_ = false;
545 		_useMm = useMm;
546 		useShmem_ = useShmem;
547 		_in1Str = in + ".1." + gEbwt_ext;
548 		_in2Str = in + ".2." + gEbwt_ext;
549 		readIntoMemory(
550 			color,       // expect index to be colorspace?
551 			fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
552 			loadSASamp,  // load the SA sample portion?
553 			loadFtab,    // load the ftab & eftab?
554 			loadRstarts, // load the rstarts array?
555 			true,        // stop after loading the header portion?
556 			&_eh,        // params
557 			mmSweep,     // mmSweep
558 			loadNames,   // loadNames
559 			startVerbose); // startVerbose
560 		// If the offRate has been overridden, reflect that in the
561 		// _eh._offRate field
562 		if(offRatePlus > 0 && _overrideOffRate == -1) {
563 			_overrideOffRate = _eh._offRate + offRatePlus;
564 		}
565 		if(_overrideOffRate > _eh._offRate) {
566 			_eh.setOffRate(_overrideOffRate);
567 			assert_eq(_overrideOffRate, _eh._offRate);
568 		}
569 		assert(repOk());
570 	}
571 
572 	/// Construct an Ebwt from the given header parameters and string
573 	/// vector, optionally using a blockwise suffix sorter with the
574 	/// given 'bmax' and 'dcv' parameters.  The string vector is
575 	/// ultimately joined and the joined string is passed to buildToDisk().
576 	template<typename TStr>
577 	Ebwt(
578 		TStr exampleStr,
579 		bool packed,
580 		int color,
581 		int needEntireReverse,
582 		int32_t lineRate,
583 		int32_t offRate,
584 		int32_t ftabChars,
585 		int nthreads,
586 		const string& file,   // base filename for EBWT files
587 		bool fw,
588 		bool useBlockwise,
589 		TIndexOffU bmax,
590 		TIndexOffU bmaxSqrtMult,
591 		TIndexOffU bmaxDivN,
592 		int dcv,
593 		EList<FileBuf*>& is,
594 		EList<RefRecord>& szs,
595 		TIndexOffU sztot,
596 		const RefReadInParams& refparams,
597 		uint32_t seed,
598 		int32_t overrideOffRate = -1,
599 		bool doSaFile = false,
600 		bool doBwtFile = false,
601 		bool verbose = false,
602 		bool passMemExc = false,
603 		bool sanityCheck = false) :
604 		Ebwt_INITS,
605 	_eh(
606 		joinedLen(szs),
607 		lineRate,
608 		offRate,
609 		ftabChars,
610 		color,
611 		refparams.reverse == REF_READ_REVERSE)
612 	{
613 #ifdef POPCNT_CAPABILITY
614 		ProcessorSupport ps;
615 		_usePOPCNTinstruction = ps.POPCNTenabled();
616 #endif
617 		_in1Str = file + ".1." + gEbwt_ext;
618 		_in2Str = file + ".2." + gEbwt_ext;
619 		packed_ = packed;
620 		// Open output files
621 		ofstream fout1(_in1Str.c_str(), ios::binary);
622 		if(!fout1.good()) {
623 			cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
624 			     << "Please make sure the directory exists and that permissions allow writing by" << endl
625 			     << "Bowtie." << endl;
626 			throw 1;
627 		}
628 		ofstream fout2(_in2Str.c_str(), ios::binary);
629 		if(!fout2.good()) {
630 			cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
631 			     << "Please make sure the directory exists and that permissions allow writing by" << endl
632 			     << "Bowtie." << endl;
633 			throw 1;
634 		}
635 		_inSaStr = file + ".sa";
636 		_inBwtStr = file + ".bwt";
637 		ofstream *saOut = NULL, *bwtOut = NULL;
638 		if(doSaFile) {
639 			saOut = new ofstream(_inSaStr.c_str(), ios::binary);
640 			if(!saOut->good()) {
641 				cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl
642 				     << "Please make sure the directory exists and that permissions allow writing by" << endl
643 				     << "Bowtie." << endl;
644 				throw 1;
645 			}
646 		}
647 		if(doBwtFile) {
648 			bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary);
649 			if(!bwtOut->good()) {
650 				cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl
651 				     << "Please make sure the directory exists and that permissions allow writing by" << endl
652 				     << "Bowtie." << endl;
653 				throw 1;
654 			}
655 		}
656 		// Build SA(T) and BWT(T) block by block
657 		initFromVector<TStr>(
658 			is,
659 			szs,
660 			sztot,
661 			refparams,
662 			fout1,
663 			fout2,
664 			file,
665 			saOut,
666 			bwtOut,
667 			nthreads,
668 			useBlockwise,
669 			bmax,
670 			bmaxSqrtMult,
671 			bmaxDivN,
672 			dcv,
673 			seed,
674 			verbose);
675 		// Close output files
676 		fout1.flush();
677 
678 		int64_t tellpSz1 = (int64_t)fout1.tellp();
679 		VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str());
680 		fout1.close();
681 		bool err = false;
682 		if(tellpSz1 > fileSize(_in1Str.c_str())) {
683 			err = true;
684 			cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
685 			     << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
686 		}
687 		fout2.flush();
688 
689 		int64_t tellpSz2 = (int64_t)fout2.tellp();
690 		VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str());
691 		fout2.close();
692 		if(tellpSz2 > fileSize(_in2Str.c_str())) {
693 			err = true;
694 			cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
695 			     << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
696 		}
697 
698 		if(saOut != NULL) {
699 			// Check on suffix array output file size
700 			int64_t tellpSzSa = (int64_t)saOut->tellp();
701 			VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str());
702 			saOut->close();
703 			delete saOut;
704 			if(tellpSzSa > fileSize(_inSaStr.c_str())) {
705 				err = true;
706 				cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa
707 				     << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl;
708 			}
709 		}
710 
711 		if(bwtOut != NULL) {
712 			// Check on suffix array output file size
713 			int64_t tellpSzBwt = (int64_t)bwtOut->tellp();
714 			VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str());
715 			bwtOut->close();
716 			delete bwtOut;
717 			if(tellpSzBwt > fileSize(_inBwtStr.c_str())) {
718 				err = true;
719 				cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt
720 				     << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl;
721 			}
722 		}
723 
724 		if(err) {
725 			cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
726 			throw 1;
727 		}
728 
729 		// Reopen as input streams
730 		VMSG_NL("Re-opening _in1 and _in2 as input streams");
731 		if(_sanity) {
732 			VMSG_NL("Sanity-checking Bt2");
733 			assert(!isInMemory());
734 			readIntoMemory(
735 				color,                       // colorspace?
736 				fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
737 				true,                        // load SA sample (_offs[])?
738 				true,                        // load ftab (_ftab[] & _eftab[])?
739 				true,                        // load r-starts (_rstarts[])?
740 				false,                       // just load header?
741 				NULL,                        // Params object to fill
742 				false,                       // mm sweep?
743 				true,                        // load names?
744 				false);                      // verbose startup?
745 			sanityCheckAll(refparams.reverse);
746 			evictFromMemory();
747 			assert(!isInMemory());
748 		}
749 		VMSG_NL("Returning from Ebwt constructor");
750 	}
751 
752 	/**
753 	 * Static constructor for a pair of forward/reverse indexes for the
754 	 * given reference string.
755 	 */
756 	template<typename TStr>
757 	static pair<Ebwt*, Ebwt*>
fromString(const char * str,bool packed,int color,int reverse,bool bigEndian,int32_t lineRate,int32_t offRate,int32_t ftabChars,const string & file,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose,bool autoMem,bool sanity)758 	fromString(
759 		const char* str,
760 		bool packed,
761 		int color,
762 		int reverse,
763 		bool bigEndian,
764 		int32_t lineRate,
765 		int32_t offRate,
766 		int32_t ftabChars,
767 		const string& file,
768 		bool useBlockwise,
769 		TIndexOffU bmax,
770 		TIndexOffU bmaxSqrtMult,
771 		TIndexOffU bmaxDivN,
772 		int dcv,
773 		uint32_t seed,
774 		bool verbose,
775 		bool autoMem,
776 		bool sanity)
777 		{
778 			EList<std::string> strs(EBWT_CAT);
779 			strs.push_back(std::string(str));
780 			return fromStrings<TStr>(
781 				strs,
782 				packed,
783 				color,
784 				reverse,
785 				bigEndian,
786 				lineRate,
787 				offRate,
788 				ftabChars,
789 				file,
790 				useBlockwise,
791 				bmax,
792 				bmaxSqrtMult,
793 				bmaxDivN,
794 				dcv,
795 				seed,
796 				verbose,
797 				autoMem,
798 				sanity);
799 		}
800 
801 	/**
802 	 * Static constructor for a pair of forward/reverse indexes for the
803 	 * given list of reference strings.
804 	 */
805 	template<typename TStr>
806 	static pair<Ebwt*, Ebwt*>
fromStrings(const EList<std::string> & strs,bool packed,int color,int reverse,bool bigEndian,int32_t lineRate,int32_t offRate,int32_t ftabChars,const string & file,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose,bool autoMem,bool sanity)807 	fromStrings(
808 		const EList<std::string>& strs,
809 		bool packed,
810 		int color,
811 		int reverse,
812 		bool bigEndian,
813 		int32_t lineRate,
814 		int32_t offRate,
815 		int32_t ftabChars,
816 		const string& file,
817 		bool useBlockwise,
818 		TIndexOffU bmax,
819 		TIndexOffU bmaxSqrtMult,
820 		TIndexOffU bmaxDivN,
821 		int dcv,
822 		uint32_t seed,
823 		bool verbose,
824 		bool autoMem,
825 		bool sanity)
826 		{
827 			assert(!strs.empty());
828 			EList<FileBuf*> is(EBWT_CAT);
829 			RefReadInParams refparams(color, REF_READ_FORWARD, false, false);
830 			// Adapt sequence strings to stringstreams open for input
831 			unique_ptr<stringstream> ss(new stringstream());
832 			for(TIndexOffU i = 0; i < strs.size(); i++) {
833 				(*ss) << ">" << i << endl << strs[i] << endl;
834 			}
835 			unique_ptr<FileBuf> fb(new FileBuf(ss.get()));
836 			assert(!fb->eof());
837 			assert(fb->get() == '>');
838 			ASSERT_ONLY(fb->reset());
839 			assert(!fb->eof());
840 			is.push_back(fb.get());
841 			// Vector for the ordered list of "records" comprising the input
842 			// sequences.  A record represents a stretch of unambiguous
843 			// characters in one of the input sequences.
844 			EList<RefRecord> szs(EBWT_CAT);
845 			std::pair<TIndexOffU, TIndexOffU> sztot;
846 			sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
847 			// Construct Ebwt from input strings and parameters
848 			Ebwt *ebwtFw = new Ebwt(
849 				TStr(),
850 				packed,
851 				refparams.color ? 1 : 0,
852 				-1,           // fw
853 				lineRate,
854 				offRate,      // suffix-array sampling rate
855 				ftabChars,    // number of chars in initial arrow-pair calc
856 				file,         // basename for .?.ebwt files
857 				true,         // fw?
858 				useBlockwise, // useBlockwise
859 				bmax,         // block size for blockwise SA builder
860 				bmaxSqrtMult, // block size as multiplier of sqrt(len)
861 				bmaxDivN,     // block size as divisor of len
862 				dcv,          // difference-cover period
863 				is,           // list of input streams
864 				szs,          // list of reference sizes
865 				sztot.first,  // total size of all unambiguous ref chars
866 				refparams,    // reference read-in parameters
867 				seed,         // pseudo-random number generator seed
868 				-1,           // override offRate
869 				verbose,      // be talkative
870 				autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
871 				sanity);      // verify results and internal consistency
872 			refparams.reverse = reverse;
873 			szs.clear();
874 			sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
875 			// Construct Ebwt from input strings and parameters
876 			Ebwt *ebwtBw = new Ebwt(
877 				TStr(),
878 				packed,
879 				refparams.color ? 1 : 0,
880 				reverse == REF_READ_REVERSE,
881 				lineRate,
882 				offRate,      // suffix-array sampling rate
883 				ftabChars,    // number of chars in initial arrow-pair calc
884 				file + ".rev",// basename for .?.ebwt files
885 				false,        // fw?
886 				useBlockwise, // useBlockwise
887 				bmax,         // block size for blockwise SA builder
888 				bmaxSqrtMult, // block size as multiplier of sqrt(len)
889 				bmaxDivN,     // block size as divisor of len
890 				dcv,          // difference-cover period
891 				is,           // list of input streams
892 				szs,          // list of reference sizes
893 				sztot.first,  // total size of all unambiguous ref chars
894 				refparams,    // reference read-in parameters
895 				seed,         // pseudo-random number generator seed
896 				-1,           // override offRate
897 				verbose,      // be talkative
898 				autoMem,      // pass exceptions up to the toplevel so that we can adjust memory settings automatically
899 				sanity);      // verify results and internal consistency
900 			return make_pair(ebwtFw, ebwtBw);
901 		}
902 
903 	/// Return true iff the Ebwt is packed
isPacked()904 	bool isPacked() { return packed_; }
905 
906 	/**
907 	 * Write the rstarts array given the szs array for the reference.
908 	 */
909 	void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);
910 
911 	/**
912 	 * Helper for the constructors above.  Takes a vector of text
913 	 * strings and joins them into a single string with a call to
914 	 * joinToDisk, which does a join (with padding) and writes some of
915 	 * the resulting data directly to disk rather than keep it in
916 	 * memory.  It then constructs a suffix-array producer (what kind
917 	 * depends on 'useBlockwise') for the resulting sequence.  The
918 	 * suffix-array producer can then be used to obtain chunks of the
919 	 * joined string's suffix array.
920 	 */
921 	template <typename TStr>
initFromVector(EList<FileBuf * > & is,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,ofstream & out1,ofstream & out2,const string & outfile,ofstream * saOut,ofstream * bwtOut,int nthreads,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose)922 	void initFromVector(EList<FileBuf*>& is,
923 	                    EList<RefRecord>& szs,
924 	                    TIndexOffU sztot,
925 	                    const RefReadInParams& refparams,
926 	                    ofstream& out1,
927 			    ofstream& out2,
928                             const string& outfile,
929 	                    ofstream* saOut,
930 	                    ofstream* bwtOut,
931                             int nthreads,
932 	                    bool useBlockwise,
933 	                    TIndexOffU bmax,
934 	                    TIndexOffU bmaxSqrtMult,
935 	                    TIndexOffU bmaxDivN,
936 	                    int dcv,
937 	                    uint32_t seed,
938 	                    bool verbose)
939 		{
940 			// Compose text strings into single string
941 			VMSG_NL("Calculating joined length");
942 			TStr s; // holds the entire joined reference after call to joinToDisk
943 			TIndexOffU jlen;
944 			jlen = joinedLen(szs);
945 			assert_geq(jlen, sztot);
946 			VMSG_NL("Writing header");
947 			writeFromMemory(true, out1, out2);
948 			try {
949 				VMSG_NL("Reserving space for joined string");
950 				s.resize(jlen);
951 				VMSG_NL("Joining reference sequences");
952 				if(refparams.reverse == REF_READ_REVERSE) {
953 					{
954 						Timer timer(cout, "  Time to join reference sequences: ", _verbose);
955 						joinToDisk(is, szs, sztot, refparams, s, out1, out2);
956 					}
957 					{
958 						Timer timer(cout, "  Time to reverse reference sequence: ", _verbose);
959 						EList<RefRecord> tmp(EBWT_CAT);
960 						s.reverse();
961 						reverseRefRecords(szs, tmp, false, verbose);
962 						szsToDisk(tmp, out1, refparams.reverse);
963 					}
964 				} else {
965 					Timer timer(cout, "  Time to join reference sequences: ", _verbose);
966 					joinToDisk(is, szs, sztot, refparams, s, out1, out2);
967 					szsToDisk(szs, out1, refparams.reverse);
968 				}
969 				// Joined reference sequence now in 's'
970 			} catch(bad_alloc& e) {
971 				// If we throw an allocation exception in the try block,
972 				// that means that the joined version of the reference
973 				// string itself is too larger to fit in memory.  The only
974 				// alternatives are to tell the user to give us more memory
975 				// or to try again with a packed representation of the
976 				// reference (if we haven't tried that already).
977 				cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
978 				if(!isPacked() && _passMemExc) {
979 					// Pass the exception up so that we can retry using a
980 					// packed string representation
981 					throw e;
982 				}
983 				// There's no point passing this exception on.  The fact
984 				// that we couldn't allocate the joined string means that
985 				// --bmax is irrelevant - the user should re-run with
986 				// ebwt-build-packed
987 				if(isPacked()) {
988 					cerr << "Please try running bowtie-build on a computer with more memory." << endl;
989 				} else {
990 					cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
991 					     << "mode (-a/--auto), or try again on a computer with more memory." << endl;
992 				}
993 				if(sizeof(void*) == 4) {
994 					cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
995 					     << "this executable is 32-bit." << endl;
996 				}
997 				throw 1;
998 			}
999 			// Succesfully obtained joined reference string
1000 			assert_geq(s.length(), jlen);
1001 			if(bmax != OFF_MASK) {
1002 				VMSG_NL("bmax according to bmax setting: " << bmax);
1003 			}
1004 			else if(bmaxSqrtMult != OFF_MASK) {
1005 				bmax *= bmaxSqrtMult;
1006 				VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
1007 			}
1008 			else if(bmaxDivN != OFF_MASK) {
1009 				bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
1010 				VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
1011 			}
1012 			else {
1013 				bmax = (TIndexOffU)sqrt(s.length());
1014 				VMSG_NL("bmax defaulted to: " << bmax);
1015 			}
1016 			int iter = 0;
1017 			bool first = true;
1018 			streampos out1pos = out1.tellp();
1019 			streampos out2pos = out2.tellp();
1020 			// Look for bmax/dcv parameters that work.
1021 			while(true) {
1022 				if(!first && bmax < 40 && _passMemExc) {
1023 					cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
1024 					if(!isPacked()) {
1025 						// Throw an exception exception so that we can
1026 						// retry using a packed string representation
1027 						throw bad_alloc();
1028 					} else {
1029 						cerr << "Already tried a packed string representation." << endl;
1030 					}
1031 					cerr << "Please try indexing this reference on a computer with more memory." << endl;
1032 					if(sizeof(void*) == 4) {
1033 						cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
1034 						     << "this executable is 32-bit." << endl;
1035 					}
1036 					throw 1;
1037 				}
1038 				if(!first) {
1039 					out1.seekp(out1pos);
1040 					out2.seekp(out2pos);
1041 				}
1042 				if(dcv > 4096) dcv = 4096;
1043 				if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
1044 					dcv <<= 1; // double difference-cover period
1045 				} else {
1046 					bmax -= (bmax >> 2); // reduce by 25%
1047 				}
1048 				VMSG("Using parameters --bmax " << bmax);
1049 				if(dcv == 0) {
1050 					VMSG_NL(" and *no difference cover*");
1051 				} else {
1052 					VMSG_NL(" --dcv " << dcv);
1053 				}
1054 				iter++;
1055 				try {
1056 					{
1057 						VMSG_NL("  Doing ahead-of-time memory usage test");
1058 						// Make a quick-and-dirty attempt to force a bad_alloc iff
1059 						// we would have thrown one eventually as part of
1060 						// constructing the DifferenceCoverSample
1061 						dcv <<= 1;
1062 						TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
1063 						if(nthreads > 1) sz *= (nthreads + 1);
1064 						AutoArray<uint8_t> tmp(sz, EBWT_CAT);
1065 						dcv >>= 1;
1066 						// Likewise with the KarkkainenBlockwiseSA
1067 						sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
1068 						AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
1069 						// Now throw in the 'ftab' and 'isaSample' structures
1070 						// that we'll eventually allocate in buildToDisk
1071 						AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2, EBWT_CAT);
1072 						AutoArray<uint8_t> side(_eh._sideSz, EBWT_CAT);
1073 						// Grab another 20 MB out of caution
1074 						AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
1075 						// If we made it here without throwing bad_alloc, then we
1076 						// passed the memory-usage stress test
1077 						VMSG("  Passed!  Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
1078 						if(isPacked()) {
1079 							VMSG(" --packed");
1080 						}
1081 						VMSG_NL("");
1082 					}
1083 					VMSG_NL("Constructing suffix-array element generator");
1084 					KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
1085 					assert(bsa.suffixItrIsReset());
1086 					assert_eq(bsa.size(), s.length()+1);
1087 					VMSG_NL("Converting suffix-array elements to index image");
1088 					buildToDisk(bsa, s, out1, out2, saOut, bwtOut);
1089 					out1.flush(); out2.flush();
1090 					bool failed = out1.fail() || out2.fail();
1091 					if(saOut != NULL) {
1092 						saOut->flush();
1093 						failed = failed || saOut->fail();
1094 					}
1095 					if(bwtOut != NULL) {
1096 						bwtOut->flush();
1097 						failed = failed || bwtOut->fail();
1098 					}
1099 					if(failed) {
1100 						cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
1101 						throw 1;
1102 					}
1103 					break;
1104 				} catch(bad_alloc& e) {
1105 					if(_passMemExc) {
1106 						VMSG_NL("  Ran out of memory; automatically trying more memory-economical parameters.");
1107 					} else {
1108 						cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
1109 						     << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
1110 						throw 1;
1111 					}
1112 				}
1113 				first = false;
1114 			}
1115 			assert(repOk());
1116 			// Now write reference sequence names on the end
1117 			assert_eq(this->_refnames.size(), this->_nPat);
1118 			for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
1119 				out1 << this->_refnames[i].c_str() << endl;
1120 			}
1121 			out1 << '\0';
1122 			out1.flush(); out2.flush();
1123 			if(out1.fail() || out2.fail()) {
1124 				cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
1125 				throw 1;
1126 			}
1127 			VMSG_NL("Returning from initFromVector");
1128 		}
1129 
1130 	/**
1131 	 * Return the length that the joined string of the given string
1132 	 * list will have.  Note that this is indifferent to how the text
1133 	 * fragments correspond to input sequences - it just cares about
1134 	 * the lengths of the fragments.
1135 	 */
joinedLen(EList<RefRecord> & szs)1136 	TIndexOffU joinedLen(EList<RefRecord>& szs) {
1137 		TIndexOffU ret = 0;
1138 		for(unsigned int i = 0; i < szs.size(); i++) {
1139 			ret += szs[i].len;
1140 		}
1141 		return ret;
1142 	}
1143 
1144 	/// Destruct an Ebwt
~Ebwt()1145 	~Ebwt() {
1146 		_fchr.reset();
1147 		_ftab.reset();
1148 		_eftab.reset();
1149 		_plen.reset();
1150 		_rstarts.reset();
1151 		_offs.reset();
1152 		_ebwt.reset();
1153 		if(offs() != NULL && useShmem_) {
1154 			FREE_SHARED(offs());
1155 		}
1156 		if(ebwt() != NULL && useShmem_) {
1157 			FREE_SHARED(ebwt());
1158 		}
1159 		if (_in1 != NULL) fclose(_in1);
1160 		if (_in2 != NULL) fclose(_in2);
1161 	}
1162 
1163 	/// Accessors
eh()1164 	inline const EbwtParams& eh() const     { return _eh; }
zOff()1165 	TIndexOffU    zOff() const         { return _zOff; }
zEbwtByteOff()1166 	TIndexOffU    zEbwtByteOff() const { return _zEbwtByteOff; }
zEbwtBpOff()1167 	TIndexOff         zEbwtBpOff() const   { return _zEbwtBpOff; }
nPat()1168 	TIndexOffU    nPat() const         { return _nPat; }
nFrag()1169 	TIndexOffU    nFrag() const        { return _nFrag; }
fchr()1170 	inline TIndexOffU*   fchr()              { return _fchr.get(); }
ftab()1171 	inline TIndexOffU*   ftab()              { return _ftab.get(); }
eftab()1172 	inline TIndexOffU*   eftab()             { return _eftab.get(); }
offs()1173 	inline TIndexOffU*   offs()              { return _offs.get(); }
plen()1174 	inline TIndexOffU*   plen()              { return _plen.get(); }
rstarts()1175 	inline TIndexOffU*   rstarts()           { return _rstarts.get(); }
ebwt()1176 	inline uint8_t*    ebwt()              { return _ebwt.get(); }
fchr()1177 	inline const TIndexOffU* fchr() const    { return _fchr.get(); }
ftab()1178 	inline const TIndexOffU* ftab() const    { return _ftab.get(); }
eftab()1179 	inline const TIndexOffU* eftab() const   { return _eftab.get(); }
offs()1180 	inline const TIndexOffU* offs() const    { return _offs.get(); }
plen()1181 	inline const TIndexOffU* plen() const    { return _plen.get(); }
rstarts()1182 	inline const TIndexOffU* rstarts() const { return _rstarts.get(); }
ebwt()1183 	inline const uint8_t*  ebwt() const    { return _ebwt.get(); }
verbose()1184 	bool        verbose() const      { return _verbose; }
sanityCheck()1185 	bool        sanityCheck() const  { return _sanity; }
refnames()1186 	EList<string>& refnames()        { return _refnames; }
fw()1187 	bool        fw() const           { return fw_; }
1188 #ifdef POPCNT_CAPABILITY
1189 	bool _usePOPCNTinstruction;
1190 #endif
1191 
1192 	/**
1193 	 * Returns true iff the index contains the given string (exactly).  The
1194 	 * given string must contain only unambiguous characters.  TODO:
1195 	 * support skipping of ambiguous characters.
1196 	 */
1197 	bool contains(
1198 		const BTDnaString& str,
1199 		TIndexOffU *top = NULL,
1200 		TIndexOffU *bot = NULL) const;
1201 
1202 	/**
1203 	 * Returns true iff the index contains the given string (exactly).  The
1204 	 * given string must contain only unambiguous characters.  TODO:
1205 	 * support skipping of ambiguous characters.
1206 	 */
1207 	bool contains(
1208 		const char *str,
1209 		TIndexOffU *top = NULL,
1210 		TIndexOffU *bot = NULL) const
1211 		{
1212 			return contains(BTDnaString(str, true), top, bot);
1213 		}
1214 
1215 	/// Return true iff the Ebwt is currently in memory
isInMemory()1216 	bool isInMemory() const {
1217 		if(ebwt() != NULL) {
1218 			// Note: We might have skipped loading _offs, _ftab,
1219 			// _eftab, and _rstarts depending on whether this is the
1220 			// reverse index and what algorithm is being used.
1221 			assert(_eh.repOk());
1222 			//assert(_ftab != NULL);
1223 			//assert(_eftab != NULL);
1224 			assert(fchr() != NULL);
1225 			//assert(_offs != NULL);
1226 			//assert(_rstarts != NULL);
1227 			assert_neq(_zEbwtByteOff, OFF_MASK);
1228 			assert_neq(_zEbwtBpOff, -1);
1229 			return true;
1230 		} else {
1231 			assert(ftab() == NULL);
1232 			assert(eftab() == NULL);
1233 			assert(fchr() == NULL);
1234 			assert(offs() == NULL);
1235 			assert(rstarts() == NULL);
1236 			assert_eq(_zEbwtByteOff, OFF_MASK);
1237 			assert_eq(_zEbwtBpOff, -1);
1238 			return false;
1239 		}
1240 	}
1241 
1242 	/// Return true iff the Ebwt is currently stored on disk
isEvicted()1243 	bool isEvicted() const {
1244 		return !isInMemory();
1245 	}
1246 
1247 	/**
1248 	 * Load this Ebwt into memory by reading it in from the _in1 and
1249 	 * _in2 streams.
1250 	 */
loadIntoMemory(int color,int needEntireReverse,bool loadSASamp,bool loadFtab,bool loadRstarts,bool loadNames,bool verbose)1251 	void loadIntoMemory(
1252 		int color,
1253 		int needEntireReverse,
1254 		bool loadSASamp,
1255 		bool loadFtab,
1256 		bool loadRstarts,
1257 		bool loadNames,
1258 		bool verbose)
1259 		{
1260 			readIntoMemory(
1261 				color,       // expect index to be colorspace?
1262 				needEntireReverse, // require reverse index to be concatenated reference reversed
1263 				loadSASamp,  // load the SA sample portion?
1264 				loadFtab,    // load the ftab (_ftab[] and _eftab[])?
1265 				loadRstarts, // load the r-starts (_rstarts[])?
1266 				false,       // stop after loading the header portion?
1267 				NULL,        // params
1268 				false,       // mmSweep
1269 				loadNames,   // loadNames
1270 				verbose);    // startVerbose
1271 		}
1272 
1273 	/**
1274 	 * Frees memory associated with the Ebwt.
1275 	 */
evictFromMemory()1276 	void evictFromMemory() {
1277 		assert(isInMemory());
1278 		_fchr.free();
1279 		_ftab.free();
1280 		_eftab.free();
1281 		_rstarts.free();
1282 		_offs.free(); // might not be under control of APtrWrap
1283 		_ebwt.free(); // might not be under control of APtrWrap
1284 		// Keep plen; it's small and the client may want to seq it
1285 		// even when the others are evicted.
1286 		//_plen  = NULL;
1287 		_zEbwtByteOff = OFF_MASK;
1288 		_zEbwtBpOff = -1;
1289 	}
1290 
1291 	/**
1292 	 * Turn a substring of 'seq' starting at offset 'off' and having
1293 	 * length equal to the index's 'ftabChars' into an int that can be
1294 	 * used to index into the ftab array.
1295 	 */
ftabSeqToInt(const BTDnaString & seq,size_t off,bool rev)1296 	TIndexOffU ftabSeqToInt(
1297 		const BTDnaString& seq,
1298 		size_t off,
1299 		bool rev) const
1300 		{
1301 			int fc = _eh._ftabChars;
1302 			size_t lo = off, hi = lo + fc;
1303 			assert_leq(hi, seq.length());
1304 			TIndexOffU ftabOff = 0;
1305 			for(int i = 0; i < fc; i++) {
1306 				bool fwex = fw();
1307 				if(rev) fwex = !fwex;
1308 				// We add characters to the ftabOff in the order they would
1309 				// have been consumed in a normal search.  For BWT, this
1310 				// means right-to-left order; for BWT' it's left-to-right.
1311 				int c = (fwex ? seq[lo + i] : seq[hi - i - 1]);
1312 				if(c > 3) {
1313 					return std::numeric_limits<TIndexOffU>::max();
1314 				}
1315 				assert_range(0, 3, c);
1316 				ftabOff <<= 2;
1317 				ftabOff |= c;
1318 			}
1319 			return ftabOff;
1320 		}
1321 
1322 	/**
1323 	 * Non-static facade for static function ftabHi.
1324 	 */
ftabHi(TIndexOffU i)1325 	TIndexOffU ftabHi(TIndexOffU i) const {
1326 		return Ebwt::ftabHi(
1327 			ftab(),
1328 			eftab(),
1329 			_eh._len,
1330 			_eh._ftabLen,
1331 			_eh._eftabLen,
1332 			i);
1333 	}
1334 
1335 	/**
1336 	 * Get "high interpretation" of ftab entry at index i.  The high
1337 	 * interpretation of a regular ftab entry is just the entry
1338 	 * itself.  The high interpretation of an extended entry is the
1339 	 * second correpsonding ui32 in the eftab.
1340 	 *
1341 	 * It's a static member because it's convenient to ask this
1342 	 * question before the Ebwt is fully initialized.
1343 	 */
ftabHi(const TIndexOffU * ftab,const TIndexOffU * eftab,TIndexOffU len,TIndexOffU ftabLen,TIndexOffU eftabLen,TIndexOffU i)1344 	static TIndexOffU ftabHi(
1345 		const TIndexOffU *ftab,
1346 		const TIndexOffU *eftab,
1347 		TIndexOffU len,
1348 		TIndexOffU ftabLen,
1349 		TIndexOffU eftabLen,
1350 		TIndexOffU i)
1351 		{
1352 			assert_lt(i, ftabLen);
1353 			if(ftab[i] <= len) {
1354 				return ftab[i];
1355 			} else {
1356 				TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
1357 				assert_lt(efIdx*2+1, eftabLen);
1358 				return eftab[efIdx*2+1];
1359 			}
1360 		}
1361 
1362 	/**
1363 	 * Non-static facade for static function ftabLo.
1364 	 */
ftabLo(TIndexOffU i)1365 	TIndexOffU ftabLo(TIndexOffU i) const {
1366 		return Ebwt::ftabLo(
1367 			ftab(),
1368 			eftab(),
1369 			_eh._len,
1370 			_eh._ftabLen,
1371 			_eh._eftabLen,
1372 			i);
1373 	}
1374 
1375 	/**
1376 	 * Get low bound of ftab range.
1377 	 */
ftabLo(const BTDnaString & seq,size_t off)1378 	TIndexOffU ftabLo(const BTDnaString& seq, size_t off) const {
1379 		return ftabLo(ftabSeqToInt(seq, off, false));
1380 	}
1381 
1382 	/**
1383 	 * Get high bound of ftab range.
1384 	 */
ftabHi(const BTDnaString & seq,size_t off)1385 	TIndexOffU ftabHi(const BTDnaString& seq, size_t off) const {
1386 		return ftabHi(ftabSeqToInt(seq, off, false));
1387 	}
1388 
1389 	/**
1390 	 * Extract characters from seq starting at offset 'off' and going either
1391 	 * forward or backward, depending on 'rev'.  Order matters when compiling
1392 	 * the integer that gets looked up in the ftab.  Each successive character
1393 	 * is ORed into the least significant bit-pair, and characters are
1394 	 * integrated in the direction of the search.
1395 	 */
1396 	bool
ftabLoHi(const BTDnaString & seq,size_t off,bool rev,TIndexOffU & top,TIndexOffU & bot)1397 	ftabLoHi(
1398 		const BTDnaString& seq, // sequence to extract from
1399 		size_t off,             // offset into seq to begin extracting
1400 		bool rev,               // reverse while extracting
1401 		TIndexOffU& top,
1402 		TIndexOffU& bot) const
1403 		{
1404 			TIndexOffU fi = ftabSeqToInt(seq, off, rev);
1405 			if(fi == std::numeric_limits<TIndexOffU>::max()) {
1406 				return false;
1407 			}
1408 			top = ftabHi(fi);
1409 			bot = ftabLo(fi+1);
1410 			assert_geq(bot, top);
1411 			return true;
1412 		}
1413 
1414 	/**
1415 	 * Get "low interpretation" of ftab entry at index i.  The low
1416 	 * interpretation of a regular ftab entry is just the entry
1417 	 * itself.  The low interpretation of an extended entry is the
1418 	 * first correpsonding ui32 in the eftab.
1419 	 *
1420 	 * It's a static member because it's convenient to ask this
1421 	 * question before the Ebwt is fully initialized.
1422 	 */
ftabLo(const TIndexOffU * ftab,const TIndexOffU * eftab,TIndexOffU len,TIndexOffU ftabLen,TIndexOffU eftabLen,TIndexOffU i)1423 	static TIndexOffU ftabLo(
1424 		const TIndexOffU *ftab,
1425 		const TIndexOffU *eftab,
1426 		TIndexOffU len,
1427 		TIndexOffU ftabLen,
1428 		TIndexOffU eftabLen,
1429 		TIndexOffU i)
1430 		{
1431 			assert_lt(i, ftabLen);
1432 			if(ftab[i] <= len) {
1433 				return ftab[i];
1434 			} else {
1435 				TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
1436 				assert_lt(efIdx*2+1, eftabLen);
1437 				return eftab[efIdx*2];
1438 			}
1439 		}
1440 
1441 	/**
1442 	 * Try to resolve the reference offset of the BW element 'elt'.  If
1443 	 * it can be resolved immediately, return the reference offset.  If
1444 	 * it cannot be resolved immediately, return max value.
1445 	 */
tryOffset(TIndexOffU elt)1446 	TIndexOffU tryOffset(TIndexOffU elt) const {
1447 		assert(offs() != NULL);
1448 		if(elt == _zOff) return 0;
1449 		if((elt & _eh._offMask) == elt) {
1450 			TIndexOffU eltOff = elt >> _eh._offRate;
1451 			assert_lt(eltOff, _eh._offsLen);
1452 			TIndexOffU off = offs()[eltOff];
1453 			assert_neq(OFF_MASK, off);
1454 			return off;
1455 		} else {
1456 			// Try looking at zoff
1457 			return OFF_MASK;
1458 		}
1459 	}
1460 
1461 	/**
1462 	 * Try to resolve the reference offset of the BW element 'elt' such
1463 	 * that the offset returned is at the right-hand side of the
1464 	 * forward reference substring involved in the hit.
1465 	 */
tryOffset(TIndexOffU elt,bool fw,TIndexOffU hitlen)1466 	TIndexOffU tryOffset(
1467 		TIndexOffU elt,
1468 		bool fw,
1469 		TIndexOffU hitlen) const
1470 		{
1471 			TIndexOffU off = tryOffset(elt);
1472 			if(off != OFF_MASK && !fw) {
1473 				assert_lt(off, _eh._len);
1474 				off = _eh._len - off - 1;
1475 				assert_geq(off, hitlen-1);
1476 				off -= (hitlen-1);
1477 				assert_lt(off, _eh._len);
1478 			}
1479 			return off;
1480 		}
1481 
1482 	/**
1483 	 * Walk 'steps' steps to the left and return the row arrived at.
1484 	 */
1485 	TIndexOffU walkLeft(TIndexOffU row, TIndexOffU steps) const;
1486 
1487 	/**
1488 	 * Resolve the reference offset of the BW element 'elt'.
1489 	 */
1490 	TIndexOffU getOffset(TIndexOffU row) const;
1491 
1492 	/**
1493 	 * Resolve the reference offset of the BW element 'elt' such that
1494 	 * the offset returned is at the right-hand side of the forward
1495 	 * reference substring involved in the hit.
1496 	 */
1497 	TIndexOffU getOffset(
1498 		TIndexOffU elt,
1499 		bool fw,
1500 		TIndexOffU hitlen) const;
1501 
1502 	/**
1503 	 * When using read() to create an Ebwt, we have to set a couple of
1504 	 * additional fields in the Ebwt object that aren't part of the
1505 	 * parameter list and are not stored explicitly in the file.  Right
1506 	 * now, this just involves initializing _zEbwtByteOff and
1507 	 * _zEbwtBpOff from _zOff.
1508 	 */
postReadInit(EbwtParams & eh)1509 	void postReadInit(EbwtParams& eh) {
1510 		TIndexOffU sideNum     = _zOff / eh._sideBwtLen;
1511 		TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
1512 		TIndexOffU sideByteOff = sideNum * eh._sideSz;
1513 		_zEbwtByteOff = sideCharOff >> 2;
1514 		assert_lt(_zEbwtByteOff, eh._sideBwtSz);
1515 		_zEbwtBpOff = sideCharOff & 3;
1516 		assert_lt(_zEbwtBpOff, 4);
1517 		_zEbwtByteOff += sideByteOff;
1518 		assert(repOk(eh)); // Ebwt should be fully initialized now
1519 	}
1520 
1521 	/**
1522 	 * Given basename of an Ebwt index, read and return its flag.
1523 	 */
1524 	static int32_t readFlags(const string& instr);
1525 
1526 	/**
1527 	 * Pretty-print the Ebwt to the given output stream.
1528 	 */
print(ostream & out)1529 	void print(ostream& out) const {
1530 		print(out, _eh);
1531 	}
1532 
1533 	/**
1534 	 * Pretty-print the Ebwt and given EbwtParams to the given output
1535 	 * stream.
1536 	 */
print(ostream & out,const EbwtParams & eh)1537 	void print(ostream& out, const EbwtParams& eh) const {
1538 		eh.print(out); // print params
1539 		out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
1540 		    << "    zOff: "         << _zOff << endl
1541 		    << "    zEbwtByteOff: " << _zEbwtByteOff << endl
1542 		    << "    zEbwtBpOff: "   << _zEbwtBpOff << endl
1543 		    << "    nPat: "  << _nPat << endl
1544 		    << "    plen: ";
1545 		if(plen() == NULL) {
1546 			out << "NULL" << endl;
1547 		} else {
1548 			out << "non-NULL, [0] = " << plen()[0] << endl;
1549 		}
1550 		out << "    rstarts: ";
1551 		if(rstarts() == NULL) {
1552 			out << "NULL" << endl;
1553 		} else {
1554 			out << "non-NULL, [0] = " << rstarts()[0] << endl;
1555 		}
1556 		out << "    ebwt: ";
1557 		if(ebwt() == NULL) {
1558 			out << "NULL" << endl;
1559 		} else {
1560 			out << "non-NULL, [0] = " << ebwt()[0] << endl;
1561 		}
1562 		out << "    fchr: ";
1563 		if(fchr() == NULL) {
1564 			out << "NULL" << endl;
1565 		} else {
1566 			out << "non-NULL, [0] = " << fchr()[0] << endl;
1567 		}
1568 		out << "    ftab: ";
1569 		if(ftab() == NULL) {
1570 			out << "NULL" << endl;
1571 		} else {
1572 			out << "non-NULL, [0] = " << ftab()[0] << endl;
1573 		}
1574 		out << "    eftab: ";
1575 		if(eftab() == NULL) {
1576 			out << "NULL" << endl;
1577 		} else {
1578 			out << "non-NULL, [0] = " << eftab()[0] << endl;
1579 		}
1580 		out << "    offs: ";
1581 		if(offs() == NULL) {
1582 			out << "NULL" << endl;
1583 		} else {
1584 			out << "non-NULL, [0] = " << offs()[0] << endl;
1585 		}
1586 	}
1587 
1588 	// Building
1589 	template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
1590 	template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
1591 	template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
1592 	template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut);
1593 
1594 	// I/O
1595 	void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
1596 	void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
1597 	void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
1598 
1599 	// Sanity checking
1600 	void sanityCheckUpToSide(TIndexOff upToSide) const;
1601 	void sanityCheckAll(int reverse) const;
1602 	void restore(SString<char>& s) const;
1603 	void checkOrigs(const EList<SString<char> >& os, bool color, bool mirror) const;
1604 
1605 	// Searching and reporting
1606 	void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen, bool rejectStraddle, bool& straddled) const;
1607 
1608 #define WITHIN_BWT_LEN(x)				\
1609 	assert_leq(x[0], this->_eh._sideBwtLen);	\
1610 	assert_leq(x[1], this->_eh._sideBwtLen);	\
1611 	assert_leq(x[2], this->_eh._sideBwtLen);	\
1612 	assert_leq(x[3], this->_eh._sideBwtLen)
1613 
1614 #define WITHIN_FCHR(x)				\
1615 	assert_leq(x[0], this->fchr()[1]);	\
1616 	assert_leq(x[1], this->fchr()[2]);	\
1617 	assert_leq(x[2], this->fchr()[3]);	\
1618 	assert_leq(x[3], this->fchr()[4])
1619 
1620 #define WITHIN_FCHR_DOLLARA(x)			\
1621 	assert_leq(x[0], this->fchr()[1]+1);	\
1622 	assert_leq(x[1], this->fchr()[2]);	\
1623 	assert_leq(x[2], this->fchr()[3]);	\
1624 	assert_leq(x[3], this->fchr()[4])
1625 
1626 	/**
1627 	 * Count all occurrences of character c from the beginning of the
1628 	 * forward side to <by,bp> and add in the occ[] count up to the side
1629 	 * break just prior to the side.
1630 	 *
1631 	 * A Bowtie 2 side is shaped like:
1632 	 *
1633 	 * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
1634 	 * --------48------ -4- -4- -4- -4-  (numbers in bytes)
1635 	 */
countBt2Side(const SideLocus & l,int c)1636 	inline TIndexOffU countBt2Side(const SideLocus& l, int c) const {
1637 		assert_range(0, 3, c);
1638 		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1639 		assert_range(0, 3, (int)l._bp);
1640 		const uint8_t *side = l.side(this->ebwt());
1641 		TIndexOffU cCnt = countUpTo(l, c);
1642 		assert_leq(cCnt, l.toBWRow());
1643 		assert_leq(cCnt, this->_eh._sideBwtLen);
1644 		if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1645 			// Adjust for the fact that we represented $ with an 'A', but
1646 			// shouldn't count it as an 'A' here
1647 			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1648 			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1649 			{
1650 				cCnt--; // Adjust for '$' looking like an 'A'
1651 			}
1652 		}
1653 		TIndexOffU ret;
1654 		// Now factor in the occ[] count at the side break
1655 		const uint8_t *acgt8 = side + _eh._sideBwtSz;
1656 		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt8);
1657 		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
1658 		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1659 		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1660 		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1661 		ret = endianizeU<TIndexOffU>(acgt[c], _switchEndian) + cCnt + this->fchr()[c];
1662 #ifndef NDEBUG
1663 		assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section
1664 		if(c == 0) {
1665 			assert_leq(cCnt, this->_eh._sideBwtLen);
1666 		} else {
1667 			assert_leq(ret, this->_eh._bwtLen);
1668 		}
1669 #endif
1670 		return ret;
1671 	}
1672 
1673 	/**
1674 	 * Count all occurrences of all four nucleotides up to the starting
1675 	 * point (which must be in a forward side) given by 'l' storing the
1676 	 * result in 'cntsUpto', then count nucleotide occurrences within the
1677 	 * range of length 'num' storing the result in 'cntsIn'.  Also, keep
1678 	 * track of the characters occurring within the range by setting
1679 	 * 'masks' accordingly (masks[1][10] == true -> 11th character is a
1680 	 * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false.
1681 	 */
countBt2SideRange(SideLocus & l,TIndexOffU num,TIndexOffU * cntsUpto,TIndexOffU * cntsIn,EList<bool> * masks)1682 	inline void countBt2SideRange(
1683 		SideLocus& l,        // top locus
1684 		TIndexOffU num,        // number of elts in range to tall // @double-check
1685 		TIndexOffU* cntsUpto, // A/C/G/T counts up to top
1686 		TIndexOffU* cntsIn,   // A/C/G/T counts within range
1687 		EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
1688 		{
1689 			assert_gt(num, 0);
1690 			assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1691 			assert_range(0, 3, (int)l._bp);
1692 			countUpToEx(l, cntsUpto);
1693 			WITHIN_FCHR_DOLLARA(cntsUpto);
1694 			WITHIN_BWT_LEN(cntsUpto);
1695 			const uint8_t *side = l.side(this->ebwt());
1696 			if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1697 				// Adjust for the fact that we represented $ with an 'A', but
1698 				// shouldn't count it as an 'A' here
1699 				if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1700 				   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1701 				{
1702 					cntsUpto[0]--; // Adjust for '$' looking like an 'A'
1703 				}
1704 			}
1705 			// Now factor in the occ[] count at the side break
1706 			const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(side + _eh._sideBwtSz);
1707 			assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
1708 			assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
1709 			assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
1710 			assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
1711 			assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
1712 			assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1713 			assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1714 			assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1715 			cntsUpto[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
1716 			cntsUpto[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
1717 			cntsUpto[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
1718 			cntsUpto[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
1719 			masks[0].resize(num);
1720 			masks[1].resize(num);
1721 			masks[2].resize(num);
1722 			masks[3].resize(num);
1723 			WITHIN_FCHR_DOLLARA(cntsUpto);
1724 			WITHIN_FCHR_DOLLARA(cntsIn);
1725 			// 'cntsUpto' is complete now.
1726 			// Walk forward until we've tallied the entire 'In' range
1727 			TIndexOffU nm = 0;
1728 			// Rest of this side
1729 			nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
1730 			assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1731 			assert_leq(nm, num);
1732 			SideLocus lcopy = l;
1733 			while(nm < num) {
1734 				// Subsequent sides, if necessary
1735 				lcopy.nextSide(this->_eh);
1736 				nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
1737 				WITHIN_FCHR_DOLLARA(cntsIn);
1738 				assert_leq(nm, num);
1739 				assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1740 			}
1741 			assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1742 			WITHIN_FCHR_DOLLARA(cntsIn);
1743 		}
1744 
1745 	/**
1746 	 * Count all occurrences of character c from the beginning of the
1747 	 * forward side to <by,bp> and add in the occ[] count up to the side
1748 	 * break just prior to the side.
1749 	 *
1750 	 * A forward side is shaped like:
1751 	 *
1752 	 * [A] [C] XXXXXXXXXXXXXXXX
1753 	 * -4- -4- --------56------ (numbers in bytes)
1754 	 *         ^
1755 	 *         Side ptr (result from SideLocus.side())
1756 	 *
1757 	 * And following it is a reverse side shaped like:
1758 	 *
1759 	 * [G] [T] XXXXXXXXXXXXXXXX
1760 	 * -4- -4- --------56------ (numbers in bytes)
1761 	 *         ^
1762 	 *         Side ptr (result from SideLocus.side())
1763 	 *
1764 	 */
countBt2SideEx(const SideLocus & l,TIndexOffU * arrs)1765 	inline void countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const {
1766 		assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1767 		assert_range(0, 3, (int)l._bp);
1768 		countUpToEx(l, arrs);
1769 		if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1770 			// Adjust for the fact that we represented $ with an 'A', but
1771 			// shouldn't count it as an 'A' here
1772 			if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1773 			   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1774 			{
1775 				arrs[0]--; // Adjust for '$' looking like an 'A'
1776 			}
1777 		}
1778 		WITHIN_FCHR(arrs);
1779 		WITHIN_BWT_LEN(arrs);
1780 		// Now factor in the occ[] count at the side break
1781 		const uint8_t *side = l.side(this->ebwt());
1782 		const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4;
1783 		const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt16);
1784 		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
1785 		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
1786 		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
1787 		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
1788 		assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
1789 		assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1790 		assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1791 		assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1792 		arrs[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
1793 		arrs[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
1794 		arrs[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
1795 		arrs[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
1796 		WITHIN_FCHR(arrs);
1797 	}
1798 
1799 	/**
1800 	 * Counts the number of occurrences of character 'c' in the given Ebwt
1801 	 * side up to (but not including) the given byte/bitpair (by/bp).
1802 	 *
1803 	 * This is a performance-critical function.  This is the top search-
1804 	 * related hit in the time profile.
1805 	 *
1806 	 * Function gets 11.09% in profile
1807 	 */
countUpTo(const SideLocus & l,int c)1808 	inline TIndexOffU countUpTo(const SideLocus& l, int c) const { // @double-check
1809 		// Count occurrences of c in each 64-bit (using bit trickery);
1810 		// Someday countInU64() and pop() functions should be
1811 		// vectorized/SSE-ized in case that helps.
1812 		TIndexOffU cCnt = 0;
1813 		const uint8_t *side = l.side(this->ebwt());
1814 		int i = 0;
1815 #ifdef POPCNT_CAPABILITY
1816 		if ( _usePOPCNTinstruction) {
1817 			for(; i + 7 < l._by; i += 8) {
1818 				cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
1819 			}
1820 		}
1821 		else {
1822 			for(; i + 7 < l._by; i += 8) {
1823 				cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
1824 			}
1825 		}
1826 #else
1827 		for(; i + 7 < l._by; i += 8) {
1828 			cCnt += countInU64(c, *(uint64_t*)&side[i]);
1829 		}
1830 #endif
1831 		// Count occurences of c in the rest of the side (using LUT)
1832 		for(; i < l._by; i++) {
1833 			cCnt += cCntLUT_4[0][c][side[i]];
1834 		}
1835 		// Count occurences of c in the rest of the byte
1836 		if(l._bp > 0) {
1837 			cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
1838 		}
1839 		return cCnt;
1840 	}
1841 
1842 	/**
1843 	 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
1844 	 * within a 64-bit argument.
1845 	 *
1846 	 * Function gets 2.32% in profile
1847 	 */
1848 #ifdef POPCNT_CAPABILITY
1849 	template<typename Operation>
1850 #endif
countInU64Ex(uint64_t dw,TIndexOffU * arrs)1851 	inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
1852 		uint64_t c0 = c_table[0];
1853 		uint64_t x0 = dw ^ c0;
1854 		uint64_t x1 = (x0 >> 1);
1855 		uint64_t x2 = x1 & (0x5555555555555555llu);
1856 		uint64_t x3 = x0 & x2;
1857 #ifdef POPCNT_CAPABILITY
1858 		uint64_t tmp = Operation().pop64(x3);
1859 #else
1860 		uint64_t tmp = pop64(x3);
1861 #endif
1862 		arrs[0] += (uint32_t) tmp;
1863 
1864 		c0 = c_table[1];
1865 		x0 = dw ^ c0;
1866 		x1 = (x0 >> 1);
1867 		x2 = x1 & (0x5555555555555555llu);
1868 		x3 = x0 & x2;
1869 #ifdef POPCNT_CAPABILITY
1870 		tmp = Operation().pop64(x3);
1871 #else
1872 		tmp = pop64(x3);
1873 #endif
1874 		arrs[1] += (uint32_t) tmp;
1875 
1876 		c0 = c_table[2];
1877 		x0 = dw ^ c0;
1878 		x1 = (x0 >> 1);
1879 		x2 = x1 & (0x5555555555555555llu);
1880 		x3 = x0 & x2;
1881 #ifdef POPCNT_CAPABILITY
1882 		tmp = Operation().pop64(x3);
1883 #else
1884 		tmp = pop64(x3);
1885 #endif
1886 		arrs[2] += (uint32_t) tmp;
1887 
1888 		c0 = c_table[3];
1889 		x0 = dw ^ c0;
1890 		x1 = (x0 >> 1);
1891 		x2 = x1 & (0x5555555555555555llu);
1892 		x3 = x0 & x2;
1893 #ifdef POPCNT_CAPABILITY
1894 		tmp = Operation().pop64(x3);
1895 #else
1896 		tmp = pop64(x3);
1897 #endif
1898 		arrs[3] += (uint32_t) tmp;
1899 	}
1900 
1901 	/**
1902 	 * Counts the number of occurrences of all four nucleotides in the
1903 	 * given side up to (but not including) the given byte/bitpair (by/bp).
1904 	 * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
1905 	 */
countUpToEx(const SideLocus & l,TIndexOffU * arrs)1906 	inline void countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
1907 		int i = 0;
1908 		// Count occurrences of each nucleotide in each 64-bit word using
1909 		// bit trickery; note: this seems does not seem to lend a
1910 		// significant boost to performance in practice.  If you comment
1911 		// out this whole loop (which won't affect correctness - it will
1912 		// just cause the following loop to take up the slack) then runtime
1913 		// does not change noticeably. Someday the countInU64() and pop()
1914 		// functions should be vectorized/SSE-ized in case that helps.
1915 		const uint8_t *side = l.side(this->ebwt());
1916 
1917 #ifdef POPCNT_CAPABILITY
1918 		if (_usePOPCNTinstruction) {
1919 			for(; i+7 < l._by; i += 8) {
1920 				countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
1921 			}
1922 		}
1923 		else {
1924 			for(; i+7 < l._by; i += 8) {
1925 				countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
1926 			}
1927 		}
1928 #else
1929 		for(; i+7 < l._by; i += 8) {
1930 			countInU64Ex(*(uint64_t*)&side[i], arrs);
1931 		}
1932 #endif
1933 		// Count occurences of nucleotides in the rest of the side (using LUT)
1934 		// Many cache misses on following lines (~20K)
1935 		for(; i < l._by; i++) {
1936 			arrs[0] += cCntLUT_4[0][0][side[i]];
1937 			arrs[1] += cCntLUT_4[0][1][side[i]];
1938 			arrs[2] += cCntLUT_4[0][2][side[i]];
1939 			arrs[3] += cCntLUT_4[0][3][side[i]];
1940 		}
1941 		// Count occurences of c in the rest of the byte
1942 		if(l._bp > 0) {
1943 			arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
1944 			arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
1945 			arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
1946 			arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
1947 		}
1948 	}
1949 
1950 #ifndef NDEBUG
1951 	/**
1952 	 * Given top and bot loci, calculate counts of all four DNA chars up to
1953 	 * those loci.  Used for more advanced backtracking-search.
1954 	 */
1955 	inline void mapLFEx(
1956 		const SideLocus& l,
1957 		TIndexOffU *arrs
1958 		ASSERT_ONLY(, bool overrideSanity = false)
1959 		) const
1960 		{
1961 			assert_eq(0, arrs[0]);
1962 			assert_eq(0, arrs[1]);
1963 			assert_eq(0, arrs[2]);
1964 			assert_eq(0, arrs[3]);
1965 			countBt2SideEx(l, arrs);
1966 			if(_sanity && !overrideSanity) {
1967 				// Make sure results match up with individual calls to mapLF;
1968 				// be sure to override sanity-checking in the callee, or we'll
1969 				// have infinite recursion
1970 				assert_eq(mapLF(l, 0, true), arrs[0]);
1971 				assert_eq(mapLF(l, 1, true), arrs[1]);
1972 				assert_eq(mapLF(l, 2, true), arrs[2]);
1973 				assert_eq(mapLF(l, 3, true), arrs[3]);
1974 			}
1975 		}
1976 #endif
1977 
1978 	/**
1979 	 * Given top and bot rows, calculate counts of all four DNA chars up to
1980 	 * those loci.
1981 	 */
1982 	inline void mapLFEx(
1983 		TIndexOffU top,
1984 		TIndexOffU bot,
1985 		TIndexOffU *tops,
1986 		TIndexOffU *bots
1987 		ASSERT_ONLY(, bool overrideSanity = false)
1988 		) const
1989 		{
1990 			SideLocus ltop, lbot;
1991 			SideLocus::initFromTopBot(top, bot, _eh, ebwt(), ltop, lbot);
1992 			mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
1993 		}
1994 
1995 	/**
1996 	 * Given top and bot loci, calculate counts of all four DNA chars up to
1997 	 * those loci.  Used for more advanced backtracking-search.
1998 	 */
1999 	inline void mapLFEx(
2000 		const SideLocus& ltop,
2001 		const SideLocus& lbot,
2002 		TIndexOffU *tops,
2003 		TIndexOffU *bots
2004 		ASSERT_ONLY(, bool overrideSanity = false)
2005 		) const
2006 		{
2007 			assert(ltop.repOk(this->eh()));
2008 			assert(lbot.repOk(this->eh()));
2009 			assert_eq(0, tops[0]); assert_eq(0, bots[0]);
2010 			assert_eq(0, tops[1]); assert_eq(0, bots[1]);
2011 			assert_eq(0, tops[2]); assert_eq(0, bots[2]);
2012 			assert_eq(0, tops[3]); assert_eq(0, bots[3]);
2013 			countBt2SideEx(ltop, tops);
2014 			countBt2SideEx(lbot, bots);
2015 #ifndef NDEBUG
2016 			if(_sanity && !overrideSanity) {
2017 				// Make sure results match up with individual calls to mapLF;
2018 				// be sure to override sanity-checking in the callee, or we'll
2019 				// have infinite recursion
2020 				assert_eq(mapLF(ltop, 0, true), tops[0]);
2021 				assert_eq(mapLF(ltop, 1, true), tops[1]);
2022 				assert_eq(mapLF(ltop, 2, true), tops[2]);
2023 				assert_eq(mapLF(ltop, 3, true), tops[3]);
2024 				assert_eq(mapLF(lbot, 0, true), bots[0]);
2025 				assert_eq(mapLF(lbot, 1, true), bots[1]);
2026 				assert_eq(mapLF(lbot, 2, true), bots[2]);
2027 				assert_eq(mapLF(lbot, 3, true), bots[3]);
2028 			}
2029 #endif
2030 		}
2031 
2032 	/**
2033 	 * Counts the number of occurrences of all four nucleotides in the
2034 	 * given side from the given byte/bitpair (l->_by/l->_bp) (or the
2035 	 * beginning of the side if l == 0).  Count for 'a' goes in arrs[0],
2036 	 * 'c' in arrs[1], etc.
2037 	 *
2038 	 * Note: must account for $.
2039 	 *
2040 	 * Must fill in masks
2041 	 */
countBt2SideRange2(const SideLocus & l,bool startAtLocus,TIndexOffU num,TIndexOffU * arrs,EList<bool> * masks,TIndexOffU maskOff)2042 	inline TIndexOffU countBt2SideRange2( // @double-check
2043 		const SideLocus& l,
2044 		bool startAtLocus,
2045 		TIndexOffU num,
2046 		TIndexOffU* arrs,
2047 		EList<bool> *masks,
2048 		TIndexOffU maskOff) const
2049 		{
2050 			assert(!masks[0].empty());
2051 			assert_eq(masks[0].size(), masks[1].size());
2052 			assert_eq(masks[0].size(), masks[2].size());
2053 			assert_eq(masks[0].size(), masks[3].size());
2054 			ASSERT_ONLY(TIndexOffU myarrs[4] = {0, 0, 0, 0});
2055 			TIndexOffU nm = 0; // number of nucleotides tallied so far
2056 			int iby = 0;      // initial byte offset
2057 			int ibp = 0;      // initial base-pair offset
2058 			if(startAtLocus) {
2059 				iby = l._by;
2060 				ibp = l._bp;
2061 			} else {
2062 				// Start at beginning
2063 			}
2064 			int by = iby, bp = ibp;
2065 			assert_lt(bp, 4);
2066 			assert_lt(by, (int)this->_eh._sideBwtSz);
2067 			const uint8_t *side = l.side(this->ebwt());
2068 			while(nm < num) {
2069 				int c = (side[by] >> (bp * 2)) & 3;
2070 				assert_lt(maskOff + nm, masks[c].size());
2071 				masks[0][maskOff + nm] = masks[1][maskOff + nm] =
2072 					masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
2073 				assert_range(0, 3, c);
2074 				// Note: we tally $ just like an A
2075 				arrs[c]++; // tally it
2076 				ASSERT_ONLY(myarrs[c]++);
2077 				masks[c][maskOff + nm] = true; // not dead
2078 				nm++;
2079 				if(++bp == 4) {
2080 					bp = 0;
2081 					by++;
2082 					assert_leq(by, (int)this->_eh._sideBwtSz);
2083 					if(by == (int)this->_eh._sideBwtSz) {
2084 						// Fell off the end of the side
2085 						break;
2086 					}
2087 				}
2088 			}
2089 			WITHIN_FCHR_DOLLARA(arrs);
2090 #ifndef NDEBUG
2091 			if(_sanity) {
2092 				// Make sure results match up with a call to mapLFEx.
2093 				TIndexOffU tops[4] = {0, 0, 0, 0};
2094 				TIndexOffU bots[4] = {0, 0, 0, 0};
2095 				TIndexOffU top = l.toBWRow();
2096 				TIndexOffU bot = top + nm;
2097 				mapLFEx(top, bot, tops, bots, false);
2098 				assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
2099 				assert_eq(myarrs[1], bots[1] - tops[1]);
2100 				assert_eq(myarrs[2], bots[2] - tops[2]);
2101 				assert_eq(myarrs[3], bots[3] - tops[3]);
2102 			}
2103 #endif
2104 			return nm;
2105 		}
2106 
2107 	/**
2108 	 * Return the final character in row i (i.e. the i'th character in the
2109 	 * BWT transform).  Note that the 'L' in the name of the function
2110 	 * stands for 'last', as in the literature.
2111 	 */
rowL(const SideLocus & l)2112 	inline int rowL(const SideLocus& l) const {
2113 		// Extract and return appropriate bit-pair
2114 		return unpack_2b_from_8b(l.side(this->ebwt())[l._by], l._bp);
2115 	}
2116 
2117 	/**
2118 	 * Return the final character in row i (i.e. the i'th character in the
2119 	 * BWT transform).  Note that the 'L' in the name of the function
2120 	 * stands for 'last', as in the literature.
2121 	 */
rowL(TIndexOffU i)2122 	inline int rowL(TIndexOffU i) const {
2123 		// Extract and return appropriate bit-pair
2124 		SideLocus l;
2125 		l.initFromRow(i, _eh, ebwt());
2126 		return rowL(l);
2127 	}
2128 
2129 	/**
2130 	 * Given top and bot loci, calculate counts of all four DNA chars up to
2131 	 * those loci.  Used for more advanced backtracking-search.
2132 	 */
2133 	inline void mapLFRange(
2134 		SideLocus& ltop,
2135 		SideLocus& lbot,
2136 		TIndexOffU num,        // Number of elts
2137 		TIndexOffU* cntsUpto,  // A/C/G/T counts up to top
2138 		TIndexOffU* cntsIn,    // A/C/G/T counts within range
2139 		EList<bool> *masks
2140 		ASSERT_ONLY(, bool overrideSanity = false)
2141 		) const
2142 		{
2143 			assert(ltop.repOk(this->eh()));
2144 			assert(lbot.repOk(this->eh()));
2145 			assert_eq(num, lbot.toBWRow() - ltop.toBWRow());
2146 			assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
2147 			assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
2148 			assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
2149 			assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
2150 			countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
2151 			assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
2152 #ifndef NDEBUG
2153 			if(_sanity && !overrideSanity) {
2154 				// Make sure results match up with individual calls to mapLF;
2155 				// be sure to override sanity-checking in the callee, or we'll
2156 				// have infinite recursion
2157 				TIndexOffU tops[4] = {0, 0, 0, 0};
2158 				TIndexOffU bots[4] = {0, 0, 0, 0};
2159 				assert(ltop.repOk(this->eh()));
2160 				assert(lbot.repOk(this->eh()));
2161 				mapLFEx(ltop, lbot, tops, bots, false);
2162 				for(int i = 0; i < 4; i++) {
2163 					assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
2164 					if(i == 0) {
2165 						assert(cntsIn[i] == bots[i]-tops[i] ||
2166 						       cntsIn[i] == bots[i]-tops[i]+1);
2167 					} else {
2168 						assert_eq(cntsIn[i], bots[i]-tops[i]);
2169 					}
2170 				}
2171 			}
2172 #endif
2173 		}
2174 
2175 	/**
2176 	 * Given row i, return the row that the LF mapping maps i to.
2177 	 */
2178 	inline TIndexOffU mapLF(
2179 		const SideLocus& l
2180 		ASSERT_ONLY(, bool overrideSanity = false)
2181 		) const
2182 		{
2183 			ASSERT_ONLY(TIndexOffU srcrow = l.toBWRow());
2184 			TIndexOffU ret;
2185 			assert(l.side(this->ebwt()) != NULL);
2186 			int c = rowL(l);
2187 			assert_lt(c, 4);
2188 			assert_geq(c, 0);
2189 			ret = countBt2Side(l, c);
2190 			assert_lt(ret, this->_eh._bwtLen);
2191 			assert_neq(srcrow, ret);
2192 #ifndef NDEBUG
2193 			if(_sanity && !overrideSanity) {
2194 				// Make sure results match up with results from mapLFEx;
2195 				// be sure to override sanity-checking in the callee, or we'll
2196 				// have infinite recursion
2197 				TIndexOffU arrs[] = { 0, 0, 0, 0 };
2198 				mapLFEx(l, arrs, true);
2199 				assert_eq(arrs[c], ret);
2200 			}
2201 #endif
2202 			return ret;
2203 		}
2204 
2205 	/**
2206 	 * Given row i and character c, return the row that the LF mapping maps
2207 	 * i to on character c.
2208 	 */
2209 	inline TIndexOffU mapLF(
2210 		const SideLocus& l, int c
2211 		ASSERT_ONLY(, bool overrideSanity = false)
2212 		) const
2213 		{
2214 			TIndexOffU ret;
2215 			assert_lt(c, 4);
2216 			assert_geq(c, 0);
2217 			ret = countBt2Side(l, c);
2218 			assert_lt(ret, this->_eh._bwtLen);
2219 #ifndef NDEBUG
2220 			if(_sanity && !overrideSanity) {
2221 				// Make sure results match up with results from mapLFEx;
2222 				// be sure to override sanity-checking in the callee, or we'll
2223 				// have infinite recursion
2224 				TIndexOffU arrs[] = { 0, 0, 0, 0 };
2225 				mapLFEx(l, arrs, true);
2226 				assert_eq(arrs[c], ret);
2227 			}
2228 #endif
2229 			return ret;
2230 		}
2231 
2232 	/**
2233 	 * Given top and bot loci, calculate counts of all four DNA chars up to
2234 	 * those loci.  Also, update a set of tops and bots for the reverse
2235 	 * index/direction using the idea from the bi-directional BWT paper.
2236 	 */
2237 	inline void mapBiLFEx(
2238 		const SideLocus& ltop,
2239 		const SideLocus& lbot,
2240 		TIndexOffU *tops,
2241 		TIndexOffU *bots,
2242 		TIndexOffU *topsP, // topsP[0] = top
2243 		TIndexOffU *botsP
2244 		ASSERT_ONLY(, bool overrideSanity = false)
2245 		) const
2246 		{
2247 #ifndef NDEBUG
2248 			for(int i = 0; i < 4; i++) {
2249 				assert_eq(0, tops[0]);  assert_eq(0, bots[0]);
2250 			}
2251 #endif
2252 			countBt2SideEx(ltop, tops);
2253 			countBt2SideEx(lbot, bots);
2254 #ifndef NDEBUG
2255 			if(_sanity && !overrideSanity) {
2256 				// Make sure results match up with individual calls to mapLF;
2257 				// be sure to override sanity-checking in the callee, or we'll
2258 				// have infinite recursion
2259 				assert_eq(mapLF(ltop, 0, true), tops[0]);
2260 				assert_eq(mapLF(ltop, 1, true), tops[1]);
2261 				assert_eq(mapLF(ltop, 2, true), tops[2]);
2262 				assert_eq(mapLF(ltop, 3, true), tops[3]);
2263 				assert_eq(mapLF(lbot, 0, true), bots[0]);
2264 				assert_eq(mapLF(lbot, 1, true), bots[1]);
2265 				assert_eq(mapLF(lbot, 2, true), bots[2]);
2266 				assert_eq(mapLF(lbot, 3, true), bots[3]);
2267 			}
2268 #endif
2269 			// bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
2270 			// A, C, G, T
2271 			botsP[0] = topsP[0] + (bots[0] - tops[0]);
2272 			topsP[1] = botsP[0];
2273 			botsP[1] = topsP[1] + (bots[1] - tops[1]);
2274 			topsP[2] = botsP[1];
2275 			botsP[2] = topsP[2] + (bots[2] - tops[2]);
2276 			topsP[3] = botsP[2];
2277 			botsP[3] = topsP[3] + (bots[3] - tops[3]);
2278 		}
2279 
2280 	/**
2281 	 * Given row and its locus information, proceed on the given character
2282 	 * and return the next row, or all-fs if we can't proceed on that
2283 	 * character.  Returns max value if this row ends in $.
2284 	 */
2285 	inline TIndexOffU mapLF1(
2286 		TIndexOffU row,       // starting row
2287 		const SideLocus& l, // locus for starting row
2288 		int c               // character to proceed on
2289 		ASSERT_ONLY(, bool overrideSanity = false)
2290 		) const
2291 		{
2292 			if(rowL(l) != c || row == _zOff) return OFF_MASK;
2293 			TIndexOffU ret;
2294 			assert_lt(c, 4);
2295 			assert_geq(c, 0);
2296 			ret = countBt2Side(l, c);
2297 			assert_lt(ret, this->_eh._bwtLen);
2298 #ifndef NDEBUG
2299 			if(_sanity && !overrideSanity) {
2300 				// Make sure results match up with results from mapLFEx;
2301 				// be sure to override sanity-checking in the callee, or we'll
2302 				// have infinite recursion
2303 				TIndexOffU arrs[] = { 0, 0, 0, 0 };
2304 				mapLFEx(l, arrs, true);
2305 				assert_eq(arrs[c], ret);
2306 			}
2307 #endif
2308 			return ret;
2309 		}
2310 
2311 
2312 	/**
2313 	 * Given row and its locus information, set the row to LF(row) and
2314 	 * return the character that was in the final column.
2315 	 */
2316 	inline int mapLF1(
2317 		TIndexOffU& row,      // starting row
2318 		const SideLocus& l  // locus for starting row
2319 		ASSERT_ONLY(, bool overrideSanity = false)
2320 		) const
2321 		{
2322 			if(row == _zOff) return -1;
2323 			int c = rowL(l);
2324 			assert_range(0, 3, c);
2325 			row = countBt2Side(l, c);
2326 			assert_lt(row, this->_eh._bwtLen);
2327 #ifndef NDEBUG
2328 			if(_sanity && !overrideSanity) {
2329 				// Make sure results match up with results from mapLFEx;
2330 				// be sure to override sanity-checking in the callee, or we'll
2331 				// have infinite recursion
2332 				TIndexOffU arrs[] = { 0, 0, 0, 0 };
2333 				mapLFEx(l, arrs, true);
2334 				assert_eq(arrs[c], row);
2335 			}
2336 #endif
2337 			return c;
2338 		}
2339 
2340 #ifndef NDEBUG
2341 	/// Check that in-memory Ebwt is internally consistent with respect
2342 	/// to given EbwtParams; assert if not
inMemoryRepOk(const EbwtParams & eh)2343 	bool inMemoryRepOk(const EbwtParams& eh) const {
2344 		assert_geq(_zEbwtBpOff, 0);
2345 		assert_lt(_zEbwtBpOff, 4);
2346 		assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
2347 		assert_lt(_zOff, eh._bwtLen);
2348 		assert_geq(_nFrag, _nPat);
2349 		return true;
2350 	}
2351 
2352 	/// Check that in-memory Ebwt is internally consistent; assert if
2353 	/// not
inMemoryRepOk()2354 	bool inMemoryRepOk() const {
2355 		return repOk(_eh);
2356 	}
2357 
2358 	/// Check that Ebwt is internally consistent with respect to given
2359 	/// EbwtParams; assert if not
repOk(const EbwtParams & eh)2360 	bool repOk(const EbwtParams& eh) const {
2361 		assert(_eh.repOk());
2362 		if(isInMemory()) {
2363 			return inMemoryRepOk(eh);
2364 		}
2365 		return true;
2366 	}
2367 
2368 	/// Check that Ebwt is internally consistent; assert if not
repOk()2369 	bool repOk() const {
2370 		return repOk(_eh);
2371 	}
2372 #endif
2373 
2374 	bool       _switchEndian;
2375 	int32_t    _overrideOffRate;
2376 	bool       _verbose;
2377 	bool       _passMemExc;
2378 	bool       _sanity;
2379 	bool       fw_;     // true iff this is a forward index
2380 	FILE       *_in1;    // input fd for primary index file
2381 	FILE       *_in2;    // input fd for secondary index file
2382 	string     _in1Str; // filename for primary index file
2383 	string     _in2Str; // filename for secondary index file
2384 	string     _inSaStr;  // filename for suffix-array file
2385 	string     _inBwtStr; // filename for BWT file
2386 	TIndexOffU  _zOff;
2387 	TIndexOffU  _zEbwtByteOff;
2388 	TIndexOff   _zEbwtBpOff;
2389 	TIndexOffU  _nPat;  /// number of reference texts
2390 	TIndexOffU  _nFrag; /// number of fragments
2391 	APtrWrap<TIndexOffU> _plen;
2392 	APtrWrap<TIndexOffU> _rstarts; // starting offset of fragments / text indexes
2393 	// _fchr, _ftab and _eftab are expected to be relatively small
2394 	// (usually < 1MB, perhaps a few MB if _fchr is particularly large
2395 	// - like, say, 11).  For this reason, we don't bother with writing
2396 	// them to disk through separate output streams; we
2397 	APtrWrap<TIndexOffU> _fchr;
2398 	APtrWrap<TIndexOffU> _ftab;
2399 	APtrWrap<TIndexOffU> _eftab; // "extended" entries for _ftab
2400 	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
2401 	// offset every 16 rows), the total size of _offs is the same as
2402 	// the total size of the input sequence
2403 	APtrWrap<TIndexOffU> _offs;
2404 	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
2405 	// is at least as large as the input sequence.
2406 	APtrWrap<uint8_t> _ebwt;
2407 	bool       _useMm;        /// use memory-mapped files to hold the index
2408 	bool       useShmem_;     /// use shared memory to hold large parts of the index
2409 	EList<string> _refnames; /// names of the reference sequences
2410 	char *mmFile1_;
2411 	char *mmFile2_;
2412 	EbwtParams _eh;
2413 	bool packed_;
2414 
2415 	static const TIndexOffU default_bmax = OFF_MASK;
2416 	static const TIndexOffU default_bmaxMultSqrt = OFF_MASK;
2417 	static const TIndexOffU default_bmaxDivN = 4;
2418 	static const int      default_dcv = 1024;
2419 	static const bool     default_noDc = false;
2420 	static const bool     default_useBlockwise = true;
2421 	static const uint32_t default_seed = 0;
2422 #ifdef BOWTIE_64BIT_INDEX
2423 	static const int      default_lineRate = 7;
2424 #else
2425 	static const int      default_lineRate = 6;
2426 #endif
2427 	static const int      default_offRate = 5;
2428 	static const int      default_offRatePlus = 0;
2429 	static const int      default_ftabChars = 10;
2430 	static const bool     default_bigEndian = false;
2431 
2432 private:
2433 
log()2434 	ostream& log() const {
2435 		return cout; // TODO: turn this into a parameter
2436 	}
2437 
2438 	/// Print a verbose message and flush (flushing is helpful for
2439 	/// debugging)
verbose(const string & s)2440 	void verbose(const string& s) const {
2441 		if(this->verbose()) {
2442 			this->log() << s.c_str();
2443 			this->log().flush();
2444 		}
2445 	}
2446 };
2447 
2448 /**
2449  * Read reference names from an input stream 'in' for an Ebwt primary
2450  * file and store them in 'refnames'.
2451  */
2452 void readEbwtRefnames(FILE* fin, EList<string>& refnames);
2453 
2454 /**
2455  * Read reference names from the index with basename 'in' and store
2456  * them in 'refnames'.
2457  */
2458 void readEbwtRefnames(const string& instr, EList<string>& refnames);
2459 
2460 /**
2461  * Read just enough of the Ebwt's header to determine whether it's
2462  * colorspace.
2463  */
2464 bool readEbwtColor(const string& instr);
2465 
2466 /**
2467  * Read just enough of the Ebwt's header to determine whether it's
2468  * entirely reversed.
2469  */
2470 bool readEntireReverse(const string& instr);
2471 
2472 ///////////////////////////////////////////////////////////////////////
2473 //
2474 // Functions for building Ebwts
2475 //
2476 ///////////////////////////////////////////////////////////////////////
2477 
2478 /**
2479  * Join several text strings together in a way that's compatible with
2480  * the text-chunking scheme dictated by chunkRate parameter.
2481  *
2482  * The non-static member Ebwt::join additionally builds auxilliary
2483  * arrays that maintain a mapping between chunks in the joined string
2484  * and the original text strings.
2485  */
2486 template<typename TStr>
join(EList<TStr> & l,uint32_t seed)2487 TStr Ebwt::join(EList<TStr>& l, uint32_t seed) {
2488 	RandomSource rand; // reproducible given same seed
2489 	rand.init(seed);
2490 	TStr ret;
2491 	TIndexOffU guessLen = 0;
2492 	for(TIndexOffU i = 0; i < l.size(); i++) {
2493 		guessLen += length(l[i]);
2494 	}
2495 	ret.resize(guessLen);
2496 	TIndexOffU off = 0;
2497 	for(size_t i = 0; i < l.size(); i++) {
2498 		TStr& s = l[i];
2499 		assert_gt(s.length(), 0);
2500 		for(size_t j = 0; j < s.size(); j++) {
2501 			ret.set(s[j], off++);
2502 		}
2503 	}
2504 	return ret;
2505 }
2506 
2507 /**
2508  * Join several text strings together in a way that's compatible with
2509  * the text-chunking scheme dictated by chunkRate parameter.
2510  *
2511  * The non-static member Ebwt::join additionally builds auxilliary
2512  * arrays that maintain a mapping between chunks in the joined string
2513  * and the original text strings.
2514  */
2515 template<typename TStr>
join(EList<FileBuf * > & l,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,uint32_t seed)2516 TStr Ebwt::join(EList<FileBuf*>& l,
2517                 EList<RefRecord>& szs,
2518                 TIndexOffU sztot,
2519                 const RefReadInParams& refparams,
2520                 uint32_t seed)
2521 {
2522 	RandomSource rand; // reproducible given same seed
2523 	rand.init(seed);
2524 	RefReadInParams rpcp = refparams;
2525 	TStr ret;
2526 	TIndexOffU guessLen = sztot;
2527 	ret.resize(guessLen);
2528 	ASSERT_ONLY(TIndexOffU szsi = 0);
2529 	TIndexOffU dstoff = 0;
2530 	for(TIndexOffU i = 0; i < l.size(); i++) {
2531 		// For each sequence we can pull out of istream l[i]...
2532 		assert(!l[i]->eof());
2533 		bool first = true;
2534 		while(!l[i]->eof()) {
2535 			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp);
2536 			first = false;
2537 			if(rec.first && rec.len == 0) {
2538 				continue;
2539 			}
2540 			TIndexOffU bases = rec.len;
2541 			assert_eq(rec.off, szs[szsi].off);
2542 			assert_eq(rec.len, szs[szsi].len);
2543 			assert_eq(rec.first, szs[szsi].first);
2544 			ASSERT_ONLY(szsi++);
2545 			if(bases == 0) continue;
2546 		}
2547 	}
2548 	return ret;
2549 }
2550 
2551 /**
2552  * Join several text strings together according to the text-chunking
2553  * scheme specified in the EbwtParams.  Ebwt fields calculated in this
2554  * function are written directly to disk.
2555  *
2556  * It is assumed, but not required, that the header values have already
2557  * been written to 'out1' before this function is called.
2558  *
2559  * The static member Ebwt::join just returns a joined version of a
2560  * list of strings without building any of the auxilliary arrays.
2561  */
2562 template<typename TStr>
joinToDisk(EList<FileBuf * > & l,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,TStr & ret,ostream & out1,ostream & out2)2563 void Ebwt::joinToDisk(
2564 	EList<FileBuf*>& l,
2565 	EList<RefRecord>& szs,
2566 	TIndexOffU sztot,
2567 	const RefReadInParams& refparams,
2568 	TStr& ret,
2569 	ostream& out1,
2570 	ostream& out2)
2571 {
2572 	RefReadInParams rpcp = refparams;
2573 	assert_gt(szs.size(), 0);
2574 	assert_gt(l.size(), 0);
2575 	assert_gt(sztot, 0);
2576 	// Not every fragment represents a distinct sequence - many
2577 	// fragments may correspond to a single sequence.  Count the
2578 	// number of sequences here by counting the number of "first"
2579 	// fragments.
2580 	this->_nPat = 0;
2581 	this->_nFrag = 0;
2582 	for(TIndexOffU i = 0; i < szs.size(); i++) {
2583 		if(szs[i].len > 0) this->_nFrag++;
2584 		if(szs[i].first && szs[i].len > 0) this->_nPat++;
2585 	}
2586 	assert_gt(this->_nPat, 0);
2587 	assert_geq(this->_nFrag, this->_nPat);
2588 	_rstarts.reset();
2589 	writeU<TIndexOffU>(out1, this->_nPat, _switchEndian);
2590 	// Allocate plen[]
2591 	try {
2592 		this->_plen.init(new TIndexOffU[this->_nPat], this->_nPat);
2593 	} catch(bad_alloc& e) {
2594 		cerr << "Out of memory allocating plen[] in Ebwt::join)"
2595 		     << " at " << __FILE__ << ":" << __LINE__ << endl;
2596 		throw e;
2597 	}
2598 	// For each pattern, set plen
2599 	TIndexOff npat = -1;
2600 	for(TIndexOffU i = 0; i < szs.size(); i++) {
2601 		if(szs[i].first && szs[i].len > 0) {
2602 			if(npat >= 0) {
2603 				writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
2604 			}
2605 			this->plen()[++npat] = (szs[i].len + szs[i].off);
2606 		} else if(!szs[i].first) {
2607 			// edge case, but we could get here with npat == -1
2608 			// e.g. when building from a reference of all Ns
2609 			if (npat < 0) npat = 0;
2610 			this->plen()[npat] += (szs[i].len + szs[i].off);
2611 		}
2612 	}
2613 	assert_eq((TIndexOffU)npat, this->_nPat-1);
2614 	writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
2615 	// Write the number of fragments
2616 	writeU<TIndexOffU>(out1, this->_nFrag, _switchEndian);
2617 	TIndexOffU seqsRead = 0;
2618 	ASSERT_ONLY(TIndexOffU szsi = 0);
2619 	ASSERT_ONLY(TIndexOffU entsWritten = 0);
2620 	TIndexOffU dstoff = 0;
2621 	// For each filebuf
2622 	for(unsigned int i = 0; i < l.size(); i++) {
2623 		assert(!l[i]->eof());
2624 		bool first = true;
2625 		TIndexOffU patoff = 0;
2626 		// For each *fragment* (not necessary an entire sequence) we
2627 		// can pull out of istream l[i]...
2628 		while(!l[i]->eof()) {
2629 			string name;
2630 			// Push a new name onto our vector
2631 			_refnames.push_back("");
2632 			RefRecord rec = fastaRefReadAppend(
2633 				*l[i], first, ret, dstoff, rpcp, &_refnames.back());
2634 			first = false;
2635 			TIndexOffU bases = rec.len;
2636 			if(rec.first && rec.len > 0) {
2637 				if(_refnames.back().length() == 0) {
2638 					// If name was empty, replace with an index
2639 					ostringstream stm;
2640 					stm << seqsRead;
2641 					_refnames.back() = stm.str();
2642 				}
2643 			} else {
2644 				// This record didn't actually start a new sequence so
2645 				// no need to add a name
2646 				//assert_eq(0, _refnames.back().length());
2647 				_refnames.pop_back();
2648 			}
2649 			if(rec.first && rec.len == 0) {
2650 				continue;
2651 			}
2652 			assert_lt(szsi, szs.size());
2653 			assert_eq(rec.off, szs[szsi].off);
2654 			assert_eq(rec.len, szs[szsi].len);
2655 			assert_eq(rec.first, szs[szsi].first);
2656 			assert(rec.first || rec.off > 0);
2657 			ASSERT_ONLY(szsi++);
2658 			// Increment seqsRead if this is the first fragment
2659 			if(rec.first) seqsRead++;
2660 			if(bases == 0) continue;
2661 			assert_leq(bases, this->plen()[seqsRead-1]);
2662 			// Reset the patoff if this is the first fragment
2663 			if(rec.first) patoff = 0;
2664 			patoff += rec.off; // add fragment's offset from end of last frag.
2665 			// Adjust rpcps
2666 			//uint32_t seq = seqsRead-1;
2667 			ASSERT_ONLY(entsWritten++);
2668 			// This is where rstarts elements are written to the output stream
2669 			//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
2670 			//writeU32(out1, seq,       this->toBe()); // sequence id
2671 			//writeU32(out1, patoff,    this->toBe()); // offset into sequence
2672 			patoff += bases;
2673 		}
2674 		assert_gt(szsi, 0);
2675 		l[i]->reset();
2676 		assert(!l[i]->eof());
2677 #ifndef NDEBUG
2678 		int c = l[i]->get();
2679 		assert_eq('>', c);
2680 		assert(!l[i]->eof());
2681 		l[i]->reset();
2682 		assert(!l[i]->eof());
2683 #endif
2684 	}
2685 	assert_eq(entsWritten, this->_nFrag);
2686 }
2687 
2688 /**
2689  * Build an Ebwt from a string 's' and its suffix array 'sa' (which
2690  * might actually be a suffix array *builder* that builds blocks of the
2691  * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
2692  * arrays, is written directly to disk.  This is by design: keeping
2693  * those arrays in memory needlessly increases the footprint of the
2694  * building process.  Instead, we prefer to build the Ebwt directly
2695  * "to disk" and then read it back into memory later as necessary.
2696  *
2697  * It is assumed that the header values and join-related values (nPat,
2698  * plen) have already been written to 'out1' before this function
2699  * is called.  When this function is finished, it will have
2700  * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
2701  * file and offs to the secondary file.
2702  *
2703  * Assume DNA/RNA/any alphabet with 4 or fewer elements.
2704  * Assume occ array entries are 32 bits each.
2705  *
2706  * @param sa            the suffix array to convert to a Ebwt
2707  * @param s             the original string
2708  * @param out
2709  */
2710 template<typename TStr>
buildToDisk(InorderBlockwiseSA<TStr> & sa,const TStr & s,ostream & out1,ostream & out2,ostream * saOut,ostream * bwtOut)2711 void Ebwt::buildToDisk(
2712 	InorderBlockwiseSA<TStr>& sa,
2713 	const TStr& s,
2714 	ostream& out1,
2715 	ostream& out2,
2716 	ostream* saOut,
2717 	ostream* bwtOut)
2718 {
2719 	const EbwtParams& eh = this->_eh;
2720 
2721 	assert(eh.repOk());
2722 	assert_eq(s.length()+1, sa.size());
2723 	assert_eq(s.length(), eh._len);
2724 	assert_gt(eh._lineRate, 3);
2725 	assert(sa.suffixItrIsReset());
2726 
2727 	TIndexOffU len = eh._len;
2728 	TIndexOffU ftabLen = eh._ftabLen;
2729 	TIndexOffU sideSz = eh._sideSz;
2730 	TIndexOffU ebwtTotSz = eh._ebwtTotSz;
2731 	TIndexOffU fchr[] = {0, 0, 0, 0, 0};
2732 	EList<TIndexOffU> ftab(EBWT_CAT);
2733 	TIndexOffU zOff = OFF_MASK;
2734 
2735 	// Save # of occurrences of each character as we walk along the bwt
2736 	TIndexOffU occ[4] = {0, 0, 0, 0};
2737 	TIndexOffU occSave[4] = {0, 0, 0, 0};
2738 
2739 	// Record rows that should "absorb" adjacent rows in the ftab.
2740 	// The absorbed rows represent suffixes shorter than the ftabChars
2741 	// cutoff.
2742 	uint8_t absorbCnt = 0;
2743 	EList<uint8_t> absorbFtab(EBWT_CAT);
2744 	try {
2745 		VMSG_NL("Allocating ftab, absorbFtab");
2746 		ftab.resize(ftabLen);
2747 		ftab.fillZero();
2748 		absorbFtab.resize(ftabLen);
2749 		absorbFtab.fillZero();
2750 	} catch(bad_alloc &e) {
2751 		cerr << "Out of memory allocating ftab[] or absorbFtab[] "
2752 		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
2753 		     << __LINE__ << endl;
2754 		throw e;
2755 	}
2756 
2757 	// Allocate the side buffer; holds a single side as its being
2758 	// constructed and then written to disk.  Reused across all sides.
2759 #ifdef SIXTY4_FORMAT
2760 	EList<uint64_t> ebwtSide(EBWT_CAT);
2761 #else
2762 	EList<uint8_t> ebwtSide(EBWT_CAT);
2763 #endif
2764 	try {
2765 #ifdef SIXTY4_FORMAT
2766 		ebwtSide.resize(sideSz >> 3);
2767 #else
2768 		ebwtSide.resize(sideSz);
2769 #endif
2770 	} catch(bad_alloc &e) {
2771 		cerr << "Out of memory allocating ebwtSide[] in "
2772 		     << "Ebwt::buildToDisk() at " << __FILE__ << ":"
2773 		     << __LINE__ << endl;
2774 		throw e;
2775 	}
2776 
2777 	// Points to the base offset within ebwt for the side currently
2778 	// being written
2779 	TIndexOffU side = 0;
2780 
2781 	// Whether we're assembling a forward or a reverse bucket
2782 	bool fw;
2783 	TIndexOff sideCur = 0;
2784 	fw = true;
2785 
2786 	// Have we skipped the '$' in the last column yet?
2787 	ASSERT_ONLY(bool dollarSkipped = false);
2788 
2789 	TIndexOffU si = 0;   // string offset (chars)
2790 	ASSERT_ONLY(TIndexOffU lastSufInt = 0);
2791 	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
2792 	                               // array (as opposed to the padding at the
2793 	                               // end)
2794 	// Iterate over packed bwt bytes
2795 	VMSG_NL("Entering Ebwt loop");
2796 	ASSERT_ONLY(TIndexOffU beforeEbwtOff = (TIndexOffU)out1.tellp()); // @double-check - pos_type, std::streampos
2797 
2798 	// First integer in the suffix-array output file is the length of the
2799 	// array, including $
2800 	if(saOut != NULL) {
2801 		// Write length word
2802 		writeU<TIndexOffU>(*saOut, len+1, _switchEndian);
2803 	}
2804 
2805 	// First integer in the BWT output file is the length of BWT(T), including $
2806 	if(bwtOut != NULL) {
2807 		// Write length word
2808 		writeU<TIndexOffU>(*bwtOut, len+1, _switchEndian);
2809 	}
2810 
2811 	while(side < ebwtTotSz) {
2812 		// Sanity-check our cursor into the side buffer
2813 		assert_geq(sideCur, 0);
2814 		assert_lt(sideCur, (int)eh._sideBwtSz);
2815 		assert_eq(0, side % sideSz); // 'side' must be on side boundary
2816 		ebwtSide[sideCur] = 0; // clear
2817 		assert_lt(side + sideCur, ebwtTotSz);
2818 		// Iterate over bit-pairs in the si'th character of the BWT
2819 #ifdef SIXTY4_FORMAT
2820 		for(int bpi = 0; bpi < 32; bpi++, si++)
2821 #else
2822 			for(int bpi = 0; bpi < 4; bpi++, si++)
2823 #endif
2824 			{
2825 				int bwtChar;
2826 				bool count = true;
2827 				if(si <= len) {
2828 					// Still in the SA; extract the bwtChar
2829 					TIndexOffU saElt = sa.nextSuffix();
2830 					// Write it to the optional suffix-array output file
2831 					if(saOut != NULL) {
2832 						writeU<TIndexOffU>(*saOut, saElt, _switchEndian);
2833 					}
2834 					// TODO: what exactly to write to the BWT output file?  How to
2835 					// represent $?  How to pack nucleotides into bytes/words?
2836 
2837 					// (that might have triggered sa to calc next suf block)
2838 					if(saElt == 0) {
2839 						// Don't add the '$' in the last column to the BWT
2840 						// transform; we can't encode a $ (only A C T or G)
2841 						// and counting it as, say, an A, will mess up the
2842 						// LR mapping
2843 						bwtChar = 0; count = false;
2844 						ASSERT_ONLY(dollarSkipped = true);
2845 						zOff = si; // remember the SA row that
2846 						// corresponds to the 0th suffix
2847 					} else {
2848 						bwtChar = (int)(s[saElt-1]);
2849 						assert_lt(bwtChar, 4);
2850 						// Update the fchr
2851 						fchr[bwtChar]++;
2852 					}
2853 					// Update ftab
2854 					if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
2855 						// Turn the first ftabChars characters of the
2856 						// suffix into an integer index into ftab.  The
2857 						// leftmost (lowest index) character of the suffix
2858 						// goes in the most significant bit pair if the
2859 						// integer.
2860 						TIndexOffU sufInt = 0;
2861 						for(int i = 0; i < eh._ftabChars; i++) {
2862 							sufInt <<= 2;
2863 							assert_lt((TIndexOffU)i, len-saElt);
2864 							sufInt |= (unsigned char)(s[saElt+i]);
2865 						}
2866 						// Assert that this prefix-of-suffix is greater
2867 						// than or equal to the last one (true b/c the
2868 						// suffix array is sorted)
2869 #ifndef NDEBUG
2870 						if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
2871 						lastSufInt = sufInt;
2872 #endif
2873 						// Update ftab
2874 						assert_lt(sufInt+1, ftabLen);
2875 						ftab[sufInt+1]++;
2876 						if(absorbCnt > 0) {
2877 							// Absorb all short suffixes since the last
2878 							// transition into this transition
2879 							absorbFtab[sufInt] = absorbCnt;
2880 							absorbCnt = 0;
2881 						}
2882 					} else {
2883 						// Otherwise if suffix is fewer than ftabChars
2884 						// characters long, then add it to the 'absorbCnt';
2885 						// it will be absorbed into the next transition
2886 						assert_lt(absorbCnt, 255);
2887 						absorbCnt++;
2888 					}
2889 					// Suffix array offset boundary? - update offset array
2890 					if((si & eh._offMask) == si) {
2891 						assert_lt((si >> eh._offRate), eh._offsLen);
2892 						// Write offsets directly to the secondary output
2893 						// stream, thereby avoiding keeping them in memory
2894 						writeU<TIndexOffU>(out2, saElt, _switchEndian);
2895 					}
2896 				} else {
2897 					// Strayed off the end of the SA, now we're just
2898 					// padding out a bucket
2899 #ifndef NDEBUG
2900 					if(inSA) {
2901 						// Assert that we wrote all the characters in the
2902 						// string before now
2903 						assert_eq(si, len+1);
2904 						inSA = false;
2905 					}
2906 #endif
2907 					// 'A' used for padding; important that padding be
2908 					// counted in the occ[] array
2909 					bwtChar = 0;
2910 				}
2911 				if(count) occ[bwtChar]++;
2912 				// Append BWT char to bwt section of current side
2913 				if(fw) {
2914 					// Forward bucket: fill from least to most
2915 #ifdef SIXTY4_FORMAT
2916 					ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
2917 					if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
2918 #else
2919 					pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
2920 					assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
2921 #endif
2922 				} else {
2923 					// Backward bucket: fill from most to least
2924 #ifdef SIXTY4_FORMAT
2925 					ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
2926 					if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
2927 #else
2928 					pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
2929 					assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
2930 #endif
2931 				}
2932 			} // end loop over bit-pairs
2933 		assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
2934 #ifdef SIXTY4_FORMAT
2935 		assert_eq(0, si & 31);
2936 #else
2937 		assert_eq(0, si & 3);
2938 #endif
2939 
2940 		sideCur++;
2941 		if(sideCur == (int)eh._sideBwtSz) {
2942 			sideCur = 0;
2943 			TIndexOffU *cpptr = reinterpret_cast<TIndexOffU*>(ebwtSide.ptr());
2944 			// Write 'A', 'C', 'G' and 'T' tallies
2945 			side += sideSz;
2946 			assert_leq(side, eh._ebwtTotSz);
2947 #ifdef BOWTIE_64BIT_INDEX
2948 			cpptr[(sideSz >> 3)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
2949 			cpptr[(sideSz >> 3)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
2950 			cpptr[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
2951 			cpptr[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
2952 #else
2953 			cpptr[(sideSz >> 2)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
2954 			cpptr[(sideSz >> 2)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
2955 			cpptr[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
2956 			cpptr[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
2957 #endif
2958 			occSave[0] = occ[0];
2959 			occSave[1] = occ[1];
2960 			occSave[2] = occ[2];
2961 			occSave[3] = occ[3];
2962 			// Write backward side to primary file
2963 			out1.write((const char *)ebwtSide.ptr(), sideSz);
2964 		}
2965 	}
2966 	VMSG_NL("Exited Ebwt loop");
2967 	assert_neq(zOff, OFF_MASK);
2968 	if(absorbCnt > 0) {
2969 		// Absorb any trailing, as-yet-unabsorbed short suffixes into
2970 		// the last element of ftab
2971 		absorbFtab[ftabLen-1] = absorbCnt;
2972 	}
2973 	// Assert that our loop counter got incremented right to the end
2974 	assert_eq(side, eh._ebwtTotSz);
2975 	// Assert that we wrote the expected amount to out1
2976 	assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // @double-check - pos_type
2977 	// assert that the last thing we did was write a forward bucket
2978 
2979 	//
2980 	// Write zOff to primary stream
2981 	//
2982 	writeU<TIndexOffU>(out1, zOff, _switchEndian);
2983 
2984 	//
2985 	// Finish building fchr
2986 	//
2987 	// Exclusive prefix sum on fchr
2988 	for(int i = 1; i < 4; i++) {
2989 		fchr[i] += fchr[i-1];
2990 	}
2991 	assert_eq(fchr[3], len);
2992 	// Shift everybody up by one
2993 	for(int i = 4; i >= 1; i--) {
2994 		fchr[i] = fchr[i-1];
2995 	}
2996 	fchr[0] = 0;
2997 	if(_verbose) {
2998 		for(int i = 0; i < 5; i++)
2999 			cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
3000 	}
3001 	// Write fchr to primary file
3002 	for(int i = 0; i < 5; i++) {
3003 		writeU<TIndexOffU>(out1, fchr[i], _switchEndian);
3004 	}
3005 
3006 	//
3007 	// Finish building ftab and build eftab
3008 	//
3009 	// Prefix sum on ftable
3010 	TIndexOffU eftabLen = 0;
3011 	assert_eq(0, absorbFtab[0]);
3012 	for(TIndexOffU i = 1; i < ftabLen; i++) {
3013 		if(absorbFtab[i] > 0) eftabLen += 2;
3014 	}
3015 	assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
3016 	eftabLen = eh._ftabChars*2;
3017 	EList<TIndexOffU> eftab(EBWT_CAT);
3018 	try {
3019 		eftab.resize(eftabLen);
3020 		eftab.fillZero();
3021 	} catch(bad_alloc &e) {
3022 		cerr << "Out of memory allocating eftab[] "
3023 		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
3024 		     << __LINE__ << endl;
3025 		throw e;
3026 	}
3027 	TIndexOffU eftabCur = 0;
3028 	for(TIndexOffU i = 1; i < ftabLen; i++) {
3029 		TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
3030 		if(absorbFtab[i] > 0) {
3031 			// Skip a number of short pattern indicated by absorbFtab[i]
3032 			TIndexOffU hi = lo + absorbFtab[i];
3033 			assert_lt(eftabCur*2+1, eftabLen);
3034 			eftab[eftabCur*2] = lo;
3035 			eftab[eftabCur*2+1] = hi;
3036 			ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
3037 			assert_eq(lo, Ebwt::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
3038 			assert_eq(hi, Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
3039 		} else {
3040 			ftab[i] = lo;
3041 		}
3042 	}
3043 	assert_eq(Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
3044 	// Write ftab to primary file
3045 	for(TIndexOffU i = 0; i < ftabLen; i++) {
3046 		writeU<TIndexOffU>(out1, ftab[i], _switchEndian);
3047 	}
3048 	// Write eftab to primary file
3049 	for(TIndexOffU i = 0; i < eftabLen; i++) {
3050 		writeU<TIndexOffU>(out1, eftab[i], _switchEndian);
3051 	}
3052 
3053 	// Note: if you'd like to sanity-check the Ebwt, you'll have to
3054 	// read it back into memory first!
3055 	assert(!isInMemory());
3056 	VMSG_NL("Exiting Ebwt::buildToDisk()");
3057 }
3058 
3059 /**
3060  * Try to find the Bowtie index specified by the user.  First try the
3061  * exact path given by the user.  Then try the user-provided string
3062  * appended onto the path of the "indexes" subdirectory below this
3063  * executable, then try the provided string appended onto
3064  * "$BOWTIE2_INDEXES/".
3065  */
3066 string adjustEbwtBase(const string& cmdline,
3067 		      const string& ebwtFileBase,
3068 		      bool verbose);
3069 
3070 
3071 extern string gLastIOErrMsg;
3072 
3073 /* Checks whether a call to read() failed or not. */
is_read_err(int fdesc,ssize_t ret,size_t count)3074 inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
3075 	if (ret < 0) {
3076 		std::stringstream sstm;
3077 		sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
3078 		gLastIOErrMsg = sstm.str();
3079 		return true;
3080 	}
3081 	return false;
3082 }
3083 
3084 /* Checks whether a call to fread() failed or not. */
is_fread_err(FILE * file_hd,size_t ret,size_t count)3085 inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
3086 	if (ferror(file_hd)) {
3087 		gLastIOErrMsg = "Error Reading File!";
3088 		return true;
3089 	}
3090 	return false;
3091 }
3092 
3093 
3094 #endif /*EBWT_H_*/
3095