1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #ifndef EBWT_H_
21 #define EBWT_H_
22
23 #include <stdint.h>
24 #include <string.h>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <memory>
29 #include <fcntl.h>
30 #include <math.h>
31 #include <errno.h>
32 #include <stdexcept>
33 #include <sys/stat.h>
34 #ifdef BOWTIE_MM
35 #include <sys/mman.h>
36 #include <sys/shm.h>
37 #endif
38 #include "shmem.h"
39 #include "alphabet.h"
40 #include "assert_helpers.h"
41 #include "bitpack.h"
42 #include "blockwise_sa.h"
43 #include "endian_swap.h"
44 #include "word_io.h"
45 #include "random_source.h"
46 #include "ref_read.h"
47 #include "threading.h"
48 #include "str_util.h"
49 #include "mm.h"
50 #include "timer.h"
51 #include "reference.h"
52 #include "search_globals.h"
53 #include "ds.h"
54 #include "random_source.h"
55 #include "mem_ids.h"
56 #include "btypes.h"
57
58 #ifdef POPCNT_CAPABILITY
59 #include "processor_support.h"
60 #endif
61
62 #if __cplusplus <= 199711L
63 #define unique_ptr auto_ptr
64 #endif
65
66 using namespace std;
67
68 // From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
69 extern uint8_t cCntLUT_4[4][4][256];
70
71 static const uint64_t c_table[4] = {
72 0xffffffffffffffff,
73 0xaaaaaaaaaaaaaaaa,
74 0x5555555555555555,
75 0x0000000000000000
76 };
77
78 #ifndef VMSG_NL
79 #define VMSG_NL(...) \
80 if(this->verbose()) { \
81 stringstream tmp; \
82 tmp << __VA_ARGS__ << endl; \
83 this->verbose(tmp.str()); \
84 }
85 #endif
86
87 #ifndef VMSG
88 #define VMSG(...) \
89 if(this->verbose()) { \
90 stringstream tmp; \
91 tmp << __VA_ARGS__; \
92 this->verbose(tmp.str()); \
93 }
94 #endif
95
96 /**
97 * Flags describing type of Ebwt.
98 */
99 enum EBWT_FLAGS {
100 EBWT_COLOR = 2, // true -> Ebwt is colorspace
101 EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
102 // concatenated string reversed, rather than
103 // each stretch reversed
104 };
105
106 /**
107 * Extended Burrows-Wheeler transform header. This together with the
108 * actual data arrays and other text-specific parameters defined in
109 * class Ebwt constitute the entire Ebwt.
110 */
111 class EbwtParams {
112
113 public:
EbwtParams()114 EbwtParams() { }
115
EbwtParams(TIndexOffU len,int32_t lineRate,int32_t offRate,int32_t ftabChars,bool color,bool entireReverse)116 EbwtParams(
117 TIndexOffU len,
118 int32_t lineRate,
119 int32_t offRate,
120 int32_t ftabChars,
121 bool color,
122 bool entireReverse)
123 {
124 init(len, lineRate, offRate, ftabChars, color, entireReverse);
125 }
126
EbwtParams(const EbwtParams & eh)127 EbwtParams(const EbwtParams& eh) {
128 init(eh._len, eh._lineRate, eh._offRate,
129 eh._ftabChars, eh._color, eh._entireReverse);
130 }
131
init(TIndexOffU len,int32_t lineRate,int32_t offRate,int32_t ftabChars,bool color,bool entireReverse)132 void init(
133 TIndexOffU len,
134 int32_t lineRate,
135 int32_t offRate,
136 int32_t ftabChars,
137 bool color,
138 bool entireReverse)
139 {
140 _color = color;
141 _entireReverse = entireReverse;
142 _len = len;
143 _bwtLen = _len + 1;
144 _sz = (len+3)/4;
145 _bwtSz = (len/4 + 1);
146 _lineRate = lineRate;
147 _origOffRate = offRate;
148 _offRate = offRate;
149 _offMask = OFF_MASK << _offRate;
150 _ftabChars = ftabChars;
151 _eftabLen = _ftabChars*2;
152 _eftabSz = _eftabLen*OFF_SIZE;
153 _ftabLen = (1 << (_ftabChars*2))+1;
154 _ftabSz = _ftabLen*OFF_SIZE;
155 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
156 _offsSz = (uint64_t)_offsLen*OFF_SIZE;
157 _lineSz = 1 << _lineRate;
158 _sideSz = _lineSz * 1 /* lines per side */;
159 _sideBwtSz = _sideSz - OFF_SIZE*4;
160 _sideBwtLen = _sideBwtSz*4;
161 _numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
162 _numLines = _numSides * 1 /* lines per side */;
163 _ebwtTotLen = _numSides * _sideSz;
164 _ebwtTotSz = _ebwtTotLen;
165 assert(repOk());
166 }
167
len()168 TIndexOffU len() const { return _len; }
lenNucs()169 TIndexOffU lenNucs() const { return _len + (_color ? 1 : 0); }
bwtLen()170 TIndexOffU bwtLen() const { return _bwtLen; }
sz()171 TIndexOffU sz() const { return _sz; }
bwtSz()172 TIndexOffU bwtSz() const { return _bwtSz; }
lineRate()173 int32_t lineRate() const { return _lineRate; }
origOffRate()174 int32_t origOffRate() const { return _origOffRate; }
offRate()175 int32_t offRate() const { return _offRate; }
offMask()176 TIndexOffU offMask() const { return _offMask; }
ftabChars()177 int32_t ftabChars() const { return _ftabChars; }
eftabLen()178 int32_t eftabLen() const { return _eftabLen; }
eftabSz()179 int32_t eftabSz() const { return _eftabSz; }
ftabLen()180 TIndexOffU ftabLen() const { return _ftabLen; }
ftabSz()181 TIndexOffU ftabSz() const { return _ftabSz; }
offsLen()182 TIndexOffU offsLen() const { return _offsLen; }
offsSz()183 uint64_t offsSz() const { return _offsSz; }
lineSz()184 int32_t lineSz() const { return _lineSz; }
sideSz()185 int32_t sideSz() const { return _sideSz; }
sideBwtSz()186 int32_t sideBwtSz() const { return _sideBwtSz; }
sideBwtLen()187 int32_t sideBwtLen() const { return _sideBwtLen; }
numSides()188 TIndexOffU numSides() const { return _numSides; }
numLines()189 TIndexOffU numLines() const { return _numLines; }
ebwtTotLen()190 TIndexOffU ebwtTotLen() const { return _ebwtTotLen; }
ebwtTotSz()191 TIndexOffU ebwtTotSz() const { return _ebwtTotSz; }
color()192 bool color() const { return _color; }
entireReverse()193 bool entireReverse() const { return _entireReverse; }
194
195 /**
196 * Set a new suffix-array sampling rate, which involves updating
197 * rate, mask, sample length, and sample size.
198 */
setOffRate(int __offRate)199 void setOffRate(int __offRate) {
200 _offRate = __offRate;
201 _offMask = OFF_MASK << _offRate;
202 _offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
203 _offsSz = (uint64_t)_offsLen * OFF_SIZE;
204 }
205
206 #ifndef NDEBUG
207 /// Check that this EbwtParams is internally consistent
repOk()208 bool repOk() const {
209 assert_gt(_len, 0);
210 assert_gt(_lineRate, 3);
211 assert_geq(_offRate, 0);
212 assert_leq(_ftabChars, 16);
213 assert_geq(_ftabChars, 1);
214 assert_lt(_lineRate, 32);
215 assert_lt(_ftabChars, 32);
216 assert_eq(0, _ebwtTotSz % _lineSz);
217 return true;
218 }
219 #endif
220
221 /**
222 * Pretty-print the header contents to the given output stream.
223 */
print(ostream & out)224 void print(ostream& out) const {
225 out << "Headers:" << endl
226 << " len: " << _len << endl
227 << " bwtLen: " << _bwtLen << endl
228 << " sz: " << _sz << endl
229 << " bwtSz: " << _bwtSz << endl
230 << " lineRate: " << _lineRate << endl
231 << " offRate: " << _offRate << endl
232 << " offMask: 0x" << hex << _offMask << dec << endl
233 << " ftabChars: " << _ftabChars << endl
234 << " eftabLen: " << _eftabLen << endl
235 << " eftabSz: " << _eftabSz << endl
236 << " ftabLen: " << _ftabLen << endl
237 << " ftabSz: " << _ftabSz << endl
238 << " offsLen: " << _offsLen << endl
239 << " offsSz: " << _offsSz << endl
240 << " lineSz: " << _lineSz << endl
241 << " sideSz: " << _sideSz << endl
242 << " sideBwtSz: " << _sideBwtSz << endl
243 << " sideBwtLen: " << _sideBwtLen << endl
244 << " numSides: " << _numSides << endl
245 << " numLines: " << _numLines << endl
246 << " ebwtTotLen: " << _ebwtTotLen << endl
247 << " ebwtTotSz: " << _ebwtTotSz << endl
248 << " color: " << _color << endl
249 << " reverse: " << _entireReverse << endl;
250 }
251
252 TIndexOffU _len;
253 TIndexOffU _bwtLen;
254 TIndexOffU _sz;
255 TIndexOffU _bwtSz;
256 int32_t _lineRate;
257 int32_t _origOffRate;
258 int32_t _offRate;
259 TIndexOffU _offMask;
260 int32_t _ftabChars;
261 uint32_t _eftabLen;
262 uint32_t _eftabSz;
263 TIndexOffU _ftabLen;
264 TIndexOffU _ftabSz;
265 TIndexOffU _offsLen;
266 uint64_t _offsSz;
267 uint32_t _lineSz;
268 uint32_t _sideSz;
269 uint32_t _sideBwtSz;
270 uint32_t _sideBwtLen;
271 TIndexOffU _numSides;
272 TIndexOffU _numLines;
273 TIndexOffU _ebwtTotLen;
274 TIndexOffU _ebwtTotSz;
275 bool _color;
276 bool _entireReverse;
277 };
278
279 /**
280 * Exception to throw when a file-realted error occurs.
281 */
282 class EbwtFileOpenException : public std::runtime_error {
283 public:
284 EbwtFileOpenException(const std::string& msg = "") :
runtime_error(msg)285 std::runtime_error(msg) { }
286 };
287
288 /**
289 * Calculate size of file with given name.
290 */
fileSize(const char * name)291 static inline int64_t fileSize(const char* name) {
292 std::ifstream f;
293 f.open(name, std::ios_base::binary | std::ios_base::in);
294 if (!f.good() || f.eof() || !f.is_open()) { return 0; }
295 f.seekg(0, std::ios_base::beg);
296 std::ifstream::pos_type begin_pos = f.tellg();
297 f.seekg(0, std::ios_base::end);
298 return static_cast<int64_t>(f.tellg() - begin_pos);
299 }
300
301 /**
302 * Encapsulates a location in the bwt text in terms of the side it
303 * occurs in and its offset within the side.
304 */
305 struct SideLocus {
SideLocusSideLocus306 SideLocus() :
307 _sideByteOff(0),
308 _sideNum(0),
309 _charOff(0),
310 _by(-1),
311 _bp(-1) { }
312
313 /**
314 * Construct from row and other relevant information about the Ebwt.
315 */
SideLocusSideLocus316 SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
317 initFromRow(row, ep, ebwt);
318 }
319
320 /**
321 * Init two SideLocus objects from a top/bot pair, using the result
322 * from one call to initFromRow to possibly avoid a second call.
323 */
initFromTopBotSideLocus324 static void initFromTopBot(
325 TIndexOffU top,
326 TIndexOffU bot,
327 const EbwtParams& ep,
328 const uint8_t* ebwt,
329 SideLocus& ltop,
330 SideLocus& lbot)
331 {
332 const TIndexOffU sideBwtLen = ep._sideBwtLen;
333 assert_gt(bot, top);
334 ltop.initFromRow(top, ep, ebwt);
335 TIndexOffU spread = bot - top;
336 // Many cache misses on the following lines
337 if(ltop._charOff + spread < sideBwtLen) {
338 lbot._charOff = (uint32_t)(ltop._charOff + spread);
339 lbot._sideNum = ltop._sideNum;
340 lbot._sideByteOff = ltop._sideByteOff;
341 lbot._by = lbot._charOff >> 2;
342 assert_lt(lbot._by, (int)ep._sideBwtSz);
343 lbot._bp = lbot._charOff & 3;
344 } else {
345 lbot.initFromRow(bot, ep, ebwt);
346 }
347 }
348
349 /**
350 * Calculate SideLocus based on a row and other relevant
351 * information about the shape of the Ebwt.
352 */
initFromRowSideLocus353 void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
354 const int32_t sideSz = ep._sideSz;
355 // Side length is hard-coded for now; this allows the compiler
356 // to do clever things to accelerate / and %.
357 _sideNum = row / (48*OFF_SIZE);
358 assert_lt(_sideNum, ep._numSides);
359 _charOff = row % (48*OFF_SIZE);
360 _sideByteOff = _sideNum * sideSz;
361 assert_leq(row, ep._len);
362 assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
363 // Tons of cache misses on the next line
364 _by = _charOff >> 2; // byte within side
365 assert_lt(_by, (int)ep._sideBwtSz);
366 _bp = _charOff & 3; // bit-pair within byte
367 }
368
369 /**
370 * Transform this SideLocus to refer to the next side (i.e. the one
371 * corresponding to the next side downstream). Set all cursors to
372 * point to the beginning of the side.
373 */
nextSideSideLocus374 void nextSide(const EbwtParams& ep) {
375 assert(valid());
376 _sideByteOff += ep.sideSz();
377 _sideNum++;
378 _by = _bp = _charOff = 0;
379 assert(valid());
380 }
381
382 /**
383 * Return true iff this is an initialized SideLocus
384 */
validSideLocus385 bool valid() const {
386 if(_bp != -1) {
387 return true;
388 }
389 return false;
390 }
391
392 /**
393 * Convert locus to BW row it corresponds to.
394 */
toBWRowSideLocus395 TIndexOffU toBWRow() const {
396 return _sideNum * 48*OFF_SIZE + _charOff;
397 }
398
399 #ifndef NDEBUG
400 /**
401 * Check that SideLocus is internally consistent and consistent
402 * with the (provided) EbwtParams.
403 */
repOkSideLocus404 bool repOk(const EbwtParams& ep) const {
405 ASSERT_ONLY(TIndexOffU row = _sideNum * 48*OFF_SIZE + _charOff);
406 assert_leq(row, ep._len);
407 assert_range(-1, 3, _bp);
408 assert_range(0, (int)ep._sideBwtSz, _by);
409 return true;
410 }
411 #endif
412
413 /// Make this look like an invalid SideLocus
invalidateSideLocus414 void invalidate() {
415 _bp = -1;
416 }
417
418 /**
419 * Return a read-only pointer to the beginning of the top side.
420 */
sideSideLocus421 const uint8_t *side(const uint8_t* ebwt) const {
422 return ebwt + _sideByteOff;
423 }
424
425 TIndexOffU _sideByteOff; // offset of top side within ebwt[]
426 TIndexOffU _sideNum; // index of side
427 uint32_t _charOff; // character offset within side
428 int32_t _by; // byte within side (not adjusted for bw sides)
429 int32_t _bp; // bitpair within byte (not adjusted for bw sides)
430 };
431 #ifdef POPCNT_CAPABILITY // wrapping of "struct"
432 struct USE_POPCNT_GENERIC {
433 #endif
434 // Use this standard bit-bashing population count
pop64USE_POPCNT_GENERIC435 inline static int pop64(uint64_t x) {
436 // Lots of cache misses on following lines (>10K)
437 x = x - ((x >> 1) & 0x5555555555555555llu);
438 x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
439 x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
440 x = x + (x >> 8);
441 x = x + (x >> 16);
442 x = x + (x >> 32);
443 return (int)(x & 0x3Fllu);
444 }
445 #ifdef POPCNT_CAPABILITY // wrapping a "struct"
446 };
447 #endif
448
449 #ifdef POPCNT_CAPABILITY
450 struct USE_POPCNT_INSTRUCTION {
pop64USE_POPCNT_INSTRUCTION451 inline static int pop64(uint64_t x) {
452 return __builtin_popcountll(x);
453 }
454 };
455 #endif
456
457 /**
458 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
459 * within a 64-bit argument.
460 */
461 #ifdef POPCNT_CAPABILITY
462 template<typename Operation>
463 #endif
countInU64(int c,uint64_t dw)464 inline static int countInU64(int c, uint64_t dw) {
465 uint64_t c0 = c_table[c];
466 uint64_t x0 = dw ^ c0;
467 uint64_t x1 = (x0 >> 1);
468 uint64_t x2 = x1 & (0x5555555555555555);
469 uint64_t x3 = x0 & x2;
470 #ifdef POPCNT_CAPABILITY
471 uint64_t tmp = Operation().pop64(x3);
472 #else
473 uint64_t tmp = pop64(x3);
474 #endif
475 return (int) tmp;
476 }
477
478 // Forward declarations for Ebwt class
479 class EbwtSearchParams;
480
481 /**
482 * Extended Burrows-Wheeler transform data.
483 *
484 * An Ebwt may be transferred to and from RAM with calls to
485 * evictFromMemory() and loadIntoMemory(). By default, a newly-created
486 * Ebwt is not loaded into memory; if the user would like to use a
487 * newly-created Ebwt to answer queries, they must first call
488 * loadIntoMemory().
489 */
490 class Ebwt {
491 public:
492 #define Ebwt_INITS \
493 _switchEndian(false), \
494 _overrideOffRate(overrideOffRate), \
495 _verbose(verbose), \
496 _passMemExc(passMemExc), \
497 _sanity(sanityCheck), \
498 fw_(fw), \
499 _in1(NULL), \
500 _in2(NULL), \
501 _zOff(OFF_MASK), \
502 _zEbwtByteOff(OFF_MASK), \
503 _zEbwtBpOff(-1), \
504 _nPat(0), \
505 _nFrag(0), \
506 _plen(EBWT_CAT), \
507 _rstarts(EBWT_CAT), \
508 _fchr(EBWT_CAT), \
509 _ftab(EBWT_CAT), \
510 _eftab(EBWT_CAT), \
511 _offs(EBWT_CAT), \
512 _ebwt(EBWT_CAT), \
513 _useMm(false), \
514 useShmem_(false), \
515 _refnames(EBWT_CAT), \
516 mmFile1_(NULL), \
517 mmFile2_(NULL)
518
519 /// Construct an Ebwt from the given input file
Ebwt(const string & in,int color,int needEntireReverse,bool fw,int32_t overrideOffRate,int32_t offRatePlus,bool useMm,bool useShmem,bool mmSweep,bool loadNames,bool loadSASamp,bool loadFtab,bool loadRstarts,bool verbose,bool startVerbose,bool passMemExc,bool sanityCheck)520 Ebwt(const string& in,
521 int color,
522 int needEntireReverse,
523 bool fw,
524 int32_t overrideOffRate, // = -1,
525 int32_t offRatePlus, // = -1,
526 bool useMm, // = false,
527 bool useShmem, // = false,
528 bool mmSweep, // = false,
529 bool loadNames, // = false,
530 bool loadSASamp, // = true,
531 bool loadFtab, // = true,
532 bool loadRstarts, // = true,
533 bool verbose, // = false,
534 bool startVerbose, // = false,
535 bool passMemExc, // = false,
536 bool sanityCheck) : // = false) :
537 Ebwt_INITS
538 {
539 assert(!useMm || !useShmem);
540 #ifdef POPCNT_CAPABILITY
541 ProcessorSupport ps;
542 _usePOPCNTinstruction = ps.POPCNTenabled();
543 #endif
544 packed_ = false;
545 _useMm = useMm;
546 useShmem_ = useShmem;
547 _in1Str = in + ".1." + gEbwt_ext;
548 _in2Str = in + ".2." + gEbwt_ext;
549 readIntoMemory(
550 color, // expect index to be colorspace?
551 fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
552 loadSASamp, // load the SA sample portion?
553 loadFtab, // load the ftab & eftab?
554 loadRstarts, // load the rstarts array?
555 true, // stop after loading the header portion?
556 &_eh, // params
557 mmSweep, // mmSweep
558 loadNames, // loadNames
559 startVerbose); // startVerbose
560 // If the offRate has been overridden, reflect that in the
561 // _eh._offRate field
562 if(offRatePlus > 0 && _overrideOffRate == -1) {
563 _overrideOffRate = _eh._offRate + offRatePlus;
564 }
565 if(_overrideOffRate > _eh._offRate) {
566 _eh.setOffRate(_overrideOffRate);
567 assert_eq(_overrideOffRate, _eh._offRate);
568 }
569 assert(repOk());
570 }
571
572 /// Construct an Ebwt from the given header parameters and string
573 /// vector, optionally using a blockwise suffix sorter with the
574 /// given 'bmax' and 'dcv' parameters. The string vector is
575 /// ultimately joined and the joined string is passed to buildToDisk().
576 template<typename TStr>
577 Ebwt(
578 TStr exampleStr,
579 bool packed,
580 int color,
581 int needEntireReverse,
582 int32_t lineRate,
583 int32_t offRate,
584 int32_t ftabChars,
585 int nthreads,
586 const string& file, // base filename for EBWT files
587 bool fw,
588 bool useBlockwise,
589 TIndexOffU bmax,
590 TIndexOffU bmaxSqrtMult,
591 TIndexOffU bmaxDivN,
592 int dcv,
593 EList<FileBuf*>& is,
594 EList<RefRecord>& szs,
595 TIndexOffU sztot,
596 const RefReadInParams& refparams,
597 uint32_t seed,
598 int32_t overrideOffRate = -1,
599 bool doSaFile = false,
600 bool doBwtFile = false,
601 bool verbose = false,
602 bool passMemExc = false,
603 bool sanityCheck = false) :
604 Ebwt_INITS,
605 _eh(
606 joinedLen(szs),
607 lineRate,
608 offRate,
609 ftabChars,
610 color,
611 refparams.reverse == REF_READ_REVERSE)
612 {
613 #ifdef POPCNT_CAPABILITY
614 ProcessorSupport ps;
615 _usePOPCNTinstruction = ps.POPCNTenabled();
616 #endif
617 _in1Str = file + ".1." + gEbwt_ext;
618 _in2Str = file + ".2." + gEbwt_ext;
619 packed_ = packed;
620 // Open output files
621 ofstream fout1(_in1Str.c_str(), ios::binary);
622 if(!fout1.good()) {
623 cerr << "Could not open index file for writing: \"" << _in1Str.c_str() << "\"" << endl
624 << "Please make sure the directory exists and that permissions allow writing by" << endl
625 << "Bowtie." << endl;
626 throw 1;
627 }
628 ofstream fout2(_in2Str.c_str(), ios::binary);
629 if(!fout2.good()) {
630 cerr << "Could not open index file for writing: \"" << _in2Str.c_str() << "\"" << endl
631 << "Please make sure the directory exists and that permissions allow writing by" << endl
632 << "Bowtie." << endl;
633 throw 1;
634 }
635 _inSaStr = file + ".sa";
636 _inBwtStr = file + ".bwt";
637 ofstream *saOut = NULL, *bwtOut = NULL;
638 if(doSaFile) {
639 saOut = new ofstream(_inSaStr.c_str(), ios::binary);
640 if(!saOut->good()) {
641 cerr << "Could not open suffix-array file for writing: \"" << _inSaStr.c_str() << "\"" << endl
642 << "Please make sure the directory exists and that permissions allow writing by" << endl
643 << "Bowtie." << endl;
644 throw 1;
645 }
646 }
647 if(doBwtFile) {
648 bwtOut = new ofstream(_inBwtStr.c_str(), ios::binary);
649 if(!bwtOut->good()) {
650 cerr << "Could not open suffix-array file for writing: \"" << _inBwtStr.c_str() << "\"" << endl
651 << "Please make sure the directory exists and that permissions allow writing by" << endl
652 << "Bowtie." << endl;
653 throw 1;
654 }
655 }
656 // Build SA(T) and BWT(T) block by block
657 initFromVector<TStr>(
658 is,
659 szs,
660 sztot,
661 refparams,
662 fout1,
663 fout2,
664 file,
665 saOut,
666 bwtOut,
667 nthreads,
668 useBlockwise,
669 bmax,
670 bmaxSqrtMult,
671 bmaxDivN,
672 dcv,
673 seed,
674 verbose);
675 // Close output files
676 fout1.flush();
677
678 int64_t tellpSz1 = (int64_t)fout1.tellp();
679 VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str.c_str());
680 fout1.close();
681 bool err = false;
682 if(tellpSz1 > fileSize(_in1Str.c_str())) {
683 err = true;
684 cerr << "Index is corrupt: File size for " << _in1Str.c_str() << " should have been " << tellpSz1
685 << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
686 }
687 fout2.flush();
688
689 int64_t tellpSz2 = (int64_t)fout2.tellp();
690 VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str.c_str());
691 fout2.close();
692 if(tellpSz2 > fileSize(_in2Str.c_str())) {
693 err = true;
694 cerr << "Index is corrupt: File size for " << _in2Str.c_str() << " should have been " << tellpSz2
695 << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
696 }
697
698 if(saOut != NULL) {
699 // Check on suffix array output file size
700 int64_t tellpSzSa = (int64_t)saOut->tellp();
701 VMSG_NL("Wrote " << tellpSzSa << " bytes to suffix-array file: " << _inSaStr.c_str());
702 saOut->close();
703 delete saOut;
704 if(tellpSzSa > fileSize(_inSaStr.c_str())) {
705 err = true;
706 cerr << "Index is corrupt: File size for " << _inSaStr.c_str() << " should have been " << tellpSzSa
707 << " but is actually " << fileSize(_inSaStr.c_str()) << "." << endl;
708 }
709 }
710
711 if(bwtOut != NULL) {
712 // Check on suffix array output file size
713 int64_t tellpSzBwt = (int64_t)bwtOut->tellp();
714 VMSG_NL("Wrote " << tellpSzBwt << " bytes to BWT file: " << _inBwtStr.c_str());
715 bwtOut->close();
716 delete bwtOut;
717 if(tellpSzBwt > fileSize(_inBwtStr.c_str())) {
718 err = true;
719 cerr << "Index is corrupt: File size for " << _inBwtStr.c_str() << " should have been " << tellpSzBwt
720 << " but is actually " << fileSize(_inBwtStr.c_str()) << "." << endl;
721 }
722 }
723
724 if(err) {
725 cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
726 throw 1;
727 }
728
729 // Reopen as input streams
730 VMSG_NL("Re-opening _in1 and _in2 as input streams");
731 if(_sanity) {
732 VMSG_NL("Sanity-checking Bt2");
733 assert(!isInMemory());
734 readIntoMemory(
735 color, // colorspace?
736 fw ? -1 : needEntireReverse, // 1 -> need the reverse to be reverse-of-concat
737 true, // load SA sample (_offs[])?
738 true, // load ftab (_ftab[] & _eftab[])?
739 true, // load r-starts (_rstarts[])?
740 false, // just load header?
741 NULL, // Params object to fill
742 false, // mm sweep?
743 true, // load names?
744 false); // verbose startup?
745 sanityCheckAll(refparams.reverse);
746 evictFromMemory();
747 assert(!isInMemory());
748 }
749 VMSG_NL("Returning from Ebwt constructor");
750 }
751
752 /**
753 * Static constructor for a pair of forward/reverse indexes for the
754 * given reference string.
755 */
756 template<typename TStr>
757 static pair<Ebwt*, Ebwt*>
fromString(const char * str,bool packed,int color,int reverse,bool bigEndian,int32_t lineRate,int32_t offRate,int32_t ftabChars,const string & file,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose,bool autoMem,bool sanity)758 fromString(
759 const char* str,
760 bool packed,
761 int color,
762 int reverse,
763 bool bigEndian,
764 int32_t lineRate,
765 int32_t offRate,
766 int32_t ftabChars,
767 const string& file,
768 bool useBlockwise,
769 TIndexOffU bmax,
770 TIndexOffU bmaxSqrtMult,
771 TIndexOffU bmaxDivN,
772 int dcv,
773 uint32_t seed,
774 bool verbose,
775 bool autoMem,
776 bool sanity)
777 {
778 EList<std::string> strs(EBWT_CAT);
779 strs.push_back(std::string(str));
780 return fromStrings<TStr>(
781 strs,
782 packed,
783 color,
784 reverse,
785 bigEndian,
786 lineRate,
787 offRate,
788 ftabChars,
789 file,
790 useBlockwise,
791 bmax,
792 bmaxSqrtMult,
793 bmaxDivN,
794 dcv,
795 seed,
796 verbose,
797 autoMem,
798 sanity);
799 }
800
801 /**
802 * Static constructor for a pair of forward/reverse indexes for the
803 * given list of reference strings.
804 */
805 template<typename TStr>
806 static pair<Ebwt*, Ebwt*>
fromStrings(const EList<std::string> & strs,bool packed,int color,int reverse,bool bigEndian,int32_t lineRate,int32_t offRate,int32_t ftabChars,const string & file,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose,bool autoMem,bool sanity)807 fromStrings(
808 const EList<std::string>& strs,
809 bool packed,
810 int color,
811 int reverse,
812 bool bigEndian,
813 int32_t lineRate,
814 int32_t offRate,
815 int32_t ftabChars,
816 const string& file,
817 bool useBlockwise,
818 TIndexOffU bmax,
819 TIndexOffU bmaxSqrtMult,
820 TIndexOffU bmaxDivN,
821 int dcv,
822 uint32_t seed,
823 bool verbose,
824 bool autoMem,
825 bool sanity)
826 {
827 assert(!strs.empty());
828 EList<FileBuf*> is(EBWT_CAT);
829 RefReadInParams refparams(color, REF_READ_FORWARD, false, false);
830 // Adapt sequence strings to stringstreams open for input
831 unique_ptr<stringstream> ss(new stringstream());
832 for(TIndexOffU i = 0; i < strs.size(); i++) {
833 (*ss) << ">" << i << endl << strs[i] << endl;
834 }
835 unique_ptr<FileBuf> fb(new FileBuf(ss.get()));
836 assert(!fb->eof());
837 assert(fb->get() == '>');
838 ASSERT_ONLY(fb->reset());
839 assert(!fb->eof());
840 is.push_back(fb.get());
841 // Vector for the ordered list of "records" comprising the input
842 // sequences. A record represents a stretch of unambiguous
843 // characters in one of the input sequences.
844 EList<RefRecord> szs(EBWT_CAT);
845 std::pair<TIndexOffU, TIndexOffU> sztot;
846 sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
847 // Construct Ebwt from input strings and parameters
848 Ebwt *ebwtFw = new Ebwt(
849 TStr(),
850 packed,
851 refparams.color ? 1 : 0,
852 -1, // fw
853 lineRate,
854 offRate, // suffix-array sampling rate
855 ftabChars, // number of chars in initial arrow-pair calc
856 file, // basename for .?.ebwt files
857 true, // fw?
858 useBlockwise, // useBlockwise
859 bmax, // block size for blockwise SA builder
860 bmaxSqrtMult, // block size as multiplier of sqrt(len)
861 bmaxDivN, // block size as divisor of len
862 dcv, // difference-cover period
863 is, // list of input streams
864 szs, // list of reference sizes
865 sztot.first, // total size of all unambiguous ref chars
866 refparams, // reference read-in parameters
867 seed, // pseudo-random number generator seed
868 -1, // override offRate
869 verbose, // be talkative
870 autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
871 sanity); // verify results and internal consistency
872 refparams.reverse = reverse;
873 szs.clear();
874 sztot = BitPairReference::szsFromFasta(is, file, bigEndian, refparams, szs, sanity);
875 // Construct Ebwt from input strings and parameters
876 Ebwt *ebwtBw = new Ebwt(
877 TStr(),
878 packed,
879 refparams.color ? 1 : 0,
880 reverse == REF_READ_REVERSE,
881 lineRate,
882 offRate, // suffix-array sampling rate
883 ftabChars, // number of chars in initial arrow-pair calc
884 file + ".rev",// basename for .?.ebwt files
885 false, // fw?
886 useBlockwise, // useBlockwise
887 bmax, // block size for blockwise SA builder
888 bmaxSqrtMult, // block size as multiplier of sqrt(len)
889 bmaxDivN, // block size as divisor of len
890 dcv, // difference-cover period
891 is, // list of input streams
892 szs, // list of reference sizes
893 sztot.first, // total size of all unambiguous ref chars
894 refparams, // reference read-in parameters
895 seed, // pseudo-random number generator seed
896 -1, // override offRate
897 verbose, // be talkative
898 autoMem, // pass exceptions up to the toplevel so that we can adjust memory settings automatically
899 sanity); // verify results and internal consistency
900 return make_pair(ebwtFw, ebwtBw);
901 }
902
903 /// Return true iff the Ebwt is packed
isPacked()904 bool isPacked() { return packed_; }
905
906 /**
907 * Write the rstarts array given the szs array for the reference.
908 */
909 void szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse);
910
911 /**
912 * Helper for the constructors above. Takes a vector of text
913 * strings and joins them into a single string with a call to
914 * joinToDisk, which does a join (with padding) and writes some of
915 * the resulting data directly to disk rather than keep it in
916 * memory. It then constructs a suffix-array producer (what kind
917 * depends on 'useBlockwise') for the resulting sequence. The
918 * suffix-array producer can then be used to obtain chunks of the
919 * joined string's suffix array.
920 */
921 template <typename TStr>
initFromVector(EList<FileBuf * > & is,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,ofstream & out1,ofstream & out2,const string & outfile,ofstream * saOut,ofstream * bwtOut,int nthreads,bool useBlockwise,TIndexOffU bmax,TIndexOffU bmaxSqrtMult,TIndexOffU bmaxDivN,int dcv,uint32_t seed,bool verbose)922 void initFromVector(EList<FileBuf*>& is,
923 EList<RefRecord>& szs,
924 TIndexOffU sztot,
925 const RefReadInParams& refparams,
926 ofstream& out1,
927 ofstream& out2,
928 const string& outfile,
929 ofstream* saOut,
930 ofstream* bwtOut,
931 int nthreads,
932 bool useBlockwise,
933 TIndexOffU bmax,
934 TIndexOffU bmaxSqrtMult,
935 TIndexOffU bmaxDivN,
936 int dcv,
937 uint32_t seed,
938 bool verbose)
939 {
940 // Compose text strings into single string
941 VMSG_NL("Calculating joined length");
942 TStr s; // holds the entire joined reference after call to joinToDisk
943 TIndexOffU jlen;
944 jlen = joinedLen(szs);
945 assert_geq(jlen, sztot);
946 VMSG_NL("Writing header");
947 writeFromMemory(true, out1, out2);
948 try {
949 VMSG_NL("Reserving space for joined string");
950 s.resize(jlen);
951 VMSG_NL("Joining reference sequences");
952 if(refparams.reverse == REF_READ_REVERSE) {
953 {
954 Timer timer(cout, " Time to join reference sequences: ", _verbose);
955 joinToDisk(is, szs, sztot, refparams, s, out1, out2);
956 }
957 {
958 Timer timer(cout, " Time to reverse reference sequence: ", _verbose);
959 EList<RefRecord> tmp(EBWT_CAT);
960 s.reverse();
961 reverseRefRecords(szs, tmp, false, verbose);
962 szsToDisk(tmp, out1, refparams.reverse);
963 }
964 } else {
965 Timer timer(cout, " Time to join reference sequences: ", _verbose);
966 joinToDisk(is, szs, sztot, refparams, s, out1, out2);
967 szsToDisk(szs, out1, refparams.reverse);
968 }
969 // Joined reference sequence now in 's'
970 } catch(bad_alloc& e) {
971 // If we throw an allocation exception in the try block,
972 // that means that the joined version of the reference
973 // string itself is too larger to fit in memory. The only
974 // alternatives are to tell the user to give us more memory
975 // or to try again with a packed representation of the
976 // reference (if we haven't tried that already).
977 cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
978 if(!isPacked() && _passMemExc) {
979 // Pass the exception up so that we can retry using a
980 // packed string representation
981 throw e;
982 }
983 // There's no point passing this exception on. The fact
984 // that we couldn't allocate the joined string means that
985 // --bmax is irrelevant - the user should re-run with
986 // ebwt-build-packed
987 if(isPacked()) {
988 cerr << "Please try running bowtie-build on a computer with more memory." << endl;
989 } else {
990 cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
991 << "mode (-a/--auto), or try again on a computer with more memory." << endl;
992 }
993 if(sizeof(void*) == 4) {
994 cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
995 << "this executable is 32-bit." << endl;
996 }
997 throw 1;
998 }
999 // Succesfully obtained joined reference string
1000 assert_geq(s.length(), jlen);
1001 if(bmax != OFF_MASK) {
1002 VMSG_NL("bmax according to bmax setting: " << bmax);
1003 }
1004 else if(bmaxSqrtMult != OFF_MASK) {
1005 bmax *= bmaxSqrtMult;
1006 VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
1007 }
1008 else if(bmaxDivN != OFF_MASK) {
1009 bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
1010 VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
1011 }
1012 else {
1013 bmax = (TIndexOffU)sqrt(s.length());
1014 VMSG_NL("bmax defaulted to: " << bmax);
1015 }
1016 int iter = 0;
1017 bool first = true;
1018 streampos out1pos = out1.tellp();
1019 streampos out2pos = out2.tellp();
1020 // Look for bmax/dcv parameters that work.
1021 while(true) {
1022 if(!first && bmax < 40 && _passMemExc) {
1023 cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
1024 if(!isPacked()) {
1025 // Throw an exception exception so that we can
1026 // retry using a packed string representation
1027 throw bad_alloc();
1028 } else {
1029 cerr << "Already tried a packed string representation." << endl;
1030 }
1031 cerr << "Please try indexing this reference on a computer with more memory." << endl;
1032 if(sizeof(void*) == 4) {
1033 cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
1034 << "this executable is 32-bit." << endl;
1035 }
1036 throw 1;
1037 }
1038 if(!first) {
1039 out1.seekp(out1pos);
1040 out2.seekp(out2pos);
1041 }
1042 if(dcv > 4096) dcv = 4096;
1043 if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
1044 dcv <<= 1; // double difference-cover period
1045 } else {
1046 bmax -= (bmax >> 2); // reduce by 25%
1047 }
1048 VMSG("Using parameters --bmax " << bmax);
1049 if(dcv == 0) {
1050 VMSG_NL(" and *no difference cover*");
1051 } else {
1052 VMSG_NL(" --dcv " << dcv);
1053 }
1054 iter++;
1055 try {
1056 {
1057 VMSG_NL(" Doing ahead-of-time memory usage test");
1058 // Make a quick-and-dirty attempt to force a bad_alloc iff
1059 // we would have thrown one eventually as part of
1060 // constructing the DifferenceCoverSample
1061 dcv <<= 1;
1062 TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
1063 if(nthreads > 1) sz *= (nthreads + 1);
1064 AutoArray<uint8_t> tmp(sz, EBWT_CAT);
1065 dcv >>= 1;
1066 // Likewise with the KarkkainenBlockwiseSA
1067 sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
1068 AutoArray<uint8_t> tmp2(sz, EBWT_CAT);
1069 // Now throw in the 'ftab' and 'isaSample' structures
1070 // that we'll eventually allocate in buildToDisk
1071 AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2, EBWT_CAT);
1072 AutoArray<uint8_t> side(_eh._sideSz, EBWT_CAT);
1073 // Grab another 20 MB out of caution
1074 AutoArray<uint32_t> extra(20*1024*1024, EBWT_CAT);
1075 // If we made it here without throwing bad_alloc, then we
1076 // passed the memory-usage stress test
1077 VMSG(" Passed! Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
1078 if(isPacked()) {
1079 VMSG(" --packed");
1080 }
1081 VMSG_NL("");
1082 }
1083 VMSG_NL("Constructing suffix-array element generator");
1084 KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
1085 assert(bsa.suffixItrIsReset());
1086 assert_eq(bsa.size(), s.length()+1);
1087 VMSG_NL("Converting suffix-array elements to index image");
1088 buildToDisk(bsa, s, out1, out2, saOut, bwtOut);
1089 out1.flush(); out2.flush();
1090 bool failed = out1.fail() || out2.fail();
1091 if(saOut != NULL) {
1092 saOut->flush();
1093 failed = failed || saOut->fail();
1094 }
1095 if(bwtOut != NULL) {
1096 bwtOut->flush();
1097 failed = failed || bwtOut->fail();
1098 }
1099 if(failed) {
1100 cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
1101 throw 1;
1102 }
1103 break;
1104 } catch(bad_alloc& e) {
1105 if(_passMemExc) {
1106 VMSG_NL(" Ran out of memory; automatically trying more memory-economical parameters.");
1107 } else {
1108 cerr << "Out of memory while constructing suffix array. Please try using a smaller" << endl
1109 << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
1110 throw 1;
1111 }
1112 }
1113 first = false;
1114 }
1115 assert(repOk());
1116 // Now write reference sequence names on the end
1117 assert_eq(this->_refnames.size(), this->_nPat);
1118 for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
1119 out1 << this->_refnames[i].c_str() << endl;
1120 }
1121 out1 << '\0';
1122 out1.flush(); out2.flush();
1123 if(out1.fail() || out2.fail()) {
1124 cerr << "An error occurred writing the index to disk. Please check if the disk is full." << endl;
1125 throw 1;
1126 }
1127 VMSG_NL("Returning from initFromVector");
1128 }
1129
1130 /**
1131 * Return the length that the joined string of the given string
1132 * list will have. Note that this is indifferent to how the text
1133 * fragments correspond to input sequences - it just cares about
1134 * the lengths of the fragments.
1135 */
joinedLen(EList<RefRecord> & szs)1136 TIndexOffU joinedLen(EList<RefRecord>& szs) {
1137 TIndexOffU ret = 0;
1138 for(unsigned int i = 0; i < szs.size(); i++) {
1139 ret += szs[i].len;
1140 }
1141 return ret;
1142 }
1143
1144 /// Destruct an Ebwt
~Ebwt()1145 ~Ebwt() {
1146 _fchr.reset();
1147 _ftab.reset();
1148 _eftab.reset();
1149 _plen.reset();
1150 _rstarts.reset();
1151 _offs.reset();
1152 _ebwt.reset();
1153 if(offs() != NULL && useShmem_) {
1154 FREE_SHARED(offs());
1155 }
1156 if(ebwt() != NULL && useShmem_) {
1157 FREE_SHARED(ebwt());
1158 }
1159 if (_in1 != NULL) fclose(_in1);
1160 if (_in2 != NULL) fclose(_in2);
1161 }
1162
1163 /// Accessors
eh()1164 inline const EbwtParams& eh() const { return _eh; }
zOff()1165 TIndexOffU zOff() const { return _zOff; }
zEbwtByteOff()1166 TIndexOffU zEbwtByteOff() const { return _zEbwtByteOff; }
zEbwtBpOff()1167 TIndexOff zEbwtBpOff() const { return _zEbwtBpOff; }
nPat()1168 TIndexOffU nPat() const { return _nPat; }
nFrag()1169 TIndexOffU nFrag() const { return _nFrag; }
fchr()1170 inline TIndexOffU* fchr() { return _fchr.get(); }
ftab()1171 inline TIndexOffU* ftab() { return _ftab.get(); }
eftab()1172 inline TIndexOffU* eftab() { return _eftab.get(); }
offs()1173 inline TIndexOffU* offs() { return _offs.get(); }
plen()1174 inline TIndexOffU* plen() { return _plen.get(); }
rstarts()1175 inline TIndexOffU* rstarts() { return _rstarts.get(); }
ebwt()1176 inline uint8_t* ebwt() { return _ebwt.get(); }
fchr()1177 inline const TIndexOffU* fchr() const { return _fchr.get(); }
ftab()1178 inline const TIndexOffU* ftab() const { return _ftab.get(); }
eftab()1179 inline const TIndexOffU* eftab() const { return _eftab.get(); }
offs()1180 inline const TIndexOffU* offs() const { return _offs.get(); }
plen()1181 inline const TIndexOffU* plen() const { return _plen.get(); }
rstarts()1182 inline const TIndexOffU* rstarts() const { return _rstarts.get(); }
ebwt()1183 inline const uint8_t* ebwt() const { return _ebwt.get(); }
verbose()1184 bool verbose() const { return _verbose; }
sanityCheck()1185 bool sanityCheck() const { return _sanity; }
refnames()1186 EList<string>& refnames() { return _refnames; }
fw()1187 bool fw() const { return fw_; }
1188 #ifdef POPCNT_CAPABILITY
1189 bool _usePOPCNTinstruction;
1190 #endif
1191
1192 /**
1193 * Returns true iff the index contains the given string (exactly). The
1194 * given string must contain only unambiguous characters. TODO:
1195 * support skipping of ambiguous characters.
1196 */
1197 bool contains(
1198 const BTDnaString& str,
1199 TIndexOffU *top = NULL,
1200 TIndexOffU *bot = NULL) const;
1201
1202 /**
1203 * Returns true iff the index contains the given string (exactly). The
1204 * given string must contain only unambiguous characters. TODO:
1205 * support skipping of ambiguous characters.
1206 */
1207 bool contains(
1208 const char *str,
1209 TIndexOffU *top = NULL,
1210 TIndexOffU *bot = NULL) const
1211 {
1212 return contains(BTDnaString(str, true), top, bot);
1213 }
1214
1215 /// Return true iff the Ebwt is currently in memory
isInMemory()1216 bool isInMemory() const {
1217 if(ebwt() != NULL) {
1218 // Note: We might have skipped loading _offs, _ftab,
1219 // _eftab, and _rstarts depending on whether this is the
1220 // reverse index and what algorithm is being used.
1221 assert(_eh.repOk());
1222 //assert(_ftab != NULL);
1223 //assert(_eftab != NULL);
1224 assert(fchr() != NULL);
1225 //assert(_offs != NULL);
1226 //assert(_rstarts != NULL);
1227 assert_neq(_zEbwtByteOff, OFF_MASK);
1228 assert_neq(_zEbwtBpOff, -1);
1229 return true;
1230 } else {
1231 assert(ftab() == NULL);
1232 assert(eftab() == NULL);
1233 assert(fchr() == NULL);
1234 assert(offs() == NULL);
1235 assert(rstarts() == NULL);
1236 assert_eq(_zEbwtByteOff, OFF_MASK);
1237 assert_eq(_zEbwtBpOff, -1);
1238 return false;
1239 }
1240 }
1241
1242 /// Return true iff the Ebwt is currently stored on disk
isEvicted()1243 bool isEvicted() const {
1244 return !isInMemory();
1245 }
1246
1247 /**
1248 * Load this Ebwt into memory by reading it in from the _in1 and
1249 * _in2 streams.
1250 */
loadIntoMemory(int color,int needEntireReverse,bool loadSASamp,bool loadFtab,bool loadRstarts,bool loadNames,bool verbose)1251 void loadIntoMemory(
1252 int color,
1253 int needEntireReverse,
1254 bool loadSASamp,
1255 bool loadFtab,
1256 bool loadRstarts,
1257 bool loadNames,
1258 bool verbose)
1259 {
1260 readIntoMemory(
1261 color, // expect index to be colorspace?
1262 needEntireReverse, // require reverse index to be concatenated reference reversed
1263 loadSASamp, // load the SA sample portion?
1264 loadFtab, // load the ftab (_ftab[] and _eftab[])?
1265 loadRstarts, // load the r-starts (_rstarts[])?
1266 false, // stop after loading the header portion?
1267 NULL, // params
1268 false, // mmSweep
1269 loadNames, // loadNames
1270 verbose); // startVerbose
1271 }
1272
1273 /**
1274 * Frees memory associated with the Ebwt.
1275 */
evictFromMemory()1276 void evictFromMemory() {
1277 assert(isInMemory());
1278 _fchr.free();
1279 _ftab.free();
1280 _eftab.free();
1281 _rstarts.free();
1282 _offs.free(); // might not be under control of APtrWrap
1283 _ebwt.free(); // might not be under control of APtrWrap
1284 // Keep plen; it's small and the client may want to seq it
1285 // even when the others are evicted.
1286 //_plen = NULL;
1287 _zEbwtByteOff = OFF_MASK;
1288 _zEbwtBpOff = -1;
1289 }
1290
1291 /**
1292 * Turn a substring of 'seq' starting at offset 'off' and having
1293 * length equal to the index's 'ftabChars' into an int that can be
1294 * used to index into the ftab array.
1295 */
ftabSeqToInt(const BTDnaString & seq,size_t off,bool rev)1296 TIndexOffU ftabSeqToInt(
1297 const BTDnaString& seq,
1298 size_t off,
1299 bool rev) const
1300 {
1301 int fc = _eh._ftabChars;
1302 size_t lo = off, hi = lo + fc;
1303 assert_leq(hi, seq.length());
1304 TIndexOffU ftabOff = 0;
1305 for(int i = 0; i < fc; i++) {
1306 bool fwex = fw();
1307 if(rev) fwex = !fwex;
1308 // We add characters to the ftabOff in the order they would
1309 // have been consumed in a normal search. For BWT, this
1310 // means right-to-left order; for BWT' it's left-to-right.
1311 int c = (fwex ? seq[lo + i] : seq[hi - i - 1]);
1312 if(c > 3) {
1313 return std::numeric_limits<TIndexOffU>::max();
1314 }
1315 assert_range(0, 3, c);
1316 ftabOff <<= 2;
1317 ftabOff |= c;
1318 }
1319 return ftabOff;
1320 }
1321
1322 /**
1323 * Non-static facade for static function ftabHi.
1324 */
ftabHi(TIndexOffU i)1325 TIndexOffU ftabHi(TIndexOffU i) const {
1326 return Ebwt::ftabHi(
1327 ftab(),
1328 eftab(),
1329 _eh._len,
1330 _eh._ftabLen,
1331 _eh._eftabLen,
1332 i);
1333 }
1334
1335 /**
1336 * Get "high interpretation" of ftab entry at index i. The high
1337 * interpretation of a regular ftab entry is just the entry
1338 * itself. The high interpretation of an extended entry is the
1339 * second correpsonding ui32 in the eftab.
1340 *
1341 * It's a static member because it's convenient to ask this
1342 * question before the Ebwt is fully initialized.
1343 */
ftabHi(const TIndexOffU * ftab,const TIndexOffU * eftab,TIndexOffU len,TIndexOffU ftabLen,TIndexOffU eftabLen,TIndexOffU i)1344 static TIndexOffU ftabHi(
1345 const TIndexOffU *ftab,
1346 const TIndexOffU *eftab,
1347 TIndexOffU len,
1348 TIndexOffU ftabLen,
1349 TIndexOffU eftabLen,
1350 TIndexOffU i)
1351 {
1352 assert_lt(i, ftabLen);
1353 if(ftab[i] <= len) {
1354 return ftab[i];
1355 } else {
1356 TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
1357 assert_lt(efIdx*2+1, eftabLen);
1358 return eftab[efIdx*2+1];
1359 }
1360 }
1361
1362 /**
1363 * Non-static facade for static function ftabLo.
1364 */
ftabLo(TIndexOffU i)1365 TIndexOffU ftabLo(TIndexOffU i) const {
1366 return Ebwt::ftabLo(
1367 ftab(),
1368 eftab(),
1369 _eh._len,
1370 _eh._ftabLen,
1371 _eh._eftabLen,
1372 i);
1373 }
1374
1375 /**
1376 * Get low bound of ftab range.
1377 */
ftabLo(const BTDnaString & seq,size_t off)1378 TIndexOffU ftabLo(const BTDnaString& seq, size_t off) const {
1379 return ftabLo(ftabSeqToInt(seq, off, false));
1380 }
1381
1382 /**
1383 * Get high bound of ftab range.
1384 */
ftabHi(const BTDnaString & seq,size_t off)1385 TIndexOffU ftabHi(const BTDnaString& seq, size_t off) const {
1386 return ftabHi(ftabSeqToInt(seq, off, false));
1387 }
1388
1389 /**
1390 * Extract characters from seq starting at offset 'off' and going either
1391 * forward or backward, depending on 'rev'. Order matters when compiling
1392 * the integer that gets looked up in the ftab. Each successive character
1393 * is ORed into the least significant bit-pair, and characters are
1394 * integrated in the direction of the search.
1395 */
1396 bool
ftabLoHi(const BTDnaString & seq,size_t off,bool rev,TIndexOffU & top,TIndexOffU & bot)1397 ftabLoHi(
1398 const BTDnaString& seq, // sequence to extract from
1399 size_t off, // offset into seq to begin extracting
1400 bool rev, // reverse while extracting
1401 TIndexOffU& top,
1402 TIndexOffU& bot) const
1403 {
1404 TIndexOffU fi = ftabSeqToInt(seq, off, rev);
1405 if(fi == std::numeric_limits<TIndexOffU>::max()) {
1406 return false;
1407 }
1408 top = ftabHi(fi);
1409 bot = ftabLo(fi+1);
1410 assert_geq(bot, top);
1411 return true;
1412 }
1413
1414 /**
1415 * Get "low interpretation" of ftab entry at index i. The low
1416 * interpretation of a regular ftab entry is just the entry
1417 * itself. The low interpretation of an extended entry is the
1418 * first correpsonding ui32 in the eftab.
1419 *
1420 * It's a static member because it's convenient to ask this
1421 * question before the Ebwt is fully initialized.
1422 */
ftabLo(const TIndexOffU * ftab,const TIndexOffU * eftab,TIndexOffU len,TIndexOffU ftabLen,TIndexOffU eftabLen,TIndexOffU i)1423 static TIndexOffU ftabLo(
1424 const TIndexOffU *ftab,
1425 const TIndexOffU *eftab,
1426 TIndexOffU len,
1427 TIndexOffU ftabLen,
1428 TIndexOffU eftabLen,
1429 TIndexOffU i)
1430 {
1431 assert_lt(i, ftabLen);
1432 if(ftab[i] <= len) {
1433 return ftab[i];
1434 } else {
1435 TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
1436 assert_lt(efIdx*2+1, eftabLen);
1437 return eftab[efIdx*2];
1438 }
1439 }
1440
1441 /**
1442 * Try to resolve the reference offset of the BW element 'elt'. If
1443 * it can be resolved immediately, return the reference offset. If
1444 * it cannot be resolved immediately, return max value.
1445 */
tryOffset(TIndexOffU elt)1446 TIndexOffU tryOffset(TIndexOffU elt) const {
1447 assert(offs() != NULL);
1448 if(elt == _zOff) return 0;
1449 if((elt & _eh._offMask) == elt) {
1450 TIndexOffU eltOff = elt >> _eh._offRate;
1451 assert_lt(eltOff, _eh._offsLen);
1452 TIndexOffU off = offs()[eltOff];
1453 assert_neq(OFF_MASK, off);
1454 return off;
1455 } else {
1456 // Try looking at zoff
1457 return OFF_MASK;
1458 }
1459 }
1460
1461 /**
1462 * Try to resolve the reference offset of the BW element 'elt' such
1463 * that the offset returned is at the right-hand side of the
1464 * forward reference substring involved in the hit.
1465 */
tryOffset(TIndexOffU elt,bool fw,TIndexOffU hitlen)1466 TIndexOffU tryOffset(
1467 TIndexOffU elt,
1468 bool fw,
1469 TIndexOffU hitlen) const
1470 {
1471 TIndexOffU off = tryOffset(elt);
1472 if(off != OFF_MASK && !fw) {
1473 assert_lt(off, _eh._len);
1474 off = _eh._len - off - 1;
1475 assert_geq(off, hitlen-1);
1476 off -= (hitlen-1);
1477 assert_lt(off, _eh._len);
1478 }
1479 return off;
1480 }
1481
1482 /**
1483 * Walk 'steps' steps to the left and return the row arrived at.
1484 */
1485 TIndexOffU walkLeft(TIndexOffU row, TIndexOffU steps) const;
1486
1487 /**
1488 * Resolve the reference offset of the BW element 'elt'.
1489 */
1490 TIndexOffU getOffset(TIndexOffU row) const;
1491
1492 /**
1493 * Resolve the reference offset of the BW element 'elt' such that
1494 * the offset returned is at the right-hand side of the forward
1495 * reference substring involved in the hit.
1496 */
1497 TIndexOffU getOffset(
1498 TIndexOffU elt,
1499 bool fw,
1500 TIndexOffU hitlen) const;
1501
1502 /**
1503 * When using read() to create an Ebwt, we have to set a couple of
1504 * additional fields in the Ebwt object that aren't part of the
1505 * parameter list and are not stored explicitly in the file. Right
1506 * now, this just involves initializing _zEbwtByteOff and
1507 * _zEbwtBpOff from _zOff.
1508 */
postReadInit(EbwtParams & eh)1509 void postReadInit(EbwtParams& eh) {
1510 TIndexOffU sideNum = _zOff / eh._sideBwtLen;
1511 TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
1512 TIndexOffU sideByteOff = sideNum * eh._sideSz;
1513 _zEbwtByteOff = sideCharOff >> 2;
1514 assert_lt(_zEbwtByteOff, eh._sideBwtSz);
1515 _zEbwtBpOff = sideCharOff & 3;
1516 assert_lt(_zEbwtBpOff, 4);
1517 _zEbwtByteOff += sideByteOff;
1518 assert(repOk(eh)); // Ebwt should be fully initialized now
1519 }
1520
1521 /**
1522 * Given basename of an Ebwt index, read and return its flag.
1523 */
1524 static int32_t readFlags(const string& instr);
1525
1526 /**
1527 * Pretty-print the Ebwt to the given output stream.
1528 */
print(ostream & out)1529 void print(ostream& out) const {
1530 print(out, _eh);
1531 }
1532
1533 /**
1534 * Pretty-print the Ebwt and given EbwtParams to the given output
1535 * stream.
1536 */
print(ostream & out,const EbwtParams & eh)1537 void print(ostream& out, const EbwtParams& eh) const {
1538 eh.print(out); // print params
1539 out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
1540 << " zOff: " << _zOff << endl
1541 << " zEbwtByteOff: " << _zEbwtByteOff << endl
1542 << " zEbwtBpOff: " << _zEbwtBpOff << endl
1543 << " nPat: " << _nPat << endl
1544 << " plen: ";
1545 if(plen() == NULL) {
1546 out << "NULL" << endl;
1547 } else {
1548 out << "non-NULL, [0] = " << plen()[0] << endl;
1549 }
1550 out << " rstarts: ";
1551 if(rstarts() == NULL) {
1552 out << "NULL" << endl;
1553 } else {
1554 out << "non-NULL, [0] = " << rstarts()[0] << endl;
1555 }
1556 out << " ebwt: ";
1557 if(ebwt() == NULL) {
1558 out << "NULL" << endl;
1559 } else {
1560 out << "non-NULL, [0] = " << ebwt()[0] << endl;
1561 }
1562 out << " fchr: ";
1563 if(fchr() == NULL) {
1564 out << "NULL" << endl;
1565 } else {
1566 out << "non-NULL, [0] = " << fchr()[0] << endl;
1567 }
1568 out << " ftab: ";
1569 if(ftab() == NULL) {
1570 out << "NULL" << endl;
1571 } else {
1572 out << "non-NULL, [0] = " << ftab()[0] << endl;
1573 }
1574 out << " eftab: ";
1575 if(eftab() == NULL) {
1576 out << "NULL" << endl;
1577 } else {
1578 out << "non-NULL, [0] = " << eftab()[0] << endl;
1579 }
1580 out << " offs: ";
1581 if(offs() == NULL) {
1582 out << "NULL" << endl;
1583 } else {
1584 out << "non-NULL, [0] = " << offs()[0] << endl;
1585 }
1586 }
1587
1588 // Building
1589 template <typename TStr> static TStr join(EList<TStr>& l, uint32_t seed);
1590 template <typename TStr> static TStr join(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
1591 template <typename TStr> void joinToDisk(EList<FileBuf*>& l, EList<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2);
1592 template <typename TStr> void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2, ostream* saOut, ostream* bwtOut);
1593
1594 // I/O
1595 void readIntoMemory(int color, int needEntireRev, bool loadSASamp, bool loadFtab, bool loadRstarts, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
1596 void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
1597 void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;
1598
1599 // Sanity checking
1600 void sanityCheckUpToSide(TIndexOff upToSide) const;
1601 void sanityCheckAll(int reverse) const;
1602 void restore(SString<char>& s) const;
1603 void checkOrigs(const EList<SString<char> >& os, bool color, bool mirror) const;
1604
1605 // Searching and reporting
1606 void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen, bool rejectStraddle, bool& straddled) const;
1607
1608 #define WITHIN_BWT_LEN(x) \
1609 assert_leq(x[0], this->_eh._sideBwtLen); \
1610 assert_leq(x[1], this->_eh._sideBwtLen); \
1611 assert_leq(x[2], this->_eh._sideBwtLen); \
1612 assert_leq(x[3], this->_eh._sideBwtLen)
1613
1614 #define WITHIN_FCHR(x) \
1615 assert_leq(x[0], this->fchr()[1]); \
1616 assert_leq(x[1], this->fchr()[2]); \
1617 assert_leq(x[2], this->fchr()[3]); \
1618 assert_leq(x[3], this->fchr()[4])
1619
1620 #define WITHIN_FCHR_DOLLARA(x) \
1621 assert_leq(x[0], this->fchr()[1]+1); \
1622 assert_leq(x[1], this->fchr()[2]); \
1623 assert_leq(x[2], this->fchr()[3]); \
1624 assert_leq(x[3], this->fchr()[4])
1625
1626 /**
1627 * Count all occurrences of character c from the beginning of the
1628 * forward side to <by,bp> and add in the occ[] count up to the side
1629 * break just prior to the side.
1630 *
1631 * A Bowtie 2 side is shaped like:
1632 *
1633 * XXXXXXXXXXXXXXXX [A] [C] [G] [T]
1634 * --------48------ -4- -4- -4- -4- (numbers in bytes)
1635 */
countBt2Side(const SideLocus & l,int c)1636 inline TIndexOffU countBt2Side(const SideLocus& l, int c) const {
1637 assert_range(0, 3, c);
1638 assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1639 assert_range(0, 3, (int)l._bp);
1640 const uint8_t *side = l.side(this->ebwt());
1641 TIndexOffU cCnt = countUpTo(l, c);
1642 assert_leq(cCnt, l.toBWRow());
1643 assert_leq(cCnt, this->_eh._sideBwtLen);
1644 if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1645 // Adjust for the fact that we represented $ with an 'A', but
1646 // shouldn't count it as an 'A' here
1647 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1648 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1649 {
1650 cCnt--; // Adjust for '$' looking like an 'A'
1651 }
1652 }
1653 TIndexOffU ret;
1654 // Now factor in the occ[] count at the side break
1655 const uint8_t *acgt8 = side + _eh._sideBwtSz;
1656 const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt8);
1657 assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
1658 assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1659 assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1660 assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1661 ret = endianizeU<TIndexOffU>(acgt[c], _switchEndian) + cCnt + this->fchr()[c];
1662 #ifndef NDEBUG
1663 assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section
1664 if(c == 0) {
1665 assert_leq(cCnt, this->_eh._sideBwtLen);
1666 } else {
1667 assert_leq(ret, this->_eh._bwtLen);
1668 }
1669 #endif
1670 return ret;
1671 }
1672
1673 /**
1674 * Count all occurrences of all four nucleotides up to the starting
1675 * point (which must be in a forward side) given by 'l' storing the
1676 * result in 'cntsUpto', then count nucleotide occurrences within the
1677 * range of length 'num' storing the result in 'cntsIn'. Also, keep
1678 * track of the characters occurring within the range by setting
1679 * 'masks' accordingly (masks[1][10] == true -> 11th character is a
1680 * 'C', and masks[0][10] == masks[2][10] == masks[3][10] == false.
1681 */
countBt2SideRange(SideLocus & l,TIndexOffU num,TIndexOffU * cntsUpto,TIndexOffU * cntsIn,EList<bool> * masks)1682 inline void countBt2SideRange(
1683 SideLocus& l, // top locus
1684 TIndexOffU num, // number of elts in range to tall // @double-check
1685 TIndexOffU* cntsUpto, // A/C/G/T counts up to top
1686 TIndexOffU* cntsIn, // A/C/G/T counts within range
1687 EList<bool> *masks) const // masks indicating which range elts = A/C/G/T
1688 {
1689 assert_gt(num, 0);
1690 assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1691 assert_range(0, 3, (int)l._bp);
1692 countUpToEx(l, cntsUpto);
1693 WITHIN_FCHR_DOLLARA(cntsUpto);
1694 WITHIN_BWT_LEN(cntsUpto);
1695 const uint8_t *side = l.side(this->ebwt());
1696 if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1697 // Adjust for the fact that we represented $ with an 'A', but
1698 // shouldn't count it as an 'A' here
1699 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1700 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1701 {
1702 cntsUpto[0]--; // Adjust for '$' looking like an 'A'
1703 }
1704 }
1705 // Now factor in the occ[] count at the side break
1706 const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(side + _eh._sideBwtSz);
1707 assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
1708 assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
1709 assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
1710 assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
1711 assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
1712 assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1713 assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1714 assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1715 cntsUpto[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
1716 cntsUpto[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
1717 cntsUpto[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
1718 cntsUpto[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
1719 masks[0].resize(num);
1720 masks[1].resize(num);
1721 masks[2].resize(num);
1722 masks[3].resize(num);
1723 WITHIN_FCHR_DOLLARA(cntsUpto);
1724 WITHIN_FCHR_DOLLARA(cntsIn);
1725 // 'cntsUpto' is complete now.
1726 // Walk forward until we've tallied the entire 'In' range
1727 TIndexOffU nm = 0;
1728 // Rest of this side
1729 nm += countBt2SideRange2(l, true, num - nm, cntsIn, masks, nm);
1730 assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1731 assert_leq(nm, num);
1732 SideLocus lcopy = l;
1733 while(nm < num) {
1734 // Subsequent sides, if necessary
1735 lcopy.nextSide(this->_eh);
1736 nm += countBt2SideRange2(lcopy, false, num - nm, cntsIn, masks, nm);
1737 WITHIN_FCHR_DOLLARA(cntsIn);
1738 assert_leq(nm, num);
1739 assert_eq(nm, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1740 }
1741 assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
1742 WITHIN_FCHR_DOLLARA(cntsIn);
1743 }
1744
1745 /**
1746 * Count all occurrences of character c from the beginning of the
1747 * forward side to <by,bp> and add in the occ[] count up to the side
1748 * break just prior to the side.
1749 *
1750 * A forward side is shaped like:
1751 *
1752 * [A] [C] XXXXXXXXXXXXXXXX
1753 * -4- -4- --------56------ (numbers in bytes)
1754 * ^
1755 * Side ptr (result from SideLocus.side())
1756 *
1757 * And following it is a reverse side shaped like:
1758 *
1759 * [G] [T] XXXXXXXXXXXXXXXX
1760 * -4- -4- --------56------ (numbers in bytes)
1761 * ^
1762 * Side ptr (result from SideLocus.side())
1763 *
1764 */
countBt2SideEx(const SideLocus & l,TIndexOffU * arrs)1765 inline void countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const {
1766 assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
1767 assert_range(0, 3, (int)l._bp);
1768 countUpToEx(l, arrs);
1769 if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
1770 // Adjust for the fact that we represented $ with an 'A', but
1771 // shouldn't count it as an 'A' here
1772 if((l._sideByteOff + l._by > _zEbwtByteOff) ||
1773 (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
1774 {
1775 arrs[0]--; // Adjust for '$' looking like an 'A'
1776 }
1777 }
1778 WITHIN_FCHR(arrs);
1779 WITHIN_BWT_LEN(arrs);
1780 // Now factor in the occ[] count at the side break
1781 const uint8_t *side = l.side(this->ebwt());
1782 const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4;
1783 const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt16);
1784 assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->fchr()[1] + this->_eh.sideBwtLen());
1785 assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->fchr()[2]-this->fchr()[1]);
1786 assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->fchr()[3]-this->fchr()[2]);
1787 assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->fchr()[4]-this->fchr()[3]);
1788 assert_leq(endianizeU<TIndexOffU>(acgt[0], _switchEndian), this->_eh._len + this->_eh.sideBwtLen());
1789 assert_leq(endianizeU<TIndexOffU>(acgt[1], _switchEndian), this->_eh._len);
1790 assert_leq(endianizeU<TIndexOffU>(acgt[2], _switchEndian), this->_eh._len);
1791 assert_leq(endianizeU<TIndexOffU>(acgt[3], _switchEndian), this->_eh._len);
1792 arrs[0] += (endianizeU<TIndexOffU>(acgt[0], _switchEndian) + this->fchr()[0]);
1793 arrs[1] += (endianizeU<TIndexOffU>(acgt[1], _switchEndian) + this->fchr()[1]);
1794 arrs[2] += (endianizeU<TIndexOffU>(acgt[2], _switchEndian) + this->fchr()[2]);
1795 arrs[3] += (endianizeU<TIndexOffU>(acgt[3], _switchEndian) + this->fchr()[3]);
1796 WITHIN_FCHR(arrs);
1797 }
1798
1799 /**
1800 * Counts the number of occurrences of character 'c' in the given Ebwt
1801 * side up to (but not including) the given byte/bitpair (by/bp).
1802 *
1803 * This is a performance-critical function. This is the top search-
1804 * related hit in the time profile.
1805 *
1806 * Function gets 11.09% in profile
1807 */
countUpTo(const SideLocus & l,int c)1808 inline TIndexOffU countUpTo(const SideLocus& l, int c) const { // @double-check
1809 // Count occurrences of c in each 64-bit (using bit trickery);
1810 // Someday countInU64() and pop() functions should be
1811 // vectorized/SSE-ized in case that helps.
1812 TIndexOffU cCnt = 0;
1813 const uint8_t *side = l.side(this->ebwt());
1814 int i = 0;
1815 #ifdef POPCNT_CAPABILITY
1816 if ( _usePOPCNTinstruction) {
1817 for(; i + 7 < l._by; i += 8) {
1818 cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
1819 }
1820 }
1821 else {
1822 for(; i + 7 < l._by; i += 8) {
1823 cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
1824 }
1825 }
1826 #else
1827 for(; i + 7 < l._by; i += 8) {
1828 cCnt += countInU64(c, *(uint64_t*)&side[i]);
1829 }
1830 #endif
1831 // Count occurences of c in the rest of the side (using LUT)
1832 for(; i < l._by; i++) {
1833 cCnt += cCntLUT_4[0][c][side[i]];
1834 }
1835 // Count occurences of c in the rest of the byte
1836 if(l._bp > 0) {
1837 cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
1838 }
1839 return cCnt;
1840 }
1841
1842 /**
1843 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
1844 * within a 64-bit argument.
1845 *
1846 * Function gets 2.32% in profile
1847 */
1848 #ifdef POPCNT_CAPABILITY
1849 template<typename Operation>
1850 #endif
countInU64Ex(uint64_t dw,TIndexOffU * arrs)1851 inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
1852 uint64_t c0 = c_table[0];
1853 uint64_t x0 = dw ^ c0;
1854 uint64_t x1 = (x0 >> 1);
1855 uint64_t x2 = x1 & (0x5555555555555555llu);
1856 uint64_t x3 = x0 & x2;
1857 #ifdef POPCNT_CAPABILITY
1858 uint64_t tmp = Operation().pop64(x3);
1859 #else
1860 uint64_t tmp = pop64(x3);
1861 #endif
1862 arrs[0] += (uint32_t) tmp;
1863
1864 c0 = c_table[1];
1865 x0 = dw ^ c0;
1866 x1 = (x0 >> 1);
1867 x2 = x1 & (0x5555555555555555llu);
1868 x3 = x0 & x2;
1869 #ifdef POPCNT_CAPABILITY
1870 tmp = Operation().pop64(x3);
1871 #else
1872 tmp = pop64(x3);
1873 #endif
1874 arrs[1] += (uint32_t) tmp;
1875
1876 c0 = c_table[2];
1877 x0 = dw ^ c0;
1878 x1 = (x0 >> 1);
1879 x2 = x1 & (0x5555555555555555llu);
1880 x3 = x0 & x2;
1881 #ifdef POPCNT_CAPABILITY
1882 tmp = Operation().pop64(x3);
1883 #else
1884 tmp = pop64(x3);
1885 #endif
1886 arrs[2] += (uint32_t) tmp;
1887
1888 c0 = c_table[3];
1889 x0 = dw ^ c0;
1890 x1 = (x0 >> 1);
1891 x2 = x1 & (0x5555555555555555llu);
1892 x3 = x0 & x2;
1893 #ifdef POPCNT_CAPABILITY
1894 tmp = Operation().pop64(x3);
1895 #else
1896 tmp = pop64(x3);
1897 #endif
1898 arrs[3] += (uint32_t) tmp;
1899 }
1900
1901 /**
1902 * Counts the number of occurrences of all four nucleotides in the
1903 * given side up to (but not including) the given byte/bitpair (by/bp).
1904 * Count for 'a' goes in arrs[0], 'c' in arrs[1], etc.
1905 */
countUpToEx(const SideLocus & l,TIndexOffU * arrs)1906 inline void countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
1907 int i = 0;
1908 // Count occurrences of each nucleotide in each 64-bit word using
1909 // bit trickery; note: this seems does not seem to lend a
1910 // significant boost to performance in practice. If you comment
1911 // out this whole loop (which won't affect correctness - it will
1912 // just cause the following loop to take up the slack) then runtime
1913 // does not change noticeably. Someday the countInU64() and pop()
1914 // functions should be vectorized/SSE-ized in case that helps.
1915 const uint8_t *side = l.side(this->ebwt());
1916
1917 #ifdef POPCNT_CAPABILITY
1918 if (_usePOPCNTinstruction) {
1919 for(; i+7 < l._by; i += 8) {
1920 countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
1921 }
1922 }
1923 else {
1924 for(; i+7 < l._by; i += 8) {
1925 countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
1926 }
1927 }
1928 #else
1929 for(; i+7 < l._by; i += 8) {
1930 countInU64Ex(*(uint64_t*)&side[i], arrs);
1931 }
1932 #endif
1933 // Count occurences of nucleotides in the rest of the side (using LUT)
1934 // Many cache misses on following lines (~20K)
1935 for(; i < l._by; i++) {
1936 arrs[0] += cCntLUT_4[0][0][side[i]];
1937 arrs[1] += cCntLUT_4[0][1][side[i]];
1938 arrs[2] += cCntLUT_4[0][2][side[i]];
1939 arrs[3] += cCntLUT_4[0][3][side[i]];
1940 }
1941 // Count occurences of c in the rest of the byte
1942 if(l._bp > 0) {
1943 arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
1944 arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
1945 arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
1946 arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
1947 }
1948 }
1949
1950 #ifndef NDEBUG
1951 /**
1952 * Given top and bot loci, calculate counts of all four DNA chars up to
1953 * those loci. Used for more advanced backtracking-search.
1954 */
1955 inline void mapLFEx(
1956 const SideLocus& l,
1957 TIndexOffU *arrs
1958 ASSERT_ONLY(, bool overrideSanity = false)
1959 ) const
1960 {
1961 assert_eq(0, arrs[0]);
1962 assert_eq(0, arrs[1]);
1963 assert_eq(0, arrs[2]);
1964 assert_eq(0, arrs[3]);
1965 countBt2SideEx(l, arrs);
1966 if(_sanity && !overrideSanity) {
1967 // Make sure results match up with individual calls to mapLF;
1968 // be sure to override sanity-checking in the callee, or we'll
1969 // have infinite recursion
1970 assert_eq(mapLF(l, 0, true), arrs[0]);
1971 assert_eq(mapLF(l, 1, true), arrs[1]);
1972 assert_eq(mapLF(l, 2, true), arrs[2]);
1973 assert_eq(mapLF(l, 3, true), arrs[3]);
1974 }
1975 }
1976 #endif
1977
1978 /**
1979 * Given top and bot rows, calculate counts of all four DNA chars up to
1980 * those loci.
1981 */
1982 inline void mapLFEx(
1983 TIndexOffU top,
1984 TIndexOffU bot,
1985 TIndexOffU *tops,
1986 TIndexOffU *bots
1987 ASSERT_ONLY(, bool overrideSanity = false)
1988 ) const
1989 {
1990 SideLocus ltop, lbot;
1991 SideLocus::initFromTopBot(top, bot, _eh, ebwt(), ltop, lbot);
1992 mapLFEx(ltop, lbot, tops, bots ASSERT_ONLY(, overrideSanity));
1993 }
1994
1995 /**
1996 * Given top and bot loci, calculate counts of all four DNA chars up to
1997 * those loci. Used for more advanced backtracking-search.
1998 */
1999 inline void mapLFEx(
2000 const SideLocus& ltop,
2001 const SideLocus& lbot,
2002 TIndexOffU *tops,
2003 TIndexOffU *bots
2004 ASSERT_ONLY(, bool overrideSanity = false)
2005 ) const
2006 {
2007 assert(ltop.repOk(this->eh()));
2008 assert(lbot.repOk(this->eh()));
2009 assert_eq(0, tops[0]); assert_eq(0, bots[0]);
2010 assert_eq(0, tops[1]); assert_eq(0, bots[1]);
2011 assert_eq(0, tops[2]); assert_eq(0, bots[2]);
2012 assert_eq(0, tops[3]); assert_eq(0, bots[3]);
2013 countBt2SideEx(ltop, tops);
2014 countBt2SideEx(lbot, bots);
2015 #ifndef NDEBUG
2016 if(_sanity && !overrideSanity) {
2017 // Make sure results match up with individual calls to mapLF;
2018 // be sure to override sanity-checking in the callee, or we'll
2019 // have infinite recursion
2020 assert_eq(mapLF(ltop, 0, true), tops[0]);
2021 assert_eq(mapLF(ltop, 1, true), tops[1]);
2022 assert_eq(mapLF(ltop, 2, true), tops[2]);
2023 assert_eq(mapLF(ltop, 3, true), tops[3]);
2024 assert_eq(mapLF(lbot, 0, true), bots[0]);
2025 assert_eq(mapLF(lbot, 1, true), bots[1]);
2026 assert_eq(mapLF(lbot, 2, true), bots[2]);
2027 assert_eq(mapLF(lbot, 3, true), bots[3]);
2028 }
2029 #endif
2030 }
2031
2032 /**
2033 * Counts the number of occurrences of all four nucleotides in the
2034 * given side from the given byte/bitpair (l->_by/l->_bp) (or the
2035 * beginning of the side if l == 0). Count for 'a' goes in arrs[0],
2036 * 'c' in arrs[1], etc.
2037 *
2038 * Note: must account for $.
2039 *
2040 * Must fill in masks
2041 */
countBt2SideRange2(const SideLocus & l,bool startAtLocus,TIndexOffU num,TIndexOffU * arrs,EList<bool> * masks,TIndexOffU maskOff)2042 inline TIndexOffU countBt2SideRange2( // @double-check
2043 const SideLocus& l,
2044 bool startAtLocus,
2045 TIndexOffU num,
2046 TIndexOffU* arrs,
2047 EList<bool> *masks,
2048 TIndexOffU maskOff) const
2049 {
2050 assert(!masks[0].empty());
2051 assert_eq(masks[0].size(), masks[1].size());
2052 assert_eq(masks[0].size(), masks[2].size());
2053 assert_eq(masks[0].size(), masks[3].size());
2054 ASSERT_ONLY(TIndexOffU myarrs[4] = {0, 0, 0, 0});
2055 TIndexOffU nm = 0; // number of nucleotides tallied so far
2056 int iby = 0; // initial byte offset
2057 int ibp = 0; // initial base-pair offset
2058 if(startAtLocus) {
2059 iby = l._by;
2060 ibp = l._bp;
2061 } else {
2062 // Start at beginning
2063 }
2064 int by = iby, bp = ibp;
2065 assert_lt(bp, 4);
2066 assert_lt(by, (int)this->_eh._sideBwtSz);
2067 const uint8_t *side = l.side(this->ebwt());
2068 while(nm < num) {
2069 int c = (side[by] >> (bp * 2)) & 3;
2070 assert_lt(maskOff + nm, masks[c].size());
2071 masks[0][maskOff + nm] = masks[1][maskOff + nm] =
2072 masks[2][maskOff + nm] = masks[3][maskOff + nm] = false;
2073 assert_range(0, 3, c);
2074 // Note: we tally $ just like an A
2075 arrs[c]++; // tally it
2076 ASSERT_ONLY(myarrs[c]++);
2077 masks[c][maskOff + nm] = true; // not dead
2078 nm++;
2079 if(++bp == 4) {
2080 bp = 0;
2081 by++;
2082 assert_leq(by, (int)this->_eh._sideBwtSz);
2083 if(by == (int)this->_eh._sideBwtSz) {
2084 // Fell off the end of the side
2085 break;
2086 }
2087 }
2088 }
2089 WITHIN_FCHR_DOLLARA(arrs);
2090 #ifndef NDEBUG
2091 if(_sanity) {
2092 // Make sure results match up with a call to mapLFEx.
2093 TIndexOffU tops[4] = {0, 0, 0, 0};
2094 TIndexOffU bots[4] = {0, 0, 0, 0};
2095 TIndexOffU top = l.toBWRow();
2096 TIndexOffU bot = top + nm;
2097 mapLFEx(top, bot, tops, bots, false);
2098 assert(myarrs[0] == (bots[0] - tops[0]) || myarrs[0] == (bots[0] - tops[0])+1);
2099 assert_eq(myarrs[1], bots[1] - tops[1]);
2100 assert_eq(myarrs[2], bots[2] - tops[2]);
2101 assert_eq(myarrs[3], bots[3] - tops[3]);
2102 }
2103 #endif
2104 return nm;
2105 }
2106
2107 /**
2108 * Return the final character in row i (i.e. the i'th character in the
2109 * BWT transform). Note that the 'L' in the name of the function
2110 * stands for 'last', as in the literature.
2111 */
rowL(const SideLocus & l)2112 inline int rowL(const SideLocus& l) const {
2113 // Extract and return appropriate bit-pair
2114 return unpack_2b_from_8b(l.side(this->ebwt())[l._by], l._bp);
2115 }
2116
2117 /**
2118 * Return the final character in row i (i.e. the i'th character in the
2119 * BWT transform). Note that the 'L' in the name of the function
2120 * stands for 'last', as in the literature.
2121 */
rowL(TIndexOffU i)2122 inline int rowL(TIndexOffU i) const {
2123 // Extract and return appropriate bit-pair
2124 SideLocus l;
2125 l.initFromRow(i, _eh, ebwt());
2126 return rowL(l);
2127 }
2128
2129 /**
2130 * Given top and bot loci, calculate counts of all four DNA chars up to
2131 * those loci. Used for more advanced backtracking-search.
2132 */
2133 inline void mapLFRange(
2134 SideLocus& ltop,
2135 SideLocus& lbot,
2136 TIndexOffU num, // Number of elts
2137 TIndexOffU* cntsUpto, // A/C/G/T counts up to top
2138 TIndexOffU* cntsIn, // A/C/G/T counts within range
2139 EList<bool> *masks
2140 ASSERT_ONLY(, bool overrideSanity = false)
2141 ) const
2142 {
2143 assert(ltop.repOk(this->eh()));
2144 assert(lbot.repOk(this->eh()));
2145 assert_eq(num, lbot.toBWRow() - ltop.toBWRow());
2146 assert_eq(0, cntsUpto[0]); assert_eq(0, cntsIn[0]);
2147 assert_eq(0, cntsUpto[1]); assert_eq(0, cntsIn[1]);
2148 assert_eq(0, cntsUpto[2]); assert_eq(0, cntsIn[2]);
2149 assert_eq(0, cntsUpto[3]); assert_eq(0, cntsIn[3]);
2150 countBt2SideRange(ltop, num, cntsUpto, cntsIn, masks);
2151 assert_eq(num, cntsIn[0] + cntsIn[1] + cntsIn[2] + cntsIn[3]);
2152 #ifndef NDEBUG
2153 if(_sanity && !overrideSanity) {
2154 // Make sure results match up with individual calls to mapLF;
2155 // be sure to override sanity-checking in the callee, or we'll
2156 // have infinite recursion
2157 TIndexOffU tops[4] = {0, 0, 0, 0};
2158 TIndexOffU bots[4] = {0, 0, 0, 0};
2159 assert(ltop.repOk(this->eh()));
2160 assert(lbot.repOk(this->eh()));
2161 mapLFEx(ltop, lbot, tops, bots, false);
2162 for(int i = 0; i < 4; i++) {
2163 assert(cntsUpto[i] == tops[i] || tops[i] == bots[i]);
2164 if(i == 0) {
2165 assert(cntsIn[i] == bots[i]-tops[i] ||
2166 cntsIn[i] == bots[i]-tops[i]+1);
2167 } else {
2168 assert_eq(cntsIn[i], bots[i]-tops[i]);
2169 }
2170 }
2171 }
2172 #endif
2173 }
2174
2175 /**
2176 * Given row i, return the row that the LF mapping maps i to.
2177 */
2178 inline TIndexOffU mapLF(
2179 const SideLocus& l
2180 ASSERT_ONLY(, bool overrideSanity = false)
2181 ) const
2182 {
2183 ASSERT_ONLY(TIndexOffU srcrow = l.toBWRow());
2184 TIndexOffU ret;
2185 assert(l.side(this->ebwt()) != NULL);
2186 int c = rowL(l);
2187 assert_lt(c, 4);
2188 assert_geq(c, 0);
2189 ret = countBt2Side(l, c);
2190 assert_lt(ret, this->_eh._bwtLen);
2191 assert_neq(srcrow, ret);
2192 #ifndef NDEBUG
2193 if(_sanity && !overrideSanity) {
2194 // Make sure results match up with results from mapLFEx;
2195 // be sure to override sanity-checking in the callee, or we'll
2196 // have infinite recursion
2197 TIndexOffU arrs[] = { 0, 0, 0, 0 };
2198 mapLFEx(l, arrs, true);
2199 assert_eq(arrs[c], ret);
2200 }
2201 #endif
2202 return ret;
2203 }
2204
2205 /**
2206 * Given row i and character c, return the row that the LF mapping maps
2207 * i to on character c.
2208 */
2209 inline TIndexOffU mapLF(
2210 const SideLocus& l, int c
2211 ASSERT_ONLY(, bool overrideSanity = false)
2212 ) const
2213 {
2214 TIndexOffU ret;
2215 assert_lt(c, 4);
2216 assert_geq(c, 0);
2217 ret = countBt2Side(l, c);
2218 assert_lt(ret, this->_eh._bwtLen);
2219 #ifndef NDEBUG
2220 if(_sanity && !overrideSanity) {
2221 // Make sure results match up with results from mapLFEx;
2222 // be sure to override sanity-checking in the callee, or we'll
2223 // have infinite recursion
2224 TIndexOffU arrs[] = { 0, 0, 0, 0 };
2225 mapLFEx(l, arrs, true);
2226 assert_eq(arrs[c], ret);
2227 }
2228 #endif
2229 return ret;
2230 }
2231
2232 /**
2233 * Given top and bot loci, calculate counts of all four DNA chars up to
2234 * those loci. Also, update a set of tops and bots for the reverse
2235 * index/direction using the idea from the bi-directional BWT paper.
2236 */
2237 inline void mapBiLFEx(
2238 const SideLocus& ltop,
2239 const SideLocus& lbot,
2240 TIndexOffU *tops,
2241 TIndexOffU *bots,
2242 TIndexOffU *topsP, // topsP[0] = top
2243 TIndexOffU *botsP
2244 ASSERT_ONLY(, bool overrideSanity = false)
2245 ) const
2246 {
2247 #ifndef NDEBUG
2248 for(int i = 0; i < 4; i++) {
2249 assert_eq(0, tops[0]); assert_eq(0, bots[0]);
2250 }
2251 #endif
2252 countBt2SideEx(ltop, tops);
2253 countBt2SideEx(lbot, bots);
2254 #ifndef NDEBUG
2255 if(_sanity && !overrideSanity) {
2256 // Make sure results match up with individual calls to mapLF;
2257 // be sure to override sanity-checking in the callee, or we'll
2258 // have infinite recursion
2259 assert_eq(mapLF(ltop, 0, true), tops[0]);
2260 assert_eq(mapLF(ltop, 1, true), tops[1]);
2261 assert_eq(mapLF(ltop, 2, true), tops[2]);
2262 assert_eq(mapLF(ltop, 3, true), tops[3]);
2263 assert_eq(mapLF(lbot, 0, true), bots[0]);
2264 assert_eq(mapLF(lbot, 1, true), bots[1]);
2265 assert_eq(mapLF(lbot, 2, true), bots[2]);
2266 assert_eq(mapLF(lbot, 3, true), bots[3]);
2267 }
2268 #endif
2269 // bots[0..3] - tops[0..3] = # of ways to extend the suffix with an
2270 // A, C, G, T
2271 botsP[0] = topsP[0] + (bots[0] - tops[0]);
2272 topsP[1] = botsP[0];
2273 botsP[1] = topsP[1] + (bots[1] - tops[1]);
2274 topsP[2] = botsP[1];
2275 botsP[2] = topsP[2] + (bots[2] - tops[2]);
2276 topsP[3] = botsP[2];
2277 botsP[3] = topsP[3] + (bots[3] - tops[3]);
2278 }
2279
2280 /**
2281 * Given row and its locus information, proceed on the given character
2282 * and return the next row, or all-fs if we can't proceed on that
2283 * character. Returns max value if this row ends in $.
2284 */
2285 inline TIndexOffU mapLF1(
2286 TIndexOffU row, // starting row
2287 const SideLocus& l, // locus for starting row
2288 int c // character to proceed on
2289 ASSERT_ONLY(, bool overrideSanity = false)
2290 ) const
2291 {
2292 if(rowL(l) != c || row == _zOff) return OFF_MASK;
2293 TIndexOffU ret;
2294 assert_lt(c, 4);
2295 assert_geq(c, 0);
2296 ret = countBt2Side(l, c);
2297 assert_lt(ret, this->_eh._bwtLen);
2298 #ifndef NDEBUG
2299 if(_sanity && !overrideSanity) {
2300 // Make sure results match up with results from mapLFEx;
2301 // be sure to override sanity-checking in the callee, or we'll
2302 // have infinite recursion
2303 TIndexOffU arrs[] = { 0, 0, 0, 0 };
2304 mapLFEx(l, arrs, true);
2305 assert_eq(arrs[c], ret);
2306 }
2307 #endif
2308 return ret;
2309 }
2310
2311
2312 /**
2313 * Given row and its locus information, set the row to LF(row) and
2314 * return the character that was in the final column.
2315 */
2316 inline int mapLF1(
2317 TIndexOffU& row, // starting row
2318 const SideLocus& l // locus for starting row
2319 ASSERT_ONLY(, bool overrideSanity = false)
2320 ) const
2321 {
2322 if(row == _zOff) return -1;
2323 int c = rowL(l);
2324 assert_range(0, 3, c);
2325 row = countBt2Side(l, c);
2326 assert_lt(row, this->_eh._bwtLen);
2327 #ifndef NDEBUG
2328 if(_sanity && !overrideSanity) {
2329 // Make sure results match up with results from mapLFEx;
2330 // be sure to override sanity-checking in the callee, or we'll
2331 // have infinite recursion
2332 TIndexOffU arrs[] = { 0, 0, 0, 0 };
2333 mapLFEx(l, arrs, true);
2334 assert_eq(arrs[c], row);
2335 }
2336 #endif
2337 return c;
2338 }
2339
2340 #ifndef NDEBUG
2341 /// Check that in-memory Ebwt is internally consistent with respect
2342 /// to given EbwtParams; assert if not
inMemoryRepOk(const EbwtParams & eh)2343 bool inMemoryRepOk(const EbwtParams& eh) const {
2344 assert_geq(_zEbwtBpOff, 0);
2345 assert_lt(_zEbwtBpOff, 4);
2346 assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
2347 assert_lt(_zOff, eh._bwtLen);
2348 assert_geq(_nFrag, _nPat);
2349 return true;
2350 }
2351
2352 /// Check that in-memory Ebwt is internally consistent; assert if
2353 /// not
inMemoryRepOk()2354 bool inMemoryRepOk() const {
2355 return repOk(_eh);
2356 }
2357
2358 /// Check that Ebwt is internally consistent with respect to given
2359 /// EbwtParams; assert if not
repOk(const EbwtParams & eh)2360 bool repOk(const EbwtParams& eh) const {
2361 assert(_eh.repOk());
2362 if(isInMemory()) {
2363 return inMemoryRepOk(eh);
2364 }
2365 return true;
2366 }
2367
2368 /// Check that Ebwt is internally consistent; assert if not
repOk()2369 bool repOk() const {
2370 return repOk(_eh);
2371 }
2372 #endif
2373
2374 bool _switchEndian;
2375 int32_t _overrideOffRate;
2376 bool _verbose;
2377 bool _passMemExc;
2378 bool _sanity;
2379 bool fw_; // true iff this is a forward index
2380 FILE *_in1; // input fd for primary index file
2381 FILE *_in2; // input fd for secondary index file
2382 string _in1Str; // filename for primary index file
2383 string _in2Str; // filename for secondary index file
2384 string _inSaStr; // filename for suffix-array file
2385 string _inBwtStr; // filename for BWT file
2386 TIndexOffU _zOff;
2387 TIndexOffU _zEbwtByteOff;
2388 TIndexOff _zEbwtBpOff;
2389 TIndexOffU _nPat; /// number of reference texts
2390 TIndexOffU _nFrag; /// number of fragments
2391 APtrWrap<TIndexOffU> _plen;
2392 APtrWrap<TIndexOffU> _rstarts; // starting offset of fragments / text indexes
2393 // _fchr, _ftab and _eftab are expected to be relatively small
2394 // (usually < 1MB, perhaps a few MB if _fchr is particularly large
2395 // - like, say, 11). For this reason, we don't bother with writing
2396 // them to disk through separate output streams; we
2397 APtrWrap<TIndexOffU> _fchr;
2398 APtrWrap<TIndexOffU> _ftab;
2399 APtrWrap<TIndexOffU> _eftab; // "extended" entries for _ftab
2400 // _offs may be extremely large. E.g. for DNA w/ offRate=4 (one
2401 // offset every 16 rows), the total size of _offs is the same as
2402 // the total size of the input sequence
2403 APtrWrap<TIndexOffU> _offs;
2404 // _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
2405 // is at least as large as the input sequence.
2406 APtrWrap<uint8_t> _ebwt;
2407 bool _useMm; /// use memory-mapped files to hold the index
2408 bool useShmem_; /// use shared memory to hold large parts of the index
2409 EList<string> _refnames; /// names of the reference sequences
2410 char *mmFile1_;
2411 char *mmFile2_;
2412 EbwtParams _eh;
2413 bool packed_;
2414
2415 static const TIndexOffU default_bmax = OFF_MASK;
2416 static const TIndexOffU default_bmaxMultSqrt = OFF_MASK;
2417 static const TIndexOffU default_bmaxDivN = 4;
2418 static const int default_dcv = 1024;
2419 static const bool default_noDc = false;
2420 static const bool default_useBlockwise = true;
2421 static const uint32_t default_seed = 0;
2422 #ifdef BOWTIE_64BIT_INDEX
2423 static const int default_lineRate = 7;
2424 #else
2425 static const int default_lineRate = 6;
2426 #endif
2427 static const int default_offRate = 5;
2428 static const int default_offRatePlus = 0;
2429 static const int default_ftabChars = 10;
2430 static const bool default_bigEndian = false;
2431
2432 private:
2433
log()2434 ostream& log() const {
2435 return cout; // TODO: turn this into a parameter
2436 }
2437
2438 /// Print a verbose message and flush (flushing is helpful for
2439 /// debugging)
verbose(const string & s)2440 void verbose(const string& s) const {
2441 if(this->verbose()) {
2442 this->log() << s.c_str();
2443 this->log().flush();
2444 }
2445 }
2446 };
2447
2448 /**
2449 * Read reference names from an input stream 'in' for an Ebwt primary
2450 * file and store them in 'refnames'.
2451 */
2452 void readEbwtRefnames(FILE* fin, EList<string>& refnames);
2453
2454 /**
2455 * Read reference names from the index with basename 'in' and store
2456 * them in 'refnames'.
2457 */
2458 void readEbwtRefnames(const string& instr, EList<string>& refnames);
2459
2460 /**
2461 * Read just enough of the Ebwt's header to determine whether it's
2462 * colorspace.
2463 */
2464 bool readEbwtColor(const string& instr);
2465
2466 /**
2467 * Read just enough of the Ebwt's header to determine whether it's
2468 * entirely reversed.
2469 */
2470 bool readEntireReverse(const string& instr);
2471
2472 ///////////////////////////////////////////////////////////////////////
2473 //
2474 // Functions for building Ebwts
2475 //
2476 ///////////////////////////////////////////////////////////////////////
2477
2478 /**
2479 * Join several text strings together in a way that's compatible with
2480 * the text-chunking scheme dictated by chunkRate parameter.
2481 *
2482 * The non-static member Ebwt::join additionally builds auxilliary
2483 * arrays that maintain a mapping between chunks in the joined string
2484 * and the original text strings.
2485 */
2486 template<typename TStr>
join(EList<TStr> & l,uint32_t seed)2487 TStr Ebwt::join(EList<TStr>& l, uint32_t seed) {
2488 RandomSource rand; // reproducible given same seed
2489 rand.init(seed);
2490 TStr ret;
2491 TIndexOffU guessLen = 0;
2492 for(TIndexOffU i = 0; i < l.size(); i++) {
2493 guessLen += length(l[i]);
2494 }
2495 ret.resize(guessLen);
2496 TIndexOffU off = 0;
2497 for(size_t i = 0; i < l.size(); i++) {
2498 TStr& s = l[i];
2499 assert_gt(s.length(), 0);
2500 for(size_t j = 0; j < s.size(); j++) {
2501 ret.set(s[j], off++);
2502 }
2503 }
2504 return ret;
2505 }
2506
2507 /**
2508 * Join several text strings together in a way that's compatible with
2509 * the text-chunking scheme dictated by chunkRate parameter.
2510 *
2511 * The non-static member Ebwt::join additionally builds auxilliary
2512 * arrays that maintain a mapping between chunks in the joined string
2513 * and the original text strings.
2514 */
2515 template<typename TStr>
join(EList<FileBuf * > & l,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,uint32_t seed)2516 TStr Ebwt::join(EList<FileBuf*>& l,
2517 EList<RefRecord>& szs,
2518 TIndexOffU sztot,
2519 const RefReadInParams& refparams,
2520 uint32_t seed)
2521 {
2522 RandomSource rand; // reproducible given same seed
2523 rand.init(seed);
2524 RefReadInParams rpcp = refparams;
2525 TStr ret;
2526 TIndexOffU guessLen = sztot;
2527 ret.resize(guessLen);
2528 ASSERT_ONLY(TIndexOffU szsi = 0);
2529 TIndexOffU dstoff = 0;
2530 for(TIndexOffU i = 0; i < l.size(); i++) {
2531 // For each sequence we can pull out of istream l[i]...
2532 assert(!l[i]->eof());
2533 bool first = true;
2534 while(!l[i]->eof()) {
2535 RefRecord rec = fastaRefReadAppend(*l[i], first, ret, dstoff, rpcp);
2536 first = false;
2537 if(rec.first && rec.len == 0) {
2538 continue;
2539 }
2540 TIndexOffU bases = rec.len;
2541 assert_eq(rec.off, szs[szsi].off);
2542 assert_eq(rec.len, szs[szsi].len);
2543 assert_eq(rec.first, szs[szsi].first);
2544 ASSERT_ONLY(szsi++);
2545 if(bases == 0) continue;
2546 }
2547 }
2548 return ret;
2549 }
2550
2551 /**
2552 * Join several text strings together according to the text-chunking
2553 * scheme specified in the EbwtParams. Ebwt fields calculated in this
2554 * function are written directly to disk.
2555 *
2556 * It is assumed, but not required, that the header values have already
2557 * been written to 'out1' before this function is called.
2558 *
2559 * The static member Ebwt::join just returns a joined version of a
2560 * list of strings without building any of the auxilliary arrays.
2561 */
2562 template<typename TStr>
joinToDisk(EList<FileBuf * > & l,EList<RefRecord> & szs,TIndexOffU sztot,const RefReadInParams & refparams,TStr & ret,ostream & out1,ostream & out2)2563 void Ebwt::joinToDisk(
2564 EList<FileBuf*>& l,
2565 EList<RefRecord>& szs,
2566 TIndexOffU sztot,
2567 const RefReadInParams& refparams,
2568 TStr& ret,
2569 ostream& out1,
2570 ostream& out2)
2571 {
2572 RefReadInParams rpcp = refparams;
2573 assert_gt(szs.size(), 0);
2574 assert_gt(l.size(), 0);
2575 assert_gt(sztot, 0);
2576 // Not every fragment represents a distinct sequence - many
2577 // fragments may correspond to a single sequence. Count the
2578 // number of sequences here by counting the number of "first"
2579 // fragments.
2580 this->_nPat = 0;
2581 this->_nFrag = 0;
2582 for(TIndexOffU i = 0; i < szs.size(); i++) {
2583 if(szs[i].len > 0) this->_nFrag++;
2584 if(szs[i].first && szs[i].len > 0) this->_nPat++;
2585 }
2586 assert_gt(this->_nPat, 0);
2587 assert_geq(this->_nFrag, this->_nPat);
2588 _rstarts.reset();
2589 writeU<TIndexOffU>(out1, this->_nPat, _switchEndian);
2590 // Allocate plen[]
2591 try {
2592 this->_plen.init(new TIndexOffU[this->_nPat], this->_nPat);
2593 } catch(bad_alloc& e) {
2594 cerr << "Out of memory allocating plen[] in Ebwt::join)"
2595 << " at " << __FILE__ << ":" << __LINE__ << endl;
2596 throw e;
2597 }
2598 // For each pattern, set plen
2599 TIndexOff npat = -1;
2600 for(TIndexOffU i = 0; i < szs.size(); i++) {
2601 if(szs[i].first && szs[i].len > 0) {
2602 if(npat >= 0) {
2603 writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
2604 }
2605 this->plen()[++npat] = (szs[i].len + szs[i].off);
2606 } else if(!szs[i].first) {
2607 // edge case, but we could get here with npat == -1
2608 // e.g. when building from a reference of all Ns
2609 if (npat < 0) npat = 0;
2610 this->plen()[npat] += (szs[i].len + szs[i].off);
2611 }
2612 }
2613 assert_eq((TIndexOffU)npat, this->_nPat-1);
2614 writeU<TIndexOffU>(out1, this->plen()[npat], _switchEndian);
2615 // Write the number of fragments
2616 writeU<TIndexOffU>(out1, this->_nFrag, _switchEndian);
2617 TIndexOffU seqsRead = 0;
2618 ASSERT_ONLY(TIndexOffU szsi = 0);
2619 ASSERT_ONLY(TIndexOffU entsWritten = 0);
2620 TIndexOffU dstoff = 0;
2621 // For each filebuf
2622 for(unsigned int i = 0; i < l.size(); i++) {
2623 assert(!l[i]->eof());
2624 bool first = true;
2625 TIndexOffU patoff = 0;
2626 // For each *fragment* (not necessary an entire sequence) we
2627 // can pull out of istream l[i]...
2628 while(!l[i]->eof()) {
2629 string name;
2630 // Push a new name onto our vector
2631 _refnames.push_back("");
2632 RefRecord rec = fastaRefReadAppend(
2633 *l[i], first, ret, dstoff, rpcp, &_refnames.back());
2634 first = false;
2635 TIndexOffU bases = rec.len;
2636 if(rec.first && rec.len > 0) {
2637 if(_refnames.back().length() == 0) {
2638 // If name was empty, replace with an index
2639 ostringstream stm;
2640 stm << seqsRead;
2641 _refnames.back() = stm.str();
2642 }
2643 } else {
2644 // This record didn't actually start a new sequence so
2645 // no need to add a name
2646 //assert_eq(0, _refnames.back().length());
2647 _refnames.pop_back();
2648 }
2649 if(rec.first && rec.len == 0) {
2650 continue;
2651 }
2652 assert_lt(szsi, szs.size());
2653 assert_eq(rec.off, szs[szsi].off);
2654 assert_eq(rec.len, szs[szsi].len);
2655 assert_eq(rec.first, szs[szsi].first);
2656 assert(rec.first || rec.off > 0);
2657 ASSERT_ONLY(szsi++);
2658 // Increment seqsRead if this is the first fragment
2659 if(rec.first) seqsRead++;
2660 if(bases == 0) continue;
2661 assert_leq(bases, this->plen()[seqsRead-1]);
2662 // Reset the patoff if this is the first fragment
2663 if(rec.first) patoff = 0;
2664 patoff += rec.off; // add fragment's offset from end of last frag.
2665 // Adjust rpcps
2666 //uint32_t seq = seqsRead-1;
2667 ASSERT_ONLY(entsWritten++);
2668 // This is where rstarts elements are written to the output stream
2669 //writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
2670 //writeU32(out1, seq, this->toBe()); // sequence id
2671 //writeU32(out1, patoff, this->toBe()); // offset into sequence
2672 patoff += bases;
2673 }
2674 assert_gt(szsi, 0);
2675 l[i]->reset();
2676 assert(!l[i]->eof());
2677 #ifndef NDEBUG
2678 int c = l[i]->get();
2679 assert_eq('>', c);
2680 assert(!l[i]->eof());
2681 l[i]->reset();
2682 assert(!l[i]->eof());
2683 #endif
2684 }
2685 assert_eq(entsWritten, this->_nFrag);
2686 }
2687
2688 /**
2689 * Build an Ebwt from a string 's' and its suffix array 'sa' (which
2690 * might actually be a suffix array *builder* that builds blocks of the
2691 * array on demand). The bulk of the Ebwt, i.e. the ebwt and offs
2692 * arrays, is written directly to disk. This is by design: keeping
2693 * those arrays in memory needlessly increases the footprint of the
2694 * building process. Instead, we prefer to build the Ebwt directly
2695 * "to disk" and then read it back into memory later as necessary.
2696 *
2697 * It is assumed that the header values and join-related values (nPat,
2698 * plen) have already been written to 'out1' before this function
2699 * is called. When this function is finished, it will have
2700 * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
2701 * file and offs to the secondary file.
2702 *
2703 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
2704 * Assume occ array entries are 32 bits each.
2705 *
2706 * @param sa the suffix array to convert to a Ebwt
2707 * @param s the original string
2708 * @param out
2709 */
2710 template<typename TStr>
buildToDisk(InorderBlockwiseSA<TStr> & sa,const TStr & s,ostream & out1,ostream & out2,ostream * saOut,ostream * bwtOut)2711 void Ebwt::buildToDisk(
2712 InorderBlockwiseSA<TStr>& sa,
2713 const TStr& s,
2714 ostream& out1,
2715 ostream& out2,
2716 ostream* saOut,
2717 ostream* bwtOut)
2718 {
2719 const EbwtParams& eh = this->_eh;
2720
2721 assert(eh.repOk());
2722 assert_eq(s.length()+1, sa.size());
2723 assert_eq(s.length(), eh._len);
2724 assert_gt(eh._lineRate, 3);
2725 assert(sa.suffixItrIsReset());
2726
2727 TIndexOffU len = eh._len;
2728 TIndexOffU ftabLen = eh._ftabLen;
2729 TIndexOffU sideSz = eh._sideSz;
2730 TIndexOffU ebwtTotSz = eh._ebwtTotSz;
2731 TIndexOffU fchr[] = {0, 0, 0, 0, 0};
2732 EList<TIndexOffU> ftab(EBWT_CAT);
2733 TIndexOffU zOff = OFF_MASK;
2734
2735 // Save # of occurrences of each character as we walk along the bwt
2736 TIndexOffU occ[4] = {0, 0, 0, 0};
2737 TIndexOffU occSave[4] = {0, 0, 0, 0};
2738
2739 // Record rows that should "absorb" adjacent rows in the ftab.
2740 // The absorbed rows represent suffixes shorter than the ftabChars
2741 // cutoff.
2742 uint8_t absorbCnt = 0;
2743 EList<uint8_t> absorbFtab(EBWT_CAT);
2744 try {
2745 VMSG_NL("Allocating ftab, absorbFtab");
2746 ftab.resize(ftabLen);
2747 ftab.fillZero();
2748 absorbFtab.resize(ftabLen);
2749 absorbFtab.fillZero();
2750 } catch(bad_alloc &e) {
2751 cerr << "Out of memory allocating ftab[] or absorbFtab[] "
2752 << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
2753 << __LINE__ << endl;
2754 throw e;
2755 }
2756
2757 // Allocate the side buffer; holds a single side as its being
2758 // constructed and then written to disk. Reused across all sides.
2759 #ifdef SIXTY4_FORMAT
2760 EList<uint64_t> ebwtSide(EBWT_CAT);
2761 #else
2762 EList<uint8_t> ebwtSide(EBWT_CAT);
2763 #endif
2764 try {
2765 #ifdef SIXTY4_FORMAT
2766 ebwtSide.resize(sideSz >> 3);
2767 #else
2768 ebwtSide.resize(sideSz);
2769 #endif
2770 } catch(bad_alloc &e) {
2771 cerr << "Out of memory allocating ebwtSide[] in "
2772 << "Ebwt::buildToDisk() at " << __FILE__ << ":"
2773 << __LINE__ << endl;
2774 throw e;
2775 }
2776
2777 // Points to the base offset within ebwt for the side currently
2778 // being written
2779 TIndexOffU side = 0;
2780
2781 // Whether we're assembling a forward or a reverse bucket
2782 bool fw;
2783 TIndexOff sideCur = 0;
2784 fw = true;
2785
2786 // Have we skipped the '$' in the last column yet?
2787 ASSERT_ONLY(bool dollarSkipped = false);
2788
2789 TIndexOffU si = 0; // string offset (chars)
2790 ASSERT_ONLY(TIndexOffU lastSufInt = 0);
2791 ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
2792 // array (as opposed to the padding at the
2793 // end)
2794 // Iterate over packed bwt bytes
2795 VMSG_NL("Entering Ebwt loop");
2796 ASSERT_ONLY(TIndexOffU beforeEbwtOff = (TIndexOffU)out1.tellp()); // @double-check - pos_type, std::streampos
2797
2798 // First integer in the suffix-array output file is the length of the
2799 // array, including $
2800 if(saOut != NULL) {
2801 // Write length word
2802 writeU<TIndexOffU>(*saOut, len+1, _switchEndian);
2803 }
2804
2805 // First integer in the BWT output file is the length of BWT(T), including $
2806 if(bwtOut != NULL) {
2807 // Write length word
2808 writeU<TIndexOffU>(*bwtOut, len+1, _switchEndian);
2809 }
2810
2811 while(side < ebwtTotSz) {
2812 // Sanity-check our cursor into the side buffer
2813 assert_geq(sideCur, 0);
2814 assert_lt(sideCur, (int)eh._sideBwtSz);
2815 assert_eq(0, side % sideSz); // 'side' must be on side boundary
2816 ebwtSide[sideCur] = 0; // clear
2817 assert_lt(side + sideCur, ebwtTotSz);
2818 // Iterate over bit-pairs in the si'th character of the BWT
2819 #ifdef SIXTY4_FORMAT
2820 for(int bpi = 0; bpi < 32; bpi++, si++)
2821 #else
2822 for(int bpi = 0; bpi < 4; bpi++, si++)
2823 #endif
2824 {
2825 int bwtChar;
2826 bool count = true;
2827 if(si <= len) {
2828 // Still in the SA; extract the bwtChar
2829 TIndexOffU saElt = sa.nextSuffix();
2830 // Write it to the optional suffix-array output file
2831 if(saOut != NULL) {
2832 writeU<TIndexOffU>(*saOut, saElt, _switchEndian);
2833 }
2834 // TODO: what exactly to write to the BWT output file? How to
2835 // represent $? How to pack nucleotides into bytes/words?
2836
2837 // (that might have triggered sa to calc next suf block)
2838 if(saElt == 0) {
2839 // Don't add the '$' in the last column to the BWT
2840 // transform; we can't encode a $ (only A C T or G)
2841 // and counting it as, say, an A, will mess up the
2842 // LR mapping
2843 bwtChar = 0; count = false;
2844 ASSERT_ONLY(dollarSkipped = true);
2845 zOff = si; // remember the SA row that
2846 // corresponds to the 0th suffix
2847 } else {
2848 bwtChar = (int)(s[saElt-1]);
2849 assert_lt(bwtChar, 4);
2850 // Update the fchr
2851 fchr[bwtChar]++;
2852 }
2853 // Update ftab
2854 if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
2855 // Turn the first ftabChars characters of the
2856 // suffix into an integer index into ftab. The
2857 // leftmost (lowest index) character of the suffix
2858 // goes in the most significant bit pair if the
2859 // integer.
2860 TIndexOffU sufInt = 0;
2861 for(int i = 0; i < eh._ftabChars; i++) {
2862 sufInt <<= 2;
2863 assert_lt((TIndexOffU)i, len-saElt);
2864 sufInt |= (unsigned char)(s[saElt+i]);
2865 }
2866 // Assert that this prefix-of-suffix is greater
2867 // than or equal to the last one (true b/c the
2868 // suffix array is sorted)
2869 #ifndef NDEBUG
2870 if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
2871 lastSufInt = sufInt;
2872 #endif
2873 // Update ftab
2874 assert_lt(sufInt+1, ftabLen);
2875 ftab[sufInt+1]++;
2876 if(absorbCnt > 0) {
2877 // Absorb all short suffixes since the last
2878 // transition into this transition
2879 absorbFtab[sufInt] = absorbCnt;
2880 absorbCnt = 0;
2881 }
2882 } else {
2883 // Otherwise if suffix is fewer than ftabChars
2884 // characters long, then add it to the 'absorbCnt';
2885 // it will be absorbed into the next transition
2886 assert_lt(absorbCnt, 255);
2887 absorbCnt++;
2888 }
2889 // Suffix array offset boundary? - update offset array
2890 if((si & eh._offMask) == si) {
2891 assert_lt((si >> eh._offRate), eh._offsLen);
2892 // Write offsets directly to the secondary output
2893 // stream, thereby avoiding keeping them in memory
2894 writeU<TIndexOffU>(out2, saElt, _switchEndian);
2895 }
2896 } else {
2897 // Strayed off the end of the SA, now we're just
2898 // padding out a bucket
2899 #ifndef NDEBUG
2900 if(inSA) {
2901 // Assert that we wrote all the characters in the
2902 // string before now
2903 assert_eq(si, len+1);
2904 inSA = false;
2905 }
2906 #endif
2907 // 'A' used for padding; important that padding be
2908 // counted in the occ[] array
2909 bwtChar = 0;
2910 }
2911 if(count) occ[bwtChar]++;
2912 // Append BWT char to bwt section of current side
2913 if(fw) {
2914 // Forward bucket: fill from least to most
2915 #ifdef SIXTY4_FORMAT
2916 ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
2917 if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
2918 #else
2919 pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
2920 assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
2921 #endif
2922 } else {
2923 // Backward bucket: fill from most to least
2924 #ifdef SIXTY4_FORMAT
2925 ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
2926 if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
2927 #else
2928 pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
2929 assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
2930 #endif
2931 }
2932 } // end loop over bit-pairs
2933 assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
2934 #ifdef SIXTY4_FORMAT
2935 assert_eq(0, si & 31);
2936 #else
2937 assert_eq(0, si & 3);
2938 #endif
2939
2940 sideCur++;
2941 if(sideCur == (int)eh._sideBwtSz) {
2942 sideCur = 0;
2943 TIndexOffU *cpptr = reinterpret_cast<TIndexOffU*>(ebwtSide.ptr());
2944 // Write 'A', 'C', 'G' and 'T' tallies
2945 side += sideSz;
2946 assert_leq(side, eh._ebwtTotSz);
2947 #ifdef BOWTIE_64BIT_INDEX
2948 cpptr[(sideSz >> 3)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
2949 cpptr[(sideSz >> 3)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
2950 cpptr[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
2951 cpptr[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
2952 #else
2953 cpptr[(sideSz >> 2)-4] = endianizeU<TIndexOffU>(occSave[0], _switchEndian);
2954 cpptr[(sideSz >> 2)-3] = endianizeU<TIndexOffU>(occSave[1], _switchEndian);
2955 cpptr[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[2], _switchEndian);
2956 cpptr[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[3], _switchEndian);
2957 #endif
2958 occSave[0] = occ[0];
2959 occSave[1] = occ[1];
2960 occSave[2] = occ[2];
2961 occSave[3] = occ[3];
2962 // Write backward side to primary file
2963 out1.write((const char *)ebwtSide.ptr(), sideSz);
2964 }
2965 }
2966 VMSG_NL("Exited Ebwt loop");
2967 assert_neq(zOff, OFF_MASK);
2968 if(absorbCnt > 0) {
2969 // Absorb any trailing, as-yet-unabsorbed short suffixes into
2970 // the last element of ftab
2971 absorbFtab[ftabLen-1] = absorbCnt;
2972 }
2973 // Assert that our loop counter got incremented right to the end
2974 assert_eq(side, eh._ebwtTotSz);
2975 // Assert that we wrote the expected amount to out1
2976 assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz); // @double-check - pos_type
2977 // assert that the last thing we did was write a forward bucket
2978
2979 //
2980 // Write zOff to primary stream
2981 //
2982 writeU<TIndexOffU>(out1, zOff, _switchEndian);
2983
2984 //
2985 // Finish building fchr
2986 //
2987 // Exclusive prefix sum on fchr
2988 for(int i = 1; i < 4; i++) {
2989 fchr[i] += fchr[i-1];
2990 }
2991 assert_eq(fchr[3], len);
2992 // Shift everybody up by one
2993 for(int i = 4; i >= 1; i--) {
2994 fchr[i] = fchr[i-1];
2995 }
2996 fchr[0] = 0;
2997 if(_verbose) {
2998 for(int i = 0; i < 5; i++)
2999 cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
3000 }
3001 // Write fchr to primary file
3002 for(int i = 0; i < 5; i++) {
3003 writeU<TIndexOffU>(out1, fchr[i], _switchEndian);
3004 }
3005
3006 //
3007 // Finish building ftab and build eftab
3008 //
3009 // Prefix sum on ftable
3010 TIndexOffU eftabLen = 0;
3011 assert_eq(0, absorbFtab[0]);
3012 for(TIndexOffU i = 1; i < ftabLen; i++) {
3013 if(absorbFtab[i] > 0) eftabLen += 2;
3014 }
3015 assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
3016 eftabLen = eh._ftabChars*2;
3017 EList<TIndexOffU> eftab(EBWT_CAT);
3018 try {
3019 eftab.resize(eftabLen);
3020 eftab.fillZero();
3021 } catch(bad_alloc &e) {
3022 cerr << "Out of memory allocating eftab[] "
3023 << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
3024 << __LINE__ << endl;
3025 throw e;
3026 }
3027 TIndexOffU eftabCur = 0;
3028 for(TIndexOffU i = 1; i < ftabLen; i++) {
3029 TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i-1);
3030 if(absorbFtab[i] > 0) {
3031 // Skip a number of short pattern indicated by absorbFtab[i]
3032 TIndexOffU hi = lo + absorbFtab[i];
3033 assert_lt(eftabCur*2+1, eftabLen);
3034 eftab[eftabCur*2] = lo;
3035 eftab[eftabCur*2+1] = hi;
3036 ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
3037 assert_eq(lo, Ebwt::ftabLo(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
3038 assert_eq(hi, Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, i));
3039 } else {
3040 ftab[i] = lo;
3041 }
3042 }
3043 assert_eq(Ebwt::ftabHi(ftab.ptr(), eftab.ptr(), len, ftabLen, eftabLen, ftabLen-1), len+1);
3044 // Write ftab to primary file
3045 for(TIndexOffU i = 0; i < ftabLen; i++) {
3046 writeU<TIndexOffU>(out1, ftab[i], _switchEndian);
3047 }
3048 // Write eftab to primary file
3049 for(TIndexOffU i = 0; i < eftabLen; i++) {
3050 writeU<TIndexOffU>(out1, eftab[i], _switchEndian);
3051 }
3052
3053 // Note: if you'd like to sanity-check the Ebwt, you'll have to
3054 // read it back into memory first!
3055 assert(!isInMemory());
3056 VMSG_NL("Exiting Ebwt::buildToDisk()");
3057 }
3058
3059 /**
3060 * Try to find the Bowtie index specified by the user. First try the
3061 * exact path given by the user. Then try the user-provided string
3062 * appended onto the path of the "indexes" subdirectory below this
3063 * executable, then try the provided string appended onto
3064 * "$BOWTIE2_INDEXES/".
3065 */
3066 string adjustEbwtBase(const string& cmdline,
3067 const string& ebwtFileBase,
3068 bool verbose);
3069
3070
3071 extern string gLastIOErrMsg;
3072
3073 /* Checks whether a call to read() failed or not. */
is_read_err(int fdesc,ssize_t ret,size_t count)3074 inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
3075 if (ret < 0) {
3076 std::stringstream sstm;
3077 sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
3078 gLastIOErrMsg = sstm.str();
3079 return true;
3080 }
3081 return false;
3082 }
3083
3084 /* Checks whether a call to fread() failed or not. */
is_fread_err(FILE * file_hd,size_t ret,size_t count)3085 inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
3086 if (ferror(file_hd)) {
3087 gLastIOErrMsg = "Error Reading File!";
3088 return true;
3089 }
3090 return false;
3091 }
3092
3093
3094 #endif /*EBWT_H_*/
3095