1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <string>
21 #include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <stdlib.h>
25 #include "bt2_idx.h"
26 #include <iomanip>
27
28 using namespace std;
29
30 ///////////////////////////////////////////////////////////////////////
31 //
32 // Functions for reading and writing Ebwts
33 //
34 ///////////////////////////////////////////////////////////////////////
35
36 /**
37 * Read an Ebwt from file with given filename.
38 */
readIntoMemory(int color,int needEntireRev,bool loadSASamp,bool loadFtab,bool loadRstarts,bool justHeader,EbwtParams * params,bool mmSweep,bool loadNames,bool startVerbose)39 void Ebwt::readIntoMemory(
40 int color,
41 int needEntireRev,
42 bool loadSASamp,
43 bool loadFtab,
44 bool loadRstarts,
45 bool justHeader,
46 EbwtParams *params,
47 bool mmSweep,
48 bool loadNames,
49 bool startVerbose)
50 {
51 #ifdef BOWTIE_MM
52 char *mmFile[] = { NULL, NULL };
53 #endif
54 if(_in1Str.length() > 0) {
55 if(_verbose || startVerbose) {
56 cerr << " About to open input files: ";
57 logTime(cerr);
58 }
59 // Initialize our primary and secondary input-stream fields
60 if(_in1 != NULL) fclose(_in1);
61 if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
62 if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
63 cerr << "Could not open index file " << _in1Str.c_str() << endl;
64 }
65 if(loadSASamp) {
66 if(_in2 != NULL) fclose(_in2);
67 if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
68 if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
69 cerr << "Could not open index file " << _in2Str.c_str() << endl;
70 }
71 }
72 if(_verbose || startVerbose) {
73 cerr << " Finished opening input files: ";
74 logTime(cerr);
75 }
76
77 #ifdef BOWTIE_MM
78 if(_useMm /*&& !justHeader*/) {
79 const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
80 int fds[] = { fileno(_in1), fileno(_in2) };
81 for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
82 if(_verbose || startVerbose) {
83 cerr << " Memory-mapping input file " << (i+1) << ": ";
84 logTime(cerr);
85 }
86 struct stat sbuf;
87 if (stat(names[i], &sbuf) == -1) {
88 perror("stat");
89 cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
90 throw 1;
91 }
92 mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
93 PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
94 if(mmFile[i] == (void *)(-1)) {
95 perror("mmap");
96 cerr << "Error: Could not memory-map the index file " << names[i] << endl;
97 throw 1;
98 }
99 if(mmSweep) {
100 int sum = 0;
101 for(off_t j = 0; j < sbuf.st_size; j += 1024) {
102 sum += (int) mmFile[i][j];
103 }
104 if(startVerbose) {
105 cerr << " Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
106 logTime(cerr);
107 }
108 }
109 }
110 mmFile1_ = mmFile[0];
111 mmFile2_ = loadSASamp ? mmFile[1] : NULL;
112 }
113 #endif
114 }
115 #ifdef BOWTIE_MM
116 else if(_useMm && !justHeader) {
117 mmFile[0] = mmFile1_;
118 mmFile[1] = mmFile2_;
119 }
120 if(_useMm && !justHeader) {
121 assert(mmFile[0] == mmFile1_);
122 assert(mmFile[1] == mmFile2_);
123 }
124 #endif
125
126 if(_verbose || startVerbose) {
127 cerr << " Reading header: ";
128 logTime(cerr);
129 }
130
131 // Read endianness hints from both streams
132 uint64_t bytesRead = 0;
133 _switchEndian = false;
134 uint32_t one = readU<uint32_t>(_in1, _switchEndian); // 1st word of primary stream
135 bytesRead += 4;
136 if(loadSASamp) {
137 #ifndef NDEBUG
138 assert_eq(one, readU<uint32_t>(_in2, _switchEndian)); // should match!
139 #else
140 readU<uint32_t>(_in2, _switchEndian);
141 #endif
142 }
143 if(one != 1) {
144 assert_eq((1u<<24), one);
145 assert_eq(1, endianSwapU32(one));
146 _switchEndian = true;
147 }
148
149 // Can't switch endianness and use memory-mapped files; in order to
150 // support this, someone has to modify the file to switch
151 // endiannesses appropriately, and we can't do this inside Bowtie
152 // or we might be setting up a race condition with other processes.
153 if(_switchEndian && _useMm) {
154 cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
155 throw 1;
156 }
157
158 // Reads header entries one by one from primary stream
159 TIndexOffU len = readU<TIndexOffU>(_in1, _switchEndian);
160 bytesRead += OFF_SIZE;
161 int32_t lineRate = readI<int32_t>(_in1, _switchEndian);
162 bytesRead += 4;
163 /*int32_t linesPerSide =*/ readI<int32_t>(_in1, _switchEndian);
164 bytesRead += 4;
165 int32_t offRate = readI<int32_t>(_in1, _switchEndian);
166 bytesRead += 4;
167 // TODO: add isaRate to the actual file format (right now, the
168 // user has to tell us whether there's an ISA sample and what the
169 // sampling rate is.
170 int32_t ftabChars = readI<int32_t>(_in1, _switchEndian);
171 bytesRead += 4;
172 // chunkRate was deprecated in an earlier version of Bowtie; now
173 // we use it to hold flags.
174 int32_t flags = readI<int32_t>(_in1, _switchEndian);
175 bool entireRev = false;
176 if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
177 if(color != -1 && !color) {
178 cerr << "Error: -C was not specified when running bowtie, but index is in colorspace. If" << endl
179 << "your reads are in colorspace, please use the -C option. If your reads are not" << endl
180 << "in colorspace, please use a normal index (one built without specifying -C to" << endl
181 << "bowtie-build)." << endl;
182 throw 1;
183 }
184 color = 1;
185 } else if(flags < 0) {
186 if(color != -1 && color) {
187 cerr << "Error: -C was specified when running bowtie, but index is not in colorspace. If" << endl
188 << "your reads are in colorspace, please use a colorspace index (one built using" << endl
189 << "bowtie-build -C). If your reads are not in colorspace, don't specify -C when" << endl
190 << "running bowtie." << endl;
191 throw 1;
192 }
193 color = 0;
194 }
195 if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
196 if(needEntireRev != -1 && needEntireRev != 0) {
197 cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl
198 << "versions. Please build or download a version of the index that is compitble" << endl
199 << "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl;
200 throw 1;
201 }
202 } else entireRev = true;
203 bytesRead += 4;
204
205 // Create a new EbwtParams from the entries read from primary stream
206 EbwtParams *eh;
207 bool deleteEh = false;
208 if(params != NULL) {
209 params->init(len, lineRate, offRate, ftabChars, color, entireRev);
210 if(_verbose || startVerbose) params->print(cerr);
211 eh = params;
212 } else {
213 eh = new EbwtParams(len, lineRate, offRate, ftabChars, color, entireRev);
214 deleteEh = true;
215 }
216
217 // Set up overridden suffix-array-sample parameters
218 TIndexOffU offsLen = eh->_offsLen;
219 uint64_t offsSz = eh->_offsSz;
220 TIndexOffU offRateDiff = 0;
221 TIndexOffU offsLenSampled = offsLen;
222 if(_overrideOffRate > offRate) {
223 offRateDiff = _overrideOffRate - offRate;
224 }
225 if(offRateDiff > 0) {
226 offsLenSampled >>= offRateDiff;
227 if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) {
228 offsLenSampled++;
229 }
230 }
231
232 // Can't override the offrate or isarate and use memory-mapped
233 // files; ultimately, all processes need to copy the sparser sample
234 // into their own memory spaces.
235 if(_useMm && (offRateDiff)) {
236 cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
237 throw 1;
238 }
239
240 // Read nPat from primary stream
241 this->_nPat = readI<TIndexOffU>(_in1, _switchEndian);
242 bytesRead += OFF_SIZE;
243 _plen.reset();
244 // Read plen from primary stream
245 if(_useMm) {
246 #ifdef BOWTIE_MM
247 _plen.init((TIndexOffU*)(mmFile[0] + bytesRead), _nPat, false);
248 bytesRead += _nPat*OFF_SIZE;
249 fseeko(_in1, _nPat*OFF_SIZE, SEEK_CUR);
250 #endif
251 } else {
252 try {
253 if(_verbose || startVerbose) {
254 cerr << "Reading plen (" << this->_nPat << "): ";
255 logTime(cerr);
256 }
257 _plen.init(new TIndexOffU[_nPat], _nPat, true);
258 if(_switchEndian) {
259 for(TIndexOffU i = 0; i < this->_nPat; i++) {
260 plen()[i] = readU<TIndexOffU>(_in1, _switchEndian);
261 }
262 } else {
263 size_t r = MM_READ(_in1, (void*)(plen()), _nPat*OFF_SIZE);
264 if(r != (size_t)(_nPat*OFF_SIZE)) {
265 cerr << "Error reading _plen[] array: " << r << ", " << _nPat*OFF_SIZE << endl;
266 throw 1;
267 }
268 }
269 } catch(bad_alloc& e) {
270 cerr << "Out of memory allocating plen[] in Ebwt::read()"
271 << " at " << __FILE__ << ":" << __LINE__ << endl;
272 throw e;
273 }
274 }
275
276 bool shmemLeader;
277
278 // TODO: I'm not consistent on what "header" means. Here I'm using
279 // "header" to mean everything that would exist in memory if we
280 // started to build the Ebwt but stopped short of the build*() step
281 // (i.e. everything up to and including join()).
282 if(justHeader) goto done;
283
284 this->_nFrag = readU<TIndexOffU>(_in1, _switchEndian);
285 bytesRead += OFF_SIZE;
286 if(_verbose || startVerbose) {
287 cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
288 logTime(cerr);
289 }
290 assert_geq(this->_nFrag, this->_nPat);
291 _rstarts.reset();
292 if(loadRstarts) {
293 if(_useMm) {
294 #ifdef BOWTIE_MM
295 _rstarts.init((TIndexOffU*)(mmFile[0] + bytesRead), _nFrag*3, false);
296 bytesRead += this->_nFrag*OFF_SIZE*3;
297 fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
298 #endif
299 } else {
300 _rstarts.init(new TIndexOffU[_nFrag*3], _nFrag*3, true);
301 if(_switchEndian) {
302 for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) {
303 // fragment starting position in joined reference
304 // string, text id, and fragment offset within text
305 this->rstarts()[i] = readU<TIndexOffU>(_in1, _switchEndian);
306 this->rstarts()[i+1] = readU<TIndexOffU>(_in1, _switchEndian);
307 this->rstarts()[i+2] = readU<TIndexOffU>(_in1, _switchEndian);
308 }
309 } else {
310 size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*OFF_SIZE*3);
311 if(r != (size_t)(this->_nFrag*OFF_SIZE*3)) {
312 cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl;
313 throw 1;
314 }
315 }
316 }
317 } else {
318 // Skip em
319 assert(rstarts() == NULL);
320 bytesRead += this->_nFrag*OFF_SIZE*3;
321 fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
322 }
323
324 _ebwt.reset();
325 if(_useMm) {
326 #ifdef BOWTIE_MM
327 _ebwt.init((uint8_t*)(mmFile[0] + bytesRead), eh->_ebwtTotLen, false);
328 bytesRead += eh->_ebwtTotLen;
329 fseek(_in1, eh->_ebwtTotLen, SEEK_CUR);
330 #endif
331 } else {
332 // Allocate ebwt (big allocation)
333 if(_verbose || startVerbose) {
334 cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): ";
335 logTime(cerr);
336 }
337 bool shmemLeader = true;
338 if(useShmem_) {
339 uint8_t *tmp = NULL;
340 shmemLeader = ALLOC_SHARED_U8(
341 (_in1Str + "[ebwt]"), eh->_ebwtTotLen, &tmp,
342 "ebwt[]", (_verbose || startVerbose));
343 assert(tmp != NULL);
344 _ebwt.init(tmp, eh->_ebwtTotLen, false);
345 if(_verbose || startVerbose) {
346 cerr << " shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
347 }
348 } else {
349 try {
350 _ebwt.init(new uint8_t[eh->_ebwtTotLen], eh->_ebwtTotLen, true);
351 } catch(bad_alloc& e) {
352 cerr << "Out of memory allocating the ebwt[] array for the Bowtie index. Please try" << endl
353 << "again on a computer with more memory." << endl;
354 throw 1;
355 }
356 }
357 if(shmemLeader) {
358 // Read ebwt from primary stream
359 uint64_t bytesLeft = eh->_ebwtTotLen;
360 char *pebwt = (char*)this->ebwt();
361
362 while (bytesLeft>0){
363 size_t r = MM_READ(_in1, (void *)pebwt, bytesLeft);
364 if(MM_IS_IO_ERR(_in1,r,bytesLeft)) {
365 cerr << "Error reading _ebwt[] array: " << r << ", "
366 << bytesLeft << gLastIOErrMsg << endl;
367 throw 1;
368 } else if (r == 0) {
369 cerr << "Error reading _ebwt[] array: no more data" << endl;
370 throw 1;
371 }
372 pebwt += r;
373 bytesLeft -= r;
374 }
375 #ifdef BOWTIE_SHARED_MEM
376 if(useShmem_) NOTIFY_SHARED(ebwt(), eh->_ebwtTotLen);
377 #endif
378 } else {
379 // Seek past the data and wait until master is finished
380 fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
381 #ifdef BOWTIE_SHARED_MEM
382 if(useShmem_) WAIT_SHARED(ebwt(), eh->_ebwtTotLen);
383 #endif
384 }
385 }
386
387 // Read zOff from primary stream
388 _zOff = readU<TIndexOffU>(_in1, _switchEndian);
389 bytesRead += OFF_SIZE;
390 assert_lt(_zOff, len);
391
392 try {
393 // Read fchr from primary stream
394 if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
395 _fchr.reset();
396 if(_useMm) {
397 #ifdef BOWTIE_MM
398 _fchr.init((TIndexOffU*)(mmFile[0] + bytesRead), 5, false);
399 bytesRead += 5*OFF_SIZE;
400 fseek(_in1, 5*OFF_SIZE, SEEK_CUR);
401 #endif
402 } else {
403 _fchr.init(new TIndexOffU[5], 5, true);
404 for(int i = 0; i < 5; i++) {
405 this->fchr()[i] = readU<TIndexOffU>(_in1, _switchEndian);
406 assert_leq(this->fchr()[i], len);
407 assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
408 }
409 }
410 assert_gt(this->fchr()[4], this->fchr()[0]);
411 // Read ftab from primary stream
412 if(_verbose || startVerbose) {
413 if(loadFtab) {
414 cerr << "Reading ftab (" << eh->_ftabLen << "): ";
415 logTime(cerr);
416 } else {
417 cerr << "Skipping ftab (" << eh->_ftabLen << "): ";
418 }
419 }
420 _ftab.reset();
421 if(loadFtab) {
422 if(_useMm) {
423 #ifdef BOWTIE_MM
424 _ftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_ftabLen, false);
425 bytesRead += eh->_ftabLen*OFF_SIZE;
426 fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
427 #endif
428 } else {
429 _ftab.init(new TIndexOffU[eh->_ftabLen], eh->_ftabLen, true);
430 if(_switchEndian) {
431 for(TIndexOffU i = 0; i < eh->_ftabLen; i++)
432 this->ftab()[i] = readU<TIndexOffU>(_in1, _switchEndian);
433 } else {
434 size_t r = MM_READ(_in1, (void *)ftab(), eh->_ftabLen*OFF_SIZE);
435 if(r != (size_t)(eh->_ftabLen*OFF_SIZE)) {
436 cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl;
437 throw 1;
438 }
439 }
440 }
441 // Read etab from primary stream
442 if(_verbose || startVerbose) {
443 if(loadFtab) {
444 cerr << "Reading eftab (" << eh->_eftabLen << "): ";
445 logTime(cerr);
446 } else {
447 cerr << "Skipping eftab (" << eh->_eftabLen << "): ";
448 }
449
450 }
451 _eftab.reset();
452 if(_useMm) {
453 #ifdef BOWTIE_MM
454 _eftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_eftabLen, false);
455 bytesRead += eh->_eftabLen*OFF_SIZE;
456 fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
457 #endif
458 } else {
459 _eftab.init(new TIndexOffU[eh->_eftabLen], eh->_eftabLen, true);
460 if(_switchEndian) {
461 for(TIndexOffU i = 0; i < eh->_eftabLen; i++)
462 this->eftab()[i] = readU<TIndexOffU>(_in1, _switchEndian);
463 } else {
464 size_t r = MM_READ(_in1, (void *)this->eftab(), eh->_eftabLen*OFF_SIZE);
465 if(r != (size_t)(eh->_eftabLen*OFF_SIZE)) {
466 cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl;
467 throw 1;
468 }
469 }
470 }
471 for(TIndexOffU i = 0; i < eh->_eftabLen; i++) {
472 if(i > 0 && this->eftab()[i] > 0) {
473 assert_geq(this->eftab()[i], this->eftab()[i-1]);
474 } else if(i > 0 && this->eftab()[i-1] == 0) {
475 assert_eq(0, this->eftab()[i]);
476 }
477 }
478 } else {
479 assert(ftab() == NULL);
480 assert(eftab() == NULL);
481 // Skip ftab
482 bytesRead += eh->_ftabLen*OFF_SIZE;
483 fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
484 // Skip eftab
485 bytesRead += eh->_eftabLen*OFF_SIZE;
486 fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
487 }
488 } catch(bad_alloc& e) {
489 cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
490 << "Please try again on a computer with more memory." << endl;
491 throw 1;
492 }
493
494 // Read reference sequence names from primary index file (or not,
495 // if --refidx is specified)
496 if(loadNames) {
497 while(true) {
498 char c = '\0';
499 if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
500 bytesRead++;
501 if(c == '\0') break;
502 else if(c == '\n') {
503 this->_refnames.push_back("");
504 } else {
505 if(this->_refnames.size() == 0) {
506 this->_refnames.push_back("");
507 }
508 this->_refnames.back().push_back(c);
509 }
510 }
511 }
512
513 _offs.reset();
514 if(loadSASamp) {
515 bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
516
517 shmemLeader = true;
518 if(_verbose || startVerbose) {
519 cerr << "Reading offs (" << offsLenSampled << std::setw(2) << OFF_SIZE*8 <<"-bit words): ";
520 logTime(cerr);
521 }
522
523 if(!_useMm) {
524 if(!useShmem_) {
525 // Allocate offs_
526 try {
527 _offs.init(new TIndexOffU[offsLenSampled], offsLenSampled, true);
528 } catch(bad_alloc& e) {
529 cerr << "Out of memory allocating the offs[] array for the Bowtie index." << endl
530 << "Please try again on a computer with more memory." << endl;
531 throw 1;
532 }
533 } else {
534 TIndexOffU *tmp = NULL;
535 shmemLeader = ALLOC_SHARED_U(
536 (_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &tmp,
537 "offs", (_verbose || startVerbose));
538 _offs.init((TIndexOffU*)tmp, offsLenSampled, false);
539 }
540 }
541
542 if(_overrideOffRate < 32) {
543 if(shmemLeader) {
544 // Allocate offs (big allocation)
545 if(_switchEndian || offRateDiff > 0) {
546 assert(!_useMm);
547 const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
548 const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 + 1)); // # U32s per block
549 char *buf;
550 try {
551 buf = new char[blockMaxSz];
552 } catch(std::bad_alloc& e) {
553 cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
554 throw e;
555 }
556 for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) {
557 TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i);
558 size_t r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1));
559 if(r != (size_t)(block << (OFF_SIZE/4 + 1))) {
560 cerr << "Error reading block of _offs[] array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl;
561 throw 1;
562 }
563 TIndexOffU idx = i >> offRateDiff;
564 for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) {
565 assert_lt(idx, offsLenSampled);
566 this->offs()[idx] = ((TIndexOffU*)buf)[j];
567 if(_switchEndian) {
568 this->offs()[idx] = endianSwapU(this->offs()[idx]);
569 }
570 idx++;
571 }
572 }
573 delete[] buf;
574 } else {
575 if(_useMm) {
576 #ifdef BOWTIE_MM
577 _offs.init((TIndexOffU*)(mmFile[1] + bytesRead), offsLen, false);
578 bytesRead += offsSz;
579 fseeko(_in2, offsSz, SEEK_CUR);
580 #endif
581 } else {
582 // Workaround for small-index mode where MM_READ may
583 // not be able to handle read amounts greater than 2^32
584 // bytes.
585 uint64_t bytesLeft = offsSz;
586 char *offs = (char *)this->offs();
587
588 while(bytesLeft > 0) {
589 size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
590 if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
591 cerr << "Error reading block of _offs[] array: "
592 << r << ", " << bytesLeft << gLastIOErrMsg << endl;
593 throw 1;
594 } else if (r == 0) {
595 cerr << "Error reading block of _offs[] array: no more data" << endl;
596 throw 1;
597 }
598 offs += r;
599 bytesLeft -= r;
600 }
601 }
602 }
603 #ifdef BOWTIE_SHARED_MEM
604 if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*OFF_SIZE);
605 #endif
606 } else {
607 // Not the shmem leader
608 fseeko(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR);
609 #ifdef BOWTIE_SHARED_MEM
610 if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*OFF_SIZE);
611 #endif
612 }
613 }
614 }
615
616 this->postReadInit(*eh); // Initialize fields of Ebwt not read from file
617 if(_verbose || startVerbose) print(cerr, *eh);
618
619 // The fact that _ebwt and friends actually point to something
620 // (other than NULL) now signals to other member functions that the
621 // Ebwt is loaded into memory.
622
623 done: // Exit hatch for both justHeader and !justHeader
624
625 // Be kind
626 if(deleteEh) delete eh;
627 if(_in1 != NULL) {
628 rewind(_in1);
629 }
630 if(_in2 != NULL) {
631 rewind(_in2);
632 }
633 }
634
635 /**
636 * Read reference names from an input stream 'in' for an Ebwt primary
637 * file and store them in 'refnames'.
638 */
639 void
readEbwtRefnames(FILE * fin,EList<string> & refnames)640 readEbwtRefnames(FILE* fin, EList<string>& refnames) {
641 // _in1 must already be open with the get cursor at the
642 // beginning and no error flags set.
643 assert(fin != NULL);
644 assert_eq(ftello(fin), 0);
645
646 // Read endianness hints from both streams
647 bool _switchEndian = false;
648 uint32_t one = readU<uint32_t>(fin, _switchEndian); // 1st word of primary stream
649 if(one != 1) {
650 assert_eq((1u<<24), one);
651 _switchEndian = true;
652 }
653
654 // Reads header entries one by one from primary stream
655 TIndexOffU len = readU<TIndexOffU>(fin, _switchEndian);
656 int32_t lineRate = readI<int32_t>(fin, _switchEndian);
657 /*int32_t linesPerSide =*/ readI<int32_t>(fin, _switchEndian);
658 int32_t offRate = readI<int32_t>(fin, _switchEndian);
659 int32_t ftabChars = readI<int32_t>(fin, _switchEndian);
660 // BTL: chunkRate is now deprecated
661 int32_t flags = readI<int32_t>(fin, _switchEndian);
662 bool color = false;
663 bool entireReverse = false;
664 if(flags < 0) {
665 color = (((-flags) & EBWT_COLOR) != 0);
666 entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0);
667 }
668
669 // Create a new EbwtParams from the entries read from primary stream
670 EbwtParams eh(len, lineRate, offRate, ftabChars, color, entireReverse);
671
672 TIndexOffU nPat = readI<TIndexOffU>(fin, _switchEndian); // nPat
673 fseeko(fin, nPat*OFF_SIZE, SEEK_CUR);
674
675 // Skip rstarts
676 TIndexOffU nFrag = readU<TIndexOffU>(fin, _switchEndian);
677 fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR);
678
679 // Skip ebwt
680 fseeko(fin, eh._ebwtTotLen, SEEK_CUR);
681
682 // Skip zOff from primary stream
683 readU<TIndexOffU>(fin, _switchEndian);
684
685 // Skip fchr
686 fseeko(fin, 5 * OFF_SIZE, SEEK_CUR);
687
688 // Skip ftab
689 fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR);
690
691 // Skip eftab
692 fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR);
693
694 // Read reference sequence names from primary index file
695 while(true) {
696 char c = '\0';
697 int read_value = 0;
698 read_value = fgetc(fin);
699 if(read_value == EOF) break;
700 c = read_value;
701 if(c == '\0') break;
702 else if(c == '\n') {
703 refnames.push_back("");
704 } else {
705 if(refnames.size() == 0) {
706 refnames.push_back("");
707 }
708 refnames.back().push_back(c);
709 }
710 }
711 if(refnames.back().empty()) {
712 refnames.pop_back();
713 }
714
715 // Be kind
716 fseeko(fin, 0, SEEK_SET);
717 assert(ferror(fin) == 0);
718 }
719
720 /**
721 * Read reference names from the index with basename 'in' and store
722 * them in 'refnames'.
723 */
724 void
readEbwtRefnames(const string & instr,EList<string> & refnames)725 readEbwtRefnames(const string& instr, EList<string>& refnames) {
726 FILE* fin;
727 // Initialize our primary and secondary input-stream fields
728 fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb");
729 if(fin == NULL) {
730 throw EbwtFileOpenException("Cannot open file " + instr);
731 }
732 assert_eq(ftello(fin), 0);
733 readEbwtRefnames(fin, refnames);
734 fclose(fin);
735 }
736
737 /**
738 * Read just enough of the Ebwt's header to get its flags
739 */
readFlags(const string & instr)740 int32_t Ebwt::readFlags(const string& instr) {
741 ifstream in;
742 // Initialize our primary and secondary input-stream fields
743 in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
744 if(!in.is_open()) {
745 throw EbwtFileOpenException("Cannot open file " + instr);
746 }
747 assert(in.is_open());
748 assert(in.good());
749 bool _switchEndian = false;
750 uint32_t one = readU<uint32_t>(in, _switchEndian); // 1st word of primary stream
751 if(one != 1) {
752 assert_eq((1u<<24), one);
753 assert_eq(1, endianSwapU32(one));
754 _switchEndian = true;
755 }
756 readU<TIndexOffU>(in, _switchEndian);
757 readI<int32_t>(in, _switchEndian);
758 readI<int32_t>(in, _switchEndian);
759 readI<int32_t>(in, _switchEndian);
760 readI<int32_t>(in, _switchEndian);
761 int32_t flags = readI<int32_t>(in, _switchEndian);
762 return flags;
763 }
764
765 /**
766 * Read just enough of the Ebwt's header to determine whether it's
767 * colorspace.
768 */
769 bool
readEbwtColor(const string & instr)770 readEbwtColor(const string& instr) {
771 int32_t flags = Ebwt::readFlags(instr);
772 if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
773 return true;
774 } else {
775 return false;
776 }
777 }
778
779 /**
780 * Read just enough of the Ebwt's header to determine whether it's
781 * entirely reversed.
782 */
783 bool
readEntireReverse(const string & instr)784 readEntireReverse(const string& instr) {
785 int32_t flags = Ebwt::readFlags(instr);
786 if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) {
787 return true;
788 } else {
789 return false;
790 }
791 }
792
793 /**
794 * Write an extended Burrows-Wheeler transform to a pair of output
795 * streams.
796 *
797 * @param out1 output stream to primary file
798 * @param out2 output stream to secondary file
799 * @param be write in big endian?
800 */
writeFromMemory(bool justHeader,ostream & out1,ostream & out2) const801 void Ebwt::writeFromMemory(bool justHeader,
802 ostream& out1,
803 ostream& out2) const
804 {
805 const EbwtParams& eh = this->_eh;
806 assert(eh.repOk());
807 assert(out1.good());
808 assert(out2.good());
809
810 // When building an Ebwt, these header parameters are known
811 // "up-front", i.e., they can be written to disk immediately,
812 // before we join() or buildToDisk()
813 writeI<int32_t>(out1, 1, _switchEndian); // endian hint for priamry stream
814 writeI<int32_t>(out2, 1, _switchEndian); // endian hint for secondary stream
815 writeU<TIndexOffU>(out1, eh._len, _switchEndian); // length of string (and bwt and suffix array)
816 writeI<int32_t>(out1, eh._lineRate, _switchEndian); // 2^lineRate = size in bytes of 1 line
817 writeI<int32_t>(out1, 2, _switchEndian); // not used
818 writeI<int32_t>(out1, eh._offRate, _switchEndian); // every 2^offRate chars is "marked"
819 writeI<int32_t>(out1, eh._ftabChars, _switchEndian); // number of 2-bit chars used to address ftab
820 int32_t flags = 1;
821 if(eh._color) flags |= EBWT_COLOR;
822 if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
823 writeI<int32_t>(out1, -flags, _switchEndian); // BTL: chunkRate is now deprecated
824
825 if(!justHeader) {
826 assert(rstarts() != NULL);
827 assert(offs() != NULL);
828 assert(ftab() != NULL);
829 assert(eftab() != NULL);
830 assert(isInMemory());
831 // These Ebwt parameters are known after the inputs strings have
832 // been joined() but before they have been built(). These can
833 // written to the disk next and then discarded from memory.
834 writeU<TIndexOffU>(out1, this->_nPat, _switchEndian);
835 for(TIndexOffU i = 0; i < this->_nPat; i++)
836 writeU<TIndexOffU>(out1, this->plen()[i], _switchEndian);
837 assert_geq(this->_nFrag, this->_nPat);
838 writeU<TIndexOffU>(out1, this->_nFrag, _switchEndian);
839 for(TIndexOffU i = 0; i < this->_nFrag*3; i++)
840 writeU<TIndexOffU>(out1, this->rstarts()[i], _switchEndian);
841
842 // These Ebwt parameters are discovered only as the Ebwt is being
843 // built (in buildToDisk()). Of these, only 'offs' and 'ebwt' are
844 // terribly large. 'ebwt' is written to the primary file and then
845 // discarded from memory as it is built; 'offs' is similarly
846 // written to the secondary file and discarded.
847 out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
848 writeU<TIndexOffU>(out1, this->zOff(), _switchEndian);
849 TIndexOffU offsLen = eh._offsLen;
850 for(TIndexOffU i = 0; i < offsLen; i++)
851 writeU<TIndexOffU>(out2, this->offs()[i], _switchEndian);
852
853 // 'fchr', 'ftab' and 'eftab' are not fully determined until the
854 // loop is finished, so they are written to the primary file after
855 // all of 'ebwt' has already been written and only then discarded
856 // from memory.
857 for(int i = 0; i < 5; i++)
858 writeU<TIndexOffU>(out1, this->fchr()[i], _switchEndian);
859 for(TIndexOffU i = 0; i < eh._ftabLen; i++)
860 writeU<TIndexOffU>(out1, this->ftab()[i], _switchEndian);
861 for(TIndexOffU i = 0; i < eh._eftabLen; i++)
862 writeU<TIndexOffU>(out1, this->eftab()[i], _switchEndian);
863 }
864 }
865
866 /**
867 * Given a pair of strings representing output filenames, and assuming
868 * this Ebwt object is currently in memory, write out this Ebwt to the
869 * specified files.
870 *
871 * If sanity-checking is enabled, then once the streams have been
872 * fully written and closed, we reopen them and read them into a
873 * (hopefully) exact copy of this Ebwt. We then assert that the
874 * current Ebwt and the copy match in all of their fields.
875 */
writeFromMemory(bool justHeader,const string & out1,const string & out2) const876 void Ebwt::writeFromMemory(bool justHeader,
877 const string& out1,
878 const string& out2) const
879 {
880 ASSERT_ONLY(const EbwtParams& eh = this->_eh);
881 assert(isInMemory());
882 assert(eh.repOk());
883
884 ofstream fout1(out1.c_str(), ios::binary);
885 ofstream fout2(out2.c_str(), ios::binary);
886 writeFromMemory(justHeader, fout1, fout2);
887 fout1.close();
888 fout2.close();
889
890 // Read the file back in and assert that all components match
891 if(_sanity) {
892 #if 0
893 if(_verbose)
894 cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
895 Ebwt copy(out1, out2, _verbose, _sanity);
896 assert(!isInMemory());
897 copy.loadIntoMemory(eh._color ? 1 : 0, true, false, false);
898 assert(isInMemory());
899 assert_eq(eh._lineRate, copy.eh()._lineRate);
900 assert_eq(eh._offRate, copy.eh()._offRate);
901 assert_eq(eh._ftabChars, copy.eh()._ftabChars);
902 assert_eq(eh._len, copy.eh()._len);
903 assert_eq(_zOff, copy.zOff());
904 assert_eq(_zEbwtBpOff, copy.zEbwtBpOff());
905 assert_eq(_zEbwtByteOff, copy.zEbwtByteOff());
906 assert_eq(_nPat, copy.nPat());
907 for(TIndexOffU i = 0; i < _nPat; i++)
908 assert_eq(this->_plen[i], copy.plen()[i]);
909 assert_eq(this->_nFrag, copy.nFrag());
910 for(TIndexOffU i = 0; i < this->nFrag*3; i++) {
911 assert_eq(this->_rstarts[i], copy.rstarts()[i]);
912 }
913 for(int i = 0; i < 5; i++)
914 assert_eq(this->_fchr[i], copy.fchr()[i]);
915 for(TIndexOffU i = 0; i < eh._ftabLen; i++)
916 assert_eq(this->ftab()[i], copy.ftab()[i]);
917 for(TIndexOffU i = 0; i < eh._eftabLen; i++)
918 assert_eq(this->eftab()[i], copy.eftab()[i]);
919 for(TIndexOffU i = 0; i < eh._offsLen; i++)
920 assert_eq(this->_offs[i], copy.offs()[i]);
921 for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++)
922 assert_eq(this->ebwt()[i], copy.ebwt()[i]);
923 copy.sanityCheckAll();
924 if(_verbose)
925 cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
926 #endif
927 }
928 }
929
930 /**
931 * Write the rstarts array given the szs array for the reference.
932 */
szsToDisk(const EList<RefRecord> & szs,ostream & os,int reverse)933 void Ebwt::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
934 TIndexOffU seq = 0;
935 TIndexOffU off = 0;
936 TIndexOffU totlen = 0;
937 for(unsigned int i = 0; i < szs.size(); i++) {
938 if(szs[i].len == 0) continue;
939 if(szs[i].first) off = 0;
940 off += szs[i].off;
941 if(szs[i].first && szs[i].len > 0) seq++;
942 TIndexOffU seqm1 = seq-1;
943 assert_lt(seqm1, _nPat);
944 TIndexOffU fwoff = off;
945 if(reverse == REF_READ_REVERSE) {
946 // Invert pattern idxs
947 seqm1 = _nPat - seqm1 - 1;
948 // Invert pattern idxs
949 assert_leq(off + szs[i].len, plen()[seqm1]);
950 fwoff = plen()[seqm1] - (off + szs[i].len);
951 }
952 writeU<TIndexOffU>(os, totlen, _switchEndian); // offset from beginning of joined string
953 writeU<TIndexOffU>(os, seqm1, _switchEndian); // sequence id
954 writeU<TIndexOffU>(os, fwoff, _switchEndian); // offset into sequence
955 totlen += szs[i].len;
956 off += szs[i].len;
957 }
958 }
959