1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <string>
21 #include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <stdlib.h>
25 #include "bt2_idx.h"
26 #include <iomanip>
27 
28 using namespace std;
29 
30 ///////////////////////////////////////////////////////////////////////
31 //
32 // Functions for reading and writing Ebwts
33 //
34 ///////////////////////////////////////////////////////////////////////
35 
36 /**
37  * Read an Ebwt from file with given filename.
38  */
readIntoMemory(int color,int needEntireRev,bool loadSASamp,bool loadFtab,bool loadRstarts,bool justHeader,EbwtParams * params,bool mmSweep,bool loadNames,bool startVerbose)39 void Ebwt::readIntoMemory(
40 	int color,
41 	int needEntireRev,
42 	bool loadSASamp,
43 	bool loadFtab,
44 	bool loadRstarts,
45 	bool justHeader,
46 	EbwtParams *params,
47 	bool mmSweep,
48 	bool loadNames,
49 	bool startVerbose)
50 {
51 #ifdef BOWTIE_MM
52 	char *mmFile[] = { NULL, NULL };
53 #endif
54 	if(_in1Str.length() > 0) {
55 		if(_verbose || startVerbose) {
56 			cerr << "  About to open input files: ";
57 			logTime(cerr);
58 		}
59 		// Initialize our primary and secondary input-stream fields
60 		if(_in1 != NULL) fclose(_in1);
61 		if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str.c_str() << "\"" << endl;
62 		if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
63 			cerr << "Could not open index file " << _in1Str.c_str() << endl;
64 		}
65 		if(loadSASamp) {
66 			if(_in2 != NULL) fclose(_in2);
67 			if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str.c_str() << "\"" << endl;
68 			if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
69 				cerr << "Could not open index file " << _in2Str.c_str() << endl;
70 			}
71 		}
72 		if(_verbose || startVerbose) {
73 			cerr << "  Finished opening input files: ";
74 			logTime(cerr);
75 		}
76 
77 #ifdef BOWTIE_MM
78 		if(_useMm /*&& !justHeader*/) {
79 			const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
80 			int fds[] = { fileno(_in1), fileno(_in2) };
81 			for(int i = 0; i < (loadSASamp ? 2 : 1); i++) {
82 				if(_verbose || startVerbose) {
83 					cerr << "  Memory-mapping input file " << (i+1) << ": ";
84 					logTime(cerr);
85 				}
86 				struct stat sbuf;
87 				if (stat(names[i], &sbuf) == -1) {
88 					perror("stat");
89 					cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
90 					throw 1;
91 				}
92 				mmFile[i] = (char*)mmap((void *)0, (size_t)sbuf.st_size,
93 										PROT_READ, MAP_SHARED, fds[(size_t)i], 0);
94 				if(mmFile[i] == (void *)(-1)) {
95 					perror("mmap");
96 					cerr << "Error: Could not memory-map the index file " << names[i] << endl;
97 					throw 1;
98 				}
99 				if(mmSweep) {
100 					int sum = 0;
101 					for(off_t j = 0; j < sbuf.st_size; j += 1024) {
102 						sum += (int) mmFile[i][j];
103 					}
104 					if(startVerbose) {
105 						cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
106 						logTime(cerr);
107 					}
108 				}
109 			}
110 			mmFile1_ = mmFile[0];
111 			mmFile2_ = loadSASamp ? mmFile[1] : NULL;
112 		}
113 #endif
114 	}
115 #ifdef BOWTIE_MM
116 	else if(_useMm && !justHeader) {
117 		mmFile[0] = mmFile1_;
118 		mmFile[1] = mmFile2_;
119 	}
120 	if(_useMm && !justHeader) {
121 		assert(mmFile[0] == mmFile1_);
122 		assert(mmFile[1] == mmFile2_);
123 	}
124 #endif
125 
126 	if(_verbose || startVerbose) {
127 		cerr << "  Reading header: ";
128 		logTime(cerr);
129 	}
130 
131 	// Read endianness hints from both streams
132 	uint64_t bytesRead = 0;
133 	_switchEndian = false;
134 	uint32_t one = readU<uint32_t>(_in1, _switchEndian); // 1st word of primary stream
135 	bytesRead += 4;
136 	if(loadSASamp) {
137 #ifndef NDEBUG
138 		assert_eq(one, readU<uint32_t>(_in2, _switchEndian)); // should match!
139 #else
140 		readU<uint32_t>(_in2, _switchEndian);
141 #endif
142 	}
143 	if(one != 1) {
144 		assert_eq((1u<<24), one);
145 		assert_eq(1, endianSwapU32(one));
146 		_switchEndian = true;
147 	}
148 
149 	// Can't switch endianness and use memory-mapped files; in order to
150 	// support this, someone has to modify the file to switch
151 	// endiannesses appropriately, and we can't do this inside Bowtie
152 	// or we might be setting up a race condition with other processes.
153 	if(_switchEndian && _useMm) {
154 		cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
155 		throw 1;
156 	}
157 
158 	// Reads header entries one by one from primary stream
159 	TIndexOffU len          = readU<TIndexOffU>(_in1, _switchEndian);
160 	bytesRead += OFF_SIZE;
161 	int32_t  lineRate     = readI<int32_t>(_in1, _switchEndian);
162 	bytesRead += 4;
163 	/*int32_t  linesPerSide =*/ readI<int32_t>(_in1, _switchEndian);
164 	bytesRead += 4;
165 	int32_t  offRate      = readI<int32_t>(_in1, _switchEndian);
166 	bytesRead += 4;
167 	// TODO: add isaRate to the actual file format (right now, the
168 	// user has to tell us whether there's an ISA sample and what the
169 	// sampling rate is.
170 	int32_t  ftabChars    = readI<int32_t>(_in1, _switchEndian);
171 	bytesRead += 4;
172 	// chunkRate was deprecated in an earlier version of Bowtie; now
173 	// we use it to hold flags.
174 	int32_t flags = readI<int32_t>(_in1, _switchEndian);
175 	bool entireRev = false;
176 	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
177 		if(color != -1 && !color) {
178 			cerr << "Error: -C was not specified when running bowtie, but index is in colorspace.  If" << endl
179 			     << "your reads are in colorspace, please use the -C option.  If your reads are not" << endl
180 			     << "in colorspace, please use a normal index (one built without specifying -C to" << endl
181 			     << "bowtie-build)." << endl;
182 			throw 1;
183 		}
184 		color = 1;
185 	} else if(flags < 0) {
186 		if(color != -1 && color) {
187 			cerr << "Error: -C was specified when running bowtie, but index is not in colorspace.  If" << endl
188 			     << "your reads are in colorspace, please use a colorspace index (one built using" << endl
189 			     << "bowtie-build -C).  If your reads are not in colorspace, don't specify -C when" << endl
190 			     << "running bowtie." << endl;
191 			throw 1;
192 		}
193 		color = 0;
194 	}
195 	if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
196 		if(needEntireRev != -1 && needEntireRev != 0) {
197 			cerr << "Error: This index is compatible with 0.* versions of Bowtie, but not with 2.*" << endl
198 			     << "versions.  Please build or download a version of the index that is compitble" << endl
199 				 << "with Bowtie 2.* (i.e. built with bowtie-build 2.* or later)" << endl;
200 			throw 1;
201 		}
202 	} else entireRev = true;
203 	bytesRead += 4;
204 
205 	// Create a new EbwtParams from the entries read from primary stream
206 	EbwtParams *eh;
207 	bool deleteEh = false;
208 	if(params != NULL) {
209 		params->init(len, lineRate, offRate, ftabChars, color, entireRev);
210 		if(_verbose || startVerbose) params->print(cerr);
211 		eh = params;
212 	} else {
213 		eh = new EbwtParams(len, lineRate, offRate, ftabChars, color, entireRev);
214 		deleteEh = true;
215 	}
216 
217 	// Set up overridden suffix-array-sample parameters
218 	TIndexOffU offsLen = eh->_offsLen;
219 	uint64_t offsSz = eh->_offsSz;
220 	TIndexOffU offRateDiff = 0;
221 	TIndexOffU offsLenSampled = offsLen;
222 	if(_overrideOffRate > offRate) {
223 		offRateDiff = _overrideOffRate - offRate;
224 	}
225 	if(offRateDiff > 0) {
226 		offsLenSampled >>= offRateDiff;
227 		if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) {
228 			offsLenSampled++;
229 		}
230 	}
231 
232 	// Can't override the offrate or isarate and use memory-mapped
233 	// files; ultimately, all processes need to copy the sparser sample
234 	// into their own memory spaces.
235 	if(_useMm && (offRateDiff)) {
236 		cerr << "Error: Can't use memory-mapped files when the offrate is overridden" << endl;
237 		throw 1;
238 	}
239 
240 	// Read nPat from primary stream
241 	this->_nPat = readI<TIndexOffU>(_in1, _switchEndian);
242 	bytesRead += OFF_SIZE;
243 	_plen.reset();
244 	// Read plen from primary stream
245 	if(_useMm) {
246 #ifdef BOWTIE_MM
247 		_plen.init((TIndexOffU*)(mmFile[0] + bytesRead), _nPat, false);
248 		bytesRead += _nPat*OFF_SIZE;
249 		fseeko(_in1, _nPat*OFF_SIZE, SEEK_CUR);
250 #endif
251 	} else {
252 		try {
253 			if(_verbose || startVerbose) {
254 				cerr << "Reading plen (" << this->_nPat << "): ";
255 				logTime(cerr);
256 			}
257 			_plen.init(new TIndexOffU[_nPat], _nPat, true);
258 			if(_switchEndian) {
259 				for(TIndexOffU i = 0; i < this->_nPat; i++) {
260 					plen()[i] = readU<TIndexOffU>(_in1, _switchEndian);
261 				}
262 			} else {
263 				size_t r = MM_READ(_in1, (void*)(plen()), _nPat*OFF_SIZE);
264 				if(r != (size_t)(_nPat*OFF_SIZE)) {
265 					cerr << "Error reading _plen[] array: " << r << ", " << _nPat*OFF_SIZE << endl;
266 					throw 1;
267 				}
268 			}
269 		} catch(bad_alloc& e) {
270 			cerr << "Out of memory allocating plen[] in Ebwt::read()"
271 			<< " at " << __FILE__ << ":" << __LINE__ << endl;
272 			throw e;
273 		}
274 	}
275 
276 	bool shmemLeader;
277 
278 	// TODO: I'm not consistent on what "header" means.  Here I'm using
279 	// "header" to mean everything that would exist in memory if we
280 	// started to build the Ebwt but stopped short of the build*() step
281 	// (i.e. everything up to and including join()).
282 	if(justHeader) goto done;
283 
284 	this->_nFrag = readU<TIndexOffU>(_in1, _switchEndian);
285 	bytesRead += OFF_SIZE;
286 	if(_verbose || startVerbose) {
287 		cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
288 		logTime(cerr);
289 	}
290 	assert_geq(this->_nFrag, this->_nPat);
291 	_rstarts.reset();
292 	if(loadRstarts) {
293 		if(_useMm) {
294 #ifdef BOWTIE_MM
295 			_rstarts.init((TIndexOffU*)(mmFile[0] + bytesRead), _nFrag*3, false);
296 			bytesRead += this->_nFrag*OFF_SIZE*3;
297 			fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
298 #endif
299 		} else {
300 			_rstarts.init(new TIndexOffU[_nFrag*3], _nFrag*3, true);
301 			if(_switchEndian) {
302 				for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) {
303 					// fragment starting position in joined reference
304 					// string, text id, and fragment offset within text
305 					this->rstarts()[i]   = readU<TIndexOffU>(_in1, _switchEndian);
306 					this->rstarts()[i+1] = readU<TIndexOffU>(_in1, _switchEndian);
307 					this->rstarts()[i+2] = readU<TIndexOffU>(_in1, _switchEndian);
308 				}
309 			} else {
310 				size_t r = MM_READ(_in1, (void *)rstarts(), this->_nFrag*OFF_SIZE*3);
311 				if(r != (size_t)(this->_nFrag*OFF_SIZE*3)) {
312 					cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl;
313 					throw 1;
314 				}
315 			}
316 		}
317 	} else {
318 		// Skip em
319 		assert(rstarts() == NULL);
320 		bytesRead += this->_nFrag*OFF_SIZE*3;
321 		fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
322 	}
323 
324 	_ebwt.reset();
325 	if(_useMm) {
326 #ifdef BOWTIE_MM
327 		_ebwt.init((uint8_t*)(mmFile[0] + bytesRead), eh->_ebwtTotLen, false);
328 		bytesRead += eh->_ebwtTotLen;
329 		fseek(_in1, eh->_ebwtTotLen, SEEK_CUR);
330 #endif
331 	} else {
332 		// Allocate ebwt (big allocation)
333 		if(_verbose || startVerbose) {
334 			cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): ";
335 			logTime(cerr);
336 		}
337 		bool shmemLeader = true;
338 		if(useShmem_) {
339 			uint8_t *tmp = NULL;
340 			shmemLeader = ALLOC_SHARED_U8(
341 				(_in1Str + "[ebwt]"), eh->_ebwtTotLen, &tmp,
342 				"ebwt[]", (_verbose || startVerbose));
343 			assert(tmp != NULL);
344 			_ebwt.init(tmp, eh->_ebwtTotLen, false);
345 			if(_verbose || startVerbose) {
346 				cerr << "  shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
347 			}
348 		} else {
349 			try {
350 				_ebwt.init(new uint8_t[eh->_ebwtTotLen], eh->_ebwtTotLen, true);
351 			} catch(bad_alloc& e) {
352 				cerr << "Out of memory allocating the ebwt[] array for the Bowtie index.  Please try" << endl
353 				<< "again on a computer with more memory." << endl;
354 				throw 1;
355 			}
356 		}
357 		if(shmemLeader) {
358 			// Read ebwt from primary stream
359 			uint64_t bytesLeft = eh->_ebwtTotLen;
360 			char *pebwt = (char*)this->ebwt();
361 
362 			while (bytesLeft>0){
363 				size_t r = MM_READ(_in1, (void *)pebwt, bytesLeft);
364 				if(MM_IS_IO_ERR(_in1,r,bytesLeft)) {
365 					cerr << "Error reading _ebwt[] array: " << r << ", "
366 						 << bytesLeft << gLastIOErrMsg << endl;
367 					throw 1;
368 				} else if (r == 0) {
369 					cerr << "Error reading _ebwt[] array: no more data" << endl;
370 					throw 1;
371 				}
372 				pebwt += r;
373 				bytesLeft -= r;
374 			}
375 #ifdef BOWTIE_SHARED_MEM
376 			if(useShmem_) NOTIFY_SHARED(ebwt(), eh->_ebwtTotLen);
377 #endif
378 		} else {
379 			// Seek past the data and wait until master is finished
380 			fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
381 #ifdef BOWTIE_SHARED_MEM
382 			if(useShmem_) WAIT_SHARED(ebwt(), eh->_ebwtTotLen);
383 #endif
384 		}
385 	}
386 
387 	// Read zOff from primary stream
388 	_zOff = readU<TIndexOffU>(_in1, _switchEndian);
389 	bytesRead += OFF_SIZE;
390 	assert_lt(_zOff, len);
391 
392 	try {
393 		// Read fchr from primary stream
394 		if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
395 		_fchr.reset();
396 		if(_useMm) {
397 #ifdef BOWTIE_MM
398 			_fchr.init((TIndexOffU*)(mmFile[0] + bytesRead), 5, false);
399 			bytesRead += 5*OFF_SIZE;
400 			fseek(_in1, 5*OFF_SIZE, SEEK_CUR);
401 #endif
402 		} else {
403 			_fchr.init(new TIndexOffU[5], 5, true);
404 			for(int i = 0; i < 5; i++) {
405 				this->fchr()[i] = readU<TIndexOffU>(_in1, _switchEndian);
406 				assert_leq(this->fchr()[i], len);
407 				assert(i <= 0 || this->fchr()[i] >= this->fchr()[i-1]);
408 			}
409 		}
410 		assert_gt(this->fchr()[4], this->fchr()[0]);
411 		// Read ftab from primary stream
412 		if(_verbose || startVerbose) {
413 			if(loadFtab) {
414 				cerr << "Reading ftab (" << eh->_ftabLen << "): ";
415 				logTime(cerr);
416 			} else {
417 				cerr << "Skipping ftab (" << eh->_ftabLen << "): ";
418 			}
419 		}
420 		_ftab.reset();
421 		if(loadFtab) {
422 			if(_useMm) {
423 #ifdef BOWTIE_MM
424 				_ftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_ftabLen, false);
425 				bytesRead += eh->_ftabLen*OFF_SIZE;
426 				fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
427 #endif
428 			} else {
429 				_ftab.init(new TIndexOffU[eh->_ftabLen], eh->_ftabLen, true);
430 				if(_switchEndian) {
431 					for(TIndexOffU i = 0; i < eh->_ftabLen; i++)
432 						this->ftab()[i] = readU<TIndexOffU>(_in1, _switchEndian);
433 				} else {
434 					size_t r = MM_READ(_in1, (void *)ftab(), eh->_ftabLen*OFF_SIZE);
435 					if(r != (size_t)(eh->_ftabLen*OFF_SIZE)) {
436 						cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl;
437 						throw 1;
438 					}
439 				}
440 			}
441 			// Read etab from primary stream
442 			if(_verbose || startVerbose) {
443 				if(loadFtab) {
444 					cerr << "Reading eftab (" << eh->_eftabLen << "): ";
445 					logTime(cerr);
446 				} else {
447 					cerr << "Skipping eftab (" << eh->_eftabLen << "): ";
448 				}
449 
450 			}
451 			_eftab.reset();
452 			if(_useMm) {
453 #ifdef BOWTIE_MM
454 				_eftab.init((TIndexOffU*)(mmFile[0] + bytesRead), eh->_eftabLen, false);
455 				bytesRead += eh->_eftabLen*OFF_SIZE;
456 				fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
457 #endif
458 			} else {
459 				_eftab.init(new TIndexOffU[eh->_eftabLen], eh->_eftabLen, true);
460 				if(_switchEndian) {
461 					for(TIndexOffU i = 0; i < eh->_eftabLen; i++)
462 						this->eftab()[i] = readU<TIndexOffU>(_in1, _switchEndian);
463 				} else {
464 					size_t r = MM_READ(_in1, (void *)this->eftab(), eh->_eftabLen*OFF_SIZE);
465 					if(r != (size_t)(eh->_eftabLen*OFF_SIZE)) {
466 						cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl;
467 						throw 1;
468 					}
469 				}
470 			}
471 			for(TIndexOffU i = 0; i < eh->_eftabLen; i++) {
472 				if(i > 0 && this->eftab()[i] > 0) {
473 					assert_geq(this->eftab()[i], this->eftab()[i-1]);
474 				} else if(i > 0 && this->eftab()[i-1] == 0) {
475 					assert_eq(0, this->eftab()[i]);
476 				}
477 			}
478 		} else {
479 			assert(ftab() == NULL);
480 			assert(eftab() == NULL);
481 			// Skip ftab
482 			bytesRead += eh->_ftabLen*OFF_SIZE;
483 			fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
484 			// Skip eftab
485 			bytesRead += eh->_eftabLen*OFF_SIZE;
486 			fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
487 		}
488 	} catch(bad_alloc& e) {
489 		cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
490 		<< "Please try again on a computer with more memory." << endl;
491 		throw 1;
492 	}
493 
494 	// Read reference sequence names from primary index file (or not,
495 	// if --refidx is specified)
496 	if(loadNames) {
497 		while(true) {
498 			char c = '\0';
499 			if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
500 			bytesRead++;
501 			if(c == '\0') break;
502 			else if(c == '\n') {
503 				this->_refnames.push_back("");
504 			} else {
505 				if(this->_refnames.size() == 0) {
506 					this->_refnames.push_back("");
507 				}
508 				this->_refnames.back().push_back(c);
509 			}
510 		}
511 	}
512 
513 	_offs.reset();
514 	if(loadSASamp) {
515 		bytesRead = 4; // reset for secondary index file (already read 1-sentinel)
516 
517 		shmemLeader = true;
518 		if(_verbose || startVerbose) {
519 			cerr << "Reading offs (" << offsLenSampled << std::setw(2) << OFF_SIZE*8 <<"-bit words): ";
520 			logTime(cerr);
521 		}
522 
523 		if(!_useMm) {
524 			if(!useShmem_) {
525 				// Allocate offs_
526 				try {
527 					_offs.init(new TIndexOffU[offsLenSampled], offsLenSampled, true);
528 				} catch(bad_alloc& e) {
529 					cerr << "Out of memory allocating the offs[] array  for the Bowtie index." << endl
530 					<< "Please try again on a computer with more memory." << endl;
531 					throw 1;
532 				}
533 			} else {
534 				TIndexOffU *tmp = NULL;
535 				shmemLeader = ALLOC_SHARED_U(
536 					(_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &tmp,
537 					"offs", (_verbose || startVerbose));
538 				_offs.init((TIndexOffU*)tmp, offsLenSampled, false);
539 			}
540 		}
541 
542 		if(_overrideOffRate < 32) {
543 			if(shmemLeader) {
544 				// Allocate offs (big allocation)
545 				if(_switchEndian || offRateDiff > 0) {
546 					assert(!_useMm);
547 					const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
548 					const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 + 1)); // # U32s per block
549 					char *buf;
550 					try {
551 						buf = new char[blockMaxSz];
552 					} catch(std::bad_alloc& e) {
553 						cerr << "Error: Out of memory allocating part of _offs array: '" << e.what() << "'" << endl;
554 						throw e;
555 					}
556 					for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) {
557 						TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i);
558 						size_t r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1));
559 						if(r != (size_t)(block << (OFF_SIZE/4 + 1))) {
560 							cerr << "Error reading block of _offs[] array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl;
561 							throw 1;
562 						}
563 						TIndexOffU idx = i >> offRateDiff;
564 						for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) {
565 							assert_lt(idx, offsLenSampled);
566 							this->offs()[idx] = ((TIndexOffU*)buf)[j];
567 							if(_switchEndian) {
568 								this->offs()[idx] = endianSwapU(this->offs()[idx]);
569 							}
570 							idx++;
571 						}
572 					}
573 					delete[] buf;
574 				} else {
575 					if(_useMm) {
576 #ifdef BOWTIE_MM
577 						_offs.init((TIndexOffU*)(mmFile[1] + bytesRead), offsLen, false);
578 						bytesRead += offsSz;
579 						fseeko(_in2, offsSz, SEEK_CUR);
580 #endif
581 					} else {
582 						// Workaround for small-index mode where MM_READ may
583 						// not be able to handle read amounts greater than 2^32
584 						// bytes.
585 						uint64_t bytesLeft = offsSz;
586 						char *offs = (char *)this->offs();
587 
588 						while(bytesLeft > 0) {
589 							size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
590 							if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
591 								cerr << "Error reading block of _offs[] array: "
592 								     << r << ", " << bytesLeft << gLastIOErrMsg << endl;
593 								throw 1;
594 							} else if (r == 0) {
595 								cerr << "Error reading block of _offs[] array: no more data" << endl;
596 								throw 1;
597 							}
598 							offs += r;
599 							bytesLeft -= r;
600 						}
601 					}
602 				}
603 #ifdef BOWTIE_SHARED_MEM
604 				if(useShmem_) NOTIFY_SHARED(offs(), offsLenSampled*OFF_SIZE);
605 #endif
606 			} else {
607 				// Not the shmem leader
608 				fseeko(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR);
609 #ifdef BOWTIE_SHARED_MEM
610 				if(useShmem_) WAIT_SHARED(offs(), offsLenSampled*OFF_SIZE);
611 #endif
612 			}
613 		}
614 	}
615 
616 	this->postReadInit(*eh); // Initialize fields of Ebwt not read from file
617 	if(_verbose || startVerbose) print(cerr, *eh);
618 
619 	// The fact that _ebwt and friends actually point to something
620 	// (other than NULL) now signals to other member functions that the
621 	// Ebwt is loaded into memory.
622 
623 done: // Exit hatch for both justHeader and !justHeader
624 
625 	// Be kind
626 	if(deleteEh) delete eh;
627 	if(_in1 != NULL) {
628 		rewind(_in1);
629 	}
630 	if(_in2 != NULL) {
631 		rewind(_in2);
632 	}
633 }
634 
635 /**
636  * Read reference names from an input stream 'in' for an Ebwt primary
637  * file and store them in 'refnames'.
638  */
639 void
readEbwtRefnames(FILE * fin,EList<string> & refnames)640 readEbwtRefnames(FILE* fin, EList<string>& refnames) {
641 	// _in1 must already be open with the get cursor at the
642 	// beginning and no error flags set.
643 	assert(fin != NULL);
644 	assert_eq(ftello(fin), 0);
645 
646 	// Read endianness hints from both streams
647 	bool _switchEndian = false;
648 	uint32_t one = readU<uint32_t>(fin, _switchEndian); // 1st word of primary stream
649 	if(one != 1) {
650 		assert_eq((1u<<24), one);
651 		_switchEndian = true;
652 	}
653 
654 	// Reads header entries one by one from primary stream
655 	TIndexOffU len          = readU<TIndexOffU>(fin, _switchEndian);
656 	int32_t  lineRate     = readI<int32_t>(fin, _switchEndian);
657 	/*int32_t  linesPerSide =*/ readI<int32_t>(fin, _switchEndian);
658 	int32_t  offRate      = readI<int32_t>(fin, _switchEndian);
659 	int32_t  ftabChars    = readI<int32_t>(fin, _switchEndian);
660 	// BTL: chunkRate is now deprecated
661 	int32_t flags = readI<int32_t>(fin, _switchEndian);
662 	bool color = false;
663 	bool entireReverse = false;
664 	if(flags < 0) {
665 		color = (((-flags) & EBWT_COLOR) != 0);
666 		entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0);
667 	}
668 
669 	// Create a new EbwtParams from the entries read from primary stream
670 	EbwtParams eh(len, lineRate, offRate, ftabChars, color, entireReverse);
671 
672 	TIndexOffU nPat = readI<TIndexOffU>(fin, _switchEndian); // nPat
673 	fseeko(fin, nPat*OFF_SIZE, SEEK_CUR);
674 
675 	// Skip rstarts
676 	TIndexOffU nFrag = readU<TIndexOffU>(fin, _switchEndian);
677 	fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR);
678 
679 	// Skip ebwt
680 	fseeko(fin, eh._ebwtTotLen, SEEK_CUR);
681 
682 	// Skip zOff from primary stream
683 	readU<TIndexOffU>(fin, _switchEndian);
684 
685 	// Skip fchr
686 	fseeko(fin, 5 * OFF_SIZE, SEEK_CUR);
687 
688 	// Skip ftab
689 	fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR);
690 
691 	// Skip eftab
692 	fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR);
693 
694 	// Read reference sequence names from primary index file
695 	while(true) {
696 		char c = '\0';
697 		int read_value = 0;
698         read_value = fgetc(fin);
699 		if(read_value == EOF) break;
700         c = read_value;
701 		if(c == '\0') break;
702 		else if(c == '\n') {
703 			refnames.push_back("");
704 		} else {
705 			if(refnames.size() == 0) {
706 				refnames.push_back("");
707 			}
708 			refnames.back().push_back(c);
709 		}
710 	}
711 	if(refnames.back().empty()) {
712 		refnames.pop_back();
713 	}
714 
715 	// Be kind
716     fseeko(fin, 0, SEEK_SET);
717 	assert(ferror(fin) == 0);
718 }
719 
720 /**
721  * Read reference names from the index with basename 'in' and store
722  * them in 'refnames'.
723  */
724 void
readEbwtRefnames(const string & instr,EList<string> & refnames)725 readEbwtRefnames(const string& instr, EList<string>& refnames) {
726     FILE* fin;
727 	// Initialize our primary and secondary input-stream fields
728     fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb");
729 	if(fin == NULL) {
730 		throw EbwtFileOpenException("Cannot open file " + instr);
731 	}
732 	assert_eq(ftello(fin), 0);
733 	readEbwtRefnames(fin, refnames);
734     fclose(fin);
735 }
736 
737 /**
738  * Read just enough of the Ebwt's header to get its flags
739  */
readFlags(const string & instr)740 int32_t Ebwt::readFlags(const string& instr) {
741 	ifstream in;
742 	// Initialize our primary and secondary input-stream fields
743 	in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
744 	if(!in.is_open()) {
745 		throw EbwtFileOpenException("Cannot open file " + instr);
746 	}
747 	assert(in.is_open());
748 	assert(in.good());
749 	bool _switchEndian = false;
750 	uint32_t one = readU<uint32_t>(in, _switchEndian); // 1st word of primary stream
751 	if(one != 1) {
752 		assert_eq((1u<<24), one);
753 		assert_eq(1, endianSwapU32(one));
754 		_switchEndian = true;
755 	}
756 	readU<TIndexOffU>(in, _switchEndian);
757 	readI<int32_t>(in, _switchEndian);
758 	readI<int32_t>(in, _switchEndian);
759 	readI<int32_t>(in, _switchEndian);
760 	readI<int32_t>(in, _switchEndian);
761 	int32_t flags = readI<int32_t>(in, _switchEndian);
762 	return flags;
763 }
764 
765 /**
766  * Read just enough of the Ebwt's header to determine whether it's
767  * colorspace.
768  */
769 bool
readEbwtColor(const string & instr)770 readEbwtColor(const string& instr) {
771 	int32_t flags = Ebwt::readFlags(instr);
772 	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
773 		return true;
774 	} else {
775 		return false;
776 	}
777 }
778 
779 /**
780  * Read just enough of the Ebwt's header to determine whether it's
781  * entirely reversed.
782  */
783 bool
readEntireReverse(const string & instr)784 readEntireReverse(const string& instr) {
785 	int32_t flags = Ebwt::readFlags(instr);
786 	if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) {
787 		return true;
788 	} else {
789 		return false;
790 	}
791 }
792 
793 /**
794  * Write an extended Burrows-Wheeler transform to a pair of output
795  * streams.
796  *
797  * @param out1 output stream to primary file
798  * @param out2 output stream to secondary file
799  * @param be   write in big endian?
800  */
writeFromMemory(bool justHeader,ostream & out1,ostream & out2) const801 void Ebwt::writeFromMemory(bool justHeader,
802                            ostream& out1,
803                            ostream& out2) const
804 {
805 	const EbwtParams& eh = this->_eh;
806 	assert(eh.repOk());
807 	assert(out1.good());
808 	assert(out2.good());
809 
810 	// When building an Ebwt, these header parameters are known
811 	// "up-front", i.e., they can be written to disk immediately,
812 	// before we join() or buildToDisk()
813 	writeI<int32_t>(out1, 1, _switchEndian); // endian hint for priamry stream
814 	writeI<int32_t>(out2, 1, _switchEndian); // endian hint for secondary stream
815 	writeU<TIndexOffU>(out1, eh._len,          _switchEndian); // length of string (and bwt and suffix array)
816 	writeI<int32_t>(out1, eh._lineRate,     _switchEndian); // 2^lineRate = size in bytes of 1 line
817 	writeI<int32_t>(out1, 2,                _switchEndian); // not used
818 	writeI<int32_t>(out1, eh._offRate,      _switchEndian); // every 2^offRate chars is "marked"
819 	writeI<int32_t>(out1, eh._ftabChars,    _switchEndian); // number of 2-bit chars used to address ftab
820 	int32_t flags = 1;
821 	if(eh._color) flags |= EBWT_COLOR;
822 	if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
823 	writeI<int32_t>(out1, -flags, _switchEndian); // BTL: chunkRate is now deprecated
824 
825 	if(!justHeader) {
826 		assert(rstarts() != NULL);
827 		assert(offs() != NULL);
828 		assert(ftab() != NULL);
829 		assert(eftab() != NULL);
830 		assert(isInMemory());
831 		// These Ebwt parameters are known after the inputs strings have
832 		// been joined() but before they have been built().  These can
833 		// written to the disk next and then discarded from memory.
834 		writeU<TIndexOffU>(out1, this->_nPat,      _switchEndian);
835 		for(TIndexOffU i = 0; i < this->_nPat; i++)
836 			writeU<TIndexOffU>(out1, this->plen()[i], _switchEndian);
837 		assert_geq(this->_nFrag, this->_nPat);
838 		writeU<TIndexOffU>(out1, this->_nFrag, _switchEndian);
839 		for(TIndexOffU i = 0; i < this->_nFrag*3; i++)
840 			writeU<TIndexOffU>(out1, this->rstarts()[i], _switchEndian);
841 
842 		// These Ebwt parameters are discovered only as the Ebwt is being
843 		// built (in buildToDisk()).  Of these, only 'offs' and 'ebwt' are
844 		// terribly large.  'ebwt' is written to the primary file and then
845 		// discarded from memory as it is built; 'offs' is similarly
846 		// written to the secondary file and discarded.
847 		out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
848 		writeU<TIndexOffU>(out1, this->zOff(), _switchEndian);
849 		TIndexOffU offsLen = eh._offsLen;
850 		for(TIndexOffU i = 0; i < offsLen; i++)
851 			writeU<TIndexOffU>(out2, this->offs()[i], _switchEndian);
852 
853 		// 'fchr', 'ftab' and 'eftab' are not fully determined until the
854 		// loop is finished, so they are written to the primary file after
855 		// all of 'ebwt' has already been written and only then discarded
856 		// from memory.
857 		for(int i = 0; i < 5; i++)
858 			writeU<TIndexOffU>(out1, this->fchr()[i], _switchEndian);
859 		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
860 			writeU<TIndexOffU>(out1, this->ftab()[i], _switchEndian);
861 		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
862 			writeU<TIndexOffU>(out1, this->eftab()[i], _switchEndian);
863 	}
864 }
865 
866 /**
867  * Given a pair of strings representing output filenames, and assuming
868  * this Ebwt object is currently in memory, write out this Ebwt to the
869  * specified files.
870  *
871  * If sanity-checking is enabled, then once the streams have been
872  * fully written and closed, we reopen them and read them into a
873  * (hopefully) exact copy of this Ebwt.  We then assert that the
874  * current Ebwt and the copy match in all of their fields.
875  */
writeFromMemory(bool justHeader,const string & out1,const string & out2) const876 void Ebwt::writeFromMemory(bool justHeader,
877                            const string& out1,
878                            const string& out2) const
879 {
880 	ASSERT_ONLY(const EbwtParams& eh = this->_eh);
881 	assert(isInMemory());
882 	assert(eh.repOk());
883 
884 	ofstream fout1(out1.c_str(), ios::binary);
885 	ofstream fout2(out2.c_str(), ios::binary);
886 	writeFromMemory(justHeader, fout1, fout2);
887 	fout1.close();
888 	fout2.close();
889 
890 	// Read the file back in and assert that all components match
891 	if(_sanity) {
892 #if 0
893 		if(_verbose)
894 			cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
895 		Ebwt copy(out1, out2, _verbose, _sanity);
896 		assert(!isInMemory());
897 		copy.loadIntoMemory(eh._color ? 1 : 0, true, false, false);
898 		assert(isInMemory());
899 	    assert_eq(eh._lineRate,     copy.eh()._lineRate);
900 	    assert_eq(eh._offRate,      copy.eh()._offRate);
901 	    assert_eq(eh._ftabChars,    copy.eh()._ftabChars);
902 	    assert_eq(eh._len,          copy.eh()._len);
903 	    assert_eq(_zOff,             copy.zOff());
904 	    assert_eq(_zEbwtBpOff,       copy.zEbwtBpOff());
905 	    assert_eq(_zEbwtByteOff,     copy.zEbwtByteOff());
906 		assert_eq(_nPat,             copy.nPat());
907 		for(TIndexOffU i = 0; i < _nPat; i++)
908 			assert_eq(this->_plen[i], copy.plen()[i]);
909 		assert_eq(this->_nFrag, copy.nFrag());
910 		for(TIndexOffU i = 0; i < this->nFrag*3; i++) {
911 			assert_eq(this->_rstarts[i], copy.rstarts()[i]);
912 		}
913 		for(int i = 0; i < 5; i++)
914 			assert_eq(this->_fchr[i], copy.fchr()[i]);
915 		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
916 			assert_eq(this->ftab()[i], copy.ftab()[i]);
917 		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
918 			assert_eq(this->eftab()[i], copy.eftab()[i]);
919 		for(TIndexOffU i = 0; i < eh._offsLen; i++)
920 			assert_eq(this->_offs[i], copy.offs()[i]);
921 		for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++)
922 			assert_eq(this->ebwt()[i], copy.ebwt()[i]);
923 		copy.sanityCheckAll();
924 		if(_verbose)
925 			cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
926 #endif
927 	}
928 }
929 
930 /**
931  * Write the rstarts array given the szs array for the reference.
932  */
szsToDisk(const EList<RefRecord> & szs,ostream & os,int reverse)933 void Ebwt::szsToDisk(const EList<RefRecord>& szs, ostream& os, int reverse) {
934 	TIndexOffU seq = 0;
935 	TIndexOffU off = 0;
936 	TIndexOffU totlen = 0;
937 	for(unsigned int i = 0; i < szs.size(); i++) {
938 		if(szs[i].len == 0) continue;
939 		if(szs[i].first) off = 0;
940 		off += szs[i].off;
941 		if(szs[i].first && szs[i].len > 0) seq++;
942 		TIndexOffU seqm1 = seq-1;
943 		assert_lt(seqm1, _nPat);
944 		TIndexOffU fwoff = off;
945 		if(reverse == REF_READ_REVERSE) {
946 			// Invert pattern idxs
947 			seqm1 = _nPat - seqm1 - 1;
948 			// Invert pattern idxs
949 			assert_leq(off + szs[i].len, plen()[seqm1]);
950 			fwoff = plen()[seqm1] - (off + szs[i].len);
951 		}
952 		writeU<TIndexOffU>(os, totlen, _switchEndian); // offset from beginning of joined string
953 		writeU<TIndexOffU>(os, seqm1,  _switchEndian); // sequence id
954 		writeU<TIndexOffU>(os, fwoff,  _switchEndian); // offset into sequence
955 		totlen += szs[i].len;
956 		off += szs[i].len;
957 	}
958 }
959