1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <string>
21 #include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <stdlib.h>
25 #include "bt2_idx.h"
26 
27 using namespace std;
28 
29 #ifdef BOWTIE_64BIT_INDEX
30 
31 const std::string gEbwt_ext("bt2l");
32 
33 #else
34 
35 const std::string gEbwt_ext("bt2");
36 
37 #endif  // BOWTIE_64BIT_INDEX
38 
39 string gLastIOErrMsg;
40 
41 ///////////////////////////////////////////////////////////////////////
42 //
43 // Functions for searching Ebwts
44 // (But most of them are defined in the header)
45 //
46 ///////////////////////////////////////////////////////////////////////
47 
48 /**
49  * Take an offset into the joined text and translate it into the
50  * reference of the index it falls on, the offset into the reference,
51  * and the length of the reference.  Use a binary search through the
52  * sorted list of reference fragment ranges t
53  */
joinedToTextOff(TIndexOffU qlen,TIndexOffU off,TIndexOffU & tidx,TIndexOffU & textoff,TIndexOffU & tlen,bool rejectStraddle,bool & straddled) const54 void Ebwt::joinedToTextOff(
55 	TIndexOffU qlen,
56 	TIndexOffU off,
57 	TIndexOffU& tidx,
58 	TIndexOffU& textoff,
59 	TIndexOffU& tlen,
60 	bool rejectStraddle,
61 	bool& straddled) const
62 {
63 	assert(rstarts() != NULL); // must have loaded rstarts
64 	TIndexOffU top = 0;
65 	TIndexOffU bot = _nFrag; // 1 greater than largest addressable element
66 	TIndexOffU elt = OFF_MASK;
67 	// Begin binary search
68 	while(true) {
69 		ASSERT_ONLY(TIndexOffU oldelt = elt);
70 		elt = top + ((bot - top) >> 1);
71 		assert_neq(oldelt, elt); // must have made progress
72 		TIndexOffU lower = rstarts()[elt*3];
73 		TIndexOffU upper;
74 		if(elt == _nFrag-1) {
75 			upper = _eh._len;
76 		} else {
77 			upper = rstarts()[((elt+1)*3)];
78 		}
79 		assert_gt(upper, lower);
80 		TIndexOffU fraglen = upper - lower;
81 		if(lower <= off) {
82 			if(upper > off) { // not last element, but it's within
83 				// off is in this range; check if it falls off
84 				if(off + qlen > upper) {
85 					straddled = true;
86 					if(rejectStraddle) {
87 						// it falls off; signal no-go and return
88 						tidx = OFF_MASK;
89 						assert_lt(elt, _nFrag-1);
90 						return;
91 					}
92 				}
93 				// This is the correct text idx whether the index is
94 				// forward or reverse
95 				tidx = rstarts()[(elt*3)+1];
96 				assert_lt(tidx, this->_nPat);
97 				assert_leq(fraglen, this->plen()[tidx]);
98 				// it doesn't fall off; now calculate textoff.
99 				// Initially it's the number of characters that precede
100 				// the alignment in the fragment
101 				TIndexOffU fragoff = off - rstarts()[(elt*3)];
102 				if(!this->fw_) {
103 					fragoff = fraglen - fragoff - 1;
104 					fragoff -= (qlen-1);
105 				}
106 				// Add the alignment's offset into the fragment
107 				// ('fragoff') to the fragment's offset within the text
108 				textoff = fragoff + rstarts()[(elt*3)+2];
109 				assert_lt(textoff, this->plen()[tidx]);
110 				break; // done with binary search
111 			} else {
112 				// 'off' belongs somewhere in the region between elt
113 				// and bot
114 				top = elt;
115 			}
116 		} else {
117 			// 'off' belongs somewhere in the region between top and
118 			// elt
119 			bot = elt;
120 		}
121 		// continue with binary search
122 	}
123 	tlen = this->plen()[tidx];
124 }
125 
126 /**
127  * Walk 'steps' steps to the left and return the row arrived at.  If we
128  * walk through the dollar sign, return max value.
129  */
walkLeft(TIndexOffU row,TIndexOffU steps) const130 TIndexOffU Ebwt::walkLeft(TIndexOffU row, TIndexOffU steps) const {
131 	assert(offs() != NULL);
132 	assert_neq(OFF_MASK, row);
133 	SideLocus l;
134 	if(steps > 0) l.initFromRow(row, _eh, ebwt());
135 	while(steps > 0) {
136 		if(row == _zOff) return OFF_MASK;
137 		TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
138 		assert_neq(OFF_MASK, newrow);
139 		assert_neq(newrow, row);
140 		row = newrow;
141 		steps--;
142 		if(steps > 0) l.initFromRow(row, _eh, ebwt());
143 	}
144 	return row;
145 }
146 
147 /**
148  * Resolve the reference offset of the BW element 'elt'.
149  */
getOffset(TIndexOffU row) const150 TIndexOffU Ebwt::getOffset(TIndexOffU row) const {
151 	assert(offs() != NULL);
152 	assert_neq(OFF_MASK, row);
153 	if(row == _zOff) return 0;
154 	if((row & _eh._offMask) == row) return this->offs()[row >> _eh._offRate];
155 	TIndexOffU jumps = 0;
156 	SideLocus l;
157 	l.initFromRow(row, _eh, ebwt());
158 	while(true) {
159 		TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
160 		jumps++;
161 		assert_neq(OFF_MASK, newrow);
162 		assert_neq(newrow, row);
163 		row = newrow;
164 		if(row == _zOff) {
165 			return jumps;
166 		} else if((row & _eh._offMask) == row) {
167 			return jumps + this->offs()[row >> _eh._offRate];
168 		}
169 		l.initFromRow(row, _eh, ebwt());
170 	}
171 }
172 
173 /**
174  * Resolve the reference offset of the BW element 'elt' such that
175  * the offset returned is at the right-hand side of the forward
176  * reference substring involved in the hit.
177  */
getOffset(TIndexOffU elt,bool fw,TIndexOffU hitlen) const178 TIndexOffU Ebwt::getOffset(
179 	TIndexOffU elt,
180 	bool fw,
181 	TIndexOffU hitlen) const
182 {
183 	TIndexOffU off = getOffset(elt);
184 	assert_neq(OFF_MASK, off);
185 	if(!fw) {
186 		assert_lt(off, _eh._len);
187 		off = _eh._len - off - 1;
188 		assert_geq(off, hitlen-1);
189 		off -= (hitlen-1);
190 		assert_lt(off, _eh._len);
191 	}
192 	return off;
193 }
194 
195 /**
196  * Returns true iff the index contains the given string (exactly).  The given
197  * string must contain only unambiguous characters.  TODO: support ambiguous
198  * characters in 'str'.
199  */
contains(const BTDnaString & str,TIndexOffU * otop,TIndexOffU * obot) const200 bool Ebwt::contains(
201 	const BTDnaString& str,
202 	TIndexOffU *otop,
203 	TIndexOffU *obot) const
204 {
205 	assert(isInMemory());
206 	SideLocus tloc, bloc;
207 	if(str.empty()) {
208 		if(otop != NULL && obot != NULL) *otop = *obot = 0;
209 		return true;
210 	}
211 	int c = str[str.length()-1];
212 	assert_range(0, 4, c);
213 	TIndexOffU top = 0, bot = 0;
214 	if(c < 4) {
215 		top = fchr()[c];
216 		bot = fchr()[c+1];
217 	} else {
218 		bool set = false;
219 		for(int i = 0; i < 4; i++) {
220 			if(fchr()[c] < fchr()[c+1]) {
221 				if(set) {
222 					return false;
223 				} else {
224 					set = true;
225 					top = fchr()[c];
226 					bot = fchr()[c+1];
227 				}
228 			}
229 		}
230 	}
231 	assert_geq(bot, top);
232 	tloc.initFromRow(top, eh(), ebwt());
233 	bloc.initFromRow(bot, eh(), ebwt());
234 	ASSERT_ONLY(TIndexOffU lastDiff = bot - top);
235 	for(TIndexOff i = (TIndexOff)str.length()-2; i >= 0; i--) {
236 		c = str[i];
237 		assert_range(0, 4, c);
238 		if(c <= 3) {
239 			top = mapLF(tloc, c);
240 			bot = mapLF(bloc, c);
241 		} else {
242 			TIndexOffU sz = bot - top;
243 			int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
244 			bot = mapLF(bloc, c1);
245 			assert_leq(bot - top, sz);
246 			if(bot - top < sz) {
247 				// Encountered an N and could not proceed through it because
248 				// there was more than one possible nucleotide we could replace
249 				// it with
250 				return false;
251 			}
252 		}
253 		assert_geq(bot, top);
254 		assert_leq(bot-top, lastDiff);
255 		ASSERT_ONLY(lastDiff = bot-top);
256 		if(i > 0) {
257 			tloc.initFromRow(top, eh(), ebwt());
258 			bloc.initFromRow(bot, eh(), ebwt());
259 		}
260 	}
261 	if(otop != NULL && obot != NULL) {
262 		*otop = top; *obot = bot;
263 	}
264 	return bot > top;
265 }
266 
267 /**
268  * Try to find the Bowtie index specified by the user.  First try the
269  * exact path given by the user.  Then try the user-provided string
270  * appended onto the path of the "indexes" subdirectory below this
271  * executable, then try the provided string appended onto
272  * "$BOWTIE2_INDEXES/".
273  */
adjustEbwtBase(const string & cmdline,const string & ebwtFileBase,bool verbose=false)274 string adjustEbwtBase(const string& cmdline,
275 					  const string& ebwtFileBase,
276 					  bool verbose = false)
277 {
278 	string str = ebwtFileBase;
279 	ifstream in;
280 	if(verbose) cout << "Trying " << str.c_str() << endl;
281 	in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
282 	if(!in.is_open()) {
283 		if(verbose) cout << "  didn't work" << endl;
284 		in.close();
285 		if(getenv("BOWTIE2_INDEXES") != NULL) {
286 			str = string(getenv("BOWTIE2_INDEXES")) + "/" + ebwtFileBase;
287 			if(verbose) cout << "Trying " << str.c_str() << endl;
288 			in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
289 			if(!in.is_open()) {
290 				if(verbose) cout << "  didn't work" << endl;
291 				in.close();
292 			} else {
293 				if(verbose) cout << "  worked" << endl;
294 			}
295 		}
296 	}
297 	if(!in.is_open()) {
298 		cerr << "Could not locate a Bowtie index corresponding to basename \"" << ebwtFileBase.c_str() << "\"" << endl;
299 		throw 1;
300 	}
301 	return str;
302 }
303