1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <string>
21 #include <stdexcept>
22 #include <iostream>
23 #include <fstream>
24 #include <stdlib.h>
25 #include "bt2_idx.h"
26
27 using namespace std;
28
29 #ifdef BOWTIE_64BIT_INDEX
30
31 const std::string gEbwt_ext("bt2l");
32
33 #else
34
35 const std::string gEbwt_ext("bt2");
36
37 #endif // BOWTIE_64BIT_INDEX
38
39 string gLastIOErrMsg;
40
41 ///////////////////////////////////////////////////////////////////////
42 //
43 // Functions for searching Ebwts
44 // (But most of them are defined in the header)
45 //
46 ///////////////////////////////////////////////////////////////////////
47
48 /**
49 * Take an offset into the joined text and translate it into the
50 * reference of the index it falls on, the offset into the reference,
51 * and the length of the reference. Use a binary search through the
52 * sorted list of reference fragment ranges t
53 */
joinedToTextOff(TIndexOffU qlen,TIndexOffU off,TIndexOffU & tidx,TIndexOffU & textoff,TIndexOffU & tlen,bool rejectStraddle,bool & straddled) const54 void Ebwt::joinedToTextOff(
55 TIndexOffU qlen,
56 TIndexOffU off,
57 TIndexOffU& tidx,
58 TIndexOffU& textoff,
59 TIndexOffU& tlen,
60 bool rejectStraddle,
61 bool& straddled) const
62 {
63 assert(rstarts() != NULL); // must have loaded rstarts
64 TIndexOffU top = 0;
65 TIndexOffU bot = _nFrag; // 1 greater than largest addressable element
66 TIndexOffU elt = OFF_MASK;
67 // Begin binary search
68 while(true) {
69 ASSERT_ONLY(TIndexOffU oldelt = elt);
70 elt = top + ((bot - top) >> 1);
71 assert_neq(oldelt, elt); // must have made progress
72 TIndexOffU lower = rstarts()[elt*3];
73 TIndexOffU upper;
74 if(elt == _nFrag-1) {
75 upper = _eh._len;
76 } else {
77 upper = rstarts()[((elt+1)*3)];
78 }
79 assert_gt(upper, lower);
80 TIndexOffU fraglen = upper - lower;
81 if(lower <= off) {
82 if(upper > off) { // not last element, but it's within
83 // off is in this range; check if it falls off
84 if(off + qlen > upper) {
85 straddled = true;
86 if(rejectStraddle) {
87 // it falls off; signal no-go and return
88 tidx = OFF_MASK;
89 assert_lt(elt, _nFrag-1);
90 return;
91 }
92 }
93 // This is the correct text idx whether the index is
94 // forward or reverse
95 tidx = rstarts()[(elt*3)+1];
96 assert_lt(tidx, this->_nPat);
97 assert_leq(fraglen, this->plen()[tidx]);
98 // it doesn't fall off; now calculate textoff.
99 // Initially it's the number of characters that precede
100 // the alignment in the fragment
101 TIndexOffU fragoff = off - rstarts()[(elt*3)];
102 if(!this->fw_) {
103 fragoff = fraglen - fragoff - 1;
104 fragoff -= (qlen-1);
105 }
106 // Add the alignment's offset into the fragment
107 // ('fragoff') to the fragment's offset within the text
108 textoff = fragoff + rstarts()[(elt*3)+2];
109 assert_lt(textoff, this->plen()[tidx]);
110 break; // done with binary search
111 } else {
112 // 'off' belongs somewhere in the region between elt
113 // and bot
114 top = elt;
115 }
116 } else {
117 // 'off' belongs somewhere in the region between top and
118 // elt
119 bot = elt;
120 }
121 // continue with binary search
122 }
123 tlen = this->plen()[tidx];
124 }
125
126 /**
127 * Walk 'steps' steps to the left and return the row arrived at. If we
128 * walk through the dollar sign, return max value.
129 */
walkLeft(TIndexOffU row,TIndexOffU steps) const130 TIndexOffU Ebwt::walkLeft(TIndexOffU row, TIndexOffU steps) const {
131 assert(offs() != NULL);
132 assert_neq(OFF_MASK, row);
133 SideLocus l;
134 if(steps > 0) l.initFromRow(row, _eh, ebwt());
135 while(steps > 0) {
136 if(row == _zOff) return OFF_MASK;
137 TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
138 assert_neq(OFF_MASK, newrow);
139 assert_neq(newrow, row);
140 row = newrow;
141 steps--;
142 if(steps > 0) l.initFromRow(row, _eh, ebwt());
143 }
144 return row;
145 }
146
147 /**
148 * Resolve the reference offset of the BW element 'elt'.
149 */
getOffset(TIndexOffU row) const150 TIndexOffU Ebwt::getOffset(TIndexOffU row) const {
151 assert(offs() != NULL);
152 assert_neq(OFF_MASK, row);
153 if(row == _zOff) return 0;
154 if((row & _eh._offMask) == row) return this->offs()[row >> _eh._offRate];
155 TIndexOffU jumps = 0;
156 SideLocus l;
157 l.initFromRow(row, _eh, ebwt());
158 while(true) {
159 TIndexOffU newrow = this->mapLF(l ASSERT_ONLY(, false));
160 jumps++;
161 assert_neq(OFF_MASK, newrow);
162 assert_neq(newrow, row);
163 row = newrow;
164 if(row == _zOff) {
165 return jumps;
166 } else if((row & _eh._offMask) == row) {
167 return jumps + this->offs()[row >> _eh._offRate];
168 }
169 l.initFromRow(row, _eh, ebwt());
170 }
171 }
172
173 /**
174 * Resolve the reference offset of the BW element 'elt' such that
175 * the offset returned is at the right-hand side of the forward
176 * reference substring involved in the hit.
177 */
getOffset(TIndexOffU elt,bool fw,TIndexOffU hitlen) const178 TIndexOffU Ebwt::getOffset(
179 TIndexOffU elt,
180 bool fw,
181 TIndexOffU hitlen) const
182 {
183 TIndexOffU off = getOffset(elt);
184 assert_neq(OFF_MASK, off);
185 if(!fw) {
186 assert_lt(off, _eh._len);
187 off = _eh._len - off - 1;
188 assert_geq(off, hitlen-1);
189 off -= (hitlen-1);
190 assert_lt(off, _eh._len);
191 }
192 return off;
193 }
194
195 /**
196 * Returns true iff the index contains the given string (exactly). The given
197 * string must contain only unambiguous characters. TODO: support ambiguous
198 * characters in 'str'.
199 */
contains(const BTDnaString & str,TIndexOffU * otop,TIndexOffU * obot) const200 bool Ebwt::contains(
201 const BTDnaString& str,
202 TIndexOffU *otop,
203 TIndexOffU *obot) const
204 {
205 assert(isInMemory());
206 SideLocus tloc, bloc;
207 if(str.empty()) {
208 if(otop != NULL && obot != NULL) *otop = *obot = 0;
209 return true;
210 }
211 int c = str[str.length()-1];
212 assert_range(0, 4, c);
213 TIndexOffU top = 0, bot = 0;
214 if(c < 4) {
215 top = fchr()[c];
216 bot = fchr()[c+1];
217 } else {
218 bool set = false;
219 for(int i = 0; i < 4; i++) {
220 if(fchr()[c] < fchr()[c+1]) {
221 if(set) {
222 return false;
223 } else {
224 set = true;
225 top = fchr()[c];
226 bot = fchr()[c+1];
227 }
228 }
229 }
230 }
231 assert_geq(bot, top);
232 tloc.initFromRow(top, eh(), ebwt());
233 bloc.initFromRow(bot, eh(), ebwt());
234 ASSERT_ONLY(TIndexOffU lastDiff = bot - top);
235 for(TIndexOff i = (TIndexOff)str.length()-2; i >= 0; i--) {
236 c = str[i];
237 assert_range(0, 4, c);
238 if(c <= 3) {
239 top = mapLF(tloc, c);
240 bot = mapLF(bloc, c);
241 } else {
242 TIndexOffU sz = bot - top;
243 int c1 = mapLF1(top, tloc ASSERT_ONLY(, false));
244 bot = mapLF(bloc, c1);
245 assert_leq(bot - top, sz);
246 if(bot - top < sz) {
247 // Encountered an N and could not proceed through it because
248 // there was more than one possible nucleotide we could replace
249 // it with
250 return false;
251 }
252 }
253 assert_geq(bot, top);
254 assert_leq(bot-top, lastDiff);
255 ASSERT_ONLY(lastDiff = bot-top);
256 if(i > 0) {
257 tloc.initFromRow(top, eh(), ebwt());
258 bloc.initFromRow(bot, eh(), ebwt());
259 }
260 }
261 if(otop != NULL && obot != NULL) {
262 *otop = top; *obot = bot;
263 }
264 return bot > top;
265 }
266
267 /**
268 * Try to find the Bowtie index specified by the user. First try the
269 * exact path given by the user. Then try the user-provided string
270 * appended onto the path of the "indexes" subdirectory below this
271 * executable, then try the provided string appended onto
272 * "$BOWTIE2_INDEXES/".
273 */
adjustEbwtBase(const string & cmdline,const string & ebwtFileBase,bool verbose=false)274 string adjustEbwtBase(const string& cmdline,
275 const string& ebwtFileBase,
276 bool verbose = false)
277 {
278 string str = ebwtFileBase;
279 ifstream in;
280 if(verbose) cout << "Trying " << str.c_str() << endl;
281 in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
282 if(!in.is_open()) {
283 if(verbose) cout << " didn't work" << endl;
284 in.close();
285 if(getenv("BOWTIE2_INDEXES") != NULL) {
286 str = string(getenv("BOWTIE2_INDEXES")) + "/" + ebwtFileBase;
287 if(verbose) cout << "Trying " << str.c_str() << endl;
288 in.open((str + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
289 if(!in.is_open()) {
290 if(verbose) cout << " didn't work" << endl;
291 in.close();
292 } else {
293 if(verbose) cout << " worked" << endl;
294 }
295 }
296 }
297 if(!in.is_open()) {
298 cerr << "Could not locate a Bowtie index corresponding to basename \"" << ebwtFileBase.c_str() << "\"" << endl;
299 throw 1;
300 }
301 return str;
302 }
303