1 /*
2  *  Copyright (C) 2012-2013  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 #include "Tabix.h"
18 #include <stdexcept>
19 #include "StringBasics.h"
20 
Tabix()21 Tabix::Tabix()
22     : IndexBase(),
23       myChromNamesBuffer(NULL)
24 {
25 }
26 
27 
~Tabix()28 Tabix::~Tabix()
29 {
30     if(myChromNamesBuffer != NULL)
31     {
32         delete[] myChromNamesBuffer;
33         myChromNamesBuffer = NULL;
34     }
35 }
36 
37 
38 // Reset the member data for a new index file.
resetIndex()39 void Tabix::resetIndex()
40 {
41     IndexBase::resetIndex();
42     if(myChromNamesBuffer != NULL)
43     {
44         delete[] myChromNamesBuffer;
45         myChromNamesBuffer = NULL;
46     }
47     myChromNamesVector.clear();
48 }
49 
50 
51 // Read & parse the specified index file.
readIndex(const char * filename)52 StatGenStatus::Status Tabix::readIndex(const char* filename)
53 {
54     // Reset the index from anything that may previously be set.
55     resetIndex();
56 
57     IFILE indexFile = ifopen(filename, "rb");
58 
59     // Failed to open the index file.
60     if(indexFile == NULL)
61     {
62         return(StatGenStatus::FAIL_IO);
63     }
64 
65     // read the tabix index structure.
66 
67     // Read the magic string.
68     char magic[4];
69     if(ifread(indexFile, magic, 4) != 4)
70     {
71         // Failed to read the magic
72         return(StatGenStatus::FAIL_IO);
73     }
74 
75     // If this is not an index file, set num references to 0.
76     if (magic[0] != 'T' || magic[1] != 'B' || magic[2] != 'I' || magic[3] != 1)
77     {
78         // Not a Tabix Index file.
79         return(StatGenStatus::FAIL_PARSE);
80     }
81 
82     // It is a tabix index file.
83     // Read the number of reference sequences.
84     if(ifread(indexFile, &n_ref, 4) != 4)
85     {
86         // Failed to read.
87         return(StatGenStatus::FAIL_IO);
88     }
89 
90     // Size the references.
91     myRefs.resize(n_ref);
92 
93     // Read the Format configuration.
94     if(ifread(indexFile, &myFormat, sizeof(myFormat)) != sizeof(myFormat))
95     {
96         // Failed to read.
97         return(StatGenStatus::FAIL_IO);
98     }
99 
100     // Read the length of the chromosome names.
101     uint32_t l_nm;
102 
103     if(ifread(indexFile, &l_nm, sizeof(l_nm)) != sizeof(l_nm))
104     {
105         // Failed to read.
106         return(StatGenStatus::FAIL_IO);
107     }
108 
109     // Read the chromosome names.
110     myChromNamesBuffer = new char[l_nm];
111     if(ifread(indexFile, myChromNamesBuffer, l_nm) != l_nm)
112     {
113         return(StatGenStatus::FAIL_IO);
114     }
115     myChromNamesVector.resize(n_ref);
116 
117     // Parse out the chromosome names.
118     bool prevNull = true;
119     int chromIndex = 0;
120     for(uint32_t i = 0; i < l_nm; i++)
121     {
122         if(chromIndex >= n_ref)
123         {
124             // already set the pointer for the last chromosome name,
125             // so stop looping.
126             break;
127         }
128         if(prevNull == true)
129         {
130             myChromNamesVector[chromIndex++] = myChromNamesBuffer + i;
131             prevNull = false;
132         }
133         if(myChromNamesBuffer[i] == '\0')
134         {
135             prevNull = true;
136         }
137     }
138 
139     for(int refIndex = 0; refIndex < n_ref; refIndex++)
140     {
141         // Read each reference.
142         Reference* ref = &(myRefs[refIndex]);
143 
144         // Read the number of bins.
145         if(ifread(indexFile, &(ref->n_bin), 4) != 4)
146         {
147             // Failed to read the number of bins.
148             // Return failure.
149             return(StatGenStatus::FAIL_PARSE);
150         }
151 
152         // Resize the bins.
153         ref->bins.resize(ref->n_bin + 1);
154 
155         // Read each bin.
156         for(int binIndex = 0; binIndex < ref->n_bin; binIndex++)
157         {
158             uint32_t binNumber;
159 
160             // Read in the bin number.
161             if(ifread(indexFile, &(binNumber), 4) != 4)
162             {
163                 // Failed to read the bin number.
164                 // Return failure.
165                 return(StatGenStatus::FAIL_IO);
166             }
167 
168             // Add the bin to the reference and get the
169             // pointer back so the values can be set in it.
170             Bin* binPtr = &(ref->bins[binIndex]);
171             binPtr->bin = binNumber;
172 
173             // Read in the number of chunks.
174             if(ifread(indexFile, &(binPtr->n_chunk), 4) != 4)
175             {
176                 // Failed to read number of chunks.
177                 // Return failure.
178                 return(StatGenStatus::FAIL_IO);
179             }
180 
181             // Read in the chunks.
182             // Allocate space for the chunks.
183             uint32_t sizeOfChunkList = binPtr->n_chunk * sizeof(Chunk);
184             binPtr->chunks = (Chunk*)malloc(sizeOfChunkList);
185             if(ifread(indexFile, binPtr->chunks, sizeOfChunkList) != sizeOfChunkList)
186             {
187                 // Failed to read the chunks.
188                 // Return failure.
189                 return(StatGenStatus::FAIL_IO);
190             }
191         }
192 
193         // Read the number of intervals.
194         if(ifread(indexFile, &(ref->n_intv), 4) != 4)
195         {
196             // Failed to read, set to 0.
197             ref->n_intv = 0;
198             // Return failure.
199             return(StatGenStatus::FAIL_IO);
200         }
201 
202         // Allocate space for the intervals and read them.
203         uint32_t linearIndexSize = ref->n_intv * sizeof(uint64_t);
204         ref->ioffsets = (uint64_t*)malloc(linearIndexSize);
205         if(ifread(indexFile, ref->ioffsets, linearIndexSize) != linearIndexSize)
206         {
207             // Failed to read the linear index.
208             // Return failure.
209             return(StatGenStatus::FAIL_IO);
210         }
211     }
212 
213     // Successfully read teh bam index file.
214     return(StatGenStatus::SUCCESS);
215 }
216 
217 
getStartPos(const char * refName,int32_t start,uint64_t & fileStartPos) const218 bool Tabix::getStartPos(const char* refName, int32_t start,
219                         uint64_t& fileStartPos) const
220 {
221     // Look for the reference name in the list.
222     int refID = 0;
223     for(refID = 0; refID < n_ref; refID++)
224     {
225         if(strcmp(refName, myChromNamesVector[refID]) == 0)
226         {
227             // found the reference
228             break;
229         }
230     }
231     if(refID >= n_ref)
232     {
233         // Didn't find the refName, so return false.
234         return(false);
235     }
236 
237     // Look up in the linear index.
238     if(start < 0)
239     {
240         // Negative index, so start at 0.
241         start = 0;
242     }
243     return(getMinOffsetFromLinearIndex(refID, start, fileStartPos));
244 }
245 
246 
getRefName(unsigned int indexNum) const247 const char* Tabix::getRefName(unsigned int indexNum) const
248 {
249     if(indexNum >= myChromNamesVector.size())
250     {
251         String message = "ERROR: Out of range on Tabix::getRefName(";
252         message += indexNum;
253         message += ")";
254         throw(std::runtime_error(message.c_str()));
255         return(NULL);
256     }
257     return(myChromNamesVector[indexNum]);
258 }
259