1 #include "PrefilteringIndexReader.h"
2 #include "DBWriter.h"
3 #include "Prefiltering.h"
4 #include "ExtendedSubstitutionMatrix.h"
5 #include "FileUtil.h"
6 #include "IndexBuilder.h"
7 #include "Parameters.h"
8
9 const char* PrefilteringIndexReader::CURRENT_VERSION = "16";
10 unsigned int PrefilteringIndexReader::VERSION = 0;
11 unsigned int PrefilteringIndexReader::META = 1;
12 unsigned int PrefilteringIndexReader::SCOREMATRIXNAME = 2;
13 unsigned int PrefilteringIndexReader::SCOREMATRIX2MER = 3;
14 unsigned int PrefilteringIndexReader::SCOREMATRIX3MER = 4;
15 unsigned int PrefilteringIndexReader::DBR1INDEX = 5;
16 unsigned int PrefilteringIndexReader::DBR1DATA = 6;
17 unsigned int PrefilteringIndexReader::DBR2INDEX = 7;
18 unsigned int PrefilteringIndexReader::DBR2DATA = 8;
19 unsigned int PrefilteringIndexReader::ENTRIES = 9;
20 unsigned int PrefilteringIndexReader::ENTRIESOFFSETS = 10;
21 unsigned int PrefilteringIndexReader::ENTRIESGRIDSIZE = 11;
22 unsigned int PrefilteringIndexReader::ENTRIESNUM = 12;
23 unsigned int PrefilteringIndexReader::SEQCOUNT = 13;
24 unsigned int PrefilteringIndexReader::SEQINDEXDATA = 14;
25 unsigned int PrefilteringIndexReader::SEQINDEXDATASIZE = 15;
26 unsigned int PrefilteringIndexReader::SEQINDEXSEQOFFSET = 16;
27 unsigned int PrefilteringIndexReader::HDR1INDEX = 18;
28 unsigned int PrefilteringIndexReader::HDR1DATA = 19;
29 unsigned int PrefilteringIndexReader::HDR2INDEX = 20;
30 unsigned int PrefilteringIndexReader::HDR2DATA = 21;
31 unsigned int PrefilteringIndexReader::GENERATOR = 22;
32 unsigned int PrefilteringIndexReader::SPACEDPATTERN = 23;
33
34 extern const char* version;
35
checkIfIndexFile(DBReader<unsigned int> * reader)36 bool PrefilteringIndexReader::checkIfIndexFile(DBReader<unsigned int>* reader) {
37 char * version = reader->getDataByDBKey(VERSION, 0);
38 if(version == NULL){
39 return false;
40 }
41 return (strncmp(version, CURRENT_VERSION, strlen(CURRENT_VERSION)) == 0 ) ? true : false;
42 }
43
indexName(const std::string & outDB)44 std::string PrefilteringIndexReader::indexName(const std::string &outDB) {
45 std::string result(outDB);
46 result.append(".idx");
47 return result;
48 }
49
createIndexFile(const std::string & outDB,DBReader<unsigned int> * dbr1,DBReader<unsigned int> * dbr2,DBReader<unsigned int> * hdbr1,DBReader<unsigned int> * hdbr2,BaseMatrix * subMat,int maxSeqLen,bool hasSpacedKmer,const std::string & spacedKmerPattern,bool compBiasCorrection,int alphabetSize,int kmerSize,int maskMode,int maskLowerCase,int kmerThr,int splits)50 void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
51 DBReader<unsigned int> *dbr1, DBReader<unsigned int> *dbr2,
52 DBReader<unsigned int> *hdbr1, DBReader<unsigned int> *hdbr2,
53 BaseMatrix *subMat, int maxSeqLen,
54 bool hasSpacedKmer, const std::string &spacedKmerPattern,
55 bool compBiasCorrection, int alphabetSize, int kmerSize,
56 int maskMode, int maskLowerCase, int kmerThr, int splits) {
57
58 const int SPLIT_META = splits > 1 ? 0 : 0;
59 const int SPLIT_SEQS = splits > 1 ? 1 : 0;
60 const int SPLIT_INDX = splits > 1 ? 2 : 0;
61
62 DBWriter writer(outDB.c_str(), std::string(outDB).append(".index").c_str(), splits > 1 ? splits + 2 : 1, Parameters::WRITER_ASCII_MODE, Parameters::DBTYPE_INDEX_DB);
63 writer.open();
64
65 Debug(Debug::INFO) << "Write VERSION (" << VERSION << ")\n";
66 writer.writeData((char *) CURRENT_VERSION, strlen(CURRENT_VERSION) * sizeof(char), VERSION, SPLIT_META);
67 writer.alignToPageSize(SPLIT_META);
68
69 Debug(Debug::INFO) << "Write META (" << META << ")\n";
70 const int biasCorr = compBiasCorrection ? 1 : 0;
71 const int mask = maskMode > 0;
72 const int spacedKmer = (hasSpacedKmer) ? 1 : 0;
73 const int headers1 = (hdbr1 != NULL) ? 1 : 0;
74 const int headers2 = (hdbr2 != NULL) ? 1 : 0;
75 const int seqType = dbr1->getDbtype();
76 const int srcSeqType = (dbr2 !=NULL) ? dbr2->getDbtype() : seqType;
77 int metadata[] = {maxSeqLen, kmerSize, biasCorr, alphabetSize, mask, spacedKmer, kmerThr, seqType, srcSeqType, headers1, headers2, splits};
78 char *metadataptr = (char *) &metadata;
79 writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META);
80 writer.alignToPageSize(SPLIT_META);
81
82 if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false &&
83 Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ) == false) {
84 int alphabetSize = subMat->alphabetSize;
85 subMat->alphabetSize = subMat->alphabetSize-1;
86 ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
87 ScoreMatrix s2 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 2);
88 subMat->alphabetSize = alphabetSize;
89
90 char* serialized3mer = ScoreMatrix::serialize(s3);
91 Debug(Debug::INFO) << "Write SCOREMATRIX3MER (" << SCOREMATRIX3MER << ")\n";
92 writer.writeData(serialized3mer, ScoreMatrix::size(s3), SCOREMATRIX3MER, SPLIT_META);
93 writer.alignToPageSize(SPLIT_META);
94 ExtendedSubstitutionMatrix::freeScoreMatrix(s3);
95 free(serialized3mer);
96
97 char* serialized2mer = ScoreMatrix::serialize(s2);
98 Debug(Debug::INFO) << "Write SCOREMATRIX2MER (" << SCOREMATRIX2MER << ")\n";
99 writer.writeData(serialized2mer, ScoreMatrix::size(s2), SCOREMATRIX2MER, SPLIT_META);
100 writer.alignToPageSize(SPLIT_META);
101 ExtendedSubstitutionMatrix::freeScoreMatrix(s2);
102 free(serialized2mer);
103 }
104
105 Debug(Debug::INFO) << "Write SCOREMATRIXNAME (" << SCOREMATRIXNAME << ")\n";
106 char* subData = BaseMatrix::serialize(subMat->matrixName, subMat->matrixData);
107 writer.writeData(subData, BaseMatrix::memorySize(subMat->matrixName, subMat->matrixData), SCOREMATRIXNAME, SPLIT_META);
108 writer.alignToPageSize(SPLIT_META);
109 free(subData);
110
111 if (spacedKmerPattern.empty() != false) {
112 Debug(Debug::INFO) << "Write SPACEDPATTERN (" << SPACEDPATTERN << ")\n";
113 writer.writeData(spacedKmerPattern.c_str(), spacedKmerPattern.length(), SPACEDPATTERN, SPLIT_META);
114 writer.alignToPageSize(SPLIT_META);
115 }
116
117 Debug(Debug::INFO) << "Write GENERATOR (" << GENERATOR << ")\n";
118 writer.writeData(version, strlen(version), GENERATOR, SPLIT_META);
119 writer.alignToPageSize(SPLIT_META);
120
121 Debug(Debug::INFO) << "Write DBR1INDEX (" << DBR1INDEX << ")\n";
122 char* data = DBReader<unsigned int>::serialize(*dbr1);
123 size_t offsetIndex = writer.getOffset(SPLIT_SEQS);
124 writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*dbr1), DBR1INDEX, SPLIT_SEQS);
125 writer.alignToPageSize(SPLIT_SEQS);
126
127 Debug(Debug::INFO) << "Write DBR1DATA (" << DBR1DATA << ")\n";
128 size_t offsetData = writer.getOffset(SPLIT_SEQS);
129 writer.writeStart(SPLIT_SEQS);
130 for(size_t fileIdx = 0; fileIdx < dbr1->getDataFileCnt(); fileIdx++) {
131 writer.writeAdd(dbr1->getDataForFile(fileIdx), dbr1->getDataSizeForFile(fileIdx), SPLIT_SEQS);
132 }
133 writer.writeEnd(DBR1DATA, SPLIT_SEQS);
134 writer.alignToPageSize(SPLIT_SEQS);
135 free(data);
136
137 if (dbr2 == NULL) {
138 writer.writeIndexEntry(DBR2INDEX, offsetIndex, DBReader<unsigned int>::indexMemorySize(*dbr1)+1, SPLIT_SEQS);
139 writer.writeIndexEntry(DBR2DATA, offsetData, dbr1->getTotalDataSize()+1, SPLIT_SEQS);
140 } else {
141 Debug(Debug::INFO) << "Write DBR2INDEX (" << DBR2INDEX << ")\n";
142 data = DBReader<unsigned int>::serialize(*dbr2);
143 writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*dbr2), DBR2INDEX, SPLIT_SEQS);
144 writer.alignToPageSize(SPLIT_SEQS);
145 Debug(Debug::INFO) << "Write DBR2DATA (" << DBR2DATA << ")\n";
146 writer.writeStart(SPLIT_SEQS);
147 for(size_t fileIdx = 0; fileIdx < dbr2->getDataFileCnt(); fileIdx++) {
148 writer.writeAdd(dbr2->getDataForFile(fileIdx), dbr2->getDataSizeForFile(fileIdx), SPLIT_SEQS);
149 }
150 writer.writeEnd(DBR2DATA, SPLIT_SEQS);
151 writer.alignToPageSize(SPLIT_SEQS);
152 free(data);
153 }
154
155 if (hdbr1 != NULL) {
156 Debug(Debug::INFO) << "Write HDR1INDEX (" << HDR1INDEX << ")\n";
157 data = DBReader<unsigned int>::serialize(*hdbr1);
158 size_t offsetIndex = writer.getOffset(SPLIT_SEQS);
159 writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*hdbr1), HDR1INDEX, SPLIT_SEQS);
160 writer.alignToPageSize(SPLIT_SEQS);
161
162 Debug(Debug::INFO) << "Write HDR1DATA (" << HDR1DATA << ")\n";
163 size_t offsetData = writer.getOffset(SPLIT_SEQS);
164 writer.writeStart(SPLIT_SEQS);
165 for(size_t fileIdx = 0; fileIdx < hdbr1->getDataFileCnt(); fileIdx++) {
166 writer.writeAdd(hdbr1->getDataForFile(fileIdx), hdbr1->getDataSizeForFile(fileIdx), SPLIT_SEQS);
167 }
168 writer.writeEnd(HDR1DATA, SPLIT_SEQS);
169 writer.alignToPageSize(SPLIT_SEQS);
170 free(data);
171 if (hdbr2 == NULL) {
172 writer.writeIndexEntry(HDR2INDEX, offsetIndex, DBReader<unsigned int>::indexMemorySize(*hdbr1)+1, SPLIT_SEQS);
173 writer.writeIndexEntry(HDR2DATA, offsetData, hdbr1->getTotalDataSize()+1, SPLIT_SEQS);
174 }
175 }
176 if (hdbr2 != NULL) {
177 Debug(Debug::INFO) << "Write HDR2INDEX (" << HDR2INDEX << ")\n";
178 data = DBReader<unsigned int>::serialize(*hdbr2);
179 writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*hdbr2), HDR2INDEX, SPLIT_SEQS);
180 writer.alignToPageSize(SPLIT_SEQS);
181 Debug(Debug::INFO) << "Write HDR2DATA (" << HDR2DATA << ")\n";
182 writer.writeStart(SPLIT_SEQS);
183 for(size_t fileIdx = 0; fileIdx < hdbr2->getDataFileCnt(); fileIdx++) {
184 writer.writeAdd(hdbr2->getDataForFile(fileIdx), hdbr2->getDataSizeForFile(fileIdx), SPLIT_SEQS);
185 }
186 writer.writeEnd(HDR2DATA, SPLIT_SEQS);
187 writer.alignToPageSize(SPLIT_SEQS);
188 free(data);
189 }
190
191 Sequence seq(maxSeqLen, seqType, subMat, kmerSize, hasSpacedKmer, compBiasCorrection, true, spacedKmerPattern);
192 // remove x (not needed in index)
193 const int adjustAlphabetSize =
194 (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
195 ? alphabetSize -1: alphabetSize;
196
197 for (int s = 0; s < splits; s++) {
198 size_t dbFrom = 0;
199 size_t dbSize = 0;
200 dbr1->decomposeDomainByAminoAcid(s, splits, &dbFrom, &dbSize);
201 if (dbSize == 0) {
202 continue;
203 }
204
205 IndexTable indexTable(adjustAlphabetSize, kmerSize, false);
206 SequenceLookup *sequenceLookup = NULL;
207 IndexBuilder::fillDatabase(&indexTable,
208 (maskMode == 1 || maskLowerCase == 1) ? &sequenceLookup : NULL,
209 (maskMode == 0 ) ? &sequenceLookup : NULL,
210 *subMat, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, maskMode, maskLowerCase);
211 indexTable.printStatistics(subMat->num2aa);
212
213 if (sequenceLookup == NULL) {
214 Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
215 EXIT(EXIT_FAILURE);
216 }
217
218 // save the entries
219 unsigned int keyOffset = 1000 * s;
220 Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
221 char *entries = (char *) indexTable.getEntries();
222 size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry();
223 writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
224 writer.alignToPageSize(SPLIT_INDX + s);
225
226 // save the size
227 Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
228 char *offsets = (char*)indexTable.getOffsets();
229 size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t);
230 writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
231 writer.alignToPageSize(SPLIT_INDX + s);
232 indexTable.deleteEntries();
233
234 Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
235 int64_t seqindexDataSize = sequenceLookup->getDataSize();
236 char *seqindexDataSizePtr = (char *) &seqindexDataSize;
237 writer.writeData(seqindexDataSizePtr, 1 * sizeof(int64_t), (keyOffset + SEQINDEXDATASIZE), SPLIT_INDX + s);
238 writer.alignToPageSize(SPLIT_INDX + s);
239
240 size_t *sequenceOffsets = sequenceLookup->getOffsets();
241 size_t sequenceCount = sequenceLookup->getSequenceCount();
242 Debug(Debug::INFO) << "Write SEQINDEXSEQOFFSET (" << (keyOffset + SEQINDEXSEQOFFSET) << ")\n";
243 writer.writeData((char *) sequenceOffsets, (sequenceCount + 1) * sizeof(size_t), (keyOffset + SEQINDEXSEQOFFSET), SPLIT_INDX + s);
244 writer.alignToPageSize(SPLIT_INDX + s);
245
246 Debug(Debug::INFO) << "Write SEQINDEXDATA (" << (keyOffset + SEQINDEXDATA) << ")\n";
247 writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
248 writer.alignToPageSize(SPLIT_INDX + s);
249 delete sequenceLookup;
250
251 // ENTRIESNUM
252 Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
253 uint64_t entriesNum = indexTable.getTableEntriesNum();
254 char *entriesNumPtr = (char *) &entriesNum;
255 writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
256 writer.alignToPageSize(SPLIT_INDX + s);
257
258 // SEQCOUNT
259 Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
260 size_t tablesize = indexTable.getSize();
261 char *tablesizePtr = (char *) &tablesize;
262 writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
263 writer.alignToPageSize(SPLIT_INDX + s);
264 }
265
266 writer.close(false);
267 }
268
openNewHeaderReader(DBReader<unsigned int> * dbr,unsigned int dataIdx,unsigned int indexIdx,int threads,bool touchIndex,bool touchData)269 DBReader<unsigned int> *PrefilteringIndexReader::openNewHeaderReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData) {
270 size_t indexId = dbr->getId(indexIdx);
271 char *indexData = dbr->getData(indexId, 0);
272 if (touchIndex) {
273 dbr->touchData(indexId);
274 }
275
276 size_t dataId = dbr->getId(dataIdx);
277 char *data = dbr->getData(dataId, 0);
278
279 size_t currDataOffset = dbr->getOffset(dataId);
280 size_t nextDataOffset = dbr->findNextOffsetid(dataId);
281 size_t dataSize = nextDataOffset-currDataOffset;
282
283 if (touchData) {
284 dbr->touchData(dataId);
285 }
286
287 DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(indexData, threads);
288 reader->open(DBReader<unsigned int>::NOSORT);
289 reader->setData(data, dataSize);
290 reader->setMode(DBReader<unsigned int>::USE_DATA);
291 return reader;
292 }
293
openNewReader(DBReader<unsigned int> * dbr,unsigned int dataIdx,unsigned int indexIdx,bool includeData,int threads,bool touchIndex,bool touchData)294 DBReader<unsigned int> *PrefilteringIndexReader::openNewReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, bool includeData, int threads, bool touchIndex, bool touchData) {
295 size_t id = dbr->getId(indexIdx);
296 char *data = dbr->getDataUncompressed(id);
297 if (touchIndex) {
298 dbr->touchData(id);
299 }
300
301 if (includeData) {
302 id = dbr->getId(dataIdx);
303 if (id == UINT_MAX) {
304 return NULL;
305 }
306 if (touchData) {
307 dbr->touchData(id);
308 }
309
310 DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(data, threads);
311 reader->open(DBReader<unsigned int>::NOSORT);
312 size_t currDataOffset = dbr->getOffset(id);
313 size_t nextDataOffset = dbr->findNextOffsetid(id);
314 size_t dataSize = nextDataOffset-currDataOffset;
315 reader->setData(dbr->getDataUncompressed(id), dataSize);
316 reader->setMode(DBReader<unsigned int>::USE_DATA);
317 return reader;
318 }
319
320 DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(data, threads);
321 reader->open(DBReader<unsigned int>::NOSORT);
322 return reader;
323 }
324
getSequenceLookup(unsigned int split,DBReader<unsigned int> * dbr,int preloadMode)325 SequenceLookup *PrefilteringIndexReader::getSequenceLookup(unsigned int split, DBReader<unsigned int> *dbr, int preloadMode) {
326 PrefilteringIndexData data = getMetadata(dbr);
327 if (split >= (unsigned int)data.splits) {
328 Debug(Debug::ERROR) << "Invalid split " << split << " out of " << data.splits << " chosen.\n";
329 EXIT(EXIT_FAILURE);
330 }
331
332 unsigned int splitOffset = split * 1000;
333
334 size_t id = dbr->getId(splitOffset + SEQINDEXDATA);
335 if (id == UINT_MAX) {
336 return NULL;
337 }
338
339 char * seqData = dbr->getDataUncompressed(id);
340
341 size_t seqOffsetsId = dbr->getId(splitOffset + SEQINDEXSEQOFFSET);
342 char * seqOffsetsData = dbr->getDataUncompressed(seqOffsetsId);
343
344 size_t seqDataSizeId = dbr->getId(splitOffset + SEQINDEXDATASIZE);
345 int64_t seqDataSize = *((int64_t *)dbr->getDataUncompressed(seqDataSizeId));
346
347 size_t sequenceCountId = dbr->getId(splitOffset + SEQCOUNT);
348 size_t sequenceCount = *((size_t *)dbr->getDataUncompressed(sequenceCountId));
349
350 if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
351 SequenceLookup *sequenceLookup = new SequenceLookup(sequenceCount, seqDataSize);
352 sequenceLookup->initLookupByExternalDataCopy(seqData, (size_t *) seqOffsetsData);
353 return sequenceLookup;
354 }
355
356 if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
357 dbr->touchData(id);
358 dbr->touchData(seqOffsetsId);
359 }
360
361 SequenceLookup *sequenceLookup = new SequenceLookup(sequenceCount);
362 sequenceLookup->initLookupByExternalData(seqData, seqDataSize, (size_t *) seqOffsetsData);
363 return sequenceLookup;
364 }
365
getIndexTable(unsigned int split,DBReader<unsigned int> * dbr,int preloadMode)366 IndexTable *PrefilteringIndexReader::getIndexTable(unsigned int split, DBReader<unsigned int> *dbr, int preloadMode) {
367 PrefilteringIndexData data = getMetadata(dbr);
368 if (split >= (unsigned int)data.splits) {
369 Debug(Debug::ERROR) << "Invalid split " << split << " out of " << data.splits << " chosen.\n";
370 EXIT(EXIT_FAILURE);
371 }
372
373 unsigned int splitOffset = split * 1000;
374 size_t entriesNumId = dbr->getId(splitOffset + ENTRIESNUM);
375 int64_t entriesNum = *((int64_t *)dbr->getDataUncompressed(entriesNumId));
376 size_t sequenceCountId = dbr->getId(splitOffset +SEQCOUNT);
377 size_t sequenceCount = *((size_t *)dbr->getDataUncompressed(sequenceCountId));
378
379 size_t entriesDataId = dbr->getId(splitOffset + ENTRIES);
380 char *entriesData = dbr->getDataUncompressed(entriesDataId);
381
382 size_t entriesOffsetsDataId = dbr->getId(splitOffset + ENTRIESOFFSETS);
383 char *entriesOffsetsData = dbr->getDataUncompressed(entriesOffsetsDataId);
384
385 int adjustAlphabetSize;
386 if (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_AMINO_ACIDS)) {
387 adjustAlphabetSize = data.alphabetSize - 1;
388 } else {
389 adjustAlphabetSize = data.alphabetSize;
390 }
391
392 if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
393 IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, false);
394 table->initTableByExternalDataCopy(sequenceCount, entriesNum, (IndexEntryLocal*) entriesData, (size_t *)entriesOffsetsData);
395 return table;
396 }
397
398 if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
399 dbr->touchData(entriesNumId);
400 dbr->touchData(sequenceCountId);
401 dbr->touchData(entriesDataId);
402 dbr->touchData(entriesOffsetsDataId);
403 }
404
405 IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, true);
406 table->initTableByExternalData(sequenceCount, entriesNum, (IndexEntryLocal*) entriesData, (size_t *)entriesOffsetsData);
407 return table;
408 }
409
printSummary(DBReader<unsigned int> * dbr)410 void PrefilteringIndexReader::printSummary(DBReader<unsigned int> *dbr) {
411 Debug(Debug::INFO) << "Index version: " << dbr->getDataByDBKey(VERSION, 0) << "\n";
412
413 size_t id;
414 if ((id = dbr->getId(GENERATOR)) != UINT_MAX) {
415 Debug(Debug::INFO)
416 << "Generated by: " << dbr->getDataUncompressed(id) << "\n";
417 }
418
419 char * subMatData = dbr->getDataByDBKey(SCOREMATRIXNAME, 0);
420 size_t pos = 0;
421 while(subMatData[pos]!='\0') {
422 if (subMatData[pos] == '.'
423 && subMatData[pos + 1] == 'o'
424 && subMatData[pos + 2] == 'u'
425 && subMatData[pos + 3] == 't'
426 && subMatData[pos + 4] == ':') {
427 break;
428 }
429 pos++;
430 }
431 Debug(Debug::INFO) << "ScoreMatrix: " << std::string(subMatData, pos+4) << "\n";
432 }
433
printMeta(int * metadata_tmp)434 void PrefilteringIndexReader::printMeta(int *metadata_tmp) {
435 Debug(Debug::INFO) << "MaxSeqLength: " << metadata_tmp[0] << "\n";
436 Debug(Debug::INFO) << "KmerSize: " << metadata_tmp[1] << "\n";
437 Debug(Debug::INFO) << "CompBiasCorr: " << metadata_tmp[2] << "\n";
438 Debug(Debug::INFO) << "AlphabetSize: " << metadata_tmp[3] << "\n";
439 Debug(Debug::INFO) << "Masked: " << metadata_tmp[4] << "\n";
440 Debug(Debug::INFO) << "Spaced: " << metadata_tmp[5] << "\n";
441 Debug(Debug::INFO) << "KmerScore: " << metadata_tmp[6] << "\n";
442 Debug(Debug::INFO) << "SequenceType: " << Parameters::getDbTypeName(metadata_tmp[7]) << "\n";
443 Debug(Debug::INFO) << "SourcSeqType: " << Parameters::getDbTypeName(metadata_tmp[8]) << "\n";
444 Debug(Debug::INFO) << "Headers1: " << metadata_tmp[9] << "\n";
445 Debug(Debug::INFO) << "Headers2: " << metadata_tmp[10] << "\n";
446 // Keep compatible to index version 15
447 Debug(Debug::INFO) << "Splits: " << (metadata_tmp[11] == 0 ? 1 : metadata_tmp[11]) << "\n";
448 }
449
getMetadata(DBReader<unsigned int> * dbr)450 PrefilteringIndexData PrefilteringIndexReader::getMetadata(DBReader<unsigned int> *dbr) {
451 int *meta = (int *)dbr->getDataByDBKey(META, 0);
452
453 PrefilteringIndexData data;
454 data.maxSeqLength = meta[0];
455 data.kmerSize = meta[1];
456 data.compBiasCorr = meta[2];
457 data.alphabetSize = meta[3];
458 data.mask = meta[4];
459 data.spacedKmer = meta[5];
460 data.kmerThr = meta[6];
461 data.seqType = meta[7];
462 data.srcSeqType = meta[8];
463 data.headers1 = meta[9];
464 data.headers2 = meta[10];
465 // Keep compatible to index version 15, where meta[11] would have been zero due to the alignment padding
466 data.splits = meta[11] == 0 ? 1 : meta[11];
467
468 return data;
469 }
470
getSubstitutionMatrixName(DBReader<unsigned int> * dbr)471 std::string PrefilteringIndexReader::getSubstitutionMatrixName(DBReader<unsigned int> *dbr) {
472 unsigned int key = dbr->getDbKey(SCOREMATRIXNAME);
473 if (key == UINT_MAX) {
474 return "";
475 }
476 const char *data = dbr->getData(key, 0);
477 size_t len = dbr->getEntryLen(key) - 1;
478 std::string matrixName;
479 bool found = false;
480 for (size_t pos = 0; pos < std::max(len, (size_t)4) - 4 && found == false; pos++) {
481 if (data[pos] == '.'
482 && data[pos + 1] == 'o'
483 && data[pos + 2] == 'u'
484 && data[pos + 3] == 't'
485 && data[pos + 4] == ':') {
486 matrixName = std::string(data, pos + 4);
487 found = true;
488 }
489 }
490 if (found == false) {
491 matrixName = std::string(data);
492 }
493 return matrixName;
494 }
495
getSubstitutionMatrix(DBReader<unsigned int> * dbr)496 std::string PrefilteringIndexReader::getSubstitutionMatrix(DBReader<unsigned int> *dbr) {
497 return std::string(dbr->getDataByDBKey(SCOREMATRIXNAME, 0));
498 }
499
getSpacedPattern(DBReader<unsigned int> * dbr)500 std::string PrefilteringIndexReader::getSpacedPattern(DBReader<unsigned int> *dbr) {
501 size_t id = dbr->getId(SPACEDPATTERN);
502 if (id == UINT_MAX) {
503 return "";
504 }
505 return std::string(dbr->getDataUncompressed(id));
506 }
507
get2MerScoreMatrix(DBReader<unsigned int> * dbr,int preloadMode)508 ScoreMatrix PrefilteringIndexReader::get2MerScoreMatrix(DBReader<unsigned int> *dbr, int preloadMode) {
509 size_t id = dbr->getId(SCOREMATRIX2MER);
510 if (id == UINT_MAX) {
511 return ScoreMatrix();
512 }
513
514 // read alphabetsize to remove x (not needed in index)
515 PrefilteringIndexData meta = getMetadata(dbr);
516
517 char *data = dbr->getDataUncompressed(id);
518 if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
519 return ScoreMatrix::unserializeCopy(data, meta.alphabetSize-1, 2);
520 }
521
522 if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
523 dbr->touchData(id);
524 }
525 return ScoreMatrix::unserialize(data, meta.alphabetSize-1, 2);
526 }
527
get3MerScoreMatrix(DBReader<unsigned int> * dbr,int preloadMode)528 ScoreMatrix PrefilteringIndexReader::get3MerScoreMatrix(DBReader<unsigned int> *dbr, int preloadMode) {
529 size_t id = dbr->getId(SCOREMATRIX3MER);
530 if (id == UINT_MAX) {
531 return ScoreMatrix();
532 }
533
534 // read alphabetsize to remove x (not needed in index)
535 PrefilteringIndexData meta = getMetadata(dbr);
536
537 char *data = dbr->getDataUncompressed(id);
538 if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
539 return ScoreMatrix::unserializeCopy(data, meta.alphabetSize-1, 3);
540 }
541
542 if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
543 dbr->touchData(id);
544 }
545 return ScoreMatrix::unserialize(data, meta.alphabetSize-1, 3);
546 }
547
searchForIndex(const std::string & pathToDB)548 std::string PrefilteringIndexReader::searchForIndex(const std::string &pathToDB) {
549 std::string outIndexName = pathToDB + ".idx";
550 if (FileUtil::fileExists((outIndexName + ".dbtype").c_str()) == true) {
551 return outIndexName;
552 }
553 return "";
554 }
555
dbPathWithoutIndex(std::string & dbname)556 std::string PrefilteringIndexReader::dbPathWithoutIndex(std::string & dbname) {
557 std::string rawname = dbname;
558 // check for .idx
559 size_t idxlastpos = dbname.rfind(".idx");
560 if(idxlastpos != std::string::npos && dbname.size() - idxlastpos == 4){
561 rawname = dbname.substr(0, idxlastpos);
562 }
563 // check for .linidx
564 size_t linidxlastpos = dbname.rfind(".linidx");
565 if(linidxlastpos != std::string::npos && dbname.size() - linidxlastpos == 7){
566 rawname = dbname.substr(0, linidxlastpos);
567 }
568 return rawname;
569 }
570
571
572