1 #include "PrefilteringIndexReader.h"
2 #include "DBWriter.h"
3 #include "Prefiltering.h"
4 #include "ExtendedSubstitutionMatrix.h"
5 #include "FileUtil.h"
6 #include "IndexBuilder.h"
7 #include "Parameters.h"
8 
9 const char*  PrefilteringIndexReader::CURRENT_VERSION = "16";
10 unsigned int PrefilteringIndexReader::VERSION = 0;
11 unsigned int PrefilteringIndexReader::META = 1;
12 unsigned int PrefilteringIndexReader::SCOREMATRIXNAME = 2;
13 unsigned int PrefilteringIndexReader::SCOREMATRIX2MER = 3;
14 unsigned int PrefilteringIndexReader::SCOREMATRIX3MER = 4;
15 unsigned int PrefilteringIndexReader::DBR1INDEX = 5;
16 unsigned int PrefilteringIndexReader::DBR1DATA  = 6;
17 unsigned int PrefilteringIndexReader::DBR2INDEX = 7;
18 unsigned int PrefilteringIndexReader::DBR2DATA  = 8;
19 unsigned int PrefilteringIndexReader::ENTRIES = 9;
20 unsigned int PrefilteringIndexReader::ENTRIESOFFSETS = 10;
21 unsigned int PrefilteringIndexReader::ENTRIESGRIDSIZE = 11;
22 unsigned int PrefilteringIndexReader::ENTRIESNUM = 12;
23 unsigned int PrefilteringIndexReader::SEQCOUNT = 13;
24 unsigned int PrefilteringIndexReader::SEQINDEXDATA = 14;
25 unsigned int PrefilteringIndexReader::SEQINDEXDATASIZE = 15;
26 unsigned int PrefilteringIndexReader::SEQINDEXSEQOFFSET = 16;
27 unsigned int PrefilteringIndexReader::HDR1INDEX = 18;
28 unsigned int PrefilteringIndexReader::HDR1DATA = 19;
29 unsigned int PrefilteringIndexReader::HDR2INDEX = 20;
30 unsigned int PrefilteringIndexReader::HDR2DATA = 21;
31 unsigned int PrefilteringIndexReader::GENERATOR = 22;
32 unsigned int PrefilteringIndexReader::SPACEDPATTERN = 23;
33 
34 extern const char* version;
35 
checkIfIndexFile(DBReader<unsigned int> * reader)36 bool PrefilteringIndexReader::checkIfIndexFile(DBReader<unsigned int>* reader) {
37     char * version = reader->getDataByDBKey(VERSION, 0);
38     if(version == NULL){
39         return false;
40     }
41     return (strncmp(version, CURRENT_VERSION, strlen(CURRENT_VERSION)) == 0 ) ? true : false;
42 }
43 
indexName(const std::string & outDB)44 std::string PrefilteringIndexReader::indexName(const std::string &outDB) {
45     std::string result(outDB);
46     result.append(".idx");
47     return result;
48 }
49 
createIndexFile(const std::string & outDB,DBReader<unsigned int> * dbr1,DBReader<unsigned int> * dbr2,DBReader<unsigned int> * hdbr1,DBReader<unsigned int> * hdbr2,BaseMatrix * subMat,int maxSeqLen,bool hasSpacedKmer,const std::string & spacedKmerPattern,bool compBiasCorrection,int alphabetSize,int kmerSize,int maskMode,int maskLowerCase,int kmerThr,int splits)50 void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
51                                               DBReader<unsigned int> *dbr1, DBReader<unsigned int> *dbr2,
52                                               DBReader<unsigned int> *hdbr1, DBReader<unsigned int> *hdbr2,
53                                               BaseMatrix *subMat, int maxSeqLen,
54                                               bool hasSpacedKmer, const std::string &spacedKmerPattern,
55                                               bool compBiasCorrection, int alphabetSize, int kmerSize,
56                                               int maskMode, int maskLowerCase, int kmerThr, int splits) {
57 
58     const int SPLIT_META = splits > 1 ? 0 : 0;
59     const int SPLIT_SEQS = splits > 1 ? 1 : 0;
60     const int SPLIT_INDX = splits > 1 ? 2 : 0;
61 
62     DBWriter writer(outDB.c_str(), std::string(outDB).append(".index").c_str(), splits > 1 ? splits + 2 : 1, Parameters::WRITER_ASCII_MODE, Parameters::DBTYPE_INDEX_DB);
63     writer.open();
64 
65     Debug(Debug::INFO) << "Write VERSION (" << VERSION << ")\n";
66     writer.writeData((char *) CURRENT_VERSION, strlen(CURRENT_VERSION) * sizeof(char), VERSION, SPLIT_META);
67     writer.alignToPageSize(SPLIT_META);
68 
69     Debug(Debug::INFO) << "Write META (" << META << ")\n";
70     const int biasCorr = compBiasCorrection ? 1 : 0;
71     const int mask = maskMode > 0;
72     const int spacedKmer = (hasSpacedKmer) ? 1 : 0;
73     const int headers1 = (hdbr1 != NULL) ? 1 : 0;
74     const int headers2 = (hdbr2 != NULL) ? 1 : 0;
75     const int seqType = dbr1->getDbtype();
76     const int srcSeqType = (dbr2 !=NULL) ? dbr2->getDbtype() : seqType;
77     int metadata[] = {maxSeqLen, kmerSize, biasCorr, alphabetSize, mask, spacedKmer, kmerThr, seqType, srcSeqType, headers1, headers2, splits};
78     char *metadataptr = (char *) &metadata;
79     writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META);
80     writer.alignToPageSize(SPLIT_META);
81 
82     if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false &&
83         Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ) == false) {
84         int alphabetSize = subMat->alphabetSize;
85         subMat->alphabetSize = subMat->alphabetSize-1;
86         ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
87         ScoreMatrix s2 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 2);
88         subMat->alphabetSize = alphabetSize;
89 
90         char* serialized3mer = ScoreMatrix::serialize(s3);
91         Debug(Debug::INFO) << "Write SCOREMATRIX3MER (" << SCOREMATRIX3MER << ")\n";
92         writer.writeData(serialized3mer, ScoreMatrix::size(s3), SCOREMATRIX3MER, SPLIT_META);
93         writer.alignToPageSize(SPLIT_META);
94         ExtendedSubstitutionMatrix::freeScoreMatrix(s3);
95         free(serialized3mer);
96 
97         char* serialized2mer = ScoreMatrix::serialize(s2);
98         Debug(Debug::INFO) << "Write SCOREMATRIX2MER (" << SCOREMATRIX2MER << ")\n";
99         writer.writeData(serialized2mer, ScoreMatrix::size(s2), SCOREMATRIX2MER, SPLIT_META);
100         writer.alignToPageSize(SPLIT_META);
101         ExtendedSubstitutionMatrix::freeScoreMatrix(s2);
102         free(serialized2mer);
103     }
104 
105     Debug(Debug::INFO) << "Write SCOREMATRIXNAME (" << SCOREMATRIXNAME << ")\n";
106     char* subData = BaseMatrix::serialize(subMat->matrixName, subMat->matrixData);
107     writer.writeData(subData, BaseMatrix::memorySize(subMat->matrixName, subMat->matrixData), SCOREMATRIXNAME, SPLIT_META);
108     writer.alignToPageSize(SPLIT_META);
109     free(subData);
110 
111     if (spacedKmerPattern.empty() != false) {
112         Debug(Debug::INFO) << "Write SPACEDPATTERN (" << SPACEDPATTERN << ")\n";
113         writer.writeData(spacedKmerPattern.c_str(), spacedKmerPattern.length(), SPACEDPATTERN, SPLIT_META);
114         writer.alignToPageSize(SPLIT_META);
115     }
116 
117     Debug(Debug::INFO) << "Write GENERATOR (" << GENERATOR << ")\n";
118     writer.writeData(version, strlen(version), GENERATOR, SPLIT_META);
119     writer.alignToPageSize(SPLIT_META);
120 
121     Debug(Debug::INFO) << "Write DBR1INDEX (" << DBR1INDEX << ")\n";
122     char* data = DBReader<unsigned int>::serialize(*dbr1);
123     size_t offsetIndex = writer.getOffset(SPLIT_SEQS);
124     writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*dbr1), DBR1INDEX, SPLIT_SEQS);
125     writer.alignToPageSize(SPLIT_SEQS);
126 
127     Debug(Debug::INFO) << "Write DBR1DATA (" << DBR1DATA << ")\n";
128     size_t offsetData = writer.getOffset(SPLIT_SEQS);
129     writer.writeStart(SPLIT_SEQS);
130     for(size_t fileIdx = 0; fileIdx < dbr1->getDataFileCnt(); fileIdx++) {
131         writer.writeAdd(dbr1->getDataForFile(fileIdx), dbr1->getDataSizeForFile(fileIdx), SPLIT_SEQS);
132     }
133     writer.writeEnd(DBR1DATA, SPLIT_SEQS);
134     writer.alignToPageSize(SPLIT_SEQS);
135     free(data);
136 
137     if (dbr2 == NULL) {
138         writer.writeIndexEntry(DBR2INDEX, offsetIndex, DBReader<unsigned int>::indexMemorySize(*dbr1)+1, SPLIT_SEQS);
139         writer.writeIndexEntry(DBR2DATA,  offsetData,  dbr1->getTotalDataSize()+1, SPLIT_SEQS);
140     } else {
141         Debug(Debug::INFO) << "Write DBR2INDEX (" << DBR2INDEX << ")\n";
142         data = DBReader<unsigned int>::serialize(*dbr2);
143         writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*dbr2), DBR2INDEX, SPLIT_SEQS);
144         writer.alignToPageSize(SPLIT_SEQS);
145         Debug(Debug::INFO) << "Write DBR2DATA (" << DBR2DATA << ")\n";
146         writer.writeStart(SPLIT_SEQS);
147         for(size_t fileIdx = 0; fileIdx < dbr2->getDataFileCnt(); fileIdx++) {
148             writer.writeAdd(dbr2->getDataForFile(fileIdx), dbr2->getDataSizeForFile(fileIdx), SPLIT_SEQS);
149         }
150         writer.writeEnd(DBR2DATA, SPLIT_SEQS);
151         writer.alignToPageSize(SPLIT_SEQS);
152         free(data);
153     }
154 
155     if (hdbr1 != NULL) {
156         Debug(Debug::INFO) << "Write HDR1INDEX (" << HDR1INDEX << ")\n";
157         data = DBReader<unsigned int>::serialize(*hdbr1);
158         size_t offsetIndex = writer.getOffset(SPLIT_SEQS);
159         writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*hdbr1), HDR1INDEX, SPLIT_SEQS);
160         writer.alignToPageSize(SPLIT_SEQS);
161 
162         Debug(Debug::INFO) << "Write HDR1DATA (" << HDR1DATA << ")\n";
163         size_t offsetData = writer.getOffset(SPLIT_SEQS);
164         writer.writeStart(SPLIT_SEQS);
165         for(size_t fileIdx = 0; fileIdx < hdbr1->getDataFileCnt(); fileIdx++) {
166             writer.writeAdd(hdbr1->getDataForFile(fileIdx), hdbr1->getDataSizeForFile(fileIdx), SPLIT_SEQS);
167         }
168         writer.writeEnd(HDR1DATA, SPLIT_SEQS);
169         writer.alignToPageSize(SPLIT_SEQS);
170         free(data);
171         if (hdbr2 == NULL) {
172             writer.writeIndexEntry(HDR2INDEX, offsetIndex, DBReader<unsigned int>::indexMemorySize(*hdbr1)+1, SPLIT_SEQS);
173             writer.writeIndexEntry(HDR2DATA,  offsetData, hdbr1->getTotalDataSize()+1, SPLIT_SEQS);
174         }
175     }
176     if (hdbr2 != NULL) {
177         Debug(Debug::INFO) << "Write HDR2INDEX (" << HDR2INDEX << ")\n";
178         data = DBReader<unsigned int>::serialize(*hdbr2);
179         writer.writeData(data, DBReader<unsigned int>::indexMemorySize(*hdbr2), HDR2INDEX, SPLIT_SEQS);
180         writer.alignToPageSize(SPLIT_SEQS);
181         Debug(Debug::INFO) << "Write HDR2DATA (" << HDR2DATA << ")\n";
182         writer.writeStart(SPLIT_SEQS);
183         for(size_t fileIdx = 0; fileIdx < hdbr2->getDataFileCnt(); fileIdx++) {
184             writer.writeAdd(hdbr2->getDataForFile(fileIdx), hdbr2->getDataSizeForFile(fileIdx), SPLIT_SEQS);
185         }
186         writer.writeEnd(HDR2DATA, SPLIT_SEQS);
187         writer.alignToPageSize(SPLIT_SEQS);
188         free(data);
189     }
190 
191     Sequence seq(maxSeqLen, seqType, subMat, kmerSize, hasSpacedKmer, compBiasCorrection, true, spacedKmerPattern);
192     // remove x (not needed in index)
193     const int adjustAlphabetSize =
194             (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_AMINO_ACIDS))
195                 ? alphabetSize -1: alphabetSize;
196 
197     for (int s = 0; s < splits; s++) {
198         size_t dbFrom = 0;
199         size_t dbSize = 0;
200         dbr1->decomposeDomainByAminoAcid(s, splits, &dbFrom, &dbSize);
201         if (dbSize == 0) {
202             continue;
203         }
204 
205         IndexTable indexTable(adjustAlphabetSize, kmerSize, false);
206         SequenceLookup *sequenceLookup = NULL;
207         IndexBuilder::fillDatabase(&indexTable,
208                                    (maskMode == 1 || maskLowerCase == 1) ? &sequenceLookup : NULL,
209                                    (maskMode == 0 ) ? &sequenceLookup : NULL,
210                                    *subMat, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, maskMode, maskLowerCase);
211         indexTable.printStatistics(subMat->num2aa);
212 
213         if (sequenceLookup == NULL) {
214             Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
215             EXIT(EXIT_FAILURE);
216         }
217 
218         // save the entries
219         unsigned int keyOffset = 1000 * s;
220         Debug(Debug::INFO) << "Write ENTRIES (" << (keyOffset + ENTRIES) << ")\n";
221         char *entries = (char *) indexTable.getEntries();
222         size_t entriesSize = indexTable.getTableEntriesNum() * indexTable.getSizeOfEntry();
223         writer.writeData(entries, entriesSize, (keyOffset + ENTRIES), SPLIT_INDX + s);
224         writer.alignToPageSize(SPLIT_INDX + s);
225 
226         // save the size
227         Debug(Debug::INFO) << "Write ENTRIESOFFSETS (" << (keyOffset + ENTRIESOFFSETS) << ")\n";
228         char *offsets = (char*)indexTable.getOffsets();
229         size_t offsetsSize = (indexTable.getTableSize() + 1) * sizeof(size_t);
230         writer.writeData(offsets, offsetsSize, (keyOffset + ENTRIESOFFSETS), SPLIT_INDX + s);
231         writer.alignToPageSize(SPLIT_INDX + s);
232         indexTable.deleteEntries();
233 
234         Debug(Debug::INFO) << "Write SEQINDEXDATASIZE (" << (keyOffset + SEQINDEXDATASIZE) << ")\n";
235         int64_t seqindexDataSize = sequenceLookup->getDataSize();
236         char *seqindexDataSizePtr = (char *) &seqindexDataSize;
237         writer.writeData(seqindexDataSizePtr, 1 * sizeof(int64_t), (keyOffset + SEQINDEXDATASIZE), SPLIT_INDX + s);
238         writer.alignToPageSize(SPLIT_INDX + s);
239 
240         size_t *sequenceOffsets = sequenceLookup->getOffsets();
241         size_t sequenceCount = sequenceLookup->getSequenceCount();
242         Debug(Debug::INFO) << "Write SEQINDEXSEQOFFSET (" << (keyOffset + SEQINDEXSEQOFFSET) << ")\n";
243         writer.writeData((char *) sequenceOffsets, (sequenceCount + 1) * sizeof(size_t), (keyOffset + SEQINDEXSEQOFFSET), SPLIT_INDX + s);
244         writer.alignToPageSize(SPLIT_INDX + s);
245 
246         Debug(Debug::INFO) << "Write SEQINDEXDATA (" << (keyOffset + SEQINDEXDATA) << ")\n";
247         writer.writeData(sequenceLookup->getData(), (sequenceLookup->getDataSize() + 1) * sizeof(char), (keyOffset + SEQINDEXDATA), SPLIT_INDX + s);
248         writer.alignToPageSize(SPLIT_INDX + s);
249         delete sequenceLookup;
250 
251         // ENTRIESNUM
252         Debug(Debug::INFO) << "Write ENTRIESNUM (" << (keyOffset + ENTRIESNUM) << ")\n";
253         uint64_t entriesNum = indexTable.getTableEntriesNum();
254         char *entriesNumPtr = (char *) &entriesNum;
255         writer.writeData(entriesNumPtr, 1 * sizeof(uint64_t), (keyOffset + ENTRIESNUM), SPLIT_INDX + s);
256         writer.alignToPageSize(SPLIT_INDX + s);
257 
258         // SEQCOUNT
259         Debug(Debug::INFO) << "Write SEQCOUNT (" << (keyOffset + SEQCOUNT) << ")\n";
260         size_t tablesize = indexTable.getSize();
261         char *tablesizePtr = (char *) &tablesize;
262         writer.writeData(tablesizePtr, 1 * sizeof(size_t), (keyOffset + SEQCOUNT), SPLIT_INDX + s);
263         writer.alignToPageSize(SPLIT_INDX + s);
264     }
265 
266     writer.close(false);
267 }
268 
openNewHeaderReader(DBReader<unsigned int> * dbr,unsigned int dataIdx,unsigned int indexIdx,int threads,bool touchIndex,bool touchData)269 DBReader<unsigned int> *PrefilteringIndexReader::openNewHeaderReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads,  bool touchIndex, bool touchData) {
270     size_t indexId = dbr->getId(indexIdx);
271     char *indexData = dbr->getData(indexId, 0);
272     if (touchIndex) {
273         dbr->touchData(indexId);
274     }
275 
276     size_t dataId = dbr->getId(dataIdx);
277     char *data = dbr->getData(dataId, 0);
278 
279     size_t currDataOffset = dbr->getOffset(dataId);
280     size_t nextDataOffset = dbr->findNextOffsetid(dataId);
281     size_t dataSize = nextDataOffset-currDataOffset;
282 
283     if (touchData) {
284         dbr->touchData(dataId);
285     }
286 
287     DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(indexData, threads);
288     reader->open(DBReader<unsigned int>::NOSORT);
289     reader->setData(data, dataSize);
290     reader->setMode(DBReader<unsigned int>::USE_DATA);
291     return reader;
292 }
293 
openNewReader(DBReader<unsigned int> * dbr,unsigned int dataIdx,unsigned int indexIdx,bool includeData,int threads,bool touchIndex,bool touchData)294 DBReader<unsigned int> *PrefilteringIndexReader::openNewReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, bool includeData, int threads, bool touchIndex, bool touchData) {
295     size_t id = dbr->getId(indexIdx);
296     char *data = dbr->getDataUncompressed(id);
297     if (touchIndex) {
298         dbr->touchData(id);
299     }
300 
301     if (includeData) {
302         id = dbr->getId(dataIdx);
303         if (id == UINT_MAX) {
304             return NULL;
305         }
306         if (touchData) {
307             dbr->touchData(id);
308         }
309 
310         DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(data, threads);
311         reader->open(DBReader<unsigned int>::NOSORT);
312         size_t currDataOffset = dbr->getOffset(id);
313         size_t nextDataOffset = dbr->findNextOffsetid(id);
314         size_t dataSize = nextDataOffset-currDataOffset;
315         reader->setData(dbr->getDataUncompressed(id), dataSize);
316         reader->setMode(DBReader<unsigned int>::USE_DATA);
317         return reader;
318     }
319 
320     DBReader<unsigned int> *reader = DBReader<unsigned int>::unserialize(data, threads);
321     reader->open(DBReader<unsigned int>::NOSORT);
322     return reader;
323 }
324 
getSequenceLookup(unsigned int split,DBReader<unsigned int> * dbr,int preloadMode)325 SequenceLookup *PrefilteringIndexReader::getSequenceLookup(unsigned int split, DBReader<unsigned int> *dbr, int preloadMode) {
326     PrefilteringIndexData data = getMetadata(dbr);
327     if (split >= (unsigned int)data.splits) {
328         Debug(Debug::ERROR) << "Invalid split " << split << " out of " << data.splits << " chosen.\n";
329         EXIT(EXIT_FAILURE);
330     }
331 
332     unsigned int splitOffset = split * 1000;
333 
334     size_t id = dbr->getId(splitOffset + SEQINDEXDATA);
335     if (id == UINT_MAX) {
336         return NULL;
337     }
338 
339     char * seqData = dbr->getDataUncompressed(id);
340 
341     size_t seqOffsetsId = dbr->getId(splitOffset + SEQINDEXSEQOFFSET);
342     char * seqOffsetsData = dbr->getDataUncompressed(seqOffsetsId);
343 
344     size_t seqDataSizeId = dbr->getId(splitOffset + SEQINDEXDATASIZE);
345     int64_t seqDataSize = *((int64_t *)dbr->getDataUncompressed(seqDataSizeId));
346 
347     size_t sequenceCountId = dbr->getId(splitOffset + SEQCOUNT);
348     size_t sequenceCount = *((size_t *)dbr->getDataUncompressed(sequenceCountId));
349 
350     if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
351         SequenceLookup *sequenceLookup = new SequenceLookup(sequenceCount, seqDataSize);
352         sequenceLookup->initLookupByExternalDataCopy(seqData, (size_t *) seqOffsetsData);
353         return sequenceLookup;
354     }
355 
356     if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
357         dbr->touchData(id);
358         dbr->touchData(seqOffsetsId);
359     }
360 
361     SequenceLookup *sequenceLookup = new SequenceLookup(sequenceCount);
362     sequenceLookup->initLookupByExternalData(seqData, seqDataSize, (size_t *) seqOffsetsData);
363     return sequenceLookup;
364 }
365 
getIndexTable(unsigned int split,DBReader<unsigned int> * dbr,int preloadMode)366 IndexTable *PrefilteringIndexReader::getIndexTable(unsigned int split, DBReader<unsigned int> *dbr, int preloadMode) {
367     PrefilteringIndexData data = getMetadata(dbr);
368     if (split >= (unsigned int)data.splits) {
369         Debug(Debug::ERROR) << "Invalid split " << split << " out of " << data.splits << " chosen.\n";
370         EXIT(EXIT_FAILURE);
371     }
372 
373     unsigned int splitOffset = split * 1000;
374     size_t entriesNumId = dbr->getId(splitOffset + ENTRIESNUM);
375     int64_t entriesNum = *((int64_t *)dbr->getDataUncompressed(entriesNumId));
376     size_t sequenceCountId = dbr->getId(splitOffset +SEQCOUNT);
377     size_t sequenceCount = *((size_t *)dbr->getDataUncompressed(sequenceCountId));
378 
379     size_t entriesDataId = dbr->getId(splitOffset + ENTRIES);
380     char *entriesData = dbr->getDataUncompressed(entriesDataId);
381 
382     size_t entriesOffsetsDataId = dbr->getId(splitOffset + ENTRIESOFFSETS);
383     char *entriesOffsetsData = dbr->getDataUncompressed(entriesOffsetsDataId);
384 
385     int adjustAlphabetSize;
386     if (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_NUCLEOTIDES) || Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_AMINO_ACIDS)) {
387         adjustAlphabetSize = data.alphabetSize - 1;
388     } else {
389         adjustAlphabetSize = data.alphabetSize;
390     }
391 
392     if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
393         IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, false);
394         table->initTableByExternalDataCopy(sequenceCount, entriesNum, (IndexEntryLocal*) entriesData, (size_t *)entriesOffsetsData);
395         return table;
396     }
397 
398     if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
399         dbr->touchData(entriesNumId);
400         dbr->touchData(sequenceCountId);
401         dbr->touchData(entriesDataId);
402         dbr->touchData(entriesOffsetsDataId);
403     }
404 
405     IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, true);
406     table->initTableByExternalData(sequenceCount, entriesNum, (IndexEntryLocal*) entriesData, (size_t *)entriesOffsetsData);
407     return table;
408 }
409 
printSummary(DBReader<unsigned int> * dbr)410 void PrefilteringIndexReader::printSummary(DBReader<unsigned int> *dbr) {
411     Debug(Debug::INFO) << "Index version: " << dbr->getDataByDBKey(VERSION, 0) << "\n";
412 
413     size_t id;
414     if ((id = dbr->getId(GENERATOR)) != UINT_MAX) {
415         Debug(Debug::INFO)
416                        << "Generated by:  " << dbr->getDataUncompressed(id) << "\n";
417     }
418 
419     char * subMatData = dbr->getDataByDBKey(SCOREMATRIXNAME, 0);
420     size_t pos = 0;
421     while(subMatData[pos]!='\0') {
422         if (subMatData[pos] == '.'
423             && subMatData[pos + 1] == 'o'
424             && subMatData[pos + 2] == 'u'
425             && subMatData[pos + 3] == 't'
426             && subMatData[pos + 4] == ':') {
427             break;
428         }
429         pos++;
430     }
431     Debug(Debug::INFO) << "ScoreMatrix:  " << std::string(subMatData, pos+4) << "\n";
432 }
433 
printMeta(int * metadata_tmp)434 void PrefilteringIndexReader::printMeta(int *metadata_tmp) {
435     Debug(Debug::INFO) << "MaxSeqLength: " << metadata_tmp[0] << "\n";
436     Debug(Debug::INFO) << "KmerSize:     " << metadata_tmp[1] << "\n";
437     Debug(Debug::INFO) << "CompBiasCorr: " << metadata_tmp[2] << "\n";
438     Debug(Debug::INFO) << "AlphabetSize: " << metadata_tmp[3] << "\n";
439     Debug(Debug::INFO) << "Masked:       " << metadata_tmp[4] << "\n";
440     Debug(Debug::INFO) << "Spaced:       " << metadata_tmp[5] << "\n";
441     Debug(Debug::INFO) << "KmerScore:    " << metadata_tmp[6] << "\n";
442     Debug(Debug::INFO) << "SequenceType: " << Parameters::getDbTypeName(metadata_tmp[7]) << "\n";
443     Debug(Debug::INFO) << "SourcSeqType: " << Parameters::getDbTypeName(metadata_tmp[8]) << "\n";
444     Debug(Debug::INFO) << "Headers1:     " << metadata_tmp[9] << "\n";
445     Debug(Debug::INFO) << "Headers2:     " << metadata_tmp[10] << "\n";
446     // Keep compatible to index version 15
447     Debug(Debug::INFO) << "Splits:       " << (metadata_tmp[11] == 0 ? 1 : metadata_tmp[11]) << "\n";
448 }
449 
getMetadata(DBReader<unsigned int> * dbr)450 PrefilteringIndexData PrefilteringIndexReader::getMetadata(DBReader<unsigned int> *dbr) {
451     int *meta = (int *)dbr->getDataByDBKey(META, 0);
452 
453     PrefilteringIndexData data;
454     data.maxSeqLength = meta[0];
455     data.kmerSize = meta[1];
456     data.compBiasCorr = meta[2];
457     data.alphabetSize = meta[3];
458     data.mask = meta[4];
459     data.spacedKmer = meta[5];
460     data.kmerThr = meta[6];
461     data.seqType = meta[7];
462     data.srcSeqType = meta[8];
463     data.headers1 = meta[9];
464     data.headers2 = meta[10];
465     // Keep compatible to index version 15, where meta[11] would have been zero due to the alignment padding
466     data.splits = meta[11] == 0 ? 1 : meta[11];
467 
468     return data;
469 }
470 
getSubstitutionMatrixName(DBReader<unsigned int> * dbr)471 std::string PrefilteringIndexReader::getSubstitutionMatrixName(DBReader<unsigned int> *dbr) {
472     unsigned int key = dbr->getDbKey(SCOREMATRIXNAME);
473     if (key == UINT_MAX) {
474         return "";
475     }
476     const char *data = dbr->getData(key, 0);
477     size_t len = dbr->getEntryLen(key) - 1;
478     std::string matrixName;
479     bool found = false;
480     for (size_t pos = 0; pos < std::max(len, (size_t)4) - 4 && found == false; pos++) {
481         if (data[pos] == '.'
482             && data[pos + 1] == 'o'
483             && data[pos + 2] == 'u'
484             && data[pos + 3] == 't'
485             && data[pos + 4] == ':') {
486             matrixName = std::string(data, pos + 4);
487             found = true;
488         }
489     }
490     if (found == false) {
491         matrixName = std::string(data);
492     }
493     return matrixName;
494 }
495 
getSubstitutionMatrix(DBReader<unsigned int> * dbr)496 std::string PrefilteringIndexReader::getSubstitutionMatrix(DBReader<unsigned int> *dbr) {
497     return std::string(dbr->getDataByDBKey(SCOREMATRIXNAME, 0));
498 }
499 
getSpacedPattern(DBReader<unsigned int> * dbr)500 std::string PrefilteringIndexReader::getSpacedPattern(DBReader<unsigned int> *dbr) {
501     size_t id = dbr->getId(SPACEDPATTERN);
502     if (id == UINT_MAX) {
503         return "";
504     }
505     return std::string(dbr->getDataUncompressed(id));
506 }
507 
get2MerScoreMatrix(DBReader<unsigned int> * dbr,int preloadMode)508 ScoreMatrix PrefilteringIndexReader::get2MerScoreMatrix(DBReader<unsigned int> *dbr, int preloadMode) {
509     size_t id = dbr->getId(SCOREMATRIX2MER);
510     if (id == UINT_MAX) {
511         return ScoreMatrix();
512     }
513 
514     // read alphabetsize to remove x (not needed in index)
515     PrefilteringIndexData meta = getMetadata(dbr);
516 
517     char *data = dbr->getDataUncompressed(id);
518     if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
519         return ScoreMatrix::unserializeCopy(data, meta.alphabetSize-1, 2);
520     }
521 
522     if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
523         dbr->touchData(id);
524     }
525     return ScoreMatrix::unserialize(data, meta.alphabetSize-1, 2);
526 }
527 
get3MerScoreMatrix(DBReader<unsigned int> * dbr,int preloadMode)528 ScoreMatrix PrefilteringIndexReader::get3MerScoreMatrix(DBReader<unsigned int> *dbr, int preloadMode) {
529     size_t id = dbr->getId(SCOREMATRIX3MER);
530     if (id == UINT_MAX) {
531         return ScoreMatrix();
532     }
533 
534     // read alphabetsize to remove x (not needed in index)
535     PrefilteringIndexData meta = getMetadata(dbr);
536 
537     char *data = dbr->getDataUncompressed(id);
538     if (preloadMode == Parameters::PRELOAD_MODE_FREAD) {
539         return ScoreMatrix::unserializeCopy(data, meta.alphabetSize-1, 3);
540     }
541 
542     if (preloadMode == Parameters::PRELOAD_MODE_MMAP_TOUCH) {
543         dbr->touchData(id);
544     }
545     return ScoreMatrix::unserialize(data, meta.alphabetSize-1, 3);
546 }
547 
searchForIndex(const std::string & pathToDB)548 std::string PrefilteringIndexReader::searchForIndex(const std::string &pathToDB) {
549     std::string outIndexName = pathToDB + ".idx";
550     if (FileUtil::fileExists((outIndexName + ".dbtype").c_str()) == true) {
551         return outIndexName;
552     }
553     return "";
554 }
555 
dbPathWithoutIndex(std::string & dbname)556 std::string PrefilteringIndexReader::dbPathWithoutIndex(std::string & dbname) {
557     std::string rawname = dbname;
558     // check for .idx
559     size_t idxlastpos = dbname.rfind(".idx");
560     if(idxlastpos != std::string::npos && dbname.size() - idxlastpos == 4){
561         rawname  = dbname.substr(0, idxlastpos);
562     }
563     // check for .linidx
564     size_t linidxlastpos = dbname.rfind(".linidx");
565     if(linidxlastpos != std::string::npos && dbname.size() - linidxlastpos == 7){
566         rawname  = dbname.substr(0, linidxlastpos);
567     }
568     return rawname;
569 }
570 
571 
572