1 #ifndef INDEX_TABLE_H
2 #define INDEX_TABLE_H
3 
4 //
5 // Written by Martin Steinegger martin.steinegger@snu.ac.kr and Maria Hauser mhauser@genzentrum.lmu.de
6 //
7 // Abstract: Index table stores the list of DB sequences containing a certain k-mer, for each k-mer.
8 //
9 
10 #include "DBReader.h"
11 #include "Sequence.h"
12 #include "Indexer.h"
13 #include "Debug.h"
14 #include "Util.h"
15 #include "SequenceLookup.h"
16 #include "MathUtil.h"
17 #include "KmerGenerator.h"
18 #include "Parameters.h"
19 #include "FastSort.h"
20 #include <stdlib.h>
21 #include <algorithm>
22 
23 // IndexEntryLocal is an entry with position and seqId for a kmer
24 // structure needs to be packed or it will need 8 bytes instead of 6
25 struct __attribute__((__packed__)) IndexEntryLocal {
26     unsigned int seqId;
27     unsigned short position_j;
comapreByIdAndPosIndexEntryLocal28     static bool comapreByIdAndPos(IndexEntryLocal first, IndexEntryLocal second){
29         if(first.seqId < second.seqId )
30             return true;
31         if(second.seqId < first.seqId )
32             return false;
33         if(first.position_j < second.position_j )
34             return true;
35         if(second.position_j < first.position_j )
36             return false;
37         return false;
38     }
39 };
40 
41 struct __attribute__((__packed__)) IndexEntryLocalTmp {
42     unsigned int kmer;
43     unsigned int seqId;
44     unsigned short position_j;
45 
IndexEntryLocalTmpIndexEntryLocalTmp46     IndexEntryLocalTmp(unsigned int kmer, unsigned int seqId, unsigned short position_j)
47             :kmer(kmer),seqId(seqId), position_j(position_j)
48     {}
49 
IndexEntryLocalTmpIndexEntryLocalTmp50     IndexEntryLocalTmp() {}
51 
comapreByIdAndPosIndexEntryLocalTmp52     static bool comapreByIdAndPos(IndexEntryLocalTmp first, IndexEntryLocalTmp second){
53         if(first.kmer < second.kmer )
54             return true;
55         if(second.kmer < first.kmer )
56             return false;
57         if(first.position_j < second.position_j )
58             return true;
59         if(second.position_j < first.position_j )
60             return false;
61         return false;
62     }
63 };
64 
65 class IndexTable {
66 public:
IndexTable(int alphabetSize,int kmerSize,bool externalData)67     IndexTable(int alphabetSize, int kmerSize, bool externalData)
68             : tableSize(MathUtil::ipow<size_t>(alphabetSize, kmerSize)), alphabetSize(alphabetSize),
69               kmerSize(kmerSize), externalData(externalData), tableEntriesNum(0), size(0),
70               indexer(new Indexer(alphabetSize, kmerSize)), entries(NULL), offsets(NULL) {
71         if (externalData == false) {
72             offsets = new(std::nothrow) size_t[tableSize + 1];
73             Util::checkAllocation(offsets, "Can not allocate entries memory in IndexTable");
74             memset(offsets, 0, (tableSize + 1) * sizeof(size_t));
75         }
76     }
77 
~IndexTable()78     virtual ~IndexTable() {
79         deleteEntries();
80         delete indexer;
81     }
82 
deleteEntries()83     void deleteEntries() {
84         if (externalData == false) {
85             if (entries != NULL) {
86                 delete[] entries;
87                 entries = NULL;
88             }
89             if (offsets != NULL) {
90                 delete[] offsets;
91                 offsets = NULL;
92             }
93         }
94     }
95 
96     // count k-mers in the sequence, so enough memory for the sequence lists can be allocated in the end
addSimilarKmerCount(Sequence * s,KmerGenerator * kmerGenerator)97     size_t addSimilarKmerCount(Sequence* s, KmerGenerator* kmerGenerator){
98         s->resetCurrPos();
99         std::vector<unsigned int> seqKmerPosBuffer;
100 
101         //idxer->reset();
102         while(s->hasNextKmer()){
103             const unsigned char * kmer = s->nextKmer();
104             const std::pair<size_t *, size_t> kmerList = kmerGenerator->generateKmerList(kmer);
105 
106             //unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize);
107             for(size_t i = 0; i < kmerList.second; i++){
108                 seqKmerPosBuffer.push_back(kmerList.first[i]);
109             }
110         }
111         if(seqKmerPosBuffer.size() > 1){
112             SORT_SERIAL(seqKmerPosBuffer.begin(), seqKmerPosBuffer.end());
113         }
114         size_t countUniqKmer = 0;
115         unsigned int prevKmerIdx = UINT_MAX;
116         for(size_t i = 0; i < seqKmerPosBuffer.size(); i++){
117             unsigned int kmerIdx = seqKmerPosBuffer[i];
118             if(prevKmerIdx != kmerIdx){
119                 //table[kmerIdx] += 1;
120                 // size increases by one
121                 __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
122                 countUniqKmer++;
123             }
124             prevKmerIdx = kmerIdx;
125         }
126         return countUniqKmer;
127     }
128 
129     // count k-mers in the sequence, so enough memory for the sequence lists can be allocated in the end
addKmerCount(Sequence * s,Indexer * idxer,unsigned int * seqKmerPosBuffer,int threshold,char * diagonalScore)130     size_t addKmerCount(Sequence *s, Indexer *idxer, unsigned int *seqKmerPosBuffer,
131                         int threshold, char *diagonalScore) {
132         s->resetCurrPos();
133         size_t countKmer = 0;
134         bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
135                         Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
136         while(s->hasNextKmer()){
137             const unsigned char * kmer = s->nextKmer();
138             if(removeX && s->kmerContainsX()){
139                 continue;
140             }
141             if(threshold > 0){
142                 int score = 0;
143                 for(int pos = 0; pos < kmerSize; pos++){
144                     score += diagonalScore[kmer[pos]];
145                 }
146                 if(score < threshold){
147                     continue;
148                 }
149             }
150             unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize);
151             seqKmerPosBuffer[countKmer] = kmerIdx;
152             countKmer++;
153         }
154         if(countKmer > 1){
155             SORT_SERIAL(seqKmerPosBuffer, seqKmerPosBuffer + countKmer);
156         }
157         size_t countUniqKmer = 0;
158         unsigned int prevKmerIdx = UINT_MAX;
159         for(size_t i = 0; i < countKmer; i++){
160             unsigned int kmerIdx = seqKmerPosBuffer[i];
161             if(prevKmerIdx != kmerIdx){
162                 //table[kmerIdx] += 1;
163                 // size increases by one
164                 __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
165                 countUniqKmer++;
166             }
167             prevKmerIdx = kmerIdx;
168         }
169         return countUniqKmer;
170     }
171 
172     // get list of DB sequences containing this k-mer
getDBSeqList(size_t kmer,size_t * matchedListSize)173     inline IndexEntryLocal *getDBSeqList(size_t kmer, size_t *matchedListSize) {
174         const ptrdiff_t diff = offsets[kmer + 1] - offsets[kmer];
175         *matchedListSize = static_cast<size_t>(diff);
176         return (entries + offsets[kmer]);
177     }
178 
sortDBSeqLists()179     void sortDBSeqLists() {
180         #pragma omp parallel for
181         for (size_t i = 0; i < tableSize; i++) {
182             size_t entrySize;
183             IndexEntryLocal *entries = getDBSeqList(i, &entrySize);
184             SORT_SERIAL(entries, entries + entrySize, IndexEntryLocal::comapreByIdAndPos);
185         }
186     }
187 
188     // get pointer to entries array
getEntries()189     IndexEntryLocal *getEntries() {
190         return entries;
191     }
192 
getOffset(size_t kmer)193     inline size_t getOffset(size_t kmer) {
194         return offsets[kmer];
195     }
196 
getOffsets()197     size_t *getOffsets() {
198         return offsets;
199     }
200 
201     // init the arrays for the sequence lists
initMemory(size_t dbSize)202     void initMemory(size_t dbSize) {
203         size_t tableEntriesNum = 0;
204         for (size_t i = 0; i < getTableSize(); i++) {
205             tableEntriesNum += getOffset(i);
206         }
207 
208         this->tableEntriesNum = tableEntriesNum;
209         this->size = dbSize; // amount of sequences added
210 
211         // allocate memory for the sequence id lists
212         entries = new(std::nothrow) IndexEntryLocal[tableEntriesNum];
213         Util::checkAllocation(entries, "Can not allocate entries memory in IndexTable::initMemory");
214     }
215 
216     // allocates memory for index tables
init()217     void init() {
218         // set the pointers in the index table to the start of the list for a certain k-mer
219         size_t offset = 0;
220         for (size_t i = 0; i < tableSize; i++) {
221             const size_t currentOffset = offsets[i];
222             offsets[i] = offset;
223             offset += currentOffset;
224         }
225         offsets[tableSize] = offset;
226     }
227 
228     // init index table with external data (needed for index readin)
initTableByExternalData(size_t sequenceCount,size_t tableEntriesNum,IndexEntryLocal * entries,size_t * entryOffsets)229     void initTableByExternalData(size_t sequenceCount, size_t tableEntriesNum, IndexEntryLocal *entries, size_t *entryOffsets) {
230         this->tableEntriesNum = tableEntriesNum;
231         this->size = sequenceCount;
232 
233         this->entries = entries;
234         this->offsets = entryOffsets;
235     }
236 
initTableByExternalDataCopy(size_t sequenceCount,size_t tableEntriesNum,IndexEntryLocal * entries,size_t * entryOffsets)237     void initTableByExternalDataCopy(size_t sequenceCount, size_t tableEntriesNum, IndexEntryLocal *entries, size_t *entryOffsets) {
238         this->tableEntriesNum = tableEntriesNum;
239         this->size = sequenceCount;
240 
241         this->entries = new(std::nothrow) IndexEntryLocal[tableEntriesNum];
242         Util::checkAllocation(entries, "Can not allocate " + SSTR(tableEntriesNum * sizeof(IndexEntryLocal)) + " bytes for entries in IndexTable::initMemory");
243         memcpy(this->entries, entries, tableEntriesNum * sizeof(IndexEntryLocal));
244 
245         memcpy(this->offsets, entryOffsets, (tableSize + 1) * sizeof(size_t));
246     }
247 
revertPointer()248     void revertPointer() {
249         for (size_t i = tableSize; i > 0; i--) {
250             offsets[i] = offsets[i - 1];
251         }
252         offsets[0] = 0;
253     }
254 
printStatistics(char * num2aa)255     void printStatistics(char *num2aa) {
256         const size_t top_N = 10;
257         std::pair<size_t, size_t> topElements[top_N];
258         for (size_t j = 0; j < top_N; j++) {
259             topElements[j].first = 0;
260         }
261 
262         size_t entrySize = 0;
263         size_t minKmer = 0;
264         size_t emptyKmer = 0;
265         for (size_t i = 0; i < tableSize; i++) {
266             const ptrdiff_t size = offsets[i + 1] - offsets[i];
267             minKmer = std::min(minKmer, (size_t) size);
268             entrySize += size;
269             if (size == 0) {
270                 emptyKmer++;
271             }
272             if (((size_t) size) < topElements[top_N - 1].first)
273                 continue;
274             for (size_t j = 0; j < top_N; j++) {
275                 if (topElements[j].first < ((size_t) size)) {
276                     topElements[j].first = static_cast<unsigned long>(size);
277                     topElements[j].second = i;
278                     break;
279                 }
280             }
281         }
282 
283         double avgKmer = ((double) entrySize) / ((double) tableSize);
284         Debug(Debug::INFO) << "Index statistics\n";
285         Debug(Debug::INFO) << "Entries:          " << entrySize << "\n";
286         Debug(Debug::INFO) << "DB size:          " << (entrySize * sizeof(IndexEntryLocal) + tableSize * sizeof(size_t))/1024/1024 << " MB\n";
287         Debug(Debug::INFO) << "Avg k-mer size:   " << avgKmer << "\n";
288         Debug(Debug::INFO) << "Top " << top_N << " k-mers\n";
289         for (size_t j = 0; j < top_N; j++) {
290             Debug(Debug::INFO) << "    ";
291             indexer->printKmer(topElements[j].second, kmerSize, num2aa);
292             Debug(Debug::INFO) << "\t" << topElements[j].first << "\n";
293         }
294     }
295 
296     // FUNCTIONS TO OVERWRITE
297     // add k-mers of the sequence to the index table
addSimilarSequence(Sequence * s,KmerGenerator * kmerGenerator,IndexEntryLocalTmp ** buffer,size_t & bufferSize,Indexer * idxer)298     void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, IndexEntryLocalTmp ** buffer, size_t &bufferSize, Indexer * idxer) {
299         // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
300         s->resetCurrPos();
301         idxer->reset();
302         size_t kmerPos = 0;
303         while(s->hasNextKmer()){
304             const unsigned char * kmer = s->nextKmer();
305             std::pair<size_t *, size_t> scoreMatrix = kmerGenerator->generateKmerList(kmer);
306             if(kmerPos+scoreMatrix.second >= bufferSize){
307                 *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
308                 bufferSize = bufferSize*2;
309             }
310             for(size_t i = 0; i < scoreMatrix.second; i++) {
311                 unsigned int kmerIdx = scoreMatrix.first[i];
312 
313                 // if region got masked do not add kmer
314                 if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
315                     continue;
316                 (*buffer)[kmerPos].kmer = kmerIdx;
317                 (*buffer)[kmerPos].seqId = s->getId();
318                 (*buffer)[kmerPos].position_j = s->getCurrentPosition();
319                 kmerPos++;
320             }
321 
322         }
323 
324         if(kmerPos>1){
325             SORT_SERIAL(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
326         }
327         unsigned int prevKmer = UINT_MAX;
328         for(size_t pos = 0; pos < kmerPos; pos++){
329             unsigned int kmerIdx = (*buffer)[pos].kmer;
330             if(kmerIdx != prevKmer){
331                 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
332                 IndexEntryLocal *entry = &entries[offset];
333                 entry->seqId      = (*buffer)[pos].seqId;
334                 entry->position_j = (*buffer)[pos].position_j;
335             }
336             prevKmer = kmerIdx;
337         }
338     }
339 
340     // add k-mers of the sequence to the index table
addSequence(Sequence * s,Indexer * idxer,IndexEntryLocalTmp ** buffer,size_t bufferSize,int threshold,char * diagonalScore)341     void addSequence (Sequence* s, Indexer * idxer,
342                       IndexEntryLocalTmp ** buffer, size_t bufferSize,
343                       int threshold, char * diagonalScore){
344         // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
345         s->resetCurrPos();
346         idxer->reset();
347         size_t kmerPos = 0;
348         bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
349                         Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
350         while (s->hasNextKmer()){
351             const unsigned char * kmer = s->nextKmer();
352             if(removeX && s->kmerContainsX()){
353                 continue;
354             }
355             if(threshold > 0) {
356                 int score = 0;
357                 for (int pos = 0; pos < kmerSize; pos++) {
358                     score += diagonalScore[kmer[pos]];
359                 }
360                 if (score < threshold) {
361                     continue;
362                 }
363             }
364             unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize);
365             // if region got masked do not add kmer
366             if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
367                 continue;
368 
369             (*buffer)[kmerPos].kmer = kmerIdx;
370             (*buffer)[kmerPos].seqId      = s->getId();
371             (*buffer)[kmerPos].position_j = s->getCurrentPosition();
372             kmerPos++;
373             if(kmerPos >= bufferSize){
374                 *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
375                 bufferSize = bufferSize*2;
376             }
377         }
378 
379         if(kmerPos>1){
380             SORT_SERIAL(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
381         }
382 
383         unsigned int prevKmer = UINT_MAX;
384         for(size_t pos = 0; pos < kmerPos; pos++){
385             unsigned int kmerIdx = (*buffer)[pos].kmer;
386             if(kmerIdx != prevKmer){
387                 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
388                 IndexEntryLocal *entry = &entries[offset];
389                 entry->seqId      = (*buffer)[pos].seqId;
390                 entry->position_j = (*buffer)[pos].position_j;
391             }
392             prevKmer = kmerIdx;
393         }
394     }
395 
396     // prints the IndexTable
print(char * num2aa)397     void print(char *num2aa) {
398         for (size_t i = 0; i < tableSize; i++) {
399             ptrdiff_t entrySize = offsets[i + 1] - offsets[i];
400             if (entrySize > 0) {
401                 indexer->printKmer(i, kmerSize, num2aa);
402 
403                 Debug(Debug::INFO) << "\n";
404                 IndexEntryLocal *e = &entries[offsets[i]];
405                 for (unsigned int j = 0; j < entrySize; j++) {
406                     Debug(Debug::INFO) << "\t(" << e[j].seqId << ", " << e[j].position_j << ")\n";
407                 }
408             }
409         }
410     };
411 
412     // get amount of sequences in Index
getSize()413     size_t getSize() { return size; };
414 
415     // returns the size of  table entries
getTableEntriesNum()416     uint64_t getTableEntriesNum() { return tableEntriesNum; };
417 
418     // returns table size
getTableSize()419     size_t getTableSize() { return tableSize; };
420 
421     // returns the size of the entry (int for global) (IndexEntryLocal for local)
getSizeOfEntry()422     size_t getSizeOfEntry() { return sizeof(IndexEntryLocal); }
423 
getKmerSize()424     int getKmerSize() {
425         return kmerSize;
426     }
427 
getAlphabetSize()428     int getAlphabetSize() {
429         return alphabetSize;
430     }
431 
computeKmerSize(size_t aaSize)432     static int computeKmerSize(size_t aaSize) {
433         return aaSize < getUpperBoundAACountForKmerSize(6) ? 6 : 7;
434     }
435 
436 
getUpperBoundAACountForKmerSize(int kmerSize)437     static size_t getUpperBoundAACountForKmerSize(int kmerSize) {
438         switch (kmerSize) {
439             case 6:
440                 return 3350000000;
441             case 7:
442                 return (SIZE_MAX - 1); // SIZE_MAX is often reserved as safe flag
443             default:
444                 Debug(Debug::ERROR) << "Invalid kmer size of " << kmerSize << "!\n";
445                 EXIT(EXIT_FAILURE);
446         }
447     }
448 
getUpperBoundNucCountForKmerSize(int kmerSize)449     static size_t getUpperBoundNucCountForKmerSize(int kmerSize) {
450         switch (kmerSize) {
451             case 14:
452                 return 3350000000;
453             case 15:
454                 return (SIZE_MAX - 1); // SIZE_MAX is often reserved as safe flag
455             default:
456                 Debug(Debug::ERROR) << "Invalid kmer size of " << kmerSize << "!\n";
457                 EXIT(EXIT_FAILURE);
458         }
459     }
460 
461 protected:
462     // alphabetSize**kmerSize
463     const size_t tableSize;
464     const int alphabetSize;
465     const int kmerSize;
466 
467     // external data from mmap
468     const bool externalData;
469 
470     // number of entries in all sequence lists - must be 64bit
471     uint64_t tableEntriesNum;
472     // number of sequences in Index
473     size_t size;
474 
475     Indexer *indexer;
476 
477     // Index table entries: ids of sequences containing a certain k-mer, stored sequentially in the memory
478     IndexEntryLocal *entries;
479     size_t *offsets;
480 
481     // sequence lookup
482     SequenceLookup *sequenceLookup;
483 };
484 #endif
485