1 #ifndef INDEX_TABLE_H 2 #define INDEX_TABLE_H 3 4 // 5 // Written by Martin Steinegger martin.steinegger@snu.ac.kr and Maria Hauser mhauser@genzentrum.lmu.de 6 // 7 // Abstract: Index table stores the list of DB sequences containing a certain k-mer, for each k-mer. 8 // 9 10 #include "DBReader.h" 11 #include "Sequence.h" 12 #include "Indexer.h" 13 #include "Debug.h" 14 #include "Util.h" 15 #include "SequenceLookup.h" 16 #include "MathUtil.h" 17 #include "KmerGenerator.h" 18 #include "Parameters.h" 19 #include "FastSort.h" 20 #include <stdlib.h> 21 #include <algorithm> 22 23 // IndexEntryLocal is an entry with position and seqId for a kmer 24 // structure needs to be packed or it will need 8 bytes instead of 6 25 struct __attribute__((__packed__)) IndexEntryLocal { 26 unsigned int seqId; 27 unsigned short position_j; comapreByIdAndPosIndexEntryLocal28 static bool comapreByIdAndPos(IndexEntryLocal first, IndexEntryLocal second){ 29 if(first.seqId < second.seqId ) 30 return true; 31 if(second.seqId < first.seqId ) 32 return false; 33 if(first.position_j < second.position_j ) 34 return true; 35 if(second.position_j < first.position_j ) 36 return false; 37 return false; 38 } 39 }; 40 41 struct __attribute__((__packed__)) IndexEntryLocalTmp { 42 unsigned int kmer; 43 unsigned int seqId; 44 unsigned short position_j; 45 IndexEntryLocalTmpIndexEntryLocalTmp46 IndexEntryLocalTmp(unsigned int kmer, unsigned int seqId, unsigned short position_j) 47 :kmer(kmer),seqId(seqId), position_j(position_j) 48 {} 49 IndexEntryLocalTmpIndexEntryLocalTmp50 IndexEntryLocalTmp() {} 51 comapreByIdAndPosIndexEntryLocalTmp52 static bool comapreByIdAndPos(IndexEntryLocalTmp first, IndexEntryLocalTmp second){ 53 if(first.kmer < second.kmer ) 54 return true; 55 if(second.kmer < first.kmer ) 56 return false; 57 if(first.position_j < second.position_j ) 58 return true; 59 if(second.position_j < first.position_j ) 60 return false; 61 return false; 62 } 63 }; 64 65 class IndexTable { 66 public: IndexTable(int alphabetSize,int kmerSize,bool externalData)67 IndexTable(int alphabetSize, int kmerSize, bool externalData) 68 : tableSize(MathUtil::ipow<size_t>(alphabetSize, kmerSize)), alphabetSize(alphabetSize), 69 kmerSize(kmerSize), externalData(externalData), tableEntriesNum(0), size(0), 70 indexer(new Indexer(alphabetSize, kmerSize)), entries(NULL), offsets(NULL) { 71 if (externalData == false) { 72 offsets = new(std::nothrow) size_t[tableSize + 1]; 73 Util::checkAllocation(offsets, "Can not allocate entries memory in IndexTable"); 74 memset(offsets, 0, (tableSize + 1) * sizeof(size_t)); 75 } 76 } 77 ~IndexTable()78 virtual ~IndexTable() { 79 deleteEntries(); 80 delete indexer; 81 } 82 deleteEntries()83 void deleteEntries() { 84 if (externalData == false) { 85 if (entries != NULL) { 86 delete[] entries; 87 entries = NULL; 88 } 89 if (offsets != NULL) { 90 delete[] offsets; 91 offsets = NULL; 92 } 93 } 94 } 95 96 // count k-mers in the sequence, so enough memory for the sequence lists can be allocated in the end addSimilarKmerCount(Sequence * s,KmerGenerator * kmerGenerator)97 size_t addSimilarKmerCount(Sequence* s, KmerGenerator* kmerGenerator){ 98 s->resetCurrPos(); 99 std::vector<unsigned int> seqKmerPosBuffer; 100 101 //idxer->reset(); 102 while(s->hasNextKmer()){ 103 const unsigned char * kmer = s->nextKmer(); 104 const std::pair<size_t *, size_t> kmerList = kmerGenerator->generateKmerList(kmer); 105 106 //unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize); 107 for(size_t i = 0; i < kmerList.second; i++){ 108 seqKmerPosBuffer.push_back(kmerList.first[i]); 109 } 110 } 111 if(seqKmerPosBuffer.size() > 1){ 112 SORT_SERIAL(seqKmerPosBuffer.begin(), seqKmerPosBuffer.end()); 113 } 114 size_t countUniqKmer = 0; 115 unsigned int prevKmerIdx = UINT_MAX; 116 for(size_t i = 0; i < seqKmerPosBuffer.size(); i++){ 117 unsigned int kmerIdx = seqKmerPosBuffer[i]; 118 if(prevKmerIdx != kmerIdx){ 119 //table[kmerIdx] += 1; 120 // size increases by one 121 __sync_fetch_and_add(&(offsets[kmerIdx]), 1); 122 countUniqKmer++; 123 } 124 prevKmerIdx = kmerIdx; 125 } 126 return countUniqKmer; 127 } 128 129 // count k-mers in the sequence, so enough memory for the sequence lists can be allocated in the end addKmerCount(Sequence * s,Indexer * idxer,unsigned int * seqKmerPosBuffer,int threshold,char * diagonalScore)130 size_t addKmerCount(Sequence *s, Indexer *idxer, unsigned int *seqKmerPosBuffer, 131 int threshold, char *diagonalScore) { 132 s->resetCurrPos(); 133 size_t countKmer = 0; 134 bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) || 135 Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS)); 136 while(s->hasNextKmer()){ 137 const unsigned char * kmer = s->nextKmer(); 138 if(removeX && s->kmerContainsX()){ 139 continue; 140 } 141 if(threshold > 0){ 142 int score = 0; 143 for(int pos = 0; pos < kmerSize; pos++){ 144 score += diagonalScore[kmer[pos]]; 145 } 146 if(score < threshold){ 147 continue; 148 } 149 } 150 unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize); 151 seqKmerPosBuffer[countKmer] = kmerIdx; 152 countKmer++; 153 } 154 if(countKmer > 1){ 155 SORT_SERIAL(seqKmerPosBuffer, seqKmerPosBuffer + countKmer); 156 } 157 size_t countUniqKmer = 0; 158 unsigned int prevKmerIdx = UINT_MAX; 159 for(size_t i = 0; i < countKmer; i++){ 160 unsigned int kmerIdx = seqKmerPosBuffer[i]; 161 if(prevKmerIdx != kmerIdx){ 162 //table[kmerIdx] += 1; 163 // size increases by one 164 __sync_fetch_and_add(&(offsets[kmerIdx]), 1); 165 countUniqKmer++; 166 } 167 prevKmerIdx = kmerIdx; 168 } 169 return countUniqKmer; 170 } 171 172 // get list of DB sequences containing this k-mer getDBSeqList(size_t kmer,size_t * matchedListSize)173 inline IndexEntryLocal *getDBSeqList(size_t kmer, size_t *matchedListSize) { 174 const ptrdiff_t diff = offsets[kmer + 1] - offsets[kmer]; 175 *matchedListSize = static_cast<size_t>(diff); 176 return (entries + offsets[kmer]); 177 } 178 sortDBSeqLists()179 void sortDBSeqLists() { 180 #pragma omp parallel for 181 for (size_t i = 0; i < tableSize; i++) { 182 size_t entrySize; 183 IndexEntryLocal *entries = getDBSeqList(i, &entrySize); 184 SORT_SERIAL(entries, entries + entrySize, IndexEntryLocal::comapreByIdAndPos); 185 } 186 } 187 188 // get pointer to entries array getEntries()189 IndexEntryLocal *getEntries() { 190 return entries; 191 } 192 getOffset(size_t kmer)193 inline size_t getOffset(size_t kmer) { 194 return offsets[kmer]; 195 } 196 getOffsets()197 size_t *getOffsets() { 198 return offsets; 199 } 200 201 // init the arrays for the sequence lists initMemory(size_t dbSize)202 void initMemory(size_t dbSize) { 203 size_t tableEntriesNum = 0; 204 for (size_t i = 0; i < getTableSize(); i++) { 205 tableEntriesNum += getOffset(i); 206 } 207 208 this->tableEntriesNum = tableEntriesNum; 209 this->size = dbSize; // amount of sequences added 210 211 // allocate memory for the sequence id lists 212 entries = new(std::nothrow) IndexEntryLocal[tableEntriesNum]; 213 Util::checkAllocation(entries, "Can not allocate entries memory in IndexTable::initMemory"); 214 } 215 216 // allocates memory for index tables init()217 void init() { 218 // set the pointers in the index table to the start of the list for a certain k-mer 219 size_t offset = 0; 220 for (size_t i = 0; i < tableSize; i++) { 221 const size_t currentOffset = offsets[i]; 222 offsets[i] = offset; 223 offset += currentOffset; 224 } 225 offsets[tableSize] = offset; 226 } 227 228 // init index table with external data (needed for index readin) initTableByExternalData(size_t sequenceCount,size_t tableEntriesNum,IndexEntryLocal * entries,size_t * entryOffsets)229 void initTableByExternalData(size_t sequenceCount, size_t tableEntriesNum, IndexEntryLocal *entries, size_t *entryOffsets) { 230 this->tableEntriesNum = tableEntriesNum; 231 this->size = sequenceCount; 232 233 this->entries = entries; 234 this->offsets = entryOffsets; 235 } 236 initTableByExternalDataCopy(size_t sequenceCount,size_t tableEntriesNum,IndexEntryLocal * entries,size_t * entryOffsets)237 void initTableByExternalDataCopy(size_t sequenceCount, size_t tableEntriesNum, IndexEntryLocal *entries, size_t *entryOffsets) { 238 this->tableEntriesNum = tableEntriesNum; 239 this->size = sequenceCount; 240 241 this->entries = new(std::nothrow) IndexEntryLocal[tableEntriesNum]; 242 Util::checkAllocation(entries, "Can not allocate " + SSTR(tableEntriesNum * sizeof(IndexEntryLocal)) + " bytes for entries in IndexTable::initMemory"); 243 memcpy(this->entries, entries, tableEntriesNum * sizeof(IndexEntryLocal)); 244 245 memcpy(this->offsets, entryOffsets, (tableSize + 1) * sizeof(size_t)); 246 } 247 revertPointer()248 void revertPointer() { 249 for (size_t i = tableSize; i > 0; i--) { 250 offsets[i] = offsets[i - 1]; 251 } 252 offsets[0] = 0; 253 } 254 printStatistics(char * num2aa)255 void printStatistics(char *num2aa) { 256 const size_t top_N = 10; 257 std::pair<size_t, size_t> topElements[top_N]; 258 for (size_t j = 0; j < top_N; j++) { 259 topElements[j].first = 0; 260 } 261 262 size_t entrySize = 0; 263 size_t minKmer = 0; 264 size_t emptyKmer = 0; 265 for (size_t i = 0; i < tableSize; i++) { 266 const ptrdiff_t size = offsets[i + 1] - offsets[i]; 267 minKmer = std::min(minKmer, (size_t) size); 268 entrySize += size; 269 if (size == 0) { 270 emptyKmer++; 271 } 272 if (((size_t) size) < topElements[top_N - 1].first) 273 continue; 274 for (size_t j = 0; j < top_N; j++) { 275 if (topElements[j].first < ((size_t) size)) { 276 topElements[j].first = static_cast<unsigned long>(size); 277 topElements[j].second = i; 278 break; 279 } 280 } 281 } 282 283 double avgKmer = ((double) entrySize) / ((double) tableSize); 284 Debug(Debug::INFO) << "Index statistics\n"; 285 Debug(Debug::INFO) << "Entries: " << entrySize << "\n"; 286 Debug(Debug::INFO) << "DB size: " << (entrySize * sizeof(IndexEntryLocal) + tableSize * sizeof(size_t))/1024/1024 << " MB\n"; 287 Debug(Debug::INFO) << "Avg k-mer size: " << avgKmer << "\n"; 288 Debug(Debug::INFO) << "Top " << top_N << " k-mers\n"; 289 for (size_t j = 0; j < top_N; j++) { 290 Debug(Debug::INFO) << " "; 291 indexer->printKmer(topElements[j].second, kmerSize, num2aa); 292 Debug(Debug::INFO) << "\t" << topElements[j].first << "\n"; 293 } 294 } 295 296 // FUNCTIONS TO OVERWRITE 297 // add k-mers of the sequence to the index table addSimilarSequence(Sequence * s,KmerGenerator * kmerGenerator,IndexEntryLocalTmp ** buffer,size_t & bufferSize,Indexer * idxer)298 void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, IndexEntryLocalTmp ** buffer, size_t &bufferSize, Indexer * idxer) { 299 // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy) 300 s->resetCurrPos(); 301 idxer->reset(); 302 size_t kmerPos = 0; 303 while(s->hasNextKmer()){ 304 const unsigned char * kmer = s->nextKmer(); 305 std::pair<size_t *, size_t> scoreMatrix = kmerGenerator->generateKmerList(kmer); 306 if(kmerPos+scoreMatrix.second >= bufferSize){ 307 *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2)); 308 bufferSize = bufferSize*2; 309 } 310 for(size_t i = 0; i < scoreMatrix.second; i++) { 311 unsigned int kmerIdx = scoreMatrix.first[i]; 312 313 // if region got masked do not add kmer 314 if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0) 315 continue; 316 (*buffer)[kmerPos].kmer = kmerIdx; 317 (*buffer)[kmerPos].seqId = s->getId(); 318 (*buffer)[kmerPos].position_j = s->getCurrentPosition(); 319 kmerPos++; 320 } 321 322 } 323 324 if(kmerPos>1){ 325 SORT_SERIAL(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos); 326 } 327 unsigned int prevKmer = UINT_MAX; 328 for(size_t pos = 0; pos < kmerPos; pos++){ 329 unsigned int kmerIdx = (*buffer)[pos].kmer; 330 if(kmerIdx != prevKmer){ 331 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1); 332 IndexEntryLocal *entry = &entries[offset]; 333 entry->seqId = (*buffer)[pos].seqId; 334 entry->position_j = (*buffer)[pos].position_j; 335 } 336 prevKmer = kmerIdx; 337 } 338 } 339 340 // add k-mers of the sequence to the index table addSequence(Sequence * s,Indexer * idxer,IndexEntryLocalTmp ** buffer,size_t bufferSize,int threshold,char * diagonalScore)341 void addSequence (Sequence* s, Indexer * idxer, 342 IndexEntryLocalTmp ** buffer, size_t bufferSize, 343 int threshold, char * diagonalScore){ 344 // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy) 345 s->resetCurrPos(); 346 idxer->reset(); 347 size_t kmerPos = 0; 348 bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) || 349 Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS)); 350 while (s->hasNextKmer()){ 351 const unsigned char * kmer = s->nextKmer(); 352 if(removeX && s->kmerContainsX()){ 353 continue; 354 } 355 if(threshold > 0) { 356 int score = 0; 357 for (int pos = 0; pos < kmerSize; pos++) { 358 score += diagonalScore[kmer[pos]]; 359 } 360 if (score < threshold) { 361 continue; 362 } 363 } 364 unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize); 365 // if region got masked do not add kmer 366 if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0) 367 continue; 368 369 (*buffer)[kmerPos].kmer = kmerIdx; 370 (*buffer)[kmerPos].seqId = s->getId(); 371 (*buffer)[kmerPos].position_j = s->getCurrentPosition(); 372 kmerPos++; 373 if(kmerPos >= bufferSize){ 374 *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2)); 375 bufferSize = bufferSize*2; 376 } 377 } 378 379 if(kmerPos>1){ 380 SORT_SERIAL(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos); 381 } 382 383 unsigned int prevKmer = UINT_MAX; 384 for(size_t pos = 0; pos < kmerPos; pos++){ 385 unsigned int kmerIdx = (*buffer)[pos].kmer; 386 if(kmerIdx != prevKmer){ 387 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1); 388 IndexEntryLocal *entry = &entries[offset]; 389 entry->seqId = (*buffer)[pos].seqId; 390 entry->position_j = (*buffer)[pos].position_j; 391 } 392 prevKmer = kmerIdx; 393 } 394 } 395 396 // prints the IndexTable print(char * num2aa)397 void print(char *num2aa) { 398 for (size_t i = 0; i < tableSize; i++) { 399 ptrdiff_t entrySize = offsets[i + 1] - offsets[i]; 400 if (entrySize > 0) { 401 indexer->printKmer(i, kmerSize, num2aa); 402 403 Debug(Debug::INFO) << "\n"; 404 IndexEntryLocal *e = &entries[offsets[i]]; 405 for (unsigned int j = 0; j < entrySize; j++) { 406 Debug(Debug::INFO) << "\t(" << e[j].seqId << ", " << e[j].position_j << ")\n"; 407 } 408 } 409 } 410 }; 411 412 // get amount of sequences in Index getSize()413 size_t getSize() { return size; }; 414 415 // returns the size of table entries getTableEntriesNum()416 uint64_t getTableEntriesNum() { return tableEntriesNum; }; 417 418 // returns table size getTableSize()419 size_t getTableSize() { return tableSize; }; 420 421 // returns the size of the entry (int for global) (IndexEntryLocal for local) getSizeOfEntry()422 size_t getSizeOfEntry() { return sizeof(IndexEntryLocal); } 423 getKmerSize()424 int getKmerSize() { 425 return kmerSize; 426 } 427 getAlphabetSize()428 int getAlphabetSize() { 429 return alphabetSize; 430 } 431 computeKmerSize(size_t aaSize)432 static int computeKmerSize(size_t aaSize) { 433 return aaSize < getUpperBoundAACountForKmerSize(6) ? 6 : 7; 434 } 435 436 getUpperBoundAACountForKmerSize(int kmerSize)437 static size_t getUpperBoundAACountForKmerSize(int kmerSize) { 438 switch (kmerSize) { 439 case 6: 440 return 3350000000; 441 case 7: 442 return (SIZE_MAX - 1); // SIZE_MAX is often reserved as safe flag 443 default: 444 Debug(Debug::ERROR) << "Invalid kmer size of " << kmerSize << "!\n"; 445 EXIT(EXIT_FAILURE); 446 } 447 } 448 getUpperBoundNucCountForKmerSize(int kmerSize)449 static size_t getUpperBoundNucCountForKmerSize(int kmerSize) { 450 switch (kmerSize) { 451 case 14: 452 return 3350000000; 453 case 15: 454 return (SIZE_MAX - 1); // SIZE_MAX is often reserved as safe flag 455 default: 456 Debug(Debug::ERROR) << "Invalid kmer size of " << kmerSize << "!\n"; 457 EXIT(EXIT_FAILURE); 458 } 459 } 460 461 protected: 462 // alphabetSize**kmerSize 463 const size_t tableSize; 464 const int alphabetSize; 465 const int kmerSize; 466 467 // external data from mmap 468 const bool externalData; 469 470 // number of entries in all sequence lists - must be 64bit 471 uint64_t tableEntriesNum; 472 // number of sequences in Index 473 size_t size; 474 475 Indexer *indexer; 476 477 // Index table entries: ids of sequences containing a certain k-mer, stored sequentially in the memory 478 IndexEntryLocal *entries; 479 size_t *offsets; 480 481 // sequence lookup 482 SequenceLookup *sequenceLookup; 483 }; 484 #endif 485