1 /*
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 File name: kmercounts.cpp
29
30 Author: Greg Boratyn
31
32 Contents: Implementation of k-mer counting classes
33
34 ******************************************************************************/
35
36
37 #include <ncbi_pch.hpp>
38
39 #include <math.h>
40
41 #include <corelib/ncbistre.hpp>
42 #include <corelib/ncbi_limits.hpp>
43 #include <corelib/ncbiexpt.hpp>
44
45 #include <objmgr/seq_vector.hpp>
46
47 #include <algo/cobalt/kmercounts.hpp>
48
49
50 USING_NCBI_SCOPE;
51 USING_SCOPE(cobalt);
52
53 // Default values for default params
54 unsigned int CSparseKmerCounts::sm_KmerLength = 4;
55 unsigned int CSparseKmerCounts::sm_AlphabetSize = kAlphabetSize;
56 vector<Uint1> CSparseKmerCounts::sm_TransTable;
57 bool CSparseKmerCounts::sm_UseCompressed = false;
58 CSparseKmerCounts::TCount* CSparseKmerCounts::sm_Buffer = NULL;
59 bool CSparseKmerCounts::sm_ForceSmallerMem = false;
60
61 unsigned int CBinaryKmerCounts::sm_KmerLength = 3;
62 unsigned int CBinaryKmerCounts::sm_AlphabetSize = kAlphabetSize;
63 vector<Uint1> CBinaryKmerCounts::sm_TransTable;
64 bool CBinaryKmerCounts::sm_UseCompressed = false;
65
66 static const Uint1 kXaa = 21;
67
68
CSparseKmerCounts(const objects::CSeq_loc & seq,objects::CScope & scope)69 CSparseKmerCounts::CSparseKmerCounts(const objects::CSeq_loc& seq,
70 objects::CScope& scope)
71 {
72 Reset(seq, scope);
73 }
74
75 // Initialize position bit vector for k-mer counting
76 // @param sv Sequence [in]
77 // @param pos Index of the k-mer count [out]
78 // @param index Index of letter in sequence [in|out]
79 // @param num_bits Number of bits per letter in pos [in]
80 // @param kmer_len K-mer length [in]
81 // @return True if a valid k-mer was found, false otherwise
InitPosBits(const objects::CSeqVector & sv,Uint4 & pos,unsigned int & index,Uint4 num_bits,Uint4 kmer_len)82 bool CSparseKmerCounts::InitPosBits(const objects::CSeqVector& sv, Uint4& pos,
83 unsigned int& index,
84 Uint4 num_bits, Uint4 kmer_len)
85
86 {
87 pos = 0;
88 unsigned i = 0;
89 while (index + kmer_len - 1 < sv.size() && i < kmer_len) {
90
91 // Skip kmers that contain X (unspecified aa)
92 if (sv[index + i] == kXaa) {
93 index += i + 1;
94
95 pos = 0;
96 i = 0;
97 continue;
98 }
99 pos |= GetAALetter(sv[index + i]) << (num_bits * (kmer_len - i - 1));
100 i++;
101 }
102
103 if (i < kmer_len) {
104 return false;
105 }
106
107 index += i;
108 return true;
109 }
110
111
MarkUsed(Uint4 pos,vector<Uint4> & entries,int chunk)112 static void MarkUsed(Uint4 pos, vector<Uint4>& entries, int chunk)
113 {
114 int index = pos / chunk;
115 int offset = pos - index * chunk;
116 Uint4 mask = 0x80000000 >> offset;
117 entries[index] |= mask;
118 }
119
120
ReserveCountsMem(unsigned int num_bits)121 CSparseKmerCounts::TCount* CSparseKmerCounts::ReserveCountsMem(
122 unsigned int num_bits)
123 {
124 Uint4 num_elements;
125 TCount* counts = NULL;
126
127 // Reserve memory for storing counts
128 // there are two methods for indexing counts (see the Reset() method)
129 // if memory cannot be allocated try to allocate for the second method
130 // that requires less memory
131 if (!sm_ForceSmallerMem && sm_KmerLength * num_bits
132 < kLengthBitsThreshold) {
133
134 num_elements = 1 << (num_bits * sm_KmerLength);
135 try {
136 counts = new TCount[num_elements];
137 }
138 catch (std::bad_alloc) {
139 sm_ForceSmallerMem = true;
140 num_elements = (Uint4)pow((double)sm_AlphabetSize,
141 (double)sm_KmerLength);
142
143 try {
144 counts = new TCount[num_elements];
145 }
146 catch (std::bad_alloc) {
147 NCBI_THROW(CKmerCountsException, eMemoryAllocation,
148 "Memory cannot be allocated for k-mer counting."
149 " Try using compressed alphabet or smaller k.");
150 }
151
152 }
153 }
154 return counts;
155 }
156
Reset(const objects::CSeq_loc & seq,objects::CScope & scope)157 void CSparseKmerCounts::Reset(const objects::CSeq_loc& seq,
158 objects::CScope& scope)
159 {
160 unsigned int kmer_len = sm_KmerLength;
161 unsigned int alphabet_size = sm_AlphabetSize;
162
163 _ASSERT(kmer_len > 0 && alphabet_size > 0);
164
165 if (sm_UseCompressed && sm_TransTable.empty()) {
166 NCBI_THROW(CKmerCountsException, eInvalidOptions,
167 "Compressed alphabet selected, but translation table not"
168 " specified");
169 }
170
171 if (!seq.IsWhole() && !seq.IsInt()) {
172 NCBI_THROW(CKmerCountsException, eUnsupportedSeqLoc,
173 "Unsupported SeqLoc encountered");
174 }
175
176 _ASSERT(seq.GetId());
177 objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
178
179 unsigned int num_elements;
180 unsigned int seq_len = sv.size();
181
182 m_SeqLength = sv.size();
183 m_Counts.clear();
184 m_NumCounts = 0;
185
186 if (m_SeqLength < kmer_len) {
187 NCBI_THROW(CKmerCountsException, eBadSequence,
188 "Sequence shorter than desired k-mer length");
189 }
190
191 // Compute number of bits needed to represent all letters
192 unsigned int mask = 1;
193 int num = 0;
194 while (alphabet_size > mask) {
195 mask <<= 1;
196 num++;
197 }
198 const int kNumBits = num;
199
200 TCount* counts = sm_Buffer ? sm_Buffer : ReserveCountsMem(kNumBits);
201
202 // Vecotr of counts is first computed using regular vector that is later
203 // converted to the sparse vector (list of position-value pairs).
204 // Positions are calculated as binary representations of k-mers, if they
205 // fit in 32 bits. Otherwise as numbers in system with base alphabet size.
206 if (!sm_ForceSmallerMem && kmer_len * kNumBits < kLengthBitsThreshold) {
207
208 num_elements = 1 << (kNumBits * kmer_len);
209 const Uint4 kMask = num_elements - (1 << kNumBits);
210
211 _ASSERT(counts);
212 memset(counts, 0, num_elements * sizeof(TCount));
213
214 const int kBitChunk = sizeof(Uint4) * 8;
215
216 // Vector indicating non-zero elements
217 vector<Uint4> used_entries(num_elements / kBitChunk + 1);
218
219 //first k-mer
220 Uint4 i = 0;
221 Uint4 pos;
222 bool is_pos = InitPosBits(sv, pos, i, kNumBits, kmer_len);
223 if (is_pos) {
224 _ASSERT(pos < num_elements);
225 counts[pos]++;
226 MarkUsed(pos, used_entries, kBitChunk);
227 m_NumCounts++;
228
229 //for each next kmer
230 for (;i < seq_len && is_pos;i++) {
231
232 if (GetAALetter(sv[i]) >= alphabet_size) {
233 NCBI_THROW(CKmerCountsException, eBadSequence,
234 "Letter out of alphabet in sequnece");
235 }
236
237 // Kmers that contain unspecified amino acid X are not
238 // considered
239 if (sv[i] == kXaa) {
240 i++;
241 is_pos = InitPosBits(sv, pos, i, kNumBits, kmer_len);
242
243 if (i >= seq_len || !is_pos) {
244 break;
245 }
246 }
247
248 pos <<= kNumBits;
249 pos &= kMask;
250 pos |= GetAALetter(sv[i]);
251 _ASSERT(pos < num_elements);
252 counts[pos]++;
253 MarkUsed(pos, used_entries, kBitChunk);
254 m_NumCounts++;
255 }
256 }
257
258 // Convert to sparse vector
259 m_Counts.reserve(m_SeqLength - kmer_len + 1);
260 size_t ind = 0;
261 Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
262 while (ind < num_elements / kBitChunk + 1) {
263
264 // find next chunk with at least one non-zero count
265 while (ind < num_bit_chunks && used_entries[ind] == 0) {
266 ind++;
267 }
268
269 if (ind == num_bit_chunks) {
270 break;
271 }
272
273 // find the set bit and get position in the counts vector
274 for (Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
275 j++, mask>>=1) {
276 _ASSERT(j < 32);
277 if ((used_entries[ind] & mask) != 0) {
278 pos = ind * kBitChunk + j;
279
280 _ASSERT(counts[pos] > 0);
281 m_Counts.push_back(SVectorElement(pos, counts[pos]));
282
283 used_entries[ind] ^= mask;
284 }
285 }
286 ind++;
287 }
288
289 }
290 else {
291 _ASSERT(pow((double)alphabet_size, (double)kmer_len)
292 < numeric_limits<Uint4>::max());
293
294 AutoArray<double> base(kmer_len);
295 for (Uint4 i=0;i < kmer_len;i++) {
296 base[i] = pow((double)alphabet_size, (double)i);
297 }
298
299 num_elements = (Uint4)pow((double)alphabet_size, (double)kmer_len);
300
301 _ASSERT(counts);
302 memset(counts, 0, num_elements * sizeof(TCount));
303
304 // Vector indicating non-zero elements
305 const int kBitChunk = sizeof(Uint4) * 8;
306 vector<Uint4> used_entries(num_elements / kBitChunk + 1);
307
308 Uint4 pos;
309 for (unsigned i=0;i < seq_len - kmer_len + 1;i++) {
310
311 // Kmers that contain unspecified amino acid X are not considered
312 if (sv[i + kmer_len - 1] == kXaa) {
313 i += kmer_len - 1;
314 continue;
315 }
316
317 pos = GetAALetter(sv[i]) - 1;
318 _ASSERT(GetAALetter(sv[i]) <= alphabet_size);
319 for (Uint4 j=1;j < kmer_len;j++) {
320 pos += (Uint4)(((double)GetAALetter(sv[i + j]) - 1) * base[j]);
321 _ASSERT(GetAALetter(sv[i + j]) <= alphabet_size);
322 }
323 counts[pos]++;
324 MarkUsed(pos, used_entries, kBitChunk);
325 m_NumCounts++;
326 }
327
328 // Convert to sparse vector
329 m_Counts.reserve(m_SeqLength - kmer_len + 1);
330 size_t ind = 0;
331 Uint4 num_bit_chunks = num_elements / kBitChunk + 1;
332 while (ind < num_elements / kBitChunk + 1) {
333
334 // find next chunk with at least one non-zero count
335 while (ind < num_bit_chunks && used_entries[ind] == 0) {
336 ind++;
337 }
338
339 if (ind == num_bit_chunks) {
340 break;
341 }
342
343 // find the set bit and get position in the counts vector
344 for (Uint4 mask=0x80000000,j=0;used_entries[ind] != 0;
345 j++, mask>>=1) {
346 _ASSERT(j < 32);
347 if ((used_entries[ind] & mask) != 0) {
348 pos = ind * kBitChunk + j;
349
350 _ASSERT(counts[pos] > 0);
351 m_Counts.push_back(SVectorElement(pos, counts[pos]));
352
353 used_entries[ind] ^= mask;
354 }
355 }
356 ind++;
357 }
358 }
359
360 if (!sm_Buffer && counts) {
361 delete [] counts;
362 }
363
364 }
365
FractionCommonKmersDist(const CSparseKmerCounts & v1,const CSparseKmerCounts & v2)366 double CSparseKmerCounts::FractionCommonKmersDist(
367 const CSparseKmerCounts& v1,
368 const CSparseKmerCounts& v2)
369 {
370 _ASSERT(GetKmerLength() > 0);
371
372 unsigned int num_common = CountCommonKmers(v1, v2, true);
373
374 unsigned int num_counts1 = v1.GetNumCounts();
375 unsigned int num_counts2 = v2.GetNumCounts();
376 unsigned int fewer_counts
377 = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
378
379 // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
380 // SeqLen - k + 1 that is equal to number of counts only if sequence
381 // does not contain Xaa.
382 return 1.0 - (double)num_common / (double)fewer_counts;
383 }
384
385
FractionCommonKmersGlobalDist(const CSparseKmerCounts & v1,const CSparseKmerCounts & v2)386 double CSparseKmerCounts::FractionCommonKmersGlobalDist(
387 const CSparseKmerCounts& v1,
388 const CSparseKmerCounts& v2)
389 {
390 _ASSERT(GetKmerLength() > 0);
391
392 unsigned int num_common = CountCommonKmers(v1, v2, true);
393
394 unsigned int num_counts1 = v1.GetNumCounts();
395 unsigned int num_counts2 = v2.GetNumCounts();
396 unsigned int more_counts
397 = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
398
399 // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
400 // SeqLen - k + 1 that is equal to number of counts only if sequence
401 // does not contain Xaa.
402 return 1.0 - (double)num_common / (double)more_counts;
403 }
404
405
CountCommonKmers(const CSparseKmerCounts & vect1,const CSparseKmerCounts & vect2,bool repetitions)406 unsigned int CSparseKmerCounts::CountCommonKmers(
407 const CSparseKmerCounts& vect1,
408 const CSparseKmerCounts& vect2,
409 bool repetitions)
410
411 {
412
413 unsigned int result = 0;
414 TNonZeroCounts_CI it1 = vect1.m_Counts.begin();
415 TNonZeroCounts_CI it2 = vect2.m_Counts.begin();
416
417 // Iterating through non zero counts in both vectors
418 do {
419 // For each vector element that is non zero in vect1 and vect2
420 while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
421 && it1->position == it2->position) {
422
423 // Increase number of common kmers found
424 if (repetitions) {
425 result += (unsigned)(it1->value < it2->value
426 ? it1->value : it2->value);
427 }
428 else {
429 result++;
430 }
431 ++it1;
432 ++it2;
433 }
434
435 //Finding the next pair of non-zero element in both vect1 and vect2
436
437 while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
438 && it1->position < it2->position) {
439 ++it1;
440 }
441
442 while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end()
443 && it2->position < it1->position) {
444 ++it2;
445 }
446
447
448 } while (it1 != vect1.m_Counts.end() && it2 != vect2.m_Counts.end());
449
450 return result;
451 }
452
453
PreCount(void)454 void CSparseKmerCounts::PreCount(void)
455 {
456 // Reserve memory for storing counts of all possible k-mers
457 // compute number of bits needed to represent all letters
458 unsigned int mask = 1;
459 int num_bits = 0;
460 while (sm_AlphabetSize > mask) {
461 mask <<= 1;
462 num_bits++;
463 }
464
465 sm_Buffer = ReserveCountsMem(num_bits);
466 }
467
PostCount(void)468 void CSparseKmerCounts::PostCount(void)
469 {
470 if (sm_Buffer) {
471 delete [] sm_Buffer;
472 }
473 sm_Buffer = NULL;
474 sm_ForceSmallerMem = false;
475 }
476
477
Print(CNcbiOstream & ostr) const478 CNcbiOstream& CSparseKmerCounts::Print(CNcbiOstream& ostr) const
479 {
480 for (CSparseKmerCounts::TNonZeroCounts_CI it=BeginNonZero();
481 it != EndNonZero();++it) {
482 ostr << it->position << ":" << (int)it->value << " ";
483 }
484 ostr << NcbiEndl;
485
486 return ostr;
487 }
488
489
CBinaryKmerCounts(const objects::CSeq_loc & seq,objects::CScope & scope)490 CBinaryKmerCounts::CBinaryKmerCounts(const objects::CSeq_loc& seq,
491 objects::CScope& scope)
492 {
493 Reset(seq, scope);
494 }
495
496
Reset(const objects::CSeq_loc & seq,objects::CScope & scope)497 void CBinaryKmerCounts::Reset(const objects::CSeq_loc& seq,
498 objects::CScope& scope)
499 {
500 unsigned int kmer_len = sm_KmerLength;
501 unsigned int alphabet_size = sm_AlphabetSize;
502
503 _ASSERT(kmer_len > 0 && alphabet_size > 0);
504
505 if (sm_UseCompressed && sm_TransTable.empty()) {
506 NCBI_THROW(CKmerCountsException, eInvalidOptions,
507 "Compressed alphabet selected, but translation table not"
508 " specified");
509 }
510
511 if (!seq.IsWhole() && !seq.IsInt()) {
512 NCBI_THROW(CKmerCountsException, eUnsupportedSeqLoc,
513 "Unsupported SeqLoc encountered");
514 }
515
516 _ASSERT(seq.GetId());
517 objects::CSeqVector sv = scope.GetBioseqHandle(*seq.GetId()).GetSeqVector();
518
519 unsigned int num_elements;
520 unsigned int seq_len = sv.size();
521
522 m_SeqLength = sv.size();
523 m_Counts.clear();
524 m_NumCounts = 0;
525
526 if (m_SeqLength < kmer_len) {
527 NCBI_THROW(CKmerCountsException, eBadSequence,
528 "Sequence shorter than desired k-mer length");
529 }
530
531 const int kBitChunk = sizeof(Uint4) * 8;
532
533 // Vecotr of counts is first computed using regular vector that is later
534 // converted to the sparse vector (list of position-value pairs).
535 // Positions are calculated as binary representations of k-mers, if they
536 // fit in 32 bits. Otherwise as numbers in system with base alphabet size.
537
538 _ASSERT(pow((double)alphabet_size, (double)kmer_len)
539 < numeric_limits<Uint4>::max());
540
541 AutoArray<double> base(kmer_len);
542 for (Uint4 i=0;i < kmer_len;i++) {
543 base[i] = pow((double)alphabet_size, (double)i);
544 }
545
546 num_elements = (Uint4)pow((double)alphabet_size, (double)kmer_len);
547
548 m_Counts.resize(num_elements / 32 + 1, (Uint4)0);
549
550 Uint4 pos;
551 unsigned i = 0;
552
553 // find the first k-mer that does not contain Xaa
554 bool is_xaa;
555 do {
556 is_xaa = false;
557 for (unsigned j=0;j < kmer_len && i < seq_len - kmer_len + 1;j++) {
558
559 if (sv[i + j] == kXaa) {
560 i += kmer_len;
561 is_xaa = true;
562 break;
563 }
564 }
565 } while (i < seq_len - kmer_len + 1 && is_xaa);
566 // if sequences contains only Xaa's then exit
567 if (i >= seq_len - kmer_len + 1) {
568 return;
569 }
570
571 // for each subsequence of kmer_len residues
572 for (;i < seq_len - kmer_len + 1;i++) {
573
574 // k-mers that contain unspecified amino acid X are not considered
575 if (sv[i + kmer_len - 1] == kXaa) {
576
577 // move k-mer window past Xaa
578 i += kmer_len;
579
580 // find first k-mer that does not contain Xaa
581 do {
582 is_xaa = false;
583 for (unsigned j=0;j < kmer_len && i < seq_len - kmer_len + 1;
584 j++) {
585
586 if (sv[i + j] == kXaa) {
587 i += kmer_len;
588 is_xaa = true;
589 break;
590 }
591 }
592 } while (i < seq_len - kmer_len + 1 && is_xaa);
593
594 // if Xaa are found till the end of sequence exit
595 if (i >= seq_len - kmer_len + 1) {
596 break;
597 }
598 }
599
600 pos = GetAALetter(sv[i]) - 1;
601 _ASSERT(GetAALetter(sv[i]) <= alphabet_size);
602 for (Uint4 j=1;j < kmer_len;j++) {
603 pos += (Uint4)(((double)GetAALetter(sv[i + j]) - 1) * base[j]);
604 _ASSERT(GetAALetter(sv[i + j]) <= alphabet_size);
605 }
606 MarkUsed(pos, m_Counts, kBitChunk);
607 }
608
609 m_NumCounts = 0;
610 for (size_t i=0;i < m_Counts.size();i++) {
611 m_NumCounts += x_Popcount(m_Counts[i]);
612 }
613 }
614
615
FractionCommonKmersDist(const CBinaryKmerCounts & v1,const CBinaryKmerCounts & v2)616 double CBinaryKmerCounts::FractionCommonKmersDist(const CBinaryKmerCounts& v1,
617 const CBinaryKmerCounts& v2)
618 {
619 _ASSERT(GetKmerLength() > 0);
620
621 unsigned int num_common = CountCommonKmers(v1, v2);
622
623 unsigned int num_counts1 = v1.GetNumCounts();
624 unsigned int num_counts2 = v2.GetNumCounts();
625 unsigned int fewer_counts
626 = num_counts1 < num_counts2 ? num_counts1 : num_counts2;
627
628 // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
629 // SeqLen - k + 1 that is equal to number of counts only if sequence
630 // does not contain Xaa.
631 return 1.0 - (double)num_common / (double)fewer_counts;
632 }
633
634
FractionCommonKmersGlobalDist(const CBinaryKmerCounts & v1,const CBinaryKmerCounts & v2)635 double CBinaryKmerCounts::FractionCommonKmersGlobalDist(
636 const CBinaryKmerCounts& v1,
637 const CBinaryKmerCounts& v2)
638 {
639 _ASSERT(GetKmerLength() > 0);
640
641 unsigned int num_common = CountCommonKmers(v1, v2);
642
643 unsigned int num_counts1 = v1.GetNumCounts();
644 unsigned int num_counts2 = v2.GetNumCounts();
645 unsigned int more_counts
646 = num_counts1 > num_counts2 ? num_counts1 : num_counts2;
647
648 // In RC Edgar, BMC Bioinformatics 5:113, 2004 the denominator is
649 // SeqLen - k + 1 that is equal to number of counts only if sequence
650 // does not contain Xaa.
651 return 1.0 - (double)num_common / (double)more_counts;
652 }
653
654
CountCommonKmers(const CBinaryKmerCounts & vect1,const CBinaryKmerCounts & vect2)655 unsigned int CBinaryKmerCounts::CountCommonKmers(
656 const CBinaryKmerCounts& vect1,
657 const CBinaryKmerCounts& vect2)
658 {
659 unsigned int result = 0;
660 const Uint4* counts1 = &vect1.m_Counts[0];
661 const Uint4* counts2 = &vect2.m_Counts[0];
662 int size = vect1.m_Counts.size();
663
664 for (int i=0;i < size;i++) {
665 result += x_Popcount(counts1[i] & counts2[i]);
666 }
667
668 return result;
669 }
670