1 #include "Konnector/DBGBloomAlgorithms.h"
2 #include "Bloom/Bloom.h"
3 #include "Bloom/CascadingBloomFilter.h"
4 #include "Common/Sequence.h"
5 #include <gtest/gtest.h>
6 #include <string>
7
8 using namespace std;
9 using Konnector::BloomFilter;
10
11 /*
12 * Tests for getStartKmerPos() function, which does
13 * the following:
14 *
15 * Choose a suitable starting kmer for a path search and
16 * return its position. More specifically, find the kmer
17 * closest to the end of the given sequence that is followed by
18 * at least (numMatchesThreshold - 1) consecutive kmers that
19 * are also present in the Bloom filter de Bruijn graph. If there
20 * is no sequence of matches of length numMatchesThreshold,
21 * use the longest sequence of matching kmers instead.
22 *
23 * @param seq sequence in which to find start kmer
24 * @param k kmer size
25 * @param g de Bruijn graph
26 * @param numMatchesThreshold if we encounter a sequence
27 * of numMatchesThreshold consecutive kmers in the Bloom filter,
28 * choose the kmer at the beginning of that sequence
29 * @return position of chosen start kmer
30 */
31 class GetStartKmerPosTest : public testing::Test {
32
33 protected:
34
35 FastaRecord testRead;
36 static const int k = 2;
37 // set this large to avoid false positives
38 static const int bloomFilterSize = 1000000;
39
GetStartKmerPosTest()40 GetStartKmerPosTest() {
41 Kmer::setLength(k);
42 testRead.seq = "TACAGTG";
43 }
44
45 };
46
TEST_F(GetStartKmerPosTest,FullReadMatch)47 TEST_F(GetStartKmerPosTest, FullReadMatch)
48 {
49 BloomFilter bloom(bloomFilterSize);
50 Bloom::loadSeq(bloom, k, testRead.seq);
51 DBGBloom<BloomFilter> g(bloom);
52 const unsigned numMatchesThreshold = 1;
53
54 EXPECT_EQ(5U, getStartKmerPos(testRead, k, FORWARD, g,
55 numMatchesThreshold));
56 }
57
TEST_F(GetStartKmerPosTest,FullReadMismatch)58 TEST_F(GetStartKmerPosTest, FullReadMismatch)
59 {
60 BloomFilter bloom(bloomFilterSize);
61 // Leave the bloom filter empty to generate a mismatch
62 // for every kmer in the read.
63 DBGBloom<BloomFilter> g(bloom);
64 EXPECT_EQ(NO_MATCH, getStartKmerPos(testRead, k, FORWARD, g));
65 }
66
TEST_F(GetStartKmerPosTest,NumMatchesThreshold)67 TEST_F(GetStartKmerPosTest, NumMatchesThreshold)
68 {
69 const string& seq = testRead.seq;
70 BloomFilter bloom(bloomFilterSize);
71 DBGBloom<BloomFilter> g(bloom);
72
73 // This loop creates kmer match vector 101101
74 for (unsigned i = 0; i < seq.length() - k + 1; i++) {
75 // non-matching kmers
76 if (i == 1 || i == 4)
77 continue;
78 Bloom::loadSeq(bloom, k, seq.substr(i,k));
79 }
80
81 unsigned numMatchesThreshold;
82
83 numMatchesThreshold = 1;
84 EXPECT_EQ(5U, getStartKmerPos(testRead, k, FORWARD, g,
85 numMatchesThreshold));
86
87 numMatchesThreshold = 2;
88 EXPECT_EQ(2U, getStartKmerPos(testRead, k, FORWARD, g,
89 numMatchesThreshold));
90
91 numMatchesThreshold = 3;
92 EXPECT_EQ(2U, getStartKmerPos(testRead, k, FORWARD, g,
93 numMatchesThreshold));
94
95 numMatchesThreshold = 1;
96 EXPECT_EQ(0U, getStartKmerPos(testRead, k, REVERSE, g,
97 numMatchesThreshold));
98
99 numMatchesThreshold = 2;
100 EXPECT_EQ(3U, getStartKmerPos(testRead, k, REVERSE, g,
101 numMatchesThreshold));
102
103 numMatchesThreshold = 3;
104 EXPECT_EQ(3U, getStartKmerPos(testRead, k, REVERSE, g,
105 numMatchesThreshold));
106 }
107
TEST_F(GetStartKmerPosTest,EqualLengthMatchRegions)108 TEST_F(GetStartKmerPosTest, EqualLengthMatchRegions)
109 {
110 const string& seq = testRead.seq;
111 BloomFilter bloom(bloomFilterSize);
112 DBGBloom<BloomFilter> g(bloom);
113
114 // This loop creates kmer match vector 011011
115 for (unsigned i = 0; i < seq.length() - k + 1; i++) {
116 // non-matching kmers
117 if (i == 0 || i == 3)
118 continue;
119 Bloom::loadSeq(bloom, k, seq.substr(i,k));
120 }
121
122 unsigned numMatchesThreshold;
123
124 numMatchesThreshold = 2;
125 EXPECT_EQ(4U, getStartKmerPos(testRead, k, FORWARD, g,
126 numMatchesThreshold));
127
128 numMatchesThreshold = 2;
129 EXPECT_EQ(2U, getStartKmerPos(testRead, k, REVERSE, g,
130 numMatchesThreshold));
131 }
132
133 class CorrectSingleBaseErrorTest : public testing::Test {
134
135 protected:
136
137 FastaRecord correctRead;
138 FastaRecord singleErrorRead;
139 string simulatedFalsePositive;
140 static const int k = 6;
141 static const size_t errorPos1 = 4;
142 // set this large to avoid false positives
143 static const int bloomFilterSize = 1000000;
144
CorrectSingleBaseErrorTest()145 CorrectSingleBaseErrorTest() {
146 Kmer::setLength(k);
147 correctRead.seq = "TACAGTGCC";
148 singleErrorRead.seq = correctRead.seq;
149 singleErrorRead.seq[errorPos1] = 'C';
150 simulatedFalsePositive = "TGCAGT";
151 }
152
153 };
154
155
TEST_F(CorrectSingleBaseErrorTest,SingleError)156 TEST_F(CorrectSingleBaseErrorTest, SingleError)
157 {
158 BloomFilter bloom(bloomFilterSize);
159 Bloom::loadSeq(bloom, k, correctRead.seq);
160 DBGBloom<BloomFilter> g(bloom);
161
162 size_t correctedPos = numeric_limits<size_t>::max();
163 FastaRecord read = singleErrorRead;
164 bool success = correctSingleBaseError(g, k, read, correctedPos);
165
166 ASSERT_TRUE(success);
167 EXPECT_EQ(read.seq, read.seq);
168 EXPECT_TRUE(correctedPos == errorPos1);
169 }
170
TEST_F(CorrectSingleBaseErrorTest,ReverseComplement)171 TEST_F(CorrectSingleBaseErrorTest, ReverseComplement)
172 {
173 BloomFilter bloom(bloomFilterSize);
174 Bloom::loadSeq(bloom, k, correctRead.seq);
175 DBGBloom<BloomFilter> g(bloom);
176
177 size_t correctedPos = numeric_limits<size_t>::max();
178 FastaRecord read = singleErrorRead;
179 bool success = correctSingleBaseError(g, k, read, correctedPos, true);
180
181 ASSERT_TRUE(success);
182 EXPECT_EQ(read.seq, read.seq);
183 EXPECT_TRUE(correctedPos == errorPos1);
184 }
185
TEST_F(CorrectSingleBaseErrorTest,NoError)186 TEST_F(CorrectSingleBaseErrorTest, NoError)
187 {
188 BloomFilter bloom(bloomFilterSize);
189 Bloom::loadSeq(bloom, k, singleErrorRead.seq);
190 DBGBloom<BloomFilter> g(bloom);
191
192 size_t correctedPos = numeric_limits<size_t>::max();
193 FastaRecord read = singleErrorRead;
194 bool success = correctSingleBaseError(g, k, read, correctedPos);
195 ASSERT_FALSE(success);
196 }
197
TEST_F(CorrectSingleBaseErrorTest,SkipFalsePositive)198 TEST_F(CorrectSingleBaseErrorTest, SkipFalsePositive)
199 {
200 BloomFilter bloom(bloomFilterSize);
201 Bloom::loadSeq(bloom, k, correctRead.seq);
202 Bloom::loadSeq(bloom, k, simulatedFalsePositive);
203 DBGBloom<BloomFilter> g(bloom);
204
205 size_t correctedPos = numeric_limits<size_t>::max();
206 FastaRecord read = singleErrorRead;
207 bool success = correctSingleBaseError(g, k, read, correctedPos);
208 ASSERT_TRUE(success);
209 EXPECT_EQ(read.seq, read.seq);
210 EXPECT_TRUE(correctedPos == errorPos1);
211 }
212