1 #include "Konnector/DBGBloomAlgorithms.h"
2 #include "Bloom/Bloom.h"
3 #include "Bloom/CascadingBloomFilter.h"
4 #include "Common/Sequence.h"
5 #include <gtest/gtest.h>
6 #include <string>
7 
8 using namespace std;
9 using Konnector::BloomFilter;
10 
11 /*
12  * Tests for getStartKmerPos() function, which does
13  * the following:
14  *
15  * Choose a suitable starting kmer for a path search and
16  * return its position. More specifically, find the kmer
17  * closest to the end of the given sequence that is followed by
18  * at least (numMatchesThreshold - 1) consecutive kmers that
19  * are also present in the Bloom filter de Bruijn graph. If there
20  * is no sequence of matches of length numMatchesThreshold,
21  * use the longest sequence of matching kmers instead.
22  *
23  * @param seq sequence in which to find start kmer
24  * @param k kmer size
25  * @param g de Bruijn graph
26  * @param numMatchesThreshold if we encounter a sequence
27  * of numMatchesThreshold consecutive kmers in the Bloom filter,
28  * choose the kmer at the beginning of that sequence
29  * @return position of chosen start kmer
30  */
31 class GetStartKmerPosTest : public testing::Test {
32 
33 protected:
34 
35 	FastaRecord testRead;
36 	static const int k = 2;
37 	// set this large to avoid false positives
38 	static const int bloomFilterSize = 1000000;
39 
GetStartKmerPosTest()40 	GetStartKmerPosTest() {
41 		Kmer::setLength(k);
42 		testRead.seq = "TACAGTG";
43 	}
44 
45 };
46 
TEST_F(GetStartKmerPosTest,FullReadMatch)47 TEST_F(GetStartKmerPosTest, FullReadMatch)
48 {
49 	BloomFilter bloom(bloomFilterSize);
50 	Bloom::loadSeq(bloom, k, testRead.seq);
51 	DBGBloom<BloomFilter> g(bloom);
52 	const unsigned numMatchesThreshold = 1;
53 
54 	EXPECT_EQ(5U, getStartKmerPos(testRead, k, FORWARD, g,
55 		numMatchesThreshold));
56 }
57 
TEST_F(GetStartKmerPosTest,FullReadMismatch)58 TEST_F(GetStartKmerPosTest, FullReadMismatch)
59 {
60 	BloomFilter bloom(bloomFilterSize);
61 	// Leave the bloom filter empty to generate a mismatch
62 	// for every kmer in the read.
63 	DBGBloom<BloomFilter> g(bloom);
64 	EXPECT_EQ(NO_MATCH, getStartKmerPos(testRead, k, FORWARD, g));
65 }
66 
TEST_F(GetStartKmerPosTest,NumMatchesThreshold)67 TEST_F(GetStartKmerPosTest, NumMatchesThreshold)
68 {
69 	const string& seq = testRead.seq;
70 	BloomFilter bloom(bloomFilterSize);
71 	DBGBloom<BloomFilter> g(bloom);
72 
73 	// This loop creates kmer match vector 101101
74 	for (unsigned i = 0; i < seq.length() - k + 1; i++) {
75 		// non-matching kmers
76 		if (i == 1 || i == 4)
77 			continue;
78 		Bloom::loadSeq(bloom, k, seq.substr(i,k));
79 	}
80 
81 	unsigned numMatchesThreshold;
82 
83 	numMatchesThreshold = 1;
84 	EXPECT_EQ(5U, getStartKmerPos(testRead, k, FORWARD, g,
85 		numMatchesThreshold));
86 
87 	numMatchesThreshold = 2;
88 	EXPECT_EQ(2U, getStartKmerPos(testRead, k, FORWARD, g,
89 		numMatchesThreshold));
90 
91 	numMatchesThreshold = 3;
92 	EXPECT_EQ(2U, getStartKmerPos(testRead, k, FORWARD, g,
93 		numMatchesThreshold));
94 
95 	numMatchesThreshold = 1;
96 	EXPECT_EQ(0U, getStartKmerPos(testRead, k, REVERSE, g,
97 		numMatchesThreshold));
98 
99 	numMatchesThreshold = 2;
100 	EXPECT_EQ(3U, getStartKmerPos(testRead, k, REVERSE, g,
101 		numMatchesThreshold));
102 
103 	numMatchesThreshold = 3;
104 	EXPECT_EQ(3U, getStartKmerPos(testRead, k, REVERSE, g,
105 		numMatchesThreshold));
106 }
107 
TEST_F(GetStartKmerPosTest,EqualLengthMatchRegions)108 TEST_F(GetStartKmerPosTest, EqualLengthMatchRegions)
109 {
110 	const string& seq = testRead.seq;
111 	BloomFilter bloom(bloomFilterSize);
112 	DBGBloom<BloomFilter> g(bloom);
113 
114 	// This loop creates kmer match vector 011011
115 	for (unsigned i = 0; i < seq.length() - k + 1; i++) {
116 		// non-matching kmers
117 		if (i == 0 || i == 3)
118 			continue;
119 		Bloom::loadSeq(bloom, k, seq.substr(i,k));
120 	}
121 
122 	unsigned numMatchesThreshold;
123 
124 	numMatchesThreshold = 2;
125 	EXPECT_EQ(4U, getStartKmerPos(testRead, k, FORWARD, g,
126 		numMatchesThreshold));
127 
128 	numMatchesThreshold = 2;
129 	EXPECT_EQ(2U, getStartKmerPos(testRead, k, REVERSE, g,
130 		numMatchesThreshold));
131 }
132 
133 class CorrectSingleBaseErrorTest : public testing::Test {
134 
135 protected:
136 
137 	FastaRecord correctRead;
138 	FastaRecord singleErrorRead;
139 	string simulatedFalsePositive;
140 	static const int k = 6;
141 	static const size_t errorPos1 = 4;
142 	// set this large to avoid false positives
143 	static const int bloomFilterSize = 1000000;
144 
CorrectSingleBaseErrorTest()145 	CorrectSingleBaseErrorTest() {
146 		Kmer::setLength(k);
147 		correctRead.seq = "TACAGTGCC";
148 		singleErrorRead.seq = correctRead.seq;
149 		singleErrorRead.seq[errorPos1] = 'C';
150 		simulatedFalsePositive = "TGCAGT";
151 	}
152 
153 };
154 
155 
TEST_F(CorrectSingleBaseErrorTest,SingleError)156 TEST_F(CorrectSingleBaseErrorTest, SingleError)
157 {
158 	BloomFilter bloom(bloomFilterSize);
159 	Bloom::loadSeq(bloom, k, correctRead.seq);
160 	DBGBloom<BloomFilter> g(bloom);
161 
162 	size_t correctedPos = numeric_limits<size_t>::max();
163 	FastaRecord read = singleErrorRead;
164 	bool success = correctSingleBaseError(g, k, read, correctedPos);
165 
166 	ASSERT_TRUE(success);
167 	EXPECT_EQ(read.seq, read.seq);
168 	EXPECT_TRUE(correctedPos == errorPos1);
169 }
170 
TEST_F(CorrectSingleBaseErrorTest,ReverseComplement)171 TEST_F(CorrectSingleBaseErrorTest, ReverseComplement)
172 {
173 	BloomFilter bloom(bloomFilterSize);
174 	Bloom::loadSeq(bloom, k, correctRead.seq);
175 	DBGBloom<BloomFilter> g(bloom);
176 
177 	size_t correctedPos = numeric_limits<size_t>::max();
178 	FastaRecord read = singleErrorRead;
179 	bool success = correctSingleBaseError(g, k, read, correctedPos, true);
180 
181 	ASSERT_TRUE(success);
182 	EXPECT_EQ(read.seq, read.seq);
183 	EXPECT_TRUE(correctedPos == errorPos1);
184 }
185 
TEST_F(CorrectSingleBaseErrorTest,NoError)186 TEST_F(CorrectSingleBaseErrorTest, NoError)
187 {
188 	BloomFilter bloom(bloomFilterSize);
189 	Bloom::loadSeq(bloom, k, singleErrorRead.seq);
190 	DBGBloom<BloomFilter> g(bloom);
191 
192 	size_t correctedPos = numeric_limits<size_t>::max();
193 	FastaRecord read = singleErrorRead;
194 	bool success = correctSingleBaseError(g, k, read, correctedPos);
195 	ASSERT_FALSE(success);
196 }
197 
TEST_F(CorrectSingleBaseErrorTest,SkipFalsePositive)198 TEST_F(CorrectSingleBaseErrorTest, SkipFalsePositive)
199 {
200 	BloomFilter bloom(bloomFilterSize);
201 	Bloom::loadSeq(bloom, k, correctRead.seq);
202 	Bloom::loadSeq(bloom, k, simulatedFalsePositive);
203 	DBGBloom<BloomFilter> g(bloom);
204 
205 	size_t correctedPos = numeric_limits<size_t>::max();
206 	FastaRecord read = singleErrorRead;
207 	bool success = correctSingleBaseError(g, k, read, correctedPos);
208 	ASSERT_TRUE(success);
209 	EXPECT_EQ(read.seq, read.seq);
210 	EXPECT_TRUE(correctedPos == errorPos1);
211 }
212