1 /*  $Id: proteinkmer_unit_test.cpp 631630 2021-05-19 19:43:37Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Tom Madden, NCBI
27 *
28 * File Description:
29 *   Unit tests for proteinkmer library.
30 *
31 *
32 * ===========================================================================
33 */
34 
35 #define NCBI_TEST_APPLICATION
36 #include <ncbi_pch.hpp>
37 
38 #include <corelib/ncbi_system.hpp>
39 #include <corelib/ncbiapp.hpp>
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objostr.hpp>
44 #include <serial/iterator.hpp>
45 
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seqloc/Seq_id.hpp>
48 #include <objects/seqloc/Seq_loc.hpp>
49 
50 #include <objtools/simple/simple_om.hpp>
51 
52 #include <objmgr/object_manager.hpp>
53 #include <objmgr/scope.hpp>
54 
55 #include <util/sequtil/sequtil_convert.hpp>
56 #include <algo/blast/proteinkmer/blastkmer.hpp>
57 #include <algo/blast/proteinkmer/blastkmerutils.hpp>
58 #include <algo/blast/proteinkmer/blastkmerresults.hpp>
59 #include <algo/blast/proteinkmer/blastkmeroptions.hpp>
60 #include <algo/blast/proteinkmer/blastkmerindex.hpp>
61 #include <algo/blast/proteinkmer/blastkmerindex.hpp>
62 #include <algo/blast/proteinkmer/kblastapi.hpp>
63 
64 #include <algo/blast/api/objmgr_query_data.hpp>
65 #include <algo/blast/api/uniform_search.hpp>
66 #include <algo/blast/api/blast_prot_options.hpp>
67 #include <algo/blast/api/blast_advprot_options.hpp>
68 
69 #include <algo/blast/blastinput/blast_input.hpp>
70 
71 // This header must be included before all Boost.Test headers if there are any
72 #include <corelib/test_boost.hpp>
73 
74 #include <util/random_gen.hpp>
75 
76 
77 USING_NCBI_SCOPE;
78 USING_SCOPE(blast);
79 USING_SCOPE(objects);
80 
81 BOOST_AUTO_TEST_SUITE(proteinkmers)
82 
83 
BOOST_AUTO_TEST_CASE(KmerResults)84 BOOST_AUTO_TEST_CASE(KmerResults)
85 {
86 	const int kNumHits = 6;
87 	// nr_test has GIs from array subjid in the order given.
88 	const TGi subjid[kNumHits] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
89 	const double scores[kNumHits] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
90 	TBlastKmerPrelimScoreVector prelim_vector;
91 	for (int index=0; index<kNumHits; index++)
92 	{
93 		pair<uint32_t, double> retval(index, scores[index]);
94 		prelim_vector.push_back(retval);
95 	}
96 	CRef<CSeq_id> qid(new CSeq_id("gi|129296"));
97 	CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
98 	// values in stats are: hit_count, jd_count, jd_oid_count, oids_considered, total_matches
99 	BlastKmerStats stats( 3256, 506, 253, 77623, 27);
100 
101 	CBlastKmerResults results(qid, prelim_vector, stats, seqdb, TQueryMessages());
102 
103         CRef<CScope> scope(CSimpleOM::NewScope());
104 	TSeqLocVector tsl;
105 	results.GetTSL(tsl, scope);
106 	int index=0;
107 	for (TSeqLocVector::iterator iter=tsl.begin(); iter != tsl.end(); ++iter)
108 	{
109 		BOOST_REQUIRE_EQUAL((*iter).seqloc->GetId()->GetGi(), subjid[index]);
110 		index++;
111 	}
112 
113 	const TBlastKmerScoreVector kmerscores = results.GetScores();
114 	index=0;
115 	for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
116 	{
117 		CRef<CSeq_id> sid = (*iter).first;
118 		BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid[index]);
119 		BOOST_REQUIRE_EQUAL((*iter).second, scores[index]);
120 		index++;
121 	}
122 
123 	const BlastKmerStats& newstats = results.GetStats();
124 	BOOST_REQUIRE_EQUAL(newstats.hit_count, stats.hit_count);
125 	BOOST_REQUIRE_EQUAL(newstats.jd_count, stats.jd_count);
126 }
127 
BOOST_AUTO_TEST_CASE(KmerResultsSet)128 BOOST_AUTO_TEST_CASE(KmerResultsSet)
129 {
130 	const int kNumQueries = 2;
131 	const int kNumHits1 = 6;
132 	const int kNumHits2 = 2;
133 	// nr_test has GIs from array subjid in the order given by subjid1.
134 	const TGi subjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
135 	const TGi subjid2[kNumHits2] = { 129295, 677974076};
136 	const double scores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
137 	const double scores2[kNumHits1] = {0.359375, 0.710938};
138 	TBlastKmerPrelimScoreVector prelim_vector1;
139 	TBlastKmerPrelimScoreVector prelim_vector2;
140 	for (int index=0; index<kNumHits1; index++)
141 	{
142 		pair<uint32_t, double> retval(index, scores1[index]);
143 		prelim_vector1.push_back(retval);
144 	}
145         // Ordering of OID to GIs are off here.
146 	for (int index=0; index<kNumHits2; index++)
147 	{	// Matches start at fourth sequence
148 		pair<uint32_t, double> retval(index+4, scores2[index]);
149 		prelim_vector2.push_back(retval);
150 	}
151 	CBlastKmerResultsSet::TBlastKmerPrelimScoreVectorSet vec_set;
152 	vec_set.push_back(prelim_vector1);
153 	vec_set.push_back(prelim_vector2);
154 	CRef<CSeq_id> qid1(new CSeq_id("gi|129296"));
155 	CRef<CSeq_id> qid2(new CSeq_id("gi|129295"));
156 	CBlastKmerResultsSet::TQueryIdVector id_vec;
157 	id_vec.push_back(qid1);
158 	id_vec.push_back(qid2);
159 
160 	CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
161 
162 	// values in stats are: hit_count, jd_count, jd_oid_count, oids_considered, total_matches
163 	BlastKmerStats stats1( 3256, 506, 253, 77623, 27);
164 	BlastKmerStats stats2( 3255, 505, 255, 77625, 25);
165 	CBlastKmerResultsSet::TBlastKmerStatsVector stats_vec;
166 	stats_vec.push_back(stats1);
167 	stats_vec.push_back(stats2);
168 	TSearchMessages errs;
169 	errs.resize(kNumQueries);
170 
171 	CBlastKmerResultsSet result_set(id_vec, vec_set, stats_vec, seqdb, errs);
172 
173 	BOOST_REQUIRE_EQUAL(result_set.GetNumQueries(), kNumQueries);
174 
175 	CBlastKmerResults results = result_set[0];
176 
177 	const TBlastKmerScoreVector kmerscores = results.GetScores();
178 	int index=0;
179 	for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
180 	{
181 		CRef<CSeq_id> sid = (*iter).first;
182 		BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid1[index]);
183 		BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
184 		index++;
185 	}
186 
187 	const BlastKmerStats& newstats = results.GetStats();
188 	BOOST_REQUIRE_EQUAL(newstats.hit_count, stats1.hit_count);
189 	BOOST_REQUIRE_EQUAL(newstats.jd_count, stats1.jd_count);
190 
191 	// Results from second query.
192 	results = result_set[1];
193 	const TBlastKmerScoreVector kmerscores2 = results.GetScores();
194 	index=0;
195 	for (TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
196 	{
197 		CRef<CSeq_id> sid = (*iter).first;
198 		BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid2[index]);
199 		BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
200 		index++;
201 	}
202 
203 	// ID in this search set
204 	CRef<CBlastKmerResults> resultsNotNULL = result_set[*qid1];
205 	BOOST_REQUIRE(resultsNotNULL.IsNull() == false);
206 
207 	// ID not in this search set.
208 	CRef<CSeq_id> qid_bogus(new CSeq_id("gi|555"));
209 	CRef<CBlastKmerResults> resultsNull = result_set[*qid_bogus];
210 	BOOST_REQUIRE(resultsNull.IsNull() == true);
211 }
212 
BOOST_AUTO_TEST_CASE(KmerResultsSetPushBack)213 BOOST_AUTO_TEST_CASE(KmerResultsSetPushBack)
214 {
215 	const int kNumQueries = 2;
216 	const int kNumHits1 = 6;
217 	const int kNumHits2 = 2;
218 	// nr_test has GIs from array subjid in the order given by subjid1.
219 	const TGi subjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
220     const TGi subjid2[kNumHits2] = { 129295, 677974076};
221 	const double scores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
222 	const double scores2[kNumHits1] = {0.359375, 0.710938};
223 	TBlastKmerPrelimScoreVector prelim_vector1;
224 	TBlastKmerPrelimScoreVector prelim_vector2;
225 	for (int index=0; index<kNumHits1; index++)
226 	{
227 		pair<uint32_t, double> retval(index, scores1[index]);
228 		prelim_vector1.push_back(retval);
229 	}
230         // Ordering of OID to GIs are off here.
231 	for (int index=0; index<kNumHits2; index++)
232 	{	// Matches start at fourth sequence
233 		pair<uint32_t, double> retval(index+4, scores2[index]);
234 		prelim_vector2.push_back(retval);
235 	}
236 
237 	CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
238 	TQueryMessages errs = TQueryMessages();
239 
240 	CRef<CSeq_id> qid1(new CSeq_id("gi|129296"));
241 	BlastKmerStats stats1( 3256, 506, 253, 77623, 27);
242 	CRef<CBlastKmerResults> results1;
243 	results1.Reset(new CBlastKmerResults(qid1, prelim_vector1, stats1, seqdb, TQueryMessages()));
244 
245 	CRef<CSeq_id> qid2(new CSeq_id("gi|129295"));
246 	BlastKmerStats stats2( 3255, 505, 255, 77625, 25);
247 	CRef<CBlastKmerResults> results2;
248 	results2.Reset(new CBlastKmerResults(qid2, prelim_vector2, stats2, seqdb, errs));
249 
250 	CBlastKmerResultsSet result_set;
251 
252 	result_set.push_back(results1);
253 	result_set.push_back(results2);
254 
255 	BOOST_REQUIRE_EQUAL(result_set.GetNumQueries(), kNumQueries);
256 
257 	CBlastKmerResults results = result_set[0];
258 
259 	const TBlastKmerScoreVector kmerscores = results.GetScores();
260 	int index=0;
261 	for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
262 	{
263 		CRef<CSeq_id> sid = (*iter).first;
264 		BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid1[index]);
265 		BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
266 		index++;
267 	}
268 
269 	const BlastKmerStats& newstats = results.GetStats();
270 	BOOST_REQUIRE_EQUAL(newstats.hit_count, stats1.hit_count);
271 	BOOST_REQUIRE_EQUAL(newstats.jd_count, stats1.jd_count);
272 
273 	// Results from second query.
274 	results = result_set[1];
275 	const TBlastKmerScoreVector kmerscores2 = results.GetScores();
276 	index=0;
277 	for (TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
278 	{
279 		CRef<CSeq_id> sid = (*iter).first;
280 		BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid2[index]);
281 		BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
282 		index++;
283 	}
284 }
285 
286 /* Test subset??
287 BOOST_AUTO_TEST_CASE(SearchWithBadQuery)
288 {
289 	CBioseq bioseq;
290 	CNcbiIfstream i_file("data/bad.asn");
291 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
292 
293 	*is >> bioseq;
294 	CRef<CScope> scope(CSimpleOM::NewScope(false));
295 	scope->AddBioseq(bioseq);
296 	CRef<CSeq_loc> loc(new CSeq_loc);
297     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
298 
299 	unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
300 	TSeqLocVector query_vector;
301 	query_vector.push_back(*ssl);
302 	CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
303 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
304 
305 	CBlastKmer kmersearch(query_vector, options, db);
306 
307 	CRef<CBlastKmerResultsSet> results = kmersearch.Run();
308 	BOOST_REQUIRE((*results)[0].HasErrors() == false);
309 	BOOST_REQUIRE((*results)[0].HasWarnings() == true);
310 }
311 */
312 
BOOST_AUTO_TEST_CASE(SearchWithBadDatabase)313 BOOST_AUTO_TEST_CASE(SearchWithBadDatabase)
314 {
315 	CBioseq bioseq;
316 	CNcbiIfstream i_file("data/129295.stdaa");
317 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
318 
319 	*is >> bioseq;
320 	CRef<CScope> scope(CSimpleOM::NewScope(false));
321 	scope->AddBioseq(bioseq);
322 	CRef<CSeq_loc> loc(new CSeq_loc);
323     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
324 
325 	SSeqLoc ssl(*loc, *scope);
326 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
327 
328 	string dbname("JUNK");
329 	BOOST_REQUIRE_THROW(CBlastKmer kmersearch(ssl, options, dbname), CSeqDBException);
330 }
331 
332 
333 /* smaller test??
334 BOOST_AUTO_TEST_CASE(GIListLimitSearch)
335 {
336 	CBioseq bioseq;
337 	CNcbiIfstream i_file("data/129295.iupacaa");
338 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
339 
340 	*is >> bioseq;
341 	CRef<CScope> scope(CSimpleOM::NewScope(false));
342 	scope->AddBioseq(bioseq);
343 	CRef<CSeq_loc> loc(new CSeq_loc);
344     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
345 
346 	unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
347 	TSeqLocVector query_vector;
348 	query_vector.push_back(*ssl);
349 	CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
350 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
351 	options->SetThresh(0.15);
352 
353 	CBlastKmer kmersearch(query_vector, options, db);
354 	CRef<CSeqDBGiList> seqdb_gilist(new CSeqDBGiList());
355 	seqdb_gilist->AddGi(3091);
356 	seqdb_gilist->AddGi(129296);
357 	seqdb_gilist->AddGi(448824824);
358 	kmersearch.SetGiListLimit(seqdb_gilist);
359 	CRef<CBlastKmerResultsSet> results = kmersearch.Run();
360 
361 	const TBlastKmerScoreVector& scores = (*results)[0].GetScores();
362 	BOOST_REQUIRE_EQUAL(scores.size(), 2);
363 }
364 */
365 
366 /*
367 BOOST_AUTO_TEST_CASE(NegativeGIListLimitSearch)
368 {
369 	CBioseq bioseq;
370 	CNcbiIfstream i_file("data/129295.iupacaa");
371 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
372 
373 	*is >> bioseq;
374 	CRef<CScope> scope(CSimpleOM::NewScope(false));
375 	scope->AddBioseq(bioseq);
376 	CRef<CSeq_loc> loc(new CSeq_loc);
377     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
378 
379 	unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
380 	TSeqLocVector query_vector;
381 	query_vector.push_back(*ssl);
382 	CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
383 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
384 	options->SetThresh(0.15);
385 
386 	CBlastKmer kmersearch(query_vector, options, db);
387 	CRef<CSeqDBNegativeList> seqdb_gilist(new CSeqDBNegativeList());
388 	seqdb_gilist->AddGi(3091);
389 	seqdb_gilist->AddGi(129296);
390 	seqdb_gilist->AddGi(448824824);
391 	kmersearch.SetGiListLimit(seqdb_gilist);
392 	CRef<CBlastKmerResultsSet> resultSet = kmersearch.RunSearches();
393 	CBlastKmerResults results = (*resultSet)[0];
394 
395 	const TBlastKmerScoreVector& scores = results.GetScores();
396 	BOOST_REQUIRE_EQUAL(scores.size(), 3);
397 }
398 */
399 
s_GetRandomNumbers(uint32_t * a,uint32_t * b,int numHashes)400 void s_GetRandomNumbers(uint32_t* a, uint32_t* b, int numHashes)
401 {
402         CRandom random(1);  // Always have the same random numbers.
403         for(int i=0;i<numHashes;i++)
404         {
405                 do
406                 {
407                         a[i]=random.GetRand();
408                 }
409                 while (a[i] == 0);
410 
411                 b[i]=random.GetRand();
412         }
413 }
414 
415 
BOOST_AUTO_TEST_CASE(CheckQueryHashes)416 BOOST_AUTO_TEST_CASE(CheckQueryHashes)
417 {
418 	string queryseq_eaa =
419 	"MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV"
420 	"HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD"
421 	"LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
422 
423 	string queryseq_stdaa;
424 	CSeqConvert::Convert(queryseq_eaa, CSeqUtil::e_Ncbieaa, 0, queryseq_eaa.length(),
425 		queryseq_stdaa, CSeqUtil::e_Ncbistdaa);
426 
427 	const int kNumHashes=128;
428 	const int kDoSeg=0;
429 	const int kKmerNum=5;
430 	const int kAlphabet=0; // 15 letters
431 	uint32_t a[kNumHashes];
432 	uint32_t b[kNumHashes];
433 	const int kChunkSize=150;
434 
435 	s_GetRandomNumbers(a, b, kNumHashes);
436 	vector < vector <uint32_t> > seq_hash;
437 
438 	minhash_query(queryseq_stdaa, seq_hash, kNumHashes, a, b, kDoSeg, kKmerNum, kAlphabet, kChunkSize);
439 
440 	BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
441 	BOOST_REQUIRE_EQUAL(seq_hash[0].size(), kNumHashes);
442 	BOOST_REQUIRE_EQUAL(seq_hash[0][0], 529895);
443 	BOOST_REQUIRE_EQUAL(seq_hash[0][1], 798115);
444 	BOOST_REQUIRE_EQUAL(seq_hash[0][63], 90979);
445 	BOOST_REQUIRE_EQUAL(seq_hash[0][83], 336201);
446 
447 	// Expected (correct) values.
448 	const int lsh_hash_length = 13;
449 	const int lsh_hash_vals[lsh_hash_length] = { 973119,1097197,1157729,1681152,1913970,1933659,2018075,2123893,2355301,2800673,2940688,2941967,3535701};
450 
451 	const int kRowsPerBand=2;
452 	const int kNumBands = kNumHashes/kRowsPerBand;
453 	vector< vector <uint32_t> > lsh_hash_vec;
454 	get_LSH_hashes(seq_hash, lsh_hash_vec, kNumBands, kRowsPerBand);
455 	int index=0;
456 	int numChunks = lsh_hash_vec.size();
457 	for (int i=0; i<numChunks && index<lsh_hash_length; i++)
458 	{
459 		for(vector<uint32_t>::iterator iter=lsh_hash_vec[i].begin(); iter != lsh_hash_vec[i].end(); ++iter)
460 		{
461   			BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
462 			index++;
463 			if (index == lsh_hash_length)
464 				break;
465 		}
466 	}
467 }
468 
BOOST_AUTO_TEST_CASE(CheckQueryHashesVersion3)469 BOOST_AUTO_TEST_CASE(CheckQueryHashesVersion3)
470 {
471 	string queryseq_eaa =
472 	"MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV"
473 	"HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD"
474 	"LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
475 
476 	string queryseq_stdaa;
477 	CSeqConvert::Convert(queryseq_eaa, CSeqUtil::e_Ncbieaa, 0, queryseq_eaa.length(),
478 		queryseq_stdaa, CSeqUtil::e_Ncbistdaa);
479 
480 	const int kNumHashes=32;
481 	const int kKmerNum=5;
482 	const int kAlphabet=0; // 15 letters
483 	vector<int> badMers;
484 	const int kChunkSize=150;
485 
486 	vector < vector <uint32_t> > seq_hash;
487 
488 	minhash_query2(queryseq_stdaa, seq_hash, kKmerNum, kNumHashes, kAlphabet, badMers, kChunkSize);
489 
490 	BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
491 	BOOST_REQUIRE_EQUAL(seq_hash[0].size(), kNumHashes);
492 	BOOST_REQUIRE_EQUAL(seq_hash[0][0], 2683052);
493 	BOOST_REQUIRE_EQUAL(seq_hash[0][1], 26519505);
494 	BOOST_REQUIRE_EQUAL(seq_hash[0][2], 45619224);
495 	BOOST_REQUIRE_EQUAL(seq_hash[0][15], 396863844);
496         // Values are ordered from smallest to largest
497 	BOOST_REQUIRE(seq_hash[1][0] < seq_hash[0][1]);
498 	BOOST_REQUIRE(seq_hash[1][1] < seq_hash[0][2]);
499 	BOOST_REQUIRE(seq_hash[1][3] < seq_hash[0][kNumHashes-1]);
500 
501 	// Expected (correct) values.
502 	const int lsh_hash_length = 13;
503 	const int lsh_hash_vals[lsh_hash_length] = { 700168,1293774,1377419,1712432,1819660,2314660,2484152,2944352,2951476,3273866,3625878,3709806,3837843};
504 
505 	const int kRowsPerBand=2;
506 	vector< vector <uint32_t> > lsh_hash_vec;
507 	get_LSH_hashes5(seq_hash, lsh_hash_vec, kNumHashes, kRowsPerBand);
508 	int index=0;
509 	int numChunks = lsh_hash_vec.size();
510 	for (int i=0; i<numChunks && index<lsh_hash_length; i++)
511 	{
512 		for(vector<uint32_t>::iterator iter=lsh_hash_vec[i].begin(); iter != lsh_hash_vec[i].end(); ++iter)
513 		{
514  			BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
515 			index++;
516 			if (index == lsh_hash_length)
517 				break;
518 		}
519 	}
520 }
521 
s_GetNumLSHHits(uint64_t * lsh,int lshSize)522 int s_GetNumLSHHits(uint64_t* lsh, int lshSize)
523 {
524 	int count=0;
525 	for (int index=0; index<lshSize; index++)
526 		if (lsh[index] != 0)
527 			count++;
528 
529 	return count;
530 }
531 
BOOST_AUTO_TEST_CASE(BuildIndex)532 BOOST_AUTO_TEST_CASE(BuildIndex)
533 {
534 
535 
536         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
537 
538         const int kHashFct=32;
539 	const int kKmerSize=5;
540 
541         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
542 
543 	string index_name("nr_test");
544 	CFileDeleteAtExit::Add(index_name + ".pki");
545 	CFileDeleteAtExit::Add(index_name + ".pkd");
546         build_index.Build();
547 
548 	CMinHashFile mhfile(index_name);
549 
550 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
551 	BOOST_REQUIRE_EQUAL(2, mhfile.GetDataWidth());
552 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
553 
554 	uint64_t* lsh_array = mhfile.GetLSHArray();
555 	int lsh_size = mhfile.GetLSHSize();
556 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
557 
558 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
559 	BOOST_REQUIRE_EQUAL(187, lsh_counts);
560 }
561 
BOOST_AUTO_TEST_CASE(BuildIndexRepeats)562 BOOST_AUTO_TEST_CASE(BuildIndexRepeats)
563 {
564         CRef<CSeqDB> seqdb(new CSeqDB("data/XP_001468867", CSeqDB::eProtein));
565 
566         const int kHashFct=32;
567 	const int kKmerSize=5;
568 
569         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
570 
571 	string index_name("XP_001468867");
572 	CFileDeleteAtExit::Add(index_name + ".pki");
573 	CFileDeleteAtExit::Add(index_name + ".pkd");
574         build_index.Build();
575 
576 	CMinHashFile mhfile(index_name);
577 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
578 	BOOST_REQUIRE_EQUAL(2, mhfile.GetDataWidth());
579 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
580 
581 	uint64_t* lsh_array = mhfile.GetLSHArray();
582 	int lsh_size = mhfile.GetLSHSize();
583 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
584 
585 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
586  	BOOST_REQUIRE_EQUAL(213, lsh_counts);
587 
588 }
589 
BOOST_AUTO_TEST_CASE(BuildIndexNotValidSequence)590 BOOST_AUTO_TEST_CASE(BuildIndexNotValidSequence)
591 {
592         CRef<CSeqDB> seqdb(new CSeqDB("data/manyXs", CSeqDB::eProtein));
593 
594         const int kHashFct=32;
595 	const int kKmerSize=5;
596 
597         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
598 
599 	string index_name("manyXs");
600 	CFileDeleteAtExit::Add(index_name + ".pki");
601 	CFileDeleteAtExit::Add(index_name + ".pkd");
602         build_index.Build();
603 	// data file is empty/missing.  Should there be an exception for that?
604 
605 	BOOST_REQUIRE_THROW(CMinHashFile mhfile(index_name), CMinHashException);
606 }
607 
BOOST_AUTO_TEST_CASE(BuildIndexWidth4Kmer4)608 BOOST_AUTO_TEST_CASE(BuildIndexWidth4Kmer4)
609 {
610 
611 
612         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
613 
614         const int kHashFct=32;
615 	const int kKmerSize=4;
616 	const int kWidth=4;
617 
618         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth);
619 
620 	string index_name("nr_test");
621 	CFileDeleteAtExit::Add(index_name + ".pki");
622 	CFileDeleteAtExit::Add(index_name + ".pkd");
623         build_index.Build();
624 
625 	CMinHashFile mhfile(index_name);
626 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
627 	BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
628 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
629 
630 	uint64_t* lsh_array = mhfile.GetLSHArray();
631 	int lsh_size = mhfile.GetLSHSize();
632 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
633 
634 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
635 	BOOST_REQUIRE_EQUAL(172, lsh_counts);
636 }
637 
BOOST_AUTO_TEST_CASE(BuildIndexFewerBands)638 BOOST_AUTO_TEST_CASE(BuildIndexFewerBands)
639 {
640 
641         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
642 
643         const int kHashFct=64;
644 	const int kKmerSize=4;
645 	const int kWidth=2;
646 
647         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth);
648 
649 	string index_name("nr_test");
650 	CFileDeleteAtExit::Add(index_name + ".pki");
651 	CFileDeleteAtExit::Add(index_name + ".pkd");
652         build_index.Build();
653 
654 	CMinHashFile mhfile(index_name);
655 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
656 	BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
657 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
658 
659 	uint64_t* lsh_array = mhfile.GetLSHArray();
660 	int lsh_size = mhfile.GetLSHSize();
661 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
662 
663 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
664 	BOOST_REQUIRE_EQUAL(339, lsh_counts);
665 }
666 
BOOST_AUTO_TEST_CASE(BuildIndex10letterAlphabet)667 BOOST_AUTO_TEST_CASE(BuildIndex10letterAlphabet)
668 {
669 
670         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
671 
672         const int kHashFct=32;
673 	const int kKmerSize=5;
674 	const int kWidth=2;
675 	const int kAlphabet=1;
676 
677         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth, kAlphabet);
678 
679 	string index_name("nr_test");
680 	CFileDeleteAtExit::Add(index_name + ".pki");
681 	CFileDeleteAtExit::Add(index_name + ".pkd");
682         build_index.Build();
683 
684 	CMinHashFile mhfile(index_name);
685 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
686 	BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
687 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
688 	BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.GetAlphabet());
689 
690 	uint64_t* lsh_array = mhfile.GetLSHArray();
691 	int lsh_size = mhfile.GetLSHSize();
692 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
693 
694 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
695 	BOOST_REQUIRE_EQUAL(173, lsh_counts);
696 }
697 
BOOST_AUTO_TEST_CASE(BuildIndex10letterVersion3)698 BOOST_AUTO_TEST_CASE(BuildIndex10letterVersion3)
699 {
700 
701         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
702 
703         const int kHashFct=32;
704 	const int kKmerSize=5;
705 	const int kSamples=30;
706 	const int kWidth=2;
707 	const int kAlphabet=1;
708 	const int kVersion=3;
709 	const int kLSHStart=312;
710 
711         CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, kSamples, kWidth, kAlphabet, kVersion);
712 
713 	string index_name("nr_test");
714 	CFileDeleteAtExit::Add(index_name + ".pki");
715 	CFileDeleteAtExit::Add(index_name + ".pkd");
716         build_index.Build();
717 
718 	CMinHashFile mhfile(index_name);
719 	BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
720 	BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
721 	BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
722 	BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.GetAlphabet());
723 	BOOST_REQUIRE_EQUAL(kVersion, mhfile.GetVersion());
724 	BOOST_REQUIRE_EQUAL(kLSHStart, mhfile.GetLSHStart());
725 
726 	uint64_t* lsh_array = mhfile.GetLSHArray();
727 	int lsh_size = mhfile.GetLSHSize();
728 	BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
729 
730 	int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
731 	BOOST_REQUIRE_EQUAL(563, lsh_counts);
732 
733 	int chunkSize = mhfile.GetChunkSize();
734 	BOOST_REQUIRE_EQUAL(150, chunkSize);
735 }
736 
BOOST_AUTO_TEST_CASE(CheckEmptyIndexName)737 BOOST_AUTO_TEST_CASE(CheckEmptyIndexName)
738 {
739 
740 	string index_name="";
741 	BOOST_REQUIRE_THROW(CMinHashFile mhfile(index_name), CMinHashException);
742 }
743 
744 
BOOST_AUTO_TEST_CASE(CheckNoIndex)745 BOOST_AUTO_TEST_CASE(CheckNoIndex)
746 {
747 	CBioseq bioseq;
748 	CNcbiIfstream i_file("data/129295.ncbieaa");
749 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
750 
751 	*is >> bioseq;
752 	CRef<CScope> scope(CSimpleOM::NewScope(false));
753 	scope->AddBioseq(bioseq);
754 	CRef<CSeq_loc> loc(new CSeq_loc);
755     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
756 
757 	unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
758 	TSeqLocVector query_vector;
759 	query_vector.push_back(*ssl);
760         CRef<CSeqDB> seqdb(new CSeqDB("data/XP_001468867", CSeqDB::eProtein));
761 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
762 
763 	CBlastKmer kmersearch(query_vector, options, seqdb);
764 	BOOST_REQUIRE_THROW(kmersearch.Run(), CFileException);
765 }
766 
BOOST_AUTO_TEST_CASE(CheckOptionValidation)767 BOOST_AUTO_TEST_CASE(CheckOptionValidation)
768 {
769 	CRef<CBlastKmerOptions>  opts(new CBlastKmerOptions());
770 
771 	double myThresh=0.1;
772 	opts->SetThresh(myThresh);
773 	BOOST_REQUIRE_EQUAL(opts->GetThresh(), myThresh);
774 
775 	int hits=373;
776 	opts->SetMinHits(hits);
777 	BOOST_REQUIRE_EQUAL(opts->GetMinHits(), hits);
778 
779 	int targetSeqs=234;
780 	opts->SetNumTargetSeqs(targetSeqs);
781 	BOOST_REQUIRE_EQUAL(opts->GetNumTargetSeqs(), targetSeqs);
782 
783 	BOOST_REQUIRE_EQUAL(opts->Validate(), true);
784 
785 	opts->SetMinHits(0);
786 	BOOST_REQUIRE_EQUAL(opts->Validate(), true);
787 
788 	opts->SetThresh(-0.707);
789 	BOOST_REQUIRE_EQUAL(opts->Validate(), false);
790 
791 	opts->SetMinHits(-1);
792 	BOOST_REQUIRE_EQUAL(opts->Validate(), false);
793 }
794 
BOOST_AUTO_TEST_CASE(BadOptionsThrow)795 BOOST_AUTO_TEST_CASE(BadOptionsThrow)
796 {
797 
798         CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
799 
800 	CBioseq bioseq;
801 	CNcbiIfstream i_file("data/129295.stdaa");
802 	unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
803 
804 	*is >> bioseq;
805 	CRef<CScope> scope(CSimpleOM::NewScope(false));
806 	scope->AddBioseq(bioseq);
807 	CRef<CSeq_loc> loc(new CSeq_loc);
808     	loc->SetWhole().Assign(*(bioseq.GetId().front()));
809 
810 	unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
811 	TSeqLocVector query_vector;
812 	query_vector.push_back(*ssl);
813 	CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
814 	CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
815 	options->SetThresh(1.3); // Value should be one or less.
816 	BOOST_REQUIRE_THROW(CBlastKmer kmersearch(query_vector, options, db), CException);
817 }
818 
819 
820 BOOST_AUTO_TEST_SUITE_END()
821