1 /* $Id: proteinkmer_unit_test.cpp 631630 2021-05-19 19:43:37Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Tom Madden, NCBI
27 *
28 * File Description:
29 * Unit tests for proteinkmer library.
30 *
31 *
32 * ===========================================================================
33 */
34
35 #define NCBI_TEST_APPLICATION
36 #include <ncbi_pch.hpp>
37
38 #include <corelib/ncbi_system.hpp>
39 #include <corelib/ncbiapp.hpp>
40
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objostr.hpp>
44 #include <serial/iterator.hpp>
45
46 #include <objects/seq/Bioseq.hpp>
47 #include <objects/seqloc/Seq_id.hpp>
48 #include <objects/seqloc/Seq_loc.hpp>
49
50 #include <objtools/simple/simple_om.hpp>
51
52 #include <objmgr/object_manager.hpp>
53 #include <objmgr/scope.hpp>
54
55 #include <util/sequtil/sequtil_convert.hpp>
56 #include <algo/blast/proteinkmer/blastkmer.hpp>
57 #include <algo/blast/proteinkmer/blastkmerutils.hpp>
58 #include <algo/blast/proteinkmer/blastkmerresults.hpp>
59 #include <algo/blast/proteinkmer/blastkmeroptions.hpp>
60 #include <algo/blast/proteinkmer/blastkmerindex.hpp>
61 #include <algo/blast/proteinkmer/blastkmerindex.hpp>
62 #include <algo/blast/proteinkmer/kblastapi.hpp>
63
64 #include <algo/blast/api/objmgr_query_data.hpp>
65 #include <algo/blast/api/uniform_search.hpp>
66 #include <algo/blast/api/blast_prot_options.hpp>
67 #include <algo/blast/api/blast_advprot_options.hpp>
68
69 #include <algo/blast/blastinput/blast_input.hpp>
70
71 // This header must be included before all Boost.Test headers if there are any
72 #include <corelib/test_boost.hpp>
73
74 #include <util/random_gen.hpp>
75
76
77 USING_NCBI_SCOPE;
78 USING_SCOPE(blast);
79 USING_SCOPE(objects);
80
81 BOOST_AUTO_TEST_SUITE(proteinkmers)
82
83
BOOST_AUTO_TEST_CASE(KmerResults)84 BOOST_AUTO_TEST_CASE(KmerResults)
85 {
86 const int kNumHits = 6;
87 // nr_test has GIs from array subjid in the order given.
88 const TGi subjid[kNumHits] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
89 const double scores[kNumHits] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
90 TBlastKmerPrelimScoreVector prelim_vector;
91 for (int index=0; index<kNumHits; index++)
92 {
93 pair<uint32_t, double> retval(index, scores[index]);
94 prelim_vector.push_back(retval);
95 }
96 CRef<CSeq_id> qid(new CSeq_id("gi|129296"));
97 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
98 // values in stats are: hit_count, jd_count, jd_oid_count, oids_considered, total_matches
99 BlastKmerStats stats( 3256, 506, 253, 77623, 27);
100
101 CBlastKmerResults results(qid, prelim_vector, stats, seqdb, TQueryMessages());
102
103 CRef<CScope> scope(CSimpleOM::NewScope());
104 TSeqLocVector tsl;
105 results.GetTSL(tsl, scope);
106 int index=0;
107 for (TSeqLocVector::iterator iter=tsl.begin(); iter != tsl.end(); ++iter)
108 {
109 BOOST_REQUIRE_EQUAL((*iter).seqloc->GetId()->GetGi(), subjid[index]);
110 index++;
111 }
112
113 const TBlastKmerScoreVector kmerscores = results.GetScores();
114 index=0;
115 for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
116 {
117 CRef<CSeq_id> sid = (*iter).first;
118 BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid[index]);
119 BOOST_REQUIRE_EQUAL((*iter).second, scores[index]);
120 index++;
121 }
122
123 const BlastKmerStats& newstats = results.GetStats();
124 BOOST_REQUIRE_EQUAL(newstats.hit_count, stats.hit_count);
125 BOOST_REQUIRE_EQUAL(newstats.jd_count, stats.jd_count);
126 }
127
BOOST_AUTO_TEST_CASE(KmerResultsSet)128 BOOST_AUTO_TEST_CASE(KmerResultsSet)
129 {
130 const int kNumQueries = 2;
131 const int kNumHits1 = 6;
132 const int kNumHits2 = 2;
133 // nr_test has GIs from array subjid in the order given by subjid1.
134 const TGi subjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
135 const TGi subjid2[kNumHits2] = { 129295, 677974076};
136 const double scores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
137 const double scores2[kNumHits1] = {0.359375, 0.710938};
138 TBlastKmerPrelimScoreVector prelim_vector1;
139 TBlastKmerPrelimScoreVector prelim_vector2;
140 for (int index=0; index<kNumHits1; index++)
141 {
142 pair<uint32_t, double> retval(index, scores1[index]);
143 prelim_vector1.push_back(retval);
144 }
145 // Ordering of OID to GIs are off here.
146 for (int index=0; index<kNumHits2; index++)
147 { // Matches start at fourth sequence
148 pair<uint32_t, double> retval(index+4, scores2[index]);
149 prelim_vector2.push_back(retval);
150 }
151 CBlastKmerResultsSet::TBlastKmerPrelimScoreVectorSet vec_set;
152 vec_set.push_back(prelim_vector1);
153 vec_set.push_back(prelim_vector2);
154 CRef<CSeq_id> qid1(new CSeq_id("gi|129296"));
155 CRef<CSeq_id> qid2(new CSeq_id("gi|129295"));
156 CBlastKmerResultsSet::TQueryIdVector id_vec;
157 id_vec.push_back(qid1);
158 id_vec.push_back(qid2);
159
160 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
161
162 // values in stats are: hit_count, jd_count, jd_oid_count, oids_considered, total_matches
163 BlastKmerStats stats1( 3256, 506, 253, 77623, 27);
164 BlastKmerStats stats2( 3255, 505, 255, 77625, 25);
165 CBlastKmerResultsSet::TBlastKmerStatsVector stats_vec;
166 stats_vec.push_back(stats1);
167 stats_vec.push_back(stats2);
168 TSearchMessages errs;
169 errs.resize(kNumQueries);
170
171 CBlastKmerResultsSet result_set(id_vec, vec_set, stats_vec, seqdb, errs);
172
173 BOOST_REQUIRE_EQUAL(result_set.GetNumQueries(), kNumQueries);
174
175 CBlastKmerResults results = result_set[0];
176
177 const TBlastKmerScoreVector kmerscores = results.GetScores();
178 int index=0;
179 for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
180 {
181 CRef<CSeq_id> sid = (*iter).first;
182 BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid1[index]);
183 BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
184 index++;
185 }
186
187 const BlastKmerStats& newstats = results.GetStats();
188 BOOST_REQUIRE_EQUAL(newstats.hit_count, stats1.hit_count);
189 BOOST_REQUIRE_EQUAL(newstats.jd_count, stats1.jd_count);
190
191 // Results from second query.
192 results = result_set[1];
193 const TBlastKmerScoreVector kmerscores2 = results.GetScores();
194 index=0;
195 for (TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
196 {
197 CRef<CSeq_id> sid = (*iter).first;
198 BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid2[index]);
199 BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
200 index++;
201 }
202
203 // ID in this search set
204 CRef<CBlastKmerResults> resultsNotNULL = result_set[*qid1];
205 BOOST_REQUIRE(resultsNotNULL.IsNull() == false);
206
207 // ID not in this search set.
208 CRef<CSeq_id> qid_bogus(new CSeq_id("gi|555"));
209 CRef<CBlastKmerResults> resultsNull = result_set[*qid_bogus];
210 BOOST_REQUIRE(resultsNull.IsNull() == true);
211 }
212
BOOST_AUTO_TEST_CASE(KmerResultsSetPushBack)213 BOOST_AUTO_TEST_CASE(KmerResultsSetPushBack)
214 {
215 const int kNumQueries = 2;
216 const int kNumHits1 = 6;
217 const int kNumHits2 = 2;
218 // nr_test has GIs from array subjid in the order given by subjid1.
219 const TGi subjid1[kNumHits1] = { 129296, 385145541, 448824824, 510032768, 129295, 677974076};
220 const TGi subjid2[kNumHits2] = { 129295, 677974076};
221 const double scores1[kNumHits1] = {0.359375, 0.710938, 0.242188, 0.234375, 0.28125, 0.234375};
222 const double scores2[kNumHits1] = {0.359375, 0.710938};
223 TBlastKmerPrelimScoreVector prelim_vector1;
224 TBlastKmerPrelimScoreVector prelim_vector2;
225 for (int index=0; index<kNumHits1; index++)
226 {
227 pair<uint32_t, double> retval(index, scores1[index]);
228 prelim_vector1.push_back(retval);
229 }
230 // Ordering of OID to GIs are off here.
231 for (int index=0; index<kNumHits2; index++)
232 { // Matches start at fourth sequence
233 pair<uint32_t, double> retval(index+4, scores2[index]);
234 prelim_vector2.push_back(retval);
235 }
236
237 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
238 TQueryMessages errs = TQueryMessages();
239
240 CRef<CSeq_id> qid1(new CSeq_id("gi|129296"));
241 BlastKmerStats stats1( 3256, 506, 253, 77623, 27);
242 CRef<CBlastKmerResults> results1;
243 results1.Reset(new CBlastKmerResults(qid1, prelim_vector1, stats1, seqdb, TQueryMessages()));
244
245 CRef<CSeq_id> qid2(new CSeq_id("gi|129295"));
246 BlastKmerStats stats2( 3255, 505, 255, 77625, 25);
247 CRef<CBlastKmerResults> results2;
248 results2.Reset(new CBlastKmerResults(qid2, prelim_vector2, stats2, seqdb, errs));
249
250 CBlastKmerResultsSet result_set;
251
252 result_set.push_back(results1);
253 result_set.push_back(results2);
254
255 BOOST_REQUIRE_EQUAL(result_set.GetNumQueries(), kNumQueries);
256
257 CBlastKmerResults results = result_set[0];
258
259 const TBlastKmerScoreVector kmerscores = results.GetScores();
260 int index=0;
261 for (TBlastKmerScoreVector::const_iterator iter=kmerscores.begin(); iter != kmerscores.end(); ++iter)
262 {
263 CRef<CSeq_id> sid = (*iter).first;
264 BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid1[index]);
265 BOOST_REQUIRE_EQUAL((*iter).second, scores1[index]);
266 index++;
267 }
268
269 const BlastKmerStats& newstats = results.GetStats();
270 BOOST_REQUIRE_EQUAL(newstats.hit_count, stats1.hit_count);
271 BOOST_REQUIRE_EQUAL(newstats.jd_count, stats1.jd_count);
272
273 // Results from second query.
274 results = result_set[1];
275 const TBlastKmerScoreVector kmerscores2 = results.GetScores();
276 index=0;
277 for (TBlastKmerScoreVector::const_iterator iter=kmerscores2.begin(); iter != kmerscores2.end(); ++iter)
278 {
279 CRef<CSeq_id> sid = (*iter).first;
280 BOOST_REQUIRE_EQUAL(sid->GetGi(), subjid2[index]);
281 BOOST_REQUIRE_EQUAL((*iter).second, scores2[index]);
282 index++;
283 }
284 }
285
286 /* Test subset??
287 BOOST_AUTO_TEST_CASE(SearchWithBadQuery)
288 {
289 CBioseq bioseq;
290 CNcbiIfstream i_file("data/bad.asn");
291 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
292
293 *is >> bioseq;
294 CRef<CScope> scope(CSimpleOM::NewScope(false));
295 scope->AddBioseq(bioseq);
296 CRef<CSeq_loc> loc(new CSeq_loc);
297 loc->SetWhole().Assign(*(bioseq.GetId().front()));
298
299 unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
300 TSeqLocVector query_vector;
301 query_vector.push_back(*ssl);
302 CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
303 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
304
305 CBlastKmer kmersearch(query_vector, options, db);
306
307 CRef<CBlastKmerResultsSet> results = kmersearch.Run();
308 BOOST_REQUIRE((*results)[0].HasErrors() == false);
309 BOOST_REQUIRE((*results)[0].HasWarnings() == true);
310 }
311 */
312
BOOST_AUTO_TEST_CASE(SearchWithBadDatabase)313 BOOST_AUTO_TEST_CASE(SearchWithBadDatabase)
314 {
315 CBioseq bioseq;
316 CNcbiIfstream i_file("data/129295.stdaa");
317 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
318
319 *is >> bioseq;
320 CRef<CScope> scope(CSimpleOM::NewScope(false));
321 scope->AddBioseq(bioseq);
322 CRef<CSeq_loc> loc(new CSeq_loc);
323 loc->SetWhole().Assign(*(bioseq.GetId().front()));
324
325 SSeqLoc ssl(*loc, *scope);
326 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
327
328 string dbname("JUNK");
329 BOOST_REQUIRE_THROW(CBlastKmer kmersearch(ssl, options, dbname), CSeqDBException);
330 }
331
332
333 /* smaller test??
334 BOOST_AUTO_TEST_CASE(GIListLimitSearch)
335 {
336 CBioseq bioseq;
337 CNcbiIfstream i_file("data/129295.iupacaa");
338 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
339
340 *is >> bioseq;
341 CRef<CScope> scope(CSimpleOM::NewScope(false));
342 scope->AddBioseq(bioseq);
343 CRef<CSeq_loc> loc(new CSeq_loc);
344 loc->SetWhole().Assign(*(bioseq.GetId().front()));
345
346 unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
347 TSeqLocVector query_vector;
348 query_vector.push_back(*ssl);
349 CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
350 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
351 options->SetThresh(0.15);
352
353 CBlastKmer kmersearch(query_vector, options, db);
354 CRef<CSeqDBGiList> seqdb_gilist(new CSeqDBGiList());
355 seqdb_gilist->AddGi(3091);
356 seqdb_gilist->AddGi(129296);
357 seqdb_gilist->AddGi(448824824);
358 kmersearch.SetGiListLimit(seqdb_gilist);
359 CRef<CBlastKmerResultsSet> results = kmersearch.Run();
360
361 const TBlastKmerScoreVector& scores = (*results)[0].GetScores();
362 BOOST_REQUIRE_EQUAL(scores.size(), 2);
363 }
364 */
365
366 /*
367 BOOST_AUTO_TEST_CASE(NegativeGIListLimitSearch)
368 {
369 CBioseq bioseq;
370 CNcbiIfstream i_file("data/129295.iupacaa");
371 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
372
373 *is >> bioseq;
374 CRef<CScope> scope(CSimpleOM::NewScope(false));
375 scope->AddBioseq(bioseq);
376 CRef<CSeq_loc> loc(new CSeq_loc);
377 loc->SetWhole().Assign(*(bioseq.GetId().front()));
378
379 unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
380 TSeqLocVector query_vector;
381 query_vector.push_back(*ssl);
382 CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
383 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
384 options->SetThresh(0.15);
385
386 CBlastKmer kmersearch(query_vector, options, db);
387 CRef<CSeqDBNegativeList> seqdb_gilist(new CSeqDBNegativeList());
388 seqdb_gilist->AddGi(3091);
389 seqdb_gilist->AddGi(129296);
390 seqdb_gilist->AddGi(448824824);
391 kmersearch.SetGiListLimit(seqdb_gilist);
392 CRef<CBlastKmerResultsSet> resultSet = kmersearch.RunSearches();
393 CBlastKmerResults results = (*resultSet)[0];
394
395 const TBlastKmerScoreVector& scores = results.GetScores();
396 BOOST_REQUIRE_EQUAL(scores.size(), 3);
397 }
398 */
399
s_GetRandomNumbers(uint32_t * a,uint32_t * b,int numHashes)400 void s_GetRandomNumbers(uint32_t* a, uint32_t* b, int numHashes)
401 {
402 CRandom random(1); // Always have the same random numbers.
403 for(int i=0;i<numHashes;i++)
404 {
405 do
406 {
407 a[i]=random.GetRand();
408 }
409 while (a[i] == 0);
410
411 b[i]=random.GetRand();
412 }
413 }
414
415
BOOST_AUTO_TEST_CASE(CheckQueryHashes)416 BOOST_AUTO_TEST_CASE(CheckQueryHashes)
417 {
418 string queryseq_eaa =
419 "MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV"
420 "HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD"
421 "LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
422
423 string queryseq_stdaa;
424 CSeqConvert::Convert(queryseq_eaa, CSeqUtil::e_Ncbieaa, 0, queryseq_eaa.length(),
425 queryseq_stdaa, CSeqUtil::e_Ncbistdaa);
426
427 const int kNumHashes=128;
428 const int kDoSeg=0;
429 const int kKmerNum=5;
430 const int kAlphabet=0; // 15 letters
431 uint32_t a[kNumHashes];
432 uint32_t b[kNumHashes];
433 const int kChunkSize=150;
434
435 s_GetRandomNumbers(a, b, kNumHashes);
436 vector < vector <uint32_t> > seq_hash;
437
438 minhash_query(queryseq_stdaa, seq_hash, kNumHashes, a, b, kDoSeg, kKmerNum, kAlphabet, kChunkSize);
439
440 BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
441 BOOST_REQUIRE_EQUAL(seq_hash[0].size(), kNumHashes);
442 BOOST_REQUIRE_EQUAL(seq_hash[0][0], 529895);
443 BOOST_REQUIRE_EQUAL(seq_hash[0][1], 798115);
444 BOOST_REQUIRE_EQUAL(seq_hash[0][63], 90979);
445 BOOST_REQUIRE_EQUAL(seq_hash[0][83], 336201);
446
447 // Expected (correct) values.
448 const int lsh_hash_length = 13;
449 const int lsh_hash_vals[lsh_hash_length] = { 973119,1097197,1157729,1681152,1913970,1933659,2018075,2123893,2355301,2800673,2940688,2941967,3535701};
450
451 const int kRowsPerBand=2;
452 const int kNumBands = kNumHashes/kRowsPerBand;
453 vector< vector <uint32_t> > lsh_hash_vec;
454 get_LSH_hashes(seq_hash, lsh_hash_vec, kNumBands, kRowsPerBand);
455 int index=0;
456 int numChunks = lsh_hash_vec.size();
457 for (int i=0; i<numChunks && index<lsh_hash_length; i++)
458 {
459 for(vector<uint32_t>::iterator iter=lsh_hash_vec[i].begin(); iter != lsh_hash_vec[i].end(); ++iter)
460 {
461 BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
462 index++;
463 if (index == lsh_hash_length)
464 break;
465 }
466 }
467 }
468
BOOST_AUTO_TEST_CASE(CheckQueryHashesVersion3)469 BOOST_AUTO_TEST_CASE(CheckQueryHashesVersion3)
470 {
471 string queryseq_eaa =
472 "MDSISVTNAKFCFDVFNEMKVHHVNENILYCPLSILTALAMVYLGARGNTESQMKKVLHFDSITGAGSTTDSQCGSSEYV"
473 "HNLFKELLSEITRPNATYSLEIADKLYVDKTFSVLPEYLSCARKFYTGGVEEVNFKTAAEEARQLINSWVEKETNGQIKD"
474 "LLVSSSIDFGTTMVFINTIYFKGIWKIAFNTEDTREMPFSMTKEESKPVQMMCMNNSFNVATLPAEKMKILELPYASGDL";
475
476 string queryseq_stdaa;
477 CSeqConvert::Convert(queryseq_eaa, CSeqUtil::e_Ncbieaa, 0, queryseq_eaa.length(),
478 queryseq_stdaa, CSeqUtil::e_Ncbistdaa);
479
480 const int kNumHashes=32;
481 const int kKmerNum=5;
482 const int kAlphabet=0; // 15 letters
483 vector<int> badMers;
484 const int kChunkSize=150;
485
486 vector < vector <uint32_t> > seq_hash;
487
488 minhash_query2(queryseq_stdaa, seq_hash, kKmerNum, kNumHashes, kAlphabet, badMers, kChunkSize);
489
490 BOOST_REQUIRE_EQUAL(seq_hash.size(), 2);
491 BOOST_REQUIRE_EQUAL(seq_hash[0].size(), kNumHashes);
492 BOOST_REQUIRE_EQUAL(seq_hash[0][0], 2683052);
493 BOOST_REQUIRE_EQUAL(seq_hash[0][1], 26519505);
494 BOOST_REQUIRE_EQUAL(seq_hash[0][2], 45619224);
495 BOOST_REQUIRE_EQUAL(seq_hash[0][15], 396863844);
496 // Values are ordered from smallest to largest
497 BOOST_REQUIRE(seq_hash[1][0] < seq_hash[0][1]);
498 BOOST_REQUIRE(seq_hash[1][1] < seq_hash[0][2]);
499 BOOST_REQUIRE(seq_hash[1][3] < seq_hash[0][kNumHashes-1]);
500
501 // Expected (correct) values.
502 const int lsh_hash_length = 13;
503 const int lsh_hash_vals[lsh_hash_length] = { 700168,1293774,1377419,1712432,1819660,2314660,2484152,2944352,2951476,3273866,3625878,3709806,3837843};
504
505 const int kRowsPerBand=2;
506 vector< vector <uint32_t> > lsh_hash_vec;
507 get_LSH_hashes5(seq_hash, lsh_hash_vec, kNumHashes, kRowsPerBand);
508 int index=0;
509 int numChunks = lsh_hash_vec.size();
510 for (int i=0; i<numChunks && index<lsh_hash_length; i++)
511 {
512 for(vector<uint32_t>::iterator iter=lsh_hash_vec[i].begin(); iter != lsh_hash_vec[i].end(); ++iter)
513 {
514 BOOST_REQUIRE_EQUAL(*iter, lsh_hash_vals[index]);
515 index++;
516 if (index == lsh_hash_length)
517 break;
518 }
519 }
520 }
521
s_GetNumLSHHits(uint64_t * lsh,int lshSize)522 int s_GetNumLSHHits(uint64_t* lsh, int lshSize)
523 {
524 int count=0;
525 for (int index=0; index<lshSize; index++)
526 if (lsh[index] != 0)
527 count++;
528
529 return count;
530 }
531
BOOST_AUTO_TEST_CASE(BuildIndex)532 BOOST_AUTO_TEST_CASE(BuildIndex)
533 {
534
535
536 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
537
538 const int kHashFct=32;
539 const int kKmerSize=5;
540
541 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
542
543 string index_name("nr_test");
544 CFileDeleteAtExit::Add(index_name + ".pki");
545 CFileDeleteAtExit::Add(index_name + ".pkd");
546 build_index.Build();
547
548 CMinHashFile mhfile(index_name);
549
550 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
551 BOOST_REQUIRE_EQUAL(2, mhfile.GetDataWidth());
552 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
553
554 uint64_t* lsh_array = mhfile.GetLSHArray();
555 int lsh_size = mhfile.GetLSHSize();
556 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
557
558 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
559 BOOST_REQUIRE_EQUAL(187, lsh_counts);
560 }
561
BOOST_AUTO_TEST_CASE(BuildIndexRepeats)562 BOOST_AUTO_TEST_CASE(BuildIndexRepeats)
563 {
564 CRef<CSeqDB> seqdb(new CSeqDB("data/XP_001468867", CSeqDB::eProtein));
565
566 const int kHashFct=32;
567 const int kKmerSize=5;
568
569 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
570
571 string index_name("XP_001468867");
572 CFileDeleteAtExit::Add(index_name + ".pki");
573 CFileDeleteAtExit::Add(index_name + ".pkd");
574 build_index.Build();
575
576 CMinHashFile mhfile(index_name);
577 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
578 BOOST_REQUIRE_EQUAL(2, mhfile.GetDataWidth());
579 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
580
581 uint64_t* lsh_array = mhfile.GetLSHArray();
582 int lsh_size = mhfile.GetLSHSize();
583 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
584
585 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
586 BOOST_REQUIRE_EQUAL(213, lsh_counts);
587
588 }
589
BOOST_AUTO_TEST_CASE(BuildIndexNotValidSequence)590 BOOST_AUTO_TEST_CASE(BuildIndexNotValidSequence)
591 {
592 CRef<CSeqDB> seqdb(new CSeqDB("data/manyXs", CSeqDB::eProtein));
593
594 const int kHashFct=32;
595 const int kKmerSize=5;
596
597 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct);
598
599 string index_name("manyXs");
600 CFileDeleteAtExit::Add(index_name + ".pki");
601 CFileDeleteAtExit::Add(index_name + ".pkd");
602 build_index.Build();
603 // data file is empty/missing. Should there be an exception for that?
604
605 BOOST_REQUIRE_THROW(CMinHashFile mhfile(index_name), CMinHashException);
606 }
607
BOOST_AUTO_TEST_CASE(BuildIndexWidth4Kmer4)608 BOOST_AUTO_TEST_CASE(BuildIndexWidth4Kmer4)
609 {
610
611
612 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
613
614 const int kHashFct=32;
615 const int kKmerSize=4;
616 const int kWidth=4;
617
618 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth);
619
620 string index_name("nr_test");
621 CFileDeleteAtExit::Add(index_name + ".pki");
622 CFileDeleteAtExit::Add(index_name + ".pkd");
623 build_index.Build();
624
625 CMinHashFile mhfile(index_name);
626 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
627 BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
628 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
629
630 uint64_t* lsh_array = mhfile.GetLSHArray();
631 int lsh_size = mhfile.GetLSHSize();
632 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
633
634 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
635 BOOST_REQUIRE_EQUAL(172, lsh_counts);
636 }
637
BOOST_AUTO_TEST_CASE(BuildIndexFewerBands)638 BOOST_AUTO_TEST_CASE(BuildIndexFewerBands)
639 {
640
641 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
642
643 const int kHashFct=64;
644 const int kKmerSize=4;
645 const int kWidth=2;
646
647 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth);
648
649 string index_name("nr_test");
650 CFileDeleteAtExit::Add(index_name + ".pki");
651 CFileDeleteAtExit::Add(index_name + ".pkd");
652 build_index.Build();
653
654 CMinHashFile mhfile(index_name);
655 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
656 BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
657 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
658
659 uint64_t* lsh_array = mhfile.GetLSHArray();
660 int lsh_size = mhfile.GetLSHSize();
661 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
662
663 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
664 BOOST_REQUIRE_EQUAL(339, lsh_counts);
665 }
666
BOOST_AUTO_TEST_CASE(BuildIndex10letterAlphabet)667 BOOST_AUTO_TEST_CASE(BuildIndex10letterAlphabet)
668 {
669
670 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
671
672 const int kHashFct=32;
673 const int kKmerSize=5;
674 const int kWidth=2;
675 const int kAlphabet=1;
676
677 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, 0, kWidth, kAlphabet);
678
679 string index_name("nr_test");
680 CFileDeleteAtExit::Add(index_name + ".pki");
681 CFileDeleteAtExit::Add(index_name + ".pkd");
682 build_index.Build();
683
684 CMinHashFile mhfile(index_name);
685 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
686 BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
687 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
688 BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.GetAlphabet());
689
690 uint64_t* lsh_array = mhfile.GetLSHArray();
691 int lsh_size = mhfile.GetLSHSize();
692 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
693
694 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
695 BOOST_REQUIRE_EQUAL(173, lsh_counts);
696 }
697
BOOST_AUTO_TEST_CASE(BuildIndex10letterVersion3)698 BOOST_AUTO_TEST_CASE(BuildIndex10letterVersion3)
699 {
700
701 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
702
703 const int kHashFct=32;
704 const int kKmerSize=5;
705 const int kSamples=30;
706 const int kWidth=2;
707 const int kAlphabet=1;
708 const int kVersion=3;
709 const int kLSHStart=312;
710
711 CBlastKmerBuildIndex build_index(seqdb, kKmerSize, kHashFct, kSamples, kWidth, kAlphabet, kVersion);
712
713 string index_name("nr_test");
714 CFileDeleteAtExit::Add(index_name + ".pki");
715 CFileDeleteAtExit::Add(index_name + ".pkd");
716 build_index.Build();
717
718 CMinHashFile mhfile(index_name);
719 BOOST_REQUIRE_EQUAL(kHashFct, mhfile.GetNumHashes());
720 BOOST_REQUIRE_EQUAL(kWidth, mhfile.GetDataWidth());
721 BOOST_REQUIRE_EQUAL(kKmerSize, mhfile.GetKmerSize());
722 BOOST_REQUIRE_EQUAL(kAlphabet, mhfile.GetAlphabet());
723 BOOST_REQUIRE_EQUAL(kVersion, mhfile.GetVersion());
724 BOOST_REQUIRE_EQUAL(kLSHStart, mhfile.GetLSHStart());
725
726 uint64_t* lsh_array = mhfile.GetLSHArray();
727 int lsh_size = mhfile.GetLSHSize();
728 BOOST_REQUIRE_EQUAL(0x1000001, lsh_size);
729
730 int lsh_counts = s_GetNumLSHHits(lsh_array, lsh_size-1);
731 BOOST_REQUIRE_EQUAL(563, lsh_counts);
732
733 int chunkSize = mhfile.GetChunkSize();
734 BOOST_REQUIRE_EQUAL(150, chunkSize);
735 }
736
BOOST_AUTO_TEST_CASE(CheckEmptyIndexName)737 BOOST_AUTO_TEST_CASE(CheckEmptyIndexName)
738 {
739
740 string index_name="";
741 BOOST_REQUIRE_THROW(CMinHashFile mhfile(index_name), CMinHashException);
742 }
743
744
BOOST_AUTO_TEST_CASE(CheckNoIndex)745 BOOST_AUTO_TEST_CASE(CheckNoIndex)
746 {
747 CBioseq bioseq;
748 CNcbiIfstream i_file("data/129295.ncbieaa");
749 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
750
751 *is >> bioseq;
752 CRef<CScope> scope(CSimpleOM::NewScope(false));
753 scope->AddBioseq(bioseq);
754 CRef<CSeq_loc> loc(new CSeq_loc);
755 loc->SetWhole().Assign(*(bioseq.GetId().front()));
756
757 unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
758 TSeqLocVector query_vector;
759 query_vector.push_back(*ssl);
760 CRef<CSeqDB> seqdb(new CSeqDB("data/XP_001468867", CSeqDB::eProtein));
761 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
762
763 CBlastKmer kmersearch(query_vector, options, seqdb);
764 BOOST_REQUIRE_THROW(kmersearch.Run(), CFileException);
765 }
766
BOOST_AUTO_TEST_CASE(CheckOptionValidation)767 BOOST_AUTO_TEST_CASE(CheckOptionValidation)
768 {
769 CRef<CBlastKmerOptions> opts(new CBlastKmerOptions());
770
771 double myThresh=0.1;
772 opts->SetThresh(myThresh);
773 BOOST_REQUIRE_EQUAL(opts->GetThresh(), myThresh);
774
775 int hits=373;
776 opts->SetMinHits(hits);
777 BOOST_REQUIRE_EQUAL(opts->GetMinHits(), hits);
778
779 int targetSeqs=234;
780 opts->SetNumTargetSeqs(targetSeqs);
781 BOOST_REQUIRE_EQUAL(opts->GetNumTargetSeqs(), targetSeqs);
782
783 BOOST_REQUIRE_EQUAL(opts->Validate(), true);
784
785 opts->SetMinHits(0);
786 BOOST_REQUIRE_EQUAL(opts->Validate(), true);
787
788 opts->SetThresh(-0.707);
789 BOOST_REQUIRE_EQUAL(opts->Validate(), false);
790
791 opts->SetMinHits(-1);
792 BOOST_REQUIRE_EQUAL(opts->Validate(), false);
793 }
794
BOOST_AUTO_TEST_CASE(BadOptionsThrow)795 BOOST_AUTO_TEST_CASE(BadOptionsThrow)
796 {
797
798 CRef<CSeqDB> seqdb(new CSeqDB("data/nr_test", CSeqDB::eProtein));
799
800 CBioseq bioseq;
801 CNcbiIfstream i_file("data/129295.stdaa");
802 unique_ptr<CObjectIStream> is(CObjectIStream::Open(eSerial_AsnText, i_file));
803
804 *is >> bioseq;
805 CRef<CScope> scope(CSimpleOM::NewScope(false));
806 scope->AddBioseq(bioseq);
807 CRef<CSeq_loc> loc(new CSeq_loc);
808 loc->SetWhole().Assign(*(bioseq.GetId().front()));
809
810 unique_ptr<SSeqLoc> ssl(new SSeqLoc(*loc, *scope));
811 TSeqLocVector query_vector;
812 query_vector.push_back(*ssl);
813 CRef<CSeqDB> db(new CSeqDB("data/nr_test", CSeqDB::eProtein));
814 CRef<CBlastKmerOptions> options(new CBlastKmerOptions());
815 options->SetThresh(1.3); // Value should be one or less.
816 BOOST_REQUIRE_THROW(CBlastKmer kmersearch(query_vector, options, db), CException);
817 }
818
819
820 BOOST_AUTO_TEST_SUITE_END()
821