1 /* $Id: bl2seq_unit_test.cpp 607143 2020-04-30 13:01:21Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Christiam Camacho
27 *
28 */
29
30 /** @file blast_unit_test.cpp
31 * Unit tests for the CBl2Seq class
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/test_boost.hpp>
36 #include <algo/blast/api/bl2seq.hpp>
37 #include <objects/seqalign/Seq_align.hpp>
38 #include <objects/seqalign/Seq_align_set.hpp>
39 #include <objects/seqalign/Std_seg.hpp>
40 #include <objects/seqalign/Dense_seg.hpp>
41 #include <objects/seqalign/Score.hpp>
42 #include <objects/general/Object_id.hpp>
43
44 #include <serial/serial.hpp>
45 #include <serial/iterator.hpp>
46 #include <serial/objostr.hpp>
47
48 #include <algo/blast/api/tblastn_options.hpp>
49 #include <algo/blast/format/blastfmtutil.hpp>
50
51 #include <algo/blast/api/blast_options_handle.hpp>
52 #include <algo/blast/api/blast_prot_options.hpp>
53 #include <algo/blast/api/blastx_options.hpp>
54 #include <algo/blast/api/tblastn_options.hpp>
55 #include <algo/blast/api/blast_nucl_options.hpp>
56 #include <algo/blast/api/disc_nucl_options.hpp>
57 #include <algo/blast/api/local_blast.hpp> // for CLocalBlast
58 #include <algo/blast/api/local_db_adapter.hpp> // for CLocalDbAdapter
59 #include <algo/blast/api/objmgr_query_data.hpp> // for CObjMgr_QueryFactory
60 #include <algo/blast/blastinput/blast_input.hpp>
61 #include <algo/blast/blastinput/blast_fasta_input.hpp>
62 #include <algo/blast/api/windowmask_filter.hpp>
63
64 #include <objtools/data_loaders/genbank/gbloader.hpp>
65 #include <objtools/simple/simple_om.hpp> // for CSimpleOM
66 #include <objtools/readers/fasta.hpp> // for CFastaReader
67 #include <objmgr/util/seq_loc_util.hpp>
68 #include <objmgr/util/sequence.hpp>
69
70 #include "test_objmgr.hpp"
71
72 #include <util/random_gen.hpp>
73
74 #include <corelib/test_boost.hpp>
75
76 #ifndef SKIP_DOXYGEN_PROCESSING
77
78 USING_NCBI_SCOPE;
79 USING_SCOPE(blast);
80 USING_SCOPE(objects);
81
BOOST_AUTO_TEST_SUITE(bl2seq)82 BOOST_AUTO_TEST_SUITE(bl2seq)
83
84 CRef<CSeq_loc> s_MakePackedInt2(CRef<CSeq_id> id, vector<TSeqRange>& range_vec)
85 {
86 CRef<CSeq_loc> retval(new CSeq_loc());
87 NON_CONST_ITERATE(vector<TSeqRange>, itr, range_vec)
88 {
89 retval->SetPacked_int().AddInterval(*id, (*itr).GetFrom(), (*itr).GetTo());
90 }
91 return retval;
92 }
93
BOOST_AUTO_TEST_CASE(NucleotideMultipleSeqLocs1)94 BOOST_AUTO_TEST_CASE(NucleotideMultipleSeqLocs1) {
95
96 CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 224514841));
97 CRef<CScope> scope(CSimpleOM::NewScope());
98 TSeqLocVector query;
99
100 vector<TSeqRange> range_vec;
101
102 range_vec.push_back(TSeqRange(73011288, 73011591));
103 range_vec.push_back(TSeqRange(73080052, 73080223));
104 range_vec.push_back(TSeqRange(73096483, 73096589));
105 range_vec.push_back(TSeqRange(73097765, 73097864));
106 range_vec.push_back(TSeqRange(73113762, 73113809));
107 range_vec.push_back(TSeqRange(73119266, 73119340));
108 range_vec.push_back(TSeqRange(73168955, 73169141));
109 range_vec.push_back(TSeqRange(73178294, 73178376));
110 range_vec.push_back(TSeqRange(73220818, 73220920));
111 range_vec.push_back(TSeqRange(73223091, 73223365));
112
113 CRef<CSeq_loc> temp_loc1 = s_MakePackedInt2(id, range_vec);
114 ofstream o1("temploc1.out.asn");
115 o1 << MSerial_AsnText << *temp_loc1 ;
116 query.push_back(SSeqLoc(temp_loc1, scope));
117
118 TSeqLocVector subjects;
119 CRef<CSeq_id> sid1(new CSeq_id(CSeq_id::e_Gi, 262050671));
120 CRef<CSeq_loc> ssl1(new CSeq_loc());
121 ssl1->SetWhole(*sid1);
122 SSeqLoc subj_seqloc(ssl1, scope);
123 subjects.push_back(subj_seqloc);
124
125 CBl2Seq bl2seq(query, subjects, eMegablast);
126 TSeqAlignVector sav(bl2seq.Run());
127 BOOST_CHECK_EQUAL(10, (int) sav.size());
128 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
129 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
130 int num_ident = 0;
131 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
132 BOOST_CHECK_EQUAL((int)303, num_ident);
133 BOOST_CHECK_EQUAL((TSeqPos)73011288, sar->GetSeqStart(0));
134 BOOST_CHECK_EQUAL((TSeqPos)1, sar->GetSeqStart(1));
135
136 #if 1
137 ofstream o("seqalign.out.asn");
138 ITERATE(TSeqAlignVector, v, sav) {
139 o << MSerial_AsnText << **v ;
140 }
141 o.close();
142 #endif
143 }
144
145
BOOST_AUTO_TEST_CASE(ProteinBlastInvalidSeqIdSelfHit)146 BOOST_AUTO_TEST_CASE(ProteinBlastInvalidSeqIdSelfHit)
147 {
148 CRef<CSeq_loc> loc(new CSeq_loc());
149 loc->SetWhole().SetGi(INVALID_GI);
150
151 CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
152 scope->AddDefaults();
153 SSeqLoc query(loc, scope);
154
155 TSeqLocVector subjects;
156 {
157 CRef<CSeq_loc> local_loc(new CSeq_loc());
158 local_loc->SetWhole().SetGi(INVALID_GI);
159
160 CScope* local_scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
161 local_scope->AddDefaults();
162 subjects.push_back(SSeqLoc(local_loc, local_scope));
163 }
164
165 // BLAST by concatenating all queries
166 CBl2Seq blaster4all(query, subjects, eBlastp);
167 TSeqAlignVector sas_v;
168 BOOST_CHECK_THROW(sas_v = blaster4all.Run(), CBlastException);
169 }
170
171 enum EBl2seqTest {
172 eBlastp_129295_129295 = 0,
173 eBlastn_555_555,
174 eMegablast_555_555,
175 eDiscMegablast_555_555,
176 eBlastx_555_129295,
177 eTblastn_129295_555,
178 eTblastn_129295_555_large_word,
179 eTblastx_555_555,
180 eTblastx_many_hits,
181 eBlastp_129295_7662354,
182 eBlastn_555_3090,
183 eBlastp_multi_q,
184 eBlastn_multi_q,
185 eBlastp_multi_q_s,
186 eTblastn_oof,
187 eBlastx_oof,
188 eDiscMegablast_U02544_U61969,
189 eMegablast_chrom_mrna
190 };
191
192 /* The following functions are used to test the functionality to interrupt
193 * CBl2Seq runs */
194
195 /// Returns true so that the processing stops upon the first invocation of this
196 /// callback
interrupt_immediately(SBlastProgress *)197 extern "C" Boolean interrupt_immediately(SBlastProgress* /*progress_info*/)
198 {
199 return TRUE;
200 }
201
202 /// Returns false so that the processing never stops in spite of a callback
203 /// function to interrupt the process is provided
do_not_interrupt(SBlastProgress *)204 extern "C" Boolean do_not_interrupt(SBlastProgress* /*progress_info*/)
205 {
206 return FALSE;
207 }
208
209 /// This callback never interrupts the BLAST search, its only purpose is to
210 /// count the number of times this is invoked for the given input. Also to be
211 /// used in CBl2SeqTest::testInterruptXExitAtRandom.
callback_counter(SBlastProgress * progress_info)212 extern "C" Boolean callback_counter(SBlastProgress* progress_info)
213 {
214 int& counter = *reinterpret_cast<int*>(progress_info->user_data);
215 counter++;
216 return FALSE;
217 }
218
219 /// This callback interrupts the BLAST search after the callback has been
220 /// executed the requested number of times in the pair's second member.
221 /// This is used in CBl2SeqTest::testInterruptXExitAtRandom.
interrupt_at_random(SBlastProgress * progress_info)222 extern "C" Boolean interrupt_at_random(SBlastProgress* progress_info)
223 {
224 pair<int, int>& progress_pair =
225 *reinterpret_cast< pair<int, int>* >(progress_info->user_data);
226
227 if (++progress_pair.first == progress_pair.second) {
228 return TRUE;
229 } else {
230 return FALSE;
231 }
232 }
233
234 /// The interruption occurs after 3 invokations of this callback
interrupt_after3calls(SBlastProgress *)235 extern "C" Boolean interrupt_after3calls(SBlastProgress* /*progress_info*/)
236 {
237 static int num_calls = 0;
238 if (++num_calls < 3) {
239 return FALSE;
240 } else {
241 return TRUE;
242 }
243 }
244
245 /// The interruption occurs after starting the traceback stage
interrupt_on_traceback(SBlastProgress * progress_info)246 extern "C" Boolean interrupt_on_traceback(SBlastProgress* progress_info)
247 {
248 if (progress_info->stage == eTracebackSearch) {
249 return TRUE;
250 } else {
251 return FALSE;
252 }
253 }
254
testRawCutoffs(CBl2Seq & blaster,EProgram program,EBl2seqTest test_id)255 void testRawCutoffs(CBl2Seq& blaster, EProgram program,
256 EBl2seqTest test_id)
257 {
258 BlastRawCutoffs* raw_cutoffs =
259 blaster.GetDiagnostics()->cutoffs;
260 int x_drop_ungapped;
261 int gap_trigger;
262
263 if (program == eBlastn || program == eDiscMegablast) {
264 x_drop_ungapped = 22;
265 gap_trigger = 16;
266 } else if (program == eMegablast) {
267 x_drop_ungapped = 8;
268 gap_trigger = 8;
269 } else {
270 x_drop_ungapped = 16;
271 gap_trigger = 41;
272 }
273
274 switch (test_id) {
275 case eBlastn_555_3090:
276 x_drop_ungapped = 22;
277 gap_trigger = 18;
278 break;
279 case eBlastn_multi_q:
280 x_drop_ungapped = 22;
281 gap_trigger = 18;
282 break;
283 case eMegablast_chrom_mrna:
284 x_drop_ungapped = 7;
285 gap_trigger = 7;
286 break;
287 case eDiscMegablast_U02544_U61969:
288 x_drop_ungapped = 22;
289 gap_trigger = 20;
290 break;
291 case eBlastp_multi_q:
292 gap_trigger = 23;
293 break;
294 case eBlastp_multi_q_s:
295 gap_trigger = 19;
296 break;
297 case eBlastp_129295_129295:
298 gap_trigger = 19; break;
299 case eTblastn_129295_555:
300 gap_trigger = 23; break;
301 case eTblastn_129295_555_large_word:
302 gap_trigger = 23; break;
303 case eBlastp_129295_7662354:
304 gap_trigger = 23; break;
305 case eBlastx_555_129295:
306 gap_trigger = 19; break;
307 case eTblastn_oof:
308 gap_trigger = 43;
309 default:
310 break;
311 }
312
313 switch (program) {
314 case eBlastn: case eDiscMegablast:
315 BOOST_CHECK_EQUAL(x_drop_ungapped,
316 raw_cutoffs->x_drop_ungapped);
317 BOOST_CHECK_EQUAL(33, raw_cutoffs->x_drop_gap);
318 // CC changed 08/07/08
319 //BOOST_CHECK_EQUAL(55, raw_cutoffs->x_drop_gap_final);
320 BOOST_CHECK_EQUAL(110, raw_cutoffs->x_drop_gap_final);
321 BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
322 break;
323 case eMegablast:
324 BOOST_CHECK_EQUAL(x_drop_ungapped,
325 raw_cutoffs->x_drop_ungapped);
326 BOOST_CHECK_EQUAL(13, raw_cutoffs->x_drop_gap);
327 // CC changed 08/07/08
328 //BOOST_CHECK_EQUAL(27, raw_cutoffs->x_drop_gap_final);
329 BOOST_CHECK_EQUAL(54, raw_cutoffs->x_drop_gap_final);
330 BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
331 break;
332 case eBlastp: case eBlastx: case eTblastn:
333 BOOST_CHECK_EQUAL(38, raw_cutoffs->x_drop_gap);
334 BOOST_CHECK_EQUAL(64, raw_cutoffs->x_drop_gap_final);
335 BOOST_CHECK_EQUAL(gap_trigger, raw_cutoffs->ungapped_cutoff);
336 /* No break intentional: next test is valid for all the above
337 programs */
338 case eTblastx:
339 BOOST_CHECK_EQUAL(x_drop_ungapped,
340 raw_cutoffs->x_drop_ungapped);
341 break;
342 default: break;
343 }
344 }
345
testResultAlignments(size_t num_queries,size_t num_subjects,TSeqAlignVector result_alnvec)346 void testResultAlignments(size_t num_queries,
347 size_t num_subjects,
348 TSeqAlignVector result_alnvec)
349 {
350 size_t num_total_alns = num_queries * num_subjects;
351
352 // test the number of resulting alignments
353 BOOST_REQUIRE_EQUAL(result_alnvec.size(), num_total_alns);
354
355 // test the correct ordering of resulting alignments
356 // (q1 s1 q1 s2 ... q2 s1 q2 s2 ...)
357
358 CConstRef<CSeq_id> id_query, id_prev_query;
359 CConstRef<CSeq_id> id_subject;
360 vector< CConstRef<CSeq_id> > id_prev_subjects;
361 id_prev_subjects.resize(num_subjects);
362
363 bool prev_query_available = false;
364 vector<bool> prev_subjects_available(num_subjects, false);
365
366 /* DEBUG OUTPUT
367 cerr << "................................................" << endl;
368 for (size_t i = 0; i < result_alnvec.size(); i++)
369 cerr << "\n<" << i << ">\n"
370 << MSerial_AsnText << result_alnvec[i].GetObject() << endl;
371 cerr << "................................................" << endl;
372 ------------ */
373
374 for (size_t i_query = 0; i_query < num_queries; i_query++)
375 {
376 prev_query_available = false;
377 for (size_t i_subject = 0; i_subject < num_subjects; i_subject++)
378 {
379 size_t i_lin_index = i_query * num_subjects + i_subject;
380 CRef<CSeq_align_set> aln_set = result_alnvec[i_lin_index];
381
382 // test if the alignment set is available (even if empty)
383 BOOST_REQUIRE(aln_set.NotNull());
384
385 // if the alignment set is not empty, take the first alignment
386 // and see if the ID's are in correct order
387 if (aln_set->Get().size() > 0)
388 {
389 CRef<CSeq_align> aln = aln_set->Get().front();
390 id_query.Reset(&(aln->GetSeq_id(0)));
391 id_subject.Reset(&(aln->GetSeq_id(1)));
392
393 // check if the query id was the same
394 // for the previous subject
395 if (i_subject > 0 &&
396 prev_query_available)
397 {
398 BOOST_REQUIRE(
399 id_query->Match(
400 id_prev_query.GetObject()));
401 }
402
403 // check if the subject id was the same
404 // on the same position for the previous query
405 if (i_query > 0 &&
406 prev_subjects_available[i_subject])
407 {
408 BOOST_REQUIRE(
409 id_subject->Match(
410 id_prev_subjects[i_subject].GetObject()));
411 }
412
413 // update the entry in previous subjects vector
414 prev_subjects_available[i_subject] = true;
415 id_prev_subjects[i_subject] = id_subject;
416
417 // update the previous query entry
418 prev_query_available = true;
419 id_prev_query = id_query;
420 }
421 }
422 }
423 }
424
testBlastHitCounts(CBl2Seq & blaster,EBl2seqTest test_id)425 void testBlastHitCounts(CBl2Seq& blaster, EBl2seqTest test_id)
426 {
427 BlastUngappedStats* ungapped_stats =
428 blaster.GetDiagnostics()->ungapped_stat;
429 BlastGappedStats* gapped_stats =
430 blaster.GetDiagnostics()->gapped_stat;
431
432 switch (test_id) {
433 case eBlastp_129295_129295:
434 BOOST_CHECK_EQUAL(314, (int)ungapped_stats->lookup_hits);
435 BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
436 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
437 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
438 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
439 break;
440 case eBlastn_555_555:
441 BOOST_CHECK_EQUAL(157, (int)ungapped_stats->lookup_hits);
442 BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
443 BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
444 BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
445 BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
446 break;
447 case eMegablast_555_555:
448 BOOST_CHECK_EQUAL(30, (int)ungapped_stats->lookup_hits);
449 BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
450 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
451 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
452 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
453 break;
454 case eDiscMegablast_555_555:
455 BOOST_CHECK_EQUAL(582, (int)ungapped_stats->lookup_hits);
456 // CC changed 08/07/08
457 //BOOST_CHECK_EQUAL(32, ungapped_stats->init_extends);
458 BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
459 // CC changed 08/07/08
460 //BOOST_CHECK_EQUAL(32, ungapped_stats->good_init_extends);
461 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
462 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
463 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
464 break;
465 case eBlastx_555_129295:
466 BOOST_CHECK_EQUAL(282, (int)ungapped_stats->lookup_hits);
467 BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
468 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
469 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
470 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
471 break;
472 case eTblastn_129295_555:
473 BOOST_CHECK_EQUAL(157, (int)ungapped_stats->lookup_hits);
474 BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
475 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
476 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
477 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
478 break;
479 case eTblastn_129295_555_large_word:
480 BOOST_CHECK_EQUAL(5, (int)ungapped_stats->lookup_hits);
481 BOOST_CHECK_EQUAL(4, ungapped_stats->init_extends);
482 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
483 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
484 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
485 break;
486 case eTblastx_555_555:
487 BOOST_CHECK_EQUAL(2590, (int)ungapped_stats->lookup_hits);
488 BOOST_CHECK_EQUAL(57, ungapped_stats->init_extends);
489 BOOST_CHECK_EQUAL(41, ungapped_stats->good_init_extends);
490 break;
491 case eTblastx_many_hits:
492 BOOST_CHECK_EQUAL(18587, (int)ungapped_stats->lookup_hits);
493 BOOST_CHECK_EQUAL(348, ungapped_stats->init_extends);
494 BOOST_CHECK_EQUAL(66, ungapped_stats->good_init_extends);
495 break;
496 case eBlastp_129295_7662354:
497 BOOST_CHECK_EQUAL(210, (int)ungapped_stats->lookup_hits);
498 BOOST_CHECK_EQUAL(10, ungapped_stats->init_extends);
499 BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
500 BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
501 BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
502 break;
503 case eBlastn_555_3090:
504 BOOST_CHECK_EQUAL(15, (int)ungapped_stats->lookup_hits);
505 BOOST_CHECK_EQUAL(2, ungapped_stats->init_extends);
506 BOOST_CHECK_EQUAL(2, ungapped_stats->good_init_extends);
507 BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
508 BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
509 break;
510 case eBlastp_multi_q:
511 BOOST_CHECK_EQUAL(2129, (int)ungapped_stats->lookup_hits);
512 BOOST_CHECK_EQUAL(76, ungapped_stats->init_extends);
513 BOOST_CHECK_EQUAL(14, ungapped_stats->good_init_extends);
514 BOOST_CHECK_EQUAL(8, gapped_stats->extensions);
515 BOOST_CHECK_EQUAL(8, gapped_stats->good_extensions);
516 break;
517 case eBlastn_multi_q:
518 BOOST_CHECK_EQUAL(963, (int)ungapped_stats->lookup_hits);
519 BOOST_CHECK_EQUAL(13, ungapped_stats->init_extends);
520 BOOST_CHECK_EQUAL(13, ungapped_stats->good_init_extends);
521 BOOST_CHECK_EQUAL(11, gapped_stats->extensions);
522 BOOST_CHECK_EQUAL(11, gapped_stats->good_extensions);
523 break;
524 case eBlastp_multi_q_s:
525 #if 0
526 // The following 2 numbers are different in Release and Debug modes
527 // due to a minor discrepancy in locations masked by seg filtering.
528 // The latter is due to a tiny difference in values involved in
529 // a comparison of real numbers inside the seg algorithm.
530 // In Debug mode:
531 BOOST_CHECK_EQUAL(3579, (int)ungapped_stats->lookup_hits);
532 BOOST_CHECK_EQUAL(138, ungapped_stats->init_extends);
533 // In Release mode:
534 BOOST_CHECK_EQUAL(3580, (int)ungapped_stats->lookup_hits);
535 BOOST_CHECK_EQUAL(140, ungapped_stats->init_extends);
536 #endif
537 // Note: Seg is not enabled for this case anymore
538 // (changed blastp defaults)
539 BOOST_CHECK_EQUAL(3939, (int)ungapped_stats->lookup_hits);
540 BOOST_CHECK_EQUAL(159, ungapped_stats->init_extends);
541 BOOST_CHECK_EQUAL(63, ungapped_stats->good_init_extends);
542 BOOST_CHECK_EQUAL(27, gapped_stats->extensions);
543 BOOST_CHECK_EQUAL(24, gapped_stats->good_extensions);
544 break;
545 case eTblastn_oof:
546 BOOST_CHECK_EQUAL(2666, (int)ungapped_stats->lookup_hits);
547 BOOST_CHECK_EQUAL(50, ungapped_stats->init_extends);
548 BOOST_CHECK_EQUAL(4, ungapped_stats->good_init_extends);
549 BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
550 BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
551 break;
552 case eBlastx_oof:
553 BOOST_CHECK_EQUAL(5950, (int)ungapped_stats->lookup_hits);
554 BOOST_CHECK_EQUAL(155, ungapped_stats->init_extends);
555 BOOST_CHECK_EQUAL(6, ungapped_stats->good_init_extends);
556 BOOST_CHECK_EQUAL(2, gapped_stats->extensions);
557 BOOST_CHECK_EQUAL(2, gapped_stats->good_extensions);
558 break;
559 case eDiscMegablast_U02544_U61969:
560 BOOST_CHECK_EQUAL(108, (int)ungapped_stats->lookup_hits);
561 // CC changed 08/07/08
562 //BOOST_CHECK_EQUAL(15, ungapped_stats->init_extends);
563 //BOOST_CHECK_EQUAL(15, ungapped_stats->good_init_extends);
564 BOOST_CHECK_EQUAL(3, ungapped_stats->init_extends);
565 BOOST_CHECK_EQUAL(3, ungapped_stats->good_init_extends);
566 BOOST_CHECK_EQUAL(3, gapped_stats->extensions);
567 BOOST_CHECK_EQUAL(3, gapped_stats->good_extensions);
568 break;
569 case eMegablast_chrom_mrna:
570 BOOST_CHECK_EQUAL(14, (int)ungapped_stats->lookup_hits);
571 BOOST_CHECK_EQUAL(1, ungapped_stats->init_extends);
572 BOOST_CHECK_EQUAL(1, ungapped_stats->good_init_extends);
573 BOOST_CHECK_EQUAL(1, gapped_stats->extensions);
574 BOOST_CHECK_EQUAL(1, gapped_stats->good_extensions);
575 break;
576 default: break;
577 }
578 }
579
BOOST_AUTO_TEST_CASE(ProteinBlastSelfHit)580 BOOST_AUTO_TEST_CASE(ProteinBlastSelfHit)
581 {
582 //const int kSeqLength = 232;
583 CSeq_id id("gi|129295");
584 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
585
586 CBl2Seq blaster(*sl, *sl, eBlastp);
587 TSeqAlignVector sav(blaster.Run());
588 BOOST_REQUIRE(sav[0].NotEmpty());
589 BOOST_REQUIRE( !sav[0]->IsEmpty() );
590 BOOST_REQUIRE(sav[0]->Get().begin()->NotEmpty());
591 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
592 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
593 testBlastHitCounts(blaster, eBlastp_129295_129295);
594 testRawCutoffs(blaster, eBlastp, eBlastp_129295_129295);
595
596 // the number of identities is NOT calculated when composition based
597 // statistics is turned on (default for blastp)
598 int num_ident = 0;
599 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
600 #if 0
601 ofstream o("0.asn");
602 o << MSerial_AsnText << *sar ;
603 o.close();
604 #endif
605 BOOST_CHECK_EQUAL(232, num_ident);
606
607 // calculate the number of identities using the BLAST formatter
608 /*
609 double percent_identity =
610 CBlastFormatUtil::GetPercentIdentity(*sar, *sl->scope, false);
611 BOOST_CHECK_EQUAL(1, (int) percent_identity);
612 */
613
614 // Check the ancillary results
615 CSearchResultSet::TAncillaryVector ancillary_data;
616 blaster.GetAncillaryResults(ancillary_data);
617 BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
618 BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
619 BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
620 BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
621
622 }
623
BOOST_AUTO_TEST_CASE(TBlastn2Seqs)624 BOOST_AUTO_TEST_CASE(TBlastn2Seqs)
625 {
626 CSeq_id qid("gi|129295");
627 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
628
629 CSeq_id sid("gi|555");
630 auto_ptr<SSeqLoc> subj(
631 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
632
633 CBl2Seq blaster(*query, *subj, eTblastn);
634 TSeqAlignVector sav(blaster.Run());
635 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
636
637 #if 0
638 ofstream o("1.asn");
639 o << MSerial_AsnText << *sar ;
640 o.close();
641 #endif
642
643 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
644 testBlastHitCounts(blaster, eTblastn_129295_555);
645 testRawCutoffs(blaster, eTblastn, eTblastn_129295_555);
646
647 int score = 0, comp_adj = 0;
648 sar->GetNamedScore(CSeq_align::eScore_Score, score);
649 sar->GetNamedScore(CSeq_align::eScore_CompAdjMethod, comp_adj);
650 BOOST_CHECK_EQUAL(26, score);
651 BOOST_CHECK_EQUAL(2, comp_adj);
652
653 // Check the ancillary results
654 CSearchResultSet::TAncillaryVector ancillary_data;
655 blaster.GetAncillaryResults(ancillary_data);
656 BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
657 BOOST_REQUIRE( ancillary_data.front().NotEmpty() );
658 BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
659 BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
660 BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
661 }
662
BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand1)663 BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand1)
664 {
665 CSeq_id qid("gi|1945390");
666 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
667
668 pair<TSeqPos, TSeqPos> range(150000, 170000);
669 CSeq_id sid("gi|4755212");
670 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_minus));
671
672 CBl2Seq blaster(*query, *subj, eTblastn);
673 TSeqAlignVector sav(blaster.Run());
674 BOOST_CHECK_EQUAL(11, (int) sav[0]->Get().size());
675 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
676 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
677 vector < CRef< CSeq_loc > > locs = sar->GetSegs().GetStd().front()->GetLoc();
678 BOOST_CHECK_EQUAL((int)eNa_strand_minus, (int)(locs[1])->GetStrand());
679 int num_ident = 0;
680 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
681 BOOST_CHECK_EQUAL(161, num_ident);
682 #if 0
683 ofstream o("minus1.new.asn");
684 ITERATE(TSeqAlignVector, v, sav) {
685 o << MSerial_AsnText << **v ;
686 }
687 o.close();
688 #endif
689 }
690
BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand2)691 BOOST_AUTO_TEST_CASE(TBlastn2SeqsRevStrand2)
692 {
693 CSeq_id qid("gi|1945390");
694 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
695
696 CSeq_id sid("gi|1945388");
697 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_minus));
698
699 CBl2Seq blaster(*query, *subj, eTblastn);
700 TSeqAlignVector sav(blaster.Run());
701 BOOST_CHECK_EQUAL(1, (int) sav[0]->Get().size());
702 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
703 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
704 vector < CRef< CSeq_loc > > locs = sar->GetSegs().GetStd().front()->GetLoc();
705 BOOST_CHECK_EQUAL((int)eNa_strand_minus, (int)(locs[1])->GetStrand());
706 int num_ident = 0;
707 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
708 BOOST_CHECK_EQUAL(11, num_ident);
709 #if 0
710 ofstream o("minus2.asn");
711 o << MSerial_AsnText << *sar ;
712 o.close();
713 #endif
714 }
715
716
BOOST_AUTO_TEST_CASE(TBlastn2SeqsCompBasedStats)717 BOOST_AUTO_TEST_CASE(TBlastn2SeqsCompBasedStats)
718 {
719 CSeq_id qid("gi|68737"); // "pir|A01243|DXCH"
720 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
721
722 CSeq_id sid("gi|118086484");
723 auto_ptr<SSeqLoc> subj(
724 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
725
726 CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(eTblastn));
727 opts->SetOptions().SetCompositionBasedStats(eCompositionBasedStats);
728
729 CBl2Seq blaster(*query, *subj, *opts);
730 TSeqAlignVector sav(blaster.Run());
731 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
732 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetStd().size());
733
734 int num_ident = 0;
735 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
736 BOOST_CHECK_EQUAL(229, num_ident);
737 #if 0
738 ofstream o("2.asn");
739 o << MSerial_AsnText << *sar ;
740 o.close();
741 #endif
742
743 // Check the ancillary results
744 CSearchResultSet::TAncillaryVector ancillary_data;
745 blaster.GetAncillaryResults(ancillary_data);
746 BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
747 BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
748 BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
749 BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
750 }
751
BOOST_AUTO_TEST_CASE(TBlastn2SeqsLargeWord)752 BOOST_AUTO_TEST_CASE(TBlastn2SeqsLargeWord)
753 {
754 CSeq_id qid("gi|129295");
755 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
756
757 CSeq_id sid("gi|555");
758 auto_ptr<SSeqLoc> subj(
759 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
760
761 CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(eTblastn));
762 opts->SetOptions().SetWordSize(6);
763 opts->SetOptions().SetLookupTableType(eCompressedAaLookupTable);
764 opts->SetOptions().SetWordThreshold(21.69);
765 opts->SetOptions().SetWindowSize(0);
766 opts->SetOptions().SetCompositionBasedStats(eNoCompositionBasedStats);
767
768 CBl2Seq blaster(*query, *subj, *opts);
769 TSeqAlignVector sav(blaster.Run());
770 BOOST_CHECK_EQUAL(1, (int)sav[0]->Size());
771 testBlastHitCounts(blaster, eTblastn_129295_555_large_word);
772 testRawCutoffs(blaster, eTblastn, eTblastn_129295_555_large_word);
773
774 int num_ident = 0;
775 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
776 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
777 #if 0
778 ofstream o("3.asn");
779 o << MSerial_AsnText << *sar ;
780 o.close();
781 #endif
782 BOOST_CHECK_EQUAL(5, num_ident);
783 }
784
BOOST_AUTO_TEST_CASE(IdenticalProteins)785 BOOST_AUTO_TEST_CASE(IdenticalProteins)
786 {
787 //const int kSeqLength = 377;
788 CSeq_id qid("gi|34810917");
789 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
790 CSeq_id sid("gi|34810916");
791 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
792
793 CBl2Seq blaster(*query, *subj, eBlastp);
794 TSeqAlignVector sav(blaster.Run());
795 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
796 BOOST_CHECK_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
797
798 // the number of identities is NOT calculated when composition based
799 // statistics is turned on (default for blastp)
800 int num_ident = 0;
801 sar->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
802 #if 0
803 ofstream o("4.asn");
804 o << MSerial_AsnText << *sar ;
805 o.close();
806 #endif
807 BOOST_CHECK_EQUAL(377, num_ident);
808
809 // calculate the number of identities using the BLAST formatter
810 /*
811 double percent_identity =
812 CBlastFormatUtil::GetPercentIdentity(*sar, *query->scope, false);
813 BOOST_CHECK_EQUAL(1, (int) percent_identity);
814 */
815
816 // Check the ancillary results
817 CSearchResultSet::TAncillaryVector ancillary_data;
818 blaster.GetAncillaryResults(ancillary_data);
819 BOOST_CHECK_EQUAL((size_t)1, ancillary_data.size());
820 BOOST_CHECK( ancillary_data.front()->GetGappedKarlinBlk() != NULL );
821 BOOST_CHECK( ancillary_data.front()->GetUngappedKarlinBlk() != NULL );
822 BOOST_CHECK( ancillary_data.front()->GetSearchSpace() != (Int8)0 );
823 }
824
BOOST_AUTO_TEST_CASE(UnsupportedOption)825 BOOST_AUTO_TEST_CASE(UnsupportedOption) {
826 CDiscNucleotideOptionsHandle opts_handle;
827 BOOST_REQUIRE_THROW(opts_handle.SetTraditionalBlastnDefaults(),
828 CBlastException);
829 }
830
BOOST_AUTO_TEST_CASE(PositiveMismatchOption)831 BOOST_AUTO_TEST_CASE(PositiveMismatchOption) {
832 CSeq_id qid("gi|408478"); // zebrafish sequence U02544
833 CSeq_id sid("gi|1546012"); // mouse sequence U61969
834
835 auto_ptr<SSeqLoc> query(
836 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
837 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
838
839 const int kMatch = 2;
840 const int kMismatch = 5; // Positive mismatch not allowed.
841
842 CBlastNucleotideOptionsHandle nucl_options_handle;
843
844 nucl_options_handle.SetMatchReward(kMatch);
845 nucl_options_handle.SetMismatchPenalty(kMismatch);
846 CBl2Seq blaster(*query, *subj, nucl_options_handle);
847 try {
848 TSeqAlignVector sav(blaster.Run());
849 } catch (const CException& e) {
850 BOOST_REQUIRE(
851 !strcmp("BLASTN penalty must be negative",
852 e.GetMsg().c_str()));
853 }
854 }
855
856 // If a single query is fully masked, throw an exception (i.e.: an error)
BOOST_AUTO_TEST_CASE(FullyMaskedSequence)857 BOOST_AUTO_TEST_CASE(FullyMaskedSequence) {
858 CSeq_id qid("ref|NT_024524.13");
859 pair<TSeqPos, TSeqPos> range(27886902, 27886932);
860 auto_ptr<SSeqLoc> query(
861 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
862 eNa_strand_plus));
863 range.first = 2052;
864 range.second = 2082;
865 CSeq_id sid("emb|BX641126.1");
866 auto_ptr<SSeqLoc> subj(
867 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
868 eNa_strand_minus));
869 CRef<CBlastNucleotideOptionsHandle> options(new CBlastNucleotideOptionsHandle);
870 options->SetTraditionalBlastnDefaults();
871 options->SetMismatchPenalty(-1);
872 options->SetMatchReward(1);
873 options->SetGapXDropoff(100);
874 options->SetMaskAtHash(false);
875 CBl2Seq blaster(*query, *subj, *options);
876 CRef<CSearchResultSet> results;
877 BOOST_REQUIRE_NO_THROW(results = blaster.RunEx());
878 }
879
880 // In the case of multiple queries and one being masked, only emit a warning
881 // N.B.: this doesn't test the MT case as MT unit tests aren't supported
BOOST_AUTO_TEST_CASE(MultipleQueries1FullyMasked)882 BOOST_AUTO_TEST_CASE(MultipleQueries1FullyMasked) {
883 const bool kHasProteinQuery(true);
884 CBlastInputSourceConfig iconfig(kHasProteinQuery);
885 const CSearchDatabase dbinfo("data/nt.41646578", CSearchDatabase::eBlastDbIsNucleotide);
886 CRef<CBlastOptionsHandle> opts_handle(CBlastOptionsFactory::Create(eTblastn));
887
888 CNcbiIfstream infile("data/mult_queries_1fully_masked.fsa");
889 CRef<CScope> scope = CBlastScopeSource(kHasProteinQuery).NewScope();
890 CRef<CBlastFastaInputSource> fasta_src
891 (new CBlastFastaInputSource(infile, iconfig));
892 CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
893 TSeqLocVector query_vec = input->GetAllSeqLocs(*scope);
894
895 // The first iteration runs the search *without* splitting, the second one
896 // forces to use query splitting
897 for (int env = 0; env < 2; env++) {
898
899 CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
900 auto_ptr<CAutoEnvironmentVariable> envvar1, envvar2;
901 if (env) {
902 envvar1.reset(new CAutoEnvironmentVariable("OVERLAP_CHUNK_SIZE", NStr::SizetToString(10)));
903 envvar2.reset(new CAutoEnvironmentVariable("CHUNK_SIZE", NStr::SizetToString(100)));
904 }
905 CLocalBlast blaster(queries, opts_handle, dbinfo);
906 CRef<CSearchResultSet> results;
907 BOOST_REQUIRE_NO_THROW(results = blaster.Run());
908 BOOST_REQUIRE_EQUAL(3U, results->size());
909 for (size_t query_idx = 0; query_idx < results->size(); query_idx++) {
910 const string& warnings((*results)[query_idx].GetWarningStrings());
911 const string& errors((*results)[query_idx].GetErrorStrings());
912
913 CNcbiOstrstream oss;
914 if (env) {
915 oss << "Forced splitting of queries: ";
916 } else {
917 oss << "No splitting of queries: ";
918 }
919
920 if (query_idx == 0 || query_idx == 2) {
921 oss << " expected no warnings/errors, got errors='" << errors << "'; "
922 << "warnings='" << warnings << "'";
923 const string msg = CNcbiOstrstreamToString(oss);
924 BOOST_CHECK_MESSAGE(kEmptyStr == warnings, msg);
925 BOOST_CHECK_MESSAGE(kEmptyStr == errors, msg);
926 } else {
927 oss << " expected warnings to match '" << kBlastErrMsg_CantCalculateUngappedKAParams
928 << "; instead got '" << warnings << "'";
929 const string msg = CNcbiOstrstreamToString(oss);
930 BOOST_CHECK_MESSAGE(warnings.find(kBlastErrMsg_CantCalculateUngappedKAParams) != NPOS, msg);
931 }
932 }
933 }
934 }
935
BOOST_AUTO_TEST_CASE(testInterruptBlastpExitImmediately)936 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitImmediately) {
937 CSeq_id id("gi|129295");
938 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
939
940 CBl2Seq blaster(*sl, *sl, eBlastp);
941 TInterruptFnPtr fnptr =
942 blaster.SetInterruptCallback(interrupt_immediately);
943 BOOST_REQUIRE(fnptr == NULL);
944
945 TSeqAlignVector sav;
946 try { sav = blaster.Run(); }
947 catch (...) {
948 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
949 }
950 }
951
952 // SB-814
BOOST_AUTO_TEST_CASE(Tblastn2Seqs_PlusStrandOnly)953 BOOST_AUTO_TEST_CASE(Tblastn2Seqs_PlusStrandOnly) {
954 // We expect only 1 hit between these two sequences in the plus strand
955 TSeqLocVector query_vec, subj_vec;
956 CSeq_id qid(CSeq_id::e_Gi, 7019569);
957 auto_ptr<SSeqLoc> ql(CTestObjMgr::Instance().CreateSSeqLoc(qid));
958 query_vec.push_back(*ql);
959 CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
960
961 const ENa_strand target_strand = eNa_strand_plus;
962 CSeq_id sid(CSeq_id::e_Gi, 17865806);
963 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(sid,
964 target_strand));
965 subj_vec.push_back(*sl);
966 CRef<IQueryFactory> subjects(new CObjMgr_QueryFactory(subj_vec));
967
968 CRef<CBlastOptionsHandle> tblastn_opts(new CTBlastnOptionsHandle());
969 // N.B.: this setting is not applicable, instead the strand should be set
970 // in the subject seqlocs. FIXME: add a comment to this effect to
971 // SetStrandOption.
972 // N.B.2: BLAST doesn't support setting the subject strand for a database
973 // search
974 //tblastn_opts->SetOptions().SetStrandOption(target_strand);
975 CRef<CLocalDbAdapter> db_adapter(new CLocalDbAdapter(subjects, tblastn_opts));
976
977 CLocalBlast blaster(queries, tblastn_opts, db_adapter);
978
979 CRef<CSearchResultSet> results = blaster.Run();
980 BOOST_REQUIRE_EQUAL(eSequenceComparison, results->GetResultType());
981 BOOST_CHECK_EQUAL((size_t)1, results->GetNumResults());
982 const CSearchResults& result = (*results)[0];
983 BOOST_CHECK(result.HasAlignments());
984 CConstRef<CSeq_align_set> alignment = result.GetSeqAlign();
985 BOOST_CHECK_EQUAL((size_t)1, alignment->Size());
986 ITERATE(CSeq_align_set::Tdata, aln, alignment->Get()) {
987 // check the query's strand (protein)
988 BOOST_CHECK_EQUAL(static_cast<int>(eNa_strand_unknown), static_cast<int>((*aln)->GetSeqStrand(0)));
989 // check the subject's strand (nucleotide)
990 BOOST_CHECK_EQUAL(static_cast<int>(eNa_strand_plus), static_cast<int>((*aln)->GetSeqStrand(1)));
991 }
992 }
993
994 // SB-386
995 BOOST_AUTO_TEST_CASE_TIMEOUT(testInterruptBlastSetup, 3);
BOOST_AUTO_TEST_CASE(testInterruptBlastSetup)996 BOOST_AUTO_TEST_CASE(testInterruptBlastSetup) {
997 CSeq_id id("NC_000002.11");
998 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
999
1000 CBl2Seq blaster(*sl, *sl, eBlastn);
1001 TInterruptFnPtr fnptr =
1002 blaster.SetInterruptCallback(interrupt_immediately);
1003 BOOST_REQUIRE(fnptr == NULL);
1004
1005 TSeqAlignVector sav;
1006 try { sav = blaster.Run(); }
1007 catch (...) {
1008 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1009 }
1010 }
1011
BOOST_AUTO_TEST_CASE(testInterruptBlastnExitImmediately)1012 BOOST_AUTO_TEST_CASE(testInterruptBlastnExitImmediately) {
1013 CSeq_id id("gi|555");
1014 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1015
1016 CBl2Seq blaster(*sl, *sl, eBlastn);
1017 TInterruptFnPtr fnptr =
1018 blaster.SetInterruptCallback(interrupt_immediately);
1019 BOOST_REQUIRE(fnptr == NULL);
1020
1021 TSeqAlignVector sav;
1022 try { sav = blaster.Run(); }
1023 catch (...) {
1024 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1025 }
1026 }
1027
BOOST_AUTO_TEST_CASE(testInterruptBlastxExitImmediately)1028 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitImmediately) {
1029 CSeq_id query_id("gi|555");
1030 auto_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1031 CSeq_id subj_id("gi|129295");
1032 auto_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1033
1034 CBl2Seq blaster(*slq, *sls, eBlastx);
1035 TInterruptFnPtr fnptr =
1036 blaster.SetInterruptCallback(interrupt_immediately);
1037 BOOST_REQUIRE(fnptr == NULL);
1038
1039 TSeqAlignVector sav;
1040 try { sav = blaster.Run(); }
1041 catch (...) {
1042 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1043 }
1044 }
1045
BOOST_AUTO_TEST_CASE(testInterruptTblastxExitImmediately)1046 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitImmediately) {
1047 CSeq_id query_id("gi|555");
1048 auto_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1049 CSeq_id subj_id("gi|555");
1050 auto_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1051
1052 CBl2Seq blaster(*slq, *sls, eTblastx);
1053 TInterruptFnPtr fnptr =
1054 blaster.SetInterruptCallback(interrupt_immediately);
1055 BOOST_REQUIRE(fnptr == NULL);
1056
1057 TSeqAlignVector sav;
1058 try { sav = blaster.Run(); }
1059 catch (...) {
1060 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1061 }
1062 }
1063
BOOST_AUTO_TEST_CASE(testInterruptTblastnExitImmediately)1064 BOOST_AUTO_TEST_CASE(testInterruptTblastnExitImmediately) {
1065 CSeq_id query_id("gi|129295");
1066 auto_ptr<SSeqLoc> slq(CTestObjMgr::Instance().CreateSSeqLoc(query_id));
1067 CSeq_id subj_id("gi|555");
1068 auto_ptr<SSeqLoc> sls(CTestObjMgr::Instance().CreateSSeqLoc(subj_id));
1069
1070 CBl2Seq blaster(*slq, *sls, eTblastn);
1071 TInterruptFnPtr fnptr =
1072 blaster.SetInterruptCallback(interrupt_immediately);
1073 BOOST_REQUIRE(fnptr == NULL);
1074
1075 TSeqAlignVector sav;
1076 try { sav = blaster.Run(); }
1077 catch (...) {
1078 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1079 }
1080 }
1081
1082 #define ARRAY_SIZE(a) (sizeof(a)/sizeof(*a))
1083 static
s_SetupWithMultipleQueriesAndSubjects(bool query_is_nucl,bool subj_is_nucl,EProgram program)1084 CRef<CBl2Seq> s_SetupWithMultipleQueriesAndSubjects(bool query_is_nucl,
1085 bool subj_is_nucl,
1086 EProgram program) {
1087
1088 TIntId protein_gis[] = { 6, 129295, 15606659, 4336138, 5556 };
1089 TIntId nucl_gis[] = { 272208, 272217, 272211, 272247, 272227, 272236,
1090 272219 };
1091
1092 vector<TIntId> q_gis, s_gis;
1093 if (query_is_nucl) {
1094 copy(&nucl_gis[0],
1095 &nucl_gis[ARRAY_SIZE(nucl_gis)],
1096 back_inserter(q_gis));
1097 } else {
1098 copy(&protein_gis[0],
1099 &protein_gis[ARRAY_SIZE(protein_gis)],
1100 back_inserter(q_gis));
1101 }
1102
1103 if (subj_is_nucl) {
1104 copy(&nucl_gis[0],
1105 &nucl_gis[ARRAY_SIZE(nucl_gis)],
1106 back_inserter(s_gis));
1107 } else {
1108 copy(&protein_gis[0],
1109 &protein_gis[ARRAY_SIZE(protein_gis)],
1110 back_inserter(s_gis));
1111 }
1112
1113
1114 TSeqLocVector queries;
1115 ITERATE(vector<TIntId>, itr, q_gis) {
1116 CRef<CSeq_loc> loc(new CSeq_loc());
1117 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1118
1119 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1120 scope->AddDefaults();
1121 queries.push_back(SSeqLoc(loc, scope));
1122 }
1123
1124 TSeqLocVector subjects;
1125 ITERATE(vector<TIntId>, itr, s_gis) {
1126 CRef<CSeq_loc> loc(new CSeq_loc());
1127 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1128
1129 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1130 scope->AddDefaults();
1131 subjects.push_back(SSeqLoc(loc, scope));
1132 }
1133
1134 return CRef<CBl2Seq>(new CBl2Seq(queries, subjects, program));
1135 }
1136
BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAtRandom)1137 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAtRandom) {
1138
1139 CRef<CBl2Seq> blaster = s_SetupWithMultipleQueriesAndSubjects(false,
1140 false,
1141 eBlastp);
1142
1143 int num_callbacks_executed(0);
1144 TInterruptFnPtr fnptr =
1145 blaster->SetInterruptCallback(callback_counter,
1146 (void*) &num_callbacks_executed);
1147 BOOST_REQUIRE(fnptr == NULL);
1148
1149 TSeqAlignVector sav(blaster->Run()); // won't throw
1150 CRandom r((CRandom::TValue)time(0));
1151 int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1152 pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1153
1154 fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1155 (void*)&progress_pair);
1156 BOOST_REQUIRE(fnptr == callback_counter);
1157 sav.clear();
1158
1159 try { sav = blaster->Run(); }
1160 catch (...) {
1161 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1162 }
1163 }
1164
BOOST_AUTO_TEST_CASE(testInterruptBlastnExitAtRandom)1165 BOOST_AUTO_TEST_CASE(testInterruptBlastnExitAtRandom) {
1166
1167 CRef<CBl2Seq> blaster =
1168 s_SetupWithMultipleQueriesAndSubjects(true, true, eBlastn);
1169
1170 int num_callbacks_executed(0);
1171 TInterruptFnPtr fnptr =
1172 blaster->SetInterruptCallback(callback_counter,
1173 (void*)&num_callbacks_executed);
1174 BOOST_REQUIRE(fnptr == NULL);
1175
1176 TSeqAlignVector sav(blaster->Run()); // won't throw
1177 CRandom r((CRandom::TValue)time(0));
1178 int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1179 pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1180
1181 fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1182 (void*)&progress_pair);
1183 BOOST_REQUIRE(fnptr == callback_counter);
1184 sav.clear();
1185
1186 try { sav = blaster->Run(); }
1187 catch (...) {
1188 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1189 }
1190 }
1191
1192 // interrupt_at_random.
BOOST_AUTO_TEST_CASE(testInterruptBlastxExitAtRandom)1193 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitAtRandom) {
1194
1195 CRef<CBl2Seq> blaster =
1196 s_SetupWithMultipleQueriesAndSubjects(true, false, eBlastx);
1197
1198 int num_callbacks_executed(0);
1199 TInterruptFnPtr fnptr =
1200 blaster->SetInterruptCallback(callback_counter,
1201 (void*) & num_callbacks_executed);
1202 BOOST_REQUIRE(fnptr == NULL);
1203
1204 TSeqAlignVector sav(blaster->Run()); // won't throw
1205 CRandom r((CRandom::TValue)time(0));
1206 int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1207 pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1208
1209 fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1210 (void*)&progress_pair);
1211 BOOST_REQUIRE(fnptr == callback_counter);
1212 sav.clear();
1213
1214 try { sav = blaster->Run(); }
1215 catch (...) {
1216 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1217 }
1218 }
1219
BOOST_AUTO_TEST_CASE(testInterruptTblastnExitAtRandom)1220 BOOST_AUTO_TEST_CASE(testInterruptTblastnExitAtRandom) {
1221
1222 CRef<CBl2Seq> blaster =
1223 s_SetupWithMultipleQueriesAndSubjects(false, true, eTblastn);
1224
1225 int num_callbacks_executed(0);
1226 TInterruptFnPtr fnptr =
1227 blaster->SetInterruptCallback(callback_counter,
1228 (void*)&num_callbacks_executed);
1229 BOOST_REQUIRE(fnptr == NULL);
1230
1231 TSeqAlignVector sav(blaster->Run()); // won't throw
1232 CRandom r((CRandom::TValue)time(0));
1233 int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1234 pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1235
1236 fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1237 (void*)&progress_pair);
1238 BOOST_REQUIRE(fnptr == callback_counter);
1239 sav.clear();
1240
1241 try { sav = blaster->Run(); }
1242 catch (...) {
1243 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1244 }
1245 }
1246
BOOST_AUTO_TEST_CASE(testInterruptTblastxExitAtRandom)1247 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitAtRandom) {
1248
1249 CRef<CBl2Seq> blaster =
1250 s_SetupWithMultipleQueriesAndSubjects(true, true, eTblastx);
1251
1252 int num_callbacks_executed(0);
1253 TInterruptFnPtr fnptr =
1254 blaster->SetInterruptCallback(callback_counter,
1255 (void*) & num_callbacks_executed);
1256 BOOST_REQUIRE(fnptr == NULL);
1257
1258 TSeqAlignVector sav(blaster->Run()); // won't throw
1259 CRandom r((CRandom::TValue)time(0));
1260 int max_interrupt_callbacks = r.GetRand(1, num_callbacks_executed);
1261 pair<int, int> progress_pair(make_pair(0, max_interrupt_callbacks));
1262
1263 fnptr = blaster->SetInterruptCallback(interrupt_at_random,
1264 (void*)&progress_pair);
1265 BOOST_REQUIRE(fnptr == callback_counter);
1266 sav.clear();
1267
1268 try { sav = blaster->Run(); }
1269 catch (...) {
1270 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1271 }
1272 }
1273
BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAfter3Callbacks)1274 BOOST_AUTO_TEST_CASE(testInterruptBlastpExitAfter3Callbacks) {
1275 CSeq_id id("gi|129295");
1276 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
1277
1278 CBl2Seq blaster(*sl, *sl, eBlastp);
1279 TInterruptFnPtr fnptr =
1280 blaster.SetInterruptCallback(interrupt_after3calls);
1281 BOOST_REQUIRE(fnptr == NULL);
1282
1283 TSeqAlignVector sav;
1284 try { sav = blaster.Run(); }
1285 catch (...) {
1286 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1287 }
1288 }
1289
BOOST_AUTO_TEST_CASE(testInterruptBlastxExitOnTraceback)1290 BOOST_AUTO_TEST_CASE(testInterruptBlastxExitOnTraceback) {
1291
1292 CRef<CBl2Seq> blaster = s_SetupWithMultipleQueriesAndSubjects(true,
1293 false,
1294 eBlastx);
1295 TInterruptFnPtr fnptr =
1296 blaster->SetInterruptCallback(interrupt_on_traceback);
1297 BOOST_REQUIRE(fnptr == NULL);
1298
1299 TSeqAlignVector sav;
1300 try { sav = blaster->Run(); }
1301 catch (...) {
1302 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1303 }
1304 }
1305
BOOST_AUTO_TEST_CASE(testInterruptTblastxExitOnTraceback)1306 BOOST_AUTO_TEST_CASE(testInterruptTblastxExitOnTraceback) {
1307
1308 CRef<CBl2Seq> blaster = s_SetupWithMultipleQueriesAndSubjects
1309 (true, true, eTblastx);
1310 TInterruptFnPtr fnptr =
1311 blaster->SetInterruptCallback(interrupt_on_traceback);
1312 BOOST_REQUIRE(fnptr == NULL);
1313
1314 TSeqAlignVector sav;
1315 try { sav = blaster->Run(); }
1316 catch (...) {
1317 BOOST_REQUIRE_EQUAL((size_t)0, sav.size());
1318 }
1319 }
1320
BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithInvalidSeqId)1321 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithInvalidSeqId) {
1322 vector<TIntId> q_gis, s_gis;
1323
1324 // Setup the queries
1325 q_gis.push_back(129295);
1326 q_gis.push_back(-1); // invalid seqid
1327
1328 // setup the subjects
1329 s_gis.push_back(129295);
1330 s_gis.push_back(4336138); // no hits with gi 129295
1331
1332 TSeqLocVector queries;
1333 ITERATE(vector<TIntId>, itr, q_gis) {
1334 CRef<CSeq_loc> loc(new CSeq_loc());
1335 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1336
1337 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1338 scope->AddDefaults();
1339 queries.push_back(SSeqLoc(loc, scope));
1340 }
1341
1342 TSeqLocVector subjects;
1343 ITERATE(vector<TIntId>, itr, s_gis) {
1344 CRef<CSeq_loc> loc(new CSeq_loc());
1345 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1346
1347 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1348 scope->AddDefaults();
1349 subjects.push_back(SSeqLoc(loc, scope));
1350 }
1351
1352 // BLAST by concatenating all queries
1353 CBl2Seq blaster4all(queries, subjects, eBlastp);
1354 TSeqAlignVector sas_v = blaster4all.Run();
1355
1356 TSearchMessages m;
1357 blaster4all.GetMessages(m);
1358 BOOST_REQUIRE_EQUAL(subjects.size()*queries.size(), sas_v.size());
1359
1360 BOOST_REQUIRE(m[0].empty());
1361 BOOST_REQUIRE(!m[1].empty());
1362
1363 // Verify the error message
1364 TQueryMessages qm = m[1];
1365 BOOST_REQUIRE(qm.front()->GetMessage().find("Cannot resolve") !=
1366 string::npos);
1367
1368 // Verify that the alignments corresponding to the 2nd query are indeed empty
1369 // in older version this was sas_v[1], order has changed
1370 BOOST_REQUIRE_EQUAL(0, (int) sas_v[2]->Size());
1371
1372 CRef<CSearchResultSet> res = blaster4all.RunEx();
1373 BOOST_REQUIRE(res.NotEmpty());
1374 BOOST_REQUIRE_EQUAL(q_gis.size()*s_gis.size(), res->GetNumResults());
1375 BOOST_REQUIRE(res->GetResults(0,0).HasAlignments());
1376 BOOST_REQUIRE(!res->GetResults(0,1).HasAlignments());
1377 BOOST_REQUIRE(!res->GetResults(1,0).HasAlignments());
1378 BOOST_REQUIRE(!res->GetResults(1,1).HasAlignments());
1379 }
1380
1381 // JIRA:SB-1242
BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithBadQuery)1382 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueriesWithBadQuery) {
1383 vector<TIntId> q_gis, s_gis;
1384
1385 // Setup the queries
1386 q_gis.push_back(296863684); // All X's
1387 q_gis.push_back(129295);
1388
1389 // setup the subjects
1390 s_gis.push_back(129296);
1391
1392 TSeqLocVector queries;
1393 ITERATE(vector<TIntId>, itr, q_gis) {
1394 CRef<CSeq_loc> loc(new CSeq_loc());
1395 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1396
1397 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1398 scope->AddDefaults();
1399 queries.push_back(SSeqLoc(loc, scope));
1400 }
1401
1402 TSeqLocVector subjects;
1403 ITERATE(vector<TIntId>, itr, s_gis) {
1404 CRef<CSeq_loc> loc(new CSeq_loc());
1405 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
1406
1407 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1408 scope->AddDefaults();
1409 subjects.push_back(SSeqLoc(loc, scope));
1410 }
1411
1412 // BLAST by concatenating all queries
1413 CBl2Seq blaster4all(queries, subjects, eBlastp);
1414 TSeqAlignVector sas_v = blaster4all.Run();
1415
1416 TSearchMessages m;
1417 blaster4all.GetMessages(m);
1418 BOOST_REQUIRE_EQUAL(subjects.size()*queries.size(), sas_v.size());
1419
1420 BOOST_REQUIRE(!m[0].empty());
1421 BOOST_REQUIRE(m[1].empty());
1422 }
1423
BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueriesWithInvalidSeqId)1424 BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueriesWithInvalidSeqId) {
1425 CRef<CSeq_id> id1(new CSeq_id(CSeq_id::e_Gi, 555));
1426 auto_ptr<SSeqLoc> sl1(CTestObjMgr::Instance().CreateSSeqLoc(*id1));
1427 CRef<CSeq_id> id2(new CSeq_id(CSeq_id::e_Gi, 556));
1428 auto_ptr<SSeqLoc> sl2(CTestObjMgr::Instance().CreateSSeqLoc(*id2));
1429
1430 const TSeqPos kFakeBioseqLength = 12;
1431 const char byte(0); // string of 4 A's in ncbi2na
1432 vector<char> na_data(kFakeBioseqLength/4, byte);
1433
1434 CRef<CSeq_id> fake_id(new CSeq_id("lcl|77"));
1435 CRef<CBioseq> fake_bioseq(new CBioseq);
1436 fake_bioseq->SetInst().SetLength(kFakeBioseqLength);
1437 fake_bioseq->SetInst().SetSeq_data().SetNcbi2na().Set().swap(na_data);
1438 fake_bioseq->SetInst().SetMol(CSeq_inst::eMol_na);
1439 fake_bioseq->SetInst().SetRepr(CSeq_inst::eRepr_raw);
1440 fake_bioseq->SetId().push_back(fake_id);
1441 CRef<CSeq_loc> fake_loc(new CSeq_loc);
1442 fake_loc->SetWhole(*fake_id);
1443
1444 CRef<CScope> scope(CSimpleOM::NewScope(false));
1445 scope->AddBioseq(*fake_bioseq);
1446 auto_ptr<SSeqLoc> sl_bad(new SSeqLoc(*fake_loc, *scope));
1447
1448 TSeqPos len = sequence::GetLength(*sl_bad->seqloc, sl_bad->scope);
1449 BOOST_REQUIRE_EQUAL(kFakeBioseqLength, len);
1450
1451 TSeqLocVector queries;
1452 queries.push_back(*sl1);
1453 queries.push_back(*sl_bad);
1454 queries.push_back(*sl2);
1455
1456 // All subjects have matches against this gi
1457 CRef<CSeq_id> subj_id(new CSeq_id(CSeq_id::e_Gi, 555));
1458 auto_ptr<SSeqLoc> subj_loc
1459 (CTestObjMgr::Instance().CreateSSeqLoc(*subj_id));
1460 TSeqLocVector subject;
1461 subject.push_back(*subj_loc);;
1462
1463 CRef<CBlastNucleotideOptionsHandle> opts_handle(new CBlastNucleotideOptionsHandle);
1464 opts_handle->SetMaskAtHash(false);
1465 CBl2Seq bl2seq(queries, subject, *opts_handle);
1466 TSeqAlignVector sas_v = bl2seq.Run();
1467 sas_v = bl2seq.Run();
1468 TSearchMessages m;
1469 bl2seq.GetMessages(m);
1470 BOOST_REQUIRE_EQUAL(sas_v.size(), m.size());
1471 BOOST_REQUIRE_EQUAL(queries.size(), sas_v.size());
1472
1473 BOOST_REQUIRE(m[0].empty());
1474 BOOST_REQUIRE(!m[1].empty());
1475 BOOST_REQUIRE(m[2].empty());
1476
1477 TQueryMessages qm = m[1];
1478
1479 // no duplicate messages for the contexts
1480 BOOST_REQUIRE(qm.size() == 1);
1481 // Verify the error message
1482 ITERATE(TQueryMessages, itr, qm) {
1483 BOOST_REQUIRE((*itr)->GetMessage().find("Could not calculate "
1484 "ungapped Karlin-Altschul "
1485 "parameters")
1486 != string::npos);
1487 }
1488 // Verify that the alignments corresponding to the 2nd query are indeed
1489 // empty
1490 ITERATE(CSeq_align_set::Tdata, alignments, sas_v[1]->Get()) {
1491 BOOST_REQUIRE((*alignments)->GetSegs().IsDisc());
1492 BOOST_REQUIRE((*alignments)->GetSegs().GetDisc().Get().empty());
1493 }
1494 }
1495
BOOST_AUTO_TEST_CASE(ProteinSelfHitWithMask)1496 BOOST_AUTO_TEST_CASE(ProteinSelfHitWithMask) {
1497 CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 129295));
1498 CRef<CSeq_loc> sl(new CSeq_loc());
1499 sl->SetWhole(*id);
1500 CRef<CSeq_loc> mask(new CSeq_loc(*id, 50, 100));
1501 CRef<CScope> scope(CSimpleOM::NewScope());
1502 SSeqLoc seqloc(sl, scope, mask);
1503
1504 CBl2Seq bl2seq(seqloc, seqloc, eBlastp);
1505 TSeqAlignVector sav(bl2seq.Run());
1506 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1507 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1508 }
1509
1510 // Inspired by SB-285
BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation)1511 BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation) {
1512 CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 83219349));
1513 CRef<CSeq_loc> sl(new CSeq_loc());
1514 sl->SetWhole(*id);
1515 CRef<CSeq_loc> mask(new CSeq_loc(*id, 57, 484));
1516 CRef<CScope> scope(CSimpleOM::NewScope());
1517 SSeqLoc query_seqloc(sl, scope, mask);
1518
1519 CRef<CSeq_id> sid(new CSeq_id(CSeq_id::e_Gi, 88954065));
1520 CRef<CSeq_loc> ssl(new CSeq_loc(*sid, 9909580-100, 9909607+100));
1521 SSeqLoc subj_seqloc(ssl, scope);
1522
1523 CBl2Seq bl2seq(query_seqloc, subj_seqloc, eMegablast);
1524 TSeqAlignVector sav(bl2seq.Run());
1525 BOOST_REQUIRE_EQUAL(0U, sav[0]->Get().size());
1526 }
1527
1528 // Inspired by SB-285
BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation_FromFile)1529 BOOST_AUTO_TEST_CASE(NucleotideMaskedLocation_FromFile) {
1530 CNcbiIfstream infile("data/masked.fsa");
1531 const bool is_protein(false);
1532 CBlastInputSourceConfig iconfig(is_protein);
1533 iconfig.SetLowercaseMask(true);
1534 CRef<CBlastFastaInputSource> fasta_src
1535 (new CBlastFastaInputSource(infile, iconfig));
1536 CRef<CBlastInput> input(new CBlastInput(&*fasta_src));
1537 //CRef<CScope> scope(new CScope(*CObjectManager::GetInstance()));
1538 //scope->AddDefaults();
1539 CRef<CScope> scope = CBlastScopeSource(is_protein).NewScope();
1540
1541 CRef<blast::CBlastQueryVector> seqs = input->GetNextSeqBatch(*scope);
1542 CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(*seqs));
1543
1544 TSeqLocVector subj_vec;
1545 CRef<CSeq_id> sid(new CSeq_id(CSeq_id::e_Gi, 88954065));
1546 CRef<CSeq_loc> ssl(new CSeq_loc(*sid, 9909580-100, 9909607+100));
1547 subj_vec.push_back(SSeqLoc(ssl, scope));
1548 CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
1549 CRef<CBlastOptionsHandle>
1550 opts_handle(CBlastOptionsFactory::Create(eBlastn));
1551 CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
1552 opts_handle));
1553
1554 size_t num_queries = seqs->Size();
1555 size_t num_subjects = subj_vec.size();
1556 BOOST_REQUIRE_EQUAL((size_t)1, num_queries);
1557 BOOST_REQUIRE_EQUAL((size_t)1, num_subjects);
1558
1559 // BLAST by concatenating all queries
1560 CLocalBlast blaster(queries, opts_handle, subjects);
1561 CRef<CSearchResultSet> results = blaster.Run();
1562 BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
1563 BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
1564 results->GetNumResults());
1565 BOOST_REQUIRE_EQUAL((num_queries*num_subjects), results->size());
1566 BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
1567 BOOST_REQUIRE_EQUAL(num_subjects,
1568 results->GetNumResults()/results->GetNumQueries());
1569
1570 CSearchResults& res = (*results)[0];
1571 BOOST_REQUIRE(res.HasAlignments() == false);
1572 }
1573
1574 // JIRA SB-732
BOOST_AUTO_TEST_CASE(InvalidMaskingAlgorithm)1575 BOOST_AUTO_TEST_CASE(InvalidMaskingAlgorithm) {
1576 CRef<blast::CSearchDatabase> m_SearchDb;
1577 m_SearchDb.Reset(new CSearchDatabase("nr",
1578 CSearchDatabase::eBlastDbIsProtein));
1579 m_SearchDb->SetFilteringAlgorithm(66, eSoftSubjMasking);
1580 CRef<CLocalDbAdapter> subject_adapter(new CLocalDbAdapter(*m_SearchDb));
1581 CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(eBlastp));
1582 vector<TIntId> q_gis;
1583 // Setup the queries
1584 CSeq_id seq_id("P01013.1");
1585
1586 TSeqLocVector queries;
1587 CRef<CSeq_loc> loc(new CSeq_loc());
1588 loc->SetWhole(seq_id);
1589
1590 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
1591 scope->AddDefaults();
1592 queries.push_back(SSeqLoc(loc, scope));
1593
1594 CRef<IQueryFactory> query_fact(new CObjMgr_QueryFactory(queries));
1595 BOOST_REQUIRE_THROW(CLocalBlast blaster(query_fact, opts, subject_adapter),
1596 CBlastException);
1597 }
1598
1599 // test for the case where the use of composition based
1600 // satistics should have deleted a hit but did not (used to crash)
BOOST_AUTO_TEST_CASE(ProteinCompBasedStats)1601 BOOST_AUTO_TEST_CASE(ProteinCompBasedStats) {
1602
1603 CRef<CObjectManager> kObjMgr = CObjectManager::GetInstance();
1604 CRef<CScope> scope(new CScope(*kObjMgr));
1605 CRef<CSeq_entry> seq_entry1;
1606 const string kFileName("data/blastp_compstats.fa");
1607 ifstream in1(kFileName.c_str());
1608 if ( !in1 )
1609 throw runtime_error("Failed to open " + kFileName);
1610 if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
1611 throw runtime_error("Failed to read sequence from " + kFileName);
1612 scope->AddTopLevelSeqEntry(*seq_entry1);
1613 CRef<CSeq_loc> seqloc1(new CSeq_loc);
1614 const string kSeqIdString1("lcl|1");
1615 CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
1616 seqloc1->SetWhole(*id1);
1617 SSeqLoc ss1(seqloc1, scope);
1618
1619 CSeq_id id("gi|4503637");
1620 auto_ptr<SSeqLoc> ss2(CTestObjMgr::Instance().CreateSSeqLoc(id));
1621
1622 CRef<CBlastProteinOptionsHandle> opts_handle (new CBlastProteinOptionsHandle);
1623 opts_handle->SetWordSize(2);
1624 opts_handle->SetEvalueThreshold(20000);
1625 opts_handle->SetFilterString("F");/* NCBI_FAKE_WARNING */
1626 opts_handle->SetMatrixName("PAM30");
1627 opts_handle->SetGapOpeningCost(9);
1628 opts_handle->SetGapExtensionCost(1);
1629 opts_handle->SetOptions().SetCompositionBasedStats(
1630 eCompositionBasedStats);
1631
1632 CBl2Seq blaster(ss1, *ss2, *opts_handle);
1633 TSeqAlignVector sav(blaster.Run());
1634 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1635 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1636 }
1637
BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryBothStrands)1638 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryBothStrands) {
1639 CSeq_id qid("gi|555");
1640 auto_ptr<SSeqLoc> query(
1641 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
1642 query->genetic_code_id = 1;
1643
1644 CSeq_id sid("gi|129295");
1645 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1646
1647 CBl2Seq blaster(*query, *subj, eBlastx);
1648 TSeqAlignVector sav(blaster.Run());
1649 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1650 // cerr << "Align " << MSerial_AsnText << *sar << endl;
1651 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetStd().size());
1652 testBlastHitCounts(blaster, eBlastx_555_129295);
1653 testRawCutoffs(blaster, eBlastx, eBlastx_555_129295);
1654 }
1655
BOOST_AUTO_TEST_CASE(NucleotideSelfHitWithSubjectMask)1656 BOOST_AUTO_TEST_CASE(NucleotideSelfHitWithSubjectMask) {
1657 CRef<CSeq_id> query_id(new CSeq_id(CSeq_id::e_Gi, 148727250));
1658 CRef<CSeq_id> subj_id(new CSeq_id(CSeq_id::e_Gi, 89059606));
1659 CRef<CSeq_loc> qsl(new CSeq_loc(*query_id, 0, 1000));
1660 CRef<CSeq_loc> ssl(new CSeq_loc(*subj_id, 0, 1000));
1661 CPacked_seqint::TRanges mask_vector;
1662 mask_vector.push_back(TSeqRange(0, 44));
1663 mask_vector.push_back(TSeqRange(69, 582));
1664 mask_vector.push_back(TSeqRange(610, 834));
1665 mask_vector.push_back(TSeqRange(854, 1000));
1666 CRef<CPacked_seqint> masks(new CPacked_seqint(*subj_id,
1667 mask_vector));
1668 CRef<CSeq_loc> subj_mask(new CSeq_loc());
1669 subj_mask->SetPacked_int(*masks);
1670 CRef<CScope> scope(CSimpleOM::NewScope());
1671 SSeqLoc query(qsl, scope);
1672 auto_ptr<SSeqLoc> subject(new SSeqLoc(ssl, scope, subj_mask));
1673 {
1674 CBl2Seq bl2seq(query, *subject, eBlastn);
1675 TSeqAlignVector sav(bl2seq.Run());
1676 BOOST_REQUIRE_EQUAL((size_t)1, sav.front()->Get().size());
1677 }
1678
1679 // Now compare the same sequences, without the subject masks
1680 subject.reset(new SSeqLoc(ssl, scope));
1681 {
1682 CBl2Seq bl2seq(query, *subject, eBlastn);
1683 TSeqAlignVector sav(bl2seq.Run());
1684 BOOST_REQUIRE_EQUAL((size_t)4, sav.front()->Get().size());
1685 }
1686 }
1687
BOOST_AUTO_TEST_CASE(NucleotideBlastSelfHit)1688 BOOST_AUTO_TEST_CASE(NucleotideBlastSelfHit) {
1689 CSeq_id id("gi|555");
1690 auto_ptr<SSeqLoc> sl(
1691 CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1692
1693 // Traditional blastn search
1694 CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(eBlastn));
1695 CBl2Seq blaster(*sl, *sl, *opts);
1696 TSeqAlignVector sav = blaster.Run();
1697 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1698 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1699 testBlastHitCounts(blaster, eBlastn_555_555);
1700 testRawCutoffs(blaster, eBlastn, eBlastn_555_555);
1701
1702 // Change the options to megablast
1703 opts.Reset(CBlastOptionsFactory::Create(eMegablast));
1704 blaster.SetOptionsHandle() = *opts;
1705 sav = blaster.Run();
1706 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
1707 sar = *(sav[0]->Get().begin());
1708 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1709 testBlastHitCounts(blaster, eMegablast_555_555);
1710 testRawCutoffs(blaster, eMegablast, eMegablast_555_555);
1711
1712 // Change the options to discontiguous megablast
1713 opts.Reset(CBlastOptionsFactory::Create(eDiscMegablast));
1714 blaster.SetOptionsHandle() = *opts;
1715 sav = blaster.Run();
1716 sar = *(sav[0]->Get().begin());
1717 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1718 testBlastHitCounts(blaster, eDiscMegablast_555_555);
1719 testRawCutoffs(blaster, eDiscMegablast, eDiscMegablast_555_555);
1720 }
1721
BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback)1722 BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback) {
1723 CSeq_id query_id("gi|2655203");
1724 auto_ptr<SSeqLoc> ql(
1725 CTestObjMgr::Instance().CreateSSeqLoc(query_id,
1726 eNa_strand_plus));
1727
1728 CSeq_id subject_id("gi|200811");
1729 auto_ptr<SSeqLoc> sl(
1730 CTestObjMgr::Instance().CreateSSeqLoc(subject_id,
1731 eNa_strand_minus));
1732
1733 // test a fix for a bug that corrupted the traceback
1734 // in the one hit returned from this search
1735
1736 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
1737 opts->SetTraditionalMegablastDefaults();
1738 opts->SetMatchReward(1);
1739 opts->SetMismatchPenalty(-2);
1740 opts->SetGapOpeningCost(3);
1741 opts->SetGapExtensionCost(1);
1742 opts->SetWordSize(24);
1743 opts->SetGapExtnAlgorithm(eGreedyScoreOnly);
1744 opts->SetGapTracebackAlgorithm(eGreedyTbck);
1745
1746 CBl2Seq blaster(*ql, *sl, *opts);
1747
1748 CRef<CSearchResultSet> res = blaster.RunEx();
1749 BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
1750 BOOST_REQUIRE_EQUAL(1U, res->size());
1751 CConstRef<CSeq_align_set> sas = (*res)[0].GetSeqAlign();
1752 BOOST_REQUIRE(sas.NotEmpty());
1753 BOOST_REQUIRE_EQUAL(1U, sas->Get().size());
1754 CRef<CSeq_align> sa = sas->Get().front();
1755 int score = 0;
1756 sa->GetNamedScore(CSeq_align::eScore_Score, score);
1757 BOOST_REQUIRE_EQUAL(832, score);
1758
1759 TSeqAlignVector alignments = blaster.Run();
1760 BOOST_REQUIRE_EQUAL(1U, alignments.size());
1761 sa = alignments[0]->Get().front();
1762 sa->GetNamedScore(CSeq_align::eScore_Score, score);
1763 BOOST_REQUIRE_EQUAL(832, score);
1764 }
1765
1766
BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback2)1767 BOOST_AUTO_TEST_CASE(MegablastGreedyTraceback2) {
1768 CRef<CObjectManager> kObjMgr = CObjectManager::GetInstance();
1769 CRef<CScope> scope(new CScope(*kObjMgr));
1770
1771 CRef<CSeq_entry> seq_entry1;
1772 ifstream in1("data/greedy1a.fsa");
1773 if ( !in1 )
1774 throw runtime_error("Failed to open file1");
1775 if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
1776 throw runtime_error("Failed to read sequence from file1");
1777 scope->AddTopLevelSeqEntry(*seq_entry1);
1778 CRef<CSeq_loc> seqloc1(new CSeq_loc);
1779 const string kSeqIdString1("lcl|1");
1780 CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
1781 seqloc1->SetWhole(*id1);
1782 SSeqLoc ss1(seqloc1, scope);
1783
1784 CRef<CSeq_entry> seq_entry2;
1785 ifstream in2("data/greedy1b.fsa");
1786 if ( !in2 )
1787 throw runtime_error("Failed to open file2");
1788 if ( !(seq_entry2 = CFastaReader(in2).ReadOneSeq()))
1789 throw runtime_error("Failed to read sequence from file2");
1790 scope->AddTopLevelSeqEntry(*seq_entry2);
1791 CRef<CSeq_loc> seqloc2(new CSeq_loc);
1792 const string kSeqIdString2("lcl|2");
1793 CRef<CSeq_id> id2(new CSeq_id(kSeqIdString2));
1794 seqloc2->SetWhole(*id2);
1795 SSeqLoc ss2(seqloc2, scope);
1796
1797 CRef<CBlastNucleotideOptionsHandle> handle(new CBlastNucleotideOptionsHandle);
1798 handle->SetGapOpeningCost(0);
1799 handle->SetGapExtensionCost(0);
1800 handle->SetDustFiltering(false);
1801
1802 // test multiple bug fixes in greedy gapped alignment
1803
1804 CBl2Seq blaster1(ss1, ss2, *handle);
1805 TSeqAlignVector sav(blaster1.Run());
1806 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1807 BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
1808
1809 const CSeq_align& seqalign1 = *sar;
1810 BOOST_REQUIRE(seqalign1.IsSetScore());
1811 ITERATE(CSeq_align::TScore, itr, seqalign1.GetScore()) {
1812 BOOST_REQUIRE((*itr)->IsSetId());
1813 if ((*itr)->GetId().GetStr() == "score") {
1814 BOOST_REQUIRE_EQUAL(619, (*itr)->GetValue().GetInt());
1815 break;
1816 }
1817 }
1818
1819 handle->SetMatchReward(10);
1820 handle->SetMismatchPenalty(-25);
1821 handle->SetGapXDropoff(100.0);
1822 handle->SetGapXDropoffFinal(100.0);
1823
1824 CBl2Seq blaster2(ss1, ss2, *handle);
1825 sav = blaster2.Run();
1826 sar = *(sav[0]->Get().begin());
1827 BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
1828
1829 const CSeq_align& seqalign2 = *sar;
1830 BOOST_REQUIRE(seqalign2.IsSetScore());
1831 ITERATE(CSeq_align::TScore, itr, seqalign2.GetScore()) {
1832 BOOST_REQUIRE((*itr)->IsSetId());
1833 if ((*itr)->GetId().GetStr() == "score") {
1834 BOOST_REQUIRE_EQUAL(6035, (*itr)->GetValue().GetInt());
1835 break;
1836 }
1837 }
1838 }
1839
BOOST_AUTO_TEST_CASE(MegablastGreedyTracebackSelfHits)1840 BOOST_AUTO_TEST_CASE(MegablastGreedyTracebackSelfHits) {
1841 // the following tests are for JIRA SB-1041, where the greedy
1842 // traceback may stop prematurely.
1843 CSeq_id query_id("gi|56384585");
1844 auto_ptr<SSeqLoc> ql(
1845 CTestObjMgr::Instance().CreateSSeqLoc(query_id,
1846 eNa_strand_plus));
1847
1848 CSeq_id subject_id("gi|56384585");
1849 auto_ptr<SSeqLoc> sl(
1850 CTestObjMgr::Instance().CreateSSeqLoc(subject_id,
1851 eNa_strand_plus));
1852
1853 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
1854 opts->SetTraditionalMegablastDefaults();
1855 opts->SetMatchReward(1);
1856 opts->SetMismatchPenalty(-2);
1857 opts->SetGapOpeningCost(0);
1858 opts->SetGapExtensionCost(0);
1859 opts->SetWordSize(28);
1860 opts->SetGapExtnAlgorithm(eGreedyScoreOnly);
1861 opts->SetGapTracebackAlgorithm(eGreedyTbck);
1862
1863 CBl2Seq blaster(*ql, *sl, *opts);
1864
1865 CRef<CSearchResultSet> res = blaster.RunEx();
1866 BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
1867 BOOST_REQUIRE_EQUAL(1U, res->size());
1868 CConstRef<CSeq_align_set> sas = (*res)[0].GetSeqAlign();
1869 BOOST_REQUIRE(sas.NotEmpty());
1870 CRef<CSeq_align> sa = sas->Get().front();
1871 int score = 0;
1872 sa->GetNamedScore(CSeq_align::eScore_Score, score);
1873 BOOST_REQUIRE_EQUAL(3794584, score);
1874 }
1875
BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryPlusStrand)1876 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryPlusStrand) {
1877 CSeq_id qid("gi|555");
1878 auto_ptr<SSeqLoc> query(
1879 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_plus));
1880
1881 CSeq_id sid("gi|129295");
1882 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1883
1884 CBl2Seq blaster(*query, *subj, eBlastx);
1885 TSeqAlignVector sav(blaster.Run());
1886 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1887 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetStd().size());
1888 }
1889
BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryMinusStrand)1890 BOOST_AUTO_TEST_CASE(Blastx2Seqs_QueryMinusStrand) {
1891 CSeq_id qid("gi|555");
1892 auto_ptr<SSeqLoc> query(
1893 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_minus));
1894
1895 CSeq_id sid("gi|129295");
1896 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
1897
1898 CBl2Seq blaster(*query, *subj, eBlastx);
1899 TSeqAlignVector sav(blaster.Run());
1900 // No hits. Empty CSeq_align_set returned.
1901 BOOST_REQUIRE(sav[0]->IsEmpty() == true);
1902 }
1903
1904
BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryBothStrands)1905 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryBothStrands) {
1906 CSeq_id id("gi|555");
1907 auto_ptr<SSeqLoc> sl(
1908 CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_both));
1909
1910 CBl2Seq blaster(*sl, *sl, eTblastx);
1911 TSeqAlignVector sav(blaster.Run());
1912 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1913 BOOST_REQUIRE_EQUAL(39, (int)sar->GetSegs().GetStd().size());
1914 testBlastHitCounts(blaster, eTblastx_555_555);
1915 testRawCutoffs(blaster, eTblastx, eTblastx_555_555);
1916 }
1917
BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryPlusStrand)1918 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryPlusStrand) {
1919 CSeq_id id("gi|555");
1920 auto_ptr<SSeqLoc> sl(
1921 CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_plus));
1922
1923 CBl2Seq blaster(*sl, *sl, eTblastx);
1924 TSeqAlignVector sav(blaster.Run());
1925 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1926 BOOST_REQUIRE_EQUAL(11, (int)sar->GetSegs().GetStd().size());
1927 }
1928
BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryMinusStrand)1929 BOOST_AUTO_TEST_CASE(TBlastx2Seqs_QueryMinusStrand) {
1930 CSeq_id id("gi|555");
1931 auto_ptr<SSeqLoc> sl(
1932 CTestObjMgr::Instance().CreateSSeqLoc(id, eNa_strand_minus));
1933
1934 CBl2Seq blaster(*sl, *sl, eTblastx);
1935 TSeqAlignVector sav(blaster.Run());
1936 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1937 BOOST_REQUIRE_EQUAL(12, (int)sar->GetSegs().GetStd().size());
1938 }
1939
1940
BOOST_AUTO_TEST_CASE(TblastxManyHits)1941 BOOST_AUTO_TEST_CASE(TblastxManyHits) {
1942 const int total_num_hsps = 49;
1943 const int num_hsps_to_check = 8;
1944 const int score_array[num_hsps_to_check] =
1945 { 947, 125, 820, 113, 624, 221, 39, 778};
1946 const int sum_n_array[num_hsps_to_check] =
1947 { 2, 2, 2, 2, 3, 3, 3, 0};
1948 CSeq_id qid("gi|24719404");
1949 auto_ptr<SSeqLoc> qsl(
1950 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
1951 CSeq_id sid("gi|29807292");
1952 pair<TSeqPos, TSeqPos> range(15185000, 15195000);
1953 auto_ptr<SSeqLoc> ssl(
1954 CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
1955 CBl2Seq blaster(*qsl, *ssl, eTblastx);
1956 blaster.SetOptionsHandle().SetMaxNumHspPerSequence(total_num_hsps);
1957
1958 TSeqAlignVector sav(blaster.Run());
1959
1960 testBlastHitCounts(blaster, eTblastx_many_hits);
1961 testRawCutoffs(blaster, eTblastx, eTblastx_many_hits);
1962
1963 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1964 list< CRef<CStd_seg> >& segs = sar->SetSegs().SetStd();
1965 BOOST_REQUIRE_EQUAL(total_num_hsps, (int)segs.size());
1966 int index = 0;
1967 ITERATE(list< CRef<CStd_seg> >, itr, segs) {
1968 const vector< CRef< CScore > >& score_v = (*itr)->GetScores();
1969 ITERATE(CSeq_align::TScore, sitr, score_v) {
1970 BOOST_REQUIRE((*sitr)->IsSetId());
1971 if ((*sitr)->GetId().GetStr() == "score") {
1972 BOOST_REQUIRE_EQUAL(score_array[index],
1973 (*sitr)->GetValue().GetInt());
1974 } else if ((*sitr)->GetId().GetStr() == "sum_n") {
1975 BOOST_REQUIRE_EQUAL(sum_n_array[index],
1976 (*sitr)->GetValue().GetInt());
1977 }
1978 }
1979 if (++index == num_hsps_to_check)
1980 break;
1981 }
1982 }
1983
BOOST_AUTO_TEST_CASE(ProteinBlast2Seqs)1984 BOOST_AUTO_TEST_CASE(ProteinBlast2Seqs) {
1985 CSeq_id id("gi|129295");
1986 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(id));
1987
1988 id.SetGi(GI_CONST(7662354));
1989 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(id));
1990
1991 CBl2Seq blaster(*query, *subj, eBlastp);
1992 TSeqAlignVector sav(blaster.Run());
1993 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
1994 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
1995 testBlastHitCounts(blaster, eBlastp_129295_7662354);
1996 testRawCutoffs(blaster, eBlastp, eBlastp_129295_7662354);
1997 }
1998
BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering_InvalidDB)1999 BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering_InvalidDB) {
2000 CSeq_id qid("gi|555");
2001 auto_ptr<SSeqLoc> query(
2002 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2003
2004 CBlastNucleotideOptionsHandle opts;
2005 opts.SetTraditionalMegablastDefaults();
2006 const string kRepeatDb("junk");
2007 opts.SetRepeatFilteringDB(kRepeatDb.c_str());
2008 bool is_repeat_filtering_on = opts.GetRepeatFiltering();
2009 BOOST_REQUIRE(is_repeat_filtering_on);
2010 string repeat_db(opts.GetRepeatFilteringDB()
2011 ? opts.GetRepeatFilteringDB()
2012 : kEmptyStr);
2013 BOOST_REQUIRE_EQUAL(kRepeatDb, repeat_db);
2014
2015 CBl2Seq blaster(*query, *query, opts);
2016 BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2017 BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2018 }
2019
BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering)2020 BOOST_AUTO_TEST_CASE(BlastnWithRepeatFiltering) {
2021 CSeq_id qid("gi|555");
2022 auto_ptr<SSeqLoc> query(
2023 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2024
2025 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2026 opts->SetTraditionalMegablastDefaults();
2027 opts->SetRepeatFiltering(true);
2028 string repeat_db(opts->GetRepeatFilteringDB()
2029 ? opts->GetRepeatFilteringDB()
2030 : kEmptyStr);
2031 BOOST_REQUIRE_EQUAL(string(kDefaultRepeatFilterDb), repeat_db);
2032 // it's harmless to set them both, but only the latter one will be used
2033 const string kRepeatDb("repeat/repeat_9606");
2034 opts->SetRepeatFilteringDB(kRepeatDb.c_str());
2035 repeat_db.assign(opts->GetRepeatFilteringDB()
2036 ? opts->GetRepeatFilteringDB()
2037 : kEmptyStr);
2038 BOOST_REQUIRE_EQUAL(kRepeatDb, repeat_db);
2039
2040 bool is_repeat_filtering_on = opts->GetRepeatFiltering();
2041 BOOST_REQUIRE(is_repeat_filtering_on);
2042
2043 CBl2Seq blaster(*query, *query, *opts);
2044 TSeqAlignVector sav(blaster.Run());
2045 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2046 BOOST_REQUIRE(sar.NotEmpty());
2047 BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2048 }
2049
BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Db)2050 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Db) {
2051 CSeq_id qid("gi|555");
2052 auto_ptr<SSeqLoc> query(
2053 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2054
2055 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2056 opts->SetTraditionalMegablastDefaults();
2057 const string kWindowMaskerDb = WindowMaskerTaxidToDb(9606);
2058 opts->SetWindowMaskerDatabase(kWindowMaskerDb.c_str());
2059 string wmdb(opts->GetWindowMaskerDatabase()
2060 ? opts->GetWindowMaskerDatabase() : kEmptyStr);
2061 BOOST_REQUIRE_EQUAL(kWindowMaskerDb, wmdb);
2062 BOOST_REQUIRE_EQUAL(0, opts->GetWindowMaskerTaxId());
2063 CBl2Seq blaster(*query, *query, *opts);
2064 TSeqAlignVector sav(blaster.Run());
2065 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2066 BOOST_REQUIRE(sar.NotEmpty());
2067 BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2068 }
2069
BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Taxid)2070 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_Taxid) {
2071 CSeq_id qid("gi|555");
2072 auto_ptr<SSeqLoc> query(
2073 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2074
2075 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2076 opts->SetTraditionalMegablastDefaults();
2077 opts->SetWindowMaskerTaxId(9606);
2078 CBl2Seq blaster(*query, *query, *opts);
2079 TSeqAlignVector sav(blaster.Run());
2080 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2081 BOOST_REQUIRE(sar.NotEmpty());
2082 BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2083 }
2084
BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidDb)2085 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidDb) {
2086 CSeq_id qid("gi|555");
2087 auto_ptr<SSeqLoc> query(
2088 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2089
2090 CBlastNucleotideOptionsHandle opts;
2091 opts.SetTraditionalMegablastDefaults();
2092 const string kWindowMaskerDb("Dummydb");
2093 opts.SetWindowMaskerDatabase(kWindowMaskerDb.c_str());
2094 string wmdb(opts.GetWindowMaskerDatabase()
2095 ? opts.GetWindowMaskerDatabase() : kEmptyStr);
2096 BOOST_REQUIRE_EQUAL(kWindowMaskerDb, wmdb);
2097 CBl2Seq blaster(*query, *query, opts);
2098 BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2099 BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2100 }
2101
BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidTaxid)2102 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_InvalidTaxid) {
2103 CSeq_id qid("gi|555");
2104 auto_ptr<SSeqLoc> query(
2105 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2106
2107 CBlastNucleotideOptionsHandle opts;
2108 opts.SetTraditionalMegablastDefaults();
2109 const int kInvalidTaxId = -1;
2110 opts.SetWindowMaskerTaxId(kInvalidTaxId);
2111 BOOST_REQUIRE_EQUAL(kInvalidTaxId, opts.GetWindowMaskerTaxId());
2112 CBl2Seq blaster(*query, *query, opts);
2113 BOOST_CHECK_THROW((void) blaster.Run(), CBlastException);
2114 BOOST_CHECK_THROW((void) blaster.RunEx(), CBlastException);
2115 }
2116
BOOST_AUTO_TEST_CASE(WindowMaskerPathInitInvalid)2117 BOOST_AUTO_TEST_CASE(WindowMaskerPathInitInvalid) {
2118 int rv = WindowMaskerPathInit(kEmptyStr);
2119 BOOST_CHECK_EQUAL(1, rv);
2120 rv = WindowMaskerPathInit("blast_unit_test");
2121 BOOST_CHECK_EQUAL(1, rv);
2122 }
2123
BOOST_AUTO_TEST_CASE(WindowMaskerPathInitValid)2124 BOOST_AUTO_TEST_CASE(WindowMaskerPathInitValid) {
2125 int rv = WindowMaskerPathInit(CDir::GetCwd());
2126 BOOST_CHECK_EQUAL(0, rv);
2127 WindowMaskerPathReset();
2128 }
2129
BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_DbAndTaxid)2130 BOOST_AUTO_TEST_CASE(BlastnWithWindowMasker_DbAndTaxid) {
2131 CSeq_id qid("gi|555");
2132 auto_ptr<SSeqLoc> query(
2133 CTestObjMgr::Instance().CreateSSeqLoc(qid));
2134
2135 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2136 opts->SetTraditionalMegablastDefaults();
2137 // if both are set, the database name will be given preference
2138 opts->SetWindowMaskerDatabase(WindowMaskerTaxidToDb(9606).c_str());
2139 opts->SetWindowMaskerTaxId(-1);
2140 CBl2Seq blaster(*query, *query, *opts);
2141 TSeqAlignVector sav(blaster.Run());
2142 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2143 BOOST_REQUIRE(sar.NotEmpty());
2144 BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2145 }
2146
BOOST_AUTO_TEST_CASE(ConvertTaxIdToWindowMaskerDb)2147 BOOST_AUTO_TEST_CASE(ConvertTaxIdToWindowMaskerDb) {
2148 string path;
2149 BOOST_CHECK_NO_THROW(path = WindowMaskerTaxidToDb(9606));
2150 BOOST_REQUIRE(NStr::EndsWith(path, "wmasker.obinary") ||
2151 NStr::EndsWith(path, "wmasker.oascii") );
2152 }
2153
2154 // Bug report from Alex Astashyn
BOOST_AUTO_TEST_CASE(Alex)2155 BOOST_AUTO_TEST_CASE(Alex) {
2156 CSeq_id qid("NG_007092.2");
2157 TSeqRange qr(0, 2311633);
2158 auto_ptr<SSeqLoc> query(
2159 CTestObjMgr::Instance().CreateSSeqLoc(qid, qr, eNa_strand_plus));
2160
2161 CSeq_id sid("NT_007914.14");
2162 TSeqRange sr(5233652, 9849919);
2163 auto_ptr<SSeqLoc> subj(
2164 CTestObjMgr::Instance().CreateSSeqLoc(sid, sr));
2165
2166 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2167 opts->SetTraditionalMegablastDefaults();
2168 opts->SetRepeatFiltering(true);
2169 CBl2Seq blaster(*query, *subj, *opts);
2170 TSeqAlignVector sav(blaster.Run());
2171 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2172 BOOST_REQUIRE(sar.NotEmpty());
2173 BOOST_REQUIRE(sar->GetSegs().GetDenseg().GetNumseg() >= 1);
2174 }
2175
BOOST_AUTO_TEST_CASE(NucleotideBlast2Seqs)2176 BOOST_AUTO_TEST_CASE(NucleotideBlast2Seqs) {
2177 CSeq_id qid("gi|555");
2178 auto_ptr<SSeqLoc> query(
2179 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2180
2181 CSeq_id sid("gi|3090");
2182 auto_ptr<SSeqLoc> subj(
2183 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2184
2185 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2186 opts->SetTraditionalBlastnDefaults();
2187 CBl2Seq blaster(*query, *subj, *opts);
2188 TSeqAlignVector sav(blaster.Run());
2189 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2190 BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2191 testBlastHitCounts(blaster, eBlastn_555_3090);
2192 testRawCutoffs(blaster, eBlastn, eBlastn_555_3090);
2193 }
2194
BOOST_AUTO_TEST_CASE(ProteinBlastChangeQuery)2195 BOOST_AUTO_TEST_CASE(ProteinBlastChangeQuery) {
2196 CSeq_id id("gi|129295");
2197 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(id));
2198
2199 id.SetGi(GI_CONST(7662354));
2200 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(id));
2201
2202 // Run self hit first
2203 CBl2Seq blaster(*subj, *subj, eBlastp);
2204 TSeqAlignVector sav(blaster.Run());
2205 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2206 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2207
2208 // Change the query sequence (recreates the lookup table)
2209 blaster.SetQuery(*query);
2210 sav = blaster.Run();
2211 sar = *(sav[0]->Get().begin());
2212 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2213 }
2214
BOOST_AUTO_TEST_CASE(ProteinBlastChangeSubject)2215 BOOST_AUTO_TEST_CASE(ProteinBlastChangeSubject) {
2216 CSeq_id qid("gi|129295");
2217 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2218
2219 CSeq_id sid("gi|7662354");
2220 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2221
2222 // Run self hit first
2223 CBl2Seq blaster(*query, *query, eBlastp);
2224 TSeqAlignVector sav(blaster.Run());
2225 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2226 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2227
2228 // Change the subject sequence
2229 blaster.SetSubject(*subj);
2230 sav = blaster.Run();
2231 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
2232 sar = *(sav[0]->Get().begin());
2233 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2234 }
2235
BOOST_AUTO_TEST_CASE(NucleotideBlastChangeQuery)2236 BOOST_AUTO_TEST_CASE(NucleotideBlastChangeQuery) {
2237 CSeq_id qid("gi|555");
2238 auto_ptr<SSeqLoc> query(
2239 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2240
2241 CSeq_id sid("gi|3090");
2242 auto_ptr<SSeqLoc> subj(
2243 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2244
2245 // Run self hit first
2246 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2247 opts->SetTraditionalBlastnDefaults();
2248 CBl2Seq blaster(*subj, *subj, *opts);
2249 TSeqAlignVector sav(blaster.Run());
2250 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2251 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2252
2253 // Change the query sequence (recreates the lookup table)
2254 blaster.SetQuery(*query);
2255 sav = blaster.Run();
2256 BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2257 sar = *(sav[0]->Get().begin());
2258 BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2259 }
2260
BOOST_AUTO_TEST_CASE(NucleotideBlastChangeSubject)2261 BOOST_AUTO_TEST_CASE(NucleotideBlastChangeSubject) {
2262 CSeq_id qid("gi|555");
2263 auto_ptr<SSeqLoc> query(
2264 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2265
2266 CSeq_id sid("gi|3090");
2267 auto_ptr<SSeqLoc> subj(
2268 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2269
2270 // Run self hit first
2271 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2272 opts->SetTraditionalBlastnDefaults();
2273 CBl2Seq blaster(*query, *query, *opts);
2274 TSeqAlignVector sav(blaster.Run());
2275 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2276 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2277
2278 // Change the subject sequence
2279 blaster.SetSubject(*subj);
2280 sav = blaster.Run();
2281 sar = *(sav[0]->Get().begin());
2282 BOOST_REQUIRE_EQUAL(3, (int)sar->GetSegs().GetDenseg().GetNumseg());
2283 }
2284
2285
BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueries)2286 BOOST_AUTO_TEST_CASE(ProteinBlastMultipleQueries) {
2287 TSeqLocVector sequences;
2288
2289 CSeq_id qid("gi|129295");
2290 auto_ptr<SSeqLoc> sl1(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2291 sequences.push_back(*sl1);
2292
2293 CSeq_id sid("gi|7662354");
2294 auto_ptr<SSeqLoc> sl2(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2295 sequences.push_back(*sl2);
2296
2297 CBl2Seq blaster(sequences, sequences, eBlastp);
2298 TSeqAlignVector seqalign_v = blaster.Run();
2299
2300 /* DEBUG OUTPUT
2301 for (size_t i = 0; i < seqalign_v.size(); i++)
2302 cerr << "\n<" << i << ">\n"
2303 << MSerial_AsnText << seqalign_v[i].GetObject() << endl;
2304 */
2305
2306 BOOST_REQUIRE_EQUAL(4, (int)seqalign_v.size());
2307 BOOST_REQUIRE_EQUAL(2, (int)sequences.size());
2308
2309 CRef<CSeq_align> sar;
2310
2311 BOOST_REQUIRE_EQUAL(1U, seqalign_v[0]->Get().size());
2312 sar = *(seqalign_v[0]->Get().begin());
2313 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2314
2315 BOOST_REQUIRE_EQUAL((size_t)2, seqalign_v[1]->Get().size());
2316 sar = *(seqalign_v[1]->Get().begin());
2317 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2318 sar = *(++(seqalign_v[1]->Get().begin()));
2319 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2320
2321 BOOST_REQUIRE_EQUAL((size_t)3, seqalign_v[2]->Get().size());
2322 sar = *(seqalign_v[2]->Get().begin());
2323 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2324 sar = *(++(seqalign_v[2]->Get().begin()));
2325 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2326
2327 BOOST_REQUIRE_EQUAL((size_t)1, seqalign_v[3]->Get().size());
2328 sar = *(seqalign_v[3]->Get().begin());
2329 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2330
2331
2332 testBlastHitCounts(blaster, eBlastp_multi_q);
2333 testRawCutoffs(blaster, eBlastp, eBlastp_multi_q);
2334
2335 // test the order of queries and subjects:
2336 testResultAlignments(sequences.size(), sequences.size(),
2337 seqalign_v);
2338 }
2339
BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueries)2340 BOOST_AUTO_TEST_CASE(NucleotideBlastMultipleQueries) {
2341 TSeqLocVector sequences;
2342
2343 CSeq_id qid("gi|555");
2344 auto_ptr<SSeqLoc> sl1(
2345 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2346 sequences.push_back(*sl1);
2347 BOOST_REQUIRE(sl1->mask.Empty());
2348
2349 CSeq_id sid("gi|3090");
2350 auto_ptr<SSeqLoc> sl2(
2351 CTestObjMgr::Instance().CreateSSeqLoc(sid, eNa_strand_both));
2352 sequences.push_back(*sl2);
2353 BOOST_REQUIRE(sl2->mask.Empty());
2354
2355 CBl2Seq blaster(sequences, sequences, eBlastn);
2356 TSeqAlignVector seqalign_v = blaster.Run();
2357 BOOST_REQUIRE_EQUAL(2, (int)sequences.size());
2358 BOOST_REQUIRE_EQUAL(4, (int)seqalign_v.size());
2359
2360 CRef<CSeq_align> sar = *(seqalign_v[0]->Get().begin());
2361 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2362
2363 // in older version this was seqalign_v[1], order has changed
2364 sar = *(seqalign_v[2]->Get().begin());
2365 BOOST_REQUIRE_EQUAL((CDense_seg::TNumseg)1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2366
2367 testBlastHitCounts(blaster, eBlastn_multi_q);
2368 testRawCutoffs(blaster, eBlastn, eBlastn_multi_q);
2369
2370 // test the order of queries and subjects:
2371 testResultAlignments(sequences.size(), sequences.size(),
2372 seqalign_v);
2373 }
2374
2375 #if 0
2376 void DoSearchWordSize4(const char *file1, const char *file2) {
2377 CRef<CObjectManager> kObjMgr = CObjectManager::GetInstance();
2378 CRef<CScope> scope(new CScope(*kObjMgr));
2379
2380 CRef<CSeq_entry> seq_entry1;
2381 ifstream in1(file1);
2382 if ( !in1 )
2383 throw runtime_error("Failed to open file1");
2384 if ( !(seq_entry1 = CFastaReader(in1).ReadOneSeq()))
2385 throw runtime_error("Failed to read sequence from file1");
2386 scope->AddTopLevelSeqEntry(*seq_entry1);
2387 CRef<CSeq_loc> seqloc1(new CSeq_loc);
2388 const string kSeqIdString1("lcl|1");
2389 CRef<CSeq_id> id1(new CSeq_id(kSeqIdString1));
2390 seqloc1->SetWhole(*id1);
2391 SSeqLoc ss1(seqloc1, scope);
2392
2393 CRef<CSeq_entry> seq_entry2;
2394 ifstream in2(file2);
2395 if ( !in2 )
2396 throw runtime_error("Failed to open file2");
2397 if ( !(seq_entry2 = CFastaReader(in2).ReadOneSeq()))
2398 throw runtime_error("Failed to read sequence from file2");
2399 scope->AddTopLevelSeqEntry(*seq_entry2);
2400 CRef<CSeq_loc> seqloc2(new CSeq_loc);
2401 const string kSeqIdString2("lcl|2");
2402 CRef<CSeq_id> id2(new CSeq_id(kSeqIdString2));
2403 seqloc2->SetWhole(*id2);
2404 SSeqLoc ss2(seqloc2, scope);
2405
2406 CBlastNucleotideOptionsHandle handle;
2407 handle.SetTraditionalBlastnDefaults();
2408 handle.SetWordSize(4);
2409 handle.SetDustFiltering(false);
2410 handle.SetMismatchPenalty(-1);
2411 handle.SetMatchReward(1);
2412 handle.SetEvalueThreshold(10000);
2413
2414 CBl2Seq blaster(ss1, ss2, handle);
2415 blaster.RunWithoutSeqalignGeneration(); /* NCBI_FAKE_WARNING */
2416 BlastHSPResults *results = blaster.GetResults(); /* NCBI_FAKE_WARNING */
2417 BOOST_REQUIRE(results != NULL);
2418 BOOST_REQUIRE(results->hitlist_array[0] != NULL);
2419 BOOST_REQUIRE(results->hitlist_array[0]->hsplist_array[0] != NULL);
2420 BlastHSPList *hsp_list = results->hitlist_array[0]->hsplist_array[0];
2421 BOOST_REQUIRE(hsp_list->hspcnt > 0);
2422 BOOST_REQUIRE(hsp_list->hsp_array[0] != NULL);
2423
2424 // verify that all hits are properly formed, and
2425 // at least as long as the word size
2426
2427 for (int i = 0; i < hsp_list->hspcnt; i++) {
2428 BlastHSP *hsp = hsp_list->hsp_array[i];
2429 BOOST_REQUIRE(hsp != NULL);
2430 BOOST_REQUIRE(hsp->query.offset < hsp->query.end);
2431 BOOST_REQUIRE(hsp->subject.offset < hsp->subject.end);
2432 BOOST_REQUIRE(hsp->query.gapped_start >= hsp->query.offset &&
2433 hsp->query.gapped_start < hsp->query.end);
2434 BOOST_REQUIRE(hsp->subject.gapped_start >= hsp->subject.offset &&
2435 hsp->subject.gapped_start < hsp->subject.end);
2436 BOOST_REQUIRE(hsp->query.end - hsp->query.offset >= 4);
2437 BOOST_REQUIRE(hsp->subject.end - hsp->subject.offset >= 4);
2438 }
2439
2440 CRef<CSearchResultSet> res = blaster.RunEx();
2441 BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
2442 BOOST_REQUIRE_EQUAL(1, res->GetNumQueries());
2443 BOOST_REQUIRE_EQUAL(1, res->GetNumResults());
2444 BOOST_REQUIRE_EQUAL(1, res->size());
2445 CSearchResults& r = res->GetResults(0, 0);
2446 BOOST_REQUIRE(r.HasAlignments());
2447 CConstRef<CSeq_align_set> a = r.GetSeqAlign();
2448 cerr << MSerial_AsnText << * a<< endl;
2449
2450 }
2451
2452 BOOST_AUTO_TEST_CASE(NucleotideBlastWordSize4) {
2453 DoSearchWordSize4("data/blastn_size4a.fsa",
2454 "data/blastn_size4b.fsa");
2455 }
2456
2457 // test bug fix when size-4 seed falls at the end of
2458 // the subject sequence
2459 BOOST_AUTO_TEST_CASE(NucleotideBlastWordSize4_EOS) {
2460 DoSearchWordSize4("data/blastn_size4c.fsa",
2461 "data/blastn_size4d.fsa");
2462 }
2463 #endif
2464
BOOST_AUTO_TEST_CASE(TblastnOutOfFrame)2465 BOOST_AUTO_TEST_CASE(TblastnOutOfFrame) {
2466 CSeq_id qid("NP_647642.2"); // Protein sequence
2467 CSeq_id sid("BC042576.1"); // DNA sequence
2468
2469 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2470 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2471
2472 // Set the options
2473 CRef<CTBlastnOptionsHandle> opts(new CTBlastnOptionsHandle);
2474 opts->SetOutOfFrameMode();
2475 opts->SetFrameShiftPenalty(10);
2476 opts->SetFilterString("m;L");/* NCBI_FAKE_WARNING */
2477 opts->SetEvalueThreshold(0.01);
2478 opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2479
2480 CBl2Seq blaster(*query, *subj, *opts);
2481 TSeqAlignVector sav(blaster.Run());
2482 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
2483 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2484 BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2485 testBlastHitCounts(blaster, eTblastn_oof);
2486 testRawCutoffs(blaster, eTblastn, eTblastn_oof);
2487 }
2488
2489 // test multiple fixes for OOF alignments on both subject strands
BOOST_AUTO_TEST_CASE(TblastnOutOfFrame2)2490 BOOST_AUTO_TEST_CASE(TblastnOutOfFrame2) {
2491 CSeq_id qid("gi|38111923"); // Protein sequence
2492 CSeq_id sid("gi|6648925"); // DNA sequence
2493
2494 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateSSeqLoc(qid));
2495 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2496
2497 // Set the options
2498 CRef<CTBlastnOptionsHandle> opts(new CTBlastnOptionsHandle);
2499 opts->SetOutOfFrameMode();
2500 opts->SetFrameShiftPenalty(5);
2501 opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2502 opts->SetFilterString("L");/* NCBI_FAKE_WARNING */
2503
2504 CBl2Seq blaster(*query, *subj, *opts);
2505 TSeqAlignVector sav(blaster.Run());
2506 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2507 BOOST_REQUIRE_EQUAL(3, (int)sav[0]->Size());
2508
2509 // test fix for a bug generating OOF traceback
2510
2511 const CSeq_align& seqalign = *sar;
2512 BOOST_REQUIRE(seqalign.IsSetScore());
2513 ITERATE(CSeq_align::TScore, itr, seqalign.GetScore()) {
2514 BOOST_REQUIRE((*itr)->IsSetId());
2515 if ((*itr)->GetId().GetStr() == "num_ident") {
2516 BOOST_REQUIRE_EQUAL(80, (*itr)->GetValue().GetInt());
2517 break;
2518 }
2519 }
2520 }
2521
BOOST_AUTO_TEST_CASE(BlastxOutOfFrame)2522 BOOST_AUTO_TEST_CASE(BlastxOutOfFrame) {
2523 CSeq_id qid("BC042576.1"); // DNA sequence
2524 CSeq_id sid("NP_647642.2"); // Protein sequence
2525
2526 auto_ptr<SSeqLoc> query(
2527 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2528 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2529
2530 // Set the options
2531 CRef<CBlastxOptionsHandle> opts(new CBlastxOptionsHandle);
2532 opts->SetOutOfFrameMode();
2533 opts->SetFrameShiftPenalty(10);
2534 opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2535 opts->SetFilterString("m;L");/* NCBI_FAKE_WARNING */
2536 opts->SetEvalueThreshold(0.01);
2537
2538 CBl2Seq blaster(*query, *subj, *opts);
2539 TSeqAlignVector sav(blaster.Run());
2540 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2541 BOOST_REQUIRE_EQUAL(2, (int)sav[0]->Size());
2542 testBlastHitCounts(blaster, eBlastx_oof);
2543 testRawCutoffs(blaster, eBlastx, eBlastx_oof);
2544 }
2545
2546 // test for a bug computing OOF sequence lengths during traceback
2547
BOOST_AUTO_TEST_CASE(BlastxOutOfFrame_DifferentFrames)2548 BOOST_AUTO_TEST_CASE(BlastxOutOfFrame_DifferentFrames) {
2549 CSeq_id qid("gi|27486285"); // DNA sequence
2550 CSeq_id sid("gi|7331210"); // Protein sequence
2551
2552 auto_ptr<SSeqLoc> query(
2553 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
2554 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
2555
2556 // Set the options
2557 CRef<CBlastxOptionsHandle> opts(new CBlastxOptionsHandle);
2558 opts->SetOutOfFrameMode();
2559 opts->SetFrameShiftPenalty(10);
2560 opts->SetFilterString("L");/* NCBI_FAKE_WARNING */
2561 opts->SetCompositionBasedStats(eNoCompositionBasedStats);
2562
2563 CBl2Seq blaster(*query, *subj, *opts);
2564 TSeqAlignVector sav(blaster.Run());
2565 BOOST_REQUIRE_EQUAL(5, (int)sav[0]->Size());
2566 }
2567
2568 // The following 3 functions are for checking results in the strand
2569 // combinations tests.
2570
x_TestAlignmentQuerySubjStrandCombinations(TSeqAlignVector & sav,string aligned_strands)2571 void x_TestAlignmentQuerySubjStrandCombinations(TSeqAlignVector& sav,
2572 string aligned_strands) {
2573
2574 // Starting offsets in alignment as query/subject pairs
2575 vector< pair<TSignedSeqPos, TSignedSeqPos> > starts;
2576 starts.push_back(make_pair(7685759, 10));
2577 starts.push_back(make_pair(7685758, -1));
2578 starts.push_back(make_pair(7685718, 269));
2579 starts.push_back(make_pair(7685717, -1));
2580 starts.push_back(make_pair(7685545, 309));
2581
2582 const size_t kNumSegments(starts.size());
2583
2584 // Lengths of the aligned regions defined in starts vector
2585 vector<TSeqPos> lengths;
2586 lengths.reserve(kNumSegments);
2587 lengths.push_back(259);
2588 lengths.push_back(1);
2589 lengths.push_back(40);
2590 lengths.push_back(1);
2591 lengths.push_back(172);
2592
2593 // Strands of the involved aligned segments as query/subject pairs
2594 typedef vector< pair<ENa_strand, ENa_strand> > TStrandPairs;
2595 TStrandPairs strands(kNumSegments,
2596 make_pair(eNa_strand_minus, eNa_strand_plus));
2597
2598 // Reverse the contents of the vectors if necessary
2599 if (aligned_strands == "plus-minus") {
2600 reverse(starts.begin(), starts.end());
2601 reverse(lengths.begin(), lengths.end());
2602 NON_CONST_ITERATE(TStrandPairs, itr, strands) {
2603 swap(itr->first, itr->second);
2604 }
2605 }
2606 BOOST_REQUIRE_EQUAL(kNumSegments, lengths.size());
2607 BOOST_REQUIRE_EQUAL(kNumSegments, strands.size());
2608
2609 // Obtain the data from the Seq-align's dense segs ...
2610 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2611 BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2612 // BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
2613 const CDense_seg& ds = sar->GetSegs().GetDenseg();
2614 // CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2615 const size_t kNumDim(ds.GetDim());
2616 vector< TSignedSeqPos > seg_starts = ds.GetStarts();
2617 vector< TSeqPos> seg_lengths = ds.GetLens();
2618 vector< ENa_strand> seg_strands = ds.GetStrands();
2619 BOOST_REQUIRE_EQUAL(kNumSegments, seg_lengths.size());
2620 BOOST_REQUIRE_EQUAL(kNumSegments*kNumDim, seg_starts.size());
2621
2622 // ... and compare it to what is expected
2623 for (size_t index = 0; index < kNumSegments; ++index) {
2624 ostringstream os;
2625 os << "Segment " << index << ": expected " << lengths[index]
2626 << " actual " << seg_lengths[index];
2627 BOOST_REQUIRE_MESSAGE(lengths[index] == seg_lengths[index],
2628 os.str());
2629
2630 os.str("");
2631 os << "Segment " << index << ": expected " << starts[index].first
2632 << " actual " << seg_starts[2*index];
2633 BOOST_REQUIRE_MESSAGE(starts[index].first == seg_starts[2*index],
2634 os.str());
2635 os.str("");
2636 os << "Segment " << index << ": expected " << starts[index].second
2637 << " actual " << seg_starts[2*index];
2638 BOOST_REQUIRE_MESSAGE(starts[index].second == seg_starts[2*index+1],
2639 os.str());
2640 os.str("");
2641 os << "Segment " << index << ": expected " << (int)strands[index].first
2642 << " actual " << (int)seg_strands[2*index];
2643 BOOST_REQUIRE_MESSAGE(strands[index].first == seg_strands[2*index],
2644 os.str());
2645 os.str("");
2646 os << "Segment " << index << ": expected " << (int)strands[index].second
2647 << " actual " << (int)seg_strands[2*index];
2648 BOOST_REQUIRE_MESSAGE(strands[index].second == seg_strands[2*index+1],
2649 os.str());
2650 }
2651 }
2652
testIntervalWholeAlignment(TSeqAlignVector & sav)2653 static void testIntervalWholeAlignment(TSeqAlignVector& sav)
2654 {
2655 const int num_segs = 5;
2656 const int num_starts = 10;
2657 const int starts[num_starts] = { 7685759, 0, 7685758, -1, 7685718,
2658 269, 7685717, -1, 7685545, 309 };
2659 const int lengths[num_segs] = { 269, 1, 40, 1, 172 };
2660 int index;
2661
2662 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2663 BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2664 CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2665 vector< TSignedSeqPos > seg_starts = segs_itr->GetStarts();
2666 vector< TSeqPos> seg_lengths = segs_itr->GetLens();
2667 vector< ENa_strand> seg_strands = segs_itr->GetStrands();
2668 BOOST_REQUIRE_EQUAL(num_segs, (int)seg_lengths.size());
2669 BOOST_REQUIRE_EQUAL(num_starts, (int)seg_starts.size());
2670 for (index = 0; index < num_segs; ++index) {
2671 BOOST_REQUIRE_EQUAL(lengths[index], (int)seg_lengths[index]);
2672 BOOST_REQUIRE_EQUAL(starts[2*index], (int)seg_starts[2*index]);
2673 BOOST_REQUIRE_EQUAL(starts[2*index+1], (int)seg_starts[2*index+1]);
2674 BOOST_REQUIRE(seg_strands[2*index] == eNa_strand_minus);
2675 BOOST_REQUIRE(seg_strands[2*index+1] == eNa_strand_plus);
2676 }
2677 }
2678
testWholeIntervalAlignment(TSeqAlignVector & sav)2679 static void testWholeIntervalAlignment(TSeqAlignVector& sav)
2680 {
2681 const int num_segs = 5;
2682 const int num_starts = 10;
2683 const int starts[num_starts] = { 309, 7685545, -1, 7685717, 269, 7685718,
2684 -1, 7685758, 0, 7685759 };
2685 const int lengths[num_segs] = { 172, 1, 40, 1, 269 };
2686 int index;
2687
2688 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
2689 BOOST_REQUIRE_EQUAL(1, (int)sav[0]->Size());
2690 CTypeIterator<CDense_seg> segs_itr(Begin(*sar));
2691 vector< TSignedSeqPos > seg_starts = segs_itr->GetStarts();
2692 vector< TSeqPos> seg_lengths = segs_itr->GetLens();
2693 vector< ENa_strand> seg_strands = segs_itr->GetStrands();
2694 BOOST_REQUIRE_EQUAL(num_segs, (int)seg_lengths.size());
2695 BOOST_REQUIRE_EQUAL(num_starts, (int)seg_starts.size());
2696 for (index = 0; index < num_segs; ++index) {
2697 BOOST_REQUIRE_EQUAL(lengths[index], (int)seg_lengths[index]);
2698 BOOST_REQUIRE_EQUAL(starts[2*index], (int)seg_starts[2*index]);
2699 BOOST_REQUIRE_EQUAL(starts[2*index+1], (int)seg_starts[2*index+1]);
2700 BOOST_REQUIRE(seg_strands[2*index] == eNa_strand_minus);
2701 BOOST_REQUIRE(seg_strands[2*index+1] == eNa_strand_plus);
2702 }
2703 }
2704
2705 // should find alignments
BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjBothStrands)2706 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjBothStrands) {
2707 // Alignment in these sequences is from plus/minus strands
2708 CSeq_id qid("NT_004487.15");
2709 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2710 auto_ptr<SSeqLoc> query(
2711 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2712 eNa_strand_both));
2713
2714 CSeq_id sid("AA441981.1");
2715 range.first = 10;
2716 range.second = 480;
2717 auto_ptr<SSeqLoc> subj(
2718 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2719 eNa_strand_both));
2720
2721 CBlastNucleotideOptionsHandle* opts = new CBlastNucleotideOptionsHandle;
2722 opts->SetTraditionalBlastnDefaults();
2723 CBl2Seq blaster(*query, *subj, *opts);
2724 TSeqAlignVector sav(blaster.Run());
2725 x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2726 }
2727
2728 // should find alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjPlusStrand)2729 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjPlusStrand) {
2730 // Alignment in these sequences is from plus/minus strands
2731 CSeq_id qid("NT_004487.15");
2732 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2733 auto_ptr<SSeqLoc> query(
2734 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2735 eNa_strand_both));
2736
2737 CSeq_id sid("AA441981.1");
2738 range.first = 10;
2739 range.second = 480;
2740 auto_ptr<SSeqLoc> subj(
2741 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2742 eNa_strand_plus));
2743
2744 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2745 opts->SetTraditionalBlastnDefaults();
2746 CBl2Seq blaster(*query, *subj, *opts);
2747 TSeqAlignVector sav(blaster.Run());
2748 x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2749 }
2750
2751 // should find alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjMinusStrand)2752 BOOST_AUTO_TEST_CASE(Blastn_QueryBothStrands_SubjMinusStrand) {
2753 // Alignment in these sequences is from plus/minus strands
2754 CSeq_id qid("NT_004487.15");
2755 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2756 auto_ptr<SSeqLoc> query(
2757 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2758 eNa_strand_both));
2759
2760 CSeq_id sid("AA441981.1");
2761 range.first = 10;
2762 range.second = 480;
2763 auto_ptr<SSeqLoc> subj(
2764 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2765 eNa_strand_minus));
2766
2767 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2768 opts->SetTraditionalBlastnDefaults();
2769 CBl2Seq blaster(*query, *subj, *opts);
2770 TSeqAlignVector sav(blaster.Run());
2771 x_TestAlignmentQuerySubjStrandCombinations(sav, "plus-minus");
2772 }
2773
2774 // shouldn't find an alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjPlusStrand)2775 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjPlusStrand) {
2776 // Alignment in these sequences is from plus/minus strands
2777 CSeq_id qid("NT_004487.15");
2778 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2779 auto_ptr<SSeqLoc> query(
2780 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2781 eNa_strand_plus));
2782
2783 CSeq_id sid("AA441981.1");
2784 range.first = 10;
2785 range.second = 480;
2786 auto_ptr<SSeqLoc> subj(
2787 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2788 eNa_strand_plus));
2789
2790 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2791 opts->SetTraditionalBlastnDefaults();
2792 CBl2Seq blaster(*query, *subj, *opts);
2793 TSeqAlignVector sav(blaster.Run());
2794 BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2795 }
2796
2797 // should find an alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjMinusStrand)2798 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjMinusStrand) {
2799 // Alignment in these sequences is from plus/minus strands
2800 CSeq_id qid("NT_004487.15");
2801 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2802 auto_ptr<SSeqLoc> query(
2803 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2804 eNa_strand_plus));
2805
2806 CSeq_id sid("AA441981.1");
2807 range.first = 10;
2808 range.second = 480;
2809 auto_ptr<SSeqLoc> subj(
2810 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2811 eNa_strand_minus));
2812
2813 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2814 opts->SetTraditionalBlastnDefaults();
2815 CBl2Seq blaster(*query, *subj, *opts);
2816 TSeqAlignVector sav(blaster.Run());
2817 x_TestAlignmentQuerySubjStrandCombinations(sav, "plus-minus");
2818 }
2819
2820 // should NOT find an alignment because we only search the plus strand of
2821 // the subject sequence
BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjBothStrands)2822 BOOST_AUTO_TEST_CASE(Blastn_QueryPlusStrand_SubjBothStrands) {
2823 // Alignment in these sequences is from plus/minus strands
2824 CSeq_id qid("NT_004487.15");
2825 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2826 auto_ptr<SSeqLoc> query(
2827 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2828 eNa_strand_plus));
2829
2830 CSeq_id sid("AA441981.1");
2831 range.first = 10;
2832 range.second = 480;
2833 auto_ptr<SSeqLoc> subj(
2834 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2835 eNa_strand_both));
2836
2837 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2838 opts->SetTraditionalBlastnDefaults();
2839 CBl2Seq blaster(*query, *subj, *opts);
2840 TSeqAlignVector sav(blaster.Run());
2841 BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2842 }
2843
2844 // should not find an alignment because alignment is on opposite strands
BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjMinusStrand)2845 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjMinusStrand) {
2846 // Alignment in these sequences is from plus/minus strands
2847 CSeq_id qid("NT_004487.15");
2848 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2849 auto_ptr<SSeqLoc> query(
2850 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2851 eNa_strand_minus));
2852
2853 CSeq_id sid("AA441981.1");
2854 range.first = 10;
2855 range.second = 480;
2856 auto_ptr<SSeqLoc> subj(
2857 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2858 eNa_strand_minus));
2859
2860 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2861 opts->SetTraditionalBlastnDefaults();
2862 CBl2Seq blaster(*query, *subj, *opts);
2863 TSeqAlignVector sav(blaster.Run());
2864 BOOST_REQUIRE(sav[0]->IsEmpty() == true);
2865 }
2866
2867 // should find alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjPlusStrand)2868 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjPlusStrand) {
2869 // Alignment in these sequences is from plus/minus strands
2870 CSeq_id qid("NT_004487.15");
2871 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2872 auto_ptr<SSeqLoc> query(
2873 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2874 eNa_strand_minus));
2875
2876 CSeq_id sid("AA441981.1");
2877 range.first = 10;
2878 range.second = 480;
2879 auto_ptr<SSeqLoc> subj(
2880 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2881 eNa_strand_plus));
2882
2883 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2884 opts->SetTraditionalBlastnDefaults();
2885 CBl2Seq blaster(*query, *subj, *opts);
2886 TSeqAlignVector sav(blaster.Run());
2887 x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2888 }
2889
2890 // should find alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjBothStrands)2891 BOOST_AUTO_TEST_CASE(Blastn_QueryMinusStrand_SubjBothStrands) {
2892 // Alignment in these sequences is from plus/minus strands
2893 CSeq_id qid("NT_004487.15");
2894 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2895 auto_ptr<SSeqLoc> query(
2896 CTestObjMgr::Instance().CreateSSeqLoc(qid, range,
2897 eNa_strand_minus));
2898
2899 CSeq_id sid("AA441981.1");
2900 range.first = 10;
2901 range.second = 480;
2902 auto_ptr<SSeqLoc> subj(
2903 CTestObjMgr::Instance().CreateSSeqLoc(sid, range,
2904 eNa_strand_both));
2905
2906 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2907 opts->SetTraditionalBlastnDefaults();
2908 CBl2Seq blaster(*query, *subj, *opts);
2909 TSeqAlignVector sav(blaster.Run());
2910 x_TestAlignmentQuerySubjStrandCombinations(sav, "minus-plus");
2911 }
2912
2913 // Should properly find alignment
BOOST_AUTO_TEST_CASE(Blastn_QueryWhole_SubjInterval)2914 BOOST_AUTO_TEST_CASE(Blastn_QueryWhole_SubjInterval)
2915 {
2916 CRef<CSeq_id> qid(new CSeq_id("AA441981.1"));
2917 auto_ptr<SSeqLoc> query(CTestObjMgr::Instance().CreateWholeSSeqLoc(*qid));
2918
2919 CRef<CSeq_id> sid(new CSeq_id("NT_004487.15"));
2920 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2921 auto_ptr<SSeqLoc> subj(
2922 CTestObjMgr::Instance().CreateSSeqLoc(*sid, range,
2923 eNa_strand_both));
2924
2925 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2926 opts->SetTraditionalBlastnDefaults();
2927 CBl2Seq blaster(*query, *subj, *opts);
2928 TSeqAlignVector sav(blaster.Run());
2929 testWholeIntervalAlignment(sav);
2930 }
2931
BOOST_AUTO_TEST_CASE(Blastn_QueryInterval_SubjWhole)2932 BOOST_AUTO_TEST_CASE(Blastn_QueryInterval_SubjWhole)
2933 {
2934 CRef<CSeq_id> qid(new CSeq_id("NT_004487.15"));
2935 pair<TSeqPos, TSeqPos> range(7685545, 7686027);
2936 auto_ptr<SSeqLoc> query(
2937 CTestObjMgr::Instance().CreateSSeqLoc(*qid, range,
2938 eNa_strand_both));
2939
2940 CRef<CSeq_id> sid(new CSeq_id("AA441981.1"));
2941 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateWholeSSeqLoc(*sid));
2942
2943 CRef<CBlastNucleotideOptionsHandle> opts(new CBlastNucleotideOptionsHandle);
2944 opts->SetTraditionalBlastnDefaults();
2945 CBl2Seq blaster(*query, *subj, *opts);
2946 TSeqAlignVector sav(blaster.Run());
2947 testIntervalWholeAlignment(sav);
2948 }
2949
BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs)2950 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs) {
2951 vector<TIntId> q_gis, s_gis;
2952
2953 // Setup the queries
2954 q_gis.push_back(6);
2955 q_gis.push_back(129295);
2956 q_gis.push_back(15606659);
2957
2958 // setup the subjects
2959 s_gis.push_back(129295);
2960 s_gis.push_back(6);
2961 s_gis.push_back(4336138); // no hits with gis 6 and 129295
2962 s_gis.push_back(15606659);
2963 s_gis.push_back(5556);
2964
2965 TSeqLocVector queries;
2966 ITERATE(vector<TIntId>, itr, q_gis) {
2967 CRef<CSeq_loc> loc(new CSeq_loc());
2968 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
2969
2970 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
2971 scope->AddDefaults();
2972 queries.push_back(SSeqLoc(loc, scope));
2973 }
2974
2975 TSeqLocVector subjects;
2976 ITERATE(vector<TIntId>, itr, s_gis) {
2977 CRef<CSeq_loc> loc(new CSeq_loc());
2978 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
2979
2980 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
2981 scope->AddDefaults();
2982 subjects.push_back(SSeqLoc(loc, scope));
2983 }
2984
2985 size_t num_queries = queries.size();
2986 size_t num_subjects = subjects.size();
2987
2988 // BLAST by concatenating all queries
2989 CBl2Seq blaster4all(queries, subjects, eBlastp);
2990 TSeqAlignVector sas_v = blaster4all.Run();
2991 BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
2992 testBlastHitCounts(blaster4all, eBlastp_multi_q_s);
2993 testRawCutoffs(blaster4all, eBlastp, eBlastp_multi_q_s);
2994
2995 // test the order of queries and subjects:
2996 testResultAlignments(num_queries, num_subjects,
2997 sas_v);
2998 }
2999
BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_RunEx)3000 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_RunEx) {
3001 vector<TIntId> q_gis, s_gis;
3002
3003 // Setup the queries
3004 q_gis.push_back(6);
3005 q_gis.push_back(129295);
3006 q_gis.push_back(15606659);
3007
3008 // setup the subjects
3009 s_gis.push_back(129295);
3010 s_gis.push_back(6);
3011 s_gis.push_back(4336138); // no hits with gis 6 and 129295
3012 s_gis.push_back(15606659);
3013 s_gis.push_back(5556);
3014
3015 TSeqLocVector queries;
3016 ITERATE(vector<TIntId>, itr, q_gis) {
3017 CRef<CSeq_loc> loc(new CSeq_loc());
3018 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3019
3020 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3021 scope->AddDefaults();
3022 queries.push_back(SSeqLoc(loc, scope));
3023 }
3024
3025 TSeqLocVector subjects;
3026 ITERATE(vector<TIntId>, itr, s_gis) {
3027 CRef<CSeq_loc> loc(new CSeq_loc());
3028 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3029
3030 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3031 scope->AddDefaults();
3032 subjects.push_back(SSeqLoc(loc, scope));
3033 }
3034
3035 size_t num_queries = queries.size();
3036 size_t num_subjects = subjects.size();
3037
3038 // BLAST by concatenating all queries
3039 CBl2Seq blaster4all(queries, subjects, eBlastp);
3040 CRef<CSearchResultSet> results = blaster4all.RunEx();
3041 BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
3042 BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
3043 results->GetNumResults());
3044
3045 // build the seqalign vector from the result set
3046 TSeqAlignVector sas_v;
3047 for (size_t i = 0; i < num_queries; i++)
3048 {
3049 for (size_t j = 0; j < num_subjects; j++)
3050 {
3051 CSearchResults& res_ij = results->GetResults(i, j);
3052 CRef<CSeq_align_set> aln_set;
3053 aln_set.Reset(const_cast<CSeq_align_set*>
3054 (res_ij.GetSeqAlign().GetPointer()));
3055 sas_v.push_back(aln_set);
3056 }
3057 }
3058
3059 // do the rest of the tests on sas_v as in the
3060 // BlastpMultipleQueries_MultipleSubjs function:
3061
3062 BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
3063 testBlastHitCounts(blaster4all, eBlastp_multi_q_s);
3064 testRawCutoffs(blaster4all, eBlastp, eBlastp_multi_q_s);
3065
3066 // test the order of queries and subjects:
3067 testResultAlignments(num_queries, num_subjects,
3068 sas_v);
3069 }
3070
3071 // This closely resembles how the command line applications invoke bl2seq
BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_CLocalBlast)3072 BOOST_AUTO_TEST_CASE(BlastpMultipleQueries_MultipleSubjs_CLocalBlast) {
3073 vector<TIntId> q_gis, s_gis;
3074
3075 // Setup the queries
3076 q_gis.push_back(6);
3077 q_gis.push_back(129295);
3078 q_gis.push_back(15606659);
3079
3080 // setup the subjects
3081 s_gis.push_back(129295);
3082 s_gis.push_back(6);
3083 s_gis.push_back(4336138); // no hits with gis 6 and 129295
3084 s_gis.push_back(15606659);
3085 s_gis.push_back(5556);
3086
3087 TSeqLocVector query_vec;
3088 ITERATE(vector<TIntId>, itr, q_gis) {
3089 CRef<CSeq_loc> loc(new CSeq_loc());
3090 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3091
3092 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3093 scope->AddDefaults();
3094 query_vec.push_back(SSeqLoc(loc, scope));
3095 }
3096 CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
3097
3098 CRef<CBlastOptionsHandle>
3099 opts_handle(CBlastOptionsFactory::Create(eBlastp));
3100
3101 TSeqLocVector subj_vec;
3102 ITERATE(vector<TIntId>, itr, s_gis) {
3103 CRef<CSeq_loc> loc(new CSeq_loc());
3104 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3105
3106 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3107 scope->AddDefaults();
3108 subj_vec.push_back(SSeqLoc(loc, scope));
3109 }
3110 CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
3111 CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
3112 opts_handle));
3113
3114 BOOST_REQUIRE(subjects->IsDbScanMode() == false);
3115
3116 size_t num_queries = query_vec.size();
3117 size_t num_subjects = subj_vec.size();
3118
3119 // BLAST by concatenating all queries
3120 CLocalBlast blaster(queries, opts_handle, subjects);
3121 CRef<CSearchResultSet> results = blaster.Run();
3122 BOOST_REQUIRE(results->GetResultType() == eSequenceComparison);
3123 BOOST_REQUIRE_EQUAL((num_queries*num_subjects),
3124 results->GetNumResults());
3125 BOOST_REQUIRE_EQUAL((num_queries*num_subjects), results->size());
3126 BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
3127 BOOST_REQUIRE_EQUAL(num_subjects,
3128 results->GetNumResults()/results->GetNumQueries());
3129
3130 // build the seqalign vector from the result set
3131 TSeqAlignVector sas_v;
3132 for (size_t i = 0; i < num_queries; i++)
3133 {
3134 for (size_t j = 0; j < num_subjects; j++)
3135 {
3136 CSearchResults& res_ij = results->GetResults(i, j);
3137 CRef<CSeq_align_set> aln_set;
3138 aln_set.Reset(const_cast<CSeq_align_set*>
3139 (res_ij.GetSeqAlign().GetPointer()));
3140 sas_v.push_back(aln_set);
3141 }
3142 }
3143
3144 // do the rest of the tests on sas_v as in the
3145 // BlastpMultipleQueries_MultipleSubjs function:
3146 BOOST_REQUIRE_EQUAL(num_queries*num_subjects, sas_v.size());
3147
3148 // test the order of queries and subjects:
3149 testResultAlignments(num_queries, num_subjects, sas_v);
3150 }
3151
BOOST_AUTO_TEST_CASE(Blastp_MultipleSubjs_SearchAsSet)3152 BOOST_AUTO_TEST_CASE(Blastp_MultipleSubjs_SearchAsSet) {
3153 vector<TIntId> q_gis, s_gis;
3154
3155 // Setup the queries
3156 q_gis.push_back(816838863);
3157
3158 // setup the subjects
3159 s_gis.push_back(16130156);
3160 s_gis.push_back(15644111);
3161 s_gis.push_back(126699345);
3162 s_gis.push_back(504220075);
3163 s_gis.push_back(21222553);
3164 s_gis.push_back(24376189);
3165 s_gis.push_back(15598078);
3166 s_gis.push_back(15599919);
3167 s_gis.push_back(15597767);
3168 s_gis.push_back(16131833);
3169 s_gis.push_back(15599742);
3170 s_gis.push_back(15598387);
3171 s_gis.push_back(15600358);
3172 s_gis.push_back(24375949);
3173 s_gis.push_back(126698248);
3174 s_gis.push_back(24375956);
3175 s_gis.push_back(24375382);
3176 s_gis.push_back(126698598);
3177
3178 TSeqLocVector query_vec;
3179 ITERATE(vector<TIntId>, itr, q_gis) {
3180 CRef<CSeq_loc> loc(new CSeq_loc());
3181 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3182
3183 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3184 scope->AddDefaults();
3185 query_vec.push_back(SSeqLoc(loc, scope));
3186 }
3187 CRef<IQueryFactory> queries(new CObjMgr_QueryFactory(query_vec));
3188
3189 CRef<CBlastOptionsHandle>
3190 opts_handle(CBlastOptionsFactory::Create(eBlastp));
3191
3192 TSeqLocVector subj_vec;
3193 ITERATE(vector<TIntId>, itr, s_gis) {
3194 CRef<CSeq_loc> loc(new CSeq_loc());
3195 loc->SetWhole().SetGi(GI_FROM(TIntId, *itr));
3196
3197 CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
3198 scope->AddDefaults();
3199 subj_vec.push_back(SSeqLoc(loc, scope));
3200 }
3201 CRef<IQueryFactory> subj_qf(new CObjMgr_QueryFactory(subj_vec));
3202 CRef<CLocalDbAdapter> subjects(new CLocalDbAdapter(subj_qf,
3203 opts_handle, true));
3204
3205 BOOST_REQUIRE(subjects->IsDbScanMode() == true);
3206
3207 size_t num_queries = query_vec.size();
3208 size_t num_subjects = subj_vec.size();
3209
3210 // BLAST by concatenating all queries
3211 CLocalBlast blaster(queries, opts_handle, subjects);
3212 CRef<CSearchResultSet> results = blaster.Run();
3213 BOOST_REQUIRE(results->GetResultType() == eDatabaseSearch);
3214 BOOST_REQUIRE_EQUAL(num_queries,
3215 results->GetNumResults());
3216 BOOST_REQUIRE_EQUAL(num_queries, results->size());
3217 BOOST_REQUIRE_EQUAL(num_queries, results->GetNumQueries());
3218
3219 // build the seqalign vector from the result set
3220 CRef<CSeq_align_set> aln_set;
3221 CSearchResults& res = (*results)[0];
3222 aln_set.Reset(const_cast<CSeq_align_set*>
3223 (res.GetSeqAlign().GetPointer()));
3224 // gi|15599742|ref|NP_253236.1| has two alignments, all others have one.
3225 BOOST_REQUIRE_EQUAL(num_subjects+1, aln_set->Size());
3226
3227 }
3228
BOOST_AUTO_TEST_CASE(BlastOptionsEquality)3229 BOOST_AUTO_TEST_CASE(BlastOptionsEquality) {
3230 // Create options object through factory
3231 auto_ptr<CBlastOptionsHandle> megablast_options_handle(
3232 CBlastOptionsFactory::Create(eMegablast));
3233 CBlastNucleotideOptionsHandle nucl_options_handle;
3234 BOOST_REQUIRE(megablast_options_handle->GetOptions() ==
3235 nucl_options_handle.GetOptions());
3236 }
3237
BOOST_AUTO_TEST_CASE(BlastOptionsInequality)3238 BOOST_AUTO_TEST_CASE(BlastOptionsInequality) {
3239 CBlastProteinOptionsHandle prot_options_handle;
3240 CBlastNucleotideOptionsHandle nucl_options_handle;
3241 BOOST_REQUIRE(prot_options_handle.GetOptions() !=
3242 nucl_options_handle.GetOptions());
3243
3244 // Blastn and Megablast are different
3245 auto_ptr<CBlastOptionsHandle> blastn_options_handle(
3246 CBlastOptionsFactory::Create(eBlastn));
3247 BOOST_REQUIRE(blastn_options_handle->GetOptions() !=
3248 nucl_options_handle.GetOptions());
3249
3250 // Change the matrix and compare
3251 CBlastProteinOptionsHandle prot_options_handle2;
3252 prot_options_handle.SetMatrixName("pam30");
3253 BOOST_REQUIRE(prot_options_handle.GetOptions() !=
3254 prot_options_handle2.GetOptions());
3255 }
3256
BOOST_AUTO_TEST_CASE(DiscontiguousMB)3257 BOOST_AUTO_TEST_CASE(DiscontiguousMB) {
3258 CSeq_id qid("gi|408478"); // zebrafish sequence U02544
3259 CSeq_id sid("gi|1546012"); // mouse sequence U61969
3260 auto_ptr<SSeqLoc> query(
3261 CTestObjMgr::Instance().CreateSSeqLoc(qid, eNa_strand_both));
3262 auto_ptr<SSeqLoc> subj(CTestObjMgr::Instance().CreateSSeqLoc(sid));
3263
3264 CBl2Seq blaster(*query, *subj, eDiscMegablast);
3265 TSeqAlignVector sav(blaster.Run());
3266 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3267
3268 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3269 BOOST_REQUIRE_EQUAL(13, (int)sar->GetSegs().GetDenseg().GetNumseg());
3270 testBlastHitCounts(blaster, eDiscMegablast_U02544_U61969);
3271 testRawCutoffs(blaster, eDiscMegablast, eDiscMegablast_U02544_U61969);
3272 }
3273
BOOST_AUTO_TEST_CASE(BlastnHumanChrom_MRNA)3274 BOOST_AUTO_TEST_CASE(BlastnHumanChrom_MRNA) {
3275 CSeq_id qid("NT_004487.16");
3276 CSeq_id sid("AA621478.1");
3277 pair<TSeqPos, TSeqPos> qrange(7868209-1, 7868602-1);
3278 pair<TSeqPos, TSeqPos> srange(2-1, 397-1);
3279 auto_ptr<SSeqLoc> query(
3280 CTestObjMgr::Instance().CreateSSeqLoc(qid,
3281 qrange, eNa_strand_plus));
3282 auto_ptr<SSeqLoc> subj(
3283 CTestObjMgr::Instance().CreateSSeqLoc(sid,
3284 srange, eNa_strand_plus));
3285
3286 CRef<CBlastNucleotideOptionsHandle> options(new CBlastNucleotideOptionsHandle);
3287 CBl2Seq blaster(*query, *subj, *options);
3288 TSeqAlignVector sav(blaster.Run());
3289 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3290
3291 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3292 BOOST_REQUIRE_EQUAL(5, (int)sar->GetSegs().GetDenseg().GetNumseg());
3293 testBlastHitCounts(blaster, eMegablast_chrom_mrna);
3294 testRawCutoffs(blaster, eMegablast, eMegablast_chrom_mrna);
3295 }
3296
3297 // Checks that results for multiple subjects are put in correct places
3298 // in the vector of Seq-aligns
BOOST_AUTO_TEST_CASE(testOneSubjectResults2CSeqAlign)3299 BOOST_AUTO_TEST_CASE(testOneSubjectResults2CSeqAlign)
3300 {
3301 const int num_subjects = 15;
3302 const int results_size[num_subjects] =
3303 { 1, 1, 0, 1, 1, 1, 2, 1, 2, 0, 0, 0, 0, 2, 1 };
3304 const int query_gi = 7274302;
3305 const int gi_diff = 28;
3306 CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, query_gi));
3307 auto_ptr<SSeqLoc> sl(
3308 CTestObjMgr::Instance().CreateSSeqLoc(*id, eNa_strand_both));
3309 TSeqLocVector query;
3310 query.push_back(*sl);
3311 TSeqLocVector subjects;
3312 int index;
3313 for (index = 0; index < num_subjects; ++index) {
3314 id.Reset(new CSeq_id(CSeq_id::e_Gi, (query_gi + gi_diff + index)));
3315 //cerr << query_gi + gi_diff + index << endl;
3316 sl.reset(CTestObjMgr::Instance().CreateSSeqLoc(*id,
3317 eNa_strand_both));
3318 subjects.push_back(*sl);
3319 }
3320 CBl2Seq blaster(query, subjects, eMegablast);
3321 TSeqAlignVector seqalign_v = blaster.Run();
3322 BOOST_REQUIRE_EQUAL(num_subjects, (int)seqalign_v.size());
3323
3324 index = 0;
3325 ITERATE(TSeqAlignVector, itr, seqalign_v)
3326 {
3327 //cerr << index << endl;
3328 BOOST_REQUIRE_EQUAL(results_size[index], (int) (*itr)->Get().size());
3329 index++;
3330 }
3331 }
3332
BOOST_AUTO_TEST_CASE(testMultiSeqSearchSymmetry)3333 BOOST_AUTO_TEST_CASE(testMultiSeqSearchSymmetry)
3334 {
3335 const size_t num_seqs = 19;
3336 const int gi_list[num_seqs] =
3337 { 1346057, 125527, 121064, 1711551, 125412, 128337, 2507199,
3338 1170625, 1730070, 585365, 140977, 1730069, 20455504, 125206,
3339 125319, 114152, 1706450, 1706307, 125565 };
3340 const int score_cutoff = 70;
3341
3342 TSeqLocVector seq_vec;
3343 size_t index;
3344 for (index = 0; index < num_seqs; ++index) {
3345 CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, gi_list[index]));
3346 auto_ptr<SSeqLoc> sl(
3347 CTestObjMgr::Instance().CreateSSeqLoc(*id, eNa_strand_both));
3348 seq_vec.push_back(*sl);
3349 }
3350
3351 CRef<CBlastProteinOptionsHandle> prot_opts(new CBlastProteinOptionsHandle);
3352 prot_opts->SetSegFiltering(false);
3353 prot_opts->SetCutoffScore(score_cutoff);
3354 CBl2Seq blaster(seq_vec, seq_vec, *prot_opts);
3355 #if 0
3356 blaster.RunWithoutSeqalignGeneration(); /* NCBI_FAKE_WARNING */
3357 BlastHSPResults* results = blaster.GetResults(); /* NCBI_FAKE_WARNING */
3358
3359 int qindex, sindex, qindex1, sindex1;
3360 for (qindex = 0; qindex < num_seqs; ++qindex) {
3361 for (sindex = 0; sindex < results->hitlist_array[qindex]->hsplist_count;
3362 ++sindex) {
3363 BlastHSPList* hsp_list1, *hsp_list2 = NULL;
3364 hsp_list1 = results->hitlist_array[qindex]->hsplist_array[sindex];
3365 qindex1 = hsp_list1->oid;
3366 BlastHitList* hitlist = results->hitlist_array[qindex1];
3367 for (sindex1 = 0; sindex1 < hitlist->hsplist_count; ++sindex1) {
3368 if (hitlist->hsplist_array[sindex1]->oid == qindex) {
3369 hsp_list2 = hitlist->hsplist_array[sindex1];
3370 break;
3371 }
3372 }
3373 BOOST_REQUIRE(hsp_list2 != NULL);
3374 int hindex;
3375 for (hindex = 0; hindex < hsp_list1->hspcnt; ++hindex) {
3376 if (hsp_list1->hsp_array[hindex]->score <= score_cutoff)
3377 break;
3378 BOOST_REQUIRE(hindex < hsp_list2->hspcnt);
3379 BOOST_REQUIRE_EQUAL(hsp_list1->hsp_array[hindex]->score,
3380 hsp_list2->hsp_array[hindex]->score);
3381 }
3382 }
3383 }
3384 #endif
3385
3386 CRef<CSearchResultSet> res = blaster.RunEx();
3387 BOOST_REQUIRE_EQUAL(eSequenceComparison, res->GetResultType());
3388 BOOST_REQUIRE_EQUAL(num_seqs, res->GetNumQueries());
3389 BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, res->GetNumResults());
3390 BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, res->size());
3391 typedef CSearchResultSet::size_type size_type;
3392
3393 for (size_type q = 0; q < res->GetNumQueries(); q++) {
3394 for (size_type s = 0; s < res->GetNumQueries(); s++) {
3395 const CSearchResults& results = res->GetResults(q, s);
3396 if (q == s) { // self-hit
3397 BOOST_REQUIRE(results.HasAlignments());
3398 CConstRef<CSeq_align_set> sas = results.GetSeqAlign();
3399 BOOST_REQUIRE(sas.NotEmpty());
3400 BOOST_REQUIRE_EQUAL((size_type)1, sas->Get().size());
3401 CConstRef<CSeq_align> sa = sas->Get().front();
3402 const CSeq_id& qid = sa->GetSeq_id(0);
3403 const CSeq_id& sid = sa->GetSeq_id(1);
3404 BOOST_REQUIRE(qid.Match(sid));
3405 int num_ident = 0;
3406 sa->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
3407 int seqlen = (int)sequence::GetLength(*seq_vec[q].seqloc,
3408 &*seq_vec[q].scope);
3409 BOOST_REQUIRE_EQUAL(seqlen, num_ident);
3410 }
3411 const CSearchResults& results2 = res->GetResults(s, q);
3412 BOOST_REQUIRE_EQUAL(results.HasAlignments(),
3413 results2.HasAlignments());
3414 if ( !results.HasAlignments() ) {
3415 continue;
3416 }
3417
3418 // Compare Q-S and S-Q hits
3419 CConstRef<CSeq_align_set> sas1 = results.GetSeqAlign();
3420 CConstRef<CSeq_align_set> sas2 = results2.GetSeqAlign();
3421 BOOST_REQUIRE(sas1.NotEmpty());
3422 BOOST_REQUIRE(sas2.NotEmpty());
3423 //cerr << "Alignments1 " << MSerial_AsnText << *sas2 << endl;
3424 //cerr << "Alignments2 " << MSerial_AsnText << *sas2 << endl;
3425 BOOST_REQUIRE_EQUAL(sas1->Get().size(), sas2->Get().size());
3426 CSeq_align_set::Tdata::const_iterator i1 = sas1->Get().begin(),
3427 i2 = sas2->Get().begin();
3428 for (; i1 != sas1->Get().end(); ++i1, ++i2) {
3429 CConstRef<CSeq_align> al1 = *i1;
3430 CConstRef<CSeq_align> al2 = *i2;
3431 BOOST_REQUIRE(al1.NotEmpty());
3432 BOOST_REQUIRE(al2.NotEmpty());
3433 //cerr << "Align1 " << MSerial_AsnText << *al1 << endl;
3434 //cerr << "Align2 " << MSerial_AsnText << *al2 << endl;
3435
3436 int score1 = 0, score2 = 0;
3437 al1->GetNamedScore(CSeq_align::eScore_Score, score1);
3438 al2->GetNamedScore(CSeq_align::eScore_Score, score2);
3439 BOOST_REQUIRE_EQUAL(score1, score2);
3440
3441 double bit_score1 = .0, bit_score2 = .0;
3442 al1->GetNamedScore(CSeq_align::eScore_BitScore, bit_score1);
3443 al2->GetNamedScore(CSeq_align::eScore_BitScore, bit_score2);
3444 BOOST_REQUIRE_CLOSE(bit_score1, bit_score2, 10e-2);
3445
3446 double evalue1 = .0, evalue2 = .0;
3447 al1->GetNamedScore(CSeq_align::eScore_EValue, evalue1);
3448 al2->GetNamedScore(CSeq_align::eScore_EValue, evalue2);
3449 BOOST_REQUIRE_CLOSE(evalue1, evalue2, 10e-5);
3450 }
3451 }
3452 }
3453
3454
3455 // Repeat the tests above using the TSeqAlignVector
3456 TSeqAlignVector alignments = blaster.Run();
3457 BOOST_REQUIRE_EQUAL(num_seqs*num_seqs, alignments.size());
3458 for (size_t q = 0; q < num_seqs; q++) {
3459 for (size_t s = 0; s < num_seqs; s++) {
3460 size_t idx1 = q*num_seqs + s;
3461 CConstRef<CSeq_align_set> al1 = alignments[idx1];
3462 if (idx1 == (q*num_seqs+q)) { // self-hit
3463 BOOST_REQUIRE(al1.NotEmpty());
3464 BOOST_REQUIRE_EQUAL(1U, al1->Get().size());
3465 CConstRef<CSeq_align> sa = al1->Get().front();
3466 const CSeq_id& qid = sa->GetSeq_id(0);
3467 const CSeq_id& sid = sa->GetSeq_id(1);
3468 BOOST_REQUIRE(qid.Match(sid));
3469 int num_ident = 0;
3470 sa->GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident);
3471 int seqlen = (int)sequence::GetLength(*seq_vec[q].seqloc,
3472 &*seq_vec[q].scope);
3473 BOOST_REQUIRE_EQUAL(seqlen, num_ident);
3474 continue;
3475 }
3476 size_t idx2 = s*num_seqs + q;
3477 CConstRef<CSeq_align_set> al2 = alignments[idx2];
3478 if (al1.Empty() || al2.Empty()) {
3479 BOOST_REQUIRE_EQUAL(al1.Empty(), al2.Empty());
3480 continue;
3481 }
3482 BOOST_REQUIRE_EQUAL(al1.NotEmpty(), al2.NotEmpty());
3483 // compare al1 with al2
3484 {{
3485 CConstRef<CSeq_align> sa1 = al1->Get().front();
3486 CConstRef<CSeq_align> sa2 = al2->Get().front();
3487 BOOST_REQUIRE(sa1.NotEmpty());
3488 BOOST_REQUIRE(sa2.NotEmpty());
3489 //cerr << "Align1 " << MSerial_AsnText << *sa1 << endl;
3490 //cerr << "Align2 " << MSerial_AsnText << *sa2 << endl;
3491
3492 int score1 = 0, score2 = 0;
3493 sa1->GetNamedScore(CSeq_align::eScore_Score, score1);
3494 sa2->GetNamedScore(CSeq_align::eScore_Score, score2);
3495 BOOST_REQUIRE_EQUAL(score1, score2);
3496
3497 double bit_score1 = .0, bit_score2 = .0;
3498 sa1->GetNamedScore(CSeq_align::eScore_BitScore, bit_score1);
3499 sa2->GetNamedScore(CSeq_align::eScore_BitScore, bit_score2);
3500 BOOST_REQUIRE_CLOSE(bit_score1, bit_score2, 10e-2);
3501
3502 double evalue1 = .0, evalue2 = .0;
3503 sa1->GetNamedScore(CSeq_align::eScore_EValue, evalue1);
3504 sa2->GetNamedScore(CSeq_align::eScore_EValue, evalue2);
3505 BOOST_REQUIRE_CLOSE(evalue1, evalue2, 10e-5);
3506 }}
3507 }
3508 }
3509
3510 }
3511
BOOST_AUTO_TEST_CASE(testInterruptCallbackWithNull)3512 BOOST_AUTO_TEST_CASE(testInterruptCallbackWithNull) {
3513 CSeq_id id("gi|129295");
3514 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
3515
3516 CBl2Seq blaster(*sl, *sl, eBlastp);
3517 TInterruptFnPtr null_fnptr = 0;
3518 TInterruptFnPtr fnptr = blaster.SetInterruptCallback(null_fnptr);
3519 BOOST_REQUIRE(fnptr == NULL);
3520
3521 TSeqAlignVector sav(blaster.Run());
3522 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3523 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3524
3525 fnptr = blaster.SetInterruptCallback(interrupt_immediately);
3526 // make sure we get the previous interrupt callback
3527 BOOST_REQUIRE(fnptr == null_fnptr);
3528
3529 fnptr = blaster.SetInterruptCallback(null_fnptr);
3530 // make sure we get the previous interrupt callback
3531 BOOST_REQUIRE(fnptr == interrupt_immediately);
3532
3533 // Retry the search now that we've removed the interrupt callback
3534 sav = blaster.Run();
3535 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3536 sar = *(sav[0]->Get().begin());
3537 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3538 }
3539
BOOST_AUTO_TEST_CASE(testInterruptCallbackDoNotInterrupt)3540 BOOST_AUTO_TEST_CASE(testInterruptCallbackDoNotInterrupt) {
3541 CSeq_id id("gi|129295");
3542 auto_ptr<SSeqLoc> sl(CTestObjMgr::Instance().CreateSSeqLoc(id));
3543
3544 CBl2Seq blaster(*sl, *sl, eBlastp);
3545 TInterruptFnPtr fnptr = blaster.SetInterruptCallback(do_not_interrupt);
3546 BOOST_REQUIRE(fnptr == NULL);
3547
3548 TSeqAlignVector sav(blaster.Run());
3549 BOOST_REQUIRE_EQUAL(1, (int)sav.size());
3550 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3551 BOOST_REQUIRE_EQUAL(1, (int)sar->GetSegs().GetDenseg().GetNumseg());
3552 }
3553
3554 #if SEQLOC_MIX_QUERY_OK
BOOST_AUTO_TEST_CASE(MultiIntervalLoc)3555 BOOST_AUTO_TEST_CASE(MultiIntervalLoc) {
3556 const size_t kNumInts = 20;
3557 const size_t kStarts[kNumInts] =
3558 { 838, 1838, 6542, 7459, 9246, 10431, 14807, 16336, 19563,
3559 20606, 21232, 22615, 23822, 27941, 29597, 30136, 31287,
3560 31786, 33315, 35402 };
3561 const size_t kEnds[kNumInts] =
3562 { 961, 2010, 6740, 7573, 9408, 10609, 15043, 16511, 19783,
3563 20748, 21365, 22817, 24049, 28171, 29839, 30348, 31362,
3564 31911, 33485, 37952 };
3565 size_t index;
3566
3567 CSeq_id qid("gi|3417288");
3568 CRef<CSeq_loc> qloc(new CSeq_loc());
3569 for (index = 0; index < kNumInts; ++index) {
3570 CRef<CSeq_loc> next_loc(new CSeq_loc());
3571 next_loc->SetInt().SetFrom(kStarts[index]);
3572 next_loc->SetInt().SetTo(kEnds[index]);
3573 next_loc->SetInt().SetId(qid);
3574 qloc->SetMix().Set().push_back(next_loc);
3575 }
3576
3577 CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
3578 scope->AddDefaults();
3579
3580 auto_ptr<SSeqLoc> query(new SSeqLoc(qloc, scope));
3581
3582 CSeq_id sid("gi|51511732");
3583 pair<TSeqPos, TSeqPos> range(15595732, 15705419);
3584 auto_ptr<SSeqLoc> subject(
3585 CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
3586 CBl2Seq blaster(*query, *subject, eBlastn);
3587 TSeqAlignVector sav(blaster.Run());
3588 CRef<CSeq_align> sar = *(sav[0]->Get().begin());
3589 BOOST_REQUIRE_EQUAL(60, (int)sar->GetSegs().GetDisc().Get().size());
3590 }
3591 #endif
3592
BOOST_AUTO_TEST_CASE(QueryMaskIgnoredInMiniExtension)3593 BOOST_AUTO_TEST_CASE(QueryMaskIgnoredInMiniExtension) {
3594 CRef<CSeq_loc> qloc(new CSeq_loc());
3595 qloc->SetWhole().SetGi(GI_CONST(4505696));
3596 CSeq_id sid("gi|29809252");
3597 pair<TSeqPos, TSeqPos> range(662070, 662129);
3598
3599 CRef<CScope> scope(new CScope(CTestObjMgr::Instance().GetObjMgr()));
3600 scope->AddDefaults();
3601
3602 auto_ptr<SSeqLoc> query(new SSeqLoc(qloc, scope));
3603 auto_ptr<SSeqLoc> subject(
3604 CTestObjMgr::Instance().CreateSSeqLoc(sid, range, eNa_strand_both));
3605
3606 CBl2Seq blaster(*query, *subject, eMegablast);
3607 TSeqAlignVector sav(blaster.Run());
3608 CRef<CSeq_align_set> sas = sav.front();
3609 BOOST_REQUIRE(sas->Get().empty());
3610 }
3611
3612 #endif /* SKIP_DOXYGEN_PROCESSING */
3613
3614 BOOST_AUTO_TEST_SUITE_END()
3615