1 /* $Id: blastinput_unit_test.cpp 617780 2020-10-06 16:24:16Z gouriano $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Jason Papadopoulos
27 *
28 * File Description:
29 * Unit tests for CBlastInput, CBlastInputSource and derived classes
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <objmgr/object_manager.hpp>
35 #include <objmgr/bioseq_handle.hpp>
36 #include <objects/seqloc/Seq_loc.hpp>
37 #include <objects/seqloc/Packed_seqint.hpp>
38 #include <objects/seqloc/Seq_interval.hpp>
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqloc/PDB_seq_id.hpp>
41 #include <objects/seq/Seq_data.hpp>
42 #include <objects/seq/NCBIeaa.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44
45 #include <corelib/ncbienv.hpp>
46 #include <objtools/readers/reader_exception.hpp>
47 #include <objtools/data_loaders/genbank/gbloader.hpp>
48 #include <algo/blast/api/sseqloc.hpp>
49 #include <algo/blast/core/blast_query_info.h>
50 #include <algo/blast/blastinput/blast_input.hpp>
51 #include <algo/blast/blastinput/blast_input_aux.hpp>
52 #include <algo/blast/blastinput/blast_fasta_input.hpp>
53 #include <algo/blast/blastinput/blast_asn1_input.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <objmgr/seq_vector.hpp>
56
57 #include <algo/blast/blastinput/blastp_args.hpp>
58 #include <algo/blast/blastinput/blastn_args.hpp>
59 #include <algo/blast/blastinput/blastx_args.hpp>
60 #include <algo/blast/blastinput/tblastn_args.hpp>
61 #include <algo/blast/blastinput/tblastx_args.hpp>
62 #include <algo/blast/blastinput/psiblast_args.hpp>
63 #include <algo/blast/blastinput/rpsblast_args.hpp>
64 #include "blast_input_unit_test_aux.hpp"
65
66 #include <unordered_map>
67
68 #undef NCBI_BOOST_NO_AUTO_TEST_MAIN
69 #include <corelib/test_boost.hpp>
70
71 #ifndef SKIP_DOXYGEN_PROCESSING
72
73 USING_NCBI_SCOPE;
74 USING_SCOPE(blast);
75 USING_SCOPE(objects);
76
77 static CRef<CBlastInput>
s_DeclareBlastInput(CNcbiIstream & input_file,const CBlastInputSourceConfig & iconfig,int batch_size=kMax_Int)78 s_DeclareBlastInput(CNcbiIstream& input_file,
79 const CBlastInputSourceConfig& iconfig,
80 int batch_size = kMax_Int)
81 {
82 CRef<CBlastFastaInputSource> fasta_src
83 (new CBlastFastaInputSource(input_file, iconfig));
84 return CRef<CBlastInput>(new CBlastInput(&*fasta_src, batch_size));
85 }
86
87 static CRef<CBlastInput>
s_DeclareBlastInput(const string & user_input,const CBlastInputSourceConfig & iconfig)88 s_DeclareBlastInput(const string& user_input,
89 const CBlastInputSourceConfig& iconfig)
90 {
91 CRef<CBlastFastaInputSource> fasta_src
92 (new CBlastFastaInputSource(user_input, iconfig));
93 return CRef<CBlastInput>(new CBlastInput(&*fasta_src));
94 }
95
96 BOOST_AUTO_TEST_SUITE(blastinput)
97
/// Reads data/nucl_acc.txt through a protein-configured source and requires
/// the read to fail with CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadAccession_MismatchNuclProt)
{
    const bool kIsProtein = true;
    CNcbiIfstream acc_stream("data/nucl_acc.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(acc_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("GI/accession/sequence mismatch: protein input required but nucleotide provided")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    // The exception is mandatory, and the source must be exhausted afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
124
/// Reads data/prot_acc.txt through a nucleotide-configured source and
/// requires the read to fail with CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadAccession_MismatchProtNucl)
{
    const bool kIsProtein = false;
    CNcbiIfstream acc_stream("data/prot_acc.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(acc_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("GI/accession/sequence mismatch: nucleotide input required but protein provided")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    // The exception is mandatory, and the source must be exhausted afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
151
/// Reads data/gi.txt through a protein-configured source and requires the
/// read to fail with CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadGi_MismatchNuclProt)
{
    const bool kIsProtein = true;
    CNcbiIfstream gi_stream("data/gi.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("GI/accession/sequence mismatch: protein input required but nucleotide provided")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    // The exception is mandatory, and the source must be exhausted afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
178
/// Reads data/prot_gi.txt through a nucleotide-configured source and requires
/// the read to fail with CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadGi_MismatchProtNucl)
{
    const bool kIsProtein = false;
    CNcbiIfstream gi_stream("data/prot_gi.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("GI/accession/sequence mismatch: nucleotide input required but protein provided")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    // The exception is mandatory, and the source must be exhausted afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
205
206 struct SDubiousShortSequence
207 {
208 string sequence_data;
209 CSeq_inst::EMol mol_type;
210
SDubiousShortSequenceSDubiousShortSequence211 SDubiousShortSequence(const string& seq,
212 CSeq_inst::EMol mol_type)
213 : sequence_data(seq), mol_type(mol_type)
214 {
215 seqlen = NStr::Replace(sequence_data, " ", kEmptyStr).length();
216 }
217
IsProteinSDubiousShortSequence218 bool IsProtein() const { return CSeq_inst::IsAa(mol_type); }
GetLengthSDubiousShortSequence219 TSeqPos GetLength() const { return seqlen; }
220
221 private:
222 TSeqPos seqlen;
223 };
224
/// Feeds short protein strings to the FASTA input source twice: first with
/// the sequence-length guessing threshold set just above each sequence's
/// length (the read must succeed), then with the threshold set to 5 (the
/// read must fail with CInputException::eSequenceMismatch).
BOOST_AUTO_TEST_CASE(TestSmallDubiousSequences)
{
    vector<SDubiousShortSequence> samples;
    samples.push_back(SDubiousShortSequence("NNWNN", CSeq_inst::eMol_aa));
    // P84064
    samples.push_back(SDubiousShortSequence(
        "ykrggggwgg gggwkggggg gggwkggggg gkgggg", CSeq_inst::eMol_aa));
    // AAB32668
    samples.push_back(SDubiousShortSequence(
        "GGGGGGGGGGGGGGG", CSeq_inst::eMol_aa));

    CRef<CObjectManager> om(CObjectManager::GetInstance());

    // First test the usage of the sequence length threshold
    for (const SDubiousShortSequence& sample : samples) {
        CBlastInputSourceConfig config(sample.IsProtein());
        config.SetSeqLenThreshold2Guess(sample.GetLength() + 1);

        CRef<CBlastFastaInputSource> reader(
            new CBlastFastaInputSource(sample.sequence_data, config));
        CRef<CBlastInput> input(new CBlastInput(&*reader));

        CScope scope(*om);
        BOOST_REQUIRE(!input->End());

        bool mismatch_reported = false;
        blast::SSeqLoc query;
        try {
            query = input->GetNextSeqLocBatch(scope).front();
            // here's a 'misplaced' test for blast::IsLocalId
            BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));
        }
        catch (const CInputException& ex) {
            const string what(ex.what());
            BOOST_REQUIRE(what.find("Gi/accession mismatch: ") != NPOS);
            BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                                ex.GetErrCode());
            mismatch_reported = true;
        }
        // No exception is tolerated in this pass.
        BOOST_REQUIRE(!mismatch_reported);
        BOOST_REQUIRE(input->End());

        const TSeqPos read_length =
            sequence::GetLength(*query.seqloc, query.scope);
        BOOST_REQUIRE_EQUAL(sample.GetLength(), read_length);
        scope.GetObjectManager().RevokeAllDataLoaders();
    }

    // Now check that these sequences will be rejected as being the wrong
    // molecule type (achieved by setting the seqlen_thresh2guess argument to
    // CBlastFastaInputSource to a small value)
    for (const SDubiousShortSequence& sample : samples) {
        CBlastInputSourceConfig config(sample.IsProtein());
        config.SetSeqLenThreshold2Guess(5);

        CRef<CBlastFastaInputSource> reader(
            new CBlastFastaInputSource(sample.sequence_data, config));
        CRef<CBlastInput> input(new CBlastInput(&*reader));

        CScope scope(*om);
        BOOST_REQUIRE(!input->End());

        bool mismatch_reported = false;
        blast::SSeqLoc query;
        try {
            query = input->GetNextSeqLocBatch(scope).front();
            // here's a 'misplaced' test for blast::IsLocalId
            BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));
        }
        catch (const CInputException& ex) {
            const string what(ex.what());
            BOOST_REQUIRE(what.find("Nucleotide FASTA provided for prot") != NPOS);
            BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                                ex.GetErrCode());
            mismatch_reported = true;
        }
        // The exception is mandatory in this pass.
        BOOST_REQUIRE(mismatch_reported);
        BOOST_REQUIRE(input->End());
        scope.GetObjectManager().RevokeAllDataLoaders();
    }
}
307
/// Reads data/aa.129295 through a nucleotide-configured source (length
/// guessing threshold 25) and requires CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadFastaWithDefline_MismatchProtNucl)
{
    const bool kIsProtein = false;
    CNcbiIfstream fasta_stream("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetSeqLenThreshold2Guess(25);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("Protein FASTA provided for nucleotide") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
330
/// Reads data/nt.555 through a protein-configured source (length guessing
/// threshold 25) and requires CInputException::eSequenceMismatch.
BOOST_AUTO_TEST_CASE(ReadFastaWithDefline_MismatchNuclProt)
{
    const bool kIsProtein = true;
    CNcbiIfstream fasta_stream("data/nt.555");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetSeqLenThreshold2Guess(25);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("Nucleotide FASTA provided for protein") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
                            ex.GetErrCode());
        mismatch_reported = true;
    }

    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
353
/// Reads the single protein record in data/aa.129295 and checks the
/// resulting Seq-loc: whole interval [0, 231], unknown strand, local id,
/// no mask.
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineProtein_Single)
{
    const bool kIsProtein = true;
    const TSeqPos kLength = 232;

    CNcbiIfstream fasta_stream("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());
    blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    // Safe to bind only after the IsInt() check above.
    const CSeq_interval& interval = query.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_unknown),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(TSeqPos(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!query.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
385
/// Reads data/raw_fasta.na as nucleotide and checks the resulting Seq-loc:
/// whole interval [0, 623], both strands, local id, no mask.
BOOST_AUTO_TEST_CASE(RawFastaWithSpaces)
{
    // this is gi 555, length 624
    const TSeqPos kLength = 624;
    const bool kIsProtein = false;

    CNcbiIfstream fasta_stream("data/raw_fasta.na");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());
    blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    // Safe to bind only after the IsInt() check above.
    const CSeq_interval& interval = query.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(TSeqPos(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!query.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
418
/// Reads data/prot_w_gaps.txt as protein and verifies that no '-' character
/// is present either in the scope's sequence vector or in the NCBIeaa data
/// of the Bioseq built from the query, and that the length is 91.
BOOST_AUTO_TEST_CASE(ReadProteinWithGaps)
{
    const bool kIsProtein = true;
    const TSeqPos kLength = 91; // it's actually 103 with gaps

    CNcbiIfstream fasta_stream("data/prot_w_gaps.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    TSeqLocVector queries = input->GetAllSeqLocs(scope);
    blast::SSeqLoc query = queries.front();
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    // Safe to bind only after the IsInt() check above.
    const CSeq_interval& interval = query.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(TSeqPos(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    // Check the residues as seen through the scope.
    const CSeq_id* query_id = query.seqloc->GetId();
    CBioseq_Handle handle = scope.GetBioseqHandle(*query_id);
    CSeqVector residues = handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac);

    for (size_t pos = 0; pos < residues.size(); ++pos) {
        BOOST_CHECK_NE('-', (char)residues[pos]);
    }

    // Check the residues stored in the Bioseq built from the queries.
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(queries);
    const CBioseq& first_bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    const CSeq_inst& inst = first_bioseq.GetInst();
    BOOST_REQUIRE_EQUAL(inst.GetLength(), kLength);
    BOOST_REQUIRE(inst.IsSetSeq_data());

    const CSeq_data& seq_data = inst.GetSeq_data();
    BOOST_REQUIRE(seq_data.IsNcbieaa());

    const string& raw = seq_data.GetNcbieaa().Get();
    for (size_t pos = 0; pos < raw.size(); ++pos) {
        BOOST_CHECK_NE('-', (char)raw[pos]);
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
462
/// Reads data/raw_fasta2.na as nucleotide via GetAllSeqLocs and checks the
/// first Seq-loc: whole interval [0, 623], both strands, local id, no mask;
/// also requires that a Bioseq-set can be built from the queries.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces)
{
    // this is gi 555, length 624
    const TSeqPos kLength = 624;
    const bool kIsProtein = false;

    CNcbiIfstream fasta_stream("data/raw_fasta2.na");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    TSeqLocVector queries = input->GetAllSeqLocs(scope);
    blast::SSeqLoc query = queries[0];
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    // Safe to bind only after the IsInt() check above.
    const CSeq_interval& interval = query.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(TSeqPos(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(queries);
    BOOST_REQUIRE(bioseqs.NotEmpty());

    BOOST_REQUIRE(!query.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
498
/// With BLASTINPUT_GEN_DELTA_SEQ set in the environment, reads
/// data/nucl_w_n.fsa and requires the first Bioseq built from the queries
/// to have a delta representation.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces_UpperCaseWithN_ReadDeltaSeq)
{
    // Note the setting of the environment variable
    CAutoEnvironmentVariable env("BLASTINPUT_GEN_DELTA_SEQ");

    const bool kIsProtein = false;
    CNcbiIfstream fasta_stream("data/nucl_w_n.fsa");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = input->GetAllSeqLocs(scope);
    blast::SSeqLoc first_query = queries.front();
    (void)first_query;
    BOOST_REQUIRE(input->End());

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(queries);
    BOOST_REQUIRE(bioseqs->CanGetSeq_set());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->GetSeq().CanGetInst());
    BOOST_REQUIRE(
        bioseqs->GetSeq_set().front()->GetSeq().GetInst().CanGetRepr());
    BOOST_REQUIRE(
        bioseqs->GetSeq_set().front()->GetSeq().GetInst().GetRepr()
        == CSeq_inst::eRepr_delta);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
523
524
/// Reads data/gbreport.txt as nucleotide with diagnostics redirected to an
/// in-memory stream, requires an "Ignoring invalid residues at " warning to
/// have been posted, and checks the resulting Seq-loc: whole interval
/// [0, 623], both strands, local id, no mask.
BOOST_AUTO_TEST_CASE(ReadGenbankReport)
{
    CDiagRestorer diag_restorer;

    // Redirect the output warnings
    SetDiagPostLevel(eDiag_Warning);
    CNcbiOstrstream captured_diag;
    SetDiagStream(&captured_diag);

    // this is gi 555, length 624
    const TSeqPos kLength = 624;
    const bool kIsProtein = false;
    CNcbiIfstream report_stream("data/gbreport.txt");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(report_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());
    blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    BOOST_REQUIRE(input->End());

    const string diag_text = CNcbiOstrstreamToString(captured_diag);
    BOOST_REQUIRE(diag_text.find("Ignoring invalid residues at ") != NPOS);

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    // Safe to bind only after the IsInt() check above.
    const CSeq_interval& interval = query.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(TSeqPos(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!query.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
567
/// Reads data/invalid_gi.txt as nucleotide and requires the read to fail
/// with CInputException::eSeqIdNotFound.
BOOST_AUTO_TEST_CASE(ReadInvalidGi)
{
    const bool kIsProtein = false;
    const char* const kFileName = "data/invalid_gi.txt";
    CBlastInputSourceConfig config(kIsProtein);

    CNcbiIfstream gi_stream(kFileName);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    BOOST_REQUIRE(!input->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::SSeqLoc query;
    bool not_found_reported = false;
    try {
        query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& ex) {
        const string what(ex.what());
        BOOST_REQUIRE(what.find("Sequence ID not found: ") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSeqIdNotFound, ex.GetErrCode());
        not_found_reported = true;
    }

    BOOST_REQUIRE(not_found_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
592
/// Reads data/bad_seqid.txt as nucleotide and requires the read to fail
/// with CSeqIdException::eFormat; the source must be exhausted afterwards.
BOOST_AUTO_TEST_CASE(ReadInvalidSeqId)
{
    const char* fname = "data/bad_seqid.txt";
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);

    CNcbiIfstream infile(fname);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    BOOST_REQUIRE(source->End() == false);

    CScope scope(*CObjectManager::GetInstance());
    blast::SSeqLoc ssl;
    bool caught_exception(false);
    try { ssl = source->GetNextSeqLocBatch(scope).front(); }
    catch (const CSeqIdException& e) {
        // Only the error code is checked here; the message text is not
        // inspected, so no copy of e.what() is made.
        BOOST_REQUIRE_EQUAL(CSeqIdException::eFormat, e.GetErrCode());
        caught_exception = true;
    }
    BOOST_REQUIRE(caught_exception);
    BOOST_REQUIRE(source->End() == true);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
616
/// Reads data/bad_input.txt twice (once through GetAllSeqLocs, once through
/// GetAllSeqs) and requires each call to throw CObjReaderParseException,
/// leaving the destination container empty.
BOOST_AUTO_TEST_CASE(ReadBadUserInput)
{
    const char* fname = "data/bad_input.txt";
    const bool is_protein(false);
    const size_t kNumQueries(0);
    CBlastInputSourceConfig iconfig(is_protein);
    CScope scope(*CObjectManager::GetInstance());

    // TSeqLocVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End() == false);

        blast::TSeqLocVector query_vector;
        BOOST_REQUIRE_THROW(query_vector = source->GetAllSeqLocs(scope),
                            CObjReaderParseException);
        // The assignment never completed, so nothing was stored.
        BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());

        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
        BOOST_REQUIRE(bioseqs.Empty());
    }

    // CBlastQueryVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End() == false);

        CRef<blast::CBlastQueryVector> query_vector;
        BOOST_REQUIRE_THROW(query_vector = source->GetAllSeqs(scope),
                            CObjReaderParseException);
        BOOST_REQUIRE(query_vector.Empty());
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
652
653 /// This unit test proves that if one input is bad, all of them are rejected.
BOOST_AUTO_TEST_CASE(ReadMultipleGis_WithBadInput)654 BOOST_AUTO_TEST_CASE(ReadMultipleGis_WithBadInput)
655 {
656 const char* fname = "data/gis_bad_input.txt";
657 CNcbiIfstream infile(fname);
658 const bool is_protein(false);
659 CBlastInputSourceConfig iconfig(is_protein);
660 iconfig.SetRetrieveSeqData(false);
661
662 vector< pair<long, long> > gi_length;
663 gi_length.push_back(make_pair(89161185L, 247249719L));
664 // this is never read...
665 //gi_length.push_back(make_pair(0L, 0L)); // bad sequence
666 //gi_length.push_back(make_pair(557L, 489L));
667
668 CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
669 BOOST_REQUIRE(source->End() == false);
670
671 CScope scope(*CObjectManager::GetInstance());
672
673 blast::TSeqLocVector seqs;
674 BOOST_REQUIRE_THROW(seqs = source->GetAllSeqLocs(scope),
675 CObjReaderParseException);
676 scope.GetObjectManager().RevokeAllDataLoaders();
677 }
678
/// Checks the behaviour on empty input: an empty stream yields a source that
/// is already at End() and produces no queries through either interface,
/// while constructing a CBlastFastaInputSource from an empty string must
/// throw CInputException::eEmptyUserInput.
BOOST_AUTO_TEST_CASE(ReadEmptyUserInput)
{
    // NOTE(review): "/dev/null" is a POSIX path; confirm how this test is
    // expected to behave on platforms that lack it.
    const char* fname("/dev/null");
    const bool is_protein(true);
    CScope scope(*CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(is_protein);

    // Empty stream, TSeqLocVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End() == true);

        blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE(query_vector.empty());

        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
        BOOST_REQUIRE(bioseqs.Empty());
    }

    // Empty stream, CBlastQueryVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End() == true);

        CRef<blast::CBlastQueryVector> queries = source->GetAllSeqs(scope);
        BOOST_REQUIRE(queries->Empty());
    }

    // Read from buffer
    {
        const string empty;
        CRef<CBlastFastaInputSource> source;

        bool caught_exception(false);
        try { source.Reset(new CBlastFastaInputSource(empty, iconfig)); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("No sequence input was provided") != NPOS);
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
724
725 // Basic test case to ensure CFastaReader changes don't break basic
726 // functionality required by BLAST
BOOST_AUTO_TEST_CASE(ReadSingleFasta_WithTitle)727 BOOST_AUTO_TEST_CASE(ReadSingleFasta_WithTitle)
728 {
729 const string kFileName("data/isprot.fa");
730 const string kExpectedTitle("seq");
731 const bool is_protein(false);
732
733 CScope scope(*CObjectManager::GetInstance());
734 CBlastInputSourceConfig iconfig(is_protein);
735
736 CNcbiIfstream infile(kFileName.c_str());
737 CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
738
739 blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
740 CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
741 BOOST_REQUIRE(!bioseqs.Empty());
742
743 string title;
744 ITERATE(CBioseq::TDescr::Tdata, itr, bioseqs->GetSeq_set().front()->GetSeq().GetDescr().Get()) {
745 const CSeqdesc& desc = **itr;
746 if (desc.IsTitle()) {
747 title = desc.GetTitle();
748 break;
749 }
750 }
751 BOOST_REQUIRE_EQUAL(kExpectedTitle, title);
752 scope.GetObjectManager().RevokeAllDataLoaders();
753 }
754
755 static
s_ReadAndTestQueryFromString_CFastaReader(const string & input,TSeqPos expected_length)756 void s_ReadAndTestQueryFromString_CFastaReader(const string& input,
757 TSeqPos expected_length)
758 {
759 CFastaReader::TFlags defaultBLASTflags = CFastaReader::fNoParseID |
760 CFastaReader::fDLOptional;
761 defaultBLASTflags += CFastaReader::fAssumeNuc;
762 defaultBLASTflags += CFastaReader::fNoSplit;
763 defaultBLASTflags += CFastaReader::fHyphensIgnoreAndWarn;
764 defaultBLASTflags += CFastaReader::fDisableNoResidues;
765 defaultBLASTflags += CFastaReader::fQuickIDCheck;
766
767 CRef<ILineReader> line_reader(new CMemoryLineReader(input.c_str(),
768 input.size()));
769 CFastaReader fasta_reader(*line_reader, defaultBLASTflags);
770 fasta_reader.IgnoreProblem(ILineError::eProblem_ModifierFoundButNoneExpected);
771 fasta_reader.IgnoreProblem(ILineError::eProblem_TooLong);
772
773 CRef<CSeqIdGenerator> idgen(new CSeqIdGenerator(1, kEmptyStr));
774 fasta_reader.SetIDGenerator(*idgen);
775
776 CRef<CSeq_entry> se(fasta_reader.ReadOneSeq());
777 BOOST_REQUIRE_EQUAL(expected_length, se->GetSeq().GetLength());
778 }
779
/// CFastaReader: defline plus four residues, no newline after the sequence.
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderNoNewLineAfterSeq)
{
    const TSeqPos kLength = 4;
    const string kFasta = ">seq_1\nATGC";
    s_ReadAndTestQueryFromString_CFastaReader(kFasta, kLength);
}
/// CFastaReader: defline plus four residues, terminated by a newline.
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderWithNewLines)
{
    const TSeqPos kLength = 4;
    const string kFasta = ">seq_1\nATGC\n";
    s_ReadAndTestQueryFromString_CFastaReader(kFasta, kLength);
}
/// CFastaReader: bare residues with neither a defline nor a newline.
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderNoDeflineNoNewLines)
{
    const TSeqPos kLength = 4;
    const string kFasta = "ATGC";
    s_ReadAndTestQueryFromString_CFastaReader(kFasta, kLength);
}
798
799 static
s_ReadAndTestQueryFromString(const string & input,TSeqPos expected_length,bool is_protein)800 void s_ReadAndTestQueryFromString(const string& input, TSeqPos expected_length,
801 bool is_protein)
802 {
803 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
804 BOOST_REQUIRE(objmgr);
805
806 SDataLoaderConfig dlconfig(is_protein);
807 CBlastInputSourceConfig iconfig(dlconfig);
808 CBlastFastaInputSource queryInput(input, iconfig);
809 CScope scope(*objmgr);
810 CBlastInput qIn(&queryInput);
811 blast::TSeqLocVector query = qIn.GetAllSeqLocs(scope);
812 BOOST_REQUIRE_EQUAL(expected_length,
813 sequence::GetLength(*query.front().seqloc, &scope));
814 CRef<CSeqVector> sv(new CSeqVector(*query.front().seqloc, scope));
815 BOOST_REQUIRE_EQUAL(expected_length, sv->size());
816 BOOST_REQUIRE_EQUAL(is_protein, sv->IsProtein());
817 sv->SetIupacCoding();
818 string::size_type input_pos = input.find_first_of("ACTG");
819 BOOST_REQUIRE(input_pos != string::npos);
820 for (TSeqPos pos = 0; pos < sv->size(); pos++, input_pos++) {
821 CNcbiOstrstream oss;
822 oss << "Sequence data differs at position " << pos << ": '"
823 << input[input_pos] << "' .vs '" << (*sv)[pos] << "'";
824 string msg = CNcbiOstrstreamToString(oss);
825 BOOST_REQUIRE_MESSAGE(input[input_pos] == (*sv)[pos], msg);
826 }
827 scope.GetObjectManager().RevokeAllDataLoaders();
828 }
829
/// Nucleotide FASTA text without a newline after the sequence, read through
/// the CBlastInput-based helper; 4 bases are expected.
BOOST_AUTO_TEST_CASE(SingleSequenceString_NoNewLineAfterSeq)
{
    const bool is_protein = false;
    const TSeqPos num_bases = 4;
    const string fasta_text = ">seq_1\nATGC";
    s_ReadAndTestQueryFromString(fasta_text, num_bases, is_protein);
}
836
/// Nucleotide FASTA text terminated by a newline, read through the
/// CBlastInput-based helper; 4 bases are expected.
BOOST_AUTO_TEST_CASE(SingleSequenceString_WithNewLines)
{
    const bool is_protein = false;
    const TSeqPos num_bases = 4;
    const string fasta_text = ">seq_1\nATGC\n";
    s_ReadAndTestQueryFromString(fasta_text, num_bases, is_protein);
}
843
/// Bare nucleotide residues (no defline, no newline), read through the
/// CBlastInput-based helper; 4 bases are expected.
BOOST_AUTO_TEST_CASE(SingleSequenceString_NoDeflineNoNewLines)
{
    const bool is_protein = false;
    const TSeqPos num_bases = 4;
    const string fasta_text = "ATGC";
    s_ReadAndTestQueryFromString(fasta_text, num_bases, is_protein);
}
850
/// Input that consists solely of a FASTA defline (">mygene\n") must be
/// rejected by CheckForEmptySequences with CInputException::eEmptyUserInput
/// and the message "Query contains no sequence data".
///
/// The same input is exercised three ways: read from a temporary file into a
/// TSeqLocVector (and the CBioseq_set derived from it), read from the file
/// into a CBlastQueryVector, and read from the in-memory string into a
/// CBlastQueryVector.
BOOST_AUTO_TEST_CASE(ReadEmptyUserInput_OnlyTitle)
{
    CTmpFile tmpfile;
    const string kUserInput(">mygene\n");
    CNcbiOfstream out(tmpfile.GetFileName().c_str());
    out << kUserInput;
    out.close();


    const bool is_protein(false);
    CScope scope(*CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(is_protein);
    bool caught_exception(false);
    string warnings;
    {
        CNcbiIfstream infile(tmpfile.GetFileName().c_str());
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

        // An empty vector: an exception is not required here, but if one is
        // thrown it must carry the eEmptyUserInput error code.
        blast::TSeqLocVector query_vector;
        try { CheckForEmptySequences(query_vector, warnings); }
        catch (const CInputException& e) {
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
        }

        // The defline-only query is read (the vector is non-empty) but must
        // be reported as having no sequence data.
        query_vector = source->GetAllSeqLocs(scope);
        try { CheckForEmptySequences(query_vector, warnings); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
            BOOST_REQUIRE(warnings.empty());
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
        BOOST_REQUIRE(query_vector.empty() == false);

        // Same expectation for the CBioseq_set overload.
        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
        BOOST_REQUIRE(!bioseqs.Empty());
        caught_exception = false;
        try { CheckForEmptySequences(bioseqs, warnings); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
            BOOST_REQUIRE(warnings.empty());
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
    }

    {
        // Re-read the file, this time producing a CBlastQueryVector.
        CNcbiIfstream infile(tmpfile.GetFileName().c_str());
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

        caught_exception = false;
        CRef<blast::CBlastQueryVector> queries = source->GetAllSeqs(scope);
        try { CheckForEmptySequences(queries, warnings); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
            BOOST_REQUIRE(warnings.empty());
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
        BOOST_REQUIRE(!queries.Empty());
    }

    // Read from buffer
    {
        CRef<CBlastInput> source(s_DeclareBlastInput(kUserInput, iconfig));
        // A null query vector: as above, an exception is optional but must be
        // eEmptyUserInput if thrown.
        CRef<blast::CBlastQueryVector> queries;
        try { CheckForEmptySequences(queries, warnings); }
        catch (const CInputException& e) {
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
        }

        caught_exception = false;
        queries = source->GetAllSeqs(scope);
        try { CheckForEmptySequences(queries, warnings); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
            BOOST_REQUIRE(warnings.empty());
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
944
/// Reads the accession stored in data/accession.txt as a nucleotide query
/// without retrieving sequence data, then checks the resulting Seq-loc
/// (interval 0..length-1 on both strands, CSeq_id::e_Other id with accession
/// NC_000001, no mask) and the Bioseq produced by TSeqLocVector2Bioseqs.
BOOST_AUTO_TEST_CASE(ReadSingleAccession)
{
    CNcbiIfstream infile("data/accession.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    // Expected length of the sequence; the interval end is length-1.
    const TSeqPos length(248956422);
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
    const string accession("NC_000001");
    BOOST_REQUIRE_EQUAL(accession,
                        ssl.seqloc->GetInt().GetId().GetOther().GetAccession());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(accession, b.GetId().front()->GetOther().GetAccession());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
993
/// Reads the accession in data/accession.txt with sequence-data retrieval
/// enabled and a data loader configuration optimized for whole large
/// sequence retrieval. The Seq-loc must cover kStart..kStop on both strands
/// and be identified either by GI or by accession.version, depending on
/// CSeq_id::PreferAccessionOverGi(). The derived Bioseq must carry both a GI
/// and the accession, and be a delta sequence of length kStop+1.
BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequence)
{
    CNcbiIfstream infile("data/accession.txt");
    const bool is_protein(false);
    const TIntId kGi = 568815597;
    const TSeqPos kStart = 0;
    const TSeqPos kStop(248956421);
    // NOTE(review): the first argument is presumably a BLAST database name;
    // confirm against SDataLoaderConfig.
    SDataLoaderConfig dlconfig("GPIPE/9606/current/GCF_000005045.24_top_level", is_protein);
    dlconfig.OptimizeForWholeLargeSequenceRetrieval(true);

    CBlastInputSourceConfig iconfig(dlconfig);
    iconfig.SetRetrieveSeqData(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CRef<CScope> scope(CBlastScopeSource(dlconfig).NewScope());
    BOOST_REQUIRE(source->End() == false);

    blast::TSeqLocVector seqs = source->GetAllSeqLocs(*scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    BOOST_REQUIRE_EQUAL(kStop, ssl.seqloc->GetInt().GetTo());

    const string accession = "NC_000001";
    const int version = 11;
    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    // The id type of the Seq-loc depends on the toolkit-wide preference.
    if ( !CSeq_id::PreferAccessionOverGi() ) {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
        BOOST_REQUIRE_EQUAL(GI_CONST(kGi), ssl.seqloc->GetInt().GetId().GetGi());
    }
    else {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
        BOOST_REQUIRE_EQUAL(accession, ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
        BOOST_REQUIRE_EQUAL(version, ssl.seqloc->GetInt().GetId().GetOther().GetVersion());
    }

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by a BLAST command line
    /// binary
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    // Both a GI and an accession id must be present among the Bioseq ids.
    bool found_gi = false, found_accession = false;
    ITERATE(CBioseq::TId, id, b.GetId()) {
        if ((*id)->Which() == CSeq_id::e_Gi) {
            BOOST_REQUIRE_EQUAL(GI_CONST(kGi), (*id)->GetGi());
            found_gi = true;
        } else if ((*id)->Which() == CSeq_id::e_Other) {
            CNcbiOstrstream os;
            (*id)->GetOther().AsFastaString(os);
            const string fasta_acc = CNcbiOstrstreamToString(os);
            BOOST_REQUIRE(NStr::Find(fasta_acc, accession) != NPOS);
            found_accession = true;
        }
    }
    BOOST_REQUIRE(found_gi);
    BOOST_REQUIRE(found_accession);
    // the BLAST database data loader will fetch this as a delta sequence
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_delta, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kStop+1, b.GetInst().GetLength());
    scope->GetObjectManager().RevokeAllDataLoaders();
}
// Time limit applied to ReadSingleAccession_RetrieveLargeSequence; debug
// builds get a larger allowance than non-debug builds.
// NOTE(review): the unit is presumably seconds -- confirm against the
// BOOST_AUTO_TEST_CASE_TIMEOUT documentation.
#ifdef _DEBUG
const int kTimeOutLargeSeq = 60;
#else
const int kTimeOutLargeSeq = 20;
#endif
BOOST_AUTO_TEST_CASE_TIMEOUT(ReadSingleAccession_RetrieveLargeSequence,
                             kTimeOutLargeSeq);
1077
/// Reads the accession in data/accession.txt with a query range of
/// kStart..kStop configured and sequence-data retrieval disabled. The
/// Seq-loc must reflect the requested range, while the Bioseq produced by
/// TSeqLocVector2Bioseqs must still report the full sequence length.
BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequenceWithRange)
{
    CNcbiIfstream infile("data/accession.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    const TSeqPos kStart = 1;
    const TSeqPos kStop = 1000;
    iconfig.SetRange().SetFrom(kStart);
    iconfig.SetRange().SetTo(kStop);
    // comment the line below to fetch the sequence data
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    SDataLoaderConfig dlconfig(is_protein);
    CRef<CScope> scope(CBlastScopeSource(dlconfig).NewScope());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(*scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    BOOST_REQUIRE_EQUAL(kStop, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
    const string accession("NC_000001");
    BOOST_REQUIRE_EQUAL(accession,
                        ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(accession, b.GetId().front()->GetOther().GetAccession());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    // The Bioseq length is that of the whole sequence, not of the range.
    const TSeqPos length(248956422);
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope->GetObjectManager().RevokeAllDataLoaders();
}
// Time limit applied to ReadSingleAccession_RetrieveLargeSequenceWithRange;
// debug builds get a larger allowance than non-debug builds.
// NOTE(review): the unit is presumably seconds -- confirm against the
// BOOST_AUTO_TEST_CASE_TIMEOUT documentation.
#ifdef _DEBUG
const int kTimeOutLargeSeqWithRange = 60;
#else
const int kTimeOutLargeSeqWithRange = 15;
#endif
BOOST_AUTO_TEST_CASE_TIMEOUT(ReadSingleAccession_RetrieveLargeSequenceWithRange,
                             kTimeOutLargeSeqWithRange);
1138
/// Reads four nucleotide accessions from data/accessions.txt without
/// retrieving sequence data. Every query must be a both-strand interval
/// ending at length-1 with a non-local CSeq_id::e_Other id whose accession
/// (and version, where one is expected) matches the table below. A second
/// call to GetAllSeqLocs must yield nothing.
BOOST_AUTO_TEST_CASE(ReadMultipleAccessions)
{
    CNcbiIfstream infile("data/accessions.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    typedef pair<string, long> TAccessionLength;
    vector<TAccessionLength> accession_lengths;
    accession_lengths.push_back(TAccessionLength("NC_000001", 248956422L));
    accession_lengths.push_back(TAccessionLength("NC_000010.9", 135374737L));
    accession_lengths.push_back(TAccessionLength("NC_000011.8", 134452384L));
    accession_lengths.push_back(TAccessionLength("NC_000012.10", 132349534L));

    // Accession and version expected for each query, in input order.
    // A version of 0 means the version is not checked.
    static const char* const kAccessions[] =
        { "NC_000001", "NC_000010", "NC_000011", "NC_000012" };
    static const int kVersions[] = { 0, 9, 8, 10 };

    const size_t kNumQueries(accession_lengths.size());
    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End());

    {{
        // The input is exhausted: reading again returns no queries.
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
        BOOST_REQUIRE(source->End());
    }}

    for (size_t idx = 0; idx < kNumQueries; ++idx) {
        blast::SSeqLoc& ssl = query_vector[idx];
        const CSeq_interval& interval = ssl.seqloc->GetInt();

        BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetStrand());
        BOOST_REQUIRE_EQUAL((TSeqPos)accession_lengths[idx].second - 1,
                            interval.GetTo());

        BOOST_REQUIRE(interval.IsSetId());
        BOOST_REQUIRE(!blast::IsLocalId(ssl.seqloc->GetId()));
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, interval.GetId().Which());

        const string accession(kAccessions[idx]);
        const int version = kVersions[idx];

        BOOST_REQUIRE_EQUAL(accession,
                            interval.GetId().GetOther().GetAccession());
        if (version != 0) {
            BOOST_REQUIRE_EQUAL(version,
                                interval.GetId().GetOther().GetVersion());
        }
        BOOST_REQUIRE(!ssl.mask);
    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1200
1201 // This test was created to test issues in jira/browse/CXX-82
/// Feeds several protein identifiers in different notations through an
/// in-memory stream, one per line, without retrieving sequence data. Each
/// resulting location must end at length-1, have an unknown strand and a
/// non-local id.
BOOST_AUTO_TEST_CASE(ReadMultipleAccessionsFromMemory)
{
    typedef pair<string, int> TStringInt;
    typedef vector<TStringInt> TStringIntVector;
    TStringIntVector accession_lengths;
    accession_lengths.push_back(TStringInt("P01012.2", 386));
    accession_lengths.push_back(TStringInt("1OVA-A", 386));
    // Fails in entrez, we implemented regex for this in CBlastInputReader
    accession_lengths.push_back(TStringInt("pdb|1OVA-A", 386));
    // Note the double bar..
    accession_lengths.push_back(TStringInt("prf||0705172A", 385));
    // Fails in entrez, we implemented regex for this in CBlastInputReader
    accession_lengths.push_back(TStringInt("sp|P01012.2", 386));

    // This we're not even going to try to fix...
    //accession_lengths.push_back(make_pair(string("0705172A"), 385));

    // One identifier per line.
    string user_input;
    for (size_t k = 0; k < accession_lengths.size(); ++k) {
        user_input += accession_lengths[k].first + "\n";
    }
    istringstream instream(user_input);

    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));

    const size_t kNumQueries(accession_lengths.size());
    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End());

    {{
        // The input is exhausted: reading again returns no queries.
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
        BOOST_REQUIRE(source->End());
    }}

    for (size_t idx = 0; idx < kNumQueries; ++idx) {
        const string& accession = accession_lengths[idx].first;
        blast::SSeqLoc& ssl = query_vector[idx];
        const TSeqPos expected_to = (TSeqPos)accession_lengths[idx].second - 1;
        const TSeqPos actual_to = ssl.seqloc->GetInt().GetTo();

        CNcbiOstrstream oss;
        oss << "Accession " << accession << " difference in lengths: "
            << expected_to << " vs. " << actual_to;
        string msg = CNcbiOstrstreamToString(oss);
        BOOST_REQUIRE_MESSAGE(expected_to == actual_to, msg);

        BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)ssl.seqloc->GetStrand());
        BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId());
        BOOST_REQUIRE(!blast::IsLocalId(ssl.seqloc->GetId()));
    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1262
/// Reads the GI stored in data/gi.txt as a nucleotide query without
/// retrieving sequence data, then checks the resulting Seq-loc (interval
/// 0..length-1 on both strands, CSeq_id::e_Gi id, no mask) and the Bioseq
/// produced by TSeqLocVector2Bioseqs.
BOOST_AUTO_TEST_CASE(ReadSingleGi)
{
    CNcbiIfstream infile("data/gi.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    // Expected length of the sequence; the interval end is length-1.
    const TSeqPos length = 247249719;
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
    const TGi gi = GI_CONST(89161185);
    BOOST_REQUIRE_EQUAL(gi, ssl.seqloc->GetInt().GetId().GetGi());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(gi, b.GetId().front()->GetGi());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1310
/// Reads three GIs from data/gis.txt as nucleotide queries without
/// retrieving sequence data. Every Seq-loc must be an interval 0..length-1
/// on both strands with a non-local CSeq_id::e_Gi id matching the table
/// below; the Bioseqs produced by TSeqLocVector2Bioseqs must match the same
/// GIs and lengths, in order.
BOOST_AUTO_TEST_CASE(ReadMultipleGis)
{
    CNcbiIfstream infile("data/gis.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    // Expected (GI, sequence length) for each query, in input order.
    vector< pair<TIntId, long> > gi_length;
    gi_length.push_back(make_pair(89161185, 247249719L));
    gi_length.push_back(make_pair(555, 624L));
    gi_length.push_back(make_pair(557, 489L));

    const size_t kNumQueries(gi_length.size());
    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End() == true);
    // The loop below indexes seqs[0..kNumQueries-1]; make sure that many
    // queries were actually read before doing so.
    BOOST_REQUIRE_EQUAL(kNumQueries, seqs.size());

    for (size_t i = 0; i < kNumQueries; i++) {
        blast::SSeqLoc ssl = seqs[i];
        BOOST_REQUIRE(ssl.seqloc->IsInt() == true);

        BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
        BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

        BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
        BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

        BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
        const TSeqPos length = gi_length[i].second;
        BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

        BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
        BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
        const TIntId gi = gi_length[i].first;
        BOOST_REQUIRE_EQUAL(GI_FROM(TIntId, gi), ssl.seqloc->GetInt().GetId().GetGi());

        BOOST_REQUIRE(!ssl.mask);
    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());

    CBioseq_set::TSeq_set::const_iterator itr = bioseqs->GetSeq_set().begin();
    CBioseq_set::TSeq_set::const_iterator end = bioseqs->GetSeq_set().end();
    for (size_t i = 0; i < kNumQueries; i++, ++itr) {
        BOOST_REQUIRE(itr != end);
        BOOST_REQUIRE((*itr)->IsSeq());
        const CBioseq& b = (*itr)->GetSeq();
        BOOST_REQUIRE(b.IsNa());
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, b.GetId().front()->Which());
        BOOST_REQUIRE_EQUAL(GI_FROM(TIntId, gi_length[i].first), b.GetId().front()->GetGi());
        BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
        BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
        BOOST_REQUIRE_EQUAL((long)gi_length[i].second, (long)b.GetInst().GetLength());
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1372
// This input file contains very short sequences (1-3 bases) that were produced
// by a sequencing machine
/// Reads data/DF-1.txt as nucleotide input with the default input source
/// configuration; 96 queries are expected and the first one must carry a
/// local id.
BOOST_AUTO_TEST_CASE(ReadMultipleSequencesFromSequencer)
{
    const size_t kNumQueries = 96;
    const bool is_protein = false;

    CNcbiIfstream infile("data/DF-1.txt");
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    BOOST_REQUIRE(!source->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, queries.size());

    const blast::SSeqLoc& first_query = queries.front();
    BOOST_REQUIRE(blast::IsLocalId(first_query.seqloc->GetId()));
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1391
/// Reads data/DF-1.txt as nucleotide input with the configuration flag named
/// kParseID enabled; 96 queries are expected and the first three must carry
/// the local ids listed below.
BOOST_AUTO_TEST_CASE(ReadMultipleSequencesFromSequencerParseLocalIds)
{
    const size_t kNumQueries = 96;
    const bool kIsProtein = false;
    const bool kParseID = true;

    CNcbiIfstream infile("data/DF-1.txt");
    SDataLoaderConfig dlconfig(kIsProtein);
    CBlastInputSourceConfig iconfig(dlconfig, objects::eNa_strand_other, false, kParseID);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    BOOST_REQUIRE(!source->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, queries.size());
    BOOST_REQUIRE(blast::IsLocalId(queries.front().seqloc->GetId()));

    // Check that the first three IDs went through.
    static const char* const kExpectedIds[] = {
        "lcl|seq#474_A03_564_c_T3+40.ab1",
        "lcl|seq#474_A01_564_a_T3+40.ab1",
        "lcl|seq#474_A02_564_b_T3+40.ab1"
    };
    for (size_t k = 0; k < 3; ++k) {
        BOOST_REQUIRE_EQUAL(queries[k].seqloc->GetId()->AsFastaString(),
                            string(kExpectedIds[k]));
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1414
/// Reads data/localid.txt as nucleotide input with the configuration flag
/// named kParseID enabled and checks that the first query carries the local
/// id "lcl|mylocalID555".
BOOST_AUTO_TEST_CASE(ReadSequenceWithlclID)
{
    CNcbiIfstream infile("data/localid.txt");
    const bool kIsProtein(false);
    const bool kParseID(true);
    SDataLoaderConfig dlconfig(kIsProtein);
    CBlastInputSourceConfig iconfig(dlconfig, objects::eNa_strand_other, false, kParseID);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    // front() / operator[] on an empty vector is undefined behavior; fail
    // cleanly instead if nothing was read.
    BOOST_REQUIRE(!query_vector.empty());
    BOOST_REQUIRE(blast::IsLocalId(query_vector.front().seqloc->GetId()));
    // Check that the local ID went through.
    BOOST_REQUIRE_EQUAL(query_vector[0].seqloc->GetId()->AsFastaString(), string("lcl|mylocalID555"));
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1431
// This input file contains several sequences in FASTA format, one of which
// is empty; reading it should proceed with no problems
/// Reads six nucleotide queries from data/nt.multiple_queries.one.empty,
/// checks each query's local id and length (one of them is 0), and verifies
/// that CheckForEmptySequences reports the empty query through a warning
/// for both the TSeqLocVector and the CBioseq_set overloads.
BOOST_AUTO_TEST_CASE(ReadMultipleSequences_OneEmpty)
{
    const size_t kNumQueries = 6;
    const bool is_protein = false;

    CNcbiIfstream infile("data/nt.multiple_queries.one.empty");
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    BOOST_REQUIRE(!source->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End());

    // Expected length of each query, in input order.
    TSeqPos query_lengths[] = { 1920, 1, 130, 0, 2, 1552 };
    for (size_t idx = 0; idx < query_vector.size(); ++idx) {
        BOOST_REQUIRE(blast::IsLocalId(query_vector[idx].seqloc->GetId()));
        BOOST_REQUIRE_EQUAL(query_lengths[idx],
                            sequence::GetLength(*query_vector[idx].seqloc,
                                                query_vector[idx].scope));
    }

    const string kWarningText("following sequences had no sequence data:");

    string warnings;
    CheckForEmptySequences(query_vector, warnings);
    BOOST_REQUIRE(warnings.find(kWarningText) != NPOS);

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    warnings.clear();
    CheckForEmptySequences(bioseqs, warnings);
    BOOST_REQUIRE(warnings.find(kWarningText) != NPOS);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1470
/// Reads four trace identifiers from data/tis.txt as nucleotide queries,
/// using the BLAST database "data/WGS_test" and without retrieving sequence
/// data. Every query must be a both-strand interval ending at length-1 whose
/// id is a non-local CSeq_id::e_General id with db "ti" and the expected
/// integer tag. A second call to GetAllSeqLocs must yield nothing.
BOOST_AUTO_TEST_CASE(ReadMultipleTis)
{
    CNcbiIfstream infile("data/tis.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    iconfig.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test" ;
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(source->End() == false);

    // Expected (trace id, sequence length) for each query, in input order.
    vector< pair<int, long> > ti_lengths;
    ti_lengths.push_back(make_pair(12345, 657L));
    ti_lengths.push_back(make_pair(12347, 839L));
    ti_lengths.push_back(make_pair(12348, 658L));
    ti_lengths.push_back(make_pair(10000, 670L));

    const size_t kNumQueries(ti_lengths.size());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End() == true);

    {{
        // The input is exhausted: reading again returns no queries.
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
        BOOST_REQUIRE(source->End() == true);
    }}

    const string db("ti");
    for (size_t i = 0; i < kNumQueries; i++) {

        const blast::SSeqLoc& ssl = query_vector[i];
        BOOST_REQUIRE(ssl.seqloc->IsInt());
        const CSeq_interval& seqint = ssl.seqloc->GetInt();
        BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetStrand());
        BOOST_REQUIRE_EQUAL((TSeqPos)ti_lengths[i].second - 1, seqint.GetTo());

        BOOST_REQUIRE(seqint.IsSetId() == true);
        // Check the id of the query being examined, not only the first one.
        BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()));
        BOOST_REQUIRE_EQUAL(CSeq_id::e_General, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(db, seqint.GetId().GetGeneral().GetDb());
        BOOST_REQUIRE_EQUAL(ti_lengths[i].first,
                            seqint.GetId().GetGeneral().GetTag().GetId());
        BOOST_REQUIRE(!ssl.mask);
    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1523
/// Reads the trace identifier stored in data/ti.txt as a nucleotide query,
/// using the BLAST database "data/WGS_test" with sequence-data retrieval
/// enabled. The Seq-loc must be an interval 0..length-1 on both strands with
/// a non-local CSeq_id::e_General id (db "ti", integer tag 12345); the
/// Bioseq produced by TSeqLocVector2Bioseqs must match.
BOOST_AUTO_TEST_CASE(ReadSingleTi)
{
    CNcbiIfstream infile("data/ti.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(true);
    iconfig.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test" ;
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    // Expected length of the sequence; the interval end is length-1.
    const TSeqPos length(657);
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_General, ssl.seqloc->GetInt().GetId().Which());
    const string db("ti");
    BOOST_REQUIRE_EQUAL(db, ssl.seqloc->GetInt().GetId().GetGeneral().GetDb());
    BOOST_REQUIRE(ssl.seqloc->GetInt().GetId().GetGeneral().GetTag().IsId());
    const int ti(12345);
    BOOST_REQUIRE_EQUAL(ti, ssl.seqloc->GetInt().GetId().GetGeneral().GetTag().GetId());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    // NOTE(review): the id type is checked on front() while db/tag are read
    // from back(); these only refer to the same id when the Bioseq has a
    // single id -- confirm whether the front()/back() mix is intentional.
    BOOST_REQUIRE_EQUAL(CSeq_id::e_General, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(db, b.GetId().back()->GetGeneral().GetDb());
    BOOST_REQUIRE_EQUAL(ti, b.GetId().back()->GetGeneral().GetTag().GetId());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1576
/// Reads data/accgis_nl.txt as nucleotide input (sequence data retrieval
/// disabled, BLAST database name set to data/WGS_test) and expects eight
/// queries in this order: three GIs, two RefSeq accessions, the trace id
/// gnl|ti|12345 and two more RefSeq accessions.  Strand, stop coordinate and
/// Seq-id of every query are compared with the expected values.
BOOST_AUTO_TEST_CASE(ReadAccessionsAndGisWithNewLines)
{
    CNcbiIfstream infile("data/accgis_nl.txt");
    const bool kIsProtein(false);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    iconfig.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test";

    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    // Expected queries: identifier and length of the sequence
    typedef pair<string, long> TIdAndLength;
    vector<TIdAndLength> expected;
    expected.push_back(TIdAndLength("89161215", 111583154L));
    expected.push_back(TIdAndLength("89161217", 155407050L));
    expected.push_back(TIdAndLength("89161219", 11133097L));
    expected.push_back(TIdAndLength("NC_000001", 248956422L));
    expected.push_back(TIdAndLength("NC_000010.9", 135374737L));
    expected.push_back(TIdAndLength("gnl|ti|12345", 657L));
    expected.push_back(TIdAndLength("NC_000011.8", 134452384L));
    expected.push_back(TIdAndLength("NC_000012.10", 132349534L));

    const size_t kNumQueries(expected.size());
    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End());

    // Once the input is exhausted, asking again must return nothing
    {{
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL(static_cast<size_t>(0),
                            static_cast<size_t>(cached_queries.size()));
        BOOST_REQUIRE(source->End());
    }}

    for (size_t i = 0; i < kNumQueries; ++i) {

        const blast::SSeqLoc& ssl = query_vector[i];
        const string& id = expected[i].first;
        const TSeqPos kLastPos =
            static_cast<TSeqPos>(expected[i].second) - 1;

        BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                            static_cast<int>(ssl.seqloc->GetStrand()));
        BOOST_REQUIRE_EQUAL(kLastPos, ssl.seqloc->GetInt().GetTo());

        const CSeq_interval& seqint = ssl.seqloc->GetInt();
        BOOST_REQUIRE(seqint.IsSetId());
        BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
        const CSeq_id& seqid = seqint.GetId();

        // Identifiers that convert to a non-zero number are treated as GIs
        const TGi gi =
            NStr::StringToNumeric<TGi>(id, NStr::fConvErr_NoThrow);
        if (gi != ZERO_GI) {
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, seqid.Which());
            BOOST_REQUIRE_EQUAL(gi, seqid.GetGi());
        } else if (i == 5) {
            // The trace identifier
            const string kDb("ti");
            const int kTi(12345);
            BOOST_REQUIRE_EQUAL(CSeq_id::e_General, seqid.Which());
            BOOST_REQUIRE_EQUAL(kDb, seqid.GetGeneral().GetDb());
            BOOST_REQUIRE(seqid.GetGeneral().GetTag().IsId());
            BOOST_REQUIRE_EQUAL(kTi, seqid.GetGeneral().GetTag().GetId());
        } else {
            // The RefSeq accessions; a version of 0 means "do not check"
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, seqid.Which());
            string accession;
            int version;

            switch (i) {
            case 3:
                accession.assign("NC_000001");
                version = 0;
                break;
            case 4:
                accession.assign("NC_000010");
                version = 9;
                break;
            case 6:
                accession.assign("NC_000011");
                version = 8;
                break;
            case 7:
                accession.assign("NC_000012");
                version = 10;
                break;
            default:
                abort();
            }

            BOOST_REQUIRE_EQUAL(accession, seqid.GetOther().GetAccession());
            if (version != 0) {
                BOOST_REQUIRE_EQUAL(version, seqid.GetOther().GetVersion());
            }
        }
        BOOST_REQUIRE(!ssl.mask);

    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1663
1664 static string*
s_FileContents2String(const char * file_name)1665 s_FileContents2String(const char* file_name)
1666 {
1667 CNcbiIfstream file(file_name);
1668 char buffer[2048] = { '\0' };
1669 auto_ptr<string> retval(new string);
1670
1671 while (file.getline(buffer, sizeof(buffer))) {
1672 (*retval) += string(buffer) + "\n";
1673 }
1674
1675 return retval.release();
1676 }
1677
/// Loads data/accession.txt into a string and uses that string as nucleotide
/// input (sequence data retrieval disabled).  The only query is expected to
/// be an unmasked interval over both strands of NC_000001, whose length is
/// 248956422; the Bioseq built from it must agree.
BOOST_AUTO_TEST_CASE(ReadAccessionNucleotideIntoBuffer_Single)
{
    // Expected values
    const string  kAccession("NC_000001");
    const TSeqPos kLength(248956422);

    const char* fname("data/accession.txt");
    auto_ptr<string> user_input(s_FileContents2String(fname));

    CRef<CObjectManager> om(CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(false);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(*user_input, iconfig));

    CScope scope(*om);
    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());

    BOOST_REQUIRE(seqint.IsSetId());
    BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, seqint.GetId().Which());
    BOOST_REQUIRE_EQUAL(kAccession,
                        seqint.GetId().GetOther().GetAccession());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    const CBioseq_set::TSeq_set& entries = bioseqs->GetSeq_set();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), entries.size());
    BOOST_REQUIRE(entries.front()->IsSeq());
    const CBioseq& b = entries.front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(kAccession,
                        b.GetId().front()->GetOther().GetAccession());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kLength, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();

}
1731
/// Uses the string " 1945386 \n " (a GI surrounded by blanks) as nucleotide
/// input.  The query is expected to be an unmasked interval 0..2771 on both
/// strands.  Its id is either the GI itself or, when
/// CSeq_id::PreferAccessionOverGi() is true, the GenBank id
/// HSU93236/U93236.1.  The same is checked on the Bioseq built from it.
BOOST_AUTO_TEST_CASE(ReadGiNuclWithFlankingSpacesIntoBuffer_Single)
{
    // Expected values
    const TSeqPos kLength(2772);
    const TGi     kGi = GI_CONST(1945386);
    const string  kGbName = "HSU93236";
    const string  kGbAccession = "U93236";
    const int     kGbVersion = 1;

    // N.B.: the extra newline causes the CFastaReader to throw an EOF exception
    string user_input(" 1945386 \n ");

    CRef<CObjectManager> om(CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(user_input, iconfig));

    CScope scope(*om);
    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    blast::SSeqLoc ssl = seqs.front();

    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());

    BOOST_REQUIRE(seqint.IsSetId());
    BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
    if (CSeq_id::PreferAccessionOverGi()) {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Genbank, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(kGbName,
                            seqint.GetId().GetGenbank().GetName());
        BOOST_REQUIRE_EQUAL(kGbAccession,
                            seqint.GetId().GetGenbank().GetAccession());
        BOOST_REQUIRE_EQUAL(kGbVersion,
                            seqint.GetId().GetGenbank().GetVersion());
    }
    else {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(kGi, seqint.GetId().GetGi());
    }

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    const CBioseq_set::TSeq_set& entries = bioseqs->GetSeq_set();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), entries.size());
    BOOST_REQUIRE(entries.front()->IsSeq());
    const CBioseq& b = entries.front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());

    // The best ranked id of the Bioseq follows the same preference
    CRef<CSeq_id> id = FindBestChoice(b.GetId(), CSeq_id::BestRank);
    BOOST_REQUIRE(id.NotNull());
    if (CSeq_id::PreferAccessionOverGi()) {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Genbank, id->Which());
        BOOST_REQUIRE_EQUAL(kGbName, id->GetGenbank().GetName());
        BOOST_REQUIRE_EQUAL(kGbAccession, id->GetGenbank().GetAccession());
        BOOST_REQUIRE_EQUAL(kGbVersion, id->GetGenbank().GetVersion());
    }
    else {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, id->Which());
        BOOST_REQUIRE_EQUAL(kGi, id->GetGi());
    }
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kLength, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();

}
1805
/// Uses the string " X65215.1 " (an accession surrounded by blanks) as
/// nucleotide input of a CBlastFastaInputSource.  The query is expected to be
/// an unmasked interval 0..623 on both strands.  Its id is either GI 555 or,
/// when CSeq_id::PreferAccessionOverGi() is true, the EMBL id X65215.1.
/// The Bioseq built from it must carry both the GI and the EMBL accession.
BOOST_AUTO_TEST_CASE(ReadAccessionNuclWithFlankingSpacesIntoBuffer_Single)
{
    // Expected values
    const TSeqPos kLength(624);
    const TGi     kGi = GI_CONST(555);
    const string  kAccession = "X65215";
    const int     kVersion = 1;

    string user_input(" X65215.1 ");

    CRef<CObjectManager> om(CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(false);
    CBlastFastaInputSource fasta_source(user_input, iconfig);
    CBlastInput source(&fasta_source);

    CScope scope(*om);
    BOOST_REQUIRE( !source.End() );
    blast::TSeqLocVector seqs = source.GetAllSeqLocs(scope);
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source.End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());

    BOOST_REQUIRE(seqint.IsSetId());
    BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );

    if (CSeq_id::PreferAccessionOverGi()) {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Embl, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(kAccession,
                            seqint.GetId().GetEmbl().GetAccession());
        BOOST_REQUIRE_EQUAL(kVersion,
                            seqint.GetId().GetEmbl().GetVersion());
    }
    else {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(kGi, seqint.GetId().GetGi());
    }

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    const CBioseq_set::TSeq_set& entries = bioseqs->GetSeq_set();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), entries.size());
    BOOST_REQUIRE(entries.front()->IsSeq());
    const CBioseq& b = entries.front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());

    // Look for the GI and for the EMBL accession among the ids of the Bioseq
    bool found_gi = false;
    bool found_accession = false;
    const CBioseq::TId& ids = b.GetId();
    for (CBioseq::TId::const_iterator id = ids.begin();
         id != ids.end();  ++id) {
        if ((*id)->Which() == CSeq_id::e_Gi) {
            BOOST_REQUIRE_EQUAL(kGi, (*id)->GetGi());
            found_gi = true;
        } else if ((*id)->Which() == CSeq_id::e_Embl) {
            CNcbiOstrstream os;
            (*id)->GetEmbl().AsFastaString(os);
            const string fasta_acc = CNcbiOstrstreamToString(os);
            BOOST_REQUIRE(NStr::Find(fasta_acc, kAccession) != NPOS);
            found_accession = true;
        }
    }
    BOOST_REQUIRE(found_gi);
    BOOST_REQUIRE(found_accession);
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kLength, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1878
/// Loads data/aa.129295 into a string and uses that string as protein input
/// of a CBlastFastaInputSource.  The query is expected to be an unmasked
/// interval 0..231 with unknown strand and a local id, and the Bioseq built
/// from it a raw amino acid sequence of length 232.
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineProteinIntoBuffer_Single)
{
    const TSeqPos kLength = 232;

    const char* fname("data/aa.129295");
    auto_ptr<string> user_input(s_FileContents2String(fname));

    CRef<CObjectManager> om(CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(true);
    CBlastFastaInputSource fasta_source(*user_input, iconfig);
    CBlastInput source(&fasta_source);

    CScope scope(*om);
    BOOST_REQUIRE( !source.End() );
    blast::TSeqLocVector seqs = source.GetAllSeqLocs(scope);
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source.End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_unknown),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());

    BOOST_REQUIRE(seqint.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, seqint.GetId().Which());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    BOOST_REQUIRE(!ssl.mask);

    // Check the Bioseq built from the query
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    const CBioseq_set::TSeq_set& entries = bioseqs->GetSeq_set();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), entries.size());
    BOOST_REQUIRE(entries.front()->IsSeq());
    const CBioseq& b = entries.front()->GetSeq();
    BOOST_REQUIRE(b.IsAa());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_aa, b.GetInst().GetMol());
    BOOST_REQUIRE_EQUAL(kLength, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();

}
1925
/// Reads data/aa.129295 as protein input with the range 50..100 configured
/// and checks that the first Seq-loc of the first batch covers exactly that
/// range.
BOOST_AUTO_TEST_CASE(RangeBoth)
{
    const TSeqPos kStart(50);
    const TSeqPos kStop(100);

    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRange().SetFrom(kStart).SetTo(kStop);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    blast::SSeqLoc ssl = batch.front();

    // The range must be visible both on the interval and on the Seq-loc
    const CSeq_interval& seqint = ssl.seqloc->GetInt();
    BOOST_REQUIRE_EQUAL(kStart, seqint.GetFrom());
    BOOST_REQUIRE_EQUAL(kStop, seqint.GetTo());
    BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetStart(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kStop, ssl.seqloc->GetStop(eExtreme_Positional));
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1946
/// Reads data/aa.129295 as protein input with only the start of the range
/// (50) configured and checks that the first Seq-loc of the first batch runs
/// from that position to 231, i.e. the expected sequence length (232) - 1.
BOOST_AUTO_TEST_CASE(RangeStartOnly)
{
    const TSeqPos kStart(50);
    const TSeqPos kLength(232);

    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRange().SetFrom(kStart);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    blast::SSeqLoc ssl = batch.front();

    const CSeq_interval& seqint = ssl.seqloc->GetInt();
    BOOST_REQUIRE_EQUAL(kStart, seqint.GetFrom());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());
    BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetStart(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kLength - 1,
                        ssl.seqloc->GetStop(eExtreme_Positional));
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1966
/// Reads data/aa.129295 as protein input with a range whose start (100) is
/// greater than its stop (50) and expects a CInputException with error code
/// eInvalidRange whose message contains "Invalid sequence range".
///
/// The exception is recorded in a flag instead of returning from the handler
/// so that the data loaders are revoked once the checks have passed (same
/// structure as the BadProtStrand test case).
BOOST_AUTO_TEST_CASE(RangeInvalid_FromGreaterThanTo)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRange().SetFrom(100);
    iconfig.SetRange().SetTo(50);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool caught_exception(false);
    try { source->GetNextSeqLocBatch(scope).front(); }
    catch (const CInputException& e) {
        string msg(e.what());
        BOOST_REQUIRE(msg.find("Invalid sequence range") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidRange, e.GetErrCode());
        caught_exception = true;
    }
    // Retrieving the query without an exception is a failure
    BOOST_REQUIRE(caught_exception);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1987
/// Reads data/aa.129295 as protein input with the start of the range set to
/// 1000 and expects a CInputException with error code eInvalidRange whose
/// message contains "Invalid from coordinate".
///
/// The exception is recorded in a flag instead of returning from the handler
/// so that the data loaders are revoked once the checks have passed (same
/// structure as the BadProtStrand test case).
BOOST_AUTO_TEST_CASE(RangeInvalid_FromGreaterThanSequenceLength)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRange().SetFrom(1000);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool caught_exception(false);
    try { source->GetNextSeqLocBatch(scope).front(); }
    catch (const CInputException& e) {
        string msg(e.what());
        BOOST_REQUIRE(msg.find("Invalid from coordinate") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidRange, e.GetErrCode());
        caught_exception = true;
    }
    // Retrieving the query without an exception is a failure
    BOOST_REQUIRE(caught_exception);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2007
/// Reads data/aa.129295 as protein input with the range 10..232, i.e. with
/// a stop equal to the expected sequence length.  No exception is expected:
/// the returned interval must start at 10 and stop at 231.
BOOST_AUTO_TEST_CASE(RangeInvalid_ToEqualThanSequenceLength)
{
    const TSeqPos kFrom(10);
    const TSeqPos kLength(232);

    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRange().SetFrom(10);
    iconfig.SetRange().SetTo(kLength);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    blast::SSeqLoc ssl = batch.front();

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_unknown),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(kFrom, seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2033
/// Reads data/aa.129295 as protein input with the range 10..464, i.e. with
/// a stop of twice the expected sequence length.  No exception is expected:
/// the returned interval must start at 10 and stop at 231.
BOOST_AUTO_TEST_CASE(RangeInvalid_ToGreaterThanSequenceLength)
{
    const TSeqPos kFrom(10);
    const TSeqPos kLength(232);

    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRange().SetFrom(10);
    iconfig.SetRange().SetTo(kLength*2);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    blast::SSeqLoc ssl = batch.front();

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& seqint = ssl.seqloc->GetInt();

    BOOST_REQUIRE(seqint.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_unknown),
                        static_cast<int>(seqint.GetStrand()));

    BOOST_REQUIRE(seqint.IsSetFrom());
    BOOST_REQUIRE_EQUAL(kFrom, seqint.GetFrom());

    BOOST_REQUIRE(seqint.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, seqint.GetTo());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2059
/// Reads data/aa.129295 as protein input with "believe deflines" enabled.
/// The id of the query must not be local: it is expected to be GI 129295 or,
/// when CSeq_id::PreferAccessionOverGi() is true, the Swiss-Prot id
/// OVAX_CHICK / P01013 with release "reviewed".  The id is checked both
/// through the Seq-loc and through its interval.
BOOST_AUTO_TEST_CASE(ParseDefline)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetBelieveDeflines(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    // Expected values
    const TGi    kGi = GI_CONST(129295);
    const string kName = "OVAX_CHICK";
    const string kAccession = "P01013";
    const string kRelease = "reviewed";

    blast::TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    blast::SSeqLoc ssl = batch.front();
    BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );

    if (CSeq_id::PreferAccessionOverGi()) {
        const CSeq_id* loc_id = ssl.seqloc->GetId();
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, loc_id->Which());
        BOOST_REQUIRE_EQUAL(kName, loc_id->GetSwissprot().GetName());
        BOOST_REQUIRE_EQUAL(kAccession,
                            loc_id->GetSwissprot().GetAccession());
        BOOST_REQUIRE_EQUAL(kRelease, loc_id->GetSwissprot().GetRelease());

        const CSeq_id& int_id = ssl.seqloc->GetInt().GetId();
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, int_id.Which());
        BOOST_REQUIRE_EQUAL(kName, int_id.GetSwissprot().GetName());
        BOOST_REQUIRE_EQUAL(kAccession,
                            int_id.GetSwissprot().GetAccession());
        BOOST_REQUIRE_EQUAL(kRelease, int_id.GetSwissprot().GetRelease());
    }
    else {
        const CSeq_id* loc_id = ssl.seqloc->GetId();
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, loc_id->Which());
        BOOST_REQUIRE_EQUAL(kGi, loc_id->GetGi());

        const CSeq_id& int_id = ssl.seqloc->GetInt().GetId();
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, int_id.Which());
        BOOST_REQUIRE_EQUAL(kGi, int_id.GetGi());
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2094
/// Reads data/aa.129295 as protein input with the strand set to
/// eNa_strand_both and expects a CInputException with error code
/// eInvalidStrand whose message contains
/// "Cannot assign nucleotide strand to protein".  After the exception the
/// input is expected to be at its end.
BOOST_AUTO_TEST_CASE(BadProtStrand)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetStrand(eNa_strand_both);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool exception_seen(false);
    try {
        blast::SSeqLoc ssl = source->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& e) {
        const string what(e.what());
        BOOST_REQUIRE(what.find("Cannot assign nucleotide strand to protein")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidStrand, e.GetErrCode());
        exception_seen = true;
    }
    BOOST_REQUIRE(exception_seen);
    BOOST_REQUIRE(source->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2117
/// Reads data/nt.cat as nucleotide input with the strand set to
/// eNa_strand_both and expects two queries with local ids, both strands and
/// lengths 646 and 360.  A second request for queries must return nothing.
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineNucl_Multiple)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool kIsProtein(false);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetStrand(eNa_strand_both);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    const size_t kNumQueries(2);
    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End());

    // Once the input is exhausted, asking again must return nothing
    {{
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL(static_cast<size_t>(0),
                            static_cast<size_t>(cached_queries.size()));
        BOOST_REQUIRE(source->End());
    }}

    // First query
    const blast::SSeqLoc& first = query_vector.front();
    const TSeqPos kFirstLength = 646;

    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(first.seqloc->GetStrand()));
    BOOST_REQUIRE_EQUAL(kFirstLength - 1,
                        first.seqloc->GetStop(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(first.seqloc->GetInt().GetStrand()));
    BOOST_REQUIRE_EQUAL(kFirstLength - 1, first.seqloc->GetInt().GetTo());
    BOOST_REQUIRE(blast::IsLocalId(first.seqloc->GetId()));

    // Second query; only this one is checked for the absence of a mask
    const blast::SSeqLoc& second = query_vector.back();
    const TSeqPos kSecondLength = 360;

    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(second.seqloc->GetStrand()));
    BOOST_REQUIRE_EQUAL(kSecondLength - 1,
                        second.seqloc->GetStop(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(second.seqloc->GetInt().GetStrand()));
    BOOST_REQUIRE_EQUAL(kSecondLength - 1, second.seqloc->GetInt().GetTo());
    BOOST_REQUIRE(blast::IsLocalId(second.seqloc->GetId()));
    BOOST_REQUIRE(!second.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2158
/// Reads data/nt.cat as nucleotide input twice, first with the plus strand
/// and then with the minus strand configured, and checks that every query
/// returned carries the configured strand and a local id.
BOOST_AUTO_TEST_CASE(NuclStrand)
{
    const char* fname("data/nt.cat");
    const bool kIsProtein(false);
    CBlastInputSourceConfig iconfig(kIsProtein);
    CScope scope(*CObjectManager::GetInstance());

    // Strands to test, in this order
    const ENa_strand kStrands[] = { eNa_strand_plus, eNa_strand_minus };
    const size_t kNumStrands = sizeof(kStrands) / sizeof(kStrands[0]);

    for (size_t i = 0; i < kNumStrands; ++i) {
        // The file is reopened for every strand
        CNcbiIfstream infile(fname);
        const ENa_strand strand(kStrands[i]);
        iconfig.SetStrand(strand);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        TSeqLocVector seqs = source->GetAllSeqLocs(scope);

        for (TSeqLocVector::const_iterator itr = seqs.begin();
             itr != seqs.end();  ++itr) {
            const blast::SSeqLoc& ssl = *itr;
            BOOST_REQUIRE_EQUAL(static_cast<int>(strand),
                                static_cast<int>(ssl.seqloc->GetStrand()));
            BOOST_REQUIRE_EQUAL(static_cast<int>(strand),
                                static_cast<int>(ssl.seqloc->GetInt().GetStrand()));
            BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));
        }
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2199
/// Checks the defaults of a nucleotide CBlastInputSourceConfig (deflines not
/// believed, no lowercase masking, both strands), then reads data/nt.cat
/// with lowercase masking enabled.  The first query is expected to carry a
/// packed-int mask with the plus-strand intervals 126..167 and 330..356;
/// the mask of the second query is expected to be set to a Null Seq-loc.
BOOST_AUTO_TEST_CASE(NuclLcaseMask_TSeqLocVector)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool kIsProtein(false);
    CBlastInputSourceConfig iconfig(kIsProtein);
    BOOST_REQUIRE( !iconfig.GetBelieveDeflines() );
    BOOST_REQUIRE( !iconfig.GetLowercaseMask() );
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(iconfig.GetStrand()));
    iconfig.SetLowercaseMask(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector seqs = source->GetNextSeqLocBatch(scope);

    // First query: two masked intervals
    blast::SSeqLoc ssl = seqs[0];
    BOOST_REQUIRE(ssl.mask);
    BOOST_REQUIRE(ssl.mask->IsPacked_int());

    CPacked_seqint::Tdata masklocs = ssl.mask->GetPacked_int();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(2), masklocs.size());

    const CSeq_interval& first_mask = *masklocs.front();
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(126), first_mask.GetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(167), first_mask.GetTo());
    // any masks read from the file are expected to be in the plus strand
    BOOST_REQUIRE(first_mask.CanGetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_plus),
                        static_cast<int>(first_mask.GetStrand()));

    const CSeq_interval& last_mask = *masklocs.back();
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(330), last_mask.GetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(356), last_mask.GetTo());
    // any masks read from the file are expected to be in the plus strand
    BOOST_REQUIRE(last_mask.CanGetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_plus),
                        static_cast<int>(last_mask.GetStrand()));

    // Second query: the mask is present but Null
    ssl = seqs[1];
    BOOST_REQUIRE(ssl.mask);
    BOOST_REQUIRE(ssl.mask->IsNull());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2237
/// Reads data/nt.cat as nucleotide input with lowercase masking enabled and
/// retrieves the queries as a CBlastQueryVector.  The masked regions of the
/// first query, converted to a CPacked_seqint, are expected to be the
/// intervals 126..167 and 330..356, each one reported for the plus and then
/// for the minus strand.  The second query is expected to have no masked
/// regions.
BOOST_AUTO_TEST_CASE(NuclLcaseMask_BlastQueryVector)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool kIsProtein(false);
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetLowercaseMask(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    CRef<blast::CBlastQueryVector> seqs = source->GetNextSeqBatch(scope);
    BOOST_REQUIRE( !seqs->Empty() );
    BOOST_REQUIRE_EQUAL(static_cast<int>(2), static_cast<int>(seqs->size()));
    CRef<blast::CBlastSearchQuery> query = (*seqs)[0];
    BOOST_REQUIRE( !query->GetMaskedRegions().empty());

    CRef<CPacked_seqint> masks =
        query->GetMaskedRegions().ConvertToCPacked_seqint();
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(4), masks->Get().size());

    // Note that for this case, the masks even though are also read from the
    // file (as the unit test above), these are returned for both strands.
    const size_t kNumMasks = 4;
    const TSeqPos kFrom[kNumMasks] = { 126, 126, 330, 330 };
    const TSeqPos kTo[kNumMasks] = { 167, 167, 356, 356 };
    const ENa_strand kStrand[kNumMasks] = {
        eNa_strand_plus, eNa_strand_minus, eNa_strand_plus, eNa_strand_minus
    };

    CPacked_seqint::Tdata::const_iterator itr = masks->Get().begin();
    for (size_t i = 0; i < kNumMasks; ++i, ++itr) {
        BOOST_REQUIRE_EQUAL(kFrom[i], (*itr)->GetFrom());
        BOOST_REQUIRE_EQUAL(kTo[i], (*itr)->GetTo());
        BOOST_REQUIRE((*itr)->CanGetStrand());
        BOOST_REQUIRE_EQUAL(static_cast<int>(kStrand[i]),
                            static_cast<int>((*itr)->GetStrand()));
    }

    BOOST_REQUIRE(itr == masks->Get().end());

    query = (*seqs)[1];
    BOOST_REQUIRE(query->GetMaskedRegions().empty());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2289
/// Reads all the queries of data/aa.cat as protein input and expects 19 of
/// them, with the input at its end afterwards.
BOOST_AUTO_TEST_CASE(MultiSeq)
{
    const size_t kNumQueries(19);

    CNcbiIfstream infile("data/aa.cat");
    const bool kIsProtein(true);
    CBlastInputSourceConfig iconfig(kIsProtein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(kNumQueries, queries.size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2303
// Applies the range [50, 100] to every sequence read from data/aa.cat and
// checks that each resulting sequence location starts and stops exactly there.
BOOST_AUTO_TEST_CASE(MultiRange)
{
    CNcbiIfstream infile("data/aa.cat");
    const bool is_protein(true);
    const TSeqPos start(50);
    const TSeqPos stop(100);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRange().SetFrom(start).SetTo(stop);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector v = source->GetAllSeqLocs(scope);

    // Every range check below lives inside the loop; without this guard the
    // test would pass vacuously if nothing had been read from the input.
    BOOST_REQUIRE( !v.empty() );

    for (const auto& loc : v) {
        BOOST_REQUIRE_EQUAL(start, loc.seqloc->GetStart(eExtreme_Positional));
        BOOST_REQUIRE_EQUAL(stop, loc.seqloc->GetStop(eExtreme_Positional));
        BOOST_REQUIRE_EQUAL(start, loc.seqloc->GetInt().GetFrom());
        BOOST_REQUIRE_EQUAL(stop, loc.seqloc->GetInt().GetTo());
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2324
// Reads data/aa.cat in batches (batch size argument 5000, deflines believed)
// and, for each of the three expected batches, verifies the number of
// sequences, the stop coordinate of the first sequence and the Seq-id of the
// first sequence.  Which Seq-id type is expected (GI or Swissprot) depends on
// CSeq_id::PreferAccessionOverGi().
BOOST_AUTO_TEST_CASE(MultiBatch)
{
    CNcbiIfstream infile("data/aa.cat");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetBelieveDeflines(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig, 5000));
    CScope scope(*CObjectManager::GetInstance());

    // All batches are expected to carry the same Swissprot release string
    const string release = "reviewed";

    // Fetches the next batch and checks its size plus the first sequence's
    // stop coordinate and identifier (seen both through the interval and
    // through the Seq-loc itself).
    auto verify_next_batch = [&](size_t        num_seqs,
                                 TSeqPos       first_stop,
                                 TGi           gi,
                                 const string& name,
                                 const string& accession)
    {
        const blast::TSeqLocVector v = source->GetNextSeqLocBatch(scope);
        BOOST_REQUIRE_EQUAL(num_seqs, v.size());
        BOOST_REQUIRE_EQUAL(first_stop, v[0].seqloc->GetInt().GetTo());
        BOOST_REQUIRE( !blast::IsLocalId(v[0].seqloc->GetId()) );

        if ( !CSeq_id::PreferAccessionOverGi() ) {
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetInt().GetId().Which());
            BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetInt().GetId().GetGi());
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetId()->Which());
            BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetId()->GetGi());
        }
        else {
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetInt().GetId().Which());
            BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetInt().GetId().GetSwissprot().GetName());
            BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetInt().GetId().GetSwissprot().GetAccession());
            BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetInt().GetId().GetSwissprot().GetRelease());
            BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetId()->Which());
            BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetId()->GetSwissprot().GetName());
            BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetId()->GetSwissprot().GetAccession());
            BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetId()->GetSwissprot().GetRelease());
        }
    };

    verify_next_batch(7, 530, GI_CONST(1346057), "G11A_ORYSA", "P47997");
    verify_next_batch(8, 445, GI_CONST(1170625), "KCC1_YEAST", "P27466");
    verify_next_batch(4, 688, GI_CONST(114152),  "ARK1_HUMAN", "P25098");

    BOOST_REQUIRE(source->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2415
// Reads data/tiny.fa as nucleotide input with the default configuration and
// expects the input to be consumed completely, yielding a single sequence.
BOOST_AUTO_TEST_CASE(NoDeflineExpected)
{
    const bool kIsProtein = false;
    CNcbiIfstream fasta_stream("data/tiny.fa");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    const blast::TSeqLocVector locs = input->GetAllSeqLocs(scope);

    BOOST_REQUIRE(input->End());
    BOOST_REQUIRE_EQUAL((size_t)1, locs.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2429
// Reads data/tiny.fa as nucleotide input while asking the reader to believe
// deflines; reading all sequences is expected to throw a CException.
BOOST_AUTO_TEST_CASE(NoDeflineUnexpected)
{
    CNcbiIfstream infile("data/tiny.fa");
    // Make sure the exception asserted below is caused by the file contents
    // and not by a data file that could not be opened.
    BOOST_REQUIRE(infile);
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetBelieveDeflines(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE_THROW(source->GetAllSeqLocs(scope), CException);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
// Two newline-separated "gb|" identifiers, read as nucleotide input without
// retrieving sequence data, must produce exactly two sequence locations.
BOOST_AUTO_TEST_CASE(wb325_1)
{
    const string identifiers("gb|ABZI01000088\ngb|ABZN01000067");
    istringstream id_stream(identifiers);

    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !input->End() );
    const blast::TSeqLocVector locs = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE(input->End());
    BOOST_REQUIRE_EQUAL(2u, locs.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2459
// Same two "gb|" identifiers as wb325_1 but listed in the opposite order;
// exactly two sequence locations are expected again.
BOOST_AUTO_TEST_CASE(wb325_2)
{
    const string identifiers("gb|ABZN01000067\ngb|ABZI01000088");
    istringstream id_stream(identifiers);

    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !input->End() );
    const blast::TSeqLocVector locs = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE(input->End());
    BOOST_REQUIRE_EQUAL(2u, locs.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2478
// A single "gb|" identifier, read as nucleotide input without retrieving
// sequence data, must produce exactly one sequence location.
BOOST_AUTO_TEST_CASE(wb325_single1)
{
    const string identifier("gb|ABZN01000067");
    //const string identifier("218001205");
    istringstream id_stream(identifier);

    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !input->End() );
    const blast::TSeqLocVector locs = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE(input->End());
    BOOST_REQUIRE_EQUAL(1u, locs.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2498
// Counterpart of wb325_single1 for the other "gb|" identifier; exactly one
// sequence location is expected.
BOOST_AUTO_TEST_CASE(wb325_single2)
{
    const string identifier("gb|ABZI01000088");
    //const string identifier("217999527");
    istringstream id_stream(identifier);

    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !input->End() );
    const blast::TSeqLocVector locs = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE(input->End());
    BOOST_REQUIRE_EQUAL(1u, locs.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2518
// Feeds the identifier "1QCF_A" as protein input (sequence data retrieval
// switched off) and verifies the resulting Seq-loc: a whole-sequence interval
// [0, 453] with unknown strand, a PDB Seq-id whose molecule is "1QCF", and no
// mask.  The Bioseq derived from it must be a raw, non-nucleotide sequence of
// length 454 carrying the same PDB molecule name.
BOOST_AUTO_TEST_CASE(ReadSinglePdb)
{
    const string pdb_mol("1QCF");
    const string pdb_chain("A");
    const string pdb_id(pdb_mol + '_' + pdb_chain);
    istringstream id_stream(pdb_id);

    const bool kIsProtein = true;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !input->End() );
    blast::TSeqLocVector seqs = input->GetAllSeqLocs(scope);
    blast::SSeqLoc ssl = seqs.front();

    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(454);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, interval.GetId().Which());

    BOOST_REQUIRE_EQUAL(pdb_mol, interval.GetId().GetPdb().GetMol().Get());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());

    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE( !bioseq.IsNa() );
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, bioseq.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(pdb_mol, bioseq.GetId().front()->GetPdb().GetMol().Get());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, bioseq.GetInst().GetRepr());
    BOOST_REQUIRE( !CSeq_inst::IsNa(bioseq.GetInst().GetMol()) );
    BOOST_REQUIRE_EQUAL(length, bioseq.GetInst().GetLength());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2570
// The identifier below refers to a record that contains no sequence; reading
// it as nucleotide input is expected to raise a CInputException.
BOOST_AUTO_TEST_CASE(ThrowOnEmptySequence)
{
    const string wgs_master("NZ_ABFD00000000.2");   // Contains no sequence
    istringstream id_stream(wgs_master);

    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE_THROW(input->GetAllSeqLocs(scope), CInputException);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2584
// Reads the identifier stored in data/sra_seqid.txt through a configuration
// that uses the GenBank data loader and verifies that
//  - the Seq-loc is a whole-sequence interval with a general (SRA) Seq-id,
//  - the sequence obtained from the scope matches kSeqData base by base,
//  - the Bioseq built by TSeqLocVector2Bioseqs carries the same IUPACna data.
BOOST_AUTO_TEST_CASE(FetchSraID)
{
    CNcbiIfstream infile("data/sra_seqid.txt");
    const bool is_protein(false);
    SDataLoaderConfig dlconfig(is_protein,
                               SDataLoaderConfig::eUseGenbankDataLoader);
    CBlastInputSourceConfig iconfig(dlconfig);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined; fail cleanly instead
    BOOST_REQUIRE( !seqs.empty() );
    blast::SSeqLoc ssl = seqs.front();
    BOOST_CHECK(source->End() == true);

    // Obtained by running
    // fastq-dump SRR066117 -N 18823 -X 18823 --fasta 80 --split-spot --skip-technical --minReadLen 6 --clip
    const string kSeqData =
        "AGCACCACGACTGCTAACCGTAACGCCAGGTGTATAACCTAATGCTTCTTTACAGACTGAAATTGATGCATCTGCATCTC"
        "TTCATTTGTCACAACCGAAATA";

    BOOST_CHECK(ssl.seqloc->IsInt());
    BOOST_REQUIRE(ssl.seqloc->GetId()->IsGeneral());
    BOOST_REQUIRE_EQUAL(CDbtag::eDbtagType_SRA,
                        ssl.seqloc->GetId()->GetGeneral().GetType());

    BOOST_CHECK(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_CHECK_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_CHECK(ssl.seqloc->GetInt().IsSetTo() == true);
    BOOST_CHECK_EQUAL(kSeqData.size()-1, ssl.seqloc->GetInt().GetTo());

    const CSeq_id * seqid = ssl.seqloc->GetId();
    CBioseq_Handle bh = scope.GetBioseqHandle(*seqid);
    CSeqVector sv = bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);

    // Compare the sequence as seen through the scope.  The length mismatch
    // (if any) is reported by the check; the loop is bounded by the shorter
    // of the two so that it never reads past either end.
    BOOST_CHECK_EQUAL(kSeqData.size(), sv.size());
    const size_t sv_common = std::min((TSeqPos)kSeqData.size(), sv.size());
    for (size_t i = 0; i < sv_common; i++) {
        CNcbiOstrstream oss;
        oss << "Base number " << i+1 << " differs: got '"
            << (char)sv[i] << "', expected '" << kSeqData[i]
            << "'";
        string msg = CNcbiOstrstreamToString(oss);
        BOOST_CHECK_MESSAGE((char)sv[i] == kSeqData[i], msg);
        BOOST_CHECK_NE('-', (char)sv[i]);
    }

    // Compare the sequence data stored in the generated Bioseq
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    const CSeq_inst& inst = bioseq.GetInst();
    BOOST_CHECK_EQUAL(inst.GetLength(), kSeqData.size());
    BOOST_REQUIRE(inst.IsSetSeq_data());
    const CSeq_data& seq_data = inst.GetSeq_data();
    BOOST_REQUIRE(seq_data.IsIupacna());
    const string& seq = seq_data.GetIupacna().Get();
    // This loop must look at 'seq' (the Bioseq data), not at 'sv' again,
    // and must not index kSeqData beyond its end.
    const size_t seq_common = std::min(seq.size(), kSeqData.size());
    for (size_t i = 0; i < seq_common; i++) {
        CNcbiOstrstream oss;
        oss << "Base number " << i+1 << " differs: got '"
            << seq[i] << "', expected '" << kSeqData[i]
            << "'";
        string msg = CNcbiOstrstreamToString(oss);
        BOOST_CHECK_MESSAGE(seq[i] == kSeqData[i], msg);
        BOOST_CHECK_NE('-', (char)seq[i]);
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2650
// Runs the single-PDB-identifier checks twice for molecule "1IQR", chain "A":
// first with the identifier spelled "1IQR|A", then spelled "1IQR_A".  Both
// spellings must give a whole-sequence interval [0, 419] with unknown strand,
// a PDB Seq-id for "1IQR", no mask, and a raw non-nucleotide Bioseq of
// length 420.
BOOST_AUTO_TEST_CASE(ReadSinglePdb_InDifferentFormats)
{
    const string pdb_mol("1IQR");
    const string pdb_chain("A");

    const string spellings[] = {
        pdb_mol + '|' + pdb_chain,
        pdb_mol + "_" + pdb_chain
    };

    for (const string& pdb_id : spellings) {

        istringstream id_stream(pdb_id);

        const bool kIsProtein = true;
        CBlastInputSourceConfig config(kIsProtein);
        config.SetRetrieveSeqData(false);
        CRef<CBlastInput> input(s_DeclareBlastInput(id_stream, config));
        CScope scope(*CObjectManager::GetInstance());

        BOOST_REQUIRE( !input->End() );
        blast::TSeqLocVector seqs = input->GetAllSeqLocs(scope);
        blast::SSeqLoc ssl = seqs.front();
        BOOST_REQUIRE(input->End());

        BOOST_REQUIRE(ssl.seqloc->IsInt());
        const CSeq_interval& interval = ssl.seqloc->GetInt();

        BOOST_REQUIRE(interval.IsSetStrand());
        BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)interval.GetStrand());

        BOOST_REQUIRE(interval.IsSetFrom());
        BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

        BOOST_REQUIRE(interval.IsSetTo());
        const TSeqPos length(420);
        BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

        BOOST_REQUIRE(interval.IsSetId());
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, interval.GetId().Which());

        BOOST_REQUIRE_EQUAL(pdb_mol, interval.GetId().GetPdb().GetMol().Get());

        BOOST_REQUIRE(!ssl.mask);

        /// Validate the data that would be retrieved by blast.cgi
        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
        BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
        BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());

        const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
        BOOST_REQUIRE( !bioseq.IsNa() );
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, bioseq.GetId().front()->Which());
        BOOST_REQUIRE_EQUAL(pdb_mol, bioseq.GetId().front()->GetPdb().GetMol().Get());
        BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, bioseq.GetInst().GetRepr());
        BOOST_REQUIRE( !CSeq_inst::IsNa(bioseq.GetInst().GetMol()) );
        BOOST_REQUIRE_EQUAL(length, bioseq.GetInst().GetLength());

        scope.GetObjectManager().RevokeAllDataLoaders();
    }

}
2712
// With the BLASTINPUT_GEN_DELTA_SEQ environment variable set to an empty
// string, reads data/nucl_w_n.fsa as nucleotide input and verifies that the
// result is a local-id, both-strand, whole-sequence interval [0, 681] without
// a mask, and that the Bioseq built from it has a raw representation.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces_UpperCaseWithN)
{
    CNcbiEnvironment().Set("BLASTINPUT_GEN_DELTA_SEQ", kEmptyStr);

    // this has length 682 and contains an 'N' which without the
    // CFastaReader::fNoSplit flag, produces a delta sequence
    CNcbiIfstream fasta_stream("data/nucl_w_n.fsa");
    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE( !input->End() );
    TSeqLocVector seqs = input->GetAllSeqLocs(scope);
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(682);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());
    BOOST_REQUIRE(!ssl.mask);

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE(bioseqs->CanGetSeq_set());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->GetSeq().CanGetInst());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->GetSeq().GetInst().CanGetRepr());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw,
                        bioseqs->GetSeq_set().front()->GetSeq().GetInst().GetRepr());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2755
/// Accepts any object by const reference and does nothing with it.  The tests
/// below call it on otherwise-unused variables to pacify compiler warnings.
template <typename T>
inline void s_Ignore(const T& /* unused */)
{
}
2758
// ParseSequenceRange must reject "4-4" with a CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_EmptyRange)
{
    TSeqRange parsed;
    BOOST_REQUIRE_THROW(parsed = ParseSequenceRange("4-4"), CBlastException);
    s_Ignore(parsed);   // to pacify compiler warnings
}
2765
// ParseSequenceRange must reject "0-4" with a CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_0BasedRange)
{
    TSeqRange parsed;
    BOOST_REQUIRE_THROW(parsed = ParseSequenceRange("0-4"), CBlastException);
    s_Ignore(parsed);   // to pacify compiler warnings
}
2772
// ParseSequenceRange must reject "3,4" (comma instead of dash) with a
// CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_InvalidDelimiter)
{
    TSeqRange parsed;
    BOOST_REQUIRE_THROW(parsed = ParseSequenceRange("3,4"), CBlastException);
    s_Ignore(parsed);   // to pacify compiler warnings
}
2779
// ParseSequenceRange must reject ranges with a missing endpoint ("3", "3-"
// and "-3") with a CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_IncompleteRange)
{
    static const char* const kIncompleteRanges[] = { "3", "3-", "-3" };

    TSeqRange parsed;
    for (const char* text : kIncompleteRanges) {
        BOOST_REQUIRE_THROW(parsed = ParseSequenceRange(text),
                            CBlastException);
    }
    s_Ignore(parsed);   // to pacify compiler warnings
}
2790
// ParseSequenceRange must reject "9-4", "-4-2" and "-4-9" with a
// CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_InvalidRange) {
    TSeqRange r;
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("9-4"),
                        CBlastException);
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("-4-2"),
                        CBlastException);
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("-4-9"),
                        CBlastException);
    s_Ignore(r); /* to pacify compiler warnings, as in the sibling tests */
}
2800
// The text "1-10" must parse into a range whose From is 0, whose To is 9 and
// whose open-ended To is 10.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_1BasedRange)
{
    const TSeqRange parsed = ParseSequenceRange("1-10");

    BOOST_REQUIRE_EQUAL(0U, parsed.GetFrom());
    BOOST_REQUIRE_EQUAL(9U, parsed.GetTo());
    BOOST_REQUIRE_EQUAL(10U, parsed.GetToOpen());
}
2807
// Pins the values GetQueryBatchSize returns for eBlastn: 100000 with the
// default arguments and 10000 when called with (eBlastn, false, true).
BOOST_AUTO_TEST_CASE(CheckQueryBatchSize)
{
    const auto default_batch = GetQueryBatchSize(eBlastn);
    BOOST_REQUIRE_EQUAL(100000, default_batch);

    const auto alternate_batch = GetQueryBatchSize(eBlastn, false, true);
    BOOST_REQUIRE_EQUAL(10000, alternate_batch);
}
2812
2813 // Test case for WB-1304: save GI (i.e.: best ranked Seq-id) if available
BOOST_AUTO_TEST_CASE(FetchGiFromAccessionInput)
{
    // Expected identifier of the query, depending on whether accessions are
    // preferred over GIs
    const CSeq_id expected_id(CSeq_id::PreferAccessionOverGi() ?
                              "ref|NT_026437.13|" : "gi|568802206");
    const string accession("NT_026437.13");

    // Data loader configurations to exercise, each with a label used in the
    // failure message
    typedef pair<SDataLoaderConfig::EConfigOpts, string> TLoaderOpt;
    vector<TLoaderOpt> loader_opts;
    loader_opts.push_back(TLoaderOpt(SDataLoaderConfig::eUseGenbankDataLoader, "genbank"));
    loader_opts.push_back(TLoaderOpt(SDataLoaderConfig::eUseBlastDbDataLoader, "BLASTDB"));

    for (const TLoaderOpt& opt : loader_opts) {
        CAutoNcbiConfigFile acf(opt.first);
        blast::SDataLoaderConfig dlconfig(false);
        if (opt.second == "BLASTDB") {
            dlconfig.m_BlastDbName = "refseq_genomic";
        }
        dlconfig.OptimizeForWholeLargeSequenceRetrieval();

        // Note: SetRetrieveSeqData(false) must NOT be called on this
        // configuration, otherwise this test does not work.
        blast::CBlastInputSourceConfig input_config(dlconfig);
        CBlastFastaInputSource fasta_input(accession, input_config);
        CBlastInput blast_input(&fasta_input);

        CRef<CScope> scope = CBlastScopeSource(dlconfig).NewScope();
        TSeqLocVector query_loc = blast_input.GetAllSeqLocs(*scope);
        BOOST_REQUIRE_EQUAL(1U, query_loc.size());

        const string fasta_id = expected_id.AsFastaString();
        const string fasta_query = query_loc[0].seqloc->GetId()->AsFastaString();
        const bool ids_differ = (fasta_id != fasta_query);
        if (ids_differ) {
            // Report both the differing values and the loader in use
            BOOST_CHECK_EQUAL(fasta_id, fasta_query);
            BOOST_CHECK_MESSAGE(fasta_id == fasta_query,
                                "Failed using " + opt.second + " data loader");
        }
        scope->GetObjectManager().RevokeAllDataLoaders();
    }

}
2850
2851
2852 BOOST_AUTO_TEST_SUITE_END() // end of blastinput test suite
2853
2854
BOOST_AUTO_TEST_SUITE(short_reads)2855 BOOST_AUTO_TEST_SUITE(short_reads)
2856
2857 static int s_GetSegmentFlags(const CBioseq& bioseq)
2858 {
2859 int retval = 0;
2860
2861 BOOST_REQUIRE(bioseq.IsSetDescr());
2862 for (auto desc : bioseq.GetDescr().Get()) {
2863 if (desc->Which() == CSeqdesc::e_User) {
2864
2865 if (!desc->GetUser().IsSetType() ||
2866 !desc->GetUser().GetType().IsStr() ||
2867 desc->GetUser().GetType().GetStr() != "Mapping") {
2868 continue;
2869 }
2870
2871 BOOST_REQUIRE(desc->GetUser().HasField("has_pair"));
2872 const CUser_field& field = desc->GetUser().GetField("has_pair");
2873 BOOST_REQUIRE(field.GetData().IsInt());
2874
2875 retval = field.GetData().GetInt();
2876 }
2877 }
2878
2879 return retval;
2880 }
2881
s_GetSequenceId(const CBioseq & bioseq)2882 static string s_GetSequenceId(const CBioseq& bioseq)
2883 {
2884 string retval;
2885 if (bioseq.IsSetDescr()) {
2886 for (auto it: bioseq.GetDescr().Get()) {
2887 if (it->IsTitle()) {
2888 vector<string> tokens;
2889 NStr::Split(it->GetTitle(), " ", tokens);
2890 retval = (string)"lcl|" + tokens[0];
2891 }
2892 }
2893 }
2894
2895 if (retval.empty()) {
2896 retval = bioseq.GetFirstId()->AsFastaString();
2897 }
2898
2899 return retval;
2900 }
2901
2902
// Reads data/paired_reads.fa through CShortReadFastaInputSource (eFasta
// format, third constructor argument true) and checks that six sequences are
// returned and that each one carries the segment flag expected for its id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFasta)
{
    CNcbiIfstream fasta_stream("data/paired_reads.fa");
    BOOST_REQUIRE(fasta_stream);

    // Expected segment flag for every sequence id
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(fasta_stream,
                                     CShortReadFastaInputSource::eFasta,
                                     true);

    CBlastInputOMF input(&reads, 1000);
    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        const string msg = (string)"Segment flag for " +
                           id + " is different from expected " +
                           NStr::IntToString(flags) + " != " +
                           NStr::IntToString(expected);
        BOOST_REQUIRE_MESSAGE(flags == expected, msg);
        ++num_checked;
    }

    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
2941
// Reads data/paired_reads_1.fa and data/paired_reads_2.fa together through
// the two-stream CShortReadFastaInputSource constructor (eFasta format) and
// checks that six sequences are returned, each with the segment flag expected
// for its id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoFastaFiles)
{
    CNcbiIfstream first_stream("data/paired_reads_1.fa");
    CNcbiIfstream second_stream("data/paired_reads_2.fa");
    BOOST_REQUIRE(first_stream);
    BOOST_REQUIRE(second_stream);

    // Expected segment flag for every sequence id
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(first_stream, second_stream,
                                     CShortReadFastaInputSource::eFasta);

    CBlastInputOMF input(&reads, 1000);
    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        const string msg = (string)"Segment flag for " +
                           id + " is different from expected " +
                           NStr::IntToString(flags) + " != " +
                           NStr::IntToString(expected);
        BOOST_REQUIRE_MESSAGE(flags == expected, msg);
        ++num_checked;
    }

    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
2981
// Reads data/paired_reads.fa through CShortReadFastaInputSource (eFasta
// format, third constructor argument false) and checks that six sequences are
// returned and that every sequence that has descriptors carries a segment
// flag of 0.
BOOST_AUTO_TEST_CASE(TestSingleReadsFromFasta) {

    CNcbiIfstream istr("data/paired_reads.fa");
    // Same stream check as the other short_reads tests: report a missing
    // data file directly instead of as a wrong sequence count.
    BOOST_REQUIRE(istr);
    CShortReadFastaInputSource input_source(istr,
                                   CShortReadFastaInputSource::eFasta,
                                   false);

    CBlastInputOMF input(&input_source, 1000);
    CRef<CBioseq_set> queries(new CBioseq_set);
    input.GetNextSeqBatch(*queries);
    BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 6u);

    size_t count = 0;
    for (const auto& it : queries->GetSeq_set()) {
        // s_GetSegmentFlags requires descriptors, so only call it when the
        // sequence has some
        if (it->GetSeq().IsSetDescr()) {

            string id = s_GetSequenceId(it->GetSeq());
            int flags = s_GetSegmentFlags(it->GetSeq());
            int expected = 0;

            BOOST_REQUIRE_MESSAGE(flags == expected,
                                  (string)"Segment flag for " +
                                  id + " is different from expected " +
                                  NStr::IntToString(flags) + " != " +
                                  NStr::IntToString(expected));
        }
        count++;
    }

    BOOST_REQUIRE_EQUAL(6u, count);
}
3013
// Reads data/paired_reads.fastq through CShortReadFastaInputSource (eFastq
// format, third constructor argument true) and checks that six sequences are
// returned and that each one carries the segment flag expected for its id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFastQ)
{
    CNcbiIfstream fastq_stream("data/paired_reads.fastq");
    BOOST_REQUIRE(fastq_stream);

    // Expected segment flag for every sequence id
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(fastq_stream,
                                     CShortReadFastaInputSource::eFastq,
                                     true);

    CBlastInputOMF input(&reads, 1000);
    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        const string msg = (string)"Segment flag for " +
                           id + " is different from expected " +
                           NStr::IntToString(flags) + " != " +
                           NStr::IntToString(expected);
        BOOST_REQUIRE_MESSAGE(flags == expected, msg);
        ++num_checked;
    }

    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3051
// Reads data/paired_reads_1.fastq and data/paired_reads_2.fastq together
// through the two-stream CShortReadFastaInputSource constructor (eFastq
// format) and checks that six sequences are returned, each with the segment
// flag expected for its id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoFastQFiles)
{
    CNcbiIfstream first_stream("data/paired_reads_1.fastq");
    CNcbiIfstream second_stream("data/paired_reads_2.fastq");
    BOOST_REQUIRE(first_stream);
    BOOST_REQUIRE(second_stream);

    // Expected segment flag for every sequence id
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(first_stream, second_stream,
                                     CShortReadFastaInputSource::eFastq);

    CBlastInputOMF input(&reads, 1000);
    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        const string msg = (string)"Segment flag for " +
                           id + " is different from expected " +
                           NStr::IntToString(flags) + " != " +
                           NStr::IntToString(expected);
        BOOST_REQUIRE_MESSAGE(flags == expected, msg);
        ++num_checked;
    }

    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3090
3091
// Reads data/paired_reads.asn through CASN1InputSourceOMF (constructed with
// the arguments false, true) and checks that six sequences are returned and
// that each one carries the segment flag expected for its first Seq-id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromASN1)
{
    CNcbiIfstream asn_stream("data/paired_reads.asn");
    BOOST_REQUIRE(asn_stream);

    // Expected segment flag for every sequence id
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CASN1InputSourceOMF reads(asn_stream, false, true);
    CBlastInputOMF input(&reads, 1000);
    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        // Here the id comes straight from the Bioseq's first Seq-id
        const string id = bioseq.GetFirstId()->AsFastaString();
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        const string msg = (string)"Segment flag for " +
                           id + " is different from expected " +
                           NStr::IntToString(flags) + " != " +
                           NStr::IntToString(expected);
        BOOST_REQUIRE_MESSAGE(flags == expected, msg);
        ++num_checked;
    }

    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3126
// Paired reads supplied as two ASN.1 streams (one file per mate): every
// sequence of the returned batch must carry the segment flag recorded for
// its id in ref_flags.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoASN1Files)
{
    CNcbiIfstream istr1("data/paired_reads_1.asn");
    CNcbiIfstream istr2("data/paired_reads_2.asn");
    BOOST_REQUIRE(istr1);
    BOOST_REQUIRE(istr2);

    // Sequence id -> expected segment flag
    unordered_map<string, int> ref_flags = {
        {"lcl|pair1",         eFirstSegment},
        {"lcl|pair2",         eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    // NOTE(review): the boolean argument presumably selects text
    // (non-binary) ASN.1 -- confirm against the CASN1InputSourceOMF
    // constructor.
    CASN1InputSourceOMF input_source(istr1, istr2, false);
    CBlastInputOMF input(&input_source, 1000);

    CRef<CBioseq_set> queries(new CBioseq_set);
    input.GetNextSeqBatch(*queries);
    // all six sequences listed in ref_flags are expected in the batch
    BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 6u);

    // Compare the flag of each sequence with the reference table;
    // ref_flags.at() throws for an id that is not in the table.
    size_t count = 0;
    for (const auto& entry : queries->GetSeq_set()) {
        const auto& bioseq = entry->GetSeq();
        const string id = bioseq.GetFirstId()->AsFastaString();
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = ref_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                              string("Segment flag for ") + id +
                              " is different from expected " +
                              NStr::IntToString(flags) + " != " +
                              NStr::IntToString(expected));
        ++count;
    }

    BOOST_REQUIRE_EQUAL(ref_flags.size(), count);
}
3165
3166
// Paired reads supplied in a single FASTC file: every sequence of the
// returned batch must carry the segment flag recorded for its id in
// ref_flags.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFastC)
{
    CNcbiIfstream istr("data/paired_reads.fastc");
    BOOST_REQUIRE(istr);

    // Sequence id -> expected segment flag
    unordered_map<string, int> ref_flags = {
        {"lcl|read1.1", eFirstSegment},
        {"lcl|read1.2", eLastSegment},
        {"lcl|read2.1", eFirstSegment},
        {"lcl|read2.2", eLastSegment},
        {"lcl|read3.1", eFirstSegment},
        {"lcl|read3.2", eLastSegment},
    };

    // NOTE(review): the trailing `true` presumably requests paired mode --
    // confirm against the CShortReadFastaInputSource constructor.
    CShortReadFastaInputSource input_source(
        istr, CShortReadFastaInputSource::eFastc, true);
    CBlastInputOMF input(&input_source, 1000);

    CRef<CBioseq_set> queries(new CBioseq_set);
    input.GetNextSeqBatch(*queries);
    BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 6u);

    // Compare the flag of each sequence with the reference table;
    // ref_flags.at() throws for an id that is not in the table.
    size_t count = 0;
    for (const auto& entry : queries->GetSeq_set()) {
        const auto& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = ref_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                              string("Segment flag for ") + id +
                              " is different from expected " +
                              NStr::IntToString(flags) + " != " +
                              NStr::IntToString(expected));
        ++count;
    }

    BOOST_REQUIRE_EQUAL(ref_flags.size(), count);
}
3202
3203
3204 BOOST_AUTO_TEST_SUITE_END() // end of short_reads test suite
3205
3206
3207 BOOST_AUTO_TEST_SUITE(blastargs)
3208
3209 /// Auxiliary class to convert a string into an argument count and vector
3210 class CString2Args
3211 {
3212 public:
CString2Args(const string & cmd_line_args)3213 CString2Args(const string& cmd_line_args) {
3214 x_Init(cmd_line_args);
3215 }
3216
~CString2Args()3217 ~CString2Args() {
3218 x_CleanUp();
3219 }
3220
Reset(const string & cmd_line_args)3221 void Reset(const string& cmd_line_args) {
3222 x_CleanUp();
3223 x_Init(cmd_line_args);
3224 }
3225
CreateCArgs(CBlastAppArgs & args) const3226 CArgs* CreateCArgs(CBlastAppArgs& args) const {
3227 auto_ptr<CArgDescriptions> arg_desc(args.SetCommandLine());
3228 CNcbiArguments ncbi_args(m_Argc, m_Argv);
3229 return arg_desc->CreateArgs(ncbi_args);
3230 }
3231
3232 private:
3233
3234 /// Functor to help remove empty strings from a container
3235 struct empty_string_remover
3236 {
operator ()CString2Args::empty_string_remover3237 bool operator() (const string& str) {
3238 return str.empty();
3239 }
3240 };
3241
3242 /// Extract the arguments from a command line
x_TokenizeCmdLine(const string & cmd_line_args)3243 vector<string> x_TokenizeCmdLine(const string& cmd_line_args) {
3244 vector<string> retval;
3245 NStr::Split(cmd_line_args, " ", retval);
3246 vector<string>::iterator new_end = remove_if(retval.begin(),
3247 retval.end(),
3248 empty_string_remover());
3249 retval.erase(new_end, retval.end());
3250 return retval;
3251 }
3252
3253 /// Convert a C++ string into a C-style string
x_ToCString(const string & str)3254 char* x_ToCString(const string& str) {
3255 char* retval = new char[str.size()+1];
3256 strncpy(retval, str.c_str(), str.size());
3257 retval[str.size()] = '\0';
3258 return retval;
3259 }
3260
x_CleanUp()3261 void x_CleanUp() {
3262 for (size_t i = 0; i < m_Argc; i++) {
3263 delete [] m_Argv[i];
3264 }
3265 delete [] m_Argv;
3266 }
3267
x_Init(const string & cmd_line_args)3268 void x_Init(const string& cmd_line_args) {
3269 const string program_name("./blastinput_unit_test");
3270 vector<string> args = x_TokenizeCmdLine(cmd_line_args);
3271 m_Argc = args.size() + 1; // one extra for dummy program name
3272 m_Argv = new char*[m_Argc];
3273 m_Argv[0] = x_ToCString(program_name);
3274 for (size_t i = 0; i < args.size(); i++) {
3275 m_Argv[i+1] = x_ToCString(args[i]);
3276 }
3277 }
3278
3279 char** m_Argv;
3280 size_t m_Argc;
3281 };
3282
3283 /* Test for the PSI-BLAST command line application arguments */
3284
// A scoring matrix named on the psiblast command line must be reflected in
// the options built from the parsed arguments.
BOOST_AUTO_TEST_CASE(PsiBlastAppTestMatrix)
{
    CPsiBlastAppArgs psiblast_args;
    CString2Args cmd_line("-matrix BLOSUM80 -db ecoli ");
    auto_ptr<CArgs> parsed_args(cmd_line.CreateCArgs(psiblast_args));

    CRef<CBlastOptionsHandle> opts = psiblast_args.SetOptions(*parsed_args);

    BOOST_REQUIRE_EQUAL(opts->GetOptions().GetMatrixName(), string("BLOSUM80"));
}
3295
// With only a database given, the rpsblast options are expected to have
// composition based statistics set to 1 and SEG filtering switched off.
BOOST_AUTO_TEST_CASE(RpsBlastCBS)
{
    CRPSBlastAppArgs rpsblast_args;
    CString2Args cmd_line("-db ecoli ");
    auto_ptr<CArgs> parsed_args(cmd_line.CreateCArgs(rpsblast_args));

    CRef<CBlastOptionsHandle> opts = rpsblast_args.SetOptions(*parsed_args);

    BOOST_REQUIRE_EQUAL(opts->GetOptions().GetCompositionBasedStats(), 1);
    BOOST_REQUIRE(opts->GetOptions().GetSegFiltering() == false);
}
3305
// -remote and -num_threads given together must be rejected with a
// CArgException by each of the listed application argument classes.
BOOST_AUTO_TEST_CASE(CheckMutuallyExclusiveOptions)
{
    CString2Args s2a("-remote -num_threads 2");

    typedef vector< CRef<CBlastAppArgs> > TArgClasses;
    TArgClasses arg_classes;
    arg_classes.push_back(CRef<CBlastAppArgs>(new CPsiBlastAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastpAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastnAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastxAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastnAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastxAppArgs));

    for (TArgClasses::iterator itr = arg_classes.begin();
         itr != arg_classes.end();  ++itr) {
        auto_ptr<CArgs> args;
        BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(**itr)),
                            CArgException);
    }
}
3325
// Discontiguous megablast arguments: -template_type and -template_length
// must be given together, and a word size of 32 is rejected when the
// options are built.
BOOST_AUTO_TEST_CASE(CheckDiscoMegablast)
{
    auto_ptr<CArgs> args;
    CBlastnAppArgs blastn_args;

    // missing required template_length argument
    CString2Args s2a("-db ecoli -template_type coding ");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blastn_args)),
                        CArgException);

    // missing required template_type argument
    s2a.Reset("-db ecoli -template_length 21 ");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blastn_args)),
                        CArgException);

    // valid combination
    s2a.Reset("-db ecoli -template_type coding -template_length 16");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blastn_args)));

    // test the setting of an invalid word size for disco. megablast:
    // argument parsing succeeds, building the options does not
    s2a.Reset("-db ecoli -word_size 32 -template_type optimal -template_length 16");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blastn_args)));
    BOOST_REQUIRE_THROW(blastn_args.SetOptions(*args), CInputException);
}
3349
// -perc_identity: 104.3 must be rejected with a CArgException,
// 75.0 must be accepted.
BOOST_AUTO_TEST_CASE(CheckPercentIdentity)
{
    auto_ptr<CArgs> args;
    CBlastnAppArgs blast_args;

    // invalid value
    CString2Args s2a("-db ecoli -perc_identity 104.3");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blast_args)),
                        CArgException);

    // valid combination
    s2a.Reset("-db ecoli -perc_identity 75.0 ");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blast_args)));
}
3363
// -no_greedy is accepted by the argument parser, but building the options
// from it must fail with a CInputException.
BOOST_AUTO_TEST_CASE(CheckNoGreedyExtension)
{
    auto_ptr<CArgs> args;
    CBlastnAppArgs blast_args;

    CString2Args s2a("-db ecoli -no_greedy");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blast_args)));

    // this throws because non-affine gapping costs must be provided for
    // non-greedy extension
    BOOST_REQUIRE_THROW(blast_args.SetOptions(*args), CInputException);
}
3375
// -culling_limit: for each of the listed application argument classes,
// -4 must be rejected with a CArgException and 0 must be accepted.
BOOST_AUTO_TEST_CASE(CheckCulling)
{
    typedef vector< CRef<CBlastAppArgs> > TArgClasses;
    TArgClasses arg_classes;
    arg_classes.push_back(CRef<CBlastAppArgs>(new CPsiBlastAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastpAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastnAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastxAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastnAppArgs));
    arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastxAppArgs));

    for (TArgClasses::iterator itr = arg_classes.begin();
         itr != arg_classes.end();  ++itr) {
        auto_ptr<CArgs> args;

        // invalid value
        CString2Args s2a("-db ecoli -culling_limit -4");
        BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(**itr)),
                            CArgException);

        // valid combination
        s2a.Reset("-db ecoli -culling_limit 0");
        BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(**itr)));
    }
}
3399
// Constructs a CTaskCmdLineArgs object for several nucleotide and protein
// task names. There are no explicit assertions: the test only exercises
// the construction of these objects.
BOOST_AUTO_TEST_CASE(CheckTaskArgs)
{
    // nucleotide-nucleotide tasks
    set<string> tasks
        (CBlastOptionsFactory::GetTasks(CBlastOptionsFactory::eNuclNucl));
    CRef<IBlastCmdLineArgs> arg;
    arg.Reset(new CTaskCmdLineArgs(tasks, "megablast"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "dc-megablast"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastn"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastn-short"));

    // protein-protein tasks
    tasks = CBlastOptionsFactory::GetTasks(CBlastOptionsFactory::eProtProt);
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastp"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastp-short"));
}
3413
// -qcov_hsp_perc: 100.3 must be rejected with a CArgException,
// 15 must be accepted.
BOOST_AUTO_TEST_CASE(CheckQueryCoveragePercent)
{
    auto_ptr<CArgs> args;
    CBlastxAppArgs blast_args;

    // invalid value
    CString2Args s2a("-db ecoli -qcov_hsp_perc 100.3");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blast_args)),
                        CArgException);

    // valid combination
    s2a.Reset("-db ecoli -qcov_hsp_perc 15");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blast_args)));
}
3427
// -max_hsps: 0 must be rejected with a CArgException, 5 must be accepted.
BOOST_AUTO_TEST_CASE(CheckMaxHspsPerSubject)
{
    auto_ptr<CArgs> args;
    CBlastxAppArgs blast_args;

    // invalid value
    CString2Args s2a("-db ecoli -max_hsps 0");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blast_args)),
                        CArgException);

    // valid combination
    s2a.Reset("-db ecoli -max_hsps 5");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blast_args)));
}
3441
3442 BOOST_AUTO_TEST_SUITE_END() // end of blastargs test suite
3443
3444 #endif /* SKIP_DOXYGEN_PROCESSING */
3445