1 /*  $Id: blastinput_unit_test.cpp 617780 2020-10-06 16:24:16Z gouriano $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Jason Papadopoulos
27  *
28  * File Description:
29  *   Unit tests for CBlastInput, CBlastInputSource and derived classes
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <objmgr/object_manager.hpp>
35 #include <objmgr/bioseq_handle.hpp>
36 #include <objects/seqloc/Seq_loc.hpp>
37 #include <objects/seqloc/Packed_seqint.hpp>
38 #include <objects/seqloc/Seq_interval.hpp>
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seqloc/PDB_seq_id.hpp>
41 #include <objects/seq/Seq_data.hpp>
42 #include <objects/seq/NCBIeaa.hpp>
43 #include <objects/seqset/Bioseq_set.hpp>
44 
45 #include <corelib/ncbienv.hpp>
46 #include <objtools/readers/reader_exception.hpp>
47 #include <objtools/data_loaders/genbank/gbloader.hpp>
48 #include <algo/blast/api/sseqloc.hpp>
49 #include <algo/blast/core/blast_query_info.h>
50 #include <algo/blast/blastinput/blast_input.hpp>
51 #include <algo/blast/blastinput/blast_input_aux.hpp>
52 #include <algo/blast/blastinput/blast_fasta_input.hpp>
53 #include <algo/blast/blastinput/blast_asn1_input.hpp>
54 #include <objmgr/util/sequence.hpp>
55 #include <objmgr/seq_vector.hpp>
56 
57 #include <algo/blast/blastinput/blastp_args.hpp>
58 #include <algo/blast/blastinput/blastn_args.hpp>
59 #include <algo/blast/blastinput/blastx_args.hpp>
60 #include <algo/blast/blastinput/tblastn_args.hpp>
61 #include <algo/blast/blastinput/tblastx_args.hpp>
62 #include <algo/blast/blastinput/psiblast_args.hpp>
63 #include <algo/blast/blastinput/rpsblast_args.hpp>
64 #include "blast_input_unit_test_aux.hpp"
65 
66 #include <unordered_map>
67 
68 #undef NCBI_BOOST_NO_AUTO_TEST_MAIN
69 #include <corelib/test_boost.hpp>
70 
71 #ifndef SKIP_DOXYGEN_PROCESSING
72 
73 USING_NCBI_SCOPE;
74 USING_SCOPE(blast);
75 USING_SCOPE(objects);
76 
77 static CRef<CBlastInput>
s_DeclareBlastInput(CNcbiIstream & input_file,const CBlastInputSourceConfig & iconfig,int batch_size=kMax_Int)78 s_DeclareBlastInput(CNcbiIstream& input_file,
79                     const CBlastInputSourceConfig& iconfig,
80                     int batch_size = kMax_Int)
81 {
82     CRef<CBlastFastaInputSource> fasta_src
83         (new CBlastFastaInputSource(input_file, iconfig));
84     return CRef<CBlastInput>(new CBlastInput(&*fasta_src, batch_size));
85 }
86 
87 static CRef<CBlastInput>
s_DeclareBlastInput(const string & user_input,const CBlastInputSourceConfig & iconfig)88 s_DeclareBlastInput(const string& user_input,
89                     const CBlastInputSourceConfig& iconfig)
90 {
91     CRef<CBlastFastaInputSource> fasta_src
92         (new CBlastFastaInputSource(user_input, iconfig));
93     return CRef<CBlastInput>(new CBlastInput(&*fasta_src));
94 }
95 
96 BOOST_AUTO_TEST_SUITE(blastinput)
97 
/// Reading data/nucl_acc.txt while protein input is requested must fail with
/// CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadAccession_MismatchNuclProt)
{
    CNcbiIfstream acc_stream("data/nucl_acc.txt");
    const bool kIsProtein = true;
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(acc_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& e) {
        const string what(e.what());
        const string::size_type pos =
            what.find("GI/accession/sequence mismatch: protein input required but nucleotide provided");
        BOOST_REQUIRE(pos != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    // The read must have thrown, and nothing may be left to read afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
124 
/// Reading data/prot_acc.txt while nucleotide input is requested must fail
/// with CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadAccession_MismatchProtNucl)
{
    CNcbiIfstream acc_stream("data/prot_acc.txt");
    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(acc_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& e) {
        const string what(e.what());
        const string::size_type pos =
            what.find("GI/accession/sequence mismatch: nucleotide input required but protein provided");
        BOOST_REQUIRE(pos != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    // The read must have thrown, and nothing may be left to read afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
151 
/// Reading data/gi.txt while protein input is requested must fail with
/// CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadGi_MismatchNuclProt)
{
    CNcbiIfstream gi_stream("data/gi.txt");
    const bool kIsProtein = true;
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& e) {
        const string what(e.what());
        const string::size_type pos =
            what.find("GI/accession/sequence mismatch: protein input required but nucleotide provided");
        BOOST_REQUIRE(pos != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    // The read must have thrown, and nothing may be left to read afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
178 
/// Reading data/prot_gi.txt while nucleotide input is requested must fail
/// with CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadGi_MismatchProtNucl)
{
    CNcbiIfstream gi_stream("data/prot_gi.txt");
    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
        // here's a 'misplaced' test for blast::IsLocalId
        BOOST_REQUIRE(!blast::IsLocalId(query.seqloc->GetId()));
    }
    catch (const CInputException& e) {
        const string what(e.what());
        const string::size_type pos =
            what.find("GI/accession/sequence mismatch: nucleotide input required but protein provided");
        BOOST_REQUIRE(pos != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    // The read must have thrown, and nothing may be left to read afterwards.
    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
205 
206 struct SDubiousShortSequence
207 {
208     string sequence_data;
209     CSeq_inst::EMol mol_type;
210 
SDubiousShortSequenceSDubiousShortSequence211     SDubiousShortSequence(const string& seq,
212                           CSeq_inst::EMol mol_type)
213         : sequence_data(seq), mol_type(mol_type)
214     {
215         seqlen = NStr::Replace(sequence_data, " ", kEmptyStr).length();
216     }
217 
IsProteinSDubiousShortSequence218     bool IsProtein() const { return CSeq_inst::IsAa(mol_type); }
GetLengthSDubiousShortSequence219     TSeqPos GetLength() const { return seqlen; }
220 
221 private:
222     TSeqPos seqlen;
223 };
224 
BOOST_AUTO_TEST_CASE(TestSmallDubiousSequences)225 BOOST_AUTO_TEST_CASE(TestSmallDubiousSequences)
226 {
227     string seq;
228 
229     vector<SDubiousShortSequence> test_data;
230     test_data.push_back(SDubiousShortSequence("NNWNN", CSeq_inst::eMol_aa));
231     // P84064
232     seq.assign("ykrggggwgg gggwkggggg gggwkggggg gkgggg");
233     test_data.push_back(SDubiousShortSequence(seq, CSeq_inst::eMol_aa));
234     // AAB32668
235     seq.assign("GGGGGGGGGGGGGGG");
236     test_data.push_back(SDubiousShortSequence(seq, CSeq_inst::eMol_aa));
237 
238     CRef<CObjectManager> om(CObjectManager::GetInstance());
239 
240     // First test the usage of the sequence length threshold
241     ITERATE(vector<SDubiousShortSequence>, itr, test_data) {
242         CBlastInputSourceConfig iconfig(itr->IsProtein());
243         iconfig.SetSeqLenThreshold2Guess(itr->GetLength() + 1);
244 
245         CRef<CBlastFastaInputSource> fasta_source
246             (new CBlastFastaInputSource(itr->sequence_data, iconfig));
247         CRef<CBlastInput> source(new CBlastInput(&*fasta_source));
248 
249         CScope scope(*om);
250         BOOST_REQUIRE(source->End() == false);
251         bool caught_exception(false);
252         blast::SSeqLoc ssl;
253         try {
254             ssl = source->GetNextSeqLocBatch(scope).front();
255             // here's a 'misplaced' test for blast::IsLocalId
256             BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == true);
257         }
258         catch (const CInputException& e) {
259             string msg(e.what());
260             BOOST_REQUIRE(msg.find("Gi/accession mismatch: ") != NPOS);
261             BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
262                               e.GetErrCode());
263             caught_exception = true;
264         }
265         BOOST_REQUIRE(caught_exception == false);
266         BOOST_REQUIRE(source->End() == true);
267 
268         TSeqPos length = sequence::GetLength(*ssl.seqloc, ssl.scope);
269         BOOST_REQUIRE_EQUAL(itr->GetLength(), length);
270         scope.GetObjectManager().RevokeAllDataLoaders();
271     }
272 
273     // Now check that these sequences will be rejected as being the wrong
274     // molecule type (achieved by setting seqlen_thresh2guess argument to
275     // CBlastFastaInputSource to a small value
276     ITERATE(vector<SDubiousShortSequence>, itr, test_data) {
277 
278         CBlastInputSourceConfig iconfig(itr->IsProtein());
279         iconfig.SetSeqLenThreshold2Guess(5);
280 
281         CRef<CBlastFastaInputSource> fasta_source
282             (new CBlastFastaInputSource(itr->sequence_data, iconfig));
283         CRef<CBlastInput> source(new CBlastInput(&*fasta_source));
284 
285         CScope scope(*om);
286         BOOST_REQUIRE(source->End() == false);
287         bool caught_exception(false);
288         blast::SSeqLoc ssl;
289         try {
290             ssl = source->GetNextSeqLocBatch(scope).front();
291             // here's a 'misplaced' test for blast::IsLocalId
292             BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == true);
293         }
294         catch (const CInputException& e) {
295             string msg(e.what());
296             BOOST_REQUIRE(msg.find("Nucleotide FASTA provided for prot") != NPOS);
297             BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch,
298                               e.GetErrCode());
299             caught_exception = true;
300         }
301         BOOST_REQUIRE(caught_exception == true);
302         BOOST_REQUIRE(source->End() == true);
303         scope.GetObjectManager().RevokeAllDataLoaders();
304     }
305 
306 }
307 
/// Reading data/aa.129295 as nucleotide input (guess threshold 25) must fail
/// with CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadFastaWithDefline_MismatchProtNucl)
{
    CNcbiIfstream fasta_stream("data/aa.129295");
    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetSeqLenThreshold2Guess(25);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& e) {
        const string what(e.what());
        BOOST_REQUIRE(what.find("Protein FASTA provided for nucleotide") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
330 
/// Reading data/nt.555 as protein input (guess threshold 25) must fail with
/// CInputException::eSequenceMismatch and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadFastaWithDefline_MismatchNuclProt)
{
    CNcbiIfstream fasta_stream("data/nt.555");
    const bool kIsProtein = true;
    CBlastInputSourceConfig config(kIsProtein);
    config.SetSeqLenThreshold2Guess(25);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_stream, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!input->End());

    bool mismatch_reported = false;
    try {
        blast::SSeqLoc query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& e) {
        const string what(e.what());
        BOOST_REQUIRE(what.find("Nucleotide FASTA provided for protein") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSequenceMismatch, e.GetErrCode());
        mismatch_reported = true;
    }

    BOOST_REQUIRE(mismatch_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
353 
/// Reads data/aa.129295 as protein input and checks the resulting location:
/// a whole-sequence interval [0, 231] on a local ID, strand unknown, no mask.
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineProtein_Single)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!source->End());
    const TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead.
    BOOST_REQUIRE(!batch.empty());
    blast::SSeqLoc ssl = batch.front();
    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_unknown),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(232);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!ssl.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
385 
/// Reads data/raw_fasta.na as nucleotide input and checks the resulting
/// location: interval [0, 623] on a local ID, both strands, no mask.
BOOST_AUTO_TEST_CASE(RawFastaWithSpaces)
{
    // this is gi 555, length 624
    CNcbiIfstream infile("data/raw_fasta.na");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!source->End());
    const TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead.
    BOOST_REQUIRE(!batch.empty());
    blast::SSeqLoc ssl = batch.front();
    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(624);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!ssl.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
418 
/// Reads data/prot_w_gaps.txt as protein input and checks that no '-'
/// characters remain, both in the sequence vector obtained through the scope
/// and in the Bioseq produced by TSeqLocVector2Bioseqs.
BOOST_AUTO_TEST_CASE(ReadProteinWithGaps)
{
    CNcbiIfstream infile("data/prot_w_gaps.txt");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(91); // it's actually 103 with gaps
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    // Check the residues as seen through the scope.
    const CSeq_id * seqid = ssl.seqloc->GetId();
    CBioseq_Handle bh = scope.GetBioseqHandle(*seqid);
    CSeqVector sv = bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);

    for (size_t i = 0; i < sv.size(); i++) {
        BOOST_CHECK_NE('-', static_cast<char>(sv[i]));
    }

    // Check the residues stored in the converted Bioseq.
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    const CSeq_inst& inst = bioseq.GetInst();
    BOOST_REQUIRE_EQUAL(inst.GetLength(), length);
    BOOST_REQUIRE(inst.IsSetSeq_data());
    const CSeq_data& seq_data = inst.GetSeq_data();
    BOOST_REQUIRE(seq_data.IsNcbieaa());
    const string& seq = seq_data.GetNcbieaa().Get();
    for (size_t i = 0; i < seq.size(); i++) {
        BOOST_CHECK_NE('-', static_cast<char>(seq[i]));
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
462 
/// Reads data/raw_fasta2.na as nucleotide input and checks the resulting
/// location (interval [0, 623] on a local ID, both strands, no mask) and
/// that the queries can be converted with TSeqLocVector2Bioseqs.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces)
{
    // this is gi 555, length 624
    CNcbiIfstream infile("data/raw_fasta2.na");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // Indexing an empty vector is undefined behaviour; fail cleanly instead.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs[0];
    BOOST_REQUIRE(source->End());

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(624);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE(bioseqs.NotEmpty());

    BOOST_REQUIRE(!ssl.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
498 
/// With BLASTINPUT_GEN_DELTA_SEQ set in the environment, reading
/// data/nucl_w_n.fsa must yield a Bioseq whose representation is
/// CSeq_inst::eRepr_delta.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces_UpperCaseWithN_ReadDeltaSeq)
{
    // Note the setting of the environment variable
    CAutoEnvironmentVariable env("BLASTINPUT_GEN_DELTA_SEQ");
    CNcbiIfstream infile("data/nucl_w_n.fsa");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope s(*CObjectManager::GetInstance());
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(s);
    // At least one query must have been read; state it explicitly rather
    // than calling front() on a possibly empty vector.
    BOOST_REQUIRE(!seqs.empty());
    BOOST_REQUIRE(source->End());

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE(bioseqs->CanGetSeq_set());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());

    const CBioseq& first_seq = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(first_seq.CanGetInst());
    BOOST_REQUIRE(first_seq.GetInst().CanGetRepr());
    BOOST_REQUIRE(first_seq.GetInst().GetRepr() == CSeq_inst::eRepr_delta);
    s.GetObjectManager().RevokeAllDataLoaders();
}
523 
524 
/// Reads data/gbreport.txt as nucleotide input. A warning containing
/// "Ignoring invalid residues at " must be posted to the diagnostic stream,
/// and the resulting location must be interval [0, 623] on a local ID,
/// both strands, no mask.
BOOST_AUTO_TEST_CASE(ReadGenbankReport)
{
    CDiagRestorer diag_restorer;

    // Redirect the output warnings
    SetDiagPostLevel(eDiag_Warning);
    CNcbiOstrstream error_stream;
    SetDiagStream(&error_stream);

    // this is gi 555, length 624
    CNcbiIfstream infile("data/gbreport.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(!source->End());
    const TSeqLocVector batch = source->GetNextSeqLocBatch(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead.
    BOOST_REQUIRE(!batch.empty());
    blast::SSeqLoc ssl = batch.front();
    BOOST_REQUIRE(source->End());

    // The captured diagnostics must mention the skipped residues.
    string s = CNcbiOstrstreamToString(error_stream);
    BOOST_REQUIRE(s.find("Ignoring invalid residues at ") != NPOS);

    BOOST_REQUIRE(ssl.seqloc->IsInt());
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()));

    const CSeq_interval& interval = ssl.seqloc->GetInt();

    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(interval.GetStrand()));

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    const TSeqPos length(624);
    BOOST_REQUIRE_EQUAL(length-1, interval.GetTo());

    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());

    BOOST_REQUIRE(!ssl.mask);
    scope.GetObjectManager().RevokeAllDataLoaders();
}
567 
/// Reading data/invalid_gi.txt as nucleotide input must fail with
/// CInputException::eSeqIdNotFound and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadInvalidGi)
{
    const char* path = "data/invalid_gi.txt";
    const bool kIsProtein = false;
    CBlastInputSourceConfig config(kIsProtein);

    CNcbiIfstream gi_stream(path);
    CRef<CBlastInput> input(s_DeclareBlastInput(gi_stream, config));
    BOOST_REQUIRE(!input->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::SSeqLoc query;
    bool not_found_reported = false;
    try {
        query = input->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& e) {
        const string what(e.what());
        BOOST_REQUIRE(what.find("Sequence ID not found: ") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eSeqIdNotFound, e.GetErrCode());
        not_found_reported = true;
    }

    BOOST_REQUIRE(not_found_reported);
    BOOST_REQUIRE(input->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
592 
/// Reading data/bad_seqid.txt as nucleotide input must fail with
/// CSeqIdException::eFormat and leave the input exhausted.
BOOST_AUTO_TEST_CASE(ReadInvalidSeqId)
{
    const char* fname = "data/bad_seqid.txt";
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);

    CNcbiIfstream infile(fname);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    BOOST_REQUIRE(!source->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::SSeqLoc ssl;
    bool caught_exception(false);
    try { ssl = source->GetNextSeqLocBatch(scope).front(); }
    catch (const CSeqIdException& e) {
        // Only the error code is checked; the message text is not inspected.
        BOOST_REQUIRE_EQUAL(CSeqIdException::eFormat, e.GetErrCode());
        caught_exception = true;
    }
    BOOST_REQUIRE(caught_exception);
    BOOST_REQUIRE(source->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
616 
/// Reading data/bad_input.txt must raise CObjReaderParseException through
/// both GetAllSeqLocs and GetAllSeqs, and the destination of the failed call
/// must remain empty.
BOOST_AUTO_TEST_CASE(ReadBadUserInput)
{
    const char* fname = "data/bad_input.txt";
    const bool is_protein(false);
    const size_t kNumQueries(0);
    CBlastInputSourceConfig iconfig(is_protein);
    CScope scope(*CObjectManager::GetInstance());

    // TSeqLocVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(!source->End());

        blast::TSeqLocVector query_vector;
        BOOST_REQUIRE_THROW(query_vector = source->GetAllSeqLocs(scope),
                            CObjReaderParseException);
        // The assignment never completed, so no queries were stored.
        BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());

        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
        BOOST_REQUIRE(bioseqs.Empty());
    }

    // CBlastQueryVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(!source->End());

        CRef<blast::CBlastQueryVector> query_vector;
        BOOST_REQUIRE_THROW(query_vector = source->GetAllSeqs(scope),
                            CObjReaderParseException);
        BOOST_REQUIRE(query_vector.Empty());
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
652 
653 /// This unit test proves that if one input is bad, all of them are rejected.
BOOST_AUTO_TEST_CASE(ReadMultipleGis_WithBadInput)654 BOOST_AUTO_TEST_CASE(ReadMultipleGis_WithBadInput)
655 {
656     const char* fname = "data/gis_bad_input.txt";
657     CNcbiIfstream infile(fname);
658     const bool is_protein(false);
659     CBlastInputSourceConfig iconfig(is_protein);
660     iconfig.SetRetrieveSeqData(false);
661 
662     vector< pair<long, long> > gi_length;
663     gi_length.push_back(make_pair(89161185L, 247249719L));
664     // this is never read...
665     //gi_length.push_back(make_pair(0L, 0L));   // bad sequence
666     //gi_length.push_back(make_pair(557L, 489L));
667 
668     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
669     BOOST_REQUIRE(source->End() == false);
670 
671     CScope scope(*CObjectManager::GetInstance());
672 
673     blast::TSeqLocVector seqs;
674     BOOST_REQUIRE_THROW(seqs = source->GetAllSeqLocs(scope),
675                         CObjReaderParseException);
676     scope.GetObjectManager().RevokeAllDataLoaders();
677 }
678 
/// Empty input handling: reading from an empty stream yields no queries
/// through either GetAllSeqLocs or GetAllSeqs, while constructing a
/// CBlastFastaInputSource from an empty string must throw
/// CInputException::eEmptyUserInput.
BOOST_AUTO_TEST_CASE(ReadEmptyUserInput)
{
    // NOTE(review): "/dev/null" is a POSIX path - confirm this test is not
    // expected to run on platforms without it.
    const char* fname("/dev/null");
    const bool is_protein(true);
    CScope scope(*CObjectManager::GetInstance());
    CBlastInputSourceConfig iconfig(is_protein);

    // Empty stream, TSeqLocVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End());

        blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE(query_vector.empty());

        CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
        BOOST_REQUIRE(bioseqs.Empty());
    }

    // Empty stream, CBlastQueryVector interface
    {
        CNcbiIfstream infile(fname);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        BOOST_REQUIRE(source->End());

        CRef<blast::CBlastQueryVector> queries = source->GetAllSeqs(scope);
        BOOST_REQUIRE(queries->Empty());
    }

    // Read from buffer
    {
        const string empty;
        CRef<CBlastFastaInputSource> source;

        bool caught_exception(false);
        try { source.Reset(new CBlastFastaInputSource(empty, iconfig)); }
        catch (const CInputException& e) {
            string msg(e.what());
            BOOST_REQUIRE(msg.find("No sequence input was provided") != NPOS);
            BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
            caught_exception = true;
        }
        BOOST_REQUIRE(caught_exception);
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
724 
725 // Basic test case to ensure CFastaReader changes don't break basic
726 // functionality required by BLAST
BOOST_AUTO_TEST_CASE(ReadSingleFasta_WithTitle)727 BOOST_AUTO_TEST_CASE(ReadSingleFasta_WithTitle)
728 {
729     const string kFileName("data/isprot.fa");
730     const string kExpectedTitle("seq");
731     const bool is_protein(false);
732 
733     CScope scope(*CObjectManager::GetInstance());
734     CBlastInputSourceConfig iconfig(is_protein);
735 
736     CNcbiIfstream infile(kFileName.c_str());
737     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
738 
739     blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
740     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
741     BOOST_REQUIRE(!bioseqs.Empty());
742 
743     string title;
744     ITERATE(CBioseq::TDescr::Tdata, itr, bioseqs->GetSeq_set().front()->GetSeq().GetDescr().Get()) {
745         const CSeqdesc& desc = **itr;
746         if (desc.IsTitle()) {
747             title = desc.GetTitle();
748             break;
749         }
750     }
751     BOOST_REQUIRE_EQUAL(kExpectedTitle, title);
752     scope.GetObjectManager().RevokeAllDataLoaders();
753 }
754 
755 static
s_ReadAndTestQueryFromString_CFastaReader(const string & input,TSeqPos expected_length)756 void s_ReadAndTestQueryFromString_CFastaReader(const string& input,
757                                                TSeqPos expected_length)
758 {
759     CFastaReader::TFlags defaultBLASTflags = CFastaReader::fNoParseID |
760                                              CFastaReader::fDLOptional;
761     defaultBLASTflags += CFastaReader::fAssumeNuc;
762     defaultBLASTflags += CFastaReader::fNoSplit;
763     defaultBLASTflags += CFastaReader::fHyphensIgnoreAndWarn;
764     defaultBLASTflags += CFastaReader::fDisableNoResidues;
765     defaultBLASTflags += CFastaReader::fQuickIDCheck;
766 
767     CRef<ILineReader> line_reader(new CMemoryLineReader(input.c_str(),
768                                                         input.size()));
769     CFastaReader fasta_reader(*line_reader, defaultBLASTflags);
770     fasta_reader.IgnoreProblem(ILineError::eProblem_ModifierFoundButNoneExpected);
771     fasta_reader.IgnoreProblem(ILineError::eProblem_TooLong);
772 
773     CRef<CSeqIdGenerator> idgen(new CSeqIdGenerator(1, kEmptyStr));
774     fasta_reader.SetIDGenerator(*idgen);
775 
776     CRef<CSeq_entry> se(fasta_reader.ReadOneSeq());
777     BOOST_REQUIRE_EQUAL(expected_length, se->GetSeq().GetLength());
778 }
779 
// CFastaReader: defline followed by sequence data with no trailing newline
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderNoNewLineAfterSeq)
{
    const TSeqPos kNumBases = 4;
    const string kFasta = ">seq_1\nATGC";
    s_ReadAndTestQueryFromString_CFastaReader(kFasta, kNumBases);
}
// CFastaReader: defline and sequence data, each terminated by a newline
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderWithNewLines)
{
    const TSeqPos kNumBases = 4;
    const string kFasta = ">seq_1\nATGC\n";
    s_ReadAndTestQueryFromString_CFastaReader(kFasta, kNumBases);
}
// CFastaReader: bare sequence data, no defline and no trailing newline
BOOST_AUTO_TEST_CASE(SingleSequenceString_CFastaReaderNoDeflineNoNewLines)
{
    const TSeqPos kNumBases = 4;
    const string kBareSequence = "ATGC";
    s_ReadAndTestQueryFromString_CFastaReader(kBareSequence, kNumBases);
}
798 
799 static
s_ReadAndTestQueryFromString(const string & input,TSeqPos expected_length,bool is_protein)800 void s_ReadAndTestQueryFromString(const string& input, TSeqPos expected_length,
801                                   bool is_protein)
802 {
803     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
804     BOOST_REQUIRE(objmgr);
805 
806     SDataLoaderConfig dlconfig(is_protein);
807     CBlastInputSourceConfig iconfig(dlconfig);
808     CBlastFastaInputSource queryInput(input, iconfig);
809     CScope scope(*objmgr);
810     CBlastInput qIn(&queryInput);
811     blast::TSeqLocVector query = qIn.GetAllSeqLocs(scope);
812     BOOST_REQUIRE_EQUAL(expected_length,
813                         sequence::GetLength(*query.front().seqloc, &scope));
814     CRef<CSeqVector> sv(new CSeqVector(*query.front().seqloc, scope));
815     BOOST_REQUIRE_EQUAL(expected_length, sv->size());
816     BOOST_REQUIRE_EQUAL(is_protein, sv->IsProtein());
817     sv->SetIupacCoding();
818     string::size_type input_pos = input.find_first_of("ACTG");
819     BOOST_REQUIRE(input_pos != string::npos);
820     for (TSeqPos pos = 0; pos < sv->size(); pos++, input_pos++) {
821         CNcbiOstrstream oss;
822         oss << "Sequence data differs at position " << pos << ": '"
823             << input[input_pos] << "' .vs '" << (*sv)[pos] << "'";
824         string msg = CNcbiOstrstreamToString(oss);
825         BOOST_REQUIRE_MESSAGE(input[input_pos] == (*sv)[pos],  msg);
826     }
827     scope.GetObjectManager().RevokeAllDataLoaders();
828 }
829 
// CBlastInput: nucleotide query whose sequence data has no trailing newline
BOOST_AUTO_TEST_CASE(SingleSequenceString_NoNewLineAfterSeq)
{
    const bool kIsProtein = false;
    const TSeqPos kNumBases = 4;
    const string kFasta = ">seq_1\nATGC";
    s_ReadAndTestQueryFromString(kFasta, kNumBases, kIsProtein);
}
836 
// CBlastInput: nucleotide query with newline-terminated defline and sequence
BOOST_AUTO_TEST_CASE(SingleSequenceString_WithNewLines)
{
    const bool kIsProtein = false;
    const TSeqPos kNumBases = 4;
    const string kFasta = ">seq_1\nATGC\n";
    s_ReadAndTestQueryFromString(kFasta, kNumBases, kIsProtein);
}
843 
// CBlastInput: bare nucleotide sequence, no defline and no trailing newline
BOOST_AUTO_TEST_CASE(SingleSequenceString_NoDeflineNoNewLines)
{
    const bool kIsProtein = false;
    const TSeqPos kNumBases = 4;
    const string kBareSequence = "ATGC";
    s_ReadAndTestQueryFromString(kBareSequence, kNumBases, kIsProtein);
}
850 
BOOST_AUTO_TEST_CASE(ReadEmptyUserInput_OnlyTitle)851 BOOST_AUTO_TEST_CASE(ReadEmptyUserInput_OnlyTitle)
852 {
853     CTmpFile tmpfile;
854     const string kUserInput(">mygene\n");
855     CNcbiOfstream out(tmpfile.GetFileName().c_str());
856     out << kUserInput;
857     out.close();
858 
859 
860     const bool is_protein(false);
861     CScope scope(*CObjectManager::GetInstance());
862     CBlastInputSourceConfig iconfig(is_protein);
863     bool caught_exception(false);
864     string warnings;
865     {
866         CNcbiIfstream infile(tmpfile.GetFileName().c_str());
867         CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
868 
869         blast::TSeqLocVector query_vector;
870         try { CheckForEmptySequences(query_vector, warnings); }
871         catch (const CInputException& e) {
872             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
873         }
874 
875         query_vector = source->GetAllSeqLocs(scope);
876         try { CheckForEmptySequences(query_vector, warnings); }
877         catch (const CInputException& e) {
878             string msg(e.what());
879             BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
880             BOOST_REQUIRE(warnings.empty());
881             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
882             caught_exception = true;
883         }
884         BOOST_REQUIRE(caught_exception);
885         BOOST_REQUIRE(query_vector.empty() == false);
886 
887         CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
888         BOOST_REQUIRE(!bioseqs.Empty());
889         caught_exception = false;
890         try { CheckForEmptySequences(bioseqs, warnings); }
891         catch (const CInputException& e) {
892             string msg(e.what());
893             BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
894             BOOST_REQUIRE(warnings.empty());
895             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
896             caught_exception = true;
897         }
898         BOOST_REQUIRE(caught_exception);
899     }
900 
901     {
902         CNcbiIfstream infile(tmpfile.GetFileName().c_str());
903         CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
904 
905         caught_exception = false;
906         CRef<blast::CBlastQueryVector> queries = source->GetAllSeqs(scope);
907         try { CheckForEmptySequences(queries, warnings); }
908         catch (const CInputException& e) {
909             string msg(e.what());
910             BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
911             BOOST_REQUIRE(warnings.empty());
912             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
913             caught_exception = true;
914         }
915         BOOST_REQUIRE(caught_exception);
916         BOOST_REQUIRE(!queries.Empty());
917     }
918 
919     // Read from buffer
920     {
921         const string empty;
922         CRef<CObjectManager> om(CObjectManager::GetInstance());
923         CRef<CBlastInput> source(s_DeclareBlastInput(kUserInput, iconfig));
924         CRef<blast::CBlastQueryVector> queries;
925         try { CheckForEmptySequences(queries, warnings); }
926         catch (const CInputException& e) {
927             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
928         }
929 
930         caught_exception = false;
931         queries = source->GetAllSeqs(scope);
932         try { CheckForEmptySequences(queries, warnings); }
933         catch (const CInputException& e) {
934             string msg(e.what());
935             BOOST_REQUIRE(msg.find("Query contains no sequence data") != NPOS);
936             BOOST_REQUIRE(warnings.empty());
937             BOOST_REQUIRE_EQUAL(CInputException::eEmptyUserInput, e.GetErrCode());
938             caught_exception = true;
939         }
940         BOOST_REQUIRE(caught_exception);
941     }
942     scope.GetObjectManager().RevokeAllDataLoaders();
943 }
944 
// Reads the single accession listed in data/accession.txt without retrieving
// its sequence data and verifies the resulting Seq-loc (whole sequence, both
// strands, RefSeq identifier) as well as the Bioseq built from it.
BOOST_AUTO_TEST_CASE(ReadSingleAccession)
{
    CNcbiIfstream infile("data/accession.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(ssl.seqloc->GetInt().GetStrand()));

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), ssl.seqloc->GetInt().GetFrom());

    // The interval must span the whole sequence: [0, length-1]
    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    const TSeqPos length(248956422);
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
    const string accession("NC_000001");
    BOOST_REQUIRE_EQUAL(accession,
                ssl.seqloc->GetInt().GetId().GetOther().GetAccession());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(accession, b.GetId().front()->GetOther().GetAccession());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
993 
BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequence)994 BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequence)
995 {
996     CNcbiIfstream infile("data/accession.txt");
997     const bool is_protein(false);
998     const TIntId kGi = 568815597;
999     const TSeqPos kStart = 0;
1000     const TSeqPos kStop(248956421);
1001     SDataLoaderConfig dlconfig("GPIPE/9606/current/GCF_000005045.24_top_level", is_protein);
1002     dlconfig.OptimizeForWholeLargeSequenceRetrieval(true);
1003 
1004     CBlastInputSourceConfig iconfig(dlconfig);
1005     iconfig.SetRetrieveSeqData(true);
1006     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1007 
1008     CRef<CScope> scope(CBlastScopeSource(dlconfig).NewScope());
1009     BOOST_REQUIRE(source->End() == false);
1010 
1011     blast::TSeqLocVector seqs = source->GetAllSeqLocs(*scope);
1012     blast::SSeqLoc ssl = seqs.front();
1013     BOOST_REQUIRE(source->End() == true);
1014 
1015     BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
1016     BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
1017 
1018     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
1019     BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());
1020 
1021     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
1022     BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetInt().GetFrom());
1023 
1024     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
1025     BOOST_REQUIRE_EQUAL(kStop, ssl.seqloc->GetInt().GetTo());
1026 
1027     const string accession = "NC_000001";
1028     const int version = 11;
1029     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1030     if ( !CSeq_id::PreferAccessionOverGi() ) {
1031         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
1032         BOOST_REQUIRE_EQUAL(GI_CONST(kGi), ssl.seqloc->GetInt().GetId().GetGi());
1033     }
1034     else {
1035         BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
1036         BOOST_REQUIRE_EQUAL(accession, ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
1037         BOOST_REQUIRE_EQUAL(version, ssl.seqloc->GetInt().GetId().GetOther().GetVersion());
1038     }
1039 
1040     BOOST_REQUIRE(!ssl.mask);
1041 
1042     /// Validate the data that would be retrieved by a BLAST command line
1043     /// binary
1044     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
1045     BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
1046     BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
1047     const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
1048     BOOST_REQUIRE(b.IsNa());
1049     bool found_gi = false, found_accession = false;
1050     ITERATE(CBioseq::TId, id, b.GetId()) {
1051         if ((*id)->Which() == CSeq_id::e_Gi) {
1052             BOOST_REQUIRE_EQUAL(GI_CONST(kGi), (*id)->GetGi());
1053             found_gi = true;
1054         } else if ((*id)->Which() == CSeq_id::e_Other) {
1055             CNcbiOstrstream os;
1056             (*id)->GetOther().AsFastaString(os);
1057             const string fasta_acc = CNcbiOstrstreamToString(os);
1058             BOOST_REQUIRE(NStr::Find(fasta_acc, accession) != NPOS);
1059             found_accession = true;
1060         }
1061     }
1062     BOOST_REQUIRE(found_gi);
1063     BOOST_REQUIRE(found_accession);
1064     // the BLAST database data loader will fetch this as a delta sequence
1065     BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_delta, b.GetInst().GetRepr());
1066     BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
1067     BOOST_REQUIRE_EQUAL(kStop+1, b.GetInst().GetLength());
1068     scope->GetObjectManager().RevokeAllDataLoaders();
1069 }
1070 #ifdef _DEBUG
1071 const int kTimeOutLargeSeq = 60;
1072 #else
1073 const int kTimeOutLargeSeq = 20;
1074 #endif
1075 BOOST_AUTO_TEST_CASE_TIMEOUT(ReadSingleAccession_RetrieveLargeSequence,
1076                              kTimeOutLargeSeq);
1077 
BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequenceWithRange)1078 BOOST_AUTO_TEST_CASE(ReadSingleAccession_RetrieveLargeSequenceWithRange)
1079 {
1080     CNcbiIfstream infile("data/accession.txt");
1081     const bool is_protein(false);
1082     CBlastInputSourceConfig iconfig(is_protein);
1083     const TSeqPos kStart = 1;
1084     const TSeqPos kStop = 1000;
1085     iconfig.SetRange().SetFrom(kStart);
1086     iconfig.SetRange().SetTo(kStop);
1087     // comment the line below to fetch the sequence data
1088     iconfig.SetRetrieveSeqData(false);
1089     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1090 
1091     SDataLoaderConfig dlconfig(is_protein);
1092     CRef<CScope> scope(CBlastScopeSource(dlconfig).NewScope());
1093     BOOST_REQUIRE(source->End() == false);
1094     blast::TSeqLocVector seqs = source->GetAllSeqLocs(*scope);
1095     blast::SSeqLoc ssl = seqs.front();
1096     BOOST_REQUIRE(source->End() == true);
1097 
1098     BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
1099     BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
1100 
1101     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
1102     BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());
1103 
1104     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
1105     BOOST_REQUIRE_EQUAL(kStart, ssl.seqloc->GetInt().GetFrom());
1106 
1107     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
1108     BOOST_REQUIRE_EQUAL(kStop, ssl.seqloc->GetInt().GetTo());
1109 
1110     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1111     BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
1112     const string accession("NC_000001");
1113     BOOST_REQUIRE_EQUAL(accession,
1114                 ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
1115     BOOST_REQUIRE(!ssl.mask);
1116 
1117     /// Validate the data that would be retrieved by blast.cgi
1118     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
1119     BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
1120     BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
1121     const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
1122     BOOST_REQUIRE(b.IsNa());
1123     BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, b.GetId().front()->Which());
1124     BOOST_REQUIRE_EQUAL(accession, b.GetId().front()->GetOther().GetAccession());
1125     BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
1126     BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
1127     const TSeqPos length(248956422);
1128     BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
1129     scope->GetObjectManager().RevokeAllDataLoaders();
1130 }
1131 #ifdef _DEBUG
1132 const int kTimeOutLargeSeqWithRange = 60;
1133 #else
1134 const int kTimeOutLargeSeqWithRange = 15;
1135 #endif
1136 BOOST_AUTO_TEST_CASE_TIMEOUT(ReadSingleAccession_RetrieveLargeSequenceWithRange,
1137                              kTimeOutLargeSeqWithRange);
1138 
BOOST_AUTO_TEST_CASE(ReadMultipleAccessions)1139 BOOST_AUTO_TEST_CASE(ReadMultipleAccessions)
1140 {
1141     CNcbiIfstream infile("data/accessions.txt");
1142     const bool is_protein(false);
1143     CBlastInputSourceConfig iconfig(is_protein);
1144     iconfig.SetRetrieveSeqData(false);
1145     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1146 
1147     vector< pair<string, long> > accession_lengths;
1148     accession_lengths.push_back(make_pair(string("NC_000001"), 248956422L));
1149     accession_lengths.push_back(make_pair(string("NC_000010.9"), 135374737L));
1150     accession_lengths.push_back(make_pair(string("NC_000011.8"), 134452384L));
1151     accession_lengths.push_back(make_pair(string("NC_000012.10"), 132349534L));
1152 
1153     const size_t kNumQueries(accession_lengths.size());
1154     CScope scope(*CObjectManager::GetInstance());
1155     blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
1156     BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
1157     BOOST_REQUIRE(source->End() == true);
1158 
1159     {{
1160         blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
1161         BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
1162         BOOST_REQUIRE(source->End() == true);
1163     }}
1164 
1165     for (size_t i = 0; i < kNumQueries; i++) {
1166 
1167         blast::SSeqLoc& ssl = query_vector[i];
1168         BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetStrand());
1169         BOOST_REQUIRE_EQUAL((TSeqPos)accession_lengths[i].second - 1,
1170                     ssl.seqloc->GetInt().GetTo());
1171 
1172         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1173         BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
1174         BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
1175         string accession;
1176         int version;
1177         switch (i) {
1178         case 0: accession.assign("NC_000001"); version = 0; break;
1179         case 1: accession.assign("NC_000010"); version = 9; break;
1180         case 2: accession.assign("NC_000011"); version = 8; break;
1181         case 3: accession.assign("NC_000012"); version = 10; break;
1182         default: abort();
1183         }
1184 
1185         BOOST_REQUIRE_EQUAL(accession,
1186                     ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
1187         if (version != 0) {
1188             BOOST_REQUIRE_EQUAL(version,
1189                         ssl.seqloc->GetInt().GetId().GetOther().GetVersion());
1190         }
1191         BOOST_REQUIRE(!ssl.mask);
1192 
1193     }
1194 
1195     /// Validate the data that would be retrieved by blast.cgi
1196     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
1197     BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
1198     scope.GetObjectManager().RevokeAllDataLoaders();
1199 }
1200 
1201 // This test was created to test issues in jira/browse/CXX-82
BOOST_AUTO_TEST_CASE(ReadMultipleAccessionsFromMemory)1202 BOOST_AUTO_TEST_CASE(ReadMultipleAccessionsFromMemory)
1203 {
1204     typedef vector< pair<string, int> > TStringIntVector;
1205     TStringIntVector accession_lengths;
1206     accession_lengths.push_back(make_pair(string("P01012.2"), 386));
1207     accession_lengths.push_back(make_pair(string("1OVA-A"), 386));
1208     // Fails in entrez, we implemented regex for this in CBlastInputReader
1209     accession_lengths.push_back(make_pair(string("pdb|1OVA-A"), 386));
1210     // Note the double bar..
1211     accession_lengths.push_back(make_pair(string("prf||0705172A"), 385));
1212     // Fails in entrez, we implemented regex for this in CBlastInputReader
1213     accession_lengths.push_back(make_pair(string("sp|P01012.2"), 386));
1214 
1215     // This we're not even going to try to fix...
1216     //accession_lengths.push_back(make_pair(string("0705172A"), 385));
1217 
1218     string user_input;
1219     ITERATE(TStringIntVector, itr, accession_lengths) {
1220         user_input += itr->first + "\n";
1221     }
1222     istringstream instream(user_input);
1223 
1224     const bool is_protein(true);
1225     CBlastInputSourceConfig iconfig(is_protein);
1226     iconfig.SetRetrieveSeqData(false);
1227     CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
1228 
1229     const size_t kNumQueries(accession_lengths.size());
1230     CScope scope(*CObjectManager::GetInstance());
1231     blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
1232     BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
1233     BOOST_REQUIRE(source->End() == true);
1234 
1235     {{
1236         blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
1237         BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
1238         BOOST_REQUIRE(source->End() == true);
1239     }}
1240 
1241     for (size_t i = 0; i < kNumQueries; i++) {
1242 
1243         const string& accession = accession_lengths[i].first;
1244         CNcbiOstrstream oss;
1245         blast::SSeqLoc& ssl = query_vector[i];
1246         oss << "Accession " << accession << " difference in lengths: "
1247             << ((TSeqPos)accession_lengths[i].second - 1) << " vs. "
1248             << ssl.seqloc->GetInt().GetTo();
1249         string msg = CNcbiOstrstreamToString(oss);
1250         BOOST_REQUIRE_MESSAGE(((TSeqPos)accession_lengths[i].second - 1) ==
1251                     ssl.seqloc->GetInt().GetTo(), msg);
1252         BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)ssl.seqloc->GetStrand());
1253         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1254         BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
1255     }
1256 
1257     /// Validate the data that would be retrieved by blast.cgi
1258     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
1259     BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
1260     scope.GetObjectManager().RevokeAllDataLoaders();
1261 }
1262 
// Reads the single GI listed in data/gi.txt without retrieving its sequence
// data and verifies the resulting Seq-loc (whole sequence, both strands, GI
// identifier) as well as the Bioseq built from it.
BOOST_AUTO_TEST_CASE(ReadSingleGi)
{
    CNcbiIfstream infile("data/gi.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behaviour; fail cleanly instead
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL(static_cast<int>(eNa_strand_both),
                        static_cast<int>(ssl.seqloc->GetInt().GetStrand()));

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL(static_cast<TSeqPos>(0), ssl.seqloc->GetInt().GetFrom());

    // The interval must span the whole sequence: [0, length-1]
    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    const TSeqPos length = 247249719;
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
    const TGi gi = GI_CONST(89161185);
    BOOST_REQUIRE_EQUAL(gi, ssl.seqloc->GetInt().GetId().GetGi());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL(static_cast<size_t>(1), bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(gi, b.GetId().front()->GetGi());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1310 
BOOST_AUTO_TEST_CASE(ReadMultipleGis)1311 BOOST_AUTO_TEST_CASE(ReadMultipleGis)
1312 {
1313     CNcbiIfstream infile("data/gis.txt");
1314     const bool is_protein(false);
1315     CBlastInputSourceConfig iconfig(is_protein);
1316     iconfig.SetRetrieveSeqData(false);
1317     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1318 
1319     vector< pair<TIntId, long> > gi_length;
1320     gi_length.push_back(make_pair(89161185, 247249719L));
1321     gi_length.push_back(make_pair(555, 624L));
1322     gi_length.push_back(make_pair(557, 489L));
1323 
1324     const size_t kNumQueries(gi_length.size());
1325     CScope scope(*CObjectManager::GetInstance());
1326     BOOST_REQUIRE(source->End() == false);
1327     blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
1328     BOOST_REQUIRE(source->End() == true);
1329 
1330     for (size_t i = 0; i < kNumQueries; i++) {
1331         blast::SSeqLoc ssl = seqs[i];
1332         BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
1333 
1334         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
1335         BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());
1336 
1337         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
1338         BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());
1339 
1340         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
1341         const TSeqPos length = gi_length[i].second;
1342         BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());
1343 
1344         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1345         BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == false);
1346         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
1347         const TIntId gi = gi_length[i].first;
1348         BOOST_REQUIRE_EQUAL(GI_FROM(TIntId, gi), ssl.seqloc->GetInt().GetId().GetGi());
1349 
1350         BOOST_REQUIRE(!ssl.mask);
1351     }
1352 
1353     /// Validate the data that would be retrieved by blast.cgi
1354     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
1355     BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
1356 
1357     CBioseq_set::TSeq_set::const_iterator itr = bioseqs->GetSeq_set().begin();
1358     CBioseq_set::TSeq_set::const_iterator end = bioseqs->GetSeq_set().end();
1359     for (size_t i = 0; i < kNumQueries; i++, ++itr) {
1360         BOOST_REQUIRE(itr != end);
1361         BOOST_REQUIRE((*itr)->IsSeq());
1362         const CBioseq& b = (*itr)->GetSeq();
1363         BOOST_REQUIRE(b.IsNa());
1364         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, b.GetId().front()->Which());
1365         BOOST_REQUIRE_EQUAL(GI_FROM(TIntId, gi_length[i].first), b.GetId().front()->GetGi());
1366         BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
1367         BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
1368         BOOST_REQUIRE_EQUAL((long)gi_length[i].second, (long)b.GetInst().GetLength());
1369     }
1370     scope.GetObjectManager().RevokeAllDataLoaders();
1371 }
1372 
1373 // This input file contains very short sequences (1-3 bases) which were product
1374 // of a sequencing machine
BOOST_AUTO_TEST_CASE(ReadMultipleSequencesFromSequencer)1375 BOOST_AUTO_TEST_CASE(ReadMultipleSequencesFromSequencer)
1376 {
1377     CNcbiIfstream infile("data/DF-1.txt");
1378     const bool is_protein(false);
1379     CBlastInputSourceConfig iconfig(is_protein);
1380     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1381     const size_t kNumQueries(96);
1382 
1383     BOOST_REQUIRE(source->End() == false);
1384 
1385     CScope scope(*CObjectManager::GetInstance());
1386     blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
1387     BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
1388     BOOST_REQUIRE(blast::IsLocalId(query_vector.front().seqloc->GetId()));
1389     scope.GetObjectManager().RevokeAllDataLoaders();
1390 }
1391 
// Same input as ReadMultipleSequencesFromSequencer, but with sequence ID
// parsing requested: the identifiers must come back as local IDs.
BOOST_AUTO_TEST_CASE(ReadMultipleSequencesFromSequencerParseLocalIds)
{
    const size_t kNumQueries = 96;
    const bool kIsProtein = false;
    const bool kParseID = true;

    CNcbiIfstream infile("data/DF-1.txt");
    SDataLoaderConfig dlconfig(kIsProtein);
    CBlastInputSourceConfig iconfig(dlconfig, objects::eNa_strand_other, false, kParseID);
    CRef<CBlastInput> input(s_DeclareBlastInput(infile, iconfig));

    BOOST_REQUIRE(!input->End());

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, queries.size());
    BOOST_REQUIRE(blast::IsLocalId(queries.front().seqloc->GetId()));

    // Check that the first three IDs went through.
    const char* const kExpectedIds[] = {
        "lcl|seq#474_A03_564_c_T3+40.ab1",
        "lcl|seq#474_A01_564_a_T3+40.ab1",
        "lcl|seq#474_A02_564_b_T3+40.ab1"
    };
    const size_t kNumExpectedIds = sizeof(kExpectedIds) / sizeof(kExpectedIds[0]);
    for (size_t idx = 0; idx < kNumExpectedIds; ++idx) {
        BOOST_REQUIRE_EQUAL(queries[idx].seqloc->GetId()->AsFastaString(),
                            string(kExpectedIds[idx]));
    }
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1414 
BOOST_AUTO_TEST_CASE(ReadSequenceWithlclID)
{
    // Reads data/localid.txt with kParseID passed to the configuration and
    // verifies the ID of the first query.
    const bool kIsProtein = false;
    const bool kParseID = true;
    const string kExpectedId("lcl|mylocalID555");

    CNcbiIfstream fasta_in("data/localid.txt");
    SDataLoaderConfig loader_config(kIsProtein);
    CBlastInputSourceConfig config(loader_config, objects::eNa_strand_other,
                                   false, kParseID);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE(blast::IsLocalId(queries.front().seqloc->GetId()));

    // Check that the local ID went through.
    BOOST_REQUIRE_EQUAL(queries[0].seqloc->GetId()->AsFastaString(),
                        kExpectedId);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1431 
// This input file contains several sequences in FASTA format, one of which
// is empty; reading it should proceed with no problems
BOOST_AUTO_TEST_CASE(ReadMultipleSequences_OneEmpty)
{
    const bool kIsProtein = false;
    const size_t kExpectedCount = 6;
    // Expected length of every query, in input order (one of them is 0)
    const TSeqPos kExpectedLengths[] = { 1920, 1, 130, 0, 2, 1552 };
    // Text that must show up in the warnings produced for empty sequences
    const string kEmptySeqWarning("following sequences had no sequence data:");

    CNcbiIfstream fasta_in("data/nt.multiple_queries.one.empty");
    CBlastInputSourceConfig config(kIsProtein);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));

    BOOST_REQUIRE( !input->End() );

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = input->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kExpectedCount, queries.size());
    BOOST_REQUIRE(input->End());

    // Every query must carry a local ID and have the expected length
    for (size_t idx = 0; idx < queries.size(); ++idx) {
        blast::SSeqLoc& query = queries[idx];
        BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));
        BOOST_REQUIRE_EQUAL(kExpectedLengths[idx],
                            sequence::GetLength(*query.seqloc, query.scope));
    }

    // The empty sequence must be reported when checking the TSeqLocVector...
    string messages;
    CheckForEmptySequences(queries, messages);
    BOOST_REQUIRE(messages.find(kEmptySeqWarning) != NPOS);

    // ... and also when checking the equivalent Bioseq-set
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(queries);
    messages.clear();
    CheckForEmptySequences(bioseqs, messages);
    BOOST_REQUIRE(messages.find(kEmptySeqWarning) != NPOS);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1470 
BOOST_AUTO_TEST_CASE(ReadMultipleTis)
{
    // Reads several trace identifiers (TIs) from data/tis.txt and verifies
    // the Seq-loc built for each of them: interval, strand, stop coordinate
    // and a general ("ti") Seq-id carrying the expected integer tag.
    CNcbiIfstream infile("data/tis.txt");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    iconfig.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test";
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(source->End() == false);

    // Expected (TI, sequence length) pairs, in input order
    vector< pair<int, long> > ti_lengths;
    ti_lengths.push_back(make_pair(12345, 657L));
    ti_lengths.push_back(make_pair(12347, 839L));
    ti_lengths.push_back(make_pair(12348, 658L));
    ti_lengths.push_back(make_pair(10000, 670L));

    const size_t kNumQueries(ti_lengths.size());
    blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
    BOOST_REQUIRE(source->End() == true);

    {{
        // Once the input is exhausted, asking again must return nothing
        blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
        BOOST_REQUIRE(source->End() == true);
    }}

    const string db("ti");
    for (size_t i = 0; i < kNumQueries; i++) {

        const blast::SSeqLoc& ssl = query_vector[i];
        BOOST_REQUIRE(ssl.seqloc->IsInt());
        const CSeq_interval& seqint = ssl.seqloc->GetInt();
        BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetStrand());
        BOOST_REQUIRE_EQUAL((TSeqPos)ti_lengths[i].second - 1, seqint.GetTo());

        BOOST_REQUIRE(seqint.IsSetId() == true);
        // Check the ID of the query being examined (not only the first one)
        BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
        BOOST_REQUIRE_EQUAL(CSeq_id::e_General, seqint.GetId().Which());
        BOOST_REQUIRE_EQUAL(db, seqint.GetId().GetGeneral().GetDb());
        // The tag must be an integer before its value can be read
        BOOST_REQUIRE(seqint.GetId().GetGeneral().GetTag().IsId());
        BOOST_REQUIRE_EQUAL(ti_lengths[i].first,
                    seqint.GetId().GetGeneral().GetTag().GetId());
        BOOST_REQUIRE(!ssl.mask);
    }

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
    BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
1523 
BOOST_AUTO_TEST_CASE(ReadSingleTi)
{
    // Reads a single trace identifier from data/ti.txt (with sequence data
    // retrieval enabled) and validates both the resulting Seq-loc and the
    // Bioseq derived from it.
    const bool kIsProtein = false;
    const TSeqPos kLength = 657;
    const int kTi = 12345;
    const string kDb("ti");

    CNcbiIfstream ti_in("data/ti.txt");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(true);
    config.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test";
    CRef<CBlastInput> input(s_DeclareBlastInput(ti_in, config));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE( !input->End() );
    blast::TSeqLocVector seqs = input->GetAllSeqLocs(scope);
    const blast::SSeqLoc& query = seqs.front();
    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());
    BOOST_REQUIRE( !blast::IsLocalId(query.seqloc->GetId()) );

    // Strand and coordinates of the interval
    const CSeq_interval& interval = query.seqloc->GetInt();
    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    // Seq-id of the interval: general ID in the "ti" database
    BOOST_REQUIRE(interval.IsSetId());
    const CSeq_id& interval_id = interval.GetId();
    BOOST_REQUIRE_EQUAL(CSeq_id::e_General, interval_id.Which());
    BOOST_REQUIRE_EQUAL(kDb, interval_id.GetGeneral().GetDb());
    BOOST_REQUIRE(interval_id.GetGeneral().GetTag().IsId());
    BOOST_REQUIRE_EQUAL(kTi, interval_id.GetGeneral().GetTag().GetId());

    BOOST_REQUIRE(!query.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());

    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(bioseq.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_General, bioseq.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(kDb, bioseq.GetId().back()->GetGeneral().GetDb());
    BOOST_REQUIRE_EQUAL(kTi,
                        bioseq.GetId().back()->GetGeneral().GetTag().GetId());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, bioseq.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(bioseq.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kLength, bioseq.GetInst().GetLength());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1576 
BOOST_AUTO_TEST_CASE(ReadAccessionsAndGisWithNewLines)1577 BOOST_AUTO_TEST_CASE(ReadAccessionsAndGisWithNewLines)
1578 {
1579     CNcbiIfstream infile("data/accgis_nl.txt");
1580     const bool is_protein(false);
1581     CBlastInputSourceConfig iconfig(is_protein);
1582     iconfig.SetRetrieveSeqData(false);
1583     iconfig.SetDataLoaderConfig().m_BlastDbName = "data/WGS_test" ;
1584 
1585     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
1586 
1587     vector< pair<string, long> > gi_accessions;
1588     gi_accessions.push_back(make_pair(string("89161215"), 111583154L));
1589     gi_accessions.push_back(make_pair(string("89161217"), 155407050L));
1590     gi_accessions.push_back(make_pair(string("89161219"), 11133097L));
1591     gi_accessions.push_back(make_pair(string("NC_000001"), 248956422L));
1592     gi_accessions.push_back(make_pair(string("NC_000010.9"), 135374737L));
1593     gi_accessions.push_back(make_pair(string("gnl|ti|12345"), 657L));
1594     gi_accessions.push_back(make_pair(string("NC_000011.8"), 134452384L));
1595     gi_accessions.push_back(make_pair(string("NC_000012.10"), 132349534L));
1596 
1597     const size_t kNumQueries(gi_accessions.size());
1598     CScope scope(*CObjectManager::GetInstance());
1599     blast::TSeqLocVector query_vector = source->GetAllSeqLocs(scope);
1600     BOOST_REQUIRE_EQUAL(kNumQueries, query_vector.size());
1601     BOOST_REQUIRE(source->End() == true);
1602 
1603     {{
1604         blast::TSeqLocVector cached_queries = source->GetAllSeqLocs(scope);
1605         BOOST_REQUIRE_EQUAL((size_t)0, (size_t)cached_queries.size());
1606         BOOST_REQUIRE(source->End() == true);
1607     }}
1608 
1609     for (size_t i = 0; i < kNumQueries; i++) {
1610 
1611         blast::SSeqLoc& ssl = query_vector[i];
1612         BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetStrand());
1613         BOOST_REQUIRE_EQUAL((TSeqPos)gi_accessions[i].second - 1,
1614                     ssl.seqloc->GetInt().GetTo());
1615 
1616         const string& id = gi_accessions[i].first;
1617 
1618         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1619         BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
1620         TGi gi = ZERO_GI;
1621         if ( (gi = NStr::StringToNumeric<TGi>(id, NStr::fConvErr_NoThrow)) != ZERO_GI) {
1622             BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
1623             BOOST_REQUIRE_EQUAL(gi, ssl.seqloc->GetInt().GetId().GetGi());
1624         } else if (i == 5) {
1625             BOOST_REQUIRE_EQUAL(CSeq_id::e_General,
1626                         ssl.seqloc->GetInt().GetId().Which());
1627             const string db("ti");
1628             BOOST_REQUIRE_EQUAL(db, ssl.seqloc->GetInt().GetId().GetGeneral().GetDb());
1629             BOOST_REQUIRE(ssl.seqloc->GetInt().GetId().GetGeneral().GetTag().IsId());
1630             const int ti(12345);
1631             BOOST_REQUIRE_EQUAL(ti,
1632                         ssl.seqloc->GetInt().GetId().
1633                         GetGeneral().GetTag().GetId());
1634         } else {
1635             BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, ssl.seqloc->GetInt().GetId().Which());
1636             string accession;
1637             int version;
1638 
1639             switch (i) {
1640             case 3: accession.assign("NC_000001"); version = 0; break;
1641             case 4: accession.assign("NC_000010"); version = 9; break;
1642             case 6: accession.assign("NC_000011"); version = 8; break;
1643             case 7: accession.assign("NC_000012"); version = 10; break;
1644             default: abort();
1645             }
1646 
1647             BOOST_REQUIRE_EQUAL(accession,
1648                         ssl.seqloc->GetInt().GetId().GetOther().GetAccession());
1649             if (version != 0) {
1650                 BOOST_REQUIRE_EQUAL(version,
1651                         ssl.seqloc->GetInt().GetId().GetOther().GetVersion());
1652             }
1653         }
1654         BOOST_REQUIRE(!ssl.mask);
1655 
1656     }
1657 
1658     /// Validate the data that would be retrieved by blast.cgi
1659     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(query_vector);
1660     BOOST_REQUIRE_EQUAL(kNumQueries, bioseqs->GetSeq_set().size());
1661     scope.GetObjectManager().RevokeAllDataLoaders();
1662 }
1663 
1664 static string*
s_FileContents2String(const char * file_name)1665 s_FileContents2String(const char* file_name)
1666 {
1667     CNcbiIfstream file(file_name);
1668     char buffer[2048] = { '\0' };
1669     auto_ptr<string> retval(new string);
1670 
1671     while (file.getline(buffer, sizeof(buffer))) {
1672         (*retval) += string(buffer) + "\n";
1673     }
1674 
1675     return retval.release();
1676 }
1677 
BOOST_AUTO_TEST_CASE(ReadAccessionNucleotideIntoBuffer_Single)
{
    // Loads data/accession.txt into a string, parses the query from that
    // string and validates the resulting Seq-loc and Bioseq.
    const bool kIsProtein = false;
    const TSeqPos kLength = 248956422;
    const string kAccession("NC_000001");

    auto_ptr<string> buffer(s_FileContents2String("data/accession.txt"));

    CRef<CObjectManager> objmgr(CObjectManager::GetInstance());
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRetrieveSeqData(false);
    CRef<CBlastInput> input(s_DeclareBlastInput(*buffer, config));

    CScope scope(*objmgr);
    BOOST_REQUIRE( !input->End() );
    blast::TSeqLocVector seqs = input->GetAllSeqLocs(scope);
    const blast::SSeqLoc& query = seqs.front();

    BOOST_REQUIRE(input->End());

    BOOST_REQUIRE(query.seqloc->IsInt());

    // Strand and coordinates of the interval
    const CSeq_interval& interval = query.seqloc->GetInt();
    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    // Seq-id of the interval: an "other" Seq-id with the expected accession
    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE( !blast::IsLocalId(query.seqloc->GetId()) );
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, interval.GetId().Which());
    BOOST_REQUIRE_EQUAL(kAccession,
                        interval.GetId().GetOther().GetAccession());

    BOOST_REQUIRE(!query.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());

    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(bioseq.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Other, bioseq.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(kAccession,
                        bioseq.GetId().front()->GetOther().GetAccession());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, bioseq.GetInst().GetRepr());
    BOOST_REQUIRE(CSeq_inst::IsNa(bioseq.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(kLength, bioseq.GetInst().GetLength());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1731 
BOOST_AUTO_TEST_CASE(ReadGiNuclWithFlankingSpacesIntoBuffer_Single)1732 BOOST_AUTO_TEST_CASE(ReadGiNuclWithFlankingSpacesIntoBuffer_Single)
1733 {
1734     // N.B.: the extra newline causes the CFastaReader to throw an EOF exception
1735     auto_ptr<string> user_input(new string("    1945386  \n "));
1736 
1737     CRef<CObjectManager> om(CObjectManager::GetInstance());
1738     CBlastInputSourceConfig iconfig(false);
1739     CRef<CBlastInput> source(s_DeclareBlastInput(*user_input, iconfig));
1740 
1741     CScope scope(*om);
1742     BOOST_REQUIRE(source->End() == false);
1743     blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
1744     BOOST_REQUIRE(source->End() == true);
1745     blast::SSeqLoc ssl = seqs.front();
1746 
1747     BOOST_REQUIRE(source->End() == true);
1748 
1749     BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
1750 
1751     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
1752     BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());
1753 
1754     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
1755     BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());
1756 
1757     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
1758     const TSeqPos length(2772);
1759     BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());
1760 
1761     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1762     BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
1763     const TGi gi = GI_CONST(1945386);
1764     const string gb_name = "HSU93236";
1765     const string gb_accession = "U93236";
1766     const int gb_version = 1;
1767     if ( !CSeq_id::PreferAccessionOverGi() ) {
1768         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
1769         BOOST_REQUIRE_EQUAL(gi, ssl.seqloc->GetInt().GetId().GetGi());
1770     }
1771     else {
1772         BOOST_REQUIRE_EQUAL(CSeq_id::e_Genbank, ssl.seqloc->GetInt().GetId().Which());
1773         BOOST_REQUIRE_EQUAL(gb_name, ssl.seqloc->GetInt().GetId().GetGenbank().GetName());
1774         BOOST_REQUIRE_EQUAL(gb_accession, ssl.seqloc->GetInt().GetId().GetGenbank().GetAccession());
1775         BOOST_REQUIRE_EQUAL(gb_version, ssl.seqloc->GetInt().GetId().GetGenbank().GetVersion());
1776     }
1777 
1778     BOOST_REQUIRE(!ssl.mask);
1779 
1780     /// Validate the data that would be retrieved by blast.cgi
1781     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
1782     BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
1783     BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
1784     const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
1785     BOOST_REQUIRE(b.IsNa());
1786 
1787     CRef<CSeq_id> id = FindBestChoice(b.GetId(), CSeq_id::BestRank);
1788     BOOST_REQUIRE(id.NotNull());
1789     if ( !CSeq_id::PreferAccessionOverGi() ) {
1790         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, id->Which());
1791         BOOST_REQUIRE_EQUAL(gi, id->GetGi());
1792     }
1793     else {
1794         BOOST_REQUIRE_EQUAL(CSeq_id::e_Genbank, id->Which());
1795         BOOST_REQUIRE_EQUAL(gb_name, id->GetGenbank().GetName());
1796         BOOST_REQUIRE_EQUAL(gb_accession, id->GetGenbank().GetAccession());
1797         BOOST_REQUIRE_EQUAL(gb_version, id->GetGenbank().GetVersion());
1798     }
1799     BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
1800     BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
1801     BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
1802     scope.GetObjectManager().RevokeAllDataLoaders();
1803 
1804 }
1805 
BOOST_AUTO_TEST_CASE(ReadAccessionNuclWithFlankingSpacesIntoBuffer_Single)1806 BOOST_AUTO_TEST_CASE(ReadAccessionNuclWithFlankingSpacesIntoBuffer_Single)
1807 {
1808     auto_ptr<string> user_input(new string("  X65215.1   "));
1809 
1810     CRef<CObjectManager> om(CObjectManager::GetInstance());
1811     CBlastInputSourceConfig iconfig(false);
1812     CBlastFastaInputSource fasta_source(*user_input, iconfig);
1813     CBlastInput source(&fasta_source);
1814 
1815     CScope scope(*om);
1816     BOOST_REQUIRE(source.End() == false);
1817     blast::TSeqLocVector seqs = source.GetAllSeqLocs(scope);
1818     blast::SSeqLoc ssl = seqs.front();
1819 
1820     BOOST_REQUIRE(source.End() == true);
1821 
1822     BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
1823 
1824     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
1825     BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());
1826 
1827     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
1828     BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());
1829 
1830     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
1831     const TSeqPos length(624);
1832     BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());
1833 
1834     BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
1835     BOOST_REQUIRE( !blast::IsLocalId(ssl.seqloc->GetId()) );
1836 
1837     const TGi gi = GI_CONST(555);
1838     const string accession = "X65215";
1839     const int version = 1;
1840     if ( !CSeq_id::PreferAccessionOverGi() ) {
1841         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, ssl.seqloc->GetInt().GetId().Which());
1842         BOOST_REQUIRE_EQUAL(gi, ssl.seqloc->GetInt().GetId().GetGi());
1843     }
1844     else {
1845         BOOST_REQUIRE_EQUAL(CSeq_id::e_Embl, ssl.seqloc->GetInt().GetId().Which());
1846         BOOST_REQUIRE_EQUAL(accession, ssl.seqloc->GetInt().GetId().GetEmbl().GetAccession());
1847         BOOST_REQUIRE_EQUAL(version, ssl.seqloc->GetInt().GetId().GetEmbl().GetVersion());
1848     }
1849 
1850     BOOST_REQUIRE(!ssl.mask);
1851 
1852     /// Validate the data that would be retrieved by blast.cgi
1853     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
1854     BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
1855     BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
1856     const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
1857     BOOST_REQUIRE(b.IsNa());
1858     bool found_gi = false, found_accession = false;
1859     ITERATE(CBioseq::TId, id, b.GetId()) {
1860         if ((*id)->Which() == CSeq_id::e_Gi) {
1861             BOOST_REQUIRE_EQUAL(GI_CONST(555), (*id)->GetGi());
1862             found_gi = true;
1863         } else if ((*id)->Which() == CSeq_id::e_Embl) {
1864             CNcbiOstrstream os;
1865             (*id)->GetEmbl().AsFastaString(os);
1866             const string fasta_acc = CNcbiOstrstreamToString(os);
1867             BOOST_REQUIRE(NStr::Find(fasta_acc, accession) != NPOS);
1868             found_accession = true;
1869         }
1870     }
1871     BOOST_REQUIRE(found_gi);
1872     BOOST_REQUIRE(found_accession);
1873     BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
1874     BOOST_REQUIRE(CSeq_inst::IsNa(b.GetInst().GetMol()));
1875     BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
1876     scope.GetObjectManager().RevokeAllDataLoaders();
1877 }
1878 
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineProteinIntoBuffer_Single)
{
    // Loads the protein FASTA file data/aa.129295 into a string, parses it
    // from that string and validates the resulting Seq-loc and Bioseq.
    const bool kIsProtein = true;
    const TSeqPos kLength = 232;

    auto_ptr<string> buffer(s_FileContents2String("data/aa.129295"));

    CRef<CObjectManager> objmgr(CObjectManager::GetInstance());
    CBlastInputSourceConfig config(kIsProtein);
    CBlastFastaInputSource fasta_source(*buffer, config);
    CBlastInput input(&fasta_source);

    CScope scope(*objmgr);
    BOOST_REQUIRE( !input.End() );
    blast::TSeqLocVector seqs = input.GetAllSeqLocs(scope);
    const blast::SSeqLoc& query = seqs.front();

    BOOST_REQUIRE(input.End());

    BOOST_REQUIRE(query.seqloc->IsInt());

    // Strand and coordinates of the interval
    const CSeq_interval& interval = query.seqloc->GetInt();
    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)0, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    // The query is expected to carry a local Seq-id
    BOOST_REQUIRE(interval.IsSetId());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, interval.GetId().Which());
    BOOST_REQUIRE(blast::IsLocalId(query.seqloc->GetId()));

    BOOST_REQUIRE(!query.mask);

    // Check the Bioseq built from the query
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(bioseq.IsAa());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, bioseq.GetInst().GetRepr());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eMol_aa, bioseq.GetInst().GetMol());
    BOOST_REQUIRE_EQUAL(kLength, bioseq.GetInst().GetLength());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1925 
BOOST_AUTO_TEST_CASE(RangeBoth)
{
    // Restricts the query to [kFrom, kTo] and verifies that the Seq-loc
    // reports exactly those coordinates.
    const bool kIsProtein = true;
    const TSeqPos kFrom = 50;
    const TSeqPos kTo = 100;

    CNcbiIfstream fasta_in("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRange().SetFrom(kFrom);
    config.SetRange().SetTo(kTo);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector batch = input->GetNextSeqLocBatch(scope);
    const blast::SSeqLoc& query = batch.front();

    // Coordinates as stored in the interval...
    BOOST_REQUIRE_EQUAL(kFrom, query.seqloc->GetInt().GetFrom());
    BOOST_REQUIRE_EQUAL(kTo, query.seqloc->GetInt().GetTo());
    // ... and as reported by the Seq-loc itself
    BOOST_REQUIRE_EQUAL(kFrom, query.seqloc->GetStart(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kTo, query.seqloc->GetStop(eExtreme_Positional));

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1946 
BOOST_AUTO_TEST_CASE(RangeStartOnly)
{
    // Only the start of the range is set; the stop is expected to be the
    // last position of the sequence (kLength - 1).
    const bool kIsProtein = true;
    const TSeqPos kFrom = 50;
    const TSeqPos kLength = 232;

    CNcbiIfstream fasta_in("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRange().SetFrom(kFrom);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));

    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector batch = input->GetNextSeqLocBatch(scope);
    const blast::SSeqLoc& query = batch.front();

    // Coordinates as stored in the interval...
    BOOST_REQUIRE_EQUAL(kFrom, query.seqloc->GetInt().GetFrom());
    BOOST_REQUIRE_EQUAL(kLength - 1, query.seqloc->GetInt().GetTo());
    // ... and as reported by the Seq-loc itself
    BOOST_REQUIRE_EQUAL(kFrom, query.seqloc->GetStart(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kLength - 1,
                        query.seqloc->GetStop(eExtreme_Positional));

    scope.GetObjectManager().RevokeAllDataLoaders();
}
1966 
BOOST_AUTO_TEST_CASE(RangeInvalid_FromGreaterThanTo)
{
    // A range whose start is greater than its stop must be rejected with a
    // CInputException carrying the eInvalidRange error code.
    CNcbiIfstream infile("data/aa.129295");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRange().SetFrom(100);
    iconfig.SetRange().SetTo(50);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool caught_exception(false);
    try { source->GetNextSeqLocBatch(scope); }
    catch (const CInputException& e) {
        string msg(e.what());
        BOOST_REQUIRE(msg.find("Invalid sequence range") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidRange, e.GetErrCode());
        caught_exception = true;
    }

    // Release the data loaders before the final check so that the cleanup
    // happens whether or not the expected exception was thrown
    scope.GetObjectManager().RevokeAllDataLoaders();
    BOOST_REQUIRE(caught_exception);
}
1987 
BOOST_AUTO_TEST_CASE(RangeInvalid_FromGreaterThanSequenceLength)
{
    // A range start of 1000 (the other range tests on this input expect a
    // last position of 231) must be rejected with a CInputException
    // carrying the eInvalidRange error code.
    CNcbiIfstream infile("data/aa.129295");
    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRange().SetFrom(1000);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool caught_exception(false);
    try { source->GetNextSeqLocBatch(scope); }
    catch (const CInputException& e) {
        string msg(e.what());
        BOOST_REQUIRE(msg.find("Invalid from coordinate") != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidRange, e.GetErrCode());
        caught_exception = true;
    }

    // Release the data loaders before the final check so that the cleanup
    // happens whether or not the expected exception was thrown
    scope.GetObjectManager().RevokeAllDataLoaders();
    BOOST_REQUIRE(caught_exception);
}
2007 
BOOST_AUTO_TEST_CASE(RangeInvalid_ToEqualThanSequenceLength)
{
    // The stop of the range equals kLength; the Seq-loc is expected to stop
    // at kLength - 1 instead.
    const bool kIsProtein = true;
    const TSeqPos kLength = 232;
    const TSeqPos kFrom = 10;

    CNcbiIfstream fasta_in("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRange().SetFrom(kFrom);
    config.SetRange().SetTo(kLength);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector batch = input->GetNextSeqLocBatch(scope);
    const blast::SSeqLoc& query = batch.front();

    BOOST_REQUIRE(query.seqloc->IsInt());

    const CSeq_interval& interval = query.seqloc->GetInt();
    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(kFrom, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2033 
BOOST_AUTO_TEST_CASE(RangeInvalid_ToGreaterThanSequenceLength)
{
    // The stop of the range is twice kLength; the Seq-loc is expected to
    // stop at kLength - 1 instead.
    const bool kIsProtein = true;
    const TSeqPos kLength = 232;
    const TSeqPos kFrom = 10;

    CNcbiIfstream fasta_in("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetRange().SetFrom(kFrom);
    config.SetRange().SetTo(kLength*2);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector batch = input->GetNextSeqLocBatch(scope);
    const blast::SSeqLoc& query = batch.front();

    BOOST_REQUIRE(query.seqloc->IsInt());

    const CSeq_interval& interval = query.seqloc->GetInt();
    BOOST_REQUIRE(interval.IsSetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)interval.GetStrand());

    BOOST_REQUIRE(interval.IsSetFrom());
    BOOST_REQUIRE_EQUAL(kFrom, interval.GetFrom());

    BOOST_REQUIRE(interval.IsSetTo());
    BOOST_REQUIRE_EQUAL(kLength - 1, interval.GetTo());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2059 
BOOST_AUTO_TEST_CASE(ParseDefline)
{
    // With SetBelieveDeflines(true) the query must not get a local ID; the
    // Seq-id expected instead depends on CSeq_id::PreferAccessionOverGi().
    const bool kIsProtein = true;
    const TGi kGi = GI_CONST(129295);
    const string kName = "OVAX_CHICK";
    const string kAccession = "P01013";
    const string kRelease = "reviewed";

    CNcbiIfstream fasta_in("data/aa.129295");
    CBlastInputSourceConfig config(kIsProtein);
    config.SetBelieveDeflines(true);
    CRef<CBlastInput> input(s_DeclareBlastInput(fasta_in, config));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector batch = input->GetNextSeqLocBatch(scope);
    const blast::SSeqLoc& query = batch.front();
    BOOST_REQUIRE( !blast::IsLocalId(query.seqloc->GetId()) );

    // The same ID must be reported by the Seq-loc and by its interval
    if (CSeq_id::PreferAccessionOverGi()) {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, query.seqloc->GetId()->Which());
        BOOST_REQUIRE_EQUAL(kName,
                            query.seqloc->GetId()->GetSwissprot().GetName());
        BOOST_REQUIRE_EQUAL(kAccession,
                            query.seqloc->GetId()->GetSwissprot().GetAccession());
        BOOST_REQUIRE_EQUAL(kRelease,
                            query.seqloc->GetId()->GetSwissprot().GetRelease());
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot,
                            query.seqloc->GetInt().GetId().Which());
        BOOST_REQUIRE_EQUAL(kName,
                            query.seqloc->GetInt().GetId().GetSwissprot().GetName());
        BOOST_REQUIRE_EQUAL(kAccession,
                            query.seqloc->GetInt().GetId().GetSwissprot().GetAccession());
        BOOST_REQUIRE_EQUAL(kRelease,
                            query.seqloc->GetInt().GetId().GetSwissprot().GetRelease());
    }
    else {
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, query.seqloc->GetId()->Which());
        BOOST_REQUIRE_EQUAL(kGi, query.seqloc->GetId()->GetGi());
        BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi,
                            query.seqloc->GetInt().GetId().Which());
        BOOST_REQUIRE_EQUAL(kGi, query.seqloc->GetInt().GetId().GetGi());
    }

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2094 
/// Requesting a nucleotide strand for protein input must be rejected: reading
/// the first batch is expected to throw CInputException with error code
/// eInvalidStrand, after which the input source reports End().
BOOST_AUTO_TEST_CASE(BadProtStrand)
{
    CNcbiIfstream infile("data/aa.129295");
    const bool kIsProtein = true;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetStrand(eNa_strand_both);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    bool exception_seen = false;
    try {
        blast::SSeqLoc ssl = source->GetNextSeqLocBatch(scope).front();
    }
    catch (const CInputException& e) {
        // Both the message text and the error code are part of the contract
        const string what(e.what());
        BOOST_REQUIRE(what.find("Cannot assign nucleotide strand to protein")
                      != NPOS);
        BOOST_REQUIRE_EQUAL(CInputException::eInvalidStrand, e.GetErrCode());
        exception_seen = true;
    }

    BOOST_REQUIRE(exception_seen);
    BOOST_REQUIRE(source->End());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2117 
/// Reads every query from data/nt.cat (two are expected) with both strands
/// requested, then checks strand, stop position and local-ID status of the
/// first and last query. A second GetAllSeqLocs() call on the exhausted
/// source must return nothing.
BOOST_AUTO_TEST_CASE(ReadFastaWithDeflineNucl_Multiple)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetStrand(eNa_strand_both);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    const size_t kExpectedQueries = 2;
    CScope scope(*CObjectManager::GetInstance());
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE_EQUAL(kExpectedQueries, queries.size());
    BOOST_REQUIRE(source->End());

    // Asking an exhausted source again yields an empty result
    {
        blast::TSeqLocVector leftovers = source->GetAllSeqLocs(scope);
        BOOST_REQUIRE_EQUAL(static_cast<size_t>(0),
                            static_cast<size_t>(leftovers.size()));
        BOOST_REQUIRE(source->End());
    }

    const int kBothStrands = static_cast<int>(eNa_strand_both);

    // First query
    const blast::SSeqLoc& first = queries.front();
    const TSeqPos kFirstLength = 646;
    BOOST_REQUIRE_EQUAL(kBothStrands,
                        static_cast<int>(first.seqloc->GetStrand()));
    BOOST_REQUIRE_EQUAL(kFirstLength - 1,
                        first.seqloc->GetStop(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kBothStrands,
                        static_cast<int>(first.seqloc->GetInt().GetStrand()));
    BOOST_REQUIRE_EQUAL(kFirstLength - 1, first.seqloc->GetInt().GetTo());
    BOOST_REQUIRE(blast::IsLocalId(first.seqloc->GetId()));

    // Last query; additionally it must not carry a mask
    const blast::SSeqLoc& last = queries.back();
    const TSeqPos kLastLength = 360;
    BOOST_REQUIRE_EQUAL(kBothStrands,
                        static_cast<int>(last.seqloc->GetStrand()));
    BOOST_REQUIRE_EQUAL(kLastLength - 1,
                        last.seqloc->GetStop(eExtreme_Positional));
    BOOST_REQUIRE_EQUAL(kBothStrands,
                        static_cast<int>(last.seqloc->GetInt().GetStrand()));
    BOOST_REQUIRE_EQUAL(kLastLength - 1, last.seqloc->GetInt().GetTo());
    BOOST_REQUIRE(blast::IsLocalId(last.seqloc->GetId()));
    BOOST_REQUIRE(!last.mask);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2158 
/// Reads data/nt.cat once with the plus strand and once with the minus strand
/// configured, and checks that every resulting query (Seq-loc and interval)
/// carries the configured strand and a local ID.
BOOST_AUTO_TEST_CASE(NuclStrand)
{
    const char* fname("data/nt.cat");
    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    CScope scope(*CObjectManager::GetInstance());

    // Plus strand first, then minus; each pass re-opens the input file and
    // reuses the same configuration object and scope.
    const ENa_strand kStrands[] = { eNa_strand_plus, eNa_strand_minus };
    const size_t kNumStrands = sizeof(kStrands) / sizeof(kStrands[0]);

    for (size_t pass = 0; pass < kNumStrands; ++pass) {
        const ENa_strand strand = kStrands[pass];

        CNcbiIfstream infile(fname);
        iconfig.SetStrand(strand);
        CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
        TSeqLocVector seqs = source->GetAllSeqLocs(scope);

        for (TSeqLocVector::const_iterator query = seqs.begin();
             query != seqs.end();  ++query) {
            BOOST_REQUIRE_EQUAL(static_cast<int>(strand),
                                static_cast<int>(query->seqloc->GetStrand()));
            BOOST_REQUIRE_EQUAL(static_cast<int>(strand),
                                static_cast<int>(query->seqloc->GetInt().GetStrand()));
            BOOST_REQUIRE(blast::IsLocalId(query->seqloc->GetId()));
        }
    }

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2199 
/// Checks lowercase masking as reported through a TSeqLocVector: after
/// verifying the configuration defaults (deflines not believed, no lowercase
/// mask, both strands), lowercase masking is enabled and data/nt.cat is read.
/// The first query must carry two plus-strand mask intervals (126-167 and
/// 330-356); the second query must carry a mask that is a null Seq-loc.
BOOST_AUTO_TEST_CASE(NuclLcaseMask_TSeqLocVector)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    BOOST_REQUIRE(iconfig.GetBelieveDeflines() == false);
    BOOST_REQUIRE(iconfig.GetLowercaseMask() == false);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)iconfig.GetStrand());
    iconfig.SetLowercaseMask(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector seqs = source->GetNextSeqLocBatch(scope);
    // The first and second elements are dereferenced below; make sure they
    // exist instead of walking an iterator past the end (undefined behavior).
    BOOST_REQUIRE(seqs.size() >= 2);
    blast::TSeqLocVector::iterator itr = seqs.begin();
    blast::SSeqLoc ssl = *itr;
    BOOST_REQUIRE(ssl.mask);
    BOOST_REQUIRE(ssl.mask->IsPacked_int());

    CPacked_seqint::Tdata masklocs = ssl.mask->GetPacked_int();
    BOOST_REQUIRE_EQUAL((size_t)2, masklocs.size());
    BOOST_REQUIRE_EQUAL((TSeqPos)126, masklocs.front()->GetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)167, masklocs.front()->GetTo());
    // any masks read from the file are expected to be in the plus strand
    BOOST_REQUIRE(masklocs.front()->CanGetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_plus, (int)masklocs.front()->GetStrand());

    BOOST_REQUIRE_EQUAL((TSeqPos)330, masklocs.back()->GetFrom());
    BOOST_REQUIRE_EQUAL((TSeqPos)356, masklocs.back()->GetTo());
    // any masks read from the file are expected to be in the plus strand
    BOOST_REQUIRE(masklocs.back()->CanGetStrand());
    BOOST_REQUIRE_EQUAL((int)eNa_strand_plus, (int)masklocs.back()->GetStrand());

    // Second query: a mask object is present, but it is a null Seq-loc
    ssl = *++itr;
    BOOST_REQUIRE(ssl.mask);
    BOOST_REQUIRE(ssl.mask->IsNull());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2237 
/// Checks lowercase masking as reported through a CBlastQueryVector: the
/// first of the two queries in data/nt.cat must carry four mask intervals
/// (126-167 and 330-356, each on the plus and then the minus strand); the
/// second query must have no masked regions.
BOOST_AUTO_TEST_CASE(NuclLcaseMask_BlastQueryVector)
{
    CNcbiIfstream infile("data/nt.cat");
    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetLowercaseMask(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    CRef<blast::CBlastQueryVector> seqs = source->GetNextSeqBatch(scope);
    BOOST_REQUIRE( !seqs->Empty() );
    BOOST_REQUIRE_EQUAL(static_cast<int>(2), static_cast<int>(seqs->size()));
    CRef<blast::CBlastSearchQuery> query = (*seqs)[0];
    BOOST_REQUIRE( !query->GetMaskedRegions().empty() );

    CRef<CPacked_seqint> masks =
        query->GetMaskedRegions().ConvertToCPacked_seqint();
    const CPacked_seqint::Tdata& masklocs = masks->Get();
    CPacked_seqint::Tdata::const_iterator itr = masks->Get().begin();

    // Note that for this case, even though the masks are also read from the
    // file (as in the unit test above), they are returned for both strands.
    struct SExpectedMask {
        TSeqPos    from;
        TSeqPos    to;
        ENa_strand strand;
    };
    const SExpectedMask kExpected[] = {
        { 126, 167, eNa_strand_plus  },
        { 126, 167, eNa_strand_minus },
        { 330, 356, eNa_strand_plus  },
        { 330, 356, eNa_strand_minus }
    };
    const size_t kNumExpected = sizeof(kExpected) / sizeof(kExpected[0]);
    BOOST_REQUIRE_EQUAL(kNumExpected, masklocs.size());

    for (size_t i = 0; i < kNumExpected; ++i, ++itr) {
        BOOST_REQUIRE_EQUAL(kExpected[i].from, (*itr)->GetFrom());
        BOOST_REQUIRE_EQUAL(kExpected[i].to, (*itr)->GetTo());
        BOOST_REQUIRE((*itr)->CanGetStrand());
        BOOST_REQUIRE_EQUAL(static_cast<int>(kExpected[i].strand),
                            static_cast<int>((*itr)->GetStrand()));
    }

    // Nothing beyond the expected intervals
    BOOST_REQUIRE(itr == masks->Get().end());

    query = (*seqs)[1];
    BOOST_REQUIRE(query->GetMaskedRegions().empty());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2289 
/// Reading all protein queries from data/aa.cat must exhaust the source and
/// produce exactly 19 Seq-locs.
BOOST_AUTO_TEST_CASE(MultiSeq)
{
    CNcbiIfstream infile("data/aa.cat");
    const bool kIsProtein = true;
    CBlastInputSourceConfig iconfig(kIsProtein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    const size_t kExpectedQueries = 19;
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(kExpectedQueries, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2303 
/// With a range of 50-100 configured, every query read from data/aa.cat must
/// report that range, both through the Seq-loc's positional start/stop and
/// through its interval's from/to.
BOOST_AUTO_TEST_CASE(MultiRange)
{
    CNcbiIfstream infile("data/aa.cat");
    const bool kIsProtein = true;
    const TSeqPos kFrom = 50;
    const TSeqPos kTo = 100;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRange().SetFrom(kFrom).SetTo(kTo);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    for (blast::TSeqLocVector::iterator query = queries.begin();
         query != queries.end();  ++query) {
        BOOST_REQUIRE_EQUAL(kFrom, query->seqloc->GetStart(eExtreme_Positional));
        BOOST_REQUIRE_EQUAL(kTo, query->seqloc->GetStop(eExtreme_Positional));
        BOOST_REQUIRE_EQUAL(kFrom, query->seqloc->GetInt().GetFrom());
        BOOST_REQUIRE_EQUAL(kTo, query->seqloc->GetInt().GetTo());
    }

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2324 
BOOST_AUTO_TEST_CASE(MultiBatch)2325 BOOST_AUTO_TEST_CASE(MultiBatch)
2326 {
2327     CNcbiIfstream infile("data/aa.cat");
2328     const bool is_protein(true);
2329     CBlastInputSourceConfig iconfig(is_protein);
2330     iconfig.SetBelieveDeflines(true);
2331     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig, 5000));
2332     CScope scope(*CObjectManager::GetInstance());
2333 
2334     TGi gi;
2335     blast::TSeqLocVector v;
2336 
2337     v = source->GetNextSeqLocBatch(scope);
2338     BOOST_REQUIRE_EQUAL((size_t)7, v.size());
2339     BOOST_REQUIRE_EQUAL((TSeqPos)530, v[0].seqloc->GetInt().GetTo());
2340     gi = GI_CONST(1346057);
2341     string name = "G11A_ORYSA";
2342     string accession = "P47997";
2343     string release = "reviewed";
2344     BOOST_REQUIRE( !blast::IsLocalId(v[0].seqloc->GetId()) );
2345     if ( !CSeq_id::PreferAccessionOverGi() ) {
2346         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetInt().GetId().Which());
2347         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetInt().GetId().GetGi());
2348         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetId()->Which());
2349         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetId()->GetGi());
2350     }
2351     else {
2352         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetInt().GetId().Which());
2353         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetInt().GetId().GetSwissprot().GetName());
2354         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetInt().GetId().GetSwissprot().GetAccession());
2355         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetInt().GetId().GetSwissprot().GetRelease());
2356         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetId()->Which());
2357         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetId()->GetSwissprot().GetName());
2358         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetId()->GetSwissprot().GetAccession());
2359         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetId()->GetSwissprot().GetRelease());
2360     }
2361 
2362     v = source->GetNextSeqLocBatch(scope);
2363     BOOST_REQUIRE_EQUAL((size_t)8, v.size());
2364     BOOST_REQUIRE_EQUAL((TSeqPos)445, v[0].seqloc->GetInt().GetTo());
2365     gi = GI_CONST(1170625);
2366     name = "KCC1_YEAST";
2367     accession = "P27466";
2368     release = "reviewed";
2369     BOOST_REQUIRE( !blast::IsLocalId(v[0].seqloc->GetId()) );
2370     if ( !CSeq_id::PreferAccessionOverGi() ) {
2371         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetInt().GetId().Which());
2372         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetInt().GetId().GetGi());
2373         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetId()->Which());
2374         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetId()->GetGi());
2375     }
2376     else {
2377         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetInt().GetId().Which());
2378         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetInt().GetId().GetSwissprot().GetName());
2379         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetInt().GetId().GetSwissprot().GetAccession());
2380         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetInt().GetId().GetSwissprot().GetRelease());
2381         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetId()->Which());
2382         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetId()->GetSwissprot().GetName());
2383         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetId()->GetSwissprot().GetAccession());
2384         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetId()->GetSwissprot().GetRelease());
2385     }
2386 
2387     v = source->GetNextSeqLocBatch(scope);
2388     BOOST_REQUIRE_EQUAL((size_t)4, v.size());
2389     BOOST_REQUIRE_EQUAL((TSeqPos)688, v[0].seqloc->GetInt().GetTo());
2390     gi = GI_CONST(114152);
2391     name = "ARK1_HUMAN";
2392     accession = "P25098";
2393     release = "reviewed";
2394     BOOST_REQUIRE( !blast::IsLocalId(v[0].seqloc->GetId()) );
2395     if ( !CSeq_id::PreferAccessionOverGi() ) {
2396         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetInt().GetId().Which());
2397         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetInt().GetId().GetGi());
2398         BOOST_REQUIRE_EQUAL(CSeq_id::e_Gi, v[0].seqloc->GetId()->Which());
2399         BOOST_REQUIRE_EQUAL(gi, v[0].seqloc->GetId()->GetGi());
2400     }
2401     else {
2402         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetInt().GetId().Which());
2403         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetInt().GetId().GetSwissprot().GetName());
2404         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetInt().GetId().GetSwissprot().GetAccession());
2405         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetInt().GetId().GetSwissprot().GetRelease());
2406         BOOST_REQUIRE_EQUAL(CSeq_id::e_Swissprot, v[0].seqloc->GetId()->Which());
2407         BOOST_REQUIRE_EQUAL(name, v[0].seqloc->GetId()->GetSwissprot().GetName());
2408         BOOST_REQUIRE_EQUAL(accession, v[0].seqloc->GetId()->GetSwissprot().GetAccession());
2409         BOOST_REQUIRE_EQUAL(release, v[0].seqloc->GetId()->GetSwissprot().GetRelease());
2410     }
2411 
2412     BOOST_REQUIRE(source->End());
2413     scope.GetObjectManager().RevokeAllDataLoaders();
2414 }
2415 
/// With the default configuration (deflines not believed), reading
/// data/tiny.fa must succeed, exhaust the source and yield exactly one query.
BOOST_AUTO_TEST_CASE(NoDeflineExpected)
{
    CNcbiIfstream infile("data/tiny.fa");
    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    const size_t kExpectedQueries = 1;
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(kExpectedQueries, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2429 
/// Same input as NoDeflineExpected (data/tiny.fa), but with "believe
/// deflines" switched on: reading the queries is now expected to throw a
/// CException.
BOOST_AUTO_TEST_CASE(NoDeflineUnexpected)
{
    CNcbiIfstream infile("data/tiny.fa");
    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetBelieveDeflines(true);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE_THROW(source->GetAllSeqLocs(scope), CException);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
/// Two GenBank identifiers, one per line (ABZI01000088 first), read without
/// retrieving sequence data: the source must not be at its end before
/// reading, must be at its end afterwards, and must yield two queries.
BOOST_AUTO_TEST_CASE(wb325_1)
{
    istringstream instream("gb|ABZI01000088\ngb|ABZN01000067");

    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(2u, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2459 
/// Same as wb325_1 with the two GenBank identifiers in the opposite order
/// (ABZN01000067 first): two queries are expected and the source must be
/// exhausted after reading.
BOOST_AUTO_TEST_CASE(wb325_2)
{
    istringstream instream("gb|ABZN01000067\ngb|ABZI01000088");

    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(2u, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2478 
/// A single GenBank identifier (ABZN01000067), read without retrieving
/// sequence data, must yield exactly one query and exhaust the source.
BOOST_AUTO_TEST_CASE(wb325_single1)
{
    // Alternative input that was tried here: "218001205"
    istringstream instream("gb|ABZN01000067");

    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(1u, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2498 
/// A single GenBank identifier (ABZI01000088), read without retrieving
/// sequence data, must yield exactly one query and exhaust the source.
BOOST_AUTO_TEST_CASE(wb325_single2)
{
    // Alternative input that was tried here: "217999527"
    istringstream instream("gb|ABZI01000088");

    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE( !source->End() );
    blast::TSeqLocVector queries = source->GetAllSeqLocs(scope);
    BOOST_REQUIRE(source->End());
    BOOST_REQUIRE_EQUAL(1u, queries.size());

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2518 
/// Reads a single PDB identifier written as "<mol>_<chain>" ("1QCF_A") as
/// protein input without retrieving sequence data, and checks the resulting
/// Seq-loc: whole-sequence interval 0..453, strand set to unknown, PDB Seq-id
/// with molecule "1QCF", and no mask. The Bioseq produced by
/// TSeqLocVector2Bioseqs() is then checked to be a raw, non-nucleotide
/// sequence of length 454 carrying the same PDB identifier.
BOOST_AUTO_TEST_CASE(ReadSinglePdb)
{
    string pdb_mol("1QCF");
    string pdb_chain("A");
    string pdb(pdb_mol + '_' + pdb_chain);
    istringstream instream(pdb);

    const bool is_protein(true);
    CBlastInputSourceConfig iconfig(is_protein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE(source->End() == false);
    blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behavior; report a regular
    // test failure instead if the identifier produced no query.
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();

    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    const TSeqPos length(454);
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, ssl.seqloc->GetInt().GetId().Which());

    BOOST_REQUIRE_EQUAL(pdb_mol, ssl.seqloc->GetInt().GetId().GetPdb().GetMol().Get());

    BOOST_REQUIRE(!ssl.mask);

    /// Validate the data that would be retrieved by blast.cgi
    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
    BOOST_REQUIRE(! b.IsNa());
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, b.GetId().front()->Which());
    BOOST_REQUIRE_EQUAL(pdb_mol, b.GetId().front()->GetPdb().GetMol().Get());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
    BOOST_REQUIRE(! CSeq_inst::IsNa(b.GetInst().GetMol()));
    BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2570 
/// An identifier that resolves to a record without sequence (here the WGS
/// master NZ_ABFD00000000.2) must make GetAllSeqLocs() throw
/// CInputException.
BOOST_AUTO_TEST_CASE(ThrowOnEmptySequence)
{
    // WGS master record: contains no sequence
    const string kWgsMaster("NZ_ABFD00000000.2");
    istringstream instream(kWgsMaster);

    const bool kIsProtein = false;
    CBlastInputSourceConfig iconfig(kIsProtein);
    iconfig.SetRetrieveSeqData(false);
    CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
    CScope scope(*CObjectManager::GetInstance());

    BOOST_REQUIRE_THROW(source->GetAllSeqLocs(scope), CInputException);

    scope.GetObjectManager().RevokeAllDataLoaders();
}
2584 
BOOST_AUTO_TEST_CASE(FetchSraID)2585 BOOST_AUTO_TEST_CASE(FetchSraID)
2586 {
2587     CNcbiIfstream infile("data/sra_seqid.txt");
2588     const bool is_protein(false);
2589     SDataLoaderConfig dlconfig(is_protein,
2590                                SDataLoaderConfig::eUseGenbankDataLoader);
2591     CBlastInputSourceConfig iconfig(dlconfig);
2592     CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));
2593     CScope scope(*CObjectManager::GetInstance());
2594 
2595     TSeqLocVector seqs = source->GetAllSeqLocs(scope);
2596     blast::SSeqLoc ssl = seqs.front();
2597     BOOST_CHECK(source->End() == true);
2598 
2599     // Obtained by running
2600     // fastq-dump SRR066117 -N 18823 -X 18823 --fasta 80 --split-spot --skip-technical  --minReadLen 6 --clip
2601     const string kSeqData =
2602         "AGCACCACGACTGCTAACCGTAACGCCAGGTGTATAACCTAATGCTTCTTTACAGACTGAAATTGATGCATCTGCATCTC"
2603         "TTCATTTGTCACAACCGAAATA";
2604 
2605     BOOST_CHECK(ssl.seqloc->IsInt());
2606     BOOST_REQUIRE(ssl.seqloc->GetId()->IsGeneral());
2607     BOOST_REQUIRE_EQUAL(CDbtag::eDbtagType_SRA,
2608                         ssl.seqloc->GetId()->GetGeneral().GetType());
2609 
2610     BOOST_CHECK(ssl.seqloc->GetInt().IsSetFrom() == true);
2611     BOOST_CHECK_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());
2612 
2613     BOOST_CHECK(ssl.seqloc->GetInt().IsSetTo() == true);
2614     BOOST_CHECK_EQUAL(kSeqData.size()-1, ssl.seqloc->GetInt().GetTo());
2615 
2616     const CSeq_id * seqid = ssl.seqloc->GetId();
2617     CBioseq_Handle bh = scope.GetBioseqHandle(*seqid);
2618     CSeqVector sv = bh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
2619 
2620     BOOST_CHECK_EQUAL(kSeqData.size(), sv.size());
2621     for (size_t i = 0; i < std::min((TSeqPos)kSeqData.size(), sv.size()); i++) {
2622         CNcbiOstrstream oss;
2623         oss << "Base number " << i+1 << " differs: got '"
2624             << (char)sv[i] << "', expected '" << kSeqData[i]
2625             << "'";
2626         string msg = CNcbiOstrstreamToString(oss);
2627         BOOST_CHECK_MESSAGE((char)sv[i] == kSeqData[i], msg);
2628         BOOST_CHECK_NE('-', (char)sv[i]);
2629     }
2630 
2631     CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
2632     const CBioseq& bioseq = bioseqs->GetSeq_set().front()->GetSeq();
2633     const CSeq_inst& inst = bioseq.GetInst();
2634     BOOST_CHECK_EQUAL(inst.GetLength(), kSeqData.size());
2635     BOOST_REQUIRE(inst.IsSetSeq_data());
2636     const CSeq_data& seq_data = inst.GetSeq_data();
2637     BOOST_REQUIRE(seq_data.IsIupacna());
2638     const string& seq = seq_data.GetIupacna().Get();
2639     for (size_t i = 0; i < seq.size(); i++) {
2640         CNcbiOstrstream oss;
2641         oss << "Base number " << i+1 << " differs: got '"
2642             << (char)sv[i] << "', expected '" << kSeqData[i]
2643             << "'";
2644         string msg = CNcbiOstrstreamToString(oss);
2645         BOOST_CHECK_MESSAGE((char)sv[i] == kSeqData[i], msg);
2646         BOOST_CHECK_NE('-', (char)seq[i]);
2647     }
2648     scope.GetObjectManager().RevokeAllDataLoaders();
2649 }
2650 
BOOST_AUTO_TEST_CASE(ReadSinglePdb_InDifferentFormats)2651 BOOST_AUTO_TEST_CASE(ReadSinglePdb_InDifferentFormats)
2652 {
2653     string pdb_mol("1IQR");
2654     string pdb_chain("A");
2655 
2656     for (int i = 0; i < 2; i++) {
2657 
2658         string pdb;
2659         if (i == 0) {
2660             pdb.assign(pdb_mol + '|' + pdb_chain);
2661         } else {
2662             pdb.assign(pdb_mol + "_" + pdb_chain);
2663         }
2664         istringstream instream(pdb);
2665 
2666         const bool is_protein(true);
2667         CBlastInputSourceConfig iconfig(is_protein);
2668         iconfig.SetRetrieveSeqData(false);
2669         CRef<CBlastInput> source(s_DeclareBlastInput(instream, iconfig));
2670         CScope scope(*CObjectManager::GetInstance());
2671 
2672         BOOST_REQUIRE(source->End() == false);
2673         blast::TSeqLocVector seqs = source->GetAllSeqLocs(scope);
2674         blast::SSeqLoc ssl = seqs.front();
2675         BOOST_REQUIRE(source->End() == true);
2676 
2677         BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
2678 
2679         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
2680         BOOST_REQUIRE_EQUAL((int)eNa_strand_unknown, (int)ssl.seqloc->GetInt().GetStrand());
2681 
2682         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
2683         BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());
2684 
2685         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
2686         const TSeqPos length(420);
2687         BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());
2688 
2689         BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
2690         BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, ssl.seqloc->GetInt().GetId().Which());
2691 
2692         BOOST_REQUIRE_EQUAL(pdb_mol,
2693                     ssl.seqloc->GetInt().GetId().GetPdb().GetMol().Get());
2694 
2695         BOOST_REQUIRE(!ssl.mask);
2696 
2697         /// Validate the data that would be retrieved by blast.cgi
2698         CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
2699         BOOST_REQUIRE_EQUAL((size_t)1, bioseqs->GetSeq_set().size());
2700         BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
2701         const CBioseq& b = bioseqs->GetSeq_set().front()->GetSeq();
2702         BOOST_REQUIRE(! b.IsNa());
2703         BOOST_REQUIRE_EQUAL(CSeq_id::e_Pdb, b.GetId().front()->Which());
2704         BOOST_REQUIRE_EQUAL(pdb_mol, b.GetId().front()->GetPdb().GetMol().Get());
2705         BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw, b.GetInst().GetRepr());
2706         BOOST_REQUIRE(! CSeq_inst::IsNa(b.GetInst().GetMol()));
2707         BOOST_REQUIRE_EQUAL(length, b.GetInst().GetLength());
2708         scope.GetObjectManager().RevokeAllDataLoaders();
2709     }
2710 
2711 }
2712 
/// Reads data/nucl_w_n.fsa, a nucleotide sequence of length 682 containing an
/// 'N', after setting the BLASTINPUT_GEN_DELTA_SEQ environment variable to
/// the empty string. The query must be a whole-sequence, both-strand interval
/// with a local ID and no mask, and the Bioseq produced by
/// TSeqLocVector2Bioseqs() must have a raw (not delta) representation.
BOOST_AUTO_TEST_CASE(RawFastaNoSpaces_UpperCaseWithN)
{
    CNcbiEnvironment().Set("BLASTINPUT_GEN_DELTA_SEQ", kEmptyStr);
    // this has length 682 and contains an 'N' which without the
    // CFastaReader::fNoSplit flag, produces a delta sequence
    CNcbiIfstream infile("data/nucl_w_n.fsa");
    const bool is_protein(false);
    CBlastInputSourceConfig iconfig(is_protein);
    CRef<CBlastInput> source(s_DeclareBlastInput(infile, iconfig));

    CScope scope(*CObjectManager::GetInstance());
    BOOST_REQUIRE(source->End() == false);
    TSeqLocVector seqs = source->GetAllSeqLocs(scope);
    // front() on an empty vector is undefined behavior; fail cleanly instead
    BOOST_REQUIRE(!seqs.empty());
    blast::SSeqLoc ssl = seqs.front();
    BOOST_REQUIRE(source->End() == true);

    BOOST_REQUIRE(ssl.seqloc->IsInt() == true);
    BOOST_REQUIRE(blast::IsLocalId(ssl.seqloc->GetId()) == true);

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetStrand() == true);
    BOOST_REQUIRE_EQUAL((int)eNa_strand_both, (int)ssl.seqloc->GetInt().GetStrand());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetFrom() == true);
    BOOST_REQUIRE_EQUAL((TSeqPos)0, ssl.seqloc->GetInt().GetFrom());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetTo() == true);
    const TSeqPos length(682);
    BOOST_REQUIRE_EQUAL(length-1, ssl.seqloc->GetInt().GetTo());

    BOOST_REQUIRE(ssl.seqloc->GetInt().IsSetId() == true);
    BOOST_REQUIRE_EQUAL(CSeq_id::e_Local, ssl.seqloc->GetInt().GetId().Which());
    BOOST_REQUIRE(!ssl.mask);

    CRef<CBioseq_set> bioseqs = TSeqLocVector2Bioseqs(seqs);
    BOOST_REQUIRE(bioseqs->CanGetSeq_set());
    // The set being gettable does not imply it has entries; check before
    // taking front().
    BOOST_REQUIRE(!bioseqs->GetSeq_set().empty());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->IsSeq());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->GetSeq().CanGetInst());
    BOOST_REQUIRE(bioseqs->GetSeq_set().front()->GetSeq().GetInst().CanGetRepr());
    BOOST_REQUIRE_EQUAL(CSeq_inst::eRepr_raw,
                bioseqs->GetSeq_set().front()->GetSeq().GetInst().GetRepr());
    scope.GetObjectManager().RevokeAllDataLoaders();
}
2755 
/// No-op that accepts any value by const reference. Used by the tests below
/// to "use" a variable so that compilers do not warn about it.
template <class TValue>
inline void s_Ignore(const TValue& /* unused */)
{
}
2758 
/// ParseSequenceRange() must reject "4-4" by throwing CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_EmptyRange)
{
    TSeqRange parsed;
    BOOST_REQUIRE_THROW(parsed = ParseSequenceRange("4-4"), CBlastException);
    s_Ignore(parsed); /* to pacify compiler warnings */
}
2765 
/// The input "0-4" must make ParseSequenceRange raise CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_0BasedRange)
{
    TSeqRange range;
    BOOST_REQUIRE_THROW(range = ParseSequenceRange("0-4"), CBlastException);
    // Reference the variable so the compiler does not warn about it
    s_Ignore(range);
}
2772 
/// The input "3,4" (comma instead of a dash) must make ParseSequenceRange
/// raise CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_InvalidDelimiter)
{
    TSeqRange range;
    BOOST_REQUIRE_THROW(range = ParseSequenceRange("3,4"), CBlastException);
    // Reference the variable so the compiler does not warn about it
    s_Ignore(range);
}
2779 
/// Inputs that lack one or both range endpoints ("3", "3-", "-3") must each
/// make ParseSequenceRange raise CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_IncompleteRange)
{
    TSeqRange range;

    // single number, no delimiter
    BOOST_REQUIRE_THROW(range = ParseSequenceRange("3"), CBlastException);
    // nothing after the delimiter
    BOOST_REQUIRE_THROW(range = ParseSequenceRange("3-"), CBlastException);
    // nothing before the delimiter
    BOOST_REQUIRE_THROW(range = ParseSequenceRange("-3"), CBlastException);

    // Reference the variable so the compiler does not warn about it
    s_Ignore(range);
}
2790 
/// The inputs "9-4", "-4-2" and "-4-9" must each make ParseSequenceRange
/// raise CBlastException.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_InvalidRange) {
    TSeqRange r;
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("9-4"),
                        CBlastException);
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("-4-2"),
                        CBlastException);
    BOOST_REQUIRE_THROW(r = ParseSequenceRange("-4-9"),
                        CBlastException);
    // Same treatment as the sibling ParseSequenceRange tests: r is only ever
    // assigned inside the macros, so reference it explicitly.
    s_Ignore(r); /* to pacify compiler warnings */
}
2800 
/// Parsing "1-10" must yield a range with From == 0, To == 9 and
/// ToOpen == 10.
BOOST_AUTO_TEST_CASE(ParseSequenceRange_1BasedRange)
{
    const TSeqRange range = ParseSequenceRange("1-10");

    BOOST_REQUIRE_EQUAL(0U, range.GetFrom());
    BOOST_REQUIRE_EQUAL(9U, range.GetTo());
    BOOST_REQUIRE_EQUAL(10U, range.GetToOpen());
}
2807 
/// Pins the values returned by GetQueryBatchSize for eBlastn: 100000 with
/// the default arguments and 10000 when called with (eBlastn, false, true).
BOOST_AUTO_TEST_CASE(CheckQueryBatchSize)
{
    // default arguments
    BOOST_REQUIRE_EQUAL(100000, GetQueryBatchSize(eBlastn));

    // second argument false, third argument true
    BOOST_REQUIRE_EQUAL(10000, GetQueryBatchSize(eBlastn, false, true));
}
2812 
2813 // Test case for WB-1304: save GI (i.e.: best ranked Seq-id) if available
BOOST_AUTO_TEST_CASE(FetchGiFromAccessionInput)2814 BOOST_AUTO_TEST_CASE(FetchGiFromAccessionInput)
2815 {
2816     const CSeq_id id(CSeq_id::PreferAccessionOverGi() ?
2817         "ref|NT_026437.13|" : "gi|568802206");
2818     const string input("NT_026437.13");
2819     typedef vector<pair<SDataLoaderConfig::EConfigOpts, string> > TVecOpts;
2820     TVecOpts opts;
2821     opts.push_back(TVecOpts::value_type(SDataLoaderConfig::eUseGenbankDataLoader, "genbank"));
2822     opts.push_back(TVecOpts::value_type(SDataLoaderConfig::eUseBlastDbDataLoader, "BLASTDB"));
2823     ITERATE(TVecOpts, config, opts) {
2824         CAutoNcbiConfigFile acf(config->first);
2825         blast::SDataLoaderConfig dlconfig(false);
2826         if(config->second == "BLASTDB") {
2827             dlconfig.m_BlastDbName = "refseq_genomic";
2828         }
2829         dlconfig.OptimizeForWholeLargeSequenceRetrieval();
2830         blast::CBlastInputSourceConfig input_config(dlconfig);
2831         // this needs to be omitted for this test to work
2832         //input_config.SetRetrieveSeqData(false);
2833         CBlastFastaInputSource fasta_input(input, input_config);
2834         CBlastInput blast_input(&fasta_input);
2835         //CBlastScopeSourceWrapper scope_source(dlconfig);
2836         CRef<CScope> scope = CBlastScopeSource(dlconfig).NewScope();
2837         TSeqLocVector query_loc = blast_input.GetAllSeqLocs(*scope);
2838         BOOST_REQUIRE_EQUAL(1U, query_loc.size());
2839         string fasta_id = id.AsFastaString();
2840         string fasta_query = query_loc[0].seqloc->GetId()->AsFastaString();
2841         if (fasta_id != fasta_query) {
2842             BOOST_CHECK_EQUAL(fasta_id, fasta_query);
2843             BOOST_CHECK_MESSAGE(fasta_id == fasta_query,
2844                 "Failed using " + config->second + " data loader");
2845         }
2846         scope->GetObjectManager().RevokeAllDataLoaders();
2847     }
2848 
2849 }
2850 
2851 
2852 BOOST_AUTO_TEST_SUITE_END() // end of blastinput test suite
2853 
2854 
BOOST_AUTO_TEST_SUITE(short_reads)2855 BOOST_AUTO_TEST_SUITE(short_reads)
2856 
2857 static int s_GetSegmentFlags(const CBioseq& bioseq)
2858 {
2859     int retval = 0;
2860 
2861     BOOST_REQUIRE(bioseq.IsSetDescr());
2862     for (auto desc : bioseq.GetDescr().Get()) {
2863         if (desc->Which() == CSeqdesc::e_User) {
2864 
2865             if (!desc->GetUser().IsSetType() ||
2866                 !desc->GetUser().GetType().IsStr() ||
2867                 desc->GetUser().GetType().GetStr() != "Mapping") {
2868                 continue;
2869             }
2870 
2871             BOOST_REQUIRE(desc->GetUser().HasField("has_pair"));
2872             const CUser_field& field = desc->GetUser().GetField("has_pair");
2873             BOOST_REQUIRE(field.GetData().IsInt());
2874 
2875             retval = field.GetData().GetInt();
2876         }
2877     }
2878 
2879     return retval;
2880 }
2881 
s_GetSequenceId(const CBioseq & bioseq)2882 static string s_GetSequenceId(const CBioseq& bioseq)
2883 {
2884     string retval;
2885     if (bioseq.IsSetDescr()) {
2886         for (auto it: bioseq.GetDescr().Get()) {
2887             if (it->IsTitle()) {
2888                 vector<string> tokens;
2889                 NStr::Split(it->GetTitle(), " ", tokens);
2890                 retval = (string)"lcl|" + tokens[0];
2891             }
2892         }
2893     }
2894 
2895     if (retval.empty()) {
2896         retval = bioseq.GetFirstId()->AsFastaString();
2897     }
2898 
2899     return retval;
2900 }
2901 
2902 
/// Reads data/paired_reads.fa through CShortReadFastaInputSource (eFasta,
/// third argument true) and checks that each of the six sequences carries
/// the segment flag listed in the expectation table.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFasta)
{
    CNcbiIfstream fasta_stream("data/paired_reads.fa");
    BOOST_REQUIRE(fasta_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(fasta_stream,
                                     CShortReadFastaInputSource::eFasta,
                                     true);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
2941 
/// Reads data/paired_reads_1.fa and data/paired_reads_2.fa through the
/// two-stream CShortReadFastaInputSource constructor (eFasta) and checks
/// that each of the six sequences carries the segment flag listed in the
/// expectation table.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoFastaFiles)
{
    CNcbiIfstream first_stream("data/paired_reads_1.fa");
    CNcbiIfstream second_stream("data/paired_reads_2.fa");
    BOOST_REQUIRE(first_stream);
    BOOST_REQUIRE(second_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(first_stream, second_stream,
                                     CShortReadFastaInputSource::eFasta);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
2981 
/// Reads data/paired_reads.fa through CShortReadFastaInputSource (eFasta,
/// third argument false) and checks that six sequences are returned and that
/// every sequence that has descriptors reports a segment flag of 0.
BOOST_AUTO_TEST_CASE(TestSingleReadsFromFasta) {

    CNcbiIfstream istr("data/paired_reads.fa");
    // Fail early with a clear message if the data file could not be opened,
    // as the other short-read tests do.
    BOOST_REQUIRE(istr);
    CShortReadFastaInputSource input_source(istr,
                                     CShortReadFastaInputSource::eFasta,
                                     false);

    CBlastInputOMF input(&input_source, 1000);
    CRef<CBioseq_set> queries(new CBioseq_set);
    input.GetNextSeqBatch(*queries);
    BOOST_REQUIRE_EQUAL(queries->GetSeq_set().size(), 6u);

    size_t count = 0;
    for (const auto& it : queries->GetSeq_set()) {
        // s_GetSegmentFlags requires descriptors, so sequences without any
        // are only counted.
        if (it->GetSeq().IsSetDescr()) {

            string id = s_GetSequenceId(it->GetSeq());
            int flags = s_GetSegmentFlags(it->GetSeq());
            int expected = 0;

            BOOST_REQUIRE_MESSAGE(flags == expected,
                                  (string)"Segment flag for " +
                                  id + " is different from expected " +
                                  NStr::IntToString(flags) + " != " +
                                  NStr::IntToString(expected));
        }
        count++;
    }

    BOOST_REQUIRE_EQUAL(6u, count);
}
3013 
/// Reads data/paired_reads.fastq through CShortReadFastaInputSource (eFastq,
/// third argument true) and checks that each of the six sequences carries
/// the segment flag listed in the expectation table.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFastQ)
{
    CNcbiIfstream fastq_stream("data/paired_reads.fastq");
    BOOST_REQUIRE(fastq_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(fastq_stream,
                                     CShortReadFastaInputSource::eFastq,
                                     true);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3051 
/// Reads data/paired_reads_1.fastq and data/paired_reads_2.fastq through the
/// two-stream CShortReadFastaInputSource constructor (eFastq) and checks
/// that each of the six sequences carries the segment flag listed in the
/// expectation table.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoFastQFiles)
{
    CNcbiIfstream first_stream("data/paired_reads_1.fastq");
    CNcbiIfstream second_stream("data/paired_reads_2.fastq");
    BOOST_REQUIRE(first_stream);
    BOOST_REQUIRE(second_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(first_stream, second_stream,
                                     CShortReadFastaInputSource::eFastq);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3090 
3091 
/// Reads data/paired_reads.asn through CASN1InputSourceOMF (constructed with
/// false, true) and checks that each of the six sequences carries the
/// segment flag listed in the expectation table. Sequences are identified
/// by their first Seq-id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromASN1)
{
    CNcbiIfstream asn_stream("data/paired_reads.asn");
    BOOST_REQUIRE(asn_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CASN1InputSourceOMF reads(asn_stream, false, true);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = bioseq.GetFirstId()->AsFastaString();
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3126 
/// Reads data/paired_reads_1.asn and data/paired_reads_2.asn through the
/// two-stream CASN1InputSourceOMF constructor (third argument false) and
/// checks that each of the six sequences carries the segment flag listed in
/// the expectation table. Sequences are identified by their first Seq-id.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromTwoASN1Files)
{
    CNcbiIfstream first_stream("data/paired_reads_1.asn");
    CNcbiIfstream second_stream("data/paired_reads_2.asn");
    BOOST_REQUIRE(first_stream);
    BOOST_REQUIRE(second_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|pair1", eFirstSegment},
        {"lcl|pair2", eLastSegment},
        {"lcl|incomplete1.1", eFirstSegment},
        {"lcl|incomplete1.2", eLastSegment},
        {"lcl|incomplete2.1", eFirstSegment},
        {"lcl|incomplete2.2", eLastSegment},
    };

    CASN1InputSourceOMF reads(first_stream, second_stream, false);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    // NOTE(review): an earlier comment here said that two of the six input
    // sequences should be rejected in screening, but the assertions expect
    // all six to be present -- confirm which is intended.
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = bioseq.GetFirstId()->AsFastaString();
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3165 
3166 
/// Reads data/paired_reads.fastc through CShortReadFastaInputSource (eFastc,
/// third argument true) and checks that each of the six sequences carries
/// the segment flag listed in the expectation table.
BOOST_AUTO_TEST_CASE(TestPairedReadsFromFastC)
{
    CNcbiIfstream fastc_stream("data/paired_reads.fastc");
    BOOST_REQUIRE(fastc_stream);

    // Expected segment flag for every sequence id in the input
    unordered_map<string, int> expected_flags = {
        {"lcl|read1.1", eFirstSegment},
        {"lcl|read1.2", eLastSegment},
        {"lcl|read2.1", eFirstSegment},
        {"lcl|read2.2", eLastSegment},
        {"lcl|read3.1", eFirstSegment},
        {"lcl|read3.2", eLastSegment},
    };

    CShortReadFastaInputSource reads(fastc_stream,
                                     CShortReadFastaInputSource::eFastc,
                                     true);
    CBlastInputOMF input(&reads, 1000);

    CRef<CBioseq_set> batch(new CBioseq_set);
    input.GetNextSeqBatch(*batch);
    BOOST_REQUIRE_EQUAL(batch->GetSeq_set().size(), 6u);

    size_t num_checked = 0;
    for (const auto& entry : batch->GetSeq_set()) {
        const CBioseq& bioseq = entry->GetSeq();
        const string id = s_GetSequenceId(bioseq);
        const int flags = s_GetSegmentFlags(bioseq);
        const int expected = expected_flags.at(id);

        BOOST_REQUIRE_MESSAGE(flags == expected,
                (string)"Segment flag for " +
                id + " is different from expected " +
                NStr::IntToString(flags) + " != " +
                NStr::IntToString(expected));
        ++num_checked;
    }

    // Every expected sequence must have been visited
    BOOST_REQUIRE_EQUAL(expected_flags.size(), num_checked);
}
3202 
3203 
3204 BOOST_AUTO_TEST_SUITE_END() // end of short_reads test suite
3205 
3206 
3207 BOOST_AUTO_TEST_SUITE(blastargs)
3208 
3209 /// Auxiliary class to convert a string into an argument count and vector
3210 class CString2Args
3211 {
3212 public:
CString2Args(const string & cmd_line_args)3213     CString2Args(const string& cmd_line_args) {
3214         x_Init(cmd_line_args);
3215     }
3216 
~CString2Args()3217     ~CString2Args() {
3218         x_CleanUp();
3219     }
3220 
Reset(const string & cmd_line_args)3221     void Reset(const string& cmd_line_args) {
3222         x_CleanUp();
3223         x_Init(cmd_line_args);
3224     }
3225 
CreateCArgs(CBlastAppArgs & args) const3226     CArgs* CreateCArgs(CBlastAppArgs& args) const {
3227         auto_ptr<CArgDescriptions> arg_desc(args.SetCommandLine());
3228         CNcbiArguments ncbi_args(m_Argc, m_Argv);
3229         return arg_desc->CreateArgs(ncbi_args);
3230     }
3231 
3232 private:
3233 
3234     /// Functor to help remove empty strings from a container
3235     struct empty_string_remover
3236     {
operator ()CString2Args::empty_string_remover3237         bool operator() (const string& str) {
3238             return str.empty();
3239         }
3240     };
3241 
3242     /// Extract the arguments from a command line
x_TokenizeCmdLine(const string & cmd_line_args)3243     vector<string> x_TokenizeCmdLine(const string& cmd_line_args) {
3244         vector<string> retval;
3245         NStr::Split(cmd_line_args, " ", retval);
3246         vector<string>::iterator new_end = remove_if(retval.begin(),
3247                                                      retval.end(),
3248                                                      empty_string_remover());
3249         retval.erase(new_end, retval.end());
3250         return retval;
3251     }
3252 
3253     /// Convert a C++ string into a C-style string
x_ToCString(const string & str)3254     char* x_ToCString(const string& str) {
3255         char* retval = new char[str.size()+1];
3256         strncpy(retval, str.c_str(), str.size());
3257         retval[str.size()] = '\0';
3258         return retval;
3259     }
3260 
x_CleanUp()3261     void x_CleanUp() {
3262         for (size_t i = 0; i < m_Argc; i++) {
3263             delete [] m_Argv[i];
3264         }
3265         delete [] m_Argv;
3266     }
3267 
x_Init(const string & cmd_line_args)3268     void x_Init(const string& cmd_line_args) {
3269         const string program_name("./blastinput_unit_test");
3270         vector<string> args = x_TokenizeCmdLine(cmd_line_args);
3271         m_Argc = args.size() + 1;   // one extra for dummy program name
3272         m_Argv = new char*[m_Argc];
3273         m_Argv[0] = x_ToCString(program_name);
3274         for (size_t i = 0; i < args.size(); i++) {
3275             m_Argv[i+1] = x_ToCString(args[i]);
3276         }
3277     }
3278 
3279     char** m_Argv;
3280     size_t m_Argc;
3281 };
3282 
3283 /* Test for the PSI-BLAST command line application arguments */
3284 
/// Passing "-matrix BLOSUM80" to the PSI-BLAST argument classes must result
/// in options whose matrix name is "BLOSUM80".
BOOST_AUTO_TEST_CASE(PsiBlastAppTestMatrix)
{
    CPsiBlastAppArgs app_args;
    CString2Args cmd_line("-matrix BLOSUM80 -db ecoli ");
    auto_ptr<CArgs> parsed(cmd_line.CreateCArgs(app_args));

    CRef<CBlastOptionsHandle> handle = app_args.SetOptions(*parsed);

    BOOST_REQUIRE_EQUAL(handle->GetOptions().GetMatrixName(),
                        string("BLOSUM80"));
}
3295 
/// With only "-db ecoli" on the command line, the RPS-BLAST options must
/// report composition based statistics == 1 and SEG filtering disabled.
BOOST_AUTO_TEST_CASE(RpsBlastCBS)
{
    CRPSBlastAppArgs app_args;
    CString2Args cmd_line("-db ecoli ");
    auto_ptr<CArgs> parsed(cmd_line.CreateCArgs(app_args));

    CRef<CBlastOptionsHandle> handle = app_args.SetOptions(*parsed);

    BOOST_REQUIRE_EQUAL(handle->GetOptions().GetCompositionBasedStats(), 1);
    BOOST_REQUIRE(handle->GetOptions().GetSegFiltering() == false);
}
3305 
/// Combining "-remote" with "-num_threads 2" must be rejected with
/// CArgException by each of the listed application argument classes.
BOOST_AUTO_TEST_CASE(CheckMutuallyExclusiveOptions)
{
    typedef vector< CRef<CBlastAppArgs> > TArgClasses;

    CString2Args cmd_line("-remote -num_threads 2");

    TArgClasses app_arg_classes;
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CPsiBlastAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastpAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastnAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastxAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastnAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastxAppArgs));

    for (auto& app_args : app_arg_classes) {
        auto_ptr<CArgs> parsed;
        BOOST_REQUIRE_THROW(parsed.reset(cmd_line.CreateCArgs(*app_args)),
                            CArgException);
    }
}
3325 
/// Checks the blastn handling of the -template_type / -template_length
/// arguments: giving only one of them must fail argument parsing, giving
/// both must succeed, and combining them with -word_size 32 must parse but
/// make SetOptions throw CInputException.
BOOST_AUTO_TEST_CASE(CheckDiscoMegablast) {
    auto_ptr<CArgs> args;
    CBlastnAppArgs blastn_args;

    // missing required template_length argument
    CString2Args s2a("-db ecoli -template_type coding ");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blastn_args)),
                      CArgException);
    // missing required template_type argument
    s2a.Reset("-db ecoli -template_length 21 ");
    BOOST_REQUIRE_THROW(args.reset(s2a.CreateCArgs(blastn_args)),
                      CArgException);

    // valid combination
    s2a.Reset("-db ecoli -template_type coding -template_length 16");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blastn_args)));

    // test the setting of an invalid word size for disco. megablast:
    // the arguments parse, but converting them to options must fail
    s2a.Reset("-db ecoli -word_size 32 -template_type optimal -template_length 16");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blastn_args)));
    BOOST_REQUIRE_THROW(blastn_args.SetOptions(*args), CInputException);
}
3349 
/// blastn must reject "-perc_identity 104.3" with CArgException and accept
/// "-perc_identity 75.0".
BOOST_AUTO_TEST_CASE(CheckPercentIdentity)
{
    auto_ptr<CArgs> parsed;
    CBlastnAppArgs app_args;

    // this value is expected to be refused during argument parsing
    CString2Args cmd_line("-db ecoli -perc_identity 104.3");
    BOOST_REQUIRE_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)),
                        CArgException);

    // this value is expected to parse cleanly
    cmd_line.Reset("-db ecoli -perc_identity 75.0 ");
    BOOST_REQUIRE_NO_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)));
}
3363 
/// "-no_greedy" on its own must parse for blastn, but converting the parsed
/// arguments to options must throw CInputException.
BOOST_AUTO_TEST_CASE(CheckNoGreedyExtension) {
    auto_ptr<CArgs> args;
    CBlastnAppArgs blast_args;

    CString2Args s2a("-db ecoli -no_greedy");
    BOOST_REQUIRE_NO_THROW(args.reset(s2a.CreateCArgs(blast_args)));
    // this throws because non-affine gapping costs must be provided for
    // non-greedy extension
    BOOST_REQUIRE_THROW(blast_args.SetOptions(*args), CInputException);
}
3375 
/// Each of the listed application argument classes must reject
/// "-culling_limit -4" with CArgException and accept "-culling_limit 0".
BOOST_AUTO_TEST_CASE(CheckCulling)
{
    typedef vector< CRef<CBlastAppArgs> > TArgClasses;

    TArgClasses app_arg_classes;
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CPsiBlastAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastpAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastnAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CBlastxAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastnAppArgs));
    app_arg_classes.push_back(CRef<CBlastAppArgs>(new CTblastxAppArgs));

    for (auto& app_args : app_arg_classes) {
        auto_ptr<CArgs> parsed;

        // this value is expected to be refused during argument parsing
        CString2Args cmd_line("-db ecoli -culling_limit -4");
        BOOST_REQUIRE_THROW(parsed.reset(cmd_line.CreateCArgs(*app_args)),
                            CArgException);

        // this value is expected to parse cleanly
        cmd_line.Reset("-db ecoli -culling_limit 0");
        BOOST_REQUIRE_NO_THROW(parsed.reset(cmd_line.CreateCArgs(*app_args)));
    }
}
3399 
/// Constructs a CTaskCmdLineArgs for several default task names, first with
/// the nucleotide-nucleotide task set and then with the protein-protein one.
/// There are no explicit assertions: the test fails only if one of the
/// constructions lets an exception escape.
BOOST_AUTO_TEST_CASE(CheckTaskArgs) {
    set<string> tasks
        (CBlastOptionsFactory::GetTasks(CBlastOptionsFactory::eNuclNucl));
    CRef<IBlastCmdLineArgs> arg;
    // Each statement ends with ';'. The original used ',' here, which chained
    // these calls and the following assignment into a single comma
    // expression.
    arg.Reset(new CTaskCmdLineArgs(tasks, "megablast"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "dc-megablast"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastn"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastn-short"));

    tasks = CBlastOptionsFactory::GetTasks(CBlastOptionsFactory::eProtProt);
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastp"));
    arg.Reset(new CTaskCmdLineArgs(tasks, "blastp-short"));
}
3413 
/// blastx must reject "-qcov_hsp_perc 100.3" with CArgException and accept
/// "-qcov_hsp_perc 15".
BOOST_AUTO_TEST_CASE(CheckQueryCoveragePercent)
{
    auto_ptr<CArgs> parsed;
    CBlastxAppArgs app_args;

    // this value is expected to be refused during argument parsing
    CString2Args cmd_line("-db ecoli -qcov_hsp_perc 100.3");
    BOOST_REQUIRE_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)),
                        CArgException);

    // this value is expected to parse cleanly
    cmd_line.Reset("-db ecoli -qcov_hsp_perc 15");
    BOOST_REQUIRE_NO_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)));
}
3427 
/// blastx must reject "-max_hsps 0" with CArgException and accept
/// "-max_hsps 5".
BOOST_AUTO_TEST_CASE(CheckMaxHspsPerSubject)
{
    auto_ptr<CArgs> parsed;
    CBlastxAppArgs app_args;

    // this value is expected to be refused during argument parsing
    CString2Args cmd_line("-db ecoli -max_hsps 0");
    BOOST_REQUIRE_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)),
                        CArgException);

    // this value is expected to parse cleanly
    cmd_line.Reset("-db ecoli -max_hsps 5");
    BOOST_REQUIRE_NO_THROW(parsed.reset(cmd_line.CreateCArgs(app_args)));
}
3441 
3442 BOOST_AUTO_TEST_SUITE_END() // end of blastargs test suite
3443 
3444 #endif /* SKIP_DOXYGEN_PROCESSING */
3445