1 /*  $Id: split_query_unit_test.cpp 607143 2020-04-30 13:01:21Z grichenk $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Christiam Camacho
27 *
28 * File Description:
29 *   Unit test module for code to split query sequences
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/test_boost.hpp>
35 #include "test_objmgr.hpp"
36 
37 #include <blast_objmgr_priv.hpp>
38 #include <algo/blast/core/split_query.h>
39 #include "blast_aux_priv.hpp"
40 #include "split_query_aux_priv.hpp"
41 #include <algo/blast/api/blast_options_handle.hpp>
42 #include "split_query.hpp"
43 #include <algo/blast/api/objmgr_query_data.hpp>
44 #include <algo/blast/api/local_blast.hpp>
45 #include <util/random_gen.hpp>
46 #include <objtools/simple/simple_om.hpp>
47 
48 /* IMPORTANT NOTE: If you have made changes to the query splitting code, the
49  * data in the configuration file (split_query.ini) might need to be updated.
50  * To aid in this, the xblast library supports tracing messages that output the
51  * internal data structure's contents to facilitate updating this file. To
52  * enable this, please run the unit_test application with the DIAG_TRACE
53  * environment variable set.
54  */
55 
56 typedef vector<vector<Uint4> > TSplitQueryChunkMap;
57 
58 using namespace std;
59 using namespace ncbi;
60 using namespace ncbi::objects;
61 using namespace ncbi::blast;
62 
63 /// Calculate and assign the maximum length field in the BlastQueryInfo
64 /// structure
s_CalculateMaxLength(BlastQueryInfo * query_info)65 static void s_CalculateMaxLength(BlastQueryInfo* query_info)
66 {
67     query_info->max_length = 0;
68     for (int i = query_info->first_context; i <= query_info->last_context; i++)
69     {
70         BOOST_REQUIRE(query_info->contexts[i].query_length >= 0);
71         query_info->max_length =
72             max<Uint4>(query_info->max_length,
73                        query_info->contexts[i].query_length);
74     }
75 }
76 
77 /// Pair for gis and their length (in that order)
78 typedef pair<TIntId, size_t> TGiLenPair;
79 /// Vector containing pairs of gis and their length
80 typedef vector<TGiLenPair> TGiLengthVector;
81 
82 /// Convert a vector of GIs with its lengths into a TSeqLocVector
83 /// @param gi_length vector of TGiLenPair containing GIs and their length [in]
84 /// @param retval the return value of this function [out]
85 /// @param tot_length total length of sequence data contained in gi_length
86 /// (optional) [in]
87 /// @param strands vector of strands to use (optional), if provided it must
88 /// match the size of the gi_length vector [in]
89 /// @param masks vector of masks (optional), if provided it must match the size
90 /// of the gi_length vector [in]
91 static void
s_ConvertToBlastQueries(const TGiLengthVector & gi_length,TSeqLocVector & retval,size_t * tot_length=NULL,vector<ENa_strand> * strands=NULL,const TSeqLocInfoVector * masks=NULL)92 s_ConvertToBlastQueries(const TGiLengthVector& gi_length,
93                         TSeqLocVector& retval,
94                         size_t* tot_length = NULL,
95                         vector<ENa_strand>* strands = NULL,
96                         const TSeqLocInfoVector* masks = NULL)
97 {
98     if (tot_length) {
99         *tot_length = 0;
100     }
101     retval.clear();
102     retval.reserve(gi_length.size());
103 
104     if (strands) {
105         BOOST_REQUIRE(strands->size() == gi_length.size());
106     }
107     if (masks) {
108         BOOST_REQUIRE(masks->size() == gi_length.size());
109     }
110 
111     for (size_t i = 0; i < gi_length.size(); i++) {
112         CRef<CSeq_loc> loc(new CSeq_loc());
113         if (strands) {
114             CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, gi_length[i].first));
115             loc->SetInt().SetFrom(0);
116             loc->SetInt().SetTo(gi_length[i].second-1);
117             loc->SetId(*id);
118             loc->SetStrand((*strands)[i]);
119         } else {
120             loc->SetWhole().SetGi(GI_FROM(TIntId, gi_length[i].first));
121         }
122         CRef<CScope> scope(CSimpleOM::NewScope());
123         retval.push_back(SSeqLoc(loc, &*scope));
124         if (tot_length) {
125             *tot_length += gi_length[i].second;
126         }
127     }
128 
129     if (masks == NULL) {
130         return;
131     }
132 
133     for (size_t i = 0; i < masks->size(); i++) {
134         const TMaskedQueryRegions& single_query_masks = (*masks)[i];
135         // FIXME: don't make the distinction between single and multiple masks
136         CRef<CSeq_loc> m(new CSeq_loc);
137 
138         if (single_query_masks.size() == 1) {
139             const CSeq_interval& interval =
140                 single_query_masks.front()->GetInterval();
141             m->SetInt(const_cast<CSeq_interval&>(interval));
142         } else {
143             ITERATE(TMaskedQueryRegions, mask, single_query_masks) {
144                 const CSeq_interval& interval = (*mask)->GetInterval();
145                 m->SetPacked_int().AddInterval(interval);
146             }
147         }
148         BOOST_REQUIRE(m->IsInt() || m->IsPacked_int());
149         retval[i].mask = m;
150     }
151 }
152 
153 class CSplitQueryTestFixture {
154 public:
155     /// This represents the split_query.ini configuration file
156     CRef<CNcbiRegistry> m_Config;
157     /// Default value used when a field is not present in the config file
158     static const int kDefaultIntValue = -1;
159 
CSplitQueryTestFixture()160     CSplitQueryTestFixture() {
161         // read the configuration file if it hasn't been read yet
162         if (m_Config.Empty()) {
163             const IRegistry::TFlags flags =
164                 IRegistry::fNoOverride |
165                 IRegistry::fTransient |
166                 IRegistry::fNotJustCore |
167                 IRegistry::fTruncate;
168 
169             const string fname("data/split_query.ini");
170             ifstream config_file(fname.c_str());
171             m_Config.Reset(new CNcbiRegistry(config_file, flags));
172 
173             if (m_Config->Empty()) {
174                 throw runtime_error("Failed to read configuration file" +
175                                     fname);
176             }
177         }
178     }
179 
~CSplitQueryTestFixture()180     ~CSplitQueryTestFixture() {
181         BOOST_REQUIRE(m_Config.NotEmpty());
182     }
183 
184     /// Populate a BLAST_SequenceBlk and BlastQueryInfo structures out of an
185     /// array of GIs
186     /// @param gis array of GIs, last element must be -1 indicating the end of
187     /// the array [in]
188     /// @param program program for which the query data will be created [in]
189     /// @param seq_blk BLAST_SequenceBlk structure to populate [out]
190     /// @param qinfo BlastQueryInfo structure to populate [out]
191     /// @param strand strand to use (optional) [in]
x_PrepareBlastQueryStructures(TIntId gis[],EProgram program,BLAST_SequenceBlk ** seq_blk,BlastQueryInfo ** qinfo,ENa_strand * strand=NULL)192     void x_PrepareBlastQueryStructures(TIntId gis[],
193                                        EProgram program,
194                                        BLAST_SequenceBlk** seq_blk,
195                                        BlastQueryInfo** qinfo,
196                                        ENa_strand* strand = NULL)
197     {
198         BOOST_REQUIRE(seq_blk);
199         BOOST_REQUIRE(qinfo);
200         TSeqLocVector queries;
201 
202         for (int i = 0; gis[i] != -1; i++) {
203             CRef<CSeq_loc> loc(new CSeq_loc());
204             loc->SetWhole().SetGi(GI_FROM(TIntId, gis[i]));
205             CScope* scope = new CScope(CTestObjMgr::Instance().GetObjMgr());
206             scope->AddDefaults();
207             queries.push_back(SSeqLoc(loc, scope));
208         }
209 
210         CRef<CBlastOptionsHandle> opts(CBlastOptionsFactory::Create(program));
211 
212         TSearchMessages msgs;
213 
214         const CBlastOptions& kOpts = opts->GetOptions();
215         EBlastProgramType prog = kOpts.GetProgramType();
216         ENa_strand strand_opt = (strand != NULL)
217             ? *strand : kOpts.GetStrandOption();
218 
219         SetupQueryInfo(queries, prog, strand_opt, qinfo);
220         SetupQueries(queries, *qinfo, seq_blk,
221                      prog, strand_opt, msgs);
222         BOOST_REQUIRE(msgs.HasMessages() == false);
223     }
224 
x_TestCContextTranslator(TGiLengthVector & gi_length,size_t chunk_size,size_t num_chunks,blast::EProgram program,vector<vector<int>> & starting_chunks,vector<vector<int>> & absolute_contexts,vector<vector<size_t>> * context_offsets,ENa_strand strand,vector<ENa_strand> * query_strands=NULL)225     void x_TestCContextTranslator(TGiLengthVector& gi_length,
226                                   size_t chunk_size,
227                                   size_t num_chunks,
228                                   blast::EProgram program,
229                                   vector< vector<int> >& starting_chunks,
230                                   vector< vector<int> >& absolute_contexts,
231                                   vector< vector<size_t> >* context_offsets,
232                                   ENa_strand strand,
233                                   vector<ENa_strand>* query_strands = NULL) {
234 
235         if (query_strands) {
236             BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
237         }
238 
239         size_t tot_length;
240         TSeqLocVector queries;
241         s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
242 
243         size_t nc = SplitQuery_CalculateNumChunks(
244                                       EProgramToEBlastProgramType(program),
245                                       &chunk_size, tot_length, queries.size());
246         BOOST_REQUIRE_EQUAL(num_chunks, nc);
247 
248         CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
249         CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(program));
250         CRef<CBlastOptions> opts(&opts_h->SetOptions());
251         if ( !query_strands ) {
252             opts->SetStrandOption(strand);
253         }
254         CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
255 
256         CAutoEnvironmentVariable tmp_env("CHUNK_SIZE",
257                                          NStr::SizetToString(chunk_size,
258                                                            NStr::fConvErr_NoThrow));
259         CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
260         CRef<CSplitQueryBlk> sqb = splitter->Split();
261 
262         BOOST_REQUIRE_EQUAL((size_t)splitter->GetNumberOfChunks(), num_chunks);
263 
264         CContextTranslator ctx_translator(*sqb);
265 
266         ostringstream os;
267         for (size_t chunk_num = 0; chunk_num < num_chunks; chunk_num++) {
268             // Test the starting chunks
269             vector<int>& st_chunks = starting_chunks[chunk_num];
270             for (size_t context_in_chunk = 0;
271                  context_in_chunk < st_chunks.size();
272                  context_in_chunk++) {
273                 os.str("");
274                 os << "Starting chunks: ";
275                 os << "Chunk " << chunk_num << ", context " << context_in_chunk;
276                 int sc = ctx_translator.GetStartingChunk(chunk_num,
277                                                          context_in_chunk);
278                 BOOST_REQUIRE_MESSAGE(st_chunks[context_in_chunk]==sc,os.str());
279             }
280 
281             // Test the absolute contexts
282             vector<int>& abs_ctxts = absolute_contexts[chunk_num];
283             for (size_t context_in_chunk = 0;
284                  context_in_chunk < abs_ctxts.size();
285                  context_in_chunk++) {
286                 os.str("");
287                 os << "Absolute contexts: ";
288                 os << "Chunk " << chunk_num << ", context " << context_in_chunk;
289                 int abs_ctx =
290                     ctx_translator.GetAbsoluteContext(chunk_num,
291                                                       context_in_chunk);
292                 BOOST_REQUIRE_MESSAGE(abs_ctxts[context_in_chunk]==abs_ctx,os.str());
293             }
294         }
295 
296         // Check the context offsets
297         if ( !context_offsets ) {
298             return;
299         }
300 
301         const BLAST_SequenceBlk* global_seq = query_data->GetSequenceBlk();
302         const BlastQueryInfo* global_qinfo = query_data->GetQueryInfo();
303         CRef<CSplitQueryBlk> split_query_blk = splitter->m_SplitBlk;
304         for (size_t chunk_num = 0; chunk_num < num_chunks; chunk_num++) {
305             vector<size_t> test_ctx_off =
306                 split_query_blk->GetContextOffsets(chunk_num);
307             const vector<size_t>& ref_ctx_off = (*context_offsets)[chunk_num];
308 
309             os.str("");
310             os << "Number of context offsets in chunk " << chunk_num;
311             BOOST_REQUIRE_MESSAGE(ref_ctx_off.size()==test_ctx_off.size(),os.str());
312 
313             CRef<IQueryFactory> chunk_qf =
314                 splitter->GetQueryFactoryForChunk(chunk_num);
315             CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(opts));
316             const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
317             const BlastQueryInfo* chunk_qinfo = chunk_qd->GetQueryInfo();
318 
319             for (size_t i = 0; i < ref_ctx_off.size(); i++) {
320                 size_t correction = ref_ctx_off[i];
321                 os.str("");
322                 os << "Context correction in chunk " << chunk_num
323                    << ", context " << i << " value now " << test_ctx_off[i]
324                    << " not " << correction;
325                 BOOST_REQUIRE_MESSAGE(correction==test_ctx_off[i],os.str());
326 
327                 int absolute_context =
328                     ctx_translator.GetAbsoluteContext(chunk_num, i);
329                 if (absolute_context == kInvalidContext) {
330                     continue;
331                 }
332 
333                 int global_offset =
334                     global_qinfo->contexts[absolute_context].query_offset +
335                     correction;
336                 int chunk_offset = chunk_qinfo->contexts[i].query_offset;
337                 int num_bases2compare =
338                     min(10, chunk_qinfo->contexts[i].query_length);
339 
340                 os.str("");
341                 os << "Sequence data in chunk " << chunk_num
342                     << ", context " << i;
343                 bool rv =
344                     x_CmpSequenceData(&global_seq->sequence[global_offset],
345                                       &chunk_seq->sequence[chunk_offset],
346                                       num_bases2compare);
347                 BOOST_REQUIRE_MESSAGE(rv,os.str());
348             }
349 
350         }
351     }
352 
353     /** Auxiliary function that compares bytes of sequence data to validate the
354      * context offset corrections
355      * @param global global query sequence data [in]
356      * @param chunk sequence data for chunk [in]
357      * @param len length of the data to compare [in]
358      * @return true if sequence data is identical, false otherwise
359      */
x_CmpSequenceData(const Uint1 * global,const Uint1 * chunk,size_t len)360     bool x_CmpSequenceData(const Uint1* global, const Uint1* chunk, size_t len)
361     {
362         for (size_t i = 0; i < len; i++) {
363             if (global[i] != chunk[i]) {
364                 return false;
365             }
366         }
367         return true;
368     }
369 
QuerySplitter_BlastnSingleQueryMultiChunk(const string & kTestName,ENa_strand strand)370     void QuerySplitter_BlastnSingleQueryMultiChunk(const string& kTestName,
371                                                    ENa_strand strand)
372     {
373         CBlastQueryVector query;
374         CSeq_id id(CSeq_id::e_Gi, 112422322); // 122347 bases long
375         query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
376 
377         CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(query));
378         CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastn));
379         CRef<CBlastOptions> opts(&opts_h->SetOptions());
380         opts->SetStrandOption(strand);
381         CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
382 
383         CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
384         CRef<CSplitQueryBlk> sqb = splitter->Split();
385 
386         CQuerySplitter::TSplitQueryVector split_query_vector;
387         x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
388         x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
389 
390         x_ValidateChunkBounds(splitter->GetChunkSize(),
391                               query_data->GetSumOfSequenceLengths(),
392                               *sqb, opts->GetProgramType());
393 
394         const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
395                                                            "NumChunks",
396                                                            kDefaultIntValue);
397         BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
398         BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
399 
400         vector< vector<size_t> > queries_per_chunk;
401         x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
402         x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
403 
404         vector< vector<int> > ctxs_per_chunk;
405         x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
406         x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
407 
408         vector< vector<size_t> > ctx_offsets_per_chunk;
409         x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
410                                      ctx_offsets_per_chunk);
411         x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
412 
413         vector<BlastQueryInfo*> split_query_info;
414         x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
415                                     split_query_info);
416         x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
417         NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
418             *itr = BlastQueryInfoFree(*itr);
419         }
420     }
421 
QuerySplitter_BlastnMultiQueryMultiChunk(const string & kTestName,ENa_strand strand,vector<ENa_strand> * query_strands=NULL)422     void QuerySplitter_BlastnMultiQueryMultiChunk(const string& kTestName,
423                                                   ENa_strand strand,
424                                                   vector<ENa_strand>*
425                                                   query_strands = NULL)
426     {
427         TGiLengthVector gi_length;
428         gi_length.push_back(make_pair<int, size_t>(112258880, 362959));
429         gi_length.push_back(make_pair<int, size_t>(112253843, 221853));
430         gi_length.push_back(make_pair<int, size_t>(112193060, 194837));
431         gi_length.push_back(make_pair<int, size_t>(112193059, 204796));
432         if (query_strands) {
433             BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
434         }
435 
436         size_t tot_length;
437         TSeqLocVector queries;
438         s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
439 
440         CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
441         CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastn));
442         CRef<CBlastOptions> opts(&opts_h->SetOptions());
443         if ( !query_strands ) {
444             opts->SetStrandOption(strand);
445         }
446         CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
447 
448         CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
449         CRef<CSplitQueryBlk> sqb = splitter->Split();
450 
451         CQuerySplitter::TSplitQueryVector split_query_vector;
452         x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
453         x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
454 
455         x_ValidateChunkBounds(splitter->GetChunkSize(),
456                               query_data->GetSumOfSequenceLengths(),
457                               *sqb, opts->GetProgramType());
458 
459         const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
460                                                            "NumChunks",
461                                                            kDefaultIntValue);
462         BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
463         BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
464 
465         vector< vector<size_t> > queries_per_chunk;
466         x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
467         x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
468 
469         vector< vector<int> > ctxs_per_chunk;
470         x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
471         x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
472 
473         vector< vector<size_t> > ctx_offsets_per_chunk;
474         x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
475                                      ctx_offsets_per_chunk);
476         x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
477 
478         vector<BlastQueryInfo*> split_query_info;
479         x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
480                                     split_query_info);
481         x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
482         NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
483             *itr = BlastQueryInfoFree(*itr);
484         }
485     }
486 
QuerySplitter_BlastxSingleQueryMultiChunk(const string & kTestName,ENa_strand strand)487     void QuerySplitter_BlastxSingleQueryMultiChunk(const string& kTestName,
488                                                    ENa_strand strand)
489     {
490         const size_t kLength = 122347;  // length of the sequence below
491         CRef<CSeq_id> id(new CSeq_id(CSeq_id::e_Gi, 63122693));
492         TSeqRange range(0, kLength);
493         TSeqLocVector query;
494         query.push_back(*CTestObjMgr::Instance().
495                         CreateSSeqLoc(*id, range, strand));
496 
497         CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(query));
498         CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastx));
499         CRef<CBlastOptions> opts(&opts_h->SetOptions());
500         CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
501 
502         CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
503         CRef<CSplitQueryBlk> sqb = splitter->Split();
504 
505         BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
506                                               kDefaultIntValue),
507                              (int)splitter->GetChunkSize());
508 
509         x_ValidateChunkBounds(splitter->GetChunkSize(),
510                               query_data->GetSumOfSequenceLengths(),
511                               *sqb, opts->GetProgramType());
512 
513         const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
514                                                            "NumChunks",
515                                                            kDefaultIntValue);
516         BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
517         BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
518 
519         vector< vector<size_t> > queries_per_chunk;
520         x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
521         x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
522 
523         vector< vector<int> > ctxs_per_chunk;
524         x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
525         x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
526 
527         vector< vector<size_t> > ctx_offsets_per_chunk;
528         x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
529                                      ctx_offsets_per_chunk);
530         x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
531 
532         vector<BlastQueryInfo*> split_query_info;
533         x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
534                                     split_query_info);
535         x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
536         NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
537             *itr = BlastQueryInfoFree(*itr);
538         }
539     }
540 
QuerySplitter_BlastxMultiQueryMultiChunk(const string & kTestName,ENa_strand strand,vector<ENa_strand> * query_strands=NULL)541     void QuerySplitter_BlastxMultiQueryMultiChunk(const string& kTestName,
542                                                   ENa_strand strand,
543                                                   vector<ENa_strand>*
544                                                   query_strands = NULL)
545     {
546         TGiLengthVector gi_length;
547         gi_length.push_back(make_pair<int, size_t>(112817621, 5567));
548         gi_length.push_back(make_pair<int, size_t>(112585373, 5987));
549         gi_length.push_back(make_pair<int, size_t>(112585216, 5531));
550         gi_length.push_back(make_pair<int, size_t>(112585119, 5046));
551         if (query_strands) {
552             BOOST_REQUIRE_EQUAL(gi_length.size(), query_strands->size());
553         }
554 
555         size_t tot_length;
556         TSeqLocVector queries;
557         s_ConvertToBlastQueries(gi_length, queries, &tot_length, query_strands);
558 
559         CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
560         CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastx));
561         CRef<CBlastOptions> opts(&opts_h->SetOptions());
562         if ( !query_strands ) {
563             opts->SetStrandOption(strand);
564         }
565         CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
566 
567         CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
568         CRef<CSplitQueryBlk> sqb = splitter->Split();
569 
570         BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
571                                               kDefaultIntValue),
572                              (int)splitter->GetChunkSize());
573 
574         BOOST_REQUIRE_EQUAL(tot_length, query_data->GetSumOfSequenceLengths());
575         x_ValidateChunkBounds(splitter->GetChunkSize(),
576                               query_data->GetSumOfSequenceLengths(),
577                               *sqb, opts->GetProgramType());
578 
579         const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
580                                                            "NumChunks",
581                                                            kDefaultIntValue);
582         BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
583         BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
584 
585         vector< vector<size_t> > queries_per_chunk;
586         x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
587         x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
588 
589         vector< vector<int> > ctxs_per_chunk;
590         x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
591         x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
592 
593         vector< vector<size_t> > ctx_offsets_per_chunk;
594         x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
595                                      ctx_offsets_per_chunk);
596         x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
597 
598         vector<BlastQueryInfo*> split_query_info;
599         x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
600                                     split_query_info);
601         x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
602         NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
603             *itr = BlastQueryInfoFree(*itr);
604         }
605     }
606 
607     /************ Auxiliary functions **********************************/
608 
609     /// Incrementally compute the query chunk bounds. This will have a direct
610     /// impact on the success of x_ValidateChunkBounds. This function assumes
611     /// that the chunk size doesn't vary between each invocation and that the
612     /// first time this function is called, the chunk_range is initialized with
613     /// its default constructor (e.g.: TChunkRange::GetEmpty())
614     /// @param chunk_range range of the query chunk [in|out]
615     /// @param chunk_size size of the chunk [in]
616     /// @param concatenated_query_length length of the full query [in]
617     /// @param overlap length of the overlap region between each chunk [in]
x_ComputeQueryChunkBounds(TChunkRange & chunk_range,size_t chunk_size,size_t concatenated_query_length,size_t overlap)618     void x_ComputeQueryChunkBounds(TChunkRange& chunk_range,
619                                    size_t chunk_size,
620                                    size_t concatenated_query_length,
621                                    size_t overlap)
622     {
623         if (chunk_range == TChunkRange::GetEmpty()) {
624             chunk_range.SetFrom(0);
625             chunk_range.SetLength(chunk_size);
626         } else {
627             const TSeqPos kIncrement = chunk_size - overlap;
628             chunk_range.SetFrom(chunk_range.GetFrom() + kIncrement);
629             chunk_range.SetToOpen(chunk_range.GetToOpen() + kIncrement);
630         }
631         BOOST_REQUIRE(chunk_range.NotEmpty());
632 
633         if (chunk_range.GetToOpen() > concatenated_query_length) {
634             chunk_range.SetToOpen(concatenated_query_length);
635         }
636     }
637 
638     /// This function reads values in the split_query.ini file with the format
639     /// ChunkNQueryM (where N is the chunk number and M is the query number).
    /// Each of these entries should have 3 comma-separated elements: the
641     /// query's starting offset, ending offset, and its strand's enumeration
642     /// value.
643     /// @param kTestName name of the test to read data for [in]
644     /// @param sqb CSplitQueryBlk object from which to get query indices for
645     /// each chunk [in]
646     /// @param split_query_vector query vector where the data from config file
647     /// will be read [out]
x_ReadQueryBoundsPerChunk(const string & kTestName,CConstRef<CSplitQueryBlk> sqb,CQuerySplitter::TSplitQueryVector & split_query_vector)648     void x_ReadQueryBoundsPerChunk(const string& kTestName,
649                                    CConstRef<CSplitQueryBlk> sqb,
650                    CQuerySplitter::TSplitQueryVector& split_query_vector)
651     {
652         CRef<CScope> scope(CSimpleOM::NewScope());
653         TMaskedQueryRegions empty_mask;
654         split_query_vector.clear();
655 
656         ostringstream os;
657 
658         const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
659                                                 kDefaultIntValue);
660         if (kNumChunks == kDefaultIntValue) {
661             throw runtime_error("Invalid number of chunks in " + kTestName);
662         }
663 
664         split_query_vector.assign(kNumChunks, CRef<CBlastQueryVector>());
665 
666         for (int i = 0; i < kNumChunks; i++) {
667             os.str("");
668             os << "Chunk" << i;
669             const vector<size_t> kQueryIndices = sqb->GetQueryIndices(i);
670 
671             BOOST_REQUIRE( !kQueryIndices.empty() );
672             split_query_vector[i].Reset(new CBlastQueryVector);
673 
674             ITERATE(vector<size_t>, itr, kQueryIndices) {
675                 ostringstream out;
676                 out << "Query" << *itr;
677 
678                 const string& value = m_Config->Get(kTestName,
679                                                     os.str() + out.str());
680                 // This data corresponds to entries in split_query.ini of the
681                 // form ChunkNQueryM, and each line should contain 3 elements:
682                 // the start and stop for each query in each chunk and the
683                 // strand's enumeration value
684                 vector<size_t> query_data;
685                 x_ParseConfigLine(value, query_data);
686                 BOOST_REQUIRE_MESSAGE((size_t)3==query_data.size(),os.str() + out.str());
687 
688                 CRef<CSeq_loc> sl(new CSeq_loc);
689                 sl->SetInt().SetFrom(query_data[0]);
690                 sl->SetInt().SetTo(query_data[1]);
691                 sl->SetStrand(static_cast<ENa_strand>(query_data[2]));
692                 CRef<CBlastSearchQuery> bsq(new CBlastSearchQuery(*sl,
693                                                                   *scope,
694                                                                   empty_mask));
695                 split_query_vector[i]->AddQuery(bsq);
696             }
697         }
698     }
699 
700     /// Compare the query data (start, stop, strand) for each chunk computed by
701     /// the splitter vs. the data read from the split_query.ini file
    /// @param splitter CQuerySplitter object which performs the query
    /// splitting and whose results are tested [in]
    /// @param split_query_vector reference data instantiated from what was
    /// read from the split_query.ini file, to test against [in]
x_ValidateQuerySeqLocsPerChunk(CRef<CQuerySplitter> splitter,const CQuerySplitter::TSplitQueryVector & split_query_vector)708     void x_ValidateQuerySeqLocsPerChunk(CRef<CQuerySplitter> splitter,
709               const CQuerySplitter::TSplitQueryVector& split_query_vector)
710     {
711         if (split_query_vector.empty()) {
712             return;
713         }
714 
715         ostringstream os;
716         os << "Different split query vector sizes";
717 
718         BOOST_REQUIRE_MESSAGE(split_query_vector.size()==(size_t)splitter->m_NumChunks,os.str());
719 
720         for (size_t i = 0; i < splitter->m_NumChunks; i++) {
721             CRef<CBlastQueryVector> ref_qvector = split_query_vector[i];
722             CRef<CBlastQueryVector> test_qvector =
723                 splitter->m_SplitQueriesInChunk[i];
724 
725             os.str("");
726             os << "Different split query vector sizes for chunk " << i;
727             BOOST_REQUIRE_MESSAGE(ref_qvector->Size()==test_qvector->Size(),os.str());
728 
729             for (size_t j = 0; j < ref_qvector->Size(); j++) {
730                 CConstRef<CSeq_loc> ref_qloc = ref_qvector->GetQuerySeqLoc(j);
731                 CConstRef<CSeq_loc> test_qloc = test_qvector->GetQuerySeqLoc(j);
732                 CSeq_loc::TRange ref_query_range = ref_qloc->GetTotalRange();
733                 CSeq_loc::TRange test_query_range = test_qloc->GetTotalRange();
734 
735                 os.str("");
736                 os << "Starting offset for query " << j << " in chunk " << i << " is now " << test_query_range.GetFrom() << " and not " << ref_query_range.GetFrom();
737                 BOOST_REQUIRE_MESSAGE(ref_query_range.GetFrom()==test_query_range.GetFrom(),os.str());
738                 os.str("");
739                 os << "Ending offset for query " << j << " in chunk " << i << " is now " << test_query_range.GetToOpen() << " and not " << ref_query_range.GetTo();
740                 BOOST_REQUIRE_MESSAGE(ref_query_range.GetTo()==test_query_range.GetToOpen(),os.str());
741                 os.str("");
742                 os << "Strand for query " << j << " in chunk " << i << " is now "
743                     << (int)test_qloc->GetStrand() << " and not " << (int)ref_qloc->GetStrand();
744                 BOOST_REQUIRE_MESSAGE(ref_qloc->GetStrand()==test_qloc->GetStrand(),os.str());
745             }
746         }
747     }
748 
749     /// Reads data to populate multiple BlastQueryInfo structures. This data is
750     /// formatted in the config file as
751     /// BlastQueryInfoN.X[.Y] where N is the chunk number, X is the field of
752     /// the BlastQueryInfo structure and Y is the field of the BlastContextInfo
753     /// structure (only applicable if X has the value contextM, where M is the
754     /// context number)
755     /// @param kTestName name of the test to read data for [in]
756     /// @param program blast program [in]
757     /// @param retval vector of BlastQueryInfo structures, there will be as
758     /// many elements as there are chunks for this test. Caller is
759     /// responsible for deallocating the contents of this vector [out]
x_ReadSplitQueryInfoForTest(const string & kTestName,EBlastProgramType program,vector<BlastQueryInfo * > & retval)760     void x_ReadSplitQueryInfoForTest(const string& kTestName,
761                                      EBlastProgramType program,
762                                      vector<BlastQueryInfo*>& retval)
763     {
764         ostringstream os, errors;
765 
766         const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
767                                                 kDefaultIntValue);
768         if (kNumChunks == kDefaultIntValue) {
769             throw runtime_error("Invalid number of chunks in " + kTestName);
770         }
771 
772         retval.clear();
773         retval.reserve(kNumChunks);
774         retval.assign(kNumChunks, static_cast<BlastQueryInfo*>(0));
775 
776         for (int i = 0; i < kNumChunks; i++) {
777             os.str("");
778             os << "BlastQueryInfo" << i << ".";
779             const string kPrefix(os.str());
780             errors.str("Chunk ");
781             errors << i << ": ";
782             const int kNumQueries = m_Config->GetInt(kTestName,
783                                                      kPrefix + "num_queries",
784                                                      kDefaultIntValue);
785             if (kNumQueries == kDefaultIntValue) {
786                 string msg("Invalid BlastQueryInfo::num_queries in ");
787                 msg += kTestName + " or value not specified";
788 return; // FIXME
789                 //throw runtime_error(msg);
790             }
791 
792             retval[i] = BlastQueryInfoNew(program, kNumQueries);
793             errors << "Failed to allocate BlastQueryInfo structure"
794                    << " (Number of queries=" << kNumQueries << ")";
795             BOOST_REQUIRE_MESSAGE(retval[i],errors.str());
796 
797             retval[i]->first_context = m_Config->GetInt(kTestName,
798                                                         kPrefix +
799                                                         "first_context",
800                                                         kDefaultIntValue);
801             errors.str("Chunk ");
802             errors << i;
803             BOOST_REQUIRE_MESSAGE(retval[i]->first_context >= 0,errors.str());
804 
805             retval[i]->last_context = m_Config->GetInt(kTestName,
806                                                        kPrefix +
807                                                        "last_context",
808                                                        kDefaultIntValue);
809             BOOST_REQUIRE_MESSAGE(retval[i]->last_context >= 0,errors.str());
810             BOOST_REQUIRE_MESSAGE(retval[i]->first_context <= retval[i]->last_context,errors.str());
811 
812             for (int c = retval[i]->first_context;
813                  c <= retval[i]->last_context;
814                  c++) {
815 
816                 errors.str("");
817                 errors << "Chunk " << i << ", BlastQueryInfo::context " << c;
818 
819                 ostringstream ctx;
820                 ctx << kPrefix << "context" << c << ".";
821 
822                 retval[i]->contexts[c].query_offset =
823                     m_Config->GetInt(kTestName, ctx.str() +
824                                      "query_offset", kDefaultIntValue);
825                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_offset >= 0,
826                                       errors.str() + " query_offset >= 0");
827 
828                 retval[i]->contexts[c].query_length =
829                     m_Config->GetInt(kTestName, ctx.str() +
830                                      "query_length", kDefaultIntValue);
831                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_length >= 0,
832                                       errors.str() + " query_length >= 0");
833 
834                 retval[i]->contexts[c].eff_searchsp =
835                     m_Config->GetInt(kTestName, ctx.str() +
836                                      "eff_searchsp", kDefaultIntValue);
837                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].eff_searchsp >= 0,
838                                       errors.str() + " eff_searchsp >= 0");
839 
840                 retval[i]->contexts[c].length_adjustment =
841                     m_Config->GetInt(kTestName, ctx.str() +
842                                      "length_adjustment", kDefaultIntValue);
843                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].length_adjustment >= 0,
844                                       errors.str() + " length_adjustment >= 0");
845 
846                 retval[i]->contexts[c].query_index =
847                     m_Config->GetInt(kTestName, ctx.str() +
848                                      "query_index", kDefaultIntValue);
849                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].query_index >= 0,
850                                       errors.str() + " query_index");
851 
852                 retval[i]->contexts[c].frame =
853                     m_Config->GetInt(kTestName, ctx.str() +
854                                      "frame", kDefaultIntValue);
855                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].frame == 1
856                                    || retval[i]->contexts[c].frame == 2
857                                    || retval[i]->contexts[c].frame == 3
858                                    || retval[i]->contexts[c].frame == -1
859                                    || retval[i]->contexts[c].frame == -2
860                                    || retval[i]->contexts[c].frame == -3
861                                    || retval[i]->contexts[c].frame == 0,
862                                    errors.str() + " frame");
863 
864                 retval[i]->contexts[c].is_valid =
865                     m_Config->GetBool(kTestName, ctx.str() +
866                                      "is_valid", false);
867                 BOOST_REQUIRE_MESSAGE(retval[i]->contexts[c].is_valid,
868                                       errors.str() + " is_valid");
869             }
870             s_CalculateMaxLength(retval[i]);
871         }
872     }
873 
874     /// This method reads entries in the config file of the format
    /// ChunkNX, where N is the chunk number and X is the value of data_to_read
876     /// @param kTestName name of the test to read data for [in]
877     /// @param data_to_read data for a chunk to read [in]
878     /// @param retval vector of vectors where the data will be returned. The
    /// first vector will contain as many elements as there are chunks, and
880     /// the contained vectors will contain as many elements as there are items
881     /// on the config file (comma separated values) [out]
882     template <class T>
x_ReadVectorOfVectorsForTest(const string & kTestName,const char * data_to_read,vector<vector<T>> & retval)883     void x_ReadVectorOfVectorsForTest(const string& kTestName,
884                                       const char* data_to_read,
885                                       vector< vector<T> >& retval)
886     {
887         ostringstream os;
888 
889         const int kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
890                                                 kDefaultIntValue);
891         if (kNumChunks == kDefaultIntValue) {
892             throw runtime_error("Invalid number of chunks in " + kTestName);
893         }
894 
895         retval.clear();
896         retval.resize(kNumChunks);
897 
898         for (int i = 0; i < kNumChunks; i++) {
899             os.str("");
900             os << "Chunk" << i << data_to_read;
901 
902             const string& value = m_Config->Get(kTestName, os.str());
903             x_ParseConfigLine(value, retval[i]);
904         }
905     }
906 
907     /// Tokenizes a string containing comma-separated values into a vector of
908     /// values
909     /// @param input string to tokenize [in]
910     /// @param retval vector containing elements found in input string [out]
911     template <class T>
x_ParseConfigLine(const string & input,vector<T> & retval)912     void x_ParseConfigLine(const string& input, vector<T>& retval)
913     {
914         retval.clear();
915         vector<string> tokens;
916         NStr::Split(input, ",", tokens);
917         retval.reserve(tokens.size());
918         ITERATE(vector<string>, token, tokens) {
919             retval.push_back(NStr::StringToInt(NStr::TruncateSpaces(*token)));
920         }
921     }
922 
923     /***************** Generic validation methods ****************************/
924 
925     /// Auxiliary method to validate the chunk bounds calculated by the
926     /// CSplitQueryBlk object and the x_ComputeQueryChunkBounds method
927     /// @param kChunkSize size of the chunk [in]
928     /// @param kQuerySize size of the full query [in]
929     /// @param sqb the CSplitQueryBlk object to test [in]
930     /// @param p the program type [in]
x_ValidateChunkBounds(size_t kChunkSize,size_t kQuerySize,const CSplitQueryBlk & sqb,EBlastProgramType p)931     void x_ValidateChunkBounds(size_t kChunkSize,
932                                size_t kQuerySize,
933                                const CSplitQueryBlk& sqb,
934                                EBlastProgramType p)
935     {
936         const size_t kNumChunks(sqb.GetNumChunks());
937         const size_t kQueryChunkOverlapSize = SplitQuery_GetOverlapChunkSize(p);
938 
939         TChunkRange expected_chunk_range(TChunkRange::GetEmpty());
940         for (size_t i = 0; i < kNumChunks; i++) {
941             x_ComputeQueryChunkBounds(expected_chunk_range, kChunkSize,
942                                       kQuerySize, kQueryChunkOverlapSize);
943             TChunkRange chunk_range = sqb.GetChunkBounds(i);
944             BOOST_REQUIRE_EQUAL(expected_chunk_range.GetFrom(),
945                                  chunk_range.GetFrom());
946             BOOST_REQUIRE_EQUAL(expected_chunk_range.GetToOpen(),
947                                  chunk_range.GetToOpen());
948             TSeqPos chunk_start = i*kChunkSize - (i*kQueryChunkOverlapSize);
949             TSeqPos chunk_end = chunk_start + kChunkSize > kQuerySize
950                 ? kQuerySize
951                 : chunk_start + kChunkSize;
952             BOOST_REQUIRE_EQUAL(expected_chunk_range.GetFrom(), chunk_start);
953             BOOST_REQUIRE_EQUAL(expected_chunk_range.GetToOpen(), chunk_end);
954             TSeqPos chunk_length = chunk_end - chunk_start;
955                 BOOST_REQUIRE_EQUAL(chunk_length,
956                                      expected_chunk_range.GetLength());
957         }
958     }
959 
960     /// Validates the query sequences (by index) assigned to all the chunks
961     /// This compares the data calculated by the sqb parameter to the data read
962     /// from the config file in queries_per_chunk
963     /// @param sqb CSplitQueryBlk object to test [in]
964     /// @param queries_per_chunk data read from config file [in]
x_ValidateQueriesPerChunkAssignment(const CSplitQueryBlk & sqb,const vector<vector<size_t>> & queries_per_chunk)965     void x_ValidateQueriesPerChunkAssignment(const CSplitQueryBlk& sqb,
966                                              const vector< vector<size_t> >&
967                                              queries_per_chunk)
968     {
969         const size_t kNumChunks = sqb.GetNumChunks();
970         BOOST_REQUIRE_EQUAL(kNumChunks, queries_per_chunk.size());
971 
972         for (size_t i = 0; i < kNumChunks; i++) {
973             ostringstream os;
974             os << "Chunk number " << i << " has an invalid number of queries";
975 
976             vector<size_t> data2test = sqb.GetQueryIndices(i);
977             BOOST_REQUIRE_MESSAGE(queries_per_chunk[i].size()==data2test.size(),os.str());
978 
979             for (size_t j = 0; j < data2test.size(); j++) {
980                 os.str("");
981                 os << "Query index mismatch in chunk number " << i
982                    << " entry number " << j;
983                 BOOST_REQUIRE_MESSAGE(queries_per_chunk[i][j]==data2test[j],os.str());
984             }
985         }
986     }
987 
988     /// Validates the query contexts assigned to all the chunks
989     /// @param sqb CSplitQueryBlk object to test [in]
990     /// @param contexts_per_chunk data read from config file [in]
x_ValidateQueryContextsPerChunkAssignment(const CSplitQueryBlk & sqb,const vector<vector<int>> & contexts_per_chunk)991     void x_ValidateQueryContextsPerChunkAssignment(const CSplitQueryBlk& sqb,
992                                              const vector< vector<int> >&
993                                              contexts_per_chunk)
994     {
995         const size_t kNumChunks = sqb.GetNumChunks();
996 
997         BOOST_REQUIRE_EQUAL(kNumChunks, contexts_per_chunk.size());
998         for (size_t i = 0; i < kNumChunks; i++) {
999             ostringstream os;
1000             os << "Chunk number " << i << " has an invalid number of contexts";
1001 
1002             vector<int> data2test = sqb.GetQueryContexts(i);
1003             BOOST_REQUIRE_MESSAGE(contexts_per_chunk[i].size()==data2test.size(),os.str());
1004 
1005             for (size_t j = 0; j < data2test.size(); j++) {
1006                 os.str("");
1007                 os << "Context index mismatch in chunk number " << i
1008                    << " entry number " << j;
1009                 BOOST_REQUIRE_MESSAGE(contexts_per_chunk[i][j]==data2test[j],os.str());
1010             }
1011         }
1012     }
1013 
1014     /// Validates the context offsets assigned to all the chunks
1015     /// @param sqb CSplitQueryBlk object to test [in]
1016     /// @param contexts_offsets_per_chunk data read from config file [in]
x_ValidateContextOffsetsPerChunkAssignment(const CSplitQueryBlk & sqb,const vector<vector<size_t>> & contexts_offsets_per_chunk)1017     void x_ValidateContextOffsetsPerChunkAssignment(const CSplitQueryBlk& sqb,
1018                                              const vector< vector<size_t> >&
1019                                              contexts_offsets_per_chunk)
1020     {
1021         const size_t kNumChunks(sqb.GetNumChunks());
1022         BOOST_REQUIRE_EQUAL(kNumChunks, contexts_offsets_per_chunk.size());
1023         for (size_t i = 0; i < kNumChunks; i++) {
1024             ostringstream os;
1025             os << "Chunk number " << i
1026                << " has an invalid number of context offsets";
1027 
1028             vector<size_t> data2test = sqb.GetContextOffsets(i);
1029             BOOST_REQUIRE_MESSAGE(contexts_offsets_per_chunk[i].size()==data2test.size(),os.str());
1030 
1031             for (size_t j = 0; j < data2test.size(); j++) {
1032                 os.str("");
1033                 os << "Context offset mismatch in chunk number " << i
1034                    << " entry number " << j << " value now " << data2test[j]
1035                    << " not " << contexts_offsets_per_chunk[i][j];
1036 // TLM cerr <<  "data2test " << data2test[j] << " ";
1037                  BOOST_REQUIRE_MESSAGE(contexts_offsets_per_chunk[i][j]==data2test[j],os.str());
1038             }
1039 // TLM cerr << endl;
1040         }
1041     }
1042 
1043     /// Validate the query info structure generated (test) against the expected
1044     /// one (reference) (N.B.: this is called from x_ValidateLocalQueryData)
1045     /// @param reference The "good" BlastQueryInfo structure [in]
1046     /// @param test the BlastQueryInfo structure to test [in]
    /// @param chunk_num the chunk number being tested, this is needed for
    /// error reporting purposes [in]
x_ValidateQueryInfoForChunk(const BlastQueryInfo * reference,const BlastQueryInfo * test,size_t chunk_num)1049     void x_ValidateQueryInfoForChunk(const BlastQueryInfo* reference,
1050                                      const BlastQueryInfo* test,
1051                                      size_t chunk_num)
1052     {
1053         ostringstream os;
1054 
1055         os << "Chunk " << chunk_num << ": BlastQueryInfo::first_context";
1056         BOOST_REQUIRE_MESSAGE(reference->first_context==test->first_context,os.str());
1057 
1058         os.str("");
1059         os << "Chunk " << chunk_num << ": BlastQueryInfo::last_context";
1060         BOOST_REQUIRE_MESSAGE(reference->last_context==test->last_context,os.str());
1061 
1062         os.str("");
1063         os << "Chunk " << chunk_num << ": BlastQueryInfo::num_queries";
1064         BOOST_REQUIRE_MESSAGE(reference->num_queries==test->num_queries,os.str());
1065 
1066         os.str("");
1067         os << "Chunk " << chunk_num << ": BlastQueryInfo::max_length";
1068         BOOST_REQUIRE_MESSAGE(reference->max_length==test->max_length,os.str());
1069 
1070         os.str("");
1071         os << "Chunk " << chunk_num << ": BlastQueryInfo::pattern_info";
1072         BOOST_REQUIRE_MESSAGE(reference->pattern_info==test->pattern_info,os.str());
1073 
1074         for (Int4 ctx = reference->first_context;
1075              ctx <= reference->last_context;
1076              ctx++) {
1077 
1078             os.str("");
1079             os << "Chunk " << chunk_num << ", context " << ctx;
1080             BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_offset==test->contexts[ctx].query_offset,
1081                                   os.str() + " query_offset");
1082             BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_length==test->contexts[ctx].query_length,
1083                                   os.str() + " query_length");
1084             BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].eff_searchsp==test->contexts[ctx].eff_searchsp,
1085                                   os.str() + " eff_searchsp");
1086             BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].query_index==test->contexts[ctx].query_index,
1087                                   os.str() + " query_index");
1088             BOOST_REQUIRE_MESSAGE((int)reference->contexts[ctx].frame==(int)test->contexts[ctx].frame,
1089                                   os.str() + " frame");
1090             BOOST_REQUIRE_MESSAGE(reference->contexts[ctx].is_valid==test->contexts[ctx].is_valid,
1091                                   os.str() + " is_valid");
1092 
1093         }
1094     }
1095 
1096     /// Validate the local query data for all chunks, comparing data produced
1097     /// by the CQuerySplitter object and the BlastQueryInfo structures read
1098     /// from the config file (BLAST_SequenceBlk's are not tested)
1099     /// @param splitter object to test [in]
1100     /// @param options BLAST options [in]
1101     /// @param split_query_info_structs the data to compare to (reference) [in]
x_ValidateLocalQueryData(CRef<CQuerySplitter> splitter,const CBlastOptions * options,vector<BlastQueryInfo * > split_query_info_structs)1102     void x_ValidateLocalQueryData(CRef<CQuerySplitter> splitter,
1103                                   const CBlastOptions* options,
1104                                   vector<BlastQueryInfo*>
1105                                   split_query_info_structs)
1106     {
1107         ostringstream os;
1108         BOOST_REQUIRE(options);
1109         const size_t kNumChunks(splitter->GetNumberOfChunks());
1110 
1111         CRef<CSplitQueryBlk> sqb = splitter->Split();
1112         BOOST_REQUIRE_EQUAL(kNumChunks, split_query_info_structs.size());
1113 
1114         for (size_t i = 0; i < kNumChunks; i++) {
1115             os.str("");
1116             os << "Chunk " << i << ": ";
1117             CRef<IQueryFactory> qf = splitter->GetQueryFactoryForChunk(i);
1118             BOOST_REQUIRE_MESSAGE(qf.NotEmpty(),os.str() + "NULL query factory");
1119             CRef<ILocalQueryData> qd = qf->MakeLocalQueryData(options);
1120             BOOST_REQUIRE_MESSAGE(qd.NotEmpty(),os.str() + "NULL local query data");
1121 
1122             os << "Different number of queries";
1123             BOOST_REQUIRE_MESSAGE((size_t)sqb->GetNumQueriesForChunk(i)==(size_t)qd->GetNumQueries(),os.str());
1124 
1125             // FIXME: turned off for now
1126             // Validate the query info structure
1127             //x_ValidateQueryInfoForChunk(split_query_info_structs[i],
1128             //                            qd->GetQueryInfo(), i);
1129 
1130             //x_ValidateSequenceBlkForChunk();
1131 
1132             // Validate that query in this chunk is indeed valid
1133             //for (int qindex = 0; qindex < qd->GetNumQueries(); qindex++) {
1134             //    os.str("Chunk ");
1135             //    os << i << ": query " << qindex << " is invalid";
1136             //    BOOST_REQUIRE_MESSAGE(qd->IsValidQuery(qindex),os.str());
1137             //}
1138 
1139         }
1140 
1141     }
1142 };
1143 
BOOST_FIXTURE_TEST_SUITE(split_query,CSplitQueryTestFixture)1144 BOOST_FIXTURE_TEST_SUITE(split_query, CSplitQueryTestFixture)
1145 
1146 /*********** Actual unit tests ***************************************/
1147 BOOST_AUTO_TEST_CASE(SplitQueriesIn1Chunk) {
1148     CRef<CSplitQueryBlk> sqb(new CSplitQueryBlk(1));
1149     Int2 rv;
1150 
1151     rv = SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), 41, 2);
1152     BOOST_REQUIRE_EQUAL(kBadParameter, rv);
1153 
1154     /// This will be reused for both query indices and contexts
1155     vector<Int4> query_indices_expected;
1156     query_indices_expected.push_back(45);
1157     query_indices_expected.push_back(0);
1158     query_indices_expected.push_back(7);
1159 
1160     ITERATE(vector<Int4>, qi, query_indices_expected) {
1161         rv = SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), *qi, 0);
1162         BOOST_REQUIRE_EQUAL((Int2)0, rv);
1163         rv = SplitQueryBlk_AddContextToChunk(sqb->GetCStruct(), *qi, 0);
1164         BOOST_REQUIRE_EQUAL((Int2)0, rv);
1165     }
1166 
1167     Uint4* query_indices = NULL;
1168     rv = SplitQueryBlk_GetQueryIndicesForChunk(sqb->GetCStruct(), 0,
1169                                                &query_indices);
1170     BOOST_REQUIRE_EQUAL((Int2)0, rv);
1171     for (int i = 0; query_indices[i] != UINT4_MAX; i++) {
1172         BOOST_REQUIRE_EQUAL(query_indices_expected[i],
1173                              (Int4)query_indices[i]);
1174     }
1175     sfree(query_indices);
1176 
1177     Int4* query_contexts = NULL;
1178     Uint4 num_query_contexts = 0;
1179     rv = SplitQueryBlk_GetQueryContextsForChunk(sqb->GetCStruct(), 0,
1180                                                 &query_contexts,
1181                                                 &num_query_contexts);
1182     BOOST_REQUIRE_EQUAL((Int2)0, rv);
1183     for (Uint4 i = 0; i < num_query_contexts; i++) {
1184         BOOST_REQUIRE_EQUAL(query_indices_expected[i], query_contexts[i]);
1185     }
1186     sfree(query_contexts);
1187 
1188     size_t num_queries(0);
1189     rv = SplitQueryBlk_GetNumQueriesForChunk(sqb->GetCStruct(), 0,
1190                                              &num_queries);
1191     BOOST_REQUIRE_EQUAL((Int2)0, rv);
1192     BOOST_REQUIRE_EQUAL(query_indices_expected.size(), num_queries);
1193 }
1194 
BOOST_AUTO_TEST_CASE(SplitQueriesRandomly) {
    CRandom rng(static_cast<CRandom::TValue>(time(0)));
    const Uint4 num_chunks = rng.GetRand(1, 100);
    TSplitQueryChunkMap chunk_map;
    chunk_map.resize(num_chunks);

    // Artificial data: every chunk receives a random number of consecutive
    // query indices
    Uint4 next_query_index = 0;
    for (Uint4 chunk = 0; chunk < num_chunks; ++chunk) {
        const Uint4 queries_in_chunk = rng.GetRand(1, 365);
        for (Uint4 added = 0; added < queries_in_chunk; ++added) {
            chunk_map[chunk].push_back(next_query_index++);
        }
    }

    // Load the artificial data into the SplitQueryBlk structure
    CRef<CSplitQueryBlk> sqb(new CSplitQueryBlk(num_chunks));
    for (size_t chunk = 0; chunk < chunk_map.size(); ++chunk) {
        const vector<Uint4>& queries = chunk_map[chunk];
        for (vector<Uint4>::const_iterator query = queries.begin();
             query != queries.end();
             ++query) {
            const Int2 status =
                SplitQueryBlk_AddQueryToChunk(sqb->GetCStruct(), *query,
                                              chunk);
            BOOST_REQUIRE_EQUAL((Int2)0, status);
        }
    }

    // Read the data back, chunk by chunk, and compare it with what was
    // loaded
    for (Uint4 chunk = 0; chunk < num_chunks; ++chunk) {
        const vector<Uint4>& expected = chunk_map[chunk];

        Uint4* returned = NULL;
        Int2 status = SplitQueryBlk_GetQueryIndicesForChunk(sqb->GetCStruct(),
                                                            chunk,
                                                            &returned);
        BOOST_REQUIRE_EQUAL((Int2)0, status);
        BOOST_REQUIRE(returned != NULL);

        for (size_t pos = 0; pos < expected.size(); ++pos) {
            BOOST_REQUIRE_EQUAL(expected[pos], returned[pos]);
        }
        // The returned array is terminated by UINT4_MAX
        BOOST_REQUIRE_EQUAL((Uint4)UINT4_MAX, returned[expected.size()]);
        sfree(returned);

        size_t num_queries(0);
        status = SplitQueryBlk_GetNumQueriesForChunk(sqb->GetCStruct(), chunk,
                                                     &num_queries);
        BOOST_REQUIRE_EQUAL((Int2)0, status);
        BOOST_REQUIRE_EQUAL(expected.size(), num_queries);
    }
}
1245 
/// Distributes four queries over three chunks (queries 0 and 1 in chunk 0,
/// query 2 in chunk 1, query 3 in chunk 2) and verifies the query indices and
/// query counts that the SplitQueryBlk structure returns for every chunk.
BOOST_AUTO_TEST_CASE(Split4QueriesIn3Chunks) {
    const Uint4 kNumChunks = 3;

    // Expected chunk -> query indices assignment
    TSplitQueryChunkMap expected;
    expected.resize(kNumChunks);
    expected[0].push_back(0);
    expected[0].push_back(1);
    expected[1].push_back(2);
    expected[2].push_back(3);

    // Load the assignment into the structure under test
    CRef<CSplitQueryBlk> split_blk(new CSplitQueryBlk(kNumChunks));
    for (Uint4 chunk = 0; chunk < expected.size(); ++chunk) {
        const vector<Uint4>& chunk_queries = expected[chunk];
        for (size_t q = 0; q < chunk_queries.size(); ++q) {
            const Int2 status =
                SplitQueryBlk_AddQueryToChunk(split_blk->GetCStruct(),
                                              chunk_queries[q], chunk);
            BOOST_REQUIRE_EQUAL((Int2)0, status);
        }
    }

    // Read everything back, one chunk at a time
    for (Uint4 chunk = 0; chunk < kNumChunks; ++chunk) {
        const vector<Uint4>& wanted = expected[chunk];

        Uint4* returned = NULL;
        Int2 status =
            SplitQueryBlk_GetQueryIndicesForChunk(split_blk->GetCStruct(),
                                                  chunk, &returned);
        BOOST_REQUIRE_EQUAL((Int2)0, status);
        BOOST_REQUIRE(returned != NULL);

        size_t pos = 0;
        while (pos < wanted.size()) {
            BOOST_REQUIRE_EQUAL(wanted[pos], returned[pos]);
            ++pos;
        }
        // The element following the last query index must be UINT4_MAX
        BOOST_REQUIRE_EQUAL((Uint4)UINT4_MAX, returned[pos]);
        sfree(returned);

        size_t query_count(0);
        status = SplitQueryBlk_GetNumQueriesForChunk(split_blk->GetCStruct(),
                                                     chunk, &query_count);
        BOOST_REQUIRE_EQUAL((Int2)0, status);
        BOOST_REQUIRE_EQUAL(wanted.size(), query_count);
    }
}
1290 
1291 /// Tests query splitting for blastn of both strands of a single query into
1292 /// multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_BothStrands)1293 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_BothStrands) {
1294     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1295     const string
1296         kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_BothStrands");
1297 
1298     QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_both);
1299 }
1300 
1301 /// Tests query splitting for blastn of the plus strands of a single query
1302 /// into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_PlusStrand)1303 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_PlusStrand) {
1304     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1305     const string
1306         kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_PlusStrand");
1307 
1308     QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_plus);
1309 }
1310 
1311 /// Tests query splitting for blastn of the minus strands of a single query
1312 /// into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_MinusStrand)1313 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnSingleQueryMultiChunk_MinusStrand) {
1314     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1315     const string
1316         kTestName("QuerySplitter_BlastnSingleQueryMultiChunk_MinusStrand");
1317 
1318     QuerySplitter_BlastnSingleQueryMultiChunk(kTestName, eNa_strand_minus);
1319 }
1320 
1321 /// Tests query splitting for blastn of the plus strands of multiple queries
1322 /// into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_PlusStrand)1323 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_PlusStrand) {
1324     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1325     const string
1326         kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_PlusStrand");
1327 
1328     QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_plus);
1329 }
1330 
1331 /// Tests query splitting for blastn of the minus strands of multiple
1332 /// queries into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MinusStrand)1333 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MinusStrand) {
1334     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1335     const string
1336         kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_MinusStrand");
1337 
1338     QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_minus);
1339 }
1340 
1341 /// Tests query splitting for blastn of both strands of multiple
1342 /// queries into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_BothStrands)1343 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_BothStrands) {
1344     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1345     const string
1346         kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_BothStrands");
1347     QuerySplitter_BlastnMultiQueryMultiChunk(kTestName, eNa_strand_both);
1348 }
1349 
1350 /// Tests query splitting for blastn with multiple queries in multiple
1351 /// chunks with each query using different strands
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MixedStrands)1352 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastnMultiQueryMultiChunk_MixedStrands) {
1353     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1354     const string
1355         kTestName("QuerySplitter_BlastnMultiQueryMultiChunk_MixedStrands");
1356     vector<ENa_strand> query_strands;
1357     query_strands.reserve(4);
1358     query_strands.push_back(eNa_strand_plus);
1359     query_strands.push_back(eNa_strand_both);
1360     query_strands.push_back(eNa_strand_minus);
1361     query_strands.push_back(eNa_strand_unknown);
1362 
1363     QuerySplitter_BlastnMultiQueryMultiChunk(kTestName,
1364                                              eNa_strand_unknown,
1365                                              &query_strands);
1366 }
1367 
1368 /*********  This functionality has not been implemented  **************/
1369 #if 0
1370 /// Tests blastx of both strands of a single query into multiple chunks
1371 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_BothStrands) {
1372     const string
1373         kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_BothStrands");
1374 
1375     QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_both);
1376 }
1377 
1378 /// Tests blastx of the plus strand of a single query into multiple chunks
1379 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_PlusStrand) {
1380     const string
1381         kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_PlusStrand");
1382 
1383     QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_plus);
1384 }
1385 
1386 /// Tests blastx of the minus strand of a single query into multiple chunks
1387 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxSingleQueryMultiChunk_MinusStrand) {
1388     const string
1389         kTestName("QuerySplitter_BlastxSingleQueryMultiChunk_MinusStrand");
1390 
1391     QuerySplitter_BlastxSingleQueryMultiChunk(kTestName, eNa_strand_minus);
1392 }
1393 
1394 
1395 /// Tests blastx of the plus strand of multiple queries into multiple chunks
1396 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_PlusStrand) {
1397     const string
1398         kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_PlusStrand");
1399 
1400     QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_plus);
1401 }
1402 
1403 /// Tests blastx of the minus strand of multiple queries into multiple
1404 /// chunks
1405 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_MinusStrand) {
1406     const string
1407         kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_MinusStrand");
1408 
1409     QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_minus);
1410 }
1411 
1412 /// Tests blastx of both strands of multiple queries into multiple
1413 /// chunks
1414 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_BothStrands) {
1415     const string
1416         kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_BothStrands");
1417 
1418     QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_both);
1419 }
1420 
1421 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastxMultiQueryMultiChunk_MixedStrands) {
1422     const string
1423         kTestName("QuerySplitter_BlastxMultiQueryMultiChunk_MixedStrands");
1424     vector<ENa_strand> query_strands;
1425     query_strands.reserve(4);
1426     query_strands.push_back(eNa_strand_unknown);
1427     query_strands.push_back(eNa_strand_plus);
1428     query_strands.push_back(eNa_strand_both);
1429     query_strands.push_back(eNa_strand_minus);
1430 
1431     QuerySplitter_BlastxMultiQueryMultiChunk(kTestName, eNa_strand_unknown,
1432                                              &query_strands);
1433 }
1434 
1435 #endif
1436 
1437 /// Tests blastp of a single query into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpSingleQueryMultiChunk)1438 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpSingleQueryMultiChunk) {
1439     const string kTestName("QuerySplitter_BlastpSingleQueryMultiChunk");
1440 
1441     const size_t kLength = 33423;    // query length
1442     CBlastQueryVector query;
1443     CSeq_id id(CSeq_id::e_Gi, 110349719);
1444     query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
1445 
1446     CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(query));
1447     CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastp));
1448     CRef<CBlastOptions> opts(&opts_h->SetOptions());
1449     CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
1450 
1451     CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1452     CRef<CSplitQueryBlk> sqb = splitter->Split();
1453 
1454     BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1455                                           kDefaultIntValue),
1456                          (int)splitter->GetChunkSize());
1457 
1458     CQuerySplitter::TSplitQueryVector split_query_vector;
1459     x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
1460     x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
1461 
1462     BOOST_REQUIRE_EQUAL(kLength, query_data->GetSumOfSequenceLengths());
1463     x_ValidateChunkBounds(splitter->GetChunkSize(),
1464                           query_data->GetSumOfSequenceLengths(),
1465                           *sqb, opts->GetProgramType());
1466 
1467     const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
1468                                                        "NumChunks",
1469                                                        kDefaultIntValue);
1470     BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1471     BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1472 
1473     vector< vector<size_t> > queries_per_chunk;
1474     x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
1475     x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
1476 
1477     vector< vector<int> > ctxs_per_chunk;
1478     x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
1479     x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
1480 
1481     vector< vector<size_t> > ctx_offsets_per_chunk;
1482     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1483                                  ctx_offsets_per_chunk);
1484     x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
1485 
1486     vector<BlastQueryInfo*> split_query_info;
1487     x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
1488                                 split_query_info);
1489     x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
1490     NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
1491         *itr = BlastQueryInfoFree(*itr);
1492     }
1493 }
1494 
1495 /// Tests blastp of multiple queries into multiple chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpMultiQueryMultiChunk)1496 BOOST_AUTO_TEST_CASE(QuerySplitter_BlastpMultiQueryMultiChunk) {
1497     const string kTestName("QuerySplitter_BlastpMultiQueryMultiChunk");
1498 
1499     TGiLengthVector gi_length;
1500     gi_length.push_back(make_pair<int, size_t>(33624848,  6883));
1501     gi_length.push_back(make_pair<int, size_t>(4758794,   6669));
1502     gi_length.push_back(make_pair<int, size_t>(66821305,  6061));
1503     gi_length.push_back(make_pair<int, size_t>(109075552, 5007));
1504 
1505     size_t tot_length;
1506     TSeqLocVector queries;
1507     s_ConvertToBlastQueries(gi_length, queries, &tot_length);
1508 
1509     CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
1510     CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastp));
1511     CRef<CBlastOptions> opts(&opts_h->SetOptions());
1512     CRef<ILocalQueryData> query_data(qf->MakeLocalQueryData(&*opts));
1513 
1514     CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1515     CRef<CSplitQueryBlk> sqb = splitter->Split();
1516 
1517     BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1518                                           kDefaultIntValue),
1519                          (int)splitter->GetChunkSize());
1520 
1521     CQuerySplitter::TSplitQueryVector split_query_vector;
1522     x_ReadQueryBoundsPerChunk(kTestName, sqb, split_query_vector);
1523     x_ValidateQuerySeqLocsPerChunk(splitter, split_query_vector);
1524 
1525     BOOST_REQUIRE_EQUAL(tot_length, query_data->GetSumOfSequenceLengths());
1526     x_ValidateChunkBounds(splitter->GetChunkSize(),
1527                           query_data->GetSumOfSequenceLengths(),
1528                           *sqb, opts->GetProgramType());
1529 
1530     const size_t kNumChunks = (size_t)m_Config->GetInt(kTestName,
1531                                                        "NumChunks",
1532                                                        kDefaultIntValue);
1533     BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1534     BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1535 
1536     vector< vector<size_t> > queries_per_chunk;
1537     x_ReadVectorOfVectorsForTest(kTestName, "Queries", queries_per_chunk);
1538     x_ValidateQueriesPerChunkAssignment(*sqb, queries_per_chunk);
1539 
1540     vector< vector<int> > ctxs_per_chunk;
1541     x_ReadVectorOfVectorsForTest(kTestName, "Contexts", ctxs_per_chunk);
1542     x_ValidateQueryContextsPerChunkAssignment(*sqb, ctxs_per_chunk);
1543 
1544     vector< vector<size_t> > ctx_offsets_per_chunk;
1545     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1546                                  ctx_offsets_per_chunk);
1547     x_ValidateContextOffsetsPerChunkAssignment(*sqb, ctx_offsets_per_chunk);
1548 
1549     vector<BlastQueryInfo*> split_query_info;
1550     x_ReadSplitQueryInfoForTest(kTestName, opts->GetProgramType(),
1551                                 split_query_info);
1552     x_ValidateLocalQueryData(splitter, &*opts, split_query_info);
1553     NON_CONST_ITERATE(vector<BlastQueryInfo*>, itr, split_query_info) {
1554         *itr = BlastQueryInfoFree(*itr);
1555     }
1556 }
1557 
1558 /// Tests the CContextTranslator class for blastn of both strands of
1559 /// multiple queries
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_BothStrands)1560 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_BothStrands) {
1561     const string
1562         kTestName("TestCContextTranslator_BlastnMultiQuery_BothStrands");
1563     TGiLengthVector gi_length;
1564     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1565     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1566     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1567 
1568     const size_t chunk_size = 500;
1569     const size_t num_chunks = 9;
1570 
1571     vector< vector<int> > starting_chunks(num_chunks);
1572     vector< vector<int> > absolute_contexts(num_chunks);
1573     vector< vector<size_t> > context_offset_corrections(num_chunks);
1574 
1575     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1576                                  starting_chunks);
1577     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1578                                  absolute_contexts);
1579     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1580                                  context_offset_corrections);
1581 
1582     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1583                              starting_chunks, absolute_contexts,
1584                              &context_offset_corrections,
1585                              eNa_strand_both);
1586 }
1587 
1588 /// Tests the CContextTranslator class for blastn of the plus strand of
1589 /// multiple queries
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_PlusStrand)1590 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_PlusStrand) {
1591     const string
1592         kTestName("TestCContextTranslator_BlastnMultiQuery_PlusStrand");
1593     TGiLengthVector gi_length;
1594     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1595     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1596     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1597 
1598     const size_t chunk_size = 500;
1599     const size_t num_chunks = 9;
1600 
1601     vector< vector<int> > starting_chunks(num_chunks);
1602     vector< vector<int> > absolute_contexts(num_chunks);
1603     vector< vector<size_t> > context_offset_corrections(num_chunks);
1604 
1605     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1606                                  starting_chunks);
1607     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1608                                  absolute_contexts);
1609     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1610                                  context_offset_corrections);
1611 
1612     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1613                              starting_chunks, absolute_contexts,
1614                              &context_offset_corrections,
1615                              eNa_strand_plus);
1616 }
1617 
1618 /// Tests the CContextTranslator class for blastn of the minus strand of
1619 /// multiple queries
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_MinusStrand)1620 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastnMultiQuery_MinusStrand) {
1621     const string
1622         kTestName("TestCContextTranslator_BlastnMultiQuery_MinusStrand");
1623     TGiLengthVector gi_length;
1624     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1625     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1626     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1627 
1628     const size_t chunk_size = 500;
1629     const size_t num_chunks = 9;
1630 
1631     vector< vector<int> > starting_chunks(num_chunks);
1632     vector< vector<int> > absolute_contexts(num_chunks);
1633     vector< vector<size_t> > context_offset_corrections(num_chunks);
1634 
1635     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1636                                  starting_chunks);
1637     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1638                                  absolute_contexts);
1639     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1640                                  context_offset_corrections);
1641 
1642     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastn,
1643                              starting_chunks, absolute_contexts,
1644                              &context_offset_corrections,
1645                              eNa_strand_minus);
1646 }
1647 
1648 /// Tests the CContextTranslator class for blastx of both strands of
1649 /// a single query with length divisible by CODON_LENGTH
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_0)1650 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_0) {
1651     const string
1652         kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_0");
1653     TGiLengthVector gi_length;
1654     gi_length.push_back(make_pair<int, size_t>(116001669, 33));
1655 
1656     const size_t chunk_size = 15;
1657     const size_t num_chunks = 3;
1658     CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1659 
1660     vector< vector<int> > starting_chunks(num_chunks);
1661     vector< vector<int> > absolute_contexts(num_chunks);
1662     vector< vector<size_t> > context_offset_corrections(num_chunks);
1663 
1664     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1665                                  starting_chunks);
1666     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1667                                  absolute_contexts);
1668     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1669                                  context_offset_corrections);
1670 
1671     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1672                              starting_chunks, absolute_contexts,
1673                              &context_offset_corrections,
1674                              eNa_strand_both);
1675 }
1676 
1677 /// Tests the CContextTranslator class for blastx of both strands of
1678 /// a single query with length not divisible by CODON_LENGTH, instead, the
1679 /// (query length % CODON_LENGTH == 1)
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_1)1680 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_1) {
1681     const string
1682         kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_1");
1683     TGiLengthVector gi_length;
1684     gi_length.push_back(make_pair<int, size_t>(116001673, 34));
1685 
1686     const size_t chunk_size = 15;
1687     const size_t num_chunks = 3;
1688     CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1689 
1690     vector< vector<int> > starting_chunks(num_chunks);
1691     vector< vector<int> > absolute_contexts(num_chunks);
1692     vector< vector<size_t> > context_offset_corrections(num_chunks);
1693 
1694     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1695                                  starting_chunks);
1696     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1697                                  absolute_contexts);
1698     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1699                                  context_offset_corrections);
1700 
1701     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1702                              starting_chunks, absolute_contexts,
1703                              &context_offset_corrections,
1704                              eNa_strand_both);
1705 }
1706 
1707 /// Tests the CContextTranslator class for blastx of both strands of
1708 /// a single query with length not divisible by CODON_LENGTH, instead, the
1709 /// (query length % CODON_LENGTH == 2)
BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_2)1710 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxSingleQuery_BothStrands_2) {
1711     const string
1712         kTestName("TestCContextTranslator_BlastxSingleQuery_BothStrands_2");
1713     TGiLengthVector gi_length;
1714     gi_length.push_back(make_pair<int, size_t>(116001668, 35));
1715 
1716     const size_t chunk_size = 15;
1717     const size_t kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
1718                                                kDefaultIntValue);
1719     CAutoEnvironmentVariable tmp_env("OVERLAP_CHUNK_SIZE", "6");
1720 
1721     vector< vector<int> > starting_chunks(kNumChunks);
1722     vector< vector<int> > absolute_contexts(kNumChunks);
1723     vector< vector<size_t> > context_offset_corrections(kNumChunks);
1724 
1725     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1726                                  starting_chunks);
1727     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1728                                  absolute_contexts);
1729     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1730                                  context_offset_corrections);
1731 
1732     x_TestCContextTranslator(gi_length, chunk_size, kNumChunks, eBlastx,
1733                              starting_chunks, absolute_contexts,
1734                              &context_offset_corrections,
1735                              eNa_strand_both);
1736 }
1737 
1738 /*********  This functionality has not been implemented  **************/
1739 #if 0
1740 
1741 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_BothStrands) {
1742     const string
1743         kTestName("TestCContextTranslator_BlastxMultiQuery_BothStrands");
1744     TGiLengthVector gi_length;
1745     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1746     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1747     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1748 
1749     const size_t chunk_size = 501;
1750     const size_t num_chunks = 10;
1751 
1752     vector< vector<int> > starting_chunks(num_chunks);
1753     vector< vector<int> > absolute_contexts(num_chunks);
1754     vector< vector<size_t> > context_offset_corrections(num_chunks);
1755 
1756     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1757                                  starting_chunks);
1758     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1759                                  absolute_contexts);
1760     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1761                                  context_offset_corrections);
1762 
1763     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1764                              starting_chunks, absolute_contexts,
1765                              &context_offset_corrections,
1766                              eNa_strand_both);
1767 }
1768 
1769 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_PlusStrand) {
1770     const string
1771         kTestName("TestCContextTranslator_BlastxMultiQuery_PlusStrand");
1772     TGiLengthVector gi_length;
1773     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1774     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1775     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1776 
1777     const size_t chunk_size = 500;
1778     const size_t num_chunks = 10;
1779 
1780     vector< vector<int> > starting_chunks(num_chunks);
1781     vector< vector<int> > absolute_contexts(num_chunks);
1782     vector< vector<size_t> > context_offset_corrections(num_chunks);
1783 
1784     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1785                                  starting_chunks);
1786     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1787                                  absolute_contexts);
1788     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1789                                  context_offset_corrections);
1790 
1791     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1792                              starting_chunks, absolute_contexts,
1793                              &context_offset_corrections,
1794                              eNa_strand_plus);
1795 }
1796 
1797 BOOST_AUTO_TEST_CASE(TestCContextTranslator_BlastxMultiQuery_MinusStrand) {
1798     const string
1799         kTestName("TestCContextTranslator_BlastxMultiQuery_MinusStrand");
1800     TGiLengthVector gi_length;
1801     gi_length.push_back(make_pair<int, size_t>(107784911, 1000));
1802     gi_length.push_back(make_pair<int, size_t>(115354032, 250));
1803     gi_length.push_back(make_pair<int, size_t>(115381005, 2551));
1804 
1805     const size_t chunk_size = 500;
1806     const size_t num_chunks = 10;
1807 
1808     vector< vector<int> > starting_chunks(num_chunks);
1809     vector< vector<int> > absolute_contexts(num_chunks);
1810     vector< vector<size_t> > context_offset_corrections(num_chunks);
1811 
1812     x_ReadVectorOfVectorsForTest(kTestName, "StartingChunks",
1813                                  starting_chunks);
1814     x_ReadVectorOfVectorsForTest(kTestName, "AbsoluteContexts",
1815                                  absolute_contexts);
1816     x_ReadVectorOfVectorsForTest(kTestName, "ContextOffsets",
1817                                  context_offset_corrections);
1818 
1819     x_TestCContextTranslator(gi_length, chunk_size, num_chunks, eBlastx,
1820                              starting_chunks, absolute_contexts,
1821                              &context_offset_corrections,
1822                              eNa_strand_minus);
1823 }
1824 #endif
1825 
1826 
1827 /// Tests the CQuerySplitter class when no splitting should occur
BOOST_AUTO_TEST_CASE(QuerySplitter_NoSplit)1828 BOOST_AUTO_TEST_CASE(QuerySplitter_NoSplit) {
1829     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "40000");
1830     const string kTestName("QuerySplitter_NoSplit");
1831     CBlastQueryVector query;
1832     CSeq_id id(CSeq_id::e_Gi, 555);
1833     query.AddQuery(CTestObjMgr::Instance().CreateBlastSearchQuery(id));
1834 
1835     CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(query));
1836     CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastn));
1837     CRef<CBlastOptions> opts(&opts_h->SetOptions());
1838 
1839     const size_t kNumChunks = m_Config->GetInt(kTestName, "NumChunks",
1840                                                kDefaultIntValue);
1841     CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1842 
1843     BOOST_REQUIRE_EQUAL(false, splitter->IsQuerySplit());
1844     BOOST_REQUIRE_EQUAL(m_Config->GetInt(kTestName, "ChunkSize",
1845                                           kDefaultIntValue),
1846                          (int)splitter->GetChunkSize());
1847     BOOST_REQUIRE_EQUAL(kNumChunks, (size_t)splitter->GetNumberOfChunks());
1848 
1849     CRef<CSplitQueryBlk> sqb = splitter->Split();
1850     BOOST_REQUIRE_EQUAL(false, splitter->IsQuerySplit());
1851     BOOST_REQUIRE_EQUAL(kNumChunks, sqb->GetNumChunks());
1852 
1853     try {
1854         // try passing an out-of-range index
1855         (void)sqb->GetNumQueriesForChunk(kNumChunks + 8);
1856         BOOST_REQUIRE(false);
1857     } catch (const runtime_error&) {
1858         BOOST_REQUIRE(true);
1859     }
1860 
1861     CRef<IQueryFactory> chunk_query_factory =
1862         splitter->GetQueryFactoryForChunk(0);
1863     BOOST_REQUIRE_EQUAL(qf, chunk_query_factory);
1864 }
1865 
1866 /// Tests the CQuerySplitter class for retrieval of IQueryFactory objects
1867 /// for given chunks
BOOST_AUTO_TEST_CASE(QuerySplitter_ValidateQueryFactoriesBlastn)1868 BOOST_AUTO_TEST_CASE(QuerySplitter_ValidateQueryFactoriesBlastn) {
1869     CAutoEnvironmentVariable tmp_env("CHUNK_SIZE", "30000");
1870     TGiLengthVector gi_length;
1871     gi_length.push_back(make_pair<int, size_t>(95116755, 35000));
1872     gi_length.push_back(make_pair<int, size_t>(112123020, 35580));
1873 
1874     TSeqLocVector queries;
1875     s_ConvertToBlastQueries(gi_length, queries);
1876 
1877     CRef<IQueryFactory> qf(new CObjMgr_QueryFactory(queries));
1878     CRef<CBlastOptionsHandle> opts_h(CBlastOptionsFactory::Create(eBlastn));
1879     CRef<CBlastOptions> opts(&opts_h->SetOptions());
1880 
1881     CRef<CQuerySplitter> splitter(new CQuerySplitter(qf, &*opts));
1882     const size_t kNumChunks(2);
1883 
1884     try {
1885         (void)splitter->GetQueryFactoryForChunk(kNumChunks);
1886         BOOST_REQUIRE(false);
1887     } catch (const out_of_range& ) {
1888         BOOST_REQUIRE(true);
1889     }
1890 
1891     CRef<IQueryFactory> chunk_0 = splitter->GetQueryFactoryForChunk(0);
1892     CRef<IQueryFactory> chunk_1 = splitter->GetQueryFactoryForChunk(1);
1893 
1894     BOOST_REQUIRE(chunk_0 != qf);
1895     BOOST_REQUIRE(chunk_1 != qf);
1896 
1897     BOOST_REQUIRE(chunk_0.NotEmpty());
1898     BOOST_REQUIRE(chunk_1.NotEmpty());
1899 }
1900 
/// Exercises SplitQuery_CalculateNumChunks for blastx with several query
/// lengths (a fixed length, half a chunk, and two lengths built from the
/// chunk size and the overlap size) and a single query in each call.
///
/// The chunk size is handed to the function by pointer and the same variable
/// is reused for every call, so each query length below is derived from
/// whatever value the previous call left in it.  The calls therefore have to
/// stay in this order.
BOOST_AUTO_TEST_CASE(CalculateNumberChunks)
{
    const EBlastProgramType kProgram = eBlastTypeBlastx;
    size_t chunk_sz = 10002;

    // Fixed query length.
    Uint4 num_chunks =
        SplitQuery_CalculateNumChunks(kProgram, &chunk_sz, 10240000, 1);
    BOOST_REQUIRE_EQUAL(1055, num_chunks);

    // Query half as long as the current chunk size.
    const size_t kHalfChunkLength = chunk_sz / 2;
    num_chunks =
        SplitQuery_CalculateNumChunks(kProgram, &chunk_sz, kHalfChunkLength, 1);
    BOOST_REQUIRE_EQUAL(1, num_chunks);

    // Three chunk sizes minus two overlap sizes.
    const size_t kThreeChunkLength =
        3 * chunk_sz - 2 * SplitQuery_GetOverlapChunkSize(kProgram);
    num_chunks =
        SplitQuery_CalculateNumChunks(kProgram, &chunk_sz, kThreeChunkLength, 1);
    BOOST_REQUIRE_EQUAL(3, num_chunks);

    // One base more than two chunk sizes plus one overlap size.
    const size_t kTwoChunkLength =
        1 + 2 * chunk_sz + SplitQuery_GetOverlapChunkSize(kProgram);
    num_chunks =
        SplitQuery_CalculateNumChunks(kProgram, &chunk_sz, kTwoChunkLength, 1);
    BOOST_REQUIRE_EQUAL(2, num_chunks);
}
1926 
/// Verifies that SplitQuery_GetChunkSize throws CBlastException for blastx
/// when the CHUNK_SIZE environment variable is set to 40000.
/// NOTE(review): presumably rejected because 40000 is not a multiple of the
/// codon length -- confirm against SplitQuery_GetChunkSize.
BOOST_AUTO_TEST_CASE(InvalidChunkSizeBlastx)
{
    CAutoEnvironmentVariable chunk_size_env("CHUNK_SIZE", "40000");
    BOOST_REQUIRE_THROW(SplitQuery_GetChunkSize(blast::eBlastx),
                        CBlastException);
}
1932 
/// Verifies that SplitQuery_GetChunkSize throws CBlastException for tblastx
/// when the CHUNK_SIZE environment variable is set to 40000.
/// NOTE(review): presumably rejected because 40000 is not a multiple of the
/// codon length -- confirm against SplitQuery_GetChunkSize.
BOOST_AUTO_TEST_CASE(InvalidChunkSizeTblastx)
{
    CAutoEnvironmentVariable chunk_size_env("CHUNK_SIZE", "40000");
    BOOST_REQUIRE_THROW(SplitQuery_GetChunkSize(blast::eTblastx),
                        CBlastException);
}
1938 
1939 BOOST_AUTO_TEST_SUITE_END()
1940