1 
2 /* ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Christiam Camacho
27  *
28  */
29 
30 /** @file split_query_cxx.cpp
31  * Defines CQuerySplitter, a class to split the query sequence(s)
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include "split_query.hpp"
36 #include <algo/blast/api/sseqloc.hpp>
37 #include <algo/blast/api/blast_options.hpp>
38 #include <algo/blast/api/local_blast.hpp>
39 
40 #include <objtools/simple/simple_om.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>
43 
44 #include "split_query_aux_priv.hpp"
45 #include "blast_setup.hpp"
46 
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(blast)48 BEGIN_SCOPE(blast)
49 
50 CQuerySplitter::CQuerySplitter(CRef<IQueryFactory> query_factory,
51                                const CBlastOptions* options)
52     : m_QueryFactory(query_factory), m_Options(options), m_NumChunks(0),
53     m_LocalQueryData(0), m_TotalQueryLength(0), m_ChunkSize(0)
54 {
55     m_ChunkSize = SplitQuery_GetChunkSize(m_Options->GetProgram());
56     m_LocalQueryData = m_QueryFactory->MakeLocalQueryData(m_Options);
57     m_TotalQueryLength = m_LocalQueryData->GetSumOfSequenceLengths();
58     m_NumChunks = SplitQuery_CalculateNumChunks(m_Options->GetProgramType(),
59         &m_ChunkSize, m_TotalQueryLength, m_LocalQueryData->GetNumQueries());
60     /* No split for ungapped mode JIRA SB-1082 */
61     if (!options->GetGappedMode()) m_NumChunks = 1;
62     x_ExtractCScopesAndMasks();
63 }
64 
operator <<(ostream & out,const CQuerySplitter & rhs)65 ostream& operator<<(ostream& out, const CQuerySplitter& rhs)
66 {
67     ILocalQueryData* query_data =
68         const_cast<ILocalQueryData*>(&*rhs.m_LocalQueryData);
69     const size_t kNumQueries = query_data->GetNumQueries();
70     const size_t kNumChunks = rhs.GetNumberOfChunks();
71 
72     out << endl << "; This is read by x_ReadQueryBoundsPerChunk"
73         << endl << "; Format: query start, query end, strand" << endl;
74 
75     // for all query indices {
76     //     iterate every chunk and collect the coords, print them out
77     // }
78     for (size_t query_index = 0; query_index < kNumQueries; query_index++) {
79         CConstRef<CSeq_id> query_id
80             (query_data->GetSeq_loc(query_index)->GetId());
81         _ASSERT(query_id);
82 
83         for (size_t chunk_index = 0; chunk_index < kNumChunks; chunk_index++) {
84             ASSERT(rhs.m_SplitQueriesInChunk.size());
85             CRef<CBlastQueryVector> queries_in_chunk =
86                 rhs.m_SplitQueriesInChunk[chunk_index];
87 
88             for (size_t qidx = 0; qidx < queries_in_chunk->Size(); qidx++) {
89                 CConstRef<CSeq_loc> query_loc_in_chunk =
90                     queries_in_chunk->GetQuerySeqLoc(qidx);
91                 _ASSERT(query_loc_in_chunk);
92                 CConstRef<CSeq_id> query_id_in_chunk
93                     (query_loc_in_chunk->GetId());
94                 _ASSERT(query_id_in_chunk);
95 
96                 if (query_id->Match(*query_id_in_chunk)) {
97                     const CSeq_loc::TRange& range =
98                         query_loc_in_chunk->GetTotalRange();
99                     out << "Chunk" << chunk_index << "Query" << query_index
100                         << " = " << range.GetFrom() << ", "
101                         << range.GetToOpen() << ", "
102                         << (int)query_loc_in_chunk->GetStrand() << endl;
103                 }
104             }
105         }
106         out << endl;
107     }
108 
109     return out;
110 }
111 
112 void
x_ExtractCScopesAndMasks()113 CQuerySplitter::x_ExtractCScopesAndMasks()
114 {
115     _ASSERT(m_LocalQueryData.NotEmpty());
116     _ASSERT(m_Scopes.empty());
117     _ASSERT(m_UserSpecifiedMasks.empty());
118 
119     const size_t num_queries = m_LocalQueryData->GetNumQueries();
120 
121     CObjMgr_QueryFactory* objmgr_qf = NULL;
122     if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) ) {
123         // extract the scopes and masks ...
124         m_Scopes = objmgr_qf->ExtractScopes();
125         m_UserSpecifiedMasks = objmgr_qf->ExtractUserSpecifiedMasks();
126         _ASSERT(m_Scopes.size() == num_queries);
127     } else {
128         m_NumChunks = 1;
129         m_UserSpecifiedMasks.assign(num_queries, TMaskedQueryRegions());
130     }
131     _ASSERT(m_UserSpecifiedMasks.size() == num_queries);
132 }
133 
134 void
x_ComputeChunkRanges()135 CQuerySplitter::x_ComputeChunkRanges()
136 {
137     _ASSERT(m_SplitBlk.NotEmpty());
138 
139     // Note that this information might not need to be stored in the
140     // SSplitQueryBlk structure, as these ranges can be calculated as follows:
141     // chunk_start = (chunk_num*chunk_size) - (chunk_num*overlap_size);
142     // chunk_end = chunk_start + chunk_size > query_size ? query_size :
143     // chunk_start + chunk_size;
144 
145     size_t chunk_start = 0;
146     const size_t kOverlapSize =
147         SplitQuery_GetOverlapChunkSize(m_Options->GetProgramType());
148     for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
149         size_t chunk_end = chunk_start + m_ChunkSize;
150 
151         // if the chunk end is larger than the sequence ...
152         if (chunk_end >= m_TotalQueryLength ||
153             // ... or this is the last chunk and it didn't make it to the end
154             // of the sequence
155             (chunk_end < m_TotalQueryLength && (chunk_num + 1) == m_NumChunks))
156         {
157             // ... assign this chunk's end to the end of the sequence
158             chunk_end = m_TotalQueryLength;
159         }
160 
161         m_SplitBlk->SetChunkBounds(chunk_num,
162                                    TChunkRange(chunk_start, chunk_end));
163         _TRACE("Chunk " << chunk_num << ": ranges from " << chunk_start
164                << " to " << chunk_end);
165 
166         chunk_start += (m_ChunkSize - kOverlapSize);
167         if (chunk_start > m_TotalQueryLength ||
168             chunk_end == m_TotalQueryLength) {
169             break;
170         }
171     }
172 
173     // For purposes of having an accurate overlap size when stitching back
174     // HSPs, save the overlap size
175     const size_t kOverlap =
176         Blast_QueryIsTranslated(m_Options->GetProgramType())
177         ? kOverlapSize / CODON_LENGTH : kOverlapSize;
178     m_SplitBlk->SetChunkOverlapSize(kOverlap);
179 }
180 
181 /// Auxiliary function to assign the split query's Seq-interval so that it's
182 /// constrained within the chunk boundaries
183 /// @param chunk Range for the chunk [in]
184 /// @param query_range Range of sequence data corresponding to the full query
185 /// [in]
186 /// @param split_query_loc Seq-loc for this query constrained by the chunk's
187 /// boundaries [out]
188 static void
s_SetSplitQuerySeqInterval(const TChunkRange & chunk,const TChunkRange & query_range,int query_offset,CRef<CSeq_loc> split_query_loc)189 s_SetSplitQuerySeqInterval(const TChunkRange& chunk,
190                            const TChunkRange& query_range,
191                            int query_offset,
192                            CRef<CSeq_loc> split_query_loc)
193 {
194     _ASSERT(split_query_loc.NotEmpty());
195     _ASSERT(chunk.IntersectingWith(query_range));
196     CSeq_interval& interval = split_query_loc->SetInt();
197     const int qstart = chunk.GetFrom() - query_range.GetFrom();
198     const int qend = chunk.GetToOpen() - query_range.GetToOpen();
199 
200     interval.SetFrom(max(0, qstart) + query_offset);
201 
202     if (qend >= 0) {
203         interval.SetTo(query_range.GetToOpen() - query_range.GetFrom() + query_offset);
204     } else {
205         interval.SetTo(chunk.GetToOpen() - query_range.GetFrom() + query_offset);
206     }
207 
208     // Note subtraction, as Seq-intervals are assumed to be
209     // open/inclusive
210     interval.SetTo() -= 1;
211 }
212 
213 void
x_ComputeQueryIndicesForChunks()214 CQuerySplitter::x_ComputeQueryIndicesForChunks()
215 {
216     const int kNumQueries = m_LocalQueryData->GetNumQueries();
217     const EBlastProgramType kProgram = m_Options->GetProgramType();
218     const ENa_strand kStrandOption = m_Options->GetStrandOption();
219 
220     // Build vector of query ranges along the concatenated query
221     vector<TChunkRange> query_ranges;
222     query_ranges.reserve(kNumQueries);
223     query_ranges.push_back(TChunkRange(0, m_LocalQueryData->GetSeqLength(0)));
224     _TRACE("Query 0: " << query_ranges.back().GetFrom() << "-" <<
225            query_ranges.back().GetToOpen());
226     for (int i = 1; i < kNumQueries; i++) {
227         TSeqPos query_start = query_ranges[i-1].GetTo() + 1;
228         TSeqPos query_end = query_start + m_LocalQueryData->GetSeqLength(i);
229         query_ranges.push_back(TChunkRange(query_start, query_end));
230         _TRACE("Query " << i << ": " << query_ranges.back().GetFrom()
231                << "-" << query_ranges.back().GetToOpen());
232     }
233 
234     m_SplitQueriesInChunk.assign(m_NumChunks, CRef<CBlastQueryVector>());
235     _ASSERT(m_UserSpecifiedMasks.size() == (size_t)kNumQueries);
236 
237     // determine intersection between query ranges and chunk ranges
238     for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
239         const TChunkRange chunk = m_SplitBlk->GetChunkBounds(chunk_num);
240         // FIXME: can be optimized to avoid examining those that have already
241         // been assigned to a given chunk
242         for (size_t qindex = 0; qindex < query_ranges.size(); qindex++) {
243             const TChunkRange& query_range = query_ranges[qindex];
244             if ( !chunk.IntersectingWith(query_range) ) {
245                 continue;
246             }
247             m_SplitBlk->AddQueryToChunk(chunk_num, qindex);
248             if (m_SplitQueriesInChunk[chunk_num].Empty()) {
249                 m_SplitQueriesInChunk[chunk_num].Reset(new CBlastQueryVector);
250             }
251 
252             // Build the split query seqloc for this query
253 
254             CConstRef<CSeq_loc> query_seqloc(m_LocalQueryData->GetSeq_loc(qindex));
255 
256             CRef<CSeq_loc> split_query_loc(new CSeq_loc);
257             {
258             	int q_sl_offset = 0;
259             	if(query_seqloc->IsInt() && (query_seqloc->GetInt().GetFrom() > 0 )) {
260             		q_sl_offset = query_seqloc->GetInt().GetFrom();
261             	}
262             	s_SetSplitQuerySeqInterval(chunk, query_range, q_sl_offset, split_query_loc);
263             }
264 
265 
266 
267             split_query_loc->SetId(*query_seqloc->GetId());
268             const ENa_strand kStrand =
269                 BlastSetup_GetStrand(*query_seqloc, kProgram, kStrandOption);
270             split_query_loc->SetStrand(kStrand);
271             _TRACE("Chunk " << chunk_num << ": query " << qindex << " ("
272                    << split_query_loc->GetId()->AsFastaString() << ")"
273                    << " " << split_query_loc->GetInt().GetFrom()
274                    << "-" << split_query_loc->GetInt().GetTo()
275                    << " strand " << (int)split_query_loc->GetStrand());
276 
277             // retrieve the split mask corresponding to this chunk of the query
278             CRef<CSeq_loc> mask_query_loc(new CSeq_loc);
279             s_SetSplitQuerySeqInterval(chunk, query_range, 0, mask_query_loc);
280             TMaskedQueryRegions split_mask =
281                 m_UserSpecifiedMasks[qindex].RestrictToSeqInt(mask_query_loc->GetInt());
282 
283             // retrieve the scope to retrieve this query
284             CRef<CScope> scope(m_Scopes[qindex]);
285             _ASSERT(scope.NotEmpty());
286 
287             // our split query chunk :)
288             CRef<CBlastSearchQuery> split_query
289                 (new CBlastSearchQuery(*split_query_loc, *scope, split_mask));
290             m_SplitQueriesInChunk[chunk_num]->AddQuery(split_query);
291         }
292     }
293 }
294 
295 /** Adds the necessary shift to the context to record the query contexts for
296  * the query chunks
297  * @param context query context [in]
298  * @param shift shift to add [in]
299  */
300 static inline unsigned int
s_AddShift(unsigned int context,int shift)301 s_AddShift(unsigned int context, int shift)
302 {
303     _ASSERT(context == 3 || context == 4 || context == 5);
304     _ASSERT(shift == 0 || shift == 1 || shift == -1);
305 
306     unsigned int retval;
307     if (shift == 0) {
308         retval = context;
309     } else if (shift == 1) {
310         retval = context == 3 ? 5 : context - shift;
311     } else if (shift == -1) {
312         retval = context == 5 ? 3 : context - shift;
313     } else {
314         abort();
315     }
316     return retval;
317 }
318 
319 /**
320  * @brief Retrieve the shift for the negative strand
321  *
322  * @param query_length length of the query [in]
323  *
324  * @return shift (either 1, -1, or 0)
325  */
326 static inline int
s_GetShiftForTranslatedNegStrand(size_t query_length)327 s_GetShiftForTranslatedNegStrand(size_t query_length)
328 {
329     int retval;
330     switch (query_length % CODON_LENGTH) {
331     case 1: retval = -1; break;
332     case 2: retval = 1; break;
333     case 0: default: retval = 0; break;
334     }
335     return retval;
336 }
337 
338 void
x_ComputeQueryContextsForChunks()339 CQuerySplitter::x_ComputeQueryContextsForChunks()
340 {
341     const EBlastProgramType kProgram = m_Options->GetProgramType();
342     const unsigned int kNumContexts = GetNumberOfContexts(kProgram);
343     const ENa_strand kStrandOption = m_Options->GetStrandOption();
344     auto_ptr<CQueryDataPerChunk> qdpc;
345 
346     if (Blast_QueryIsTranslated(kProgram)) {
347         qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram,
348                                           m_LocalQueryData));
349     }
350 
351     for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
352         vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num);
353 
354         for (size_t i = 0; i < queries.size(); i++) {
355             CConstRef<CSeq_loc> sl = m_LocalQueryData->GetSeq_loc(queries[i]);
356             const ENa_strand kStrand =
357                 BlastSetup_GetStrand(*sl, kProgram, kStrandOption);
358 
359             if (Blast_QueryIsTranslated(kProgram)) {
360                 size_t qlength = qdpc->GetQueryLength(queries[i]);
361                 int last_query_chunk = qdpc->GetLastChunk(queries[i]);
362                 _ASSERT(last_query_chunk != -1);
363                 int shift = s_GetShiftForTranslatedNegStrand(qlength);
364 
365                 for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
366                     // handle the plus strand...
367                     if (ctx % NUM_FRAMES < CODON_LENGTH) {
368                         if (kStrand == eNa_strand_minus) {
369                             m_SplitBlk->AddContextToChunk(chunk_num,
370                                                           kInvalidContext);
371                         } else {
372                             m_SplitBlk->AddContextToChunk(chunk_num,
373                                               kNumContexts*queries[i]+ctx);
374                         }
375                     } else { // handle the negative strand
376                         if (kStrand == eNa_strand_plus) {
377                             m_SplitBlk->AddContextToChunk(chunk_num,
378                                                           kInvalidContext);
379                         } else {
380                             if (chunk_num == (size_t)last_query_chunk) {
381                                 // last chunk doesn't have shift
382                                 m_SplitBlk->AddContextToChunk(chunk_num,
383                                           kNumContexts*queries[i]+ctx);
384                             } else {
385                                 m_SplitBlk->AddContextToChunk(chunk_num,
386                                           kNumContexts*queries[i]+
387                                           s_AddShift(ctx, shift));
388                             }
389                         }
390                     }
391                 }
392             } else if (Blast_QueryIsNucleotide(kProgram)) {
393 
394                 for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
395                     // handle the plus strand...
396                     if (ctx % NUM_STRANDS == 0) {
397                         if (kStrand == eNa_strand_minus) {
398                             m_SplitBlk->AddContextToChunk(chunk_num,
399                                                           kInvalidContext);
400                         } else {
401                             m_SplitBlk->AddContextToChunk(chunk_num,
402                                               kNumContexts*queries[i]+ctx);
403                         }
404                     } else { // handle the negative strand
405                         if (kStrand == eNa_strand_plus) {
406                             m_SplitBlk->AddContextToChunk(chunk_num,
407                                                           kInvalidContext);
408                         } else {
409                             m_SplitBlk->AddContextToChunk(chunk_num,
410                                               kNumContexts*queries[i]+ctx);
411                         }
412                     }
413                 }
414 
415             } else if (Blast_QueryIsProtein(kProgram)) {
416                 m_SplitBlk->AddContextToChunk(chunk_num,
417                                               kNumContexts*queries[i]);
418             } else {
419                 abort();
420             }
421         }
422     }
423 }
424 
425 /**
426  * @brief Determine whether a given context corresponds to the plus or minus
427  * strand
428  *
429  * @param qinfo BlastQueryInfo structure to determine the strand [in]
430  * @param context_number Context number in the BlastQueryInfo structure (index
431  * into the BlastContextInfo array) [in]
432  */
433 static inline bool
s_IsPlusStrand(const BlastQueryInfo * qinfo,Int4 context_number)434 s_IsPlusStrand(const BlastQueryInfo* qinfo, Int4 context_number)
435 {
436     return qinfo->contexts[context_number].frame >= 0;
437 }
438 
439 /**
440  * @brief Get the length of a context in absolute terms (i.e.: in the context
441  * of the full, non-split sequence)
442  *
443  * @param chunk_qinfo vector of BlastQueryInfo structures corresponding to the
444  * various query chunks [in]
445  * @param chunk_num Chunk number, index into the vector above [in]
446  * @param ctx_translator auxiliary context translator object [in]
447  * @param absolute_context context in the full, non-split query
448  *
449  * @return length of the requested context
450  */
451 static size_t
s_GetAbsoluteContextLength(const vector<const BlastQueryInfo * > & chunk_qinfo,int chunk_num,const CContextTranslator & ctx_translator,int absolute_context)452 s_GetAbsoluteContextLength(const vector<const BlastQueryInfo*>& chunk_qinfo,
453                            int chunk_num,
454                            const CContextTranslator& ctx_translator,
455                            int absolute_context)
456 {
457     if (chunk_num < 0) {
458         return 0;
459     }
460 
461     int pos = ctx_translator.GetContextInChunk((size_t)chunk_num,
462                                                absolute_context);
463     if (pos != kInvalidContext) {
464         return chunk_qinfo[chunk_num]->contexts[pos].query_length;
465     }
466     return 0;
467 }
468 
469 //#define DEBUG_COMPARE_SEQUENCES 1
470 
471 #ifdef DEBUG_COMPARE_SEQUENCES
472 
473 /// Convert a sequence into its printable representation
474 /// @param seq sequence data [in]
475 /// @param len length of sequence to print [in]
476 /// @param is_prot whether the sequence is protein or not [in]
s_GetPrintableSequence(const Uint1 * seq,size_t len,bool is_prot)477 static string s_GetPrintableSequence(const Uint1* seq, size_t len, bool is_prot)
478 {
479     string retval;
480     for (size_t i = 0; i < len; i++) {
481         retval.append(1, (is_prot
482                       ? NCBISTDAA_TO_AMINOACID[seq[i]]
483                       : BLASTNA_TO_IUPACNA[seq[i]]));
484     }
485     return retval;
486 }
487 
488 /** Auxiliary function to validate the context offset corrections
489  * @param global global query sequence data [in]
490  * @param chunk sequence data for chunk [in]
491  * @param len length of the data to compare [in]
492  * @param is_prot whether the sequence is protein or not [in]
493  * @return true if sequence data is identical, false otherwise
494  */
cmp_sequence(const Uint1 * global,const Uint1 * chunk,size_t len,bool is_prot)495 static bool cmp_sequence(const Uint1* global, const Uint1* chunk, size_t len,
496                          bool is_prot)
497 {
498     bool retval = true;
499 
500     for (size_t i = 0; i < len; i++) {
501         if (global[i] != chunk[i]) {
502             retval = false;
503             break;
504         }
505     }
506 
507     if (retval == false) {
508         _TRACE("Comparing global: '"
509                << s_GetPrintableSequence(global, len, is_prot) << "'");
510         _TRACE("with chunk: '"
511                << s_GetPrintableSequence(chunk, len, is_prot) << "'");
512     }
513 
514     return retval;
515 }
516 #endif
517 
518 /* ----------------------------------------------------------------------------
519  * To compute the offset correction for the plus strand we need to add the
520  * length of all the contexts spread across the various chunks and subtract
521  * the corresponding overlap regions.
522  *
523  * Assumptions: chunk size and overlap (in nucleotide coordinates) are
524  * divisible by CODON_LENGTH.
525  *
526  * let C = current split query chunk
527  * let SC = starting chunk such that the context in question is found in this
528  * chunk
529  * let ctx = context number in split query chunk C
530  * let O = overlap size in final sequence data coordinates (aa for blastx, na
531  * for blastn)
532  * let x = ctx equivalent in the global (absolute) unsplit query
533  * let ctxlen(a, b) be a function which computes the length of context a (given
534  * in global coordinates) in chunk b. If a is not found in b or if b doesn't
535  * exist (i.e.: is negative), 0 is returned.
536  * let corr = correction to be added to the global query offset for chunk C,
537  * context ctx
538  *
539  * corr = 0;
540  * for (; C != SC; C--) {
541  *       let prev_len = ctxlen(x, C - 1);
542  *       let curr_len = ctxlen(x, C);
543  *       let overlap = min(O, curr_len);
544  *       corr += prev_len - min(overlap, prev_len);
545  * }
546  *
547  * ----------------------------------------------------------------------------
548  * To compute the offset correction for the negative strand we need to subtract
549  * from the query length how far we have advanced into the query's negative
550  * strand  (starting from the right) minus the overlap regions.
551  *
552  * Define the same variables as for the plus strand (except corr, defined
553  * below):
554  * let L = length of context x in the absolute query
555  * let corr = L - S, where S is computed as follows:
556  *
557  * S = 0;
558  * for (; C >= SC; C--) {
559  *       let prev_len = ctxlen(x, C - 1);
560  *       let curr_len = ctxlen(x, C);
561  *       let overlap = min(O, curr_len);
562  *       corr += curr_len - min(overlap, prev_len);
563  * }
564  *
565 */
566 
567 void
x_ComputeContextOffsetsForChunks()568 CQuerySplitter::x_ComputeContextOffsetsForChunks()
569 {
570     if (Blast_QueryIsTranslated(m_Options->GetProgramType())) {
571         x_ComputeContextOffsets_TranslatedQueries();
572     } else {
573         x_ComputeContextOffsets_NonTranslatedQueries();
574     }
575 }
576 
577 // Record the correction needed to synchronize with the complete query all the
578 // query offsets within HSPs that fall within this context
579 void
x_ComputeContextOffsets_NonTranslatedQueries()580 CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries()
581 {
582     _ASSERT( !m_QueryChunkFactories.empty() );
583 
584     const EBlastProgramType kProgram = m_Options->GetProgramType();
585     _ASSERT( !Blast_QueryIsTranslated(kProgram) );
586     const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
587 #ifdef DEBUG_COMPARE_SEQUENCES
588     const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
589 #endif
590     const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
591     CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
592                                       m_Options);
593     vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
594 
595     for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
596         CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
597         CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
598 #ifdef DEBUG_COMPARE_SEQUENCES
599         const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
600 #endif
601 
602         // BlastQueryInfo structure corresponding to chunk number chunk_num
603         chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
604         _ASSERT(chunk_qinfo[chunk_num]);
605 
606         // In case the first context differs from 0, for consistency with the
607         // other data returned by this class...
608         for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
609             m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
610         }
611 
612         for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
613              ctx <= chunk_qinfo[chunk_num]->last_context;
614              ctx++) {
615 
616             size_t correction = 0;
617             const int starting_chunk =
618                 ctx_translator.GetStartingChunk(chunk_num, ctx);
619             const int absolute_context =
620                 ctx_translator.GetAbsoluteContext(chunk_num, ctx);
621 
622             if (absolute_context == kInvalidContext ||
623                 starting_chunk == kInvalidContext) {
624                 _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
625                 // INT4_MAX is the sentinel value for invalid contexts
626                 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
627                 continue;
628             }
629 
630             if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
631 
632                 for (int c = chunk_num; c != starting_chunk; c--) {
633                     size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
634                                                          c - 1,
635                                                          ctx_translator,
636                                                          absolute_context);
637                     size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
638                                                          ctx_translator,
639                                                          absolute_context);
640                     size_t overlap = min(kOverlap, curr_len);
641                     correction += prev_len - min(overlap, prev_len);
642                 }
643 
644             } else {
645 
646                 size_t subtrahend = 0;
647 
648                 for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
649                     size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
650                                                          c - 1,
651                                                          ctx_translator,
652                                                          absolute_context);
653                     size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
654                                                          c,
655                                                          ctx_translator,
656                                                          absolute_context);
657                     size_t overlap = min(kOverlap, curr_len);
658                     subtrahend += (curr_len - min(overlap, prev_len));
659                 }
660                 correction =
661                     global_qinfo->contexts[absolute_context].query_length -
662                     subtrahend;
663 
664             }
665             _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
666             m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
667 #ifdef DEBUG_COMPARE_SEQUENCES
668 {
669     int global_offset = global_qinfo->contexts[absolute_context].query_offset +
670         correction;
671     int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
672     if (!cmp_sequence(&global_seq->sequence[global_offset],
673                       &chunk_seq->sequence[chunk_offset], 10,
674                       Blast_QueryIsProtein(kProgram))) {
675         cerr << "Failed to compare sequence data!" << endl;
676     }
677 }
678 #endif
679 
680         }
681     }
682     _TRACE("CContextTranslator contents: " << ctx_translator);
683 }
684 
685 void
x_ComputeContextOffsets_TranslatedQueries()686 CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
687 {
688     _ASSERT( !m_QueryChunkFactories.empty() );
689 
690     const EBlastProgramType kProgram = m_Options->GetProgramType();
691     _ASSERT(Blast_QueryIsTranslated(kProgram));
692     const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
693 #ifdef DEBUG_COMPARE_SEQUENCES
694     const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
695 #endif
696     const size_t kOverlap =
697         SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
698     CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
699                                       m_Options);
700     CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
701     vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
702 
703     for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
704         CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
705         CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
706 #ifdef DEBUG_COMPARE_SEQUENCES
707         const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
708 #endif
709 
710         // BlastQueryInfo structure corresponding to chunk number chunk_num
711         chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
712         _ASSERT(chunk_qinfo[chunk_num]);
713 
714         // In case the first context differs from 0, for consistency with the
715         // other data returned by this class...
716         for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
717             m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
718         }
719 
720         for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
721              ctx <= chunk_qinfo[chunk_num]->last_context;
722              ctx++) {
723 
724             size_t correction = 0;
725             const int starting_chunk =
726                 ctx_translator.GetStartingChunk(chunk_num, ctx);
727             const int absolute_context =
728                 ctx_translator.GetAbsoluteContext(chunk_num, ctx);
729             const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);
730 
731             if (absolute_context == kInvalidContext ||
732                 starting_chunk == kInvalidContext) {
733                 _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
734                 // INT4_MAX is the sentinel value for invalid contexts
735                 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
736                 continue;
737             }
738 
739             // The corrections for the contexts corresponding to the negative
740             // strand in the last chunk of a query sequence are all 0
741             if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
742                 (chunk_num == (size_t)last_query_chunk) &&
743                 (ctx % NUM_FRAMES >= 3)) {
744                 correction = 0;
745                 goto error_check;
746             }
747 
748             // The corrections for the contexts corresponding to the plus
749             // strand are always the same, so only calculate the first one
750             if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
751                 (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
752                 correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
753                 goto error_check;
754             }
755 
756             // If the query length is divisible by CODON_LENGTH, the
757             // corrections for all contexts corresponding to a given strand are
758             // the same, so only calculate the first one
759             if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
760                 (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
761                 correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
762                 goto error_check;
763             }
764 
765             // If the query length % CODON_LENGTH == 1, the corrections for the
766             // first two contexts of the negative strand are the same, and the
767             // correction for the last context is one more than that.
768             if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
769                 !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
770 
771                 if (ctx % NUM_FRAMES == 4) {
772                     correction =
773                         m_SplitBlk->GetContextOffsets(chunk_num).back();
774                     goto error_check;
775                 } else if (ctx % NUM_FRAMES == 5) {
776                     correction =
777                         m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
778                     goto error_check;
779                 }
780             }
781 
782             // If the query length % CODON_LENGTH == 2, the corrections for the
783             // last two contexts of the negative strand are the same, which is
784             // one more that the first context on the negative strand.
785             if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
786                 !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
787 
788                 if (ctx % NUM_FRAMES == 4) {
789                     correction =
790                         m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
791                     goto error_check;
792                 } else if (ctx % NUM_FRAMES == 5) {
793                     correction =
794                         m_SplitBlk->GetContextOffsets(chunk_num).back();
795                     goto error_check;
796                 }
797             }
798 
799             if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
800 
801                 for (int c = chunk_num; c != starting_chunk; c--) {
802                     size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
803                                                          c - 1,
804                                                          ctx_translator,
805                                                          absolute_context);
806                     size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
807                                                          ctx_translator,
808                                                          absolute_context);
809                     size_t overlap = min(kOverlap, curr_len);
810                     correction += prev_len - min(overlap, prev_len);
811                 }
812 
813             } else {
814 
815                 size_t subtrahend = 0;
816 
817                 for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
818                     size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
819                                                          c - 1,
820                                                          ctx_translator,
821                                                          absolute_context);
822                     size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
823                                                          c,
824                                                          ctx_translator,
825                                                          absolute_context);
826                     size_t overlap = min(kOverlap, curr_len);
827                     subtrahend += (curr_len - min(overlap, prev_len));
828                 }
829                 correction =
830                     global_qinfo->contexts[absolute_context].query_length -
831                     subtrahend;
832             }
833 
834 error_check:
835             _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
836             m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
837 #ifdef DEBUG_COMPARE_SEQUENCES
838 {
839     int global_offset = global_qinfo->contexts[absolute_context].query_offset +
840         correction;
841     int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
842     int num_bases2compare =
843         min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
844     if (!cmp_sequence(&global_seq->sequence[global_offset],
845                       &chunk_seq->sequence[chunk_offset],
846                       num_bases2compare, Blast_QueryIsProtein(kProgram))) {
847         cerr << "Failed to compare sequence data for chunk " << chunk_num
848              << ", context " << ctx << endl;
849     }
850 }
851 #endif
852         }
853     }
854     _TRACE("CContextTranslator contents: " << ctx_translator);
855 }
856 
857 CRef<CSplitQueryBlk>
Split()858 CQuerySplitter::Split()
859 {
860     if (m_SplitBlk.NotEmpty()) {
861         return m_SplitBlk;
862     }
863 
864     m_SplitBlk.Reset(new CSplitQueryBlk(m_NumChunks,
865                                         m_Options->GetGappedMode()));
866     m_QueryChunkFactories.reserve(m_NumChunks);
867 
868     if (m_NumChunks == 1) {
869         m_QueryChunkFactories.push_back(m_QueryFactory);
870     } else {
871         _TRACE("Splitting into " << m_NumChunks << " query chunks");
872         x_ComputeChunkRanges();
873         x_ComputeQueryIndicesForChunks();
874         x_ComputeQueryContextsForChunks();
875 
876         for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
877             CRef<IQueryFactory> qf
878                 (new CObjMgr_QueryFactory(*m_SplitQueriesInChunk[chunk_num]));
879             m_QueryChunkFactories.push_back(qf);
880         }
881 
882         x_ComputeContextOffsetsForChunks();
883     }
884 
885     _TRACE("CSplitQuerBlk contents: " << *m_SplitBlk);
886     _TRACE("CQuerySplitter contents: " << *this);
887 
888     return m_SplitBlk;
889 }
890 
891 CRef<IQueryFactory>
GetQueryFactoryForChunk(Uint4 chunk_num)892 CQuerySplitter::GetQueryFactoryForChunk(Uint4 chunk_num)
893 {
894     if (chunk_num >= m_NumChunks) {
895         string msg("Invalid query chunk number: ");
896         msg += NStr::IntToString(chunk_num) + " out of " +
897             NStr::IntToString(m_NumChunks);
898         throw out_of_range(msg);
899     }
900 
901     if (m_SplitBlk.Empty()) {
902         Split();
903     }
904 
905     return m_QueryChunkFactories[chunk_num];
906 }
907 
908 END_SCOPE(blast)
909 END_NCBI_SCOPE
910 
911