1
2 /* ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /** @file split_query_cxx.cpp
31 * Defines CQuerySplitter, a class to split the query sequence(s)
32 */
33
34 #include <ncbi_pch.hpp>
35 #include "split_query.hpp"
36 #include <algo/blast/api/sseqloc.hpp>
37 #include <algo/blast/api/blast_options.hpp>
38 #include <algo/blast/api/local_blast.hpp>
39
40 #include <objtools/simple/simple_om.hpp>
41 #include <objmgr/util/sequence.hpp>
42 #include <algo/blast/api/objmgr_query_data.hpp>
43
44 #include "split_query_aux_priv.hpp"
45 #include "blast_setup.hpp"
46
47 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(blast)48 BEGIN_SCOPE(blast)
49
50 CQuerySplitter::CQuerySplitter(CRef<IQueryFactory> query_factory,
51 const CBlastOptions* options)
52 : m_QueryFactory(query_factory), m_Options(options), m_NumChunks(0),
53 m_LocalQueryData(0), m_TotalQueryLength(0), m_ChunkSize(0)
54 {
55 m_ChunkSize = SplitQuery_GetChunkSize(m_Options->GetProgram());
56 m_LocalQueryData = m_QueryFactory->MakeLocalQueryData(m_Options);
57 m_TotalQueryLength = m_LocalQueryData->GetSumOfSequenceLengths();
58 m_NumChunks = SplitQuery_CalculateNumChunks(m_Options->GetProgramType(),
59 &m_ChunkSize, m_TotalQueryLength, m_LocalQueryData->GetNumQueries());
60 /* No split for ungapped mode JIRA SB-1082 */
61 if (!options->GetGappedMode()) m_NumChunks = 1;
62 x_ExtractCScopesAndMasks();
63 }
64
operator <<(ostream & out,const CQuerySplitter & rhs)65 ostream& operator<<(ostream& out, const CQuerySplitter& rhs)
66 {
67 ILocalQueryData* query_data =
68 const_cast<ILocalQueryData*>(&*rhs.m_LocalQueryData);
69 const size_t kNumQueries = query_data->GetNumQueries();
70 const size_t kNumChunks = rhs.GetNumberOfChunks();
71
72 out << endl << "; This is read by x_ReadQueryBoundsPerChunk"
73 << endl << "; Format: query start, query end, strand" << endl;
74
75 // for all query indices {
76 // iterate every chunk and collect the coords, print them out
77 // }
78 for (size_t query_index = 0; query_index < kNumQueries; query_index++) {
79 CConstRef<CSeq_id> query_id
80 (query_data->GetSeq_loc(query_index)->GetId());
81 _ASSERT(query_id);
82
83 for (size_t chunk_index = 0; chunk_index < kNumChunks; chunk_index++) {
84 ASSERT(rhs.m_SplitQueriesInChunk.size());
85 CRef<CBlastQueryVector> queries_in_chunk =
86 rhs.m_SplitQueriesInChunk[chunk_index];
87
88 for (size_t qidx = 0; qidx < queries_in_chunk->Size(); qidx++) {
89 CConstRef<CSeq_loc> query_loc_in_chunk =
90 queries_in_chunk->GetQuerySeqLoc(qidx);
91 _ASSERT(query_loc_in_chunk);
92 CConstRef<CSeq_id> query_id_in_chunk
93 (query_loc_in_chunk->GetId());
94 _ASSERT(query_id_in_chunk);
95
96 if (query_id->Match(*query_id_in_chunk)) {
97 const CSeq_loc::TRange& range =
98 query_loc_in_chunk->GetTotalRange();
99 out << "Chunk" << chunk_index << "Query" << query_index
100 << " = " << range.GetFrom() << ", "
101 << range.GetToOpen() << ", "
102 << (int)query_loc_in_chunk->GetStrand() << endl;
103 }
104 }
105 }
106 out << endl;
107 }
108
109 return out;
110 }
111
112 void
x_ExtractCScopesAndMasks()113 CQuerySplitter::x_ExtractCScopesAndMasks()
114 {
115 _ASSERT(m_LocalQueryData.NotEmpty());
116 _ASSERT(m_Scopes.empty());
117 _ASSERT(m_UserSpecifiedMasks.empty());
118
119 const size_t num_queries = m_LocalQueryData->GetNumQueries();
120
121 CObjMgr_QueryFactory* objmgr_qf = NULL;
122 if ( (objmgr_qf = dynamic_cast<CObjMgr_QueryFactory*>(&*m_QueryFactory)) ) {
123 // extract the scopes and masks ...
124 m_Scopes = objmgr_qf->ExtractScopes();
125 m_UserSpecifiedMasks = objmgr_qf->ExtractUserSpecifiedMasks();
126 _ASSERT(m_Scopes.size() == num_queries);
127 } else {
128 m_NumChunks = 1;
129 m_UserSpecifiedMasks.assign(num_queries, TMaskedQueryRegions());
130 }
131 _ASSERT(m_UserSpecifiedMasks.size() == num_queries);
132 }
133
134 void
x_ComputeChunkRanges()135 CQuerySplitter::x_ComputeChunkRanges()
136 {
137 _ASSERT(m_SplitBlk.NotEmpty());
138
139 // Note that this information might not need to be stored in the
140 // SSplitQueryBlk structure, as these ranges can be calculated as follows:
141 // chunk_start = (chunk_num*chunk_size) - (chunk_num*overlap_size);
142 // chunk_end = chunk_start + chunk_size > query_size ? query_size :
143 // chunk_start + chunk_size;
144
145 size_t chunk_start = 0;
146 const size_t kOverlapSize =
147 SplitQuery_GetOverlapChunkSize(m_Options->GetProgramType());
148 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
149 size_t chunk_end = chunk_start + m_ChunkSize;
150
151 // if the chunk end is larger than the sequence ...
152 if (chunk_end >= m_TotalQueryLength ||
153 // ... or this is the last chunk and it didn't make it to the end
154 // of the sequence
155 (chunk_end < m_TotalQueryLength && (chunk_num + 1) == m_NumChunks))
156 {
157 // ... assign this chunk's end to the end of the sequence
158 chunk_end = m_TotalQueryLength;
159 }
160
161 m_SplitBlk->SetChunkBounds(chunk_num,
162 TChunkRange(chunk_start, chunk_end));
163 _TRACE("Chunk " << chunk_num << ": ranges from " << chunk_start
164 << " to " << chunk_end);
165
166 chunk_start += (m_ChunkSize - kOverlapSize);
167 if (chunk_start > m_TotalQueryLength ||
168 chunk_end == m_TotalQueryLength) {
169 break;
170 }
171 }
172
173 // For purposes of having an accurate overlap size when stitching back
174 // HSPs, save the overlap size
175 const size_t kOverlap =
176 Blast_QueryIsTranslated(m_Options->GetProgramType())
177 ? kOverlapSize / CODON_LENGTH : kOverlapSize;
178 m_SplitBlk->SetChunkOverlapSize(kOverlap);
179 }
180
181 /// Auxiliary function to assign the split query's Seq-interval so that it's
182 /// constrained within the chunk boundaries
183 /// @param chunk Range for the chunk [in]
184 /// @param query_range Range of sequence data corresponding to the full query
185 /// [in]
186 /// @param split_query_loc Seq-loc for this query constrained by the chunk's
187 /// boundaries [out]
188 static void
s_SetSplitQuerySeqInterval(const TChunkRange & chunk,const TChunkRange & query_range,int query_offset,CRef<CSeq_loc> split_query_loc)189 s_SetSplitQuerySeqInterval(const TChunkRange& chunk,
190 const TChunkRange& query_range,
191 int query_offset,
192 CRef<CSeq_loc> split_query_loc)
193 {
194 _ASSERT(split_query_loc.NotEmpty());
195 _ASSERT(chunk.IntersectingWith(query_range));
196 CSeq_interval& interval = split_query_loc->SetInt();
197 const int qstart = chunk.GetFrom() - query_range.GetFrom();
198 const int qend = chunk.GetToOpen() - query_range.GetToOpen();
199
200 interval.SetFrom(max(0, qstart) + query_offset);
201
202 if (qend >= 0) {
203 interval.SetTo(query_range.GetToOpen() - query_range.GetFrom() + query_offset);
204 } else {
205 interval.SetTo(chunk.GetToOpen() - query_range.GetFrom() + query_offset);
206 }
207
208 // Note subtraction, as Seq-intervals are assumed to be
209 // open/inclusive
210 interval.SetTo() -= 1;
211 }
212
213 void
x_ComputeQueryIndicesForChunks()214 CQuerySplitter::x_ComputeQueryIndicesForChunks()
215 {
216 const int kNumQueries = m_LocalQueryData->GetNumQueries();
217 const EBlastProgramType kProgram = m_Options->GetProgramType();
218 const ENa_strand kStrandOption = m_Options->GetStrandOption();
219
220 // Build vector of query ranges along the concatenated query
221 vector<TChunkRange> query_ranges;
222 query_ranges.reserve(kNumQueries);
223 query_ranges.push_back(TChunkRange(0, m_LocalQueryData->GetSeqLength(0)));
224 _TRACE("Query 0: " << query_ranges.back().GetFrom() << "-" <<
225 query_ranges.back().GetToOpen());
226 for (int i = 1; i < kNumQueries; i++) {
227 TSeqPos query_start = query_ranges[i-1].GetTo() + 1;
228 TSeqPos query_end = query_start + m_LocalQueryData->GetSeqLength(i);
229 query_ranges.push_back(TChunkRange(query_start, query_end));
230 _TRACE("Query " << i << ": " << query_ranges.back().GetFrom()
231 << "-" << query_ranges.back().GetToOpen());
232 }
233
234 m_SplitQueriesInChunk.assign(m_NumChunks, CRef<CBlastQueryVector>());
235 _ASSERT(m_UserSpecifiedMasks.size() == (size_t)kNumQueries);
236
237 // determine intersection between query ranges and chunk ranges
238 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
239 const TChunkRange chunk = m_SplitBlk->GetChunkBounds(chunk_num);
240 // FIXME: can be optimized to avoid examining those that have already
241 // been assigned to a given chunk
242 for (size_t qindex = 0; qindex < query_ranges.size(); qindex++) {
243 const TChunkRange& query_range = query_ranges[qindex];
244 if ( !chunk.IntersectingWith(query_range) ) {
245 continue;
246 }
247 m_SplitBlk->AddQueryToChunk(chunk_num, qindex);
248 if (m_SplitQueriesInChunk[chunk_num].Empty()) {
249 m_SplitQueriesInChunk[chunk_num].Reset(new CBlastQueryVector);
250 }
251
252 // Build the split query seqloc for this query
253
254 CConstRef<CSeq_loc> query_seqloc(m_LocalQueryData->GetSeq_loc(qindex));
255
256 CRef<CSeq_loc> split_query_loc(new CSeq_loc);
257 {
258 int q_sl_offset = 0;
259 if(query_seqloc->IsInt() && (query_seqloc->GetInt().GetFrom() > 0 )) {
260 q_sl_offset = query_seqloc->GetInt().GetFrom();
261 }
262 s_SetSplitQuerySeqInterval(chunk, query_range, q_sl_offset, split_query_loc);
263 }
264
265
266
267 split_query_loc->SetId(*query_seqloc->GetId());
268 const ENa_strand kStrand =
269 BlastSetup_GetStrand(*query_seqloc, kProgram, kStrandOption);
270 split_query_loc->SetStrand(kStrand);
271 _TRACE("Chunk " << chunk_num << ": query " << qindex << " ("
272 << split_query_loc->GetId()->AsFastaString() << ")"
273 << " " << split_query_loc->GetInt().GetFrom()
274 << "-" << split_query_loc->GetInt().GetTo()
275 << " strand " << (int)split_query_loc->GetStrand());
276
277 // retrieve the split mask corresponding to this chunk of the query
278 CRef<CSeq_loc> mask_query_loc(new CSeq_loc);
279 s_SetSplitQuerySeqInterval(chunk, query_range, 0, mask_query_loc);
280 TMaskedQueryRegions split_mask =
281 m_UserSpecifiedMasks[qindex].RestrictToSeqInt(mask_query_loc->GetInt());
282
283 // retrieve the scope to retrieve this query
284 CRef<CScope> scope(m_Scopes[qindex]);
285 _ASSERT(scope.NotEmpty());
286
287 // our split query chunk :)
288 CRef<CBlastSearchQuery> split_query
289 (new CBlastSearchQuery(*split_query_loc, *scope, split_mask));
290 m_SplitQueriesInChunk[chunk_num]->AddQuery(split_query);
291 }
292 }
293 }
294
295 /** Adds the necessary shift to the context to record the query contexts for
296 * the query chunks
297 * @param context query context [in]
298 * @param shift shift to add [in]
299 */
300 static inline unsigned int
s_AddShift(unsigned int context,int shift)301 s_AddShift(unsigned int context, int shift)
302 {
303 _ASSERT(context == 3 || context == 4 || context == 5);
304 _ASSERT(shift == 0 || shift == 1 || shift == -1);
305
306 unsigned int retval;
307 if (shift == 0) {
308 retval = context;
309 } else if (shift == 1) {
310 retval = context == 3 ? 5 : context - shift;
311 } else if (shift == -1) {
312 retval = context == 5 ? 3 : context - shift;
313 } else {
314 abort();
315 }
316 return retval;
317 }
318
319 /**
320 * @brief Retrieve the shift for the negative strand
321 *
322 * @param query_length length of the query [in]
323 *
324 * @return shift (either 1, -1, or 0)
325 */
326 static inline int
s_GetShiftForTranslatedNegStrand(size_t query_length)327 s_GetShiftForTranslatedNegStrand(size_t query_length)
328 {
329 int retval;
330 switch (query_length % CODON_LENGTH) {
331 case 1: retval = -1; break;
332 case 2: retval = 1; break;
333 case 0: default: retval = 0; break;
334 }
335 return retval;
336 }
337
338 void
x_ComputeQueryContextsForChunks()339 CQuerySplitter::x_ComputeQueryContextsForChunks()
340 {
341 const EBlastProgramType kProgram = m_Options->GetProgramType();
342 const unsigned int kNumContexts = GetNumberOfContexts(kProgram);
343 const ENa_strand kStrandOption = m_Options->GetStrandOption();
344 auto_ptr<CQueryDataPerChunk> qdpc;
345
346 if (Blast_QueryIsTranslated(kProgram)) {
347 qdpc.reset(new CQueryDataPerChunk(*m_SplitBlk, kProgram,
348 m_LocalQueryData));
349 }
350
351 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
352 vector<size_t> queries = m_SplitBlk->GetQueryIndices(chunk_num);
353
354 for (size_t i = 0; i < queries.size(); i++) {
355 CConstRef<CSeq_loc> sl = m_LocalQueryData->GetSeq_loc(queries[i]);
356 const ENa_strand kStrand =
357 BlastSetup_GetStrand(*sl, kProgram, kStrandOption);
358
359 if (Blast_QueryIsTranslated(kProgram)) {
360 size_t qlength = qdpc->GetQueryLength(queries[i]);
361 int last_query_chunk = qdpc->GetLastChunk(queries[i]);
362 _ASSERT(last_query_chunk != -1);
363 int shift = s_GetShiftForTranslatedNegStrand(qlength);
364
365 for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
366 // handle the plus strand...
367 if (ctx % NUM_FRAMES < CODON_LENGTH) {
368 if (kStrand == eNa_strand_minus) {
369 m_SplitBlk->AddContextToChunk(chunk_num,
370 kInvalidContext);
371 } else {
372 m_SplitBlk->AddContextToChunk(chunk_num,
373 kNumContexts*queries[i]+ctx);
374 }
375 } else { // handle the negative strand
376 if (kStrand == eNa_strand_plus) {
377 m_SplitBlk->AddContextToChunk(chunk_num,
378 kInvalidContext);
379 } else {
380 if (chunk_num == (size_t)last_query_chunk) {
381 // last chunk doesn't have shift
382 m_SplitBlk->AddContextToChunk(chunk_num,
383 kNumContexts*queries[i]+ctx);
384 } else {
385 m_SplitBlk->AddContextToChunk(chunk_num,
386 kNumContexts*queries[i]+
387 s_AddShift(ctx, shift));
388 }
389 }
390 }
391 }
392 } else if (Blast_QueryIsNucleotide(kProgram)) {
393
394 for (unsigned int ctx = 0; ctx < kNumContexts; ctx++) {
395 // handle the plus strand...
396 if (ctx % NUM_STRANDS == 0) {
397 if (kStrand == eNa_strand_minus) {
398 m_SplitBlk->AddContextToChunk(chunk_num,
399 kInvalidContext);
400 } else {
401 m_SplitBlk->AddContextToChunk(chunk_num,
402 kNumContexts*queries[i]+ctx);
403 }
404 } else { // handle the negative strand
405 if (kStrand == eNa_strand_plus) {
406 m_SplitBlk->AddContextToChunk(chunk_num,
407 kInvalidContext);
408 } else {
409 m_SplitBlk->AddContextToChunk(chunk_num,
410 kNumContexts*queries[i]+ctx);
411 }
412 }
413 }
414
415 } else if (Blast_QueryIsProtein(kProgram)) {
416 m_SplitBlk->AddContextToChunk(chunk_num,
417 kNumContexts*queries[i]);
418 } else {
419 abort();
420 }
421 }
422 }
423 }
424
425 /**
426 * @brief Determine whether a given context corresponds to the plus or minus
427 * strand
428 *
429 * @param qinfo BlastQueryInfo structure to determine the strand [in]
430 * @param context_number Context number in the BlastQueryInfo structure (index
431 * into the BlastContextInfo array) [in]
432 */
433 static inline bool
s_IsPlusStrand(const BlastQueryInfo * qinfo,Int4 context_number)434 s_IsPlusStrand(const BlastQueryInfo* qinfo, Int4 context_number)
435 {
436 return qinfo->contexts[context_number].frame >= 0;
437 }
438
439 /**
440 * @brief Get the length of a context in absolute terms (i.e.: in the context
441 * of the full, non-split sequence)
442 *
443 * @param chunk_qinfo vector of BlastQueryInfo structures corresponding to the
444 * various query chunks [in]
445 * @param chunk_num Chunk number, index into the vector above [in]
446 * @param ctx_translator auxiliary context translator object [in]
447 * @param absolute_context context in the full, non-split query
448 *
449 * @return length of the requested context
450 */
451 static size_t
s_GetAbsoluteContextLength(const vector<const BlastQueryInfo * > & chunk_qinfo,int chunk_num,const CContextTranslator & ctx_translator,int absolute_context)452 s_GetAbsoluteContextLength(const vector<const BlastQueryInfo*>& chunk_qinfo,
453 int chunk_num,
454 const CContextTranslator& ctx_translator,
455 int absolute_context)
456 {
457 if (chunk_num < 0) {
458 return 0;
459 }
460
461 int pos = ctx_translator.GetContextInChunk((size_t)chunk_num,
462 absolute_context);
463 if (pos != kInvalidContext) {
464 return chunk_qinfo[chunk_num]->contexts[pos].query_length;
465 }
466 return 0;
467 }
468
469 //#define DEBUG_COMPARE_SEQUENCES 1
470
471 #ifdef DEBUG_COMPARE_SEQUENCES
472
473 /// Convert a sequence into its printable representation
474 /// @param seq sequence data [in]
475 /// @param len length of sequence to print [in]
476 /// @param is_prot whether the sequence is protein or not [in]
s_GetPrintableSequence(const Uint1 * seq,size_t len,bool is_prot)477 static string s_GetPrintableSequence(const Uint1* seq, size_t len, bool is_prot)
478 {
479 string retval;
480 for (size_t i = 0; i < len; i++) {
481 retval.append(1, (is_prot
482 ? NCBISTDAA_TO_AMINOACID[seq[i]]
483 : BLASTNA_TO_IUPACNA[seq[i]]));
484 }
485 return retval;
486 }
487
488 /** Auxiliary function to validate the context offset corrections
489 * @param global global query sequence data [in]
490 * @param chunk sequence data for chunk [in]
491 * @param len length of the data to compare [in]
492 * @param is_prot whether the sequence is protein or not [in]
493 * @return true if sequence data is identical, false otherwise
494 */
cmp_sequence(const Uint1 * global,const Uint1 * chunk,size_t len,bool is_prot)495 static bool cmp_sequence(const Uint1* global, const Uint1* chunk, size_t len,
496 bool is_prot)
497 {
498 bool retval = true;
499
500 for (size_t i = 0; i < len; i++) {
501 if (global[i] != chunk[i]) {
502 retval = false;
503 break;
504 }
505 }
506
507 if (retval == false) {
508 _TRACE("Comparing global: '"
509 << s_GetPrintableSequence(global, len, is_prot) << "'");
510 _TRACE("with chunk: '"
511 << s_GetPrintableSequence(chunk, len, is_prot) << "'");
512 }
513
514 return retval;
515 }
516 #endif
517
518 /* ----------------------------------------------------------------------------
519 * To compute the offset correction for the plus strand we need to add the
520 * length of all the contexts spread across the various chunks and subtract
521 * the corresponding overlap regions.
522 *
523 * Assumptions: chunk size and overlap (in nucleotide coordinates) are
524 * divisible by CODON_LENGTH.
525 *
526 * let C = current split query chunk
527 * let SC = starting chunk such that the context in question is found in this
528 * chunk
529 * let ctx = context number in split query chunk C
530 * let O = overlap size in final sequence data coordinates (aa for blastx, na
531 * for blastn)
532 * let x = ctx equivalent in the global (absolute) unsplit query
533 * let ctxlen(a, b) be a function which computes the length of context a (given
534 * in global coordinates) in chunk b. If a is not found in b or if b doesn't
535 * exist (i.e.: is negative), 0 is returned.
536 * let corr = correction to be added to the global query offset for chunk C,
537 * context ctx
538 *
539 * corr = 0;
540 * for (; C != SC; C--) {
541 * let prev_len = ctxlen(x, C - 1);
542 * let curr_len = ctxlen(x, C);
543 * let overlap = min(O, curr_len);
544 * corr += prev_len - min(overlap, prev_len);
545 * }
546 *
547 * ----------------------------------------------------------------------------
548 * To compute the offset correction for the negative strand we need to subtract
549 * from the query length how far we have advanced into the query's negative
550 * strand (starting from the right) minus the overlap regions.
551 *
552 * Define the same variables as for the plus strand (except corr, defined
553 * below):
554 * let L = length of context x in the absolute query
555 * let corr = L - S, where S is computed as follows:
556 *
557 * S = 0;
558 * for (; C >= SC; C--) {
559 * let prev_len = ctxlen(x, C - 1);
560 * let curr_len = ctxlen(x, C);
561 * let overlap = min(O, curr_len);
562 * corr += curr_len - min(overlap, prev_len);
563 * }
564 *
565 */
566
567 void
x_ComputeContextOffsetsForChunks()568 CQuerySplitter::x_ComputeContextOffsetsForChunks()
569 {
570 if (Blast_QueryIsTranslated(m_Options->GetProgramType())) {
571 x_ComputeContextOffsets_TranslatedQueries();
572 } else {
573 x_ComputeContextOffsets_NonTranslatedQueries();
574 }
575 }
576
577 // Record the correction needed to synchronize with the complete query all the
578 // query offsets within HSPs that fall within this context
579 void
x_ComputeContextOffsets_NonTranslatedQueries()580 CQuerySplitter::x_ComputeContextOffsets_NonTranslatedQueries()
581 {
582 _ASSERT( !m_QueryChunkFactories.empty() );
583
584 const EBlastProgramType kProgram = m_Options->GetProgramType();
585 _ASSERT( !Blast_QueryIsTranslated(kProgram) );
586 const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
587 #ifdef DEBUG_COMPARE_SEQUENCES
588 const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
589 #endif
590 const size_t kOverlap = SplitQuery_GetOverlapChunkSize(kProgram);
591 CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
592 m_Options);
593 vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
594
595 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
596 CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
597 CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
598 #ifdef DEBUG_COMPARE_SEQUENCES
599 const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
600 #endif
601
602 // BlastQueryInfo structure corresponding to chunk number chunk_num
603 chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
604 _ASSERT(chunk_qinfo[chunk_num]);
605
606 // In case the first context differs from 0, for consistency with the
607 // other data returned by this class...
608 for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
609 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
610 }
611
612 for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
613 ctx <= chunk_qinfo[chunk_num]->last_context;
614 ctx++) {
615
616 size_t correction = 0;
617 const int starting_chunk =
618 ctx_translator.GetStartingChunk(chunk_num, ctx);
619 const int absolute_context =
620 ctx_translator.GetAbsoluteContext(chunk_num, ctx);
621
622 if (absolute_context == kInvalidContext ||
623 starting_chunk == kInvalidContext) {
624 _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
625 // INT4_MAX is the sentinel value for invalid contexts
626 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
627 continue;
628 }
629
630 if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
631
632 for (int c = chunk_num; c != starting_chunk; c--) {
633 size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
634 c - 1,
635 ctx_translator,
636 absolute_context);
637 size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
638 ctx_translator,
639 absolute_context);
640 size_t overlap = min(kOverlap, curr_len);
641 correction += prev_len - min(overlap, prev_len);
642 }
643
644 } else {
645
646 size_t subtrahend = 0;
647
648 for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
649 size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
650 c - 1,
651 ctx_translator,
652 absolute_context);
653 size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
654 c,
655 ctx_translator,
656 absolute_context);
657 size_t overlap = min(kOverlap, curr_len);
658 subtrahend += (curr_len - min(overlap, prev_len));
659 }
660 correction =
661 global_qinfo->contexts[absolute_context].query_length -
662 subtrahend;
663
664 }
665 _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
666 m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
667 #ifdef DEBUG_COMPARE_SEQUENCES
668 {
669 int global_offset = global_qinfo->contexts[absolute_context].query_offset +
670 correction;
671 int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
672 if (!cmp_sequence(&global_seq->sequence[global_offset],
673 &chunk_seq->sequence[chunk_offset], 10,
674 Blast_QueryIsProtein(kProgram))) {
675 cerr << "Failed to compare sequence data!" << endl;
676 }
677 }
678 #endif
679
680 }
681 }
682 _TRACE("CContextTranslator contents: " << ctx_translator);
683 }
684
685 void
x_ComputeContextOffsets_TranslatedQueries()686 CQuerySplitter::x_ComputeContextOffsets_TranslatedQueries()
687 {
688 _ASSERT( !m_QueryChunkFactories.empty() );
689
690 const EBlastProgramType kProgram = m_Options->GetProgramType();
691 _ASSERT(Blast_QueryIsTranslated(kProgram));
692 const BlastQueryInfo* global_qinfo = m_LocalQueryData->GetQueryInfo();
693 #ifdef DEBUG_COMPARE_SEQUENCES
694 const BLAST_SequenceBlk* global_seq = m_LocalQueryData->GetSequenceBlk();
695 #endif
696 const size_t kOverlap =
697 SplitQuery_GetOverlapChunkSize(kProgram) / CODON_LENGTH;
698 CContextTranslator ctx_translator(*m_SplitBlk, &m_QueryChunkFactories,
699 m_Options);
700 CQueryDataPerChunk qdpc(*m_SplitBlk, kProgram, m_LocalQueryData);
701 vector<const BlastQueryInfo*> chunk_qinfo(m_NumChunks, 0);
702
703 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
704 CRef<IQueryFactory> chunk_qf(m_QueryChunkFactories[chunk_num]);
705 CRef<ILocalQueryData> chunk_qd(chunk_qf->MakeLocalQueryData(m_Options));
706 #ifdef DEBUG_COMPARE_SEQUENCES
707 const BLAST_SequenceBlk* chunk_seq = chunk_qd->GetSequenceBlk();
708 #endif
709
710 // BlastQueryInfo structure corresponding to chunk number chunk_num
711 chunk_qinfo[chunk_num] = chunk_qd->GetQueryInfo();
712 _ASSERT(chunk_qinfo[chunk_num]);
713
714 // In case the first context differs from 0, for consistency with the
715 // other data returned by this class...
716 for (Int4 ctx = 0; ctx < chunk_qinfo[chunk_num]->first_context; ctx++) {
717 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
718 }
719
720 for (Int4 ctx = chunk_qinfo[chunk_num]->first_context;
721 ctx <= chunk_qinfo[chunk_num]->last_context;
722 ctx++) {
723
724 size_t correction = 0;
725 const int starting_chunk =
726 ctx_translator.GetStartingChunk(chunk_num, ctx);
727 const int absolute_context =
728 ctx_translator.GetAbsoluteContext(chunk_num, ctx);
729 const int last_query_chunk = qdpc.GetLastChunk(chunk_num, ctx);
730
731 if (absolute_context == kInvalidContext ||
732 starting_chunk == kInvalidContext) {
733 _ASSERT( !chunk_qinfo[chunk_num]->contexts[ctx].is_valid );
734 // INT4_MAX is the sentinel value for invalid contexts
735 m_SplitBlk->AddContextOffsetToChunk(chunk_num, INT4_MAX);
736 continue;
737 }
738
739 // The corrections for the contexts corresponding to the negative
740 // strand in the last chunk of a query sequence are all 0
741 if (!s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
742 (chunk_num == (size_t)last_query_chunk) &&
743 (ctx % NUM_FRAMES >= 3)) {
744 correction = 0;
745 goto error_check;
746 }
747
748 // The corrections for the contexts corresponding to the plus
749 // strand are always the same, so only calculate the first one
750 if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx) &&
751 (ctx % NUM_FRAMES == 1 || ctx % NUM_FRAMES == 2)) {
752 correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
753 goto error_check;
754 }
755
756 // If the query length is divisible by CODON_LENGTH, the
757 // corrections for all contexts corresponding to a given strand are
758 // the same, so only calculate the first one
759 if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 0) &&
760 (ctx % NUM_FRAMES != 0) && (ctx % NUM_FRAMES != 3)) {
761 correction = m_SplitBlk->GetContextOffsets(chunk_num).back();
762 goto error_check;
763 }
764
765 // If the query length % CODON_LENGTH == 1, the corrections for the
766 // first two contexts of the negative strand are the same, and the
767 // correction for the last context is one more than that.
768 if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 1) &&
769 !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
770
771 if (ctx % NUM_FRAMES == 4) {
772 correction =
773 m_SplitBlk->GetContextOffsets(chunk_num).back();
774 goto error_check;
775 } else if (ctx % NUM_FRAMES == 5) {
776 correction =
777 m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
778 goto error_check;
779 }
780 }
781
782 // If the query length % CODON_LENGTH == 2, the corrections for the
783 // last two contexts of the negative strand are the same, which is
784 // one more that the first context on the negative strand.
785 if ((qdpc.GetQueryLength(chunk_num, ctx) % CODON_LENGTH == 2) &&
786 !s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
787
788 if (ctx % NUM_FRAMES == 4) {
789 correction =
790 m_SplitBlk->GetContextOffsets(chunk_num).back() + 1;
791 goto error_check;
792 } else if (ctx % NUM_FRAMES == 5) {
793 correction =
794 m_SplitBlk->GetContextOffsets(chunk_num).back();
795 goto error_check;
796 }
797 }
798
799 if (s_IsPlusStrand(chunk_qinfo[chunk_num], ctx)) {
800
801 for (int c = chunk_num; c != starting_chunk; c--) {
802 size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
803 c - 1,
804 ctx_translator,
805 absolute_context);
806 size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo, c,
807 ctx_translator,
808 absolute_context);
809 size_t overlap = min(kOverlap, curr_len);
810 correction += prev_len - min(overlap, prev_len);
811 }
812
813 } else {
814
815 size_t subtrahend = 0;
816
817 for (int c = chunk_num; c >= starting_chunk && c >= 0; c--) {
818 size_t prev_len = s_GetAbsoluteContextLength(chunk_qinfo,
819 c - 1,
820 ctx_translator,
821 absolute_context);
822 size_t curr_len = s_GetAbsoluteContextLength(chunk_qinfo,
823 c,
824 ctx_translator,
825 absolute_context);
826 size_t overlap = min(kOverlap, curr_len);
827 subtrahend += (curr_len - min(overlap, prev_len));
828 }
829 correction =
830 global_qinfo->contexts[absolute_context].query_length -
831 subtrahend;
832 }
833
834 error_check:
835 _ASSERT((chunk_qinfo[chunk_num]->contexts[ctx].is_valid));
836 m_SplitBlk->AddContextOffsetToChunk(chunk_num, correction);
837 #ifdef DEBUG_COMPARE_SEQUENCES
838 {
839 int global_offset = global_qinfo->contexts[absolute_context].query_offset +
840 correction;
841 int chunk_offset = chunk_qinfo[chunk_num]->contexts[ctx].query_offset;
842 int num_bases2compare =
843 min(10, chunk_qinfo[chunk_num]->contexts[ctx].query_length);
844 if (!cmp_sequence(&global_seq->sequence[global_offset],
845 &chunk_seq->sequence[chunk_offset],
846 num_bases2compare, Blast_QueryIsProtein(kProgram))) {
847 cerr << "Failed to compare sequence data for chunk " << chunk_num
848 << ", context " << ctx << endl;
849 }
850 }
851 #endif
852 }
853 }
854 _TRACE("CContextTranslator contents: " << ctx_translator);
855 }
856
857 CRef<CSplitQueryBlk>
Split()858 CQuerySplitter::Split()
859 {
860 if (m_SplitBlk.NotEmpty()) {
861 return m_SplitBlk;
862 }
863
864 m_SplitBlk.Reset(new CSplitQueryBlk(m_NumChunks,
865 m_Options->GetGappedMode()));
866 m_QueryChunkFactories.reserve(m_NumChunks);
867
868 if (m_NumChunks == 1) {
869 m_QueryChunkFactories.push_back(m_QueryFactory);
870 } else {
871 _TRACE("Splitting into " << m_NumChunks << " query chunks");
872 x_ComputeChunkRanges();
873 x_ComputeQueryIndicesForChunks();
874 x_ComputeQueryContextsForChunks();
875
876 for (size_t chunk_num = 0; chunk_num < m_NumChunks; chunk_num++) {
877 CRef<IQueryFactory> qf
878 (new CObjMgr_QueryFactory(*m_SplitQueriesInChunk[chunk_num]));
879 m_QueryChunkFactories.push_back(qf);
880 }
881
882 x_ComputeContextOffsetsForChunks();
883 }
884
885 _TRACE("CSplitQuerBlk contents: " << *m_SplitBlk);
886 _TRACE("CQuerySplitter contents: " << *this);
887
888 return m_SplitBlk;
889 }
890
891 CRef<IQueryFactory>
GetQueryFactoryForChunk(Uint4 chunk_num)892 CQuerySplitter::GetQueryFactoryForChunk(Uint4 chunk_num)
893 {
894 if (chunk_num >= m_NumChunks) {
895 string msg("Invalid query chunk number: ");
896 msg += NStr::IntToString(chunk_num) + " out of " +
897 NStr::IntToString(m_NumChunks);
898 throw out_of_range(msg);
899 }
900
901 if (m_SplitBlk.Empty()) {
902 Split();
903 }
904
905 return m_QueryChunkFactories[chunk_num];
906 }
907
908 END_SCOPE(blast)
909 END_NCBI_SCOPE
910
911