1 /* $Id: aa_ungapped.c 500404 2016-05-04 14:59:01Z camacho $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 */
26
27 /** @file aa_ungapped.c
28 * Functions to iterate over seed hits and perform ungapped extensions.
29 */
30
31 #include <algo/blast/core/aa_ungapped.h>
32 #include <algo/blast/core/blast_aalookup.h>
33 #include <algo/blast/core/blast_aascan.h>
34 #include <algo/blast/core/blast_util.h>
35
36 /** Scan a subject sequence for word hits and trigger two-hit extensions.
37 *
38 * @param subject the subject sequence [in]
39 * @param query the query sequence [in]
40 * @param lookup_wrap the lookup table [in]
41 * @param ewp Structure containing the diagonal array
42 * @param matrix the substitution matrix [in]
43 * @param word_params structure containing per-context cutoff information [in]
44 * @param query_info structure containing context ranges [in]
45 * @param offset_pairs Array for storing query and subject offsets. [in]
46 * @param array_size the number of elements in each offset array [in]
47 * @param ungapped_hsps hsps resulting from the ungapped extension [out]
48 * @param ungapped_stats Various hit counts. Not filled if NULL [out]
49 */
50 static Int2 s_BlastAaWordFinder_TwoHit(const BLAST_SequenceBlk* subject,
51 const BLAST_SequenceBlk* query,
52 const LookupTableWrap* lookup_wrap,
53 Blast_ExtendWord * ewp,
54 Int4 ** matrix,
55 const BlastInitialWordParameters * word_params,
56 BlastQueryInfo * query_info,
57 BlastOffsetPair* NCBI_RESTRICT offset_pairs,
58 Int4 array_size,
59 BlastInitHitList* ungapped_hsps,
60 BlastUngappedStats* ungapped_stats);
61
62 /** Scan a subject sequence for word hits and trigger two-hit extensions
63 * (specialized for RPS blast).
64 *
65 * @param subject the subject sequence [in]
66 * @param query the query sequence [in]
67 * @param lookup_wrap the lookup table [in]
68 * @param ewp Structure containing the diagonal array [in]
69 * @param matrix the substitution matrix [in]
70 * @param cutoff cutoff score for saving ungapped HSPs [in]
71 * @param dropoff x dropoff [in]
72 * @param ungapped_hsps hsps resulting from the ungapped extension [out]
73 * @param ungapped_stats Various hit counts. Not filled if NULL [out]
74 */
75 static Int2 s_BlastRPSWordFinder_TwoHit(const BLAST_SequenceBlk* subject,
76 const BLAST_SequenceBlk* query,
77 const LookupTableWrap* lookup_wrap,
78 Blast_ExtendWord * ewp,
79 Int4 ** matrix,
80 Int4 cutoff,
81 Int4 dropoff,
82 BlastInitHitList* ungapped_hsps,
83 BlastUngappedStats* ungapped_stats);
84
85 /** Scan a subject sequence for word hits and trigger one-hit extensions.
86 *
87 * @param subject the subject sequence
88 * @param query the query sequence
89 * @param lookup_wrap the lookup table
90 * @param ewp Structure containing the diagonal array [in]
91 * @param matrix the substitution matrix [in]
92 * @param word_params structure containing per-context cutoff information [in]
93 * @param query_info structure containing context ranges [in]
94 * @param offset_pairs Array for storing query and subject offsets. [in]
95 * @param array_size the number of elements in each offset array
96 * @param ungapped_hsps hsps resulting from the ungapped extensions [out]
97 * @param ungapped_stats Various hit counts. Not filled if NULL [out]
98 */
99 static Int2 s_BlastAaWordFinder_OneHit(const BLAST_SequenceBlk* subject,
100 const BLAST_SequenceBlk* query,
101 const LookupTableWrap* lookup_wrap,
102 Blast_ExtendWord * ewp,
103 Int4 ** matrix,
104 const BlastInitialWordParameters * word_params,
105 BlastQueryInfo * query_info,
106 BlastOffsetPair* NCBI_RESTRICT offset_pairs,
107 Int4 array_size,
108 BlastInitHitList* ungapped_hsps,
109 BlastUngappedStats* ungapped_stats);
110
111 /** Scan a subject sequence for word hits and trigger one-hit extensions
112 * (spcialized for RPS blast).
113 *
114 * @param subject the subject sequence
115 * @param query the query sequence
116 * @param lookup_wrap the lookup table
117 * @param ewp Structure containing the diagonal array [in]
118 * @param matrix the substitution matrix [in]
119 * @param cutoff cutoff score for saving ungapped HSPs [in]
120 * @param dropoff x dropoff [in]
121 * @param ungapped_hsps hsps resulting from the ungapped extensions [out]
122 * @param ungapped_stats Various hit counts. Not filled if NULL [out]
123 */
124 static Int2 s_BlastRPSWordFinder_OneHit(const BLAST_SequenceBlk* subject,
125 const BLAST_SequenceBlk* query,
126 const LookupTableWrap* lookup_wrap,
127 Blast_ExtendWord * ewp,
128 Int4 ** matrix,
129 Int4 cutoff,
130 Int4 dropoff,
131 BlastInitHitList* ungapped_hsps,
132 BlastUngappedStats* ungapped_stats);
133
134 /** Perform a one-hit extension. Beginning at the specified hit,
135 * extend to the left, then extend to the right.
136 *
137 * @param matrix the substitution matrix [in]
138 * @param subject subject sequence [in]
139 * @param query query sequence [in]
140 * @param s_off subject offset [in]
141 * @param q_off query offset [in]
142 * @param dropoff X dropoff parameter [in]
143 * @param hsp_q the offset in the query where the HSP begins [out]
144 * @param hsp_s the offset in the subject where the HSP begins [out]
145 * @param hsp_len the length of the HSP [out]
146 * @param word_size number of letters in the initial word hit [in]
147 * @param use_pssm TRUE if the scoring matrix is position-specific [in]
148 * @param s_last_off the rightmost subject offset examined [out]
149 * @return the score of the hsp.
150 */
151 static Int4 s_BlastAaExtendOneHit(Int4 ** matrix,
152 const BLAST_SequenceBlk* subject,
153 const BLAST_SequenceBlk* query,
154 Int4 s_off,
155 Int4 q_off,
156 Int4 dropoff,
157 Int4* hsp_q,
158 Int4* hsp_s,
159 Int4* hsp_len,
160 Int4 word_size,
161 Boolean use_pssm,
162 Int4* s_last_off);
163
164 /** Perform a two-hit extension. Given two hits L and R, begin
165 * at R and extend to the left. If we do not reach L, abort the extension.
166 * Otherwise, begin at R and extend to the right.
167 *
168 * @param matrix the substitution matrix [in]
169 * @param subject subject sequence [in]
170 * @param query query sequence [in]
171 * @param s_left_off left subject offset [in]
172 * @param s_right_off right subject offset [in]
173 * @param q_right_off right query offset [in]
174 * @param dropoff X dropoff parameter [in]
175 * @param hsp_q the offset in the query where the HSP begins [out]
176 * @param hsp_s the offset in the subject where the HSP begins [out]
177 * @param hsp_len the length of the HSP [out]
178 * @param use_pssm TRUE if the scoring matrix is position-specific [in]
179 * @param word_size number of letters in one word [in]
180 * @param right_extend set to TRUE if an extension to the right happened [out]
181 * @param s_last_off the rightmost subject offset examined [out]
182 * @return the score of the hsp.
183 */
184 static Int4 s_BlastAaExtendTwoHit(Int4 ** matrix,
185 const BLAST_SequenceBlk* subject,
186 const BLAST_SequenceBlk* query,
187 Int4 s_left_off,
188 Int4 s_right_off,
189 Int4 q_right_off,
190 Int4 dropoff,
191 Int4* hsp_q,
192 Int4* hsp_s,
193 Int4* hsp_len,
194 Boolean use_pssm,
195 Int4 word_size,
196 Boolean *right_extend,
197 Int4* s_last_off);
198
199
BlastAaWordFinder(BLAST_SequenceBlk * subject,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,LookupTableWrap * lut_wrap,Int4 ** matrix,const BlastInitialWordParameters * word_params,Blast_ExtendWord * ewp,BlastOffsetPair * NCBI_RESTRICT offset_pairs,Int4 offset_array_size,BlastInitHitList * init_hitlist,BlastUngappedStats * ungapped_stats)200 Int2 BlastAaWordFinder(BLAST_SequenceBlk * subject,
201 BLAST_SequenceBlk * query,
202 BlastQueryInfo * query_info,
203 LookupTableWrap * lut_wrap,
204 Int4 ** matrix,
205 const BlastInitialWordParameters * word_params,
206 Blast_ExtendWord * ewp,
207 BlastOffsetPair * NCBI_RESTRICT offset_pairs,
208 Int4 offset_array_size,
209 BlastInitHitList * init_hitlist,
210 BlastUngappedStats * ungapped_stats)
211 {
212 Int2 status = 0;
213
214 if (ewp->diag_table->multiple_hits) {
215 status = s_BlastAaWordFinder_TwoHit(subject, query,
216 lut_wrap, ewp,
217 matrix,
218 word_params,
219 query_info,
220 offset_pairs,
221 offset_array_size,
222 init_hitlist, ungapped_stats);
223 } else {
224 status = s_BlastAaWordFinder_OneHit(subject, query,
225 lut_wrap, ewp,
226 matrix,
227 word_params,
228 query_info,
229 offset_pairs,
230 offset_array_size,
231 init_hitlist, ungapped_stats);
232 }
233
234 Blast_InitHitListSortByScore(init_hitlist);
235 return status;
236 }
237
BlastRPSWordFinder(BLAST_SequenceBlk * subject,BLAST_SequenceBlk * query,BlastQueryInfo * query_info,LookupTableWrap * lut_wrap,Int4 ** matrix,const BlastInitialWordParameters * word_params,Blast_ExtendWord * ewp,BlastOffsetPair * NCBI_RESTRICT offset_pairs,Int4 offset_array_size,BlastInitHitList * init_hitlist,BlastUngappedStats * ungapped_stats)238 Int2 BlastRPSWordFinder(BLAST_SequenceBlk * subject,
239 BLAST_SequenceBlk * query,
240 BlastQueryInfo * query_info,
241 LookupTableWrap * lut_wrap,
242 Int4 ** matrix,
243 const BlastInitialWordParameters * word_params,
244 Blast_ExtendWord * ewp,
245 BlastOffsetPair * NCBI_RESTRICT offset_pairs,
246 Int4 offset_array_size,
247 BlastInitHitList * init_hitlist,
248 BlastUngappedStats * ungapped_stats)
249 {
250 Int2 status = 0;
251 BlastUngappedCutoffs *cutoffs;
252 Int4 context = subject->oid;
253
254 /* for rpsblast, query contains the concatenated database
255 and subject contains one query sequence (or one frame
256 of a translated nucleotide sequence). This means the cutoff
257 and X-drop value is the same for all hits */
258
259 if (subject->frame != 0) {
260 context = subject->oid * NUM_FRAMES +
261 BLAST_FrameToContext(subject->frame,
262 eBlastTypeRpsTblastn);
263 }
264 cutoffs = word_params->cutoffs + context;
265
266 if (ewp->diag_table->multiple_hits) {
267 status = s_BlastRPSWordFinder_TwoHit(subject, query,
268 lut_wrap, ewp,
269 matrix,
270 cutoffs->cutoff_score,
271 cutoffs->x_dropoff,
272 init_hitlist, ungapped_stats);
273 } else {
274 status = s_BlastRPSWordFinder_OneHit(subject, query,
275 lut_wrap, ewp,
276 matrix,
277 cutoffs->cutoff_score,
278 cutoffs->x_dropoff,
279 init_hitlist, ungapped_stats);
280 }
281
282 Blast_InitHitListSortByScore(init_hitlist);
283 return status;
284 }
285
286 static Int2
s_BlastRPSWordFinder_TwoHit(const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,const LookupTableWrap * lookup_wrap,Blast_ExtendWord * ewp,Int4 ** matrix,Int4 cutoff,Int4 dropoff,BlastInitHitList * ungapped_hsps,BlastUngappedStats * ungapped_stats)287 s_BlastRPSWordFinder_TwoHit(const BLAST_SequenceBlk * subject,
288 const BLAST_SequenceBlk * query,
289 const LookupTableWrap * lookup_wrap,
290 Blast_ExtendWord * ewp,
291 Int4 ** matrix,
292 Int4 cutoff,
293 Int4 dropoff,
294 BlastInitHitList * ungapped_hsps,
295 BlastUngappedStats * ungapped_stats)
296 {
297 BlastRPSLookupTable *lookup;
298 Int4 wordsize;
299 Int4 i, j;
300 Int4 hits = 0;
301 Int4 totalhits = 0;
302 Int4 first_offset = 0;
303 Int4 last_offset;
304 Int4 score;
305 Int4 hsp_q, hsp_s, hsp_len;
306 Int4 window;
307 Int4 last_hit, s_last_off, diff;
308 Int4 diag_offset, diag_coord, diag_mask;
309 DiagStruct *diag_array;
310 Boolean right_extend;
311 Int4 hits_extended = 0;
312 BLAST_DiagTable * diag = ewp->diag_table;
313
314 ASSERT(diag != NULL);
315
316 diag_offset = diag->offset;
317 diag_array = diag->hit_level_array;
318
319 ASSERT(diag_array);
320
321 diag_mask = diag->diag_mask;
322 window = diag->window;
323
324 lookup = (BlastRPSLookupTable *) lookup_wrap->lut;
325 wordsize = lookup->wordsize;
326 last_offset = subject->length - wordsize;
327
328 while (first_offset <= last_offset) {
329 /* scan the subject sequence for hits */
330 hits = BlastRPSScanSubject(lookup_wrap, subject, &first_offset);
331
332 totalhits += hits;
333 /* for each region of the concatenated database */
334 for (i = 0; i < lookup->num_buckets; ++i) {
335 RPSBucket *curr_bucket = lookup->bucket_array + i;
336 BlastOffsetPair *offset_pairs = curr_bucket->offset_pairs;
337 hits = curr_bucket->num_filled;
338
339 /* handle each hit in the neighborhood. Because the neighborhood
340 is small and there should be many hits there, proceeding one
341 bucket at a time provides maximum reuse of data structures. A
342 side benefit is that ungapped extensions also cluster together
343 in the concatenated database, reusing PSSM data as well. */
344 for (j = 0; j < hits; ++j) {
345 Uint4 query_offset = offset_pairs[j].qs_offsets.q_off;
346 Uint4 subject_offset = offset_pairs[j].qs_offsets.s_off;
347
348 /* calculate the diagonal associated with this query-subject
349 pair */
350
351 diag_coord = (query_offset - subject_offset) & diag_mask;
352
353 /* If the reset bit is set, an extension just happened. */
354 if (diag_array[diag_coord].flag) {
355 /* If we've already extended past this hit, skip it. */
356 if ((Int4) (subject_offset + diag_offset) <
357 diag_array[diag_coord].last_hit) {
358 continue;
359 }
360 /* Otherwise, start a new hit. */
361 else {
362 diag_array[diag_coord].last_hit =
363 subject_offset + diag_offset;
364 diag_array[diag_coord].flag = 0;
365 }
366 }
367 /* If the reset bit is cleared, try to start an extension. */
368 else {
369 /* find the distance to the last hit on this diagonal */
370 last_hit = diag_array[diag_coord].last_hit - diag_offset;
371 diff = subject_offset - last_hit;
372
373 if (diff >= window) {
374 /* We are beyond the window for this diagonal; start
375 a new hit */
376 diag_array[diag_coord].last_hit =
377 subject_offset + diag_offset;
378 continue;
379 }
380
381 /* If the difference is less than the wordsize (i.e.
382 last hit and this hit overlap), give up */
383
384 if (diff < wordsize) {
385 continue;
386 }
387
388 /* Extend this pair of hits. The extension to the left
389 must reach the end of the first word in order for
390 extension to the right to proceed. */
391
392 score = s_BlastAaExtendTwoHit(matrix, subject, query,
393 last_hit + wordsize,
394 subject_offset, query_offset,
395 dropoff, &hsp_q, &hsp_s,
396 &hsp_len, TRUE,
397 wordsize, &right_extend,
398 &s_last_off);
399
400 ++hits_extended;
401
402 /* if the hsp meets the score threshold, report it */
403 if (score >= cutoff)
404 BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s,
405 query_offset, subject_offset,
406 hsp_len, score);
407
408 /* If an extension to the right happened, reset the last
409 hit so that future hits to this diagonal must start
410 over. */
411
412 if (right_extend) {
413 diag_array[diag_coord].flag = 1;
414 diag_array[diag_coord].last_hit =
415 s_last_off - (wordsize - 1) + diag_offset;
416 }
417 /* Otherwise, make the present hit into the previous hit
418 for this diagonal */
419 else {
420 diag_array[diag_coord].last_hit =
421 subject_offset + diag_offset;
422 }
423 } /* end else */
424
425 } /* end for - done with this batch of hits */
426 } /* end for - done with this bucket, call
427 scansubject again */
428 } /* end while - done with the entire sequence.
429 */
430
431 /* increment the offset in the diagonal array */
432 Blast_ExtendWordExit(ewp, subject->length);
433
434 Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended,
435 ungapped_hsps->total);
436 return 0;
437 }
438
439 static Int2
s_BlastAaWordFinder_TwoHit(const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,const LookupTableWrap * lookup_wrap,Blast_ExtendWord * ewp,Int4 ** matrix,const BlastInitialWordParameters * word_params,BlastQueryInfo * query_info,BlastOffsetPair * NCBI_RESTRICT offset_pairs,Int4 array_size,BlastInitHitList * ungapped_hsps,BlastUngappedStats * ungapped_stats)440 s_BlastAaWordFinder_TwoHit(const BLAST_SequenceBlk * subject,
441 const BLAST_SequenceBlk * query,
442 const LookupTableWrap * lookup_wrap,
443 Blast_ExtendWord * ewp,
444 Int4 ** matrix,
445 const BlastInitialWordParameters * word_params,
446 BlastQueryInfo * query_info,
447 BlastOffsetPair * NCBI_RESTRICT offset_pairs,
448 Int4 array_size,
449 BlastInitHitList * ungapped_hsps,
450 BlastUngappedStats * ungapped_stats)
451 {
452 Boolean use_pssm = FALSE;
453 Int4 wordsize;
454 Int4 i;
455 Int4 hits = 0;
456 Int4 totalhits = 0;
457 Int4 score;
458 Int4 hsp_q, hsp_s, hsp_len = 0;
459 Int4 window;
460 Int4 last_hit, s_last_off, diff;
461 Int4 diag_offset, diag_coord, diag_mask;
462 DiagStruct *diag_array;
463 Boolean right_extend;
464 Int4 hits_extended = 0;
465 Int4 curr_context;
466 BlastUngappedCutoffs *cutoffs;
467 BLAST_DiagTable * diag = ewp->diag_table;
468 TAaScanSubjectFunction scansub;
469 Int4 scan_range[3];
470
471 ASSERT(diag != NULL);
472
473 diag_offset = diag->offset;
474 diag_array = diag->hit_level_array;
475
476 ASSERT(diag_array);
477
478 diag_mask = diag->diag_mask;
479 window = diag->window;
480
481 if (lookup_wrap->lut_type == eAaLookupTable) {
482 BlastAaLookupTable *lookup =
483 (BlastAaLookupTable *)(lookup_wrap->lut);
484 scansub = (TAaScanSubjectFunction)(lookup->scansub_callback);
485 wordsize = lookup->word_length;
486 use_pssm = lookup->use_pssm;
487 }
488 else {
489 BlastCompressedAaLookupTable *lookup =
490 (BlastCompressedAaLookupTable *)(lookup_wrap->lut);
491 scansub = (TAaScanSubjectFunction)(lookup->scansub_callback);
492 wordsize = lookup->word_length;
493 }
494
495 scan_range[0] = 0;
496 scan_range[1] = subject->seq_ranges[0].left;
497 scan_range[2] = subject->seq_ranges[0].right - wordsize;
498
499 if (scan_range[2] < scan_range[1])
500 scan_range[2] = scan_range[1];
501
502 while (scan_range[1] <= scan_range[2]) {
503 /* scan the subject sequence for hits */
504 hits = scansub(lookup_wrap, subject,
505 offset_pairs, array_size, scan_range);
506
507 totalhits += hits;
508 /* for each hit, */
509 for (i = 0; i < hits; ++i) {
510 Uint4 query_offset = offset_pairs[i].qs_offsets.q_off;
511 Uint4 subject_offset = offset_pairs[i].qs_offsets.s_off;
512
513 /* calculate the diagonal associated with this query-subject pair
514 */
515
516 diag_coord = (query_offset - subject_offset) & diag_mask;
517
518 /* If the reset bit is set, an extension just happened. */
519 if (diag_array[diag_coord].flag) {
520 /* If we've already extended past this hit, skip it. */
521 if ((Int4) (subject_offset + diag_offset) <
522 diag_array[diag_coord].last_hit) {
523 continue;
524 }
525 /* Otherwise, start a new hit. */
526 else {
527 diag_array[diag_coord].last_hit =
528 subject_offset + diag_offset;
529 diag_array[diag_coord].flag = 0;
530 }
531 }
532 /* If the reset bit is cleared, try to start an extension. */
533 else {
534 /* find the distance to the last hit on this diagonal */
535 last_hit = diag_array[diag_coord].last_hit - diag_offset;
536 diff = subject_offset - last_hit;
537
538 if (diff >= window) {
539 /* We are beyond the window for this diagonal; start a
540 new hit */
541 diag_array[diag_coord].last_hit =
542 subject_offset + diag_offset;
543 continue;
544 }
545
546 /* If the difference is less than the wordsize (i.e. last
547 hit and this hit overlap), give up */
548
549 if (diff < wordsize) {
550 continue;
551 }
552
553 /* Extend this pair of hits. The extension to the left must
554 reach the end of the first word in order for extension to
555 the right to proceed.
556
557 To use the cutoff and X-drop values appropriate for this
558 extension, the query context must first be found */
559
560 curr_context = BSearchContextInfo(query_offset, query_info);
561
562 /* Check if the last hit hits current query. Because last_hit
563 is never reset, it may contain a hit to the previous
564 concatenated query */
565
566 if (query_offset - diff <
567 query_info->contexts[curr_context].query_offset) {
568
569 /* there was no last hit for this diagnol; start a new hit */
570 diag_array[diag_coord].last_hit =
571 subject_offset + diag_offset;
572 continue;
573 }
574
575 cutoffs = word_params->cutoffs + curr_context;
576 score = s_BlastAaExtendTwoHit(matrix, subject, query,
577 last_hit + wordsize,
578 subject_offset, query_offset,
579 cutoffs->x_dropoff,
580 &hsp_q, &hsp_s,
581 &hsp_len, use_pssm,
582 wordsize, &right_extend,
583 &s_last_off);
584
585 ++hits_extended;
586
587 /* if the hsp meets the score threshold, report it */
588 if (score >= cutoffs->cutoff_score)
589 BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s,
590 query_offset, subject_offset, hsp_len,
591 score);
592
593 /* If an extension to the right happened, reset the last hit
594 so that future hits to this diagonal must start over. */
595
596 if (right_extend) {
597 diag_array[diag_coord].flag = 1;
598 diag_array[diag_coord].last_hit =
599 s_last_off - (wordsize - 1) + diag_offset;
600 }
601 /* Otherwise, make the present hit into the previous hit for
602 this diagonal */
603 else {
604 diag_array[diag_coord].last_hit =
605 subject_offset + diag_offset;
606 }
607 } /* end else */
608 } /* end for - done with this batch of hits,
609 call scansubject again. */
610 } /* end while - done with the entire sequence.
611 */
612
613 /* increment the offset in the diagonal array */
614 Blast_ExtendWordExit(ewp, subject->length);
615
616 Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended,
617 ungapped_hsps->total);
618 return 0;
619 }
620
621 static Int2
s_BlastRPSWordFinder_OneHit(const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,const LookupTableWrap * lookup_wrap,Blast_ExtendWord * ewp,Int4 ** matrix,Int4 cutoff,Int4 dropoff,BlastInitHitList * ungapped_hsps,BlastUngappedStats * ungapped_stats)622 s_BlastRPSWordFinder_OneHit(const BLAST_SequenceBlk * subject,
623 const BLAST_SequenceBlk * query,
624 const LookupTableWrap * lookup_wrap,
625 Blast_ExtendWord * ewp,
626 Int4 ** matrix,
627 Int4 cutoff,
628 Int4 dropoff,
629 BlastInitHitList * ungapped_hsps,
630 BlastUngappedStats * ungapped_stats)
631 {
632 BlastRPSLookupTable *lookup = NULL;
633 Int4 wordsize;
634 Int4 hits = 0;
635 Int4 totalhits = 0;
636 Int4 first_offset = 0;
637 Int4 last_offset;
638 Int4 hsp_q, hsp_s, hsp_len;
639 Int4 s_last_off;
640 Int4 i, j;
641 Int4 score;
642 Int4 diag_offset, diag_coord, diag_mask, diff;
643 DiagStruct *diag_array;
644 Int4 hits_extended = 0;
645 BLAST_DiagTable * diag = ewp->diag_table;
646
647 ASSERT(diag != NULL);
648
649 diag_offset = diag->offset;
650 diag_array = diag->hit_level_array;
651 ASSERT(diag_array);
652
653 diag_mask = diag->diag_mask;
654
655 lookup = (BlastRPSLookupTable *) lookup_wrap->lut;
656 wordsize = lookup->wordsize;
657 last_offset = subject->length - wordsize;
658
659 while (first_offset <= last_offset) {
660 /* scan the subject sequence for hits */
661 hits = BlastRPSScanSubject(lookup_wrap, subject, &first_offset);
662
663 totalhits += hits;
664 for (i = 0; i < lookup->num_buckets; i++) {
665 RPSBucket *curr_bucket = lookup->bucket_array + i;
666 BlastOffsetPair *offset_pairs = curr_bucket->offset_pairs;
667 hits = curr_bucket->num_filled;
668
669 /* handle each hit in the neighborhood. Because the neighborhood
670 is small and there should be many hits there, proceeding one
671 bucket at a time provides maximum reuse of data structures. A
672 side benefit is that ungapped extensions also cluster together
673 in the concatenated database, reusing PSSM data as well. */
674 for (j = 0; j < hits; ++j) {
675 Uint4 query_offset = offset_pairs[j].qs_offsets.q_off;
676 Uint4 subject_offset = offset_pairs[j].qs_offsets.s_off;
677 diag_coord = (subject_offset - query_offset) & diag_mask;
678 diff = subject_offset -
679 (diag_array[diag_coord].last_hit - diag_offset);
680
681 /* do an extension, but only if we have not already extended
682 this far */
683 if (diff >= 0) {
684 ++hits_extended;
685 score = s_BlastAaExtendOneHit(matrix, subject, query,
686 subject_offset, query_offset,
687 dropoff, &hsp_q, &hsp_s,
688 &hsp_len, wordsize, TRUE,
689 &s_last_off);
690
691 /* if the hsp meets the score threshold, report it */
692 if (score >= cutoff) {
693 BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s,
694 query_offset, subject_offset,
695 hsp_len, score);
696 }
697 diag_array[diag_coord].last_hit =
698 s_last_off - (wordsize - 1) + diag_offset;
699 }
700 } /* end for (one bucket) */
701 } /* end for (all buckets) */
702 } /* end while */
703
704 /* increment the offset in the diagonal array (no windows used) */
705 Blast_ExtendWordExit(ewp, subject->length);
706
707 Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended,
708 ungapped_hsps->total);
709 return 0;
710 }
711
712 static Int2
s_BlastAaWordFinder_OneHit(const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,const LookupTableWrap * lookup_wrap,Blast_ExtendWord * ewp,Int4 ** matrix,const BlastInitialWordParameters * word_params,BlastQueryInfo * query_info,BlastOffsetPair * NCBI_RESTRICT offset_pairs,Int4 array_size,BlastInitHitList * ungapped_hsps,BlastUngappedStats * ungapped_stats)713 s_BlastAaWordFinder_OneHit(const BLAST_SequenceBlk * subject,
714 const BLAST_SequenceBlk * query,
715 const LookupTableWrap * lookup_wrap,
716 Blast_ExtendWord * ewp,
717 Int4 ** matrix,
718 const BlastInitialWordParameters * word_params,
719 BlastQueryInfo * query_info,
720 BlastOffsetPair * NCBI_RESTRICT offset_pairs,
721 Int4 array_size,
722 BlastInitHitList * ungapped_hsps,
723 BlastUngappedStats * ungapped_stats)
724 {
725 Boolean use_pssm = FALSE;
726 Int4 wordsize;
727 Int4 hits = 0;
728 Int4 totalhits = 0;
729 Int4 hsp_q, hsp_s, hsp_len;
730 Int4 s_last_off;
731 Int4 i;
732 Int4 score;
733 Int4 diag_offset, diag_coord, diag_mask, diff;
734 DiagStruct *diag_array;
735 Int4 hits_extended = 0;
736 BLAST_DiagTable * diag = ewp->diag_table;
737 TAaScanSubjectFunction scansub;
738 Int4 scan_range[3];
739
740 ASSERT(diag != NULL);
741
742 diag_offset = diag->offset;
743 diag_array = diag->hit_level_array;
744 ASSERT(diag_array);
745
746 diag_mask = diag->diag_mask;
747
748 if (lookup_wrap->lut_type == eAaLookupTable) {
749 BlastAaLookupTable *lookup =
750 (BlastAaLookupTable *)(lookup_wrap->lut);
751 scansub = (TAaScanSubjectFunction)(lookup->scansub_callback);
752 wordsize = lookup->word_length;
753 use_pssm = lookup->use_pssm;
754 }
755 else {
756 BlastCompressedAaLookupTable *lookup =
757 (BlastCompressedAaLookupTable *)(lookup_wrap->lut);
758 scansub = (TAaScanSubjectFunction)(lookup->scansub_callback);
759 wordsize = lookup->word_length;
760 }
761
762 scan_range[0] = 0;
763 scan_range[1] = subject->seq_ranges[0].left;
764 scan_range[2] = subject->seq_ranges[0].right - wordsize;
765
766 while (scan_range[1] <= scan_range[2]) {
767 /* scan the subject sequence for hits */
768 hits = scansub(lookup_wrap, subject,
769 offset_pairs, array_size, scan_range);
770
771 totalhits += hits;
772 /* for each hit, */
773 for (i = 0; i < hits; ++i) {
774 Uint4 query_offset = offset_pairs[i].qs_offsets.q_off;
775 Uint4 subject_offset = offset_pairs[i].qs_offsets.s_off;
776 diag_coord = (subject_offset - query_offset) & diag_mask;
777 diff = subject_offset -
778 (diag_array[diag_coord].last_hit - diag_offset);
779
780 /* do an extension, but only if we have not already extended this
781 far */
782 if (diff >= 0) {
783 Int4 curr_context = BSearchContextInfo(query_offset,
784 query_info);
785 BlastUngappedCutoffs *cutoffs = word_params->cutoffs +
786 curr_context;
787 score = s_BlastAaExtendOneHit(matrix, subject, query,
788 subject_offset, query_offset,
789 cutoffs->x_dropoff,
790 &hsp_q, &hsp_s, &hsp_len,
791 wordsize, use_pssm, &s_last_off);
792
793 /* if the hsp meets the score threshold, report it */
794 if (score >= cutoffs->cutoff_score) {
795 BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s,
796 query_offset, subject_offset, hsp_len,
797 score);
798 }
799 diag_array[diag_coord].last_hit =
800 s_last_off - (wordsize - 1) + diag_offset;
801 ++hits_extended;
802 }
803 } /* end for */
804 } /* end while */
805
806 /* increment the offset in the diagonal array (no windows used) */
807 Blast_ExtendWordExit(ewp, subject->length);
808
809 Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended,
810 ungapped_hsps->total);
811 return 0;
812 }
813
814 /**
815 * Beginning at s_off and q_off in the subject and query, respectively,
816 * extend to the right until the cumulative score becomes negative or
817 * drops by at least 'dropoff', or the end of at least one sequence
818 * is reached.
819 *
820 * @param matrix the substitution matrix [in]
821 * @param subject subject sequence [in]
822 * @param query query sequence [in]
823 * @param s_off subject offset [in]
824 * @param q_off query offset [in]
825 * @param dropoff the X dropoff parameter [in]
826 * @param length the length of the computed extension [out]
827 * @param maxscore the score derived from a previous left extension [in]
828 * @param s_last_off the rightmost subject offset examined [out]
829 * @return The score of the extension
830 */
s_BlastAaExtendRight(Int4 ** matrix,const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,Int4 s_off,Int4 q_off,Int4 dropoff,Int4 * length,Int4 maxscore,Int4 * s_last_off)831 static Int4 s_BlastAaExtendRight(Int4 ** matrix,
832 const BLAST_SequenceBlk * subject,
833 const BLAST_SequenceBlk * query,
834 Int4 s_off, Int4 q_off, Int4 dropoff,
835 Int4 * length, Int4 maxscore, Int4 * s_last_off)
836 {
837 Int4 i, n, best_i = -1;
838 Int4 score = maxscore;
839
840 Uint1 *s, *q;
841 n = MIN(subject->length - s_off, query->length - q_off);
842
843 s = subject->sequence + s_off;
844 q = query->sequence + q_off;
845
846 for (i = 0; i < n; i++) {
847 score += matrix[q[i]][s[i]];
848
849 if (score > maxscore) {
850 maxscore = score;
851 best_i = i;
852 }
853
854 /* The comparison below is really >= and is different than the old
855 code (e.g., blast.c:BlastWordExtend_prelim). In the old code the
856 loop continued as long as sum > X (X being negative). The loop
857 control here is different and we *break out* when the if statement
858 below is true. */
859 if (score <= 0 || (maxscore - score) >= dropoff)
860 break;
861 }
862
863 *length = best_i + 1;
864 *s_last_off = s_off + i;
865 return maxscore;
866 }
867
868
869 /**
870 * Beginning at s_off and q_off in the subject and query, respectively,
871 * extend to the left until the cumulative score drops by at least
872 * 'dropoff', or the end of at least one sequence is reached.
873 *
874 * @param matrix the substitution matrix [in]
875 * @param subject subject sequence [in]
876 * @param query query sequence [in]
877 * @param s_off subject offset [in]
878 * @param q_off query offset [in]
879 * @param dropoff the X dropoff parameter [in]
880 * @param length the length of the computed extension [out]
881 * @param maxscore the best running score from a previous extension;
882 * the running score of the current extension must exceed
883 * this value if the extension is to be of nonzero size [in]
884 * @return The score of the extension
885 */
s_BlastAaExtendLeft(Int4 ** matrix,const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,Int4 s_off,Int4 q_off,Int4 dropoff,Int4 * length,Int4 maxscore)886 static Int4 s_BlastAaExtendLeft(Int4 ** matrix,
887 const BLAST_SequenceBlk * subject,
888 const BLAST_SequenceBlk * query,
889 Int4 s_off, Int4 q_off, Int4 dropoff,
890 Int4 * length, Int4 maxscore)
891 {
892 Int4 i, n, best_i;
893 Int4 score = maxscore;
894
895 Uint1 *s, *q;
896
897 n = MIN(s_off, q_off);
898 best_i = n + 1;
899
900 s = subject->sequence + s_off - n;
901 q = query->sequence + q_off - n;
902
903 for (i = n; i >= 0; i--) {
904 score += matrix[q[i]][s[i]];
905
906 if (score > maxscore) {
907 maxscore = score;
908 best_i = i;
909 }
910 /* The comparison below is really >= and is different than the old
911 code (e.g., blast.c:BlastWordExtend_prelim). In the old code the
912 loop continued as long as sum > X (X being negative). The loop
913 control here is different and we *break out* when the if statement
914 below is true. */
915 if ((maxscore - score) >= dropoff)
916 break;
917 }
918
919 *length = n - best_i + 1;
920 return maxscore;
921 }
922
923 /**
924 * Identical to BlastAaExtendRight, except the score matrix
925 * is position-specific
926 *
927 * @param matrix the substitution matrix representing query sequence [in]
928 * @param subject subject sequence [in]
929 * @param query_size number of rows in the scoring matrix [in]
930 * @param s_off subject offset [in]
931 * @param q_off query offset [in]
932 * @param dropoff the X dropoff parameter [in]
933 * @param length the length of the extension [out]
934 * @param maxscore the score derived from a previous left extension [in]
935 * @param s_last_off the rightmost subject offset examined [out]
936 * @return The score of the extension
937 */
s_BlastPSSMExtendRight(Int4 ** matrix,const BLAST_SequenceBlk * subject,Int4 query_size,Int4 s_off,Int4 q_off,Int4 dropoff,Int4 * length,Int4 maxscore,Int4 * s_last_off)938 static Int4 s_BlastPSSMExtendRight(Int4 ** matrix,
939 const BLAST_SequenceBlk * subject,
940 Int4 query_size, Int4 s_off,
941 Int4 q_off, Int4 dropoff,
942 Int4 * length, Int4 maxscore, Int4 * s_last_off)
943 {
944 Int4 i, n, best_i = -1;
945 Int4 score = maxscore;
946 Uint1 *s;
947
948 n = MIN(subject->length - s_off, query_size - q_off);
949 s = subject->sequence + s_off;
950
951 for (i = 0; i < n; i++) {
952 score += matrix[q_off + i][s[i]];
953
954 if (score > maxscore) {
955 maxscore = score;
956 best_i = i;
957 }
958
959 /* The comparison below is really >= and is different than the old
960 code (e.g., blast.c:BlastWordExtend_prelim). In the old code the
961 loop continued as long as sum > X (X being negative). The loop
962 control here is different and we *break out* when the if statement
963 below is true. */
964 if (score <= 0 || (maxscore - score) >= dropoff)
965 break;
966 }
967
968 *length = best_i + 1;
969 *s_last_off = s_off + i;
970 return maxscore;
971 }
972
973 /**
974 * Identical to BlastAaExtendLeft, except the query is
975 * represented by a position-specific score matrix.
976 *
977 * @param matrix the substitution matrix representing the query [in]
978 * @param subject subject sequence [in]
979 * @param s_off subject offset [in]
980 * @param q_off query offset [in]
981 * @param dropoff the X dropoff parameter [in]
982 * @param length the length of the extension [out]
983 * @param maxscore the score so far (probably from initial word hit) [in]
984 * @return The score of the extension
985 */
s_BlastPSSMExtendLeft(Int4 ** matrix,const BLAST_SequenceBlk * subject,Int4 s_off,Int4 q_off,Int4 dropoff,Int4 * length,Int4 maxscore)986 static Int4 s_BlastPSSMExtendLeft(Int4 ** matrix,
987 const BLAST_SequenceBlk * subject,
988 Int4 s_off, Int4 q_off,
989 Int4 dropoff, Int4 * length, Int4 maxscore)
990 {
991 Int4 i, n, best_i;
992 Int4 score = maxscore;
993 Uint1 *s;
994
995 n = MIN(s_off, q_off);
996 best_i = n + 1;
997 s = subject->sequence + s_off - n;
998
999 for (i = n; i >= 0; i--) {
1000 score += matrix[q_off - n + i][s[i]];
1001
1002 if (score > maxscore) {
1003 maxscore = score;
1004 best_i = i;
1005 }
1006 /* The comparison below is really >= and is different than the old
1007 code (e.g., blast.c:BlastWordExtend_prelim). In the old code the
1008 loop continued as long as sum > X (X being negative). The loop
1009 control here is different and we *break out* when the if statement
1010 below is true. */
1011 if ((maxscore - score) >= dropoff)
1012 break;
1013 }
1014
1015 *length = n - best_i + 1;
1016 return maxscore;
1017 }
1018
1019 static Int4
s_BlastAaExtendOneHit(Int4 ** matrix,const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,Int4 s_off,Int4 q_off,Int4 dropoff,Int4 * hsp_q,Int4 * hsp_s,Int4 * hsp_len,Int4 word_size,Boolean use_pssm,Int4 * s_last_off)1020 s_BlastAaExtendOneHit(Int4 ** matrix,
1021 const BLAST_SequenceBlk * subject,
1022 const BLAST_SequenceBlk * query,
1023 Int4 s_off, Int4 q_off,
1024 Int4 dropoff, Int4 * hsp_q, Int4 * hsp_s,
1025 Int4 * hsp_len, Int4 word_size,
1026 Boolean use_pssm, Int4 * s_last_off)
1027 {
1028 Int4 score = 0, left_score, total_score, sum = 0;
1029 Int4 left_disp = 0, right_disp = 0;
1030 Int4 q_left_off = q_off, q_right_off =
1031 q_off + word_size, q_best_left_off = q_off;
1032 Int4 s_left_off, s_right_off;
1033 Int4 init_hit_width = 0;
1034 Int4 i; /* loop variable. */
1035 Uint1 *q = query->sequence;
1036 Uint1 *s = subject->sequence;
1037
1038 for (i = 0; i < word_size; i++) {
1039 if (use_pssm)
1040 sum += matrix[q_off + i][s[s_off + i]];
1041 else
1042 sum += matrix[q[q_off + i]][s[s_off + i]];
1043
1044 if (sum > score) {
1045 score = sum;
1046 q_best_left_off = q_left_off;
1047 q_right_off = q_off + i;
1048 } else if (sum <= 0) {
1049 sum = 0;
1050 q_left_off = q_off + i + 1;
1051 }
1052 }
1053
1054 init_hit_width = q_right_off - q_left_off + 1;
1055
1056 q_left_off = q_best_left_off;
1057
1058 s_left_off = q_left_off + (s_off - q_off);
1059 s_right_off = q_right_off + (s_off - q_off);
1060
1061 if (use_pssm) {
1062 left_score = s_BlastPSSMExtendLeft(matrix, subject,
1063 s_left_off - 1, q_left_off - 1,
1064 dropoff, &left_disp, score);
1065
1066 total_score = s_BlastPSSMExtendRight(matrix, subject, query->length,
1067 s_right_off + 1, q_right_off + 1,
1068 dropoff, &right_disp, left_score,
1069 s_last_off);
1070 } else {
1071 left_score = s_BlastAaExtendLeft(matrix, subject, query,
1072 s_left_off - 1, q_left_off - 1,
1073 dropoff, &left_disp, score);
1074
1075 total_score = s_BlastAaExtendRight(matrix, subject, query,
1076 s_right_off + 1, q_right_off + 1,
1077 dropoff, &right_disp, left_score,
1078 s_last_off);
1079 }
1080
1081 *hsp_q = q_left_off - left_disp;
1082 *hsp_s = s_left_off - left_disp;
1083 *hsp_len = left_disp + right_disp + init_hit_width;
1084
1085 return total_score;
1086 }
1087
1088 static Int4
s_BlastAaExtendTwoHit(Int4 ** matrix,const BLAST_SequenceBlk * subject,const BLAST_SequenceBlk * query,Int4 s_left_off,Int4 s_right_off,Int4 q_right_off,Int4 dropoff,Int4 * hsp_q,Int4 * hsp_s,Int4 * hsp_len,Boolean use_pssm,Int4 word_size,Boolean * right_extend,Int4 * s_last_off)1089 s_BlastAaExtendTwoHit(Int4 ** matrix,
1090 const BLAST_SequenceBlk * subject,
1091 const BLAST_SequenceBlk * query,
1092 Int4 s_left_off, Int4 s_right_off,
1093 Int4 q_right_off, Int4 dropoff,
1094 Int4 * hsp_q, Int4 * hsp_s, Int4 * hsp_len,
1095 Boolean use_pssm, Int4 word_size,
1096 Boolean * right_extend, Int4 * s_last_off)
1097 {
1098 Int4 left_d = 0, right_d = 0; /* left and right displacements */
1099 Int4 left_score = 0, right_score = 0; /* left and right scores */
1100 Int4 i, score = 0;
1101 Uint1 *s = subject->sequence;
1102 Uint1 *q = query->sequence;
1103
1104 /* find one beyond the position (up to word_size-1 letters to the right)
1105 that gives the largest starting score */
1106 /* Use "one beyond" to make the numbering consistent with how it's done
1107 for BlastAaExtendOneHit and the "Extend" functions called here. */
1108 for (i = 0; i < word_size; i++) {
1109 if (use_pssm)
1110 score += matrix[q_right_off + i][s[s_right_off + i]];
1111 else
1112 score += matrix[q[q_right_off + i]][s[s_right_off + i]];
1113
1114 if (score > left_score) {
1115 left_score = score;
1116 right_d = i + 1; /* Position is one beyond the end of the
1117 word. */
1118 }
1119 }
1120 q_right_off += right_d;
1121 s_right_off += right_d;
1122
1123 right_d = 0;
1124 *right_extend = FALSE;
1125 *s_last_off = s_right_off;
1126
1127 /* first, try to extend left, from the second hit to the first hit. */
1128 if (use_pssm)
1129 left_score = s_BlastPSSMExtendLeft(matrix, subject,
1130 s_right_off - 1, q_right_off - 1,
1131 dropoff, &left_d, 0);
1132 else
1133 left_score = s_BlastAaExtendLeft(matrix, subject, query,
1134 s_right_off - 1, q_right_off - 1,
1135 dropoff, &left_d, 0);
1136
1137 /* Extend to the right only if left extension reached the first hit. */
1138 if (left_d >= (s_right_off - s_left_off)) {
1139
1140 *right_extend = TRUE;
1141 if (use_pssm)
1142 right_score = s_BlastPSSMExtendRight(matrix, subject, query->length,
1143 s_right_off, q_right_off,
1144 dropoff, &right_d, left_score,
1145 s_last_off);
1146 else
1147 right_score = s_BlastAaExtendRight(matrix, subject, query,
1148 s_right_off, q_right_off,
1149 dropoff, &right_d, left_score,
1150 s_last_off);
1151 }
1152
1153
1154 *hsp_q = q_right_off - left_d;
1155 *hsp_s = s_right_off - left_d;
1156 *hsp_len = left_d + right_d;
1157 return MAX(left_score, right_score);
1158 }
1159